This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Re: Regexp recursion limit too low?
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
5 * "A fair jaw-cracker dwarf-language must be." --Samwise Gamgee
6 */
7
61296642
DM
8/* This file contains functions for compiling a regular expression. See
9 * also regexec.c which funnily enough, contains functions for executing
166f8a29 10 * a regular expression.
e4a054ea
DM
11 *
12 * This file is also copied at build time to ext/re/re_comp.c, where
13 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
14 * This causes the main functions to be compiled under new names and with
15 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
16 */
17
a687059c
LW
18/* NOTE: this is derived from Henry Spencer's regexp code, and should not
19 * confused with the original package (see point 3 below). Thanks, Henry!
20 */
21
22/* Additional note: this code is very heavily munged from Henry's version
23 * in places. In some spots I've traded clarity for efficiency, so don't
24 * blame Henry for some of the lack of readability.
25 */
26
e50aee73
AD
27/* The names of the functions have been changed from regcomp and
28 * regexec to pregcomp and pregexec in order to avoid conflicts
29 * with the POSIX routines of the same names.
30*/
31
b9d5759e 32#ifdef PERL_EXT_RE_BUILD
54df2634 33#include "re_top.h"
b81d288d 34#endif
56953603 35
a687059c 36/*
e50aee73 37 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
38 *
39 * Copyright (c) 1986 by University of Toronto.
40 * Written by Henry Spencer. Not derived from licensed software.
41 *
42 * Permission is granted to anyone to use this software for any
43 * purpose on any computer system, and to redistribute it freely,
44 * subject to the following restrictions:
45 *
46 * 1. The author is not responsible for the consequences of use of
47 * this software, no matter how awful, even if they arise
48 * from defects in it.
49 *
50 * 2. The origin of this software must not be misrepresented, either
51 * by explicit claim or by omission.
52 *
53 * 3. Altered versions must be plainly marked as such, and must not
54 * be misrepresented as being the original software.
55 *
56 *
57 **** Alterations to Henry's code are...
58 ****
4bb101f2 59 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
fdf8c088 60 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007 by Larry Wall and others
a687059c 61 ****
9ef589d8
LW
62 **** You may distribute under the terms of either the GNU General Public
63 **** License or the Artistic License, as specified in the README file.
64
a687059c
LW
65 *
66 * Beware that some of this code is subtly aware of the way operator
67 * precedence is structured in regular expressions. Serious changes in
68 * regular-expression syntax might require a total rethink.
69 */
70#include "EXTERN.h"
864dbfa3 71#define PERL_IN_REGCOMP_C
a687059c 72#include "perl.h"
d06ea78c 73
acfe0abc 74#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
75# include "INTERN.h"
76#endif
c277df42
IZ
77
78#define REG_COMP_C
54df2634
NC
79#ifdef PERL_IN_XSUB_RE
80# include "re_comp.h"
81#else
82# include "regcomp.h"
83#endif
a687059c 84
d4cce5f1 85#ifdef op
11343788 86#undef op
d4cce5f1 87#endif /* op */
11343788 88
fe14fcc3 89#ifdef MSDOS
7e4e8c89 90# if defined(BUGGY_MSC6)
fe14fcc3 91 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 92# pragma optimize("a",off)
fe14fcc3 93 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
94# pragma optimize("w",on )
95# endif /* BUGGY_MSC6 */
fe14fcc3
LW
96#endif /* MSDOS */
97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
830247a4 102typedef struct RExC_state_t {
e2509266 103 U32 flags; /* are we folding, multilining? */
830247a4 104 char *precomp; /* uncompiled string. */
f8fc2ecf
YO
105 regexp *rx; /* perl core regexp structure */
106 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 107 char *start; /* Start of input for compile */
830247a4
IZ
108 char *end; /* End of input for compile */
109 char *parse; /* Input-scan pointer. */
110 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 111 regnode *emit_start; /* Start of emitted-code area */
ffc61ed2 112 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
113 I32 naughty; /* How bad is this pattern? */
114 I32 sawback; /* Did we see \1, ...? */
115 U32 seen;
116 I32 size; /* Code size. */
c74340f9
YO
117 I32 npar; /* Capture buffer count, (OPEN). */
118 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 119 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
120 I32 extralen;
121 I32 seen_zerolen;
122 I32 seen_evals;
40d049e4
YO
123 regnode **open_parens; /* pointers to open parens */
124 regnode **close_parens; /* pointers to close parens */
125 regnode *opend; /* END node in program */
1aa99e6b 126 I32 utf8;
6bda09f9 127 HV *charnames; /* cache of named sequences */
81714fb9 128 HV *paren_names; /* Paren names */
1f1031fe 129
40d049e4
YO
130 regnode **recurse; /* Recurse regops */
131 I32 recurse_count; /* Number of recurse regops */
830247a4
IZ
132#if ADD_TO_REGEXEC
133 char *starttry; /* -Dr: where regtry was called. */
134#define RExC_starttry (pRExC_state->starttry)
135#endif
3dab1dad 136#ifdef DEBUGGING
be8e71aa 137 const char *lastparse;
3dab1dad 138 I32 lastnum;
1f1031fe 139 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
140#define RExC_lastparse (pRExC_state->lastparse)
141#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 142#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 143#endif
830247a4
IZ
144} RExC_state_t;
145
e2509266 146#define RExC_flags (pRExC_state->flags)
830247a4
IZ
147#define RExC_precomp (pRExC_state->precomp)
148#define RExC_rx (pRExC_state->rx)
f8fc2ecf 149#define RExC_rxi (pRExC_state->rxi)
fac92740 150#define RExC_start (pRExC_state->start)
830247a4
IZ
151#define RExC_end (pRExC_state->end)
152#define RExC_parse (pRExC_state->parse)
153#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
154#ifdef RE_TRACK_PATTERN_OFFSETS
155#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
156#endif
830247a4 157#define RExC_emit (pRExC_state->emit)
fac92740 158#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
159#define RExC_naughty (pRExC_state->naughty)
160#define RExC_sawback (pRExC_state->sawback)
161#define RExC_seen (pRExC_state->seen)
162#define RExC_size (pRExC_state->size)
163#define RExC_npar (pRExC_state->npar)
e2e6a0f1 164#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
165#define RExC_extralen (pRExC_state->extralen)
166#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
167#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 168#define RExC_utf8 (pRExC_state->utf8)
fc8cd66c 169#define RExC_charnames (pRExC_state->charnames)
40d049e4
YO
170#define RExC_open_parens (pRExC_state->open_parens)
171#define RExC_close_parens (pRExC_state->close_parens)
172#define RExC_opend (pRExC_state->opend)
81714fb9 173#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
174#define RExC_recurse (pRExC_state->recurse)
175#define RExC_recurse_count (pRExC_state->recurse_count)
830247a4 176
cde0cee5 177
a687059c
LW
178#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
179#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
180 ((*s) == '{' && regcurly(s)))
a687059c 181
35c8bce7
LW
182#ifdef SPSTART
183#undef SPSTART /* dratted cpp namespace... */
184#endif
a687059c
LW
185/*
186 * Flags to be passed up and down.
187 */
a687059c 188#define WORST 0 /* Worst case. */
a3b492c3
YO
189#define HASWIDTH 0x01 /* Known to match non-null strings. */
190#define SIMPLE 0x02 /* Simple enough to be STAR/PLUS operand. */
191#define SPSTART 0x04 /* Starts with * or +. */
192#define TRYAGAIN 0x08 /* Weeded out a declaration. */
193#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 194
3dab1dad
YO
195#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
196
07be1b83
YO
197/* whether trie related optimizations are enabled */
198#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
199#define TRIE_STUDY_OPT
786e8c11 200#define FULL_TRIE_STUDY
07be1b83
YO
201#define TRIE_STCLASS
202#endif
1de06328
YO
203
204
40d049e4
YO
205
206#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
207#define PBITVAL(paren) (1 << ((paren) & 7))
208#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
209#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
210#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
211
212
1de06328
YO
213/* About scan_data_t.
214
215 During optimisation we recurse through the regexp program performing
216 various inplace (keyhole style) optimisations. In addition study_chunk
217 and scan_commit populate this data structure with information about
218 what strings MUST appear in the pattern. We look for the longest
219 string that must appear for at a fixed location, and we look for the
220 longest string that may appear at a floating location. So for instance
221 in the pattern:
222
223 /FOO[xX]A.*B[xX]BAR/
224
225 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
226 strings (because they follow a .* construct). study_chunk will identify
227 both FOO and BAR as being the longest fixed and floating strings respectively.
228
229 The strings can be composites, for instance
230
231 /(f)(o)(o)/
232
233 will result in a composite fixed substring 'foo'.
234
235 For each string some basic information is maintained:
236
237 - offset or min_offset
238 This is the position the string must appear at, or not before.
239 It also implicitly (when combined with minlenp) tells us how many
240 character must match before the string we are searching.
241 Likewise when combined with minlenp and the length of the string
242 tells us how many characters must appear after the string we have
243 found.
244
245 - max_offset
246 Only used for floating strings. This is the rightmost point that
247 the string can appear at. Ifset to I32 max it indicates that the
248 string can occur infinitely far to the right.
249
250 - minlenp
251 A pointer to the minimum length of the pattern that the string
252 was found inside. This is important as in the case of positive
253 lookahead or positive lookbehind we can have multiple patterns
254 involved. Consider
255
256 /(?=FOO).*F/
257
258 The minimum length of the pattern overall is 3, the minimum length
259 of the lookahead part is 3, but the minimum length of the part that
260 will actually match is 1. So 'FOO's minimum length is 3, but the
261 minimum length for the F is 1. This is important as the minimum length
262 is used to determine offsets in front of and behind the string being
263 looked for. Since strings can be composites this is the length of the
264 pattern at the time it was commited with a scan_commit. Note that
265 the length is calculated by study_chunk, so that the minimum lengths
266 are not known until the full pattern has been compiled, thus the
267 pointer to the value.
268
269 - lookbehind
270
271 In the case of lookbehind the string being searched for can be
272 offset past the start point of the final matching string.
273 If this value was just blithely removed from the min_offset it would
274 invalidate some of the calculations for how many chars must match
275 before or after (as they are derived from min_offset and minlen and
276 the length of the string being searched for).
277 When the final pattern is compiled and the data is moved from the
278 scan_data_t structure into the regexp structure the information
279 about lookbehind is factored in, with the information that would
280 have been lost precalculated in the end_shift field for the
281 associated string.
282
283 The fields pos_min and pos_delta are used to store the minimum offset
284 and the delta to the maximum offset at the current point in the pattern.
285
286*/
2c2d71f5
JH
287
288typedef struct scan_data_t {
1de06328
YO
289 /*I32 len_min; unused */
290 /*I32 len_delta; unused */
2c2d71f5
JH
291 I32 pos_min;
292 I32 pos_delta;
293 SV *last_found;
1de06328 294 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
295 I32 last_start_min;
296 I32 last_start_max;
1de06328
YO
297 SV **longest; /* Either &l_fixed, or &l_float. */
298 SV *longest_fixed; /* longest fixed string found in pattern */
299 I32 offset_fixed; /* offset where it starts */
300 I32 *minlen_fixed; /* pointer to the minlen relevent to the string */
301 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
302 SV *longest_float; /* longest floating string found in pattern */
303 I32 offset_float_min; /* earliest point in string it can appear */
304 I32 offset_float_max; /* latest point in string it can appear */
305 I32 *minlen_float; /* pointer to the minlen relevent to the string */
306 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
307 I32 flags;
308 I32 whilem_c;
cb434fcc 309 I32 *last_closep;
653099ff 310 struct regnode_charclass_class *start_class;
2c2d71f5
JH
311} scan_data_t;
312
a687059c 313/*
e50aee73 314 * Forward declarations for pregcomp()'s friends.
a687059c 315 */
a0d0e21e 316
27da23d5 317static const scan_data_t zero_scan_data =
1de06328 318 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
319
320#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
321#define SF_BEFORE_SEOL 0x0001
322#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
323#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
324#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
325
09b7f37c
CB
326#ifdef NO_UNARY_PLUS
327# define SF_FIX_SHIFT_EOL (0+2)
328# define SF_FL_SHIFT_EOL (0+4)
329#else
330# define SF_FIX_SHIFT_EOL (+2)
331# define SF_FL_SHIFT_EOL (+4)
332#endif
c277df42
IZ
333
334#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
335#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
336
337#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
338#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
339#define SF_IS_INF 0x0040
340#define SF_HAS_PAR 0x0080
341#define SF_IN_PAR 0x0100
342#define SF_HAS_EVAL 0x0200
343#define SCF_DO_SUBSTR 0x0400
653099ff
GS
344#define SCF_DO_STCLASS_AND 0x0800
345#define SCF_DO_STCLASS_OR 0x1000
346#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 347#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 348
786e8c11 349#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 350#define SCF_SEEN_ACCEPT 0x8000
07be1b83 351
eb160463 352#define UTF (RExC_utf8 != 0)
bbe252da
YO
353#define LOC ((RExC_flags & RXf_PMf_LOCALE) != 0)
354#define FOLD ((RExC_flags & RXf_PMf_FOLD) != 0)
a0ed51b3 355
ffc61ed2 356#define OOB_UNICODE 12345678
93733859 357#define OOB_NAMEDCLASS -1
b8c5462f 358
a0ed51b3
LW
359#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
360#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
361
8615cb43 362
b45f050a
JF
363/* length of regex to show in messages that don't mark a position within */
364#define RegexLengthToShowInErrorMessages 127
365
366/*
367 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
368 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
369 * op/pragma/warn/regcomp.
370 */
7253e4e3
RK
371#define MARKER1 "<-- HERE" /* marker as it appears in the description */
372#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 373
7253e4e3 374#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
375
376/*
377 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
378 * arg. Show regex, up to a maximum length. If it's too long, chop and add
379 * "...".
380 */
58e23c8d 381#define _FAIL(code) STMT_START { \
bfed75c6 382 const char *ellipses = ""; \
ccb2c380
MP
383 IV len = RExC_end - RExC_precomp; \
384 \
385 if (!SIZE_ONLY) \
386 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
387 if (len > RegexLengthToShowInErrorMessages) { \
388 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
389 len = RegexLengthToShowInErrorMessages - 10; \
390 ellipses = "..."; \
391 } \
58e23c8d 392 code; \
ccb2c380 393} STMT_END
8615cb43 394
58e23c8d
YO
395#define FAIL(msg) _FAIL( \
396 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
397 msg, (int)len, RExC_precomp, ellipses))
398
399#define FAIL2(msg,arg) _FAIL( \
400 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
401 arg, (int)len, RExC_precomp, ellipses))
402
b45f050a 403/*
b45f050a
JF
404 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
405 */
ccb2c380 406#define Simple_vFAIL(m) STMT_START { \
a28509cc 407 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
408 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
409 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
410} STMT_END
b45f050a
JF
411
412/*
413 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
414 */
ccb2c380
MP
415#define vFAIL(m) STMT_START { \
416 if (!SIZE_ONLY) \
417 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
418 Simple_vFAIL(m); \
419} STMT_END
b45f050a
JF
420
421/*
422 * Like Simple_vFAIL(), but accepts two arguments.
423 */
ccb2c380 424#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 425 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
426 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
427 (int)offset, RExC_precomp, RExC_precomp + offset); \
428} STMT_END
b45f050a
JF
429
430/*
431 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
432 */
ccb2c380
MP
433#define vFAIL2(m,a1) STMT_START { \
434 if (!SIZE_ONLY) \
435 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
436 Simple_vFAIL2(m, a1); \
437} STMT_END
b45f050a
JF
438
439
440/*
441 * Like Simple_vFAIL(), but accepts three arguments.
442 */
ccb2c380 443#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 444 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
445 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
446 (int)offset, RExC_precomp, RExC_precomp + offset); \
447} STMT_END
b45f050a
JF
448
449/*
450 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
451 */
ccb2c380
MP
452#define vFAIL3(m,a1,a2) STMT_START { \
453 if (!SIZE_ONLY) \
454 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx); \
455 Simple_vFAIL3(m, a1, a2); \
456} STMT_END
b45f050a
JF
457
458/*
459 * Like Simple_vFAIL(), but accepts four arguments.
460 */
ccb2c380 461#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 462 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
463 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
464 (int)offset, RExC_precomp, RExC_precomp + offset); \
465} STMT_END
b45f050a 466
ccb2c380 467#define vWARN(loc,m) STMT_START { \
a28509cc 468 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
469 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
470 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
471} STMT_END
472
473#define vWARNdep(loc,m) STMT_START { \
a28509cc 474 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
475 Perl_warner(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
476 "%s" REPORT_LOCATION, \
477 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
478} STMT_END
479
480
481#define vWARN2(loc, m, a1) STMT_START { \
a28509cc 482 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
483 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
484 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
485} STMT_END
486
487#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 488 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
489 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
490 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
491} STMT_END
492
493#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 494 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
495 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
496 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
497} STMT_END
498
499#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 500 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
501 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
502 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
503} STMT_END
9d1d55b5 504
8615cb43 505
cd439c50 506/* Allow for side effects in s */
ccb2c380
MP
507#define REGC(c,s) STMT_START { \
508 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
509} STMT_END
cd439c50 510
fac92740
MJD
511/* Macros for recording node offsets. 20001227 mjd@plover.com
512 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
513 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
514 * Element 0 holds the number n.
07be1b83 515 * Position is 1 indexed.
fac92740 516 */
7122b237
YO
517#ifndef RE_TRACK_PATTERN_OFFSETS
518#define Set_Node_Offset_To_R(node,byte)
519#define Set_Node_Offset(node,byte)
520#define Set_Cur_Node_Offset
521#define Set_Node_Length_To_R(node,len)
522#define Set_Node_Length(node,len)
523#define Set_Node_Cur_Length(node)
524#define Node_Offset(n)
525#define Node_Length(n)
526#define Set_Node_Offset_Length(node,offset,len)
527#define ProgLen(ri) ri->u.proglen
528#define SetProgLen(ri,x) ri->u.proglen = x
529#else
530#define ProgLen(ri) ri->u.offsets[0]
531#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
532#define Set_Node_Offset_To_R(node,byte) STMT_START { \
533 if (! SIZE_ONLY) { \
534 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 535 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 536 if((node) < 0) { \
551405c4 537 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
538 } else { \
539 RExC_offsets[2*(node)-1] = (byte); \
540 } \
541 } \
542} STMT_END
543
544#define Set_Node_Offset(node,byte) \
545 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
546#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
547
548#define Set_Node_Length_To_R(node,len) STMT_START { \
549 if (! SIZE_ONLY) { \
550 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 551 __LINE__, (int)(node), (int)(len))); \
ccb2c380 552 if((node) < 0) { \
551405c4 553 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
554 } else { \
555 RExC_offsets[2*(node)] = (len); \
556 } \
557 } \
558} STMT_END
559
560#define Set_Node_Length(node,len) \
561 Set_Node_Length_To_R((node)-RExC_emit_start, len)
562#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
563#define Set_Node_Cur_Length(node) \
564 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
565
566/* Get offsets and lengths */
567#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
568#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
569
07be1b83
YO
570#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
571 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
572 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
573} STMT_END
7122b237 574#endif
07be1b83
YO
575
576#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
577#define EXPERIMENTAL_INPLACESCAN
7122b237 578#endif /*RE_TRACK_PATTERN_OFFSETS*/
07be1b83 579
304ee84b
YO
580#define DEBUG_STUDYDATA(str,data,depth) \
581DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 582 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
583 "%*s" str "Pos:%"IVdf"/%"IVdf \
584 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
585 (int)(depth)*2, "", \
586 (IV)((data)->pos_min), \
587 (IV)((data)->pos_delta), \
304ee84b 588 (UV)((data)->flags), \
1de06328 589 (IV)((data)->whilem_c), \
304ee84b
YO
590 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
591 is_inf ? "INF " : "" \
1de06328
YO
592 ); \
593 if ((data)->last_found) \
594 PerlIO_printf(Perl_debug_log, \
595 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
596 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
597 SvPVX_const((data)->last_found), \
598 (IV)((data)->last_end), \
599 (IV)((data)->last_start_min), \
600 (IV)((data)->last_start_max), \
601 ((data)->longest && \
602 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
603 SvPVX_const((data)->longest_fixed), \
604 (IV)((data)->offset_fixed), \
605 ((data)->longest && \
606 (data)->longest==&((data)->longest_float)) ? "*" : "", \
607 SvPVX_const((data)->longest_float), \
608 (IV)((data)->offset_float_min), \
609 (IV)((data)->offset_float_max) \
610 ); \
611 PerlIO_printf(Perl_debug_log,"\n"); \
612});
613
acfe0abc 614static void clear_re(pTHX_ void *r);
4327152a 615
653099ff 616/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 617 Update the longest found anchored substring and the longest found
653099ff
GS
618 floating substrings if needed. */
619
4327152a 620STATIC void
304ee84b 621S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 622{
e1ec3a88
AL
623 const STRLEN l = CHR_SVLEN(data->last_found);
624 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 625 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 626
c277df42 627 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 628 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
629 if (*data->longest == data->longest_fixed) {
630 data->offset_fixed = l ? data->last_start_min : data->pos_min;
631 if (data->flags & SF_BEFORE_EOL)
b81d288d 632 data->flags
c277df42
IZ
633 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
634 else
635 data->flags &= ~SF_FIX_BEFORE_EOL;
1de06328
YO
636 data->minlen_fixed=minlenp;
637 data->lookbehind_fixed=0;
a0ed51b3 638 }
304ee84b 639 else { /* *data->longest == data->longest_float */
c277df42 640 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
641 data->offset_float_max = (l
642 ? data->last_start_max
c277df42 643 : data->pos_min + data->pos_delta);
304ee84b 644 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 645 data->offset_float_max = I32_MAX;
c277df42 646 if (data->flags & SF_BEFORE_EOL)
b81d288d 647 data->flags
c277df42
IZ
648 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
649 else
650 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
651 data->minlen_float=minlenp;
652 data->lookbehind_float=0;
c277df42
IZ
653 }
654 }
655 SvCUR_set(data->last_found, 0);
0eda9292 656 {
a28509cc 657 SV * const sv = data->last_found;
097eb12c
AL
658 if (SvUTF8(sv) && SvMAGICAL(sv)) {
659 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
660 if (mg)
661 mg->mg_len = 0;
662 }
0eda9292 663 }
c277df42
IZ
664 data->last_end = -1;
665 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 666 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
667}
668
653099ff
GS
669/* Can match anything (initialization) */
670STATIC void
097eb12c 671S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 672{
653099ff 673 ANYOF_CLASS_ZERO(cl);
f8bef550 674 ANYOF_BITMAP_SETALL(cl);
1aa99e6b 675 cl->flags = ANYOF_EOS|ANYOF_UNICODE_ALL;
653099ff
GS
676 if (LOC)
677 cl->flags |= ANYOF_LOCALE;
678}
679
680/* Can match anything (initialization) */
681STATIC int
5f66b61c 682S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
683{
684 int value;
685
aaa51d5e 686 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
687 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
688 return 1;
1aa99e6b
IH
689 if (!(cl->flags & ANYOF_UNICODE_ALL))
690 return 0;
10edeb5d 691 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 692 return 0;
653099ff
GS
693 return 1;
694}
695
696/* Can match anything (initialization) */
697STATIC void
097eb12c 698S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 699{
8ecf7187 700 Zero(cl, 1, struct regnode_charclass_class);
653099ff 701 cl->type = ANYOF;
830247a4 702 cl_anything(pRExC_state, cl);
653099ff
GS
703}
704
705STATIC void
097eb12c 706S_cl_init_zero(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 707{
8ecf7187 708 Zero(cl, 1, struct regnode_charclass_class);
653099ff 709 cl->type = ANYOF;
830247a4 710 cl_anything(pRExC_state, cl);
653099ff
GS
711 if (LOC)
712 cl->flags |= ANYOF_LOCALE;
713}
714
715/* 'And' a given class with another one. Can create false positives */
716/* We assume that cl is not inverted */
717STATIC void
5f66b61c 718S_cl_and(struct regnode_charclass_class *cl,
a28509cc 719 const struct regnode_charclass_class *and_with)
653099ff 720{
40d049e4
YO
721
722 assert(and_with->type == ANYOF);
653099ff
GS
723 if (!(and_with->flags & ANYOF_CLASS)
724 && !(cl->flags & ANYOF_CLASS)
725 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
726 && !(and_with->flags & ANYOF_FOLD)
727 && !(cl->flags & ANYOF_FOLD)) {
728 int i;
729
730 if (and_with->flags & ANYOF_INVERT)
731 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
732 cl->bitmap[i] &= ~and_with->bitmap[i];
733 else
734 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
735 cl->bitmap[i] &= and_with->bitmap[i];
736 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
737 if (!(and_with->flags & ANYOF_EOS))
738 cl->flags &= ~ANYOF_EOS;
1aa99e6b 739
14ebb1a2
JH
740 if (cl->flags & ANYOF_UNICODE_ALL && and_with->flags & ANYOF_UNICODE &&
741 !(and_with->flags & ANYOF_INVERT)) {
1aa99e6b
IH
742 cl->flags &= ~ANYOF_UNICODE_ALL;
743 cl->flags |= ANYOF_UNICODE;
744 ARG_SET(cl, ARG(and_with));
745 }
14ebb1a2
JH
746 if (!(and_with->flags & ANYOF_UNICODE_ALL) &&
747 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 748 cl->flags &= ~ANYOF_UNICODE_ALL;
14ebb1a2
JH
749 if (!(and_with->flags & (ANYOF_UNICODE|ANYOF_UNICODE_ALL)) &&
750 !(and_with->flags & ANYOF_INVERT))
1aa99e6b 751 cl->flags &= ~ANYOF_UNICODE;
653099ff
GS
752}
753
754/* 'OR' a given class with another one. Can create false positives */
755/* We assume that cl is not inverted */
756STATIC void
097eb12c 757S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 758{
653099ff
GS
759 if (or_with->flags & ANYOF_INVERT) {
760 /* We do not use
761 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
762 * <= (B1 | !B2) | (CL1 | !CL2)
763 * which is wasteful if CL2 is small, but we ignore CL2:
764 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
765 * XXXX Can we handle case-fold? Unclear:
766 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
767 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
768 */
769 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
770 && !(or_with->flags & ANYOF_FOLD)
771 && !(cl->flags & ANYOF_FOLD) ) {
772 int i;
773
774 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
775 cl->bitmap[i] |= ~or_with->bitmap[i];
776 } /* XXXX: logic is complicated otherwise */
777 else {
830247a4 778 cl_anything(pRExC_state, cl);
653099ff
GS
779 }
780 } else {
781 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
782 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
b81d288d 783 && (!(or_with->flags & ANYOF_FOLD)
653099ff
GS
784 || (cl->flags & ANYOF_FOLD)) ) {
785 int i;
786
787 /* OR char bitmap and class bitmap separately */
788 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
789 cl->bitmap[i] |= or_with->bitmap[i];
790 if (or_with->flags & ANYOF_CLASS) {
791 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
792 cl->classflags[i] |= or_with->classflags[i];
793 cl->flags |= ANYOF_CLASS;
794 }
795 }
796 else { /* XXXX: logic is complicated, leave it along for a moment. */
830247a4 797 cl_anything(pRExC_state, cl);
653099ff
GS
798 }
799 }
800 if (or_with->flags & ANYOF_EOS)
801 cl->flags |= ANYOF_EOS;
1aa99e6b
IH
802
803 if (cl->flags & ANYOF_UNICODE && or_with->flags & ANYOF_UNICODE &&
804 ARG(cl) != ARG(or_with)) {
805 cl->flags |= ANYOF_UNICODE_ALL;
806 cl->flags &= ~ANYOF_UNICODE;
807 }
808 if (or_with->flags & ANYOF_UNICODE_ALL) {
809 cl->flags |= ANYOF_UNICODE_ALL;
810 cl->flags &= ~ANYOF_UNICODE;
811 }
653099ff
GS
812}
813
a3621e74
YO
814#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
815#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
816#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
817#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
818
3dab1dad
YO
819
820#ifdef DEBUGGING
07be1b83 821/*
2b8b4781
NC
822 dump_trie(trie,widecharmap,revcharmap)
823 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
824 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
825
826 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
827 The _interim_ variants are used for debugging the interim
828 tables that are used to generate the final compressed
829 representation which is what dump_trie expects.
830
3dab1dad
YO
831 Part of the reason for their existance is to provide a form
832 of documentation as to how the different representations function.
07be1b83
YO
833
834*/
3dab1dad
YO
835
836/*
3dab1dad
YO
837 Dumps the final compressed table form of the trie to Perl_debug_log.
838 Used for debugging make_trie().
839*/
840
841STATIC void
2b8b4781
NC
842S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
843 AV *revcharmap, U32 depth)
3dab1dad
YO
844{
845 U32 state;
ab3bbdeb 846 SV *sv=sv_newmortal();
55eed653 847 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
848 GET_RE_DEBUG_FLAGS_DECL;
849
ab3bbdeb 850
3dab1dad
YO
851 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
852 (int)depth * 2 + 2,"",
853 "Match","Base","Ofs" );
854
855 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 856 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 857 if ( tmp ) {
ab3bbdeb
YO
858 PerlIO_printf( Perl_debug_log, "%*s",
859 colwidth,
ddc5bc0f 860 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
861 PL_colors[0], PL_colors[1],
862 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
863 PERL_PV_ESCAPE_FIRSTCHAR
864 )
865 );
3dab1dad
YO
866 }
867 }
868 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
869 (int)depth * 2 + 2,"");
870
871 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 872 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
873 PerlIO_printf( Perl_debug_log, "\n");
874
1e2e3d02 875 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 876 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
877
878 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
879
880 if ( trie->states[ state ].wordnum ) {
881 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
882 } else {
883 PerlIO_printf( Perl_debug_log, "%6s", "" );
884 }
885
886 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
887
888 if ( base ) {
889 U32 ofs = 0;
890
891 while( ( base + ofs < trie->uniquecharcount ) ||
892 ( base + ofs - trie->uniquecharcount < trie->lasttrans
893 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
894 ofs++;
895
896 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
897
898 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
899 if ( ( base + ofs >= trie->uniquecharcount ) &&
900 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
901 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
902 {
ab3bbdeb
YO
903 PerlIO_printf( Perl_debug_log, "%*"UVXf,
904 colwidth,
3dab1dad
YO
905 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
906 } else {
ab3bbdeb 907 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
908 }
909 }
910
911 PerlIO_printf( Perl_debug_log, "]");
912
913 }
914 PerlIO_printf( Perl_debug_log, "\n" );
915 }
916}
917/*
3dab1dad
YO
918 Dumps a fully constructed but uncompressed trie in list form.
919 List tries normally only are used for construction when the number of
920 possible chars (trie->uniquecharcount) is very high.
921 Used for debugging make_trie().
922*/
923STATIC void
55eed653 924S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
925 HV *widecharmap, AV *revcharmap, U32 next_alloc,
926 U32 depth)
3dab1dad
YO
927{
928 U32 state;
ab3bbdeb 929 SV *sv=sv_newmortal();
55eed653 930 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
931 GET_RE_DEBUG_FLAGS_DECL;
932 /* print out the table precompression. */
ab3bbdeb
YO
933 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
934 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
935 "------:-----+-----------------\n" );
3dab1dad
YO
936
937 for( state=1 ; state < next_alloc ; state ++ ) {
938 U16 charid;
939
ab3bbdeb 940 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
941 (int)depth * 2 + 2,"", (UV)state );
942 if ( ! trie->states[ state ].wordnum ) {
943 PerlIO_printf( Perl_debug_log, "%5s| ","");
944 } else {
945 PerlIO_printf( Perl_debug_log, "W%4x| ",
946 trie->states[ state ].wordnum
947 );
948 }
949 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 950 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
951 if ( tmp ) {
952 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
953 colwidth,
ddc5bc0f 954 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
955 PL_colors[0], PL_colors[1],
956 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
957 PERL_PV_ESCAPE_FIRSTCHAR
958 ) ,
1e2e3d02
YO
959 TRIE_LIST_ITEM(state,charid).forid,
960 (UV)TRIE_LIST_ITEM(state,charid).newstate
961 );
962 if (!(charid % 10))
664e119d
RGS
963 PerlIO_printf(Perl_debug_log, "\n%*s| ",
964 (int)((depth * 2) + 14), "");
1e2e3d02 965 }
ab3bbdeb
YO
966 }
967 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
968 }
969}
970
971/*
3dab1dad
YO
972 Dumps a fully constructed but uncompressed trie in table form.
973 This is the normal DFA style state transition table, with a few
974 twists to facilitate compression later.
975 Used for debugging make_trie().
976*/
977STATIC void
55eed653 978S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
979 HV *widecharmap, AV *revcharmap, U32 next_alloc,
980 U32 depth)
3dab1dad
YO
981{
982 U32 state;
983 U16 charid;
ab3bbdeb 984 SV *sv=sv_newmortal();
55eed653 985 int colwidth= widecharmap ? 6 : 4;
3dab1dad
YO
986 GET_RE_DEBUG_FLAGS_DECL;
987
988 /*
989 print out the table precompression so that we can do a visual check
990 that they are identical.
991 */
992
993 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
994
995 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 996 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 997 if ( tmp ) {
ab3bbdeb
YO
998 PerlIO_printf( Perl_debug_log, "%*s",
999 colwidth,
ddc5bc0f 1000 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1001 PL_colors[0], PL_colors[1],
1002 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1003 PERL_PV_ESCAPE_FIRSTCHAR
1004 )
1005 );
3dab1dad
YO
1006 }
1007 }
1008
1009 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1010
1011 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1012 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1013 }
1014
1015 PerlIO_printf( Perl_debug_log, "\n" );
1016
1017 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1018
1019 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1020 (int)depth * 2 + 2,"",
1021 (UV)TRIE_NODENUM( state ) );
1022
1023 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1024 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1025 if (v)
1026 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1027 else
1028 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1029 }
1030 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1031 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1032 } else {
1033 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1034 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1035 }
1036 }
07be1b83 1037}
3dab1dad
YO
1038
1039#endif
1040
786e8c11
YO
1041/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1042 startbranch: the first branch in the whole branch sequence
1043 first : start branch of sequence of branch-exact nodes.
1044 May be the same as startbranch
1045 last : Thing following the last branch.
1046 May be the same as tail.
1047 tail : item following the branch sequence
1048 count : words in the sequence
1049 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1050 depth : indent depth
3dab1dad 1051
786e8c11 1052Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1053
786e8c11
YO
1054A trie is an N'ary tree where the branches are determined by digital
1055decomposition of the key. IE, at the root node you look up the 1st character and
1056follow that branch repeat until you find the end of the branches. Nodes can be
1057marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1058
786e8c11 1059 /he|she|his|hers/
72f13be8 1060
786e8c11
YO
1061would convert into the following structure. Numbers represent states, letters
1062following numbers represent valid transitions on the letter from that state, if
1063the number is in square brackets it represents an accepting state, otherwise it
1064will be in parenthesis.
07be1b83 1065
786e8c11
YO
1066 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1067 | |
1068 | (2)
1069 | |
1070 (1) +-i->(6)-+-s->[7]
1071 |
1072 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1073
786e8c11
YO
1074 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1075
1076This shows that when matching against the string 'hers' we will begin at state 1
1077read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1078then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1079is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1080single traverse. We store a mapping from accepting to state to which word was
1081matched, and then when we have multiple possibilities we try to complete the
1082rest of the regex in the order in which they occured in the alternation.
1083
1084The only prior NFA like behaviour that would be changed by the TRIE support is
1085the silent ignoring of duplicate alternations which are of the form:
1086
1087 / (DUPE|DUPE) X? (?{ ... }) Y /x
1088
1089Thus EVAL blocks follwing a trie may be called a different number of times with
1090and without the optimisation. With the optimisations dupes will be silently
1091ignored. This inconsistant behaviour of EVAL type nodes is well established as
1092the following demonstrates:
1093
1094 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1095
1096which prints out 'word' three times, but
1097
1098 'words'=~/(word|word|word)(?{ print $1 })S/
1099
1100which doesnt print it out at all. This is due to other optimisations kicking in.
1101
1102Example of what happens on a structural level:
1103
1104The regexp /(ac|ad|ab)+/ will produce the folowing debug output:
1105
1106 1: CURLYM[1] {1,32767}(18)
1107 5: BRANCH(8)
1108 6: EXACT <ac>(16)
1109 8: BRANCH(11)
1110 9: EXACT <ad>(16)
1111 11: BRANCH(14)
1112 12: EXACT <ab>(16)
1113 16: SUCCEED(0)
1114 17: NOTHING(18)
1115 18: END(0)
1116
1117This would be optimizable with startbranch=5, first=5, last=16, tail=16
1118and should turn into:
1119
1120 1: CURLYM[1] {1,32767}(18)
1121 5: TRIE(16)
1122 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1123 <ac>
1124 <ad>
1125 <ab>
1126 16: SUCCEED(0)
1127 17: NOTHING(18)
1128 18: END(0)
1129
1130Cases where tail != last would be like /(?foo|bar)baz/:
1131
1132 1: BRANCH(4)
1133 2: EXACT <foo>(8)
1134 4: BRANCH(7)
1135 5: EXACT <bar>(8)
1136 7: TAIL(8)
1137 8: EXACT <baz>(10)
1138 10: END(0)
1139
1140which would be optimizable with startbranch=1, first=1, last=7, tail=8
1141and would end up looking like:
1142
1143 1: TRIE(8)
1144 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1145 <foo>
1146 <bar>
1147 7: TAIL(8)
1148 8: EXACT <baz>(10)
1149 10: END(0)
1150
1151 d = uvuni_to_utf8_flags(d, uv, 0);
1152
1153is the recommended Unicode-aware way of saying
1154
1155 *(d++) = uv;
1156*/
1157
1e2e3d02 1158#define TRIE_STORE_REVCHAR \
786e8c11 1159 STMT_START { \
1e2e3d02 1160 SV *tmp = newSVpvs(""); \
786e8c11 1161 if (UTF) SvUTF8_on(tmp); \
1e2e3d02 1162 Perl_sv_catpvf( aTHX_ tmp, "%c", (int)uvc ); \
2b8b4781 1163 av_push( revcharmap, tmp ); \
786e8c11
YO
1164 } STMT_END
1165
1166#define TRIE_READ_CHAR STMT_START { \
1167 wordlen++; \
1168 if ( UTF ) { \
1169 if ( folder ) { \
1170 if ( foldlen > 0 ) { \
1171 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1172 foldlen -= len; \
1173 scan += len; \
1174 len = 0; \
1175 } else { \
1176 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1177 uvc = to_uni_fold( uvc, foldbuf, &foldlen ); \
1178 foldlen -= UNISKIP( uvc ); \
1179 scan = foldbuf + UNISKIP( uvc ); \
1180 } \
1181 } else { \
1182 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1183 } \
1184 } else { \
1185 uvc = (U32)*uc; \
1186 len = 1; \
1187 } \
1188} STMT_END
1189
1190
1191
1192#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1193 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1194 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1195 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1196 } \
1197 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1198 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1199 TRIE_LIST_CUR( state )++; \
1200} STMT_END
07be1b83 1201
786e8c11
YO
1202#define TRIE_LIST_NEW(state) STMT_START { \
1203 Newxz( trie->states[ state ].trans.list, \
1204 4, reg_trie_trans_le ); \
1205 TRIE_LIST_CUR( state ) = 1; \
1206 TRIE_LIST_LEN( state ) = 4; \
1207} STMT_END
07be1b83 1208
786e8c11
YO
1209#define TRIE_HANDLE_WORD(state) STMT_START { \
1210 U16 dupe= trie->states[ state ].wordnum; \
1211 regnode * const noper_next = regnext( noper ); \
1212 \
1213 if (trie->wordlen) \
1214 trie->wordlen[ curword ] = wordlen; \
1215 DEBUG_r({ \
1216 /* store the word for dumping */ \
1217 SV* tmp; \
1218 if (OP(noper) != NOTHING) \
1219 tmp = newSVpvn(STRING(noper), STR_LEN(noper)); \
1220 else \
1221 tmp = newSVpvn( "", 0 ); \
1222 if ( UTF ) SvUTF8_on( tmp ); \
2b8b4781 1223 av_push( trie_words, tmp ); \
786e8c11
YO
1224 }); \
1225 \
1226 curword++; \
1227 \
1228 if ( noper_next < tail ) { \
1229 if (!trie->jump) \
c944940b 1230 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1231 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1232 if (!jumper) \
1233 jumper = noper_next; \
1234 if (!nextbranch) \
1235 nextbranch= regnext(cur); \
1236 } \
1237 \
1238 if ( dupe ) { \
1239 /* So it's a dupe. This means we need to maintain a */\
1240 /* linked-list from the first to the next. */\
1241 /* we only allocate the nextword buffer when there */\
1242 /* a dupe, so first time we have to do the allocation */\
1243 if (!trie->nextword) \
c944940b 1244 trie->nextword = (U16 *) \
446bd890 1245 PerlMemShared_calloc( word_count + 1, sizeof(U16)); \
786e8c11
YO
1246 while ( trie->nextword[dupe] ) \
1247 dupe= trie->nextword[dupe]; \
1248 trie->nextword[dupe]= curword; \
1249 } else { \
1250 /* we haven't inserted this word yet. */ \
1251 trie->states[ state ].wordnum = curword; \
1252 } \
1253} STMT_END
07be1b83 1254
3dab1dad 1255
786e8c11
YO
1256#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1257 ( ( base + charid >= ucharcount \
1258 && base + charid < ubound \
1259 && state == trie->trans[ base - ucharcount + charid ].check \
1260 && trie->trans[ base - ucharcount + charid ].next ) \
1261 ? trie->trans[ base - ucharcount + charid ].next \
1262 : ( state==1 ? special : 0 ) \
1263 )
3dab1dad 1264
786e8c11
YO
1265#define MADE_TRIE 1
1266#define MADE_JUMP_TRIE 2
1267#define MADE_EXACT_TRIE 4
3dab1dad 1268
a3621e74 1269STATIC I32
786e8c11 1270S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1271{
27da23d5 1272 dVAR;
a3621e74
YO
1273 /* first pass, loop through and scan words */
1274 reg_trie_data *trie;
55eed653 1275 HV *widecharmap = NULL;
2b8b4781 1276 AV *revcharmap = newAV();
a3621e74 1277 regnode *cur;
9f7f3913 1278 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1279 STRLEN len = 0;
1280 UV uvc = 0;
1281 U16 curword = 0;
1282 U32 next_alloc = 0;
786e8c11
YO
1283 regnode *jumper = NULL;
1284 regnode *nextbranch = NULL;
7f69552c 1285 regnode *convert = NULL;
a3621e74 1286 /* we just use folder as a flag in utf8 */
e1ec3a88 1287 const U8 * const folder = ( flags == EXACTF
a3621e74
YO
1288 ? PL_fold
1289 : ( flags == EXACTFL
1290 ? PL_fold_locale
1291 : NULL
1292 )
1293 );
1294
2b8b4781
NC
1295#ifdef DEBUGGING
1296 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1297 AV *trie_words = NULL;
1298 /* along with revcharmap, this only used during construction but both are
1299 * useful during debugging so we store them in the struct when debugging.
8e11feef 1300 */
2b8b4781
NC
1301#else
1302 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1303 STRLEN trie_charcount=0;
3dab1dad 1304#endif
2b8b4781 1305 SV *re_trie_maxbuff;
a3621e74 1306 GET_RE_DEBUG_FLAGS_DECL;
72f13be8
YO
1307#ifndef DEBUGGING
1308 PERL_UNUSED_ARG(depth);
1309#endif
a3621e74 1310
c944940b 1311 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1312 trie->refcount = 1;
3dab1dad 1313 trie->startstate = 1;
786e8c11 1314 trie->wordcount = word_count;
f8fc2ecf 1315 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1316 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1317 if (!(UTF && folder))
c944940b 1318 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
a3621e74 1319 DEBUG_r({
2b8b4781 1320 trie_words = newAV();
a3621e74 1321 });
a3621e74 1322
0111c4fd 1323 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1324 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1325 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1326 }
3dab1dad
YO
1327 DEBUG_OPTIMISE_r({
1328 PerlIO_printf( Perl_debug_log,
786e8c11 1329 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1330 (int)depth * 2 + 2, "",
1331 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1332 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1333 (int)depth);
3dab1dad 1334 });
7f69552c
YO
1335
1336 /* Find the node we are going to overwrite */
1337 if ( first == startbranch && OP( last ) != BRANCH ) {
1338 /* whole branch chain */
1339 convert = first;
1340 } else {
1341 /* branch sub-chain */
1342 convert = NEXTOPER( first );
1343 }
1344
a3621e74
YO
1345 /* -- First loop and Setup --
1346
1347 We first traverse the branches and scan each word to determine if it
1348 contains widechars, and how many unique chars there are, this is
1349 important as we have to build a table with at least as many columns as we
1350 have unique chars.
1351
1352 We use an array of integers to represent the character codes 0..255
1353 (trie->charmap) and we use a an HV* to store unicode characters. We use the
1354 native representation of the character value as the key and IV's for the
1355 coded index.
1356
1357 *TODO* If we keep track of how many times each character is used we can
1358 remap the columns so that the table compression later on is more
1359 efficient in terms of memory by ensuring most common value is in the
1360 middle and the least common are on the outside. IMO this would be better
1361 than a most to least common mapping as theres a decent chance the most
1362 common letter will share a node with the least common, meaning the node
1363 will not be compressable. With a middle is most common approach the worst
1364 case is when we have the least common nodes twice.
1365
1366 */
1367
a3621e74 1368 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1369 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1370 const U8 *uc = (U8*)STRING( noper );
a28509cc 1371 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1372 STRLEN foldlen = 0;
1373 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1374 const U8 *scan = (U8*)NULL;
07be1b83 1375 U32 wordlen = 0; /* required init */
3dab1dad 1376 STRLEN chars=0;
a3621e74 1377
3dab1dad
YO
1378 if (OP(noper) == NOTHING) {
1379 trie->minlen= 0;
1380 continue;
1381 }
1382 if (trie->bitmap) {
1383 TRIE_BITMAP_SET(trie,*uc);
1384 if ( folder ) TRIE_BITMAP_SET(trie,folder[ *uc ]);
1385 }
a3621e74 1386 for ( ; uc < e ; uc += len ) {
3dab1dad 1387 TRIE_CHARCOUNT(trie)++;
a3621e74 1388 TRIE_READ_CHAR;
3dab1dad 1389 chars++;
a3621e74
YO
1390 if ( uvc < 256 ) {
1391 if ( !trie->charmap[ uvc ] ) {
1392 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1393 if ( folder )
1394 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1395 TRIE_STORE_REVCHAR;
a3621e74
YO
1396 }
1397 } else {
1398 SV** svpp;
55eed653
NC
1399 if ( !widecharmap )
1400 widecharmap = newHV();
a3621e74 1401
55eed653 1402 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1403
1404 if ( !svpp )
e4584336 1405 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1406
1407 if ( !SvTRUE( *svpp ) ) {
1408 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1409 TRIE_STORE_REVCHAR;
a3621e74
YO
1410 }
1411 }
1412 }
3dab1dad
YO
1413 if( cur == first ) {
1414 trie->minlen=chars;
1415 trie->maxlen=chars;
1416 } else if (chars < trie->minlen) {
1417 trie->minlen=chars;
1418 } else if (chars > trie->maxlen) {
1419 trie->maxlen=chars;
1420 }
1421
a3621e74
YO
1422 } /* end first pass */
1423 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1424 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1425 (int)depth * 2 + 2,"",
55eed653 1426 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1427 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1428 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1429 );
c944940b 1430 trie->wordlen = (U32 *) PerlMemShared_calloc( word_count, sizeof(U32) );
a3621e74
YO
1431
1432 /*
1433 We now know what we are dealing with in terms of unique chars and
1434 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1435 representation using a flat table will take. If it's over a reasonable
1436 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1437 conservative but potentially much slower representation using an array
1438 of lists.
1439
1440 At the end we convert both representations into the same compressed
1441 form that will be used in regexec.c for matching with. The latter
1442 is a form that cannot be used to construct with but has memory
1443 properties similar to the list form and access properties similar
1444 to the table form making it both suitable for fast searches and
1445 small enough that its feasable to store for the duration of a program.
1446
1447 See the comment in the code where the compressed table is produced
1448 inplace from the flat tabe representation for an explanation of how
1449 the compression works.
1450
1451 */
1452
1453
3dab1dad 1454 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1455 /*
1456 Second Pass -- Array Of Lists Representation
1457
1458 Each state will be represented by a list of charid:state records
1459 (reg_trie_trans_le) the first such element holds the CUR and LEN
1460 points of the allocated array. (See defines above).
1461
1462 We build the initial structure using the lists, and then convert
1463 it into the compressed table form which allows faster lookups
1464 (but cant be modified once converted).
a3621e74
YO
1465 */
1466
a3621e74
YO
1467 STRLEN transcount = 1;
1468
1e2e3d02
YO
1469 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1470 "%*sCompiling trie using list compiler\n",
1471 (int)depth * 2 + 2, ""));
446bd890 1472
c944940b
JH
1473 trie->states = (reg_trie_state *)
1474 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1475 sizeof(reg_trie_state) );
a3621e74
YO
1476 TRIE_LIST_NEW(1);
1477 next_alloc = 2;
1478
1479 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1480
c445ea15
AL
1481 regnode * const noper = NEXTOPER( cur );
1482 U8 *uc = (U8*)STRING( noper );
1483 const U8 * const e = uc + STR_LEN( noper );
1484 U32 state = 1; /* required init */
1485 U16 charid = 0; /* sanity init */
1486 U8 *scan = (U8*)NULL; /* sanity init */
1487 STRLEN foldlen = 0; /* required init */
07be1b83 1488 U32 wordlen = 0; /* required init */
c445ea15
AL
1489 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1490
3dab1dad 1491 if (OP(noper) != NOTHING) {
786e8c11 1492 for ( ; uc < e ; uc += len ) {
c445ea15 1493
786e8c11 1494 TRIE_READ_CHAR;
c445ea15 1495
786e8c11
YO
1496 if ( uvc < 256 ) {
1497 charid = trie->charmap[ uvc ];
c445ea15 1498 } else {
55eed653 1499 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1500 if ( !svpp ) {
1501 charid = 0;
1502 } else {
1503 charid=(U16)SvIV( *svpp );
1504 }
c445ea15 1505 }
786e8c11
YO
1506 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1507 if ( charid ) {
a3621e74 1508
786e8c11
YO
1509 U16 check;
1510 U32 newstate = 0;
a3621e74 1511
786e8c11
YO
1512 charid--;
1513 if ( !trie->states[ state ].trans.list ) {
1514 TRIE_LIST_NEW( state );
c445ea15 1515 }
786e8c11
YO
1516 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1517 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1518 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1519 break;
1520 }
1521 }
1522 if ( ! newstate ) {
1523 newstate = next_alloc++;
1524 TRIE_LIST_PUSH( state, charid, newstate );
1525 transcount++;
1526 }
1527 state = newstate;
1528 } else {
1529 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1530 }
a28509cc 1531 }
c445ea15 1532 }
3dab1dad 1533 TRIE_HANDLE_WORD(state);
a3621e74
YO
1534
1535 } /* end second pass */
1536
1e2e3d02
YO
1537 /* next alloc is the NEXT state to be allocated */
1538 trie->statecount = next_alloc;
c944940b
JH
1539 trie->states = (reg_trie_state *)
1540 PerlMemShared_realloc( trie->states,
1541 next_alloc
1542 * sizeof(reg_trie_state) );
a3621e74 1543
3dab1dad 1544 /* and now dump it out before we compress it */
2b8b4781
NC
1545 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1546 revcharmap, next_alloc,
1547 depth+1)
1e2e3d02 1548 );
a3621e74 1549
c944940b
JH
1550 trie->trans = (reg_trie_trans *)
1551 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1552 {
1553 U32 state;
a3621e74
YO
1554 U32 tp = 0;
1555 U32 zp = 0;
1556
1557
1558 for( state=1 ; state < next_alloc ; state ++ ) {
1559 U32 base=0;
1560
1561 /*
1562 DEBUG_TRIE_COMPILE_MORE_r(
1563 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1564 );
1565 */
1566
1567 if (trie->states[state].trans.list) {
1568 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1569 U16 maxid=minid;
a28509cc 1570 U16 idx;
a3621e74
YO
1571
1572 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1573 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1574 if ( forid < minid ) {
1575 minid=forid;
1576 } else if ( forid > maxid ) {
1577 maxid=forid;
1578 }
a3621e74
YO
1579 }
1580 if ( transcount < tp + maxid - minid + 1) {
1581 transcount *= 2;
c944940b
JH
1582 trie->trans = (reg_trie_trans *)
1583 PerlMemShared_realloc( trie->trans,
446bd890
NC
1584 transcount
1585 * sizeof(reg_trie_trans) );
a3621e74
YO
1586 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1587 }
1588 base = trie->uniquecharcount + tp - minid;
1589 if ( maxid == minid ) {
1590 U32 set = 0;
1591 for ( ; zp < tp ; zp++ ) {
1592 if ( ! trie->trans[ zp ].next ) {
1593 base = trie->uniquecharcount + zp - minid;
1594 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1595 trie->trans[ zp ].check = state;
1596 set = 1;
1597 break;
1598 }
1599 }
1600 if ( !set ) {
1601 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1602 trie->trans[ tp ].check = state;
1603 tp++;
1604 zp = tp;
1605 }
1606 } else {
1607 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1608 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1609 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1610 trie->trans[ tid ].check = state;
1611 }
1612 tp += ( maxid - minid + 1 );
1613 }
1614 Safefree(trie->states[ state ].trans.list);
1615 }
1616 /*
1617 DEBUG_TRIE_COMPILE_MORE_r(
1618 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1619 );
1620 */
1621 trie->states[ state ].trans.base=base;
1622 }
cc601c31 1623 trie->lasttrans = tp + 1;
a3621e74
YO
1624 }
1625 } else {
1626 /*
1627 Second Pass -- Flat Table Representation.
1628
1629 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1630 We know that we will need Charcount+1 trans at most to store the data
1631 (one row per char at worst case) So we preallocate both structures
1632 assuming worst case.
1633
1634 We then construct the trie using only the .next slots of the entry
1635 structs.
1636
1637 We use the .check field of the first entry of the node temporarily to
1638 make compression both faster and easier by keeping track of how many non
1639 zero fields are in the node.
1640
1641 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1642 transition.
1643
1644 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1645 number representing the first entry of the node, and state as a
1646 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1647 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1648 are 2 entrys per node. eg:
1649
1650 A B A B
1651 1. 2 4 1. 3 7
1652 2. 0 3 3. 0 5
1653 3. 0 0 5. 0 0
1654 4. 0 0 7. 0 0
1655
1656 The table is internally in the right hand, idx form. However as we also
1657 have to deal with the states array which is indexed by nodenum we have to
1658 use TRIE_NODENUM() to convert.
1659
1660 */
1e2e3d02
YO
1661 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1662 "%*sCompiling trie using table compiler\n",
1663 (int)depth * 2 + 2, ""));
3dab1dad 1664
c944940b
JH
1665 trie->trans = (reg_trie_trans *)
1666 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1667 * trie->uniquecharcount + 1,
1668 sizeof(reg_trie_trans) );
1669 trie->states = (reg_trie_state *)
1670 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1671 sizeof(reg_trie_state) );
a3621e74
YO
1672 next_alloc = trie->uniquecharcount + 1;
1673
3dab1dad 1674
a3621e74
YO
1675 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1676
c445ea15 1677 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1678 const U8 *uc = (U8*)STRING( noper );
1679 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1680
1681 U32 state = 1; /* required init */
1682
1683 U16 charid = 0; /* sanity init */
1684 U32 accept_state = 0; /* sanity init */
1685 U8 *scan = (U8*)NULL; /* sanity init */
1686
1687 STRLEN foldlen = 0; /* required init */
07be1b83 1688 U32 wordlen = 0; /* required init */
a3621e74
YO
1689 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1690
3dab1dad 1691 if ( OP(noper) != NOTHING ) {
786e8c11 1692 for ( ; uc < e ; uc += len ) {
a3621e74 1693
786e8c11 1694 TRIE_READ_CHAR;
a3621e74 1695
786e8c11
YO
1696 if ( uvc < 256 ) {
1697 charid = trie->charmap[ uvc ];
1698 } else {
55eed653 1699 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1700 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1701 }
786e8c11
YO
1702 if ( charid ) {
1703 charid--;
1704 if ( !trie->trans[ state + charid ].next ) {
1705 trie->trans[ state + charid ].next = next_alloc;
1706 trie->trans[ state ].check++;
1707 next_alloc += trie->uniquecharcount;
1708 }
1709 state = trie->trans[ state + charid ].next;
1710 } else {
1711 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1712 }
1713 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1714 }
a3621e74 1715 }
3dab1dad
YO
1716 accept_state = TRIE_NODENUM( state );
1717 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1718
1719 } /* end second pass */
1720
3dab1dad 1721 /* and now dump it out before we compress it */
2b8b4781
NC
1722 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1723 revcharmap,
1724 next_alloc, depth+1));
a3621e74 1725
a3621e74
YO
1726 {
1727 /*
1728 * Inplace compress the table.*
1729
1730 For sparse data sets the table constructed by the trie algorithm will
1731 be mostly 0/FAIL transitions or to put it another way mostly empty.
1732 (Note that leaf nodes will not contain any transitions.)
1733
1734 This algorithm compresses the tables by eliminating most such
1735 transitions, at the cost of a modest bit of extra work during lookup:
1736
1737 - Each states[] entry contains a .base field which indicates the
1738 index in the state[] array wheres its transition data is stored.
1739
1740 - If .base is 0 there are no valid transitions from that node.
1741
1742 - If .base is nonzero then charid is added to it to find an entry in
1743 the trans array.
1744
1745 -If trans[states[state].base+charid].check!=state then the
1746 transition is taken to be a 0/Fail transition. Thus if there are fail
1747 transitions at the front of the node then the .base offset will point
1748 somewhere inside the previous nodes data (or maybe even into a node
1749 even earlier), but the .check field determines if the transition is
1750 valid.
1751
786e8c11 1752 XXX - wrong maybe?
a3621e74
YO
1753 The following process inplace converts the table to the compressed
1754 table: We first do not compress the root node 1,and mark its all its
1755 .check pointers as 1 and set its .base pointer as 1 as well. This
1756 allows to do a DFA construction from the compressed table later, and
1757 ensures that any .base pointers we calculate later are greater than
1758 0.
1759
1760 - We set 'pos' to indicate the first entry of the second node.
1761
1762 - We then iterate over the columns of the node, finding the first and
1763 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
1764 and set the .check pointers accordingly, and advance pos
1765 appropriately and repreat for the next node. Note that when we copy
1766 the next pointers we have to convert them from the original
1767 NODEIDX form to NODENUM form as the former is not valid post
1768 compression.
1769
1770 - If a node has no transitions used we mark its base as 0 and do not
1771 advance the pos pointer.
1772
1773 - If a node only has one transition we use a second pointer into the
1774 structure to fill in allocated fail transitions from other states.
1775 This pointer is independent of the main pointer and scans forward
1776 looking for null transitions that are allocated to a state. When it
1777 finds one it writes the single transition into the "hole". If the
786e8c11 1778 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
1779
1780 - Once compressed we can Renew/realloc the structures to release the
1781 excess space.
1782
1783 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
1784 specifically Fig 3.47 and the associated pseudocode.
1785
1786 demq
1787 */
a3b680e6 1788 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 1789 U32 state, charid;
a3621e74 1790 U32 pos = 0, zp=0;
1e2e3d02 1791 trie->statecount = laststate;
a3621e74
YO
1792
1793 for ( state = 1 ; state < laststate ; state++ ) {
1794 U8 flag = 0;
a28509cc
AL
1795 const U32 stateidx = TRIE_NODEIDX( state );
1796 const U32 o_used = trie->trans[ stateidx ].check;
1797 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
1798 trie->trans[ stateidx ].check = 0;
1799
1800 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
1801 if ( flag || trie->trans[ stateidx + charid ].next ) {
1802 if ( trie->trans[ stateidx + charid ].next ) {
1803 if (o_used == 1) {
1804 for ( ; zp < pos ; zp++ ) {
1805 if ( ! trie->trans[ zp ].next ) {
1806 break;
1807 }
1808 }
1809 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
1810 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1811 trie->trans[ zp ].check = state;
1812 if ( ++zp > pos ) pos = zp;
1813 break;
1814 }
1815 used--;
1816 }
1817 if ( !flag ) {
1818 flag = 1;
1819 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
1820 }
1821 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
1822 trie->trans[ pos ].check = state;
1823 pos++;
1824 }
1825 }
1826 }
cc601c31 1827 trie->lasttrans = pos + 1;
c944940b
JH
1828 trie->states = (reg_trie_state *)
1829 PerlMemShared_realloc( trie->states, laststate
1830 * sizeof(reg_trie_state) );
a3621e74 1831 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 1832 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
1833 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
1834 (int)depth * 2 + 2,"",
1835 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
1836 (IV)next_alloc,
1837 (IV)pos,
a3621e74
YO
1838 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
1839 );
1840
1841 } /* end table compress */
1842 }
1e2e3d02
YO
1843 DEBUG_TRIE_COMPILE_MORE_r(
1844 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1845 (int)depth * 2 + 2, "",
1846 (UV)trie->statecount,
1847 (UV)trie->lasttrans)
1848 );
cc601c31 1849 /* resize the trans array to remove unused space */
c944940b
JH
1850 trie->trans = (reg_trie_trans *)
1851 PerlMemShared_realloc( trie->trans, trie->lasttrans
1852 * sizeof(reg_trie_trans) );
a3621e74 1853
3dab1dad 1854 /* and now dump out the compressed format */
2b8b4781 1855 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
07be1b83 1856
3dab1dad 1857 { /* Modify the program and insert the new TRIE node*/
3dab1dad
YO
1858 U8 nodetype =(U8)(flags & 0xFF);
1859 char *str=NULL;
786e8c11 1860
07be1b83 1861#ifdef DEBUGGING
e62cc96a 1862 regnode *optimize = NULL;
7122b237
YO
1863#ifdef RE_TRACK_PATTERN_OFFSETS
1864
b57a0404
JH
1865 U32 mjd_offset = 0;
1866 U32 mjd_nodelen = 0;
7122b237
YO
1867#endif /* RE_TRACK_PATTERN_OFFSETS */
1868#endif /* DEBUGGING */
a3621e74 1869 /*
3dab1dad
YO
1870 This means we convert either the first branch or the first Exact,
1871 depending on whether the thing following (in 'last') is a branch
1872 or not and whther first is the startbranch (ie is it a sub part of
1873 the alternation or is it the whole thing.)
1874 Assuming its a sub part we conver the EXACT otherwise we convert
1875 the whole branch sequence, including the first.
a3621e74 1876 */
3dab1dad 1877 /* Find the node we are going to overwrite */
7f69552c 1878 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 1879 /* branch sub-chain */
3dab1dad 1880 NEXT_OFF( first ) = (U16)(last - first);
7122b237 1881#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
1882 DEBUG_r({
1883 mjd_offset= Node_Offset((convert));
1884 mjd_nodelen= Node_Length((convert));
1885 });
7122b237 1886#endif
7f69552c 1887 /* whole branch chain */
7122b237
YO
1888 }
1889#ifdef RE_TRACK_PATTERN_OFFSETS
1890 else {
7f69552c
YO
1891 DEBUG_r({
1892 const regnode *nop = NEXTOPER( convert );
1893 mjd_offset= Node_Offset((nop));
1894 mjd_nodelen= Node_Length((nop));
1895 });
07be1b83
YO
1896 }
1897 DEBUG_OPTIMISE_r(
1898 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
1899 (int)depth * 2 + 2, "",
786e8c11 1900 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 1901 );
7122b237 1902#endif
3dab1dad
YO
1903 /* But first we check to see if there is a common prefix we can
1904 split out as an EXACT and put in front of the TRIE node. */
1905 trie->startstate= 1;
55eed653 1906 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 1907 U32 state;
1e2e3d02 1908 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 1909 U32 ofs = 0;
8e11feef
RGS
1910 I32 idx = -1;
1911 U32 count = 0;
1912 const U32 base = trie->states[ state ].trans.base;
a3621e74 1913
3dab1dad 1914 if ( trie->states[state].wordnum )
8e11feef 1915 count = 1;
a3621e74 1916
8e11feef 1917 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
1918 if ( ( base + ofs >= trie->uniquecharcount ) &&
1919 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
1920 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1921 {
3dab1dad 1922 if ( ++count > 1 ) {
2b8b4781 1923 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 1924 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1925 if ( state == 1 ) break;
3dab1dad
YO
1926 if ( count == 2 ) {
1927 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
1928 DEBUG_OPTIMISE_r(
8e11feef
RGS
1929 PerlIO_printf(Perl_debug_log,
1930 "%*sNew Start State=%"UVuf" Class: [",
1931 (int)depth * 2 + 2, "",
786e8c11 1932 (UV)state));
be8e71aa 1933 if (idx >= 0) {
2b8b4781 1934 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 1935 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 1936
3dab1dad 1937 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
1938 if ( folder )
1939 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 1940 DEBUG_OPTIMISE_r(
07be1b83 1941 PerlIO_printf(Perl_debug_log, (char*)ch)
3dab1dad 1942 );
8e11feef
RGS
1943 }
1944 }
1945 TRIE_BITMAP_SET(trie,*ch);
1946 if ( folder )
1947 TRIE_BITMAP_SET(trie,folder[ *ch ]);
1948 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
1949 }
1950 idx = ofs;
1951 }
3dab1dad
YO
1952 }
1953 if ( count == 1 ) {
2b8b4781 1954 SV **tmp = av_fetch( revcharmap, idx, 0);
de734bd5
A
1955 char *ch = SvPV_nolen( *tmp );
1956 DEBUG_OPTIMISE_r({
1957 SV *sv=sv_newmortal();
8e11feef
RGS
1958 PerlIO_printf( Perl_debug_log,
1959 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
1960 (int)depth * 2 + 2, "",
de734bd5
A
1961 (UV)state, (UV)idx,
1962 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
1963 PL_colors[0], PL_colors[1],
1964 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1965 PERL_PV_ESCAPE_FIRSTCHAR
1966 )
1967 );
1968 });
3dab1dad
YO
1969 if ( state==1 ) {
1970 OP( convert ) = nodetype;
1971 str=STRING(convert);
1972 STR_LEN(convert)=0;
1973 }
de734bd5
A
1974 while (*ch) {
1975 *str++ = *ch++;
1976 STR_LEN(convert)++;
1977 }
1978
8e11feef 1979 } else {
f9049ba1 1980#ifdef DEBUGGING
8e11feef
RGS
1981 if (state>1)
1982 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 1983#endif
8e11feef
RGS
1984 break;
1985 }
1986 }
3dab1dad 1987 if (str) {
8e11feef 1988 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 1989 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 1990 trie->startstate = state;
07be1b83
YO
1991 trie->minlen -= (state - 1);
1992 trie->maxlen -= (state - 1);
1993 DEBUG_r({
1994 regnode *fix = convert;
de734bd5 1995 U32 word = trie->wordcount;
07be1b83
YO
1996 mjd_nodelen++;
1997 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
1998 while( ++fix < n ) {
1999 Set_Node_Offset_Length(fix, 0, 0);
2000 }
de734bd5 2001 while (word--) {
2b8b4781 2002 SV ** const tmp = av_fetch( trie_words, word, 0 );
de734bd5
A
2003 if (tmp) {
2004 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2005 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2006 else
2007 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2008 }
2009 }
07be1b83 2010 });
8e11feef
RGS
2011 if (trie->maxlen) {
2012 convert = n;
2013 } else {
3dab1dad 2014 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2015 DEBUG_r(optimize= n);
3dab1dad
YO
2016 }
2017 }
2018 }
a5ca303d
YO
2019 if (!jumper)
2020 jumper = last;
3dab1dad 2021 if ( trie->maxlen ) {
8e11feef
RGS
2022 NEXT_OFF( convert ) = (U16)(tail - convert);
2023 ARG_SET( convert, data_slot );
786e8c11
YO
2024 /* Store the offset to the first unabsorbed branch in
2025 jump[0], which is otherwise unused by the jump logic.
2026 We use this when dumping a trie and during optimisation. */
2027 if (trie->jump)
7f69552c 2028 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2029
786e8c11
YO
2030 /* XXXX */
2031 if ( !trie->states[trie->startstate].wordnum && trie->bitmap &&
1de06328 2032 ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2033 {
2034 OP( convert ) = TRIEC;
2035 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2036 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2037 trie->bitmap= NULL;
2038 } else
2039 OP( convert ) = TRIE;
a3621e74 2040
3dab1dad
YO
2041 /* store the type in the flags */
2042 convert->flags = nodetype;
a5ca303d
YO
2043 DEBUG_r({
2044 optimize = convert
2045 + NODE_STEP_REGNODE
2046 + regarglen[ OP( convert ) ];
2047 });
2048 /* XXX We really should free up the resource in trie now,
2049 as we won't use them - (which resources?) dmq */
3dab1dad 2050 }
a3621e74 2051 /* needed for dumping*/
e62cc96a 2052 DEBUG_r(if (optimize) {
07be1b83 2053 regnode *opt = convert;
bcdf7404 2054
e62cc96a 2055 while ( ++opt < optimize) {
07be1b83
YO
2056 Set_Node_Offset_Length(opt,0,0);
2057 }
786e8c11
YO
2058 /*
2059 Try to clean up some of the debris left after the
2060 optimisation.
a3621e74 2061 */
786e8c11 2062 while( optimize < jumper ) {
07be1b83 2063 mjd_nodelen += Node_Length((optimize));
a3621e74 2064 OP( optimize ) = OPTIMIZED;
07be1b83 2065 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2066 optimize++;
2067 }
07be1b83 2068 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2069 });
2070 } /* end node insert */
55eed653 2071 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2072#ifdef DEBUGGING
2073 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2074 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2075#else
2076 SvREFCNT_dec(revcharmap);
07be1b83 2077#endif
786e8c11
YO
2078 return trie->jump
2079 ? MADE_JUMP_TRIE
2080 : trie->startstate>1
2081 ? MADE_EXACT_TRIE
2082 : MADE_TRIE;
2083}
2084
2085STATIC void
2086S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2087{
2088/* The Trie is constructed and compressed now so we can build a fail array now if its needed
2089
2090 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2091 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2092 ISBN 0-201-10088-6
2093
2094 We find the fail state for each state in the trie, this state is the longest proper
2095 suffix of the current states 'word' that is also a proper prefix of another word in our
2096 trie. State 1 represents the word '' and is the thus the default fail state. This allows
2097 the DFA not to have to restart after its tried and failed a word at a given point, it
2098 simply continues as though it had been matching the other word in the first place.
2099 Consider
2100 'abcdgu'=~/abcdefg|cdgu/
2101 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
2102 fail, which would bring use to the state representing 'd' in the second word where we would
2103 try 'g' and succeed, prodceding to match 'cdgu'.
2104 */
2105 /* add a fail transition */
3251b653
NC
2106 const U32 trie_offset = ARG(source);
2107 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2108 U32 *q;
2109 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2110 const U32 numstates = trie->statecount;
786e8c11
YO
2111 const U32 ubound = trie->lasttrans + ucharcount;
2112 U32 q_read = 0;
2113 U32 q_write = 0;
2114 U32 charid;
2115 U32 base = trie->states[ 1 ].trans.base;
2116 U32 *fail;
2117 reg_ac_data *aho;
2118 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2119 GET_RE_DEBUG_FLAGS_DECL;
2120#ifndef DEBUGGING
2121 PERL_UNUSED_ARG(depth);
2122#endif
2123
2124
2125 ARG_SET( stclass, data_slot );
c944940b 2126 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2127 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2128 aho->trie=trie_offset;
446bd890
NC
2129 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2130 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2131 Newxz( q, numstates, U32);
c944940b 2132 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2133 aho->refcount = 1;
2134 fail = aho->fail;
2135 /* initialize fail[0..1] to be 1 so that we always have
2136 a valid final fail state */
2137 fail[ 0 ] = fail[ 1 ] = 1;
2138
2139 for ( charid = 0; charid < ucharcount ; charid++ ) {
2140 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2141 if ( newstate ) {
2142 q[ q_write ] = newstate;
2143 /* set to point at the root */
2144 fail[ q[ q_write++ ] ]=1;
2145 }
2146 }
2147 while ( q_read < q_write) {
2148 const U32 cur = q[ q_read++ % numstates ];
2149 base = trie->states[ cur ].trans.base;
2150
2151 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2152 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2153 if (ch_state) {
2154 U32 fail_state = cur;
2155 U32 fail_base;
2156 do {
2157 fail_state = fail[ fail_state ];
2158 fail_base = aho->states[ fail_state ].trans.base;
2159 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2160
2161 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2162 fail[ ch_state ] = fail_state;
2163 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2164 {
2165 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2166 }
2167 q[ q_write++ % numstates] = ch_state;
2168 }
2169 }
2170 }
2171 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2172 when we fail in state 1, this allows us to use the
2173 charclass scan to find a valid start char. This is based on the principle
2174 that theres a good chance the string being searched contains lots of stuff
2175 that cant be a start char.
2176 */
2177 fail[ 0 ] = fail[ 1 ] = 0;
2178 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2179 PerlIO_printf(Perl_debug_log,
2180 "%*sStclass Failtable (%"UVuf" states): 0",
2181 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2182 );
786e8c11
YO
2183 for( q_read=1; q_read<numstates; q_read++ ) {
2184 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2185 }
2186 PerlIO_printf(Perl_debug_log, "\n");
2187 });
2188 Safefree(q);
2189 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2190}
2191
786e8c11 2192
a3621e74 2193/*
5d1c421c
JH
2194 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2195 * These need to be revisited when a newer toolchain becomes available.
2196 */
2197#if defined(__sparc64__) && defined(__GNUC__)
2198# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2199# undef SPARC64_GCC_WORKAROUND
2200# define SPARC64_GCC_WORKAROUND 1
2201# endif
2202#endif
2203
07be1b83 2204#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2205 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2206 SV * const mysv=sv_newmortal(); \
2207 regnode *Next = regnext(scan); \
2208 regprop(RExC_rx, mysv, scan); \
7f69552c 2209 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2210 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2211 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2212 }});
07be1b83 2213
1de06328
YO
2214
2215
2216
2217
07be1b83
YO
2218#define JOIN_EXACT(scan,min,flags) \
2219 if (PL_regkind[OP(scan)] == EXACT) \
2220 join_exact(pRExC_state,(scan),(min),(flags),NULL,depth+1)
2221
be8e71aa 2222STATIC U32
07be1b83
YO
2223S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, I32 *min, U32 flags,regnode *val, U32 depth) {
2224 /* Merge several consecutive EXACTish nodes into one. */
2225 regnode *n = regnext(scan);
2226 U32 stringok = 1;
2227 regnode *next = scan + NODE_SZ_STR(scan);
2228 U32 merged = 0;
2229 U32 stopnow = 0;
2230#ifdef DEBUGGING
2231 regnode *stop = scan;
72f13be8 2232 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2233#else
d47053eb
RGS
2234 PERL_UNUSED_ARG(depth);
2235#endif
2236#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2237 PERL_UNUSED_ARG(flags);
2238 PERL_UNUSED_ARG(val);
07be1b83 2239#endif
07be1b83
YO
2240 DEBUG_PEEP("join",scan,depth);
2241
2242 /* Skip NOTHING, merge EXACT*. */
2243 while (n &&
2244 ( PL_regkind[OP(n)] == NOTHING ||
2245 (stringok && (OP(n) == OP(scan))))
2246 && NEXT_OFF(n)
2247 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX) {
2248
2249 if (OP(n) == TAIL || n > next)
2250 stringok = 0;
2251 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2252 DEBUG_PEEP("skip:",n,depth);
2253 NEXT_OFF(scan) += NEXT_OFF(n);
2254 next = n + NODE_STEP_REGNODE;
2255#ifdef DEBUGGING
2256 if (stringok)
2257 stop = n;
2258#endif
2259 n = regnext(n);
2260 }
2261 else if (stringok) {
786e8c11 2262 const unsigned int oldl = STR_LEN(scan);
07be1b83
YO
2263 regnode * const nnext = regnext(n);
2264
2265 DEBUG_PEEP("merg",n,depth);
2266
2267 merged++;
2268 if (oldl + STR_LEN(n) > U8_MAX)
2269 break;
2270 NEXT_OFF(scan) += NEXT_OFF(n);
2271 STR_LEN(scan) += STR_LEN(n);
2272 next = n + NODE_SZ_STR(n);
2273 /* Now we can overwrite *n : */
2274 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2275#ifdef DEBUGGING
2276 stop = next - 1;
2277#endif
2278 n = nnext;
2279 if (stopnow) break;
2280 }
2281
d47053eb
RGS
2282#ifdef EXPERIMENTAL_INPLACESCAN
2283 if (flags && !NEXT_OFF(n)) {
2284 DEBUG_PEEP("atch", val, depth);
2285 if (reg_off_by_arg[OP(n)]) {
2286 ARG_SET(n, val - n);
2287 }
2288 else {
2289 NEXT_OFF(n) = val - n;
2290 }
2291 stopnow = 1;
2292 }
07be1b83
YO
2293#endif
2294 }
2295
2296 if (UTF && ( OP(scan) == EXACTF ) && ( STR_LEN(scan) >= 6 ) ) {
2297 /*
2298 Two problematic code points in Unicode casefolding of EXACT nodes:
2299
2300 U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2301 U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2302
2303 which casefold to
2304
2305 Unicode UTF-8
2306
2307 U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2308 U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2309
2310 This means that in case-insensitive matching (or "loose matching",
2311 as Unicode calls it), an EXACTF of length six (the UTF-8 encoded byte
2312 length of the above casefolded versions) can match a target string
2313 of length two (the byte length of UTF-8 encoded U+0390 or U+03B0).
2314 This would rather mess up the minimum length computation.
2315
2316 What we'll do is to look for the tail four bytes, and then peek
2317 at the preceding two bytes to see whether we need to decrease
2318 the minimum length by four (six minus two).
2319
2320 Thanks to the design of UTF-8, there cannot be false matches:
2321 A sequence of valid UTF-8 bytes cannot be a subsequence of
2322 another valid sequence of UTF-8 bytes.
2323
2324 */
2325 char * const s0 = STRING(scan), *s, *t;
2326 char * const s1 = s0 + STR_LEN(scan) - 1;
2327 char * const s2 = s1 - 4;
e294cc5d
JH
2328#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
2329 const char t0[] = "\xaf\x49\xaf\x42";
2330#else
07be1b83 2331 const char t0[] = "\xcc\x88\xcc\x81";
e294cc5d 2332#endif
07be1b83
YO
2333 const char * const t1 = t0 + 3;
2334
2335 for (s = s0 + 2;
2336 s < s2 && (t = ninstr(s, s1, t0, t1));
2337 s = t + 4) {
e294cc5d
JH
2338#ifdef EBCDIC
2339 if (((U8)t[-1] == 0x68 && (U8)t[-2] == 0xB4) ||
2340 ((U8)t[-1] == 0x46 && (U8)t[-2] == 0xB5))
2341#else
07be1b83
YO
2342 if (((U8)t[-1] == 0xB9 && (U8)t[-2] == 0xCE) ||
2343 ((U8)t[-1] == 0x85 && (U8)t[-2] == 0xCF))
e294cc5d 2344#endif
07be1b83
YO
2345 *min -= 4;
2346 }
2347 }
2348
2349#ifdef DEBUGGING
2350 /* Allow dumping */
2351 n = scan + NODE_SZ_STR(scan);
2352 while (n <= stop) {
2353 if (PL_regkind[OP(n)] != NOTHING || OP(n) == NOTHING) {
2354 OP(n) = OPTIMIZED;
2355 NEXT_OFF(n) = 0;
2356 }
2357 n++;
2358 }
2359#endif
2360 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2361 return stopnow;
2362}
2363
653099ff
GS
2364/* REx optimizer. Converts nodes into quickier variants "in place".
2365 Finds fixed substrings. */
2366
a0288114 2367/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2368 to the position after last scanned or to NULL. */
2369
40d049e4
YO
2370#define INIT_AND_WITHP \
2371 assert(!and_withp); \
2372 Newx(and_withp,1,struct regnode_charclass_class); \
2373 SAVEFREEPV(and_withp)
07be1b83 2374
b515a41d
YO
2375/* this is a chain of data about sub patterns we are processing that
2376 need to be handled seperately/specially in study_chunk. Its so
2377 we can simulate recursion without losing state. */
2378struct scan_frame;
2379typedef struct scan_frame {
2380 regnode *last; /* last node to process in this frame */
2381 regnode *next; /* next node to process when last is reached */
2382 struct scan_frame *prev; /*previous frame*/
2383 I32 stop; /* what stopparen do we use */
2384} scan_frame;
2385
304ee84b
YO
2386
2387#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2388
76e3520e 2389STATIC I32
40d049e4 2390S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2391 I32 *minlenp, I32 *deltap,
40d049e4
YO
2392 regnode *last,
2393 scan_data_t *data,
2394 I32 stopparen,
2395 U8* recursed,
2396 struct regnode_charclass_class *and_withp,
2397 U32 flags, U32 depth)
c277df42
IZ
2398 /* scanp: Start here (read-write). */
2399 /* deltap: Write maxlen-minlen here. */
2400 /* last: Stop before this one. */
40d049e4
YO
2401 /* data: string data about the pattern */
2402 /* stopparen: treat close N as END */
2403 /* recursed: which subroutines have we recursed into */
2404 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2405{
97aff369 2406 dVAR;
c277df42
IZ
2407 I32 min = 0, pars = 0, code;
2408 regnode *scan = *scanp, *next;
2409 I32 delta = 0;
2410 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2411 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2412 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2413 scan_data_t data_fake;
a3621e74 2414 SV *re_trie_maxbuff = NULL;
786e8c11 2415 regnode *first_non_open = scan;
e2e6a0f1 2416 I32 stopmin = I32_MAX;
8aa23a47
YO
2417 scan_frame *frame = NULL;
2418
a3621e74 2419 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2420
13a24bad 2421#ifdef DEBUGGING
40d049e4 2422 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2423#endif
40d049e4 2424
786e8c11 2425 if ( depth == 0 ) {
40d049e4 2426 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2427 first_non_open=regnext(first_non_open);
2428 }
2429
b81d288d 2430
8aa23a47
YO
2431 fake_study_recurse:
2432 while ( scan && OP(scan) != END && scan < last ){
2433 /* Peephole optimizer: */
304ee84b 2434 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47
YO
2435 DEBUG_PEEP("Peep",scan,depth);
2436 JOIN_EXACT(scan,&min,0);
2437
2438 /* Follow the next-chain of the current node and optimize
2439 away all the NOTHINGs from it. */
2440 if (OP(scan) != CURLYX) {
2441 const int max = (reg_off_by_arg[OP(scan)]
2442 ? I32_MAX
2443 /* I32 may be smaller than U16 on CRAYs! */
2444 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
2445 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
2446 int noff;
2447 regnode *n = scan;
2448
2449 /* Skip NOTHING and LONGJMP. */
2450 while ((n = regnext(n))
2451 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
2452 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
2453 && off + noff < max)
2454 off += noff;
2455 if (reg_off_by_arg[OP(scan)])
2456 ARG(scan) = off;
2457 else
2458 NEXT_OFF(scan) = off;
2459 }
a3621e74 2460
c277df42 2461
8aa23a47
YO
2462
2463 /* The principal pseudo-switch. Cannot be a switch, since we
2464 look into several different things. */
2465 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
2466 || OP(scan) == IFTHEN) {
2467 next = regnext(scan);
2468 code = OP(scan);
2469 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
2470
2471 if (OP(next) == code || code == IFTHEN) {
2472 /* NOTE - There is similar code to this block below for handling
2473 TRIE nodes on a re-study. If you change stuff here check there
2474 too. */
2475 I32 max1 = 0, min1 = I32_MAX, num = 0;
2476 struct regnode_charclass_class accum;
2477 regnode * const startbranch=scan;
2478
2479 if (flags & SCF_DO_SUBSTR)
304ee84b 2480 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
2481 if (flags & SCF_DO_STCLASS)
2482 cl_init_zero(pRExC_state, &accum);
2483
2484 while (OP(scan) == code) {
2485 I32 deltanext, minnext, f = 0, fake;
2486 struct regnode_charclass_class this_class;
2487
2488 num++;
2489 data_fake.flags = 0;
2490 if (data) {
2491 data_fake.whilem_c = data->whilem_c;
2492 data_fake.last_closep = data->last_closep;
2493 }
2494 else
2495 data_fake.last_closep = &fake;
58e23c8d
YO
2496
2497 data_fake.pos_delta = delta;
8aa23a47
YO
2498 next = regnext(scan);
2499 scan = NEXTOPER(scan);
2500 if (code != BRANCH)
c277df42 2501 scan = NEXTOPER(scan);
8aa23a47
YO
2502 if (flags & SCF_DO_STCLASS) {
2503 cl_init(pRExC_state, &this_class);
2504 data_fake.start_class = &this_class;
2505 f = SCF_DO_STCLASS_AND;
58e23c8d 2506 }
8aa23a47
YO
2507 if (flags & SCF_WHILEM_VISITED_POS)
2508 f |= SCF_WHILEM_VISITED_POS;
2509
2510 /* we suppose the run is continuous, last=next...*/
2511 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2512 next, &data_fake,
2513 stopparen, recursed, NULL, f,depth+1);
2514 if (min1 > minnext)
2515 min1 = minnext;
2516 if (max1 < minnext + deltanext)
2517 max1 = minnext + deltanext;
2518 if (deltanext == I32_MAX)
2519 is_inf = is_inf_internal = 1;
2520 scan = next;
2521 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
2522 pars++;
2523 if (data_fake.flags & SCF_SEEN_ACCEPT) {
2524 if ( stopmin > minnext)
2525 stopmin = min + min1;
2526 flags &= ~SCF_DO_SUBSTR;
2527 if (data)
2528 data->flags |= SCF_SEEN_ACCEPT;
2529 }
2530 if (data) {
2531 if (data_fake.flags & SF_HAS_EVAL)
2532 data->flags |= SF_HAS_EVAL;
2533 data->whilem_c = data_fake.whilem_c;
3dab1dad 2534 }
8aa23a47
YO
2535 if (flags & SCF_DO_STCLASS)
2536 cl_or(pRExC_state, &accum, &this_class);
2537 }
2538 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
2539 min1 = 0;
2540 if (flags & SCF_DO_SUBSTR) {
2541 data->pos_min += min1;
2542 data->pos_delta += max1 - min1;
2543 if (max1 != min1 || is_inf)
2544 data->longest = &(data->longest_float);
2545 }
2546 min += min1;
2547 delta += max1 - min1;
2548 if (flags & SCF_DO_STCLASS_OR) {
2549 cl_or(pRExC_state, data->start_class, &accum);
2550 if (min1) {
2551 cl_and(data->start_class, and_withp);
2552 flags &= ~SCF_DO_STCLASS;
653099ff 2553 }
8aa23a47
YO
2554 }
2555 else if (flags & SCF_DO_STCLASS_AND) {
2556 if (min1) {
2557 cl_and(data->start_class, &accum);
2558 flags &= ~SCF_DO_STCLASS;
de0c8cb8 2559 }
8aa23a47
YO
2560 else {
2561 /* Switch to OR mode: cache the old value of
2562 * data->start_class */
2563 INIT_AND_WITHP;
2564 StructCopy(data->start_class, and_withp,
2565 struct regnode_charclass_class);
2566 flags &= ~SCF_DO_STCLASS_AND;
2567 StructCopy(&accum, data->start_class,
2568 struct regnode_charclass_class);
2569 flags |= SCF_DO_STCLASS_OR;
2570 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 2571 }
8aa23a47 2572 }
a3621e74 2573
8aa23a47
YO
2574 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
2575 /* demq.
a3621e74 2576
8aa23a47
YO
2577 Assuming this was/is a branch we are dealing with: 'scan' now
2578 points at the item that follows the branch sequence, whatever
2579 it is. We now start at the beginning of the sequence and look
2580 for subsequences of
a3621e74 2581
8aa23a47
YO
2582 BRANCH->EXACT=>x1
2583 BRANCH->EXACT=>x2
2584 tail
a3621e74 2585
8aa23a47 2586 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 2587
8aa23a47
YO
2588 If we can find such a subseqence we need to turn the first
2589 element into a trie and then add the subsequent branch exact
2590 strings to the trie.
a3621e74 2591
8aa23a47 2592 We have two cases
a3621e74 2593
8aa23a47 2594 1. patterns where the whole set of branch can be converted.
a3621e74 2595
8aa23a47 2596 2. patterns where only a subset can be converted.
a3621e74 2597
8aa23a47
YO
2598 In case 1 we can replace the whole set with a single regop
2599 for the trie. In case 2 we need to keep the start and end
2600 branchs so
a3621e74 2601
8aa23a47
YO
2602 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
2603 becomes BRANCH TRIE; BRANCH X;
786e8c11 2604
8aa23a47
YO
2605 There is an additional case, that being where there is a
2606 common prefix, which gets split out into an EXACT like node
2607 preceding the TRIE node.
a3621e74 2608
8aa23a47
YO
2609 If x(1..n)==tail then we can do a simple trie, if not we make
2610 a "jump" trie, such that when we match the appropriate word
2611 we "jump" to the appopriate tail node. Essentailly we turn
2612 a nested if into a case structure of sorts.
b515a41d 2613
8aa23a47
YO
2614 */
2615
2616 int made=0;
2617 if (!re_trie_maxbuff) {
2618 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
2619 if (!SvIOK(re_trie_maxbuff))
2620 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
2621 }
2622 if ( SvIV(re_trie_maxbuff)>=0 ) {
2623 regnode *cur;
2624 regnode *first = (regnode *)NULL;
2625 regnode *last = (regnode *)NULL;
2626 regnode *tail = scan;
2627 U8 optype = 0;
2628 U32 count=0;
a3621e74
YO
2629
2630#ifdef DEBUGGING
8aa23a47 2631 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 2632#endif
8aa23a47
YO
2633 /* var tail is used because there may be a TAIL
2634 regop in the way. Ie, the exacts will point to the
2635 thing following the TAIL, but the last branch will
2636 point at the TAIL. So we advance tail. If we
2637 have nested (?:) we may have to move through several
2638 tails.
2639 */
2640
2641 while ( OP( tail ) == TAIL ) {
2642 /* this is the TAIL generated by (?:) */
2643 tail = regnext( tail );
2644 }
a3621e74 2645
8aa23a47
YO
2646
2647 DEBUG_OPTIMISE_r({
2648 regprop(RExC_rx, mysv, tail );
2649 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
2650 (int)depth * 2 + 2, "",
2651 "Looking for TRIE'able sequences. Tail node is: ",
2652 SvPV_nolen_const( mysv )
2653 );
2654 });
2655
2656 /*
2657
2658 step through the branches, cur represents each
2659 branch, noper is the first thing to be matched
2660 as part of that branch and noper_next is the
2661 regnext() of that node. if noper is an EXACT
2662 and noper_next is the same as scan (our current
2663 position in the regex) then the EXACT branch is
2664 a possible optimization target. Once we have
2665 two or more consequetive such branches we can
2666 create a trie of the EXACT's contents and stich
2667 it in place. If the sequence represents all of
2668 the branches we eliminate the whole thing and
2669 replace it with a single TRIE. If it is a
2670 subsequence then we need to stitch it in. This
2671 means the first branch has to remain, and needs
2672 to be repointed at the item on the branch chain
2673 following the last branch optimized. This could
2674 be either a BRANCH, in which case the
2675 subsequence is internal, or it could be the
2676 item following the branch sequence in which
2677 case the subsequence is at the end.
2678
2679 */
2680
2681 /* dont use tail as the end marker for this traverse */
2682 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2683 regnode * const noper = NEXTOPER( cur );
b515a41d 2684#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 2685 regnode * const noper_next = regnext( noper );
b515a41d
YO
2686#endif
2687
8aa23a47
YO
2688 DEBUG_OPTIMISE_r({
2689 regprop(RExC_rx, mysv, cur);
2690 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
2691 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
2692
2693 regprop(RExC_rx, mysv, noper);
2694 PerlIO_printf( Perl_debug_log, " -> %s",
2695 SvPV_nolen_const(mysv));
2696
2697 if ( noper_next ) {
2698 regprop(RExC_rx, mysv, noper_next );
2699 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
2700 SvPV_nolen_const(mysv));
2701 }
2702 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
2703 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
2704 });
2705 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
2706 : PL_regkind[ OP( noper ) ] == EXACT )
2707 || OP(noper) == NOTHING )
786e8c11 2708#ifdef NOJUMPTRIE
8aa23a47 2709 && noper_next == tail
786e8c11 2710#endif
8aa23a47
YO
2711 && count < U16_MAX)
2712 {
2713 count++;
2714 if ( !first || optype == NOTHING ) {
2715 if (!first) first = cur;
2716 optype = OP( noper );
2717 } else {
2718 last = cur;
2719 }
2720 } else {
2721 if ( last ) {
2722 make_trie( pRExC_state,
2723 startbranch, first, cur, tail, count,
2724 optype, depth+1 );
2725 }
2726 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 2727#ifdef NOJUMPTRIE
8aa23a47 2728 && noper_next == tail
786e8c11 2729#endif
8aa23a47
YO
2730 ){
2731 count = 1;
2732 first = cur;
2733 optype = OP( noper );
2734 } else {
2735 count = 0;
2736 first = NULL;
2737 optype = 0;
2738 }
2739 last = NULL;
2740 }
2741 }
2742 DEBUG_OPTIMISE_r({
2743 regprop(RExC_rx, mysv, cur);
2744 PerlIO_printf( Perl_debug_log,
2745 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
2746 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
2747
2748 });
2749 if ( last ) {
2750 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
3dab1dad 2751#ifdef TRIE_STUDY_OPT
8aa23a47
YO
2752 if ( ((made == MADE_EXACT_TRIE &&
2753 startbranch == first)
2754 || ( first_non_open == first )) &&
2755 depth==0 ) {
2756 flags |= SCF_TRIE_RESTUDY;
2757 if ( startbranch == first
2758 && scan == tail )
2759 {
2760 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
2761 }
2762 }
3dab1dad 2763#endif
8aa23a47
YO
2764 }
2765 }
2766
2767 } /* do trie */
2768
653099ff 2769 }
8aa23a47
YO
2770 else if ( code == BRANCHJ ) { /* single branch is optimized. */
2771 scan = NEXTOPER(NEXTOPER(scan));
2772 } else /* single branch is optimized. */
2773 scan = NEXTOPER(scan);
2774 continue;
2775 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
2776 scan_frame *newframe = NULL;
2777 I32 paren;
2778 regnode *start;
2779 regnode *end;
2780
2781 if (OP(scan) != SUSPEND) {
2782 /* set the pointer */
2783 if (OP(scan) == GOSUB) {
2784 paren = ARG(scan);
2785 RExC_recurse[ARG2L(scan)] = scan;
2786 start = RExC_open_parens[paren-1];
2787 end = RExC_close_parens[paren-1];
2788 } else {
2789 paren = 0;
f8fc2ecf 2790 start = RExC_rxi->program + 1;
8aa23a47
YO
2791 end = RExC_opend;
2792 }
2793 if (!recursed) {
2794 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
2795 SAVEFREEPV(recursed);
2796 }
2797 if (!PAREN_TEST(recursed,paren+1)) {
2798 PAREN_SET(recursed,paren+1);
2799 Newx(newframe,1,scan_frame);
2800 } else {
2801 if (flags & SCF_DO_SUBSTR) {
304ee84b 2802 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
2803 data->longest = &(data->longest_float);
2804 }
2805 is_inf = is_inf_internal = 1;
2806 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2807 cl_anything(pRExC_state, data->start_class);
2808 flags &= ~SCF_DO_STCLASS;
2809 }
2810 } else {
2811 Newx(newframe,1,scan_frame);
2812 paren = stopparen;
2813 start = scan+2;
2814 end = regnext(scan);
2815 }
2816 if (newframe) {
2817 assert(start);
2818 assert(end);
2819 SAVEFREEPV(newframe);
2820 newframe->next = regnext(scan);
2821 newframe->last = last;
2822 newframe->stop = stopparen;
2823 newframe->prev = frame;
2824
2825 frame = newframe;
2826 scan = start;
2827 stopparen = paren;
2828 last = end;
2829
2830 continue;
2831 }
2832 }
2833 else if (OP(scan) == EXACT) {
2834 I32 l = STR_LEN(scan);
2835 UV uc;
2836 if (UTF) {
2837 const U8 * const s = (U8*)STRING(scan);
2838 l = utf8_length(s, s + l);
2839 uc = utf8_to_uvchr(s, NULL);
2840 } else {
2841 uc = *((U8*)STRING(scan));
2842 }
2843 min += l;
2844 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2845 /* The code below prefers earlier match for fixed
2846 offset, later match for variable offset. */
2847 if (data->last_end == -1) { /* Update the start info. */
2848 data->last_start_min = data->pos_min;
2849 data->last_start_max = is_inf
2850 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 2851 }
8aa23a47
YO
2852 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
2853 if (UTF)
2854 SvUTF8_on(data->last_found);
2855 {
2856 SV * const sv = data->last_found;
2857 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2858 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2859 if (mg && mg->mg_len >= 0)
2860 mg->mg_len += utf8_length((U8*)STRING(scan),
2861 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 2862 }
8aa23a47
YO
2863 data->last_end = data->pos_min + l;
2864 data->pos_min += l; /* As in the first entry. */
2865 data->flags &= ~SF_BEFORE_EOL;
2866 }
2867 if (flags & SCF_DO_STCLASS_AND) {
2868 /* Check whether it is compatible with what we know already! */
2869 int compat = 1;
2870
2871 if (uc >= 0x100 ||
2872 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2873 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2874 && (!(data->start_class->flags & ANYOF_FOLD)
2875 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2876 )
2877 compat = 0;
2878 ANYOF_CLASS_ZERO(data->start_class);
2879 ANYOF_BITMAP_ZERO(data->start_class);
2880 if (compat)
2881 ANYOF_BITMAP_SET(data->start_class, uc);
2882 data->start_class->flags &= ~ANYOF_EOS;
2883 if (uc < 0x100)
2884 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
2885 }
2886 else if (flags & SCF_DO_STCLASS_OR) {
2887 /* false positive possible if the class is case-folded */
2888 if (uc < 0x100)
2889 ANYOF_BITMAP_SET(data->start_class, uc);
2890 else
2891 data->start_class->flags |= ANYOF_UNICODE_ALL;
2892 data->start_class->flags &= ~ANYOF_EOS;
2893 cl_and(data->start_class, and_withp);
2894 }
2895 flags &= ~SCF_DO_STCLASS;
2896 }
2897 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
2898 I32 l = STR_LEN(scan);
2899 UV uc = *((U8*)STRING(scan));
2900
2901 /* Search for fixed substrings supports EXACT only. */
2902 if (flags & SCF_DO_SUBSTR) {
2903 assert(data);
304ee84b 2904 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
2905 }
2906 if (UTF) {
2907 const U8 * const s = (U8 *)STRING(scan);
2908 l = utf8_length(s, s + l);
2909 uc = utf8_to_uvchr(s, NULL);
2910 }
2911 min += l;
2912 if (flags & SCF_DO_SUBSTR)
2913 data->pos_min += l;
2914 if (flags & SCF_DO_STCLASS_AND) {
2915 /* Check whether it is compatible with what we know already! */
2916 int compat = 1;
2917
2918 if (uc >= 0x100 ||
2919 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
2920 && !ANYOF_BITMAP_TEST(data->start_class, uc)
2921 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold[uc])))
2922 compat = 0;
2923 ANYOF_CLASS_ZERO(data->start_class);
2924 ANYOF_BITMAP_ZERO(data->start_class);
2925 if (compat) {
2926 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 2927 data->start_class->flags &= ~ANYOF_EOS;
8aa23a47
YO
2928 data->start_class->flags |= ANYOF_FOLD;
2929 if (OP(scan) == EXACTFL)
2930 data->start_class->flags |= ANYOF_LOCALE;
653099ff 2931 }
8aa23a47
YO
2932 }
2933 else if (flags & SCF_DO_STCLASS_OR) {
2934 if (data->start_class->flags & ANYOF_FOLD) {
2935 /* false positive possible if the class is case-folded.
2936 Assume that the locale settings are the same... */
1aa99e6b
IH
2937 if (uc < 0x100)
2938 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff
GS
2939 data->start_class->flags &= ~ANYOF_EOS;
2940 }
8aa23a47 2941 cl_and(data->start_class, and_withp);
653099ff 2942 }
8aa23a47
YO
2943 flags &= ~SCF_DO_STCLASS;
2944 }
2945 else if (strchr((const char*)PL_varies,OP(scan))) {
2946 I32 mincount, maxcount, minnext, deltanext, fl = 0;
2947 I32 f = flags, pos_before = 0;
2948 regnode * const oscan = scan;
2949 struct regnode_charclass_class this_class;
2950 struct regnode_charclass_class *oclass = NULL;
2951 I32 next_is_eval = 0;
2952
2953 switch (PL_regkind[OP(scan)]) {
2954 case WHILEM: /* End of (?:...)* . */
2955 scan = NEXTOPER(scan);
2956 goto finish;
2957 case PLUS:
2958 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2959 next = NEXTOPER(scan);
2960 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
2961 mincount = 1;
2962 maxcount = REG_INFTY;
2963 next = regnext(scan);
2964 scan = NEXTOPER(scan);
2965 goto do_curly;
2966 }
2967 }
2968 if (flags & SCF_DO_SUBSTR)
2969 data->pos_min++;
2970 min++;
2971 /* Fall through. */
2972 case STAR:
2973 if (flags & SCF_DO_STCLASS) {
2974 mincount = 0;
2975 maxcount = REG_INFTY;
2976 next = regnext(scan);
2977 scan = NEXTOPER(scan);
2978 goto do_curly;
2979 }
2980 is_inf = is_inf_internal = 1;
2981 scan = regnext(scan);
c277df42 2982 if (flags & SCF_DO_SUBSTR) {
304ee84b 2983 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 2984 data->longest = &(data->longest_float);
c277df42 2985 }
8aa23a47
YO
2986 goto optimize_curly_tail;
2987 case CURLY:
2988 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2989 && (scan->flags == stopparen))
2990 {
2991 mincount = 1;
2992 maxcount = 1;
2993 } else {
2994 mincount = ARG1(scan);
2995 maxcount = ARG2(scan);
653099ff 2996 }
8aa23a47
YO
2997 next = regnext(scan);
2998 if (OP(scan) == CURLYX) {
2999 I32 lp = (data ? *(data->last_closep) : 0);
3000 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3001 }
8aa23a47
YO
3002 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3003 next_is_eval = (OP(scan) == EVAL);
3004 do_curly:
3005 if (flags & SCF_DO_SUBSTR) {
304ee84b 3006 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3007 pos_before = data->pos_min;
b45f050a 3008 }
8aa23a47
YO
3009 if (data) {
3010 fl = data->flags;
3011 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3012 if (is_inf)
3013 data->flags |= SF_IS_INF;
3014 }
3015 if (flags & SCF_DO_STCLASS) {
3016 cl_init(pRExC_state, &this_class);
3017 oclass = data->start_class;
3018 data->start_class = &this_class;
3019 f |= SCF_DO_STCLASS_AND;
3020 f &= ~SCF_DO_STCLASS_OR;
3021 }
3022 /* These are the cases when once a subexpression
3023 fails at a particular position, it cannot succeed
3024 even after backtracking at the enclosing scope.
3025
3026 XXXX what if minimal match and we are at the
3027 initial run of {n,m}? */
3028 if ((mincount != maxcount - 1) && (maxcount != REG_INFTY))
3029 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3030
8aa23a47
YO
3031 /* This will finish on WHILEM, setting scan, or on NULL: */
3032 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3033 last, data, stopparen, recursed, NULL,
3034 (mincount == 0
3035 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3036
8aa23a47
YO
3037 if (flags & SCF_DO_STCLASS)
3038 data->start_class = oclass;
3039 if (mincount == 0 || minnext == 0) {
3040 if (flags & SCF_DO_STCLASS_OR) {
3041 cl_or(pRExC_state, data->start_class, &this_class);
3042 }
3043 else if (flags & SCF_DO_STCLASS_AND) {
3044 /* Switch to OR mode: cache the old value of
3045 * data->start_class */
3046 INIT_AND_WITHP;
3047 StructCopy(data->start_class, and_withp,
3048 struct regnode_charclass_class);
3049 flags &= ~SCF_DO_STCLASS_AND;
3050 StructCopy(&this_class, data->start_class,
3051 struct regnode_charclass_class);
3052 flags |= SCF_DO_STCLASS_OR;
3053 data->start_class->flags |= ANYOF_EOS;
3054 }
3055 } else { /* Non-zero len */
3056 if (flags & SCF_DO_STCLASS_OR) {
3057 cl_or(pRExC_state, data->start_class, &this_class);
3058 cl_and(data->start_class, and_withp);
3059 }
3060 else if (flags & SCF_DO_STCLASS_AND)
3061 cl_and(data->start_class, &this_class);
3062 flags &= ~SCF_DO_STCLASS;
3063 }
3064 if (!scan) /* It was not CURLYX, but CURLY. */
3065 scan = next;
3066 if ( /* ? quantifier ok, except for (?{ ... }) */
3067 (next_is_eval || !(mincount == 0 && maxcount == 1))
3068 && (minnext == 0) && (deltanext == 0)
3069 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
3070 && maxcount <= REG_INFTY/3 /* Complement check for big count */
3071 && ckWARN(WARN_REGEXP))
3072 {
3073 vWARN(RExC_parse,
3074 "Quantifier unexpected on zero-length expression");
3075 }
3076
3077 min += minnext * mincount;
3078 is_inf_internal |= ((maxcount == REG_INFTY
3079 && (minnext + deltanext) > 0)
3080 || deltanext == I32_MAX);
3081 is_inf |= is_inf_internal;
3082 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3083
3084 /* Try powerful optimization CURLYX => CURLYN. */
3085 if ( OP(oscan) == CURLYX && data
3086 && data->flags & SF_IN_PAR
3087 && !(data->flags & SF_HAS_EVAL)
3088 && !deltanext && minnext == 1 ) {
3089 /* Try to optimize to CURLYN. */
3090 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3091 regnode * const nxt1 = nxt;
497b47a8 3092#ifdef DEBUGGING
8aa23a47 3093 regnode *nxt2;
497b47a8 3094#endif
c277df42 3095
8aa23a47
YO
3096 /* Skip open. */
3097 nxt = regnext(nxt);
3098 if (!strchr((const char*)PL_simple,OP(nxt))
3099 && !(PL_regkind[OP(nxt)] == EXACT
3100 && STR_LEN(nxt) == 1))
3101 goto nogo;
497b47a8 3102#ifdef DEBUGGING
8aa23a47 3103 nxt2 = nxt;
497b47a8 3104#endif
8aa23a47
YO
3105 nxt = regnext(nxt);
3106 if (OP(nxt) != CLOSE)
3107 goto nogo;
3108 if (RExC_open_parens) {
3109 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3110 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3111 }
3112 /* Now we know that nxt2 is the only contents: */
3113 oscan->flags = (U8)ARG(nxt);
3114 OP(oscan) = CURLYN;
3115 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3116
c277df42 3117#ifdef DEBUGGING
8aa23a47
YO
3118 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3119 NEXT_OFF(nxt1+ 1) = 0; /* just for consistancy. */
3120 NEXT_OFF(nxt2) = 0; /* just for consistancy with CURLY. */
3121 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3122 OP(nxt + 1) = OPTIMIZED; /* was count. */
3123 NEXT_OFF(nxt+ 1) = 0; /* just for consistancy. */
b81d288d 3124#endif
8aa23a47
YO
3125 }
3126 nogo:
3127
3128 /* Try optimization CURLYX => CURLYM. */
3129 if ( OP(oscan) == CURLYX && data
3130 && !(data->flags & SF_HAS_PAR)
3131 && !(data->flags & SF_HAS_EVAL)
3132 && !deltanext /* atom is fixed width */
3133 && minnext != 0 /* CURLYM can't handle zero width */
3134 ) {
3135 /* XXXX How to optimize if data == 0? */
3136 /* Optimize to a simpler form. */
3137 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3138 regnode *nxt2;
3139
3140 OP(oscan) = CURLYM;
3141 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3142 && (OP(nxt2) != WHILEM))
3143 nxt = nxt2;
3144 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3145 /* Need to optimize away parenths. */
3146 if (data->flags & SF_IN_PAR) {
3147 /* Set the parenth number. */
3148 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3149
3150 if (OP(nxt) != CLOSE)
3151 FAIL("Panic opt close");
3152 oscan->flags = (U8)ARG(nxt);
3153 if (RExC_open_parens) {
3154 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3155 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3156 }
8aa23a47
YO
3157 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3158 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3159
c277df42 3160#ifdef DEBUGGING
8aa23a47
YO
3161 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3162 OP(nxt + 1) = OPTIMIZED; /* was count. */
3163 NEXT_OFF(nxt1 + 1) = 0; /* just for consistancy. */
3164 NEXT_OFF(nxt + 1) = 0; /* just for consistancy. */
b81d288d 3165#endif
c277df42 3166#if 0
8aa23a47
YO
3167 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3168 regnode *nnxt = regnext(nxt1);
3169
3170 if (nnxt == nxt) {
3171 if (reg_off_by_arg[OP(nxt1)])
3172 ARG_SET(nxt1, nxt2 - nxt1);
3173 else if (nxt2 - nxt1 < U16_MAX)
3174 NEXT_OFF(nxt1) = nxt2 - nxt1;
3175 else
3176 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3177 }
8aa23a47 3178 nxt1 = nnxt;
c277df42 3179 }
5d1c421c 3180#endif
8aa23a47
YO
3181 /* Optimize again: */
3182 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3183 NULL, stopparen, recursed, NULL, 0,depth+1);
3184 }
3185 else
3186 oscan->flags = 0;
3187 }
3188 else if ((OP(oscan) == CURLYX)
3189 && (flags & SCF_WHILEM_VISITED_POS)
3190 /* See the comment on a similar expression above.
3191 However, this time it not a subexpression
3192 we care about, but the expression itself. */
3193 && (maxcount == REG_INFTY)
3194 && data && ++data->whilem_c < 16) {
3195 /* This stays as CURLYX, we can put the count/of pair. */
3196 /* Find WHILEM (as in regexec.c) */
3197 regnode *nxt = oscan + NEXT_OFF(oscan);
3198
3199 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3200 nxt += ARG(nxt);
3201 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3202 | (RExC_whilem_seen << 4)); /* On WHILEM */
3203 }
3204 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3205 pars++;
3206 if (flags & SCF_DO_SUBSTR) {
3207 SV *last_str = NULL;
3208 int counted = mincount != 0;
a0ed51b3 3209
8aa23a47
YO
3210 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3211#if defined(SPARC64_GCC_WORKAROUND)
3212 I32 b = 0;
3213 STRLEN l = 0;
3214 const char *s = NULL;
3215 I32 old = 0;
b515a41d 3216
8aa23a47
YO
3217 if (pos_before >= data->last_start_min)
3218 b = pos_before;
3219 else
3220 b = data->last_start_min;
b515a41d 3221
8aa23a47
YO
3222 l = 0;
3223 s = SvPV_const(data->last_found, l);
3224 old = b - data->last_start_min;
3225
3226#else
3227 I32 b = pos_before >= data->last_start_min
3228 ? pos_before : data->last_start_min;
3229 STRLEN l;
3230 const char * const s = SvPV_const(data->last_found, l);
3231 I32 old = b - data->last_start_min;
3232#endif
3233
3234 if (UTF)
3235 old = utf8_hop((U8*)s, old) - (U8*)s;
3236
3237 l -= old;
3238 /* Get the added string: */
3239 last_str = newSVpvn(s + old, l);
3240 if (UTF)
3241 SvUTF8_on(last_str);
3242 if (deltanext == 0 && pos_before == b) {
3243 /* What was added is a constant string */
3244 if (mincount > 1) {
3245 SvGROW(last_str, (mincount * l) + 1);
3246 repeatcpy(SvPVX(last_str) + l,
3247 SvPVX_const(last_str), l, mincount - 1);
3248 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3249 /* Add additional parts. */
3250 SvCUR_set(data->last_found,
3251 SvCUR(data->last_found) - l);
3252 sv_catsv(data->last_found, last_str);
3253 {
3254 SV * sv = data->last_found;
3255 MAGIC *mg =
3256 SvUTF8(sv) && SvMAGICAL(sv) ?
3257 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3258 if (mg && mg->mg_len >= 0)
3259 mg->mg_len += CHR_SVLEN(last_str);
b515a41d 3260 }
8aa23a47 3261 data->last_end += l * (mincount - 1);
b515a41d 3262 }
8aa23a47
YO
3263 } else {
3264 /* start offset must point into the last copy */
3265 data->last_start_min += minnext * (mincount - 1);
3266 data->last_start_max += is_inf ? I32_MAX
3267 : (maxcount - 1) * (minnext + data->pos_delta);
3268 }
c277df42 3269 }
8aa23a47
YO
3270 /* It is counted once already... */
3271 data->pos_min += minnext * (mincount - counted);
3272 data->pos_delta += - counted * deltanext +
3273 (minnext + deltanext) * maxcount - minnext * mincount;
3274 if (mincount != maxcount) {
3275 /* Cannot extend fixed substrings found inside
3276 the group. */
304ee84b 3277 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3278 if (mincount && last_str) {
3279 SV * const sv = data->last_found;
3280 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3281 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3282
3283 if (mg)
3284 mg->mg_len = -1;
3285 sv_setsv(sv, last_str);
3286 data->last_end = data->pos_min;
3287 data->last_start_min =
3288 data->pos_min - CHR_SVLEN(last_str);
3289 data->last_start_max = is_inf
3290 ? I32_MAX
3291 : data->pos_min + data->pos_delta
3292 - CHR_SVLEN(last_str);
3293 }
3294 data->longest = &(data->longest_float);
3295 }
3296 SvREFCNT_dec(last_str);
c277df42 3297 }
8aa23a47
YO
3298 if (data && (fl & SF_HAS_EVAL))
3299 data->flags |= SF_HAS_EVAL;
3300 optimize_curly_tail:
3301 if (OP(oscan) != CURLYX) {
3302 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3303 && NEXT_OFF(next))
3304 NEXT_OFF(oscan) += NEXT_OFF(next);
3305 }
3306 continue;
3307 default: /* REF and CLUMP only? */
3308 if (flags & SCF_DO_SUBSTR) {
304ee84b 3309 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3310 data->longest = &(data->longest_float);
3311 }
3312 is_inf = is_inf_internal = 1;
3313 if (flags & SCF_DO_STCLASS_OR)
3314 cl_anything(pRExC_state, data->start_class);
3315 flags &= ~SCF_DO_STCLASS;
3316 break;
c277df42 3317 }
8aa23a47
YO
3318 }
3319 else if (strchr((const char*)PL_simple,OP(scan))) {
3320 int value = 0;
653099ff 3321
8aa23a47 3322 if (flags & SCF_DO_SUBSTR) {
304ee84b 3323 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3324 data->pos_min++;
3325 }
3326 min++;
3327 if (flags & SCF_DO_STCLASS) {
3328 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 3329
8aa23a47
YO
3330 /* Some of the logic below assumes that switching
3331 locale on will only add false positives. */
3332 switch (PL_regkind[OP(scan)]) {
3333 case SANY:
3334 default:
3335 do_default:
3336 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
3337 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3338 cl_anything(pRExC_state, data->start_class);
3339 break;
3340 case REG_ANY:
3341 if (OP(scan) == SANY)
3342 goto do_default;
3343 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
3344 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3345 || (data->start_class->flags & ANYOF_CLASS));
3346 cl_anything(pRExC_state, data->start_class);
653099ff 3347 }
8aa23a47
YO
3348 if (flags & SCF_DO_STCLASS_AND || !value)
3349 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
3350 break;
3351 case ANYOF:
3352 if (flags & SCF_DO_STCLASS_AND)
3353 cl_and(data->start_class,
3354 (struct regnode_charclass_class*)scan);
653099ff 3355 else
8aa23a47
YO
3356 cl_or(pRExC_state, data->start_class,
3357 (struct regnode_charclass_class*)scan);
3358 break;
3359 case ALNUM:
3360 if (flags & SCF_DO_STCLASS_AND) {
3361 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3362 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3363 for (value = 0; value < 256; value++)
3364 if (!isALNUM(value))
3365 ANYOF_BITMAP_CLEAR(data->start_class, value);
3366 }
653099ff 3367 }
8aa23a47
YO
3368 else {
3369 if (data->start_class->flags & ANYOF_LOCALE)
3370 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3371 else {
3372 for (value = 0; value < 256; value++)
3373 if (isALNUM(value))
3374 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 3375 }
8aa23a47
YO
3376 }
3377 break;
3378 case ALNUML:
3379 if (flags & SCF_DO_STCLASS_AND) {
3380 if (data->start_class->flags & ANYOF_LOCALE)
3381 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
3382 }
3383 else {
3384 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
3385 data->start_class->flags |= ANYOF_LOCALE;
3386 }
3387 break;
3388 case NALNUM:
3389 if (flags & SCF_DO_STCLASS_AND) {
3390 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3391 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
3392 for (value = 0; value < 256; value++)
3393 if (isALNUM(value))
3394 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3395 }
3396 }
8aa23a47
YO
3397 else {
3398 if (data->start_class->flags & ANYOF_LOCALE)
3399 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3400 else {
3401 for (value = 0; value < 256; value++)
3402 if (!isALNUM(value))
3403 ANYOF_BITMAP_SET(data->start_class, value);
3404 }
653099ff 3405 }
8aa23a47
YO
3406 break;
3407 case NALNUML:
3408 if (flags & SCF_DO_STCLASS_AND) {
3409 if (data->start_class->flags & ANYOF_LOCALE)
3410 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
653099ff 3411 }
8aa23a47
YO
3412 else {
3413 data->start_class->flags |= ANYOF_LOCALE;
3414 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
3415 }
3416 break;
3417 case SPACE:
3418 if (flags & SCF_DO_STCLASS_AND) {
3419 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3420 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3421 for (value = 0; value < 256; value++)
3422 if (!isSPACE(value))
3423 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff
GS
3424 }
3425 }
8aa23a47
YO
3426 else {
3427 if (data->start_class->flags & ANYOF_LOCALE)
3428 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3429 else {
3430 for (value = 0; value < 256; value++)
3431 if (isSPACE(value))
3432 ANYOF_BITMAP_SET(data->start_class, value);
3433 }
653099ff 3434 }
8aa23a47
YO
3435 break;
3436 case SPACEL:
3437 if (flags & SCF_DO_STCLASS_AND) {
3438 if (data->start_class->flags & ANYOF_LOCALE)
3439 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
3440 }
3441 else {
3442 data->start_class->flags |= ANYOF_LOCALE;
3443 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
3444 }
3445 break;
3446 case NSPACE:
3447 if (flags & SCF_DO_STCLASS_AND) {
3448 if (!(data->start_class->flags & ANYOF_LOCALE)) {
3449 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3450 for (value = 0; value < 256; value++)
3451 if (isSPACE(value))
3452 ANYOF_BITMAP_CLEAR(data->start_class, value);
653099ff 3453 }
8aa23a47
YO
3454 }
3455 else {
3456 if (data->start_class->flags & ANYOF_LOCALE)
3457 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3458 else {
3459 for (value = 0; value < 256; value++)
3460 if (!isSPACE(value))
3461 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3462 }
3463 }
8aa23a47
YO
3464 break;
3465 case NSPACEL:
3466 if (flags & SCF_DO_STCLASS_AND) {
3467 if (data->start_class->flags & ANYOF_LOCALE) {
3468 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
3469 for (value = 0; value < 256; value++)
3470 if (!isSPACE(value))
3471 ANYOF_BITMAP_CLEAR(data->start_class, value);
3472 }
653099ff 3473 }
8aa23a47
YO
3474 else {
3475 data->start_class->flags |= ANYOF_LOCALE;
3476 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
3477 }
3478 break;
3479 case DIGIT:
3480 if (flags & SCF_DO_STCLASS_AND) {
3481 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
3482 for (value = 0; value < 256; value++)
3483 if (!isDIGIT(value))
3484 ANYOF_BITMAP_CLEAR(data->start_class, value);
3485 }
3486 else {
3487 if (data->start_class->flags & ANYOF_LOCALE)
3488 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
3489 else {
3490 for (value = 0; value < 256; value++)
3491 if (isDIGIT(value))
3492 ANYOF_BITMAP_SET(data->start_class, value);
3493 }
3494 }
3495 break;
3496 case NDIGIT:
3497 if (flags & SCF_DO_STCLASS_AND) {
3498 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
3499 for (value = 0; value < 256; value++)
3500 if (isDIGIT(value))
3501 ANYOF_BITMAP_CLEAR(data->start_class, value);
3502 }
3503 else {
3504 if (data->start_class->flags & ANYOF_LOCALE)
3505 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
3506 else {
3507 for (value = 0; value < 256; value++)
3508 if (!isDIGIT(value))
3509 ANYOF_BITMAP_SET(data->start_class, value);
653099ff
GS
3510 }
3511 }
8aa23a47
YO
3512 break;
3513 }
3514 if (flags & SCF_DO_STCLASS_OR)
3515 cl_and(data->start_class, and_withp);
3516 flags &= ~SCF_DO_STCLASS;
3517 }
3518 }
3519 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
3520 data->flags |= (OP(scan) == MEOL
3521 ? SF_BEFORE_MEOL
3522 : SF_BEFORE_SEOL);
3523 }
3524 else if ( PL_regkind[OP(scan)] == BRANCHJ
3525 /* Lookbehind, or need to calculate parens/evals/stclass: */
3526 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3527 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
3528 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3529 || OP(scan) == UNLESSM )
3530 {
3531 /* Negative Lookahead/lookbehind
3532 In this case we can't do fixed string optimisation.
3533 */
1de06328 3534
8aa23a47
YO
3535 I32 deltanext, minnext, fake = 0;
3536 regnode *nscan;
3537 struct regnode_charclass_class intrnl;
3538 int f = 0;
1de06328 3539
8aa23a47
YO
3540 data_fake.flags = 0;
3541 if (data) {
3542 data_fake.whilem_c = data->whilem_c;
3543 data_fake.last_closep = data->last_closep;
c277df42 3544 }
8aa23a47
YO
3545 else
3546 data_fake.last_closep = &fake;
58e23c8d 3547 data_fake.pos_delta = delta;
8aa23a47
YO
3548 if ( flags & SCF_DO_STCLASS && !scan->flags
3549 && OP(scan) == IFMATCH ) { /* Lookahead */
3550 cl_init(pRExC_state, &intrnl);
3551 data_fake.start_class = &intrnl;
3552 f |= SCF_DO_STCLASS_AND;
3553 }
3554 if (flags & SCF_WHILEM_VISITED_POS)
3555 f |= SCF_WHILEM_VISITED_POS;
3556 next = regnext(scan);
3557 nscan = NEXTOPER(NEXTOPER(scan));
3558 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3559 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
3560 if (scan->flags) {
3561 if (deltanext) {
58e23c8d 3562 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3563 }
3564 else if (minnext > (I32)U8_MAX) {
58e23c8d 3565 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3566 }
3567 scan->flags = (U8)minnext;
3568 }
3569 if (data) {
3570 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3571 pars++;
3572 if (data_fake.flags & SF_HAS_EVAL)
3573 data->flags |= SF_HAS_EVAL;
3574 data->whilem_c = data_fake.whilem_c;
3575 }
3576 if (f & SCF_DO_STCLASS_AND) {
3577 const int was = (data->start_class->flags & ANYOF_EOS);
3578
3579 cl_and(data->start_class, &intrnl);
3580 if (was)
3581 data->start_class->flags |= ANYOF_EOS;
3582 }
cb434fcc 3583 }
8aa23a47
YO
3584#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3585 else {
3586 /* Positive Lookahead/lookbehind
3587 In this case we can do fixed string optimisation,
3588 but we must be careful about it. Note in the case of
3589 lookbehind the positions will be offset by the minimum
3590 length of the pattern, something we won't know about
3591 until after the recurse.
3592 */
3593 I32 deltanext, fake = 0;
3594 regnode *nscan;
3595 struct regnode_charclass_class intrnl;
3596 int f = 0;
3597 /* We use SAVEFREEPV so that when the full compile
3598 is finished perl will clean up the allocated
3599 minlens when its all done. This was we don't
3600 have to worry about freeing them when we know
3601 they wont be used, which would be a pain.
3602 */
3603 I32 *minnextp;
3604 Newx( minnextp, 1, I32 );
3605 SAVEFREEPV(minnextp);
3606
3607 if (data) {
3608 StructCopy(data, &data_fake, scan_data_t);
3609 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3610 f |= SCF_DO_SUBSTR;
3611 if (scan->flags)
304ee84b 3612 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
3613 data_fake.last_found=newSVsv(data->last_found);
3614 }
3615 }
3616 else
3617 data_fake.last_closep = &fake;
3618 data_fake.flags = 0;
58e23c8d 3619 data_fake.pos_delta = delta;
8aa23a47
YO
3620 if (is_inf)
3621 data_fake.flags |= SF_IS_INF;
3622 if ( flags & SCF_DO_STCLASS && !scan->flags
3623 && OP(scan) == IFMATCH ) { /* Lookahead */
3624 cl_init(pRExC_state, &intrnl);
3625 data_fake.start_class = &intrnl;
3626 f |= SCF_DO_STCLASS_AND;
3627 }
3628 if (flags & SCF_WHILEM_VISITED_POS)
3629 f |= SCF_WHILEM_VISITED_POS;
3630 next = regnext(scan);
3631 nscan = NEXTOPER(NEXTOPER(scan));
3632
3633 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
3634 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
3635 if (scan->flags) {
3636 if (deltanext) {
58e23c8d 3637 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
3638 }
3639 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 3640 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
3641 }
3642 scan->flags = (U8)*minnextp;
3643 }
3644
3645 *minnextp += min;
3646
3647 if (f & SCF_DO_STCLASS_AND) {
3648 const int was = (data->start_class->flags & ANYOF_EOS);
3649
3650 cl_and(data->start_class, &intrnl);
3651 if (was)
3652 data->start_class->flags |= ANYOF_EOS;
3653 }
3654 if (data) {
3655 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3656 pars++;
3657 if (data_fake.flags & SF_HAS_EVAL)
3658 data->flags |= SF_HAS_EVAL;
3659 data->whilem_c = data_fake.whilem_c;
3660 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3661 if (RExC_rx->minlen<*minnextp)
3662 RExC_rx->minlen=*minnextp;
304ee84b 3663 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
3664 SvREFCNT_dec(data_fake.last_found);
3665
3666 if ( data_fake.minlen_fixed != minlenp )
3667 {
3668 data->offset_fixed= data_fake.offset_fixed;
3669 data->minlen_fixed= data_fake.minlen_fixed;
3670 data->lookbehind_fixed+= scan->flags;
3671 }
3672 if ( data_fake.minlen_float != minlenp )
3673 {
3674 data->minlen_float= data_fake.minlen_float;
3675 data->offset_float_min=data_fake.offset_float_min;
3676 data->offset_float_max=data_fake.offset_float_max;
3677 data->lookbehind_float+= scan->flags;
3678 }
3679 }
3680 }
3681
3682
40d049e4 3683 }
8aa23a47
YO
3684#endif
3685 }
3686 else if (OP(scan) == OPEN) {
3687 if (stopparen != (I32)ARG(scan))
3688 pars++;
3689 }
3690 else if (OP(scan) == CLOSE) {
3691 if (stopparen == (I32)ARG(scan)) {
3692 break;
3693 }
3694 if ((I32)ARG(scan) == is_par) {
3695 next = regnext(scan);
b515a41d 3696
8aa23a47
YO
3697 if ( next && (OP(next) != WHILEM) && next < last)
3698 is_par = 0; /* Disable optimization */
40d049e4 3699 }
8aa23a47
YO
3700 if (data)
3701 *(data->last_closep) = ARG(scan);
3702 }
3703 else if (OP(scan) == EVAL) {
c277df42
IZ
3704 if (data)
3705 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
3706 }
3707 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
3708 if (flags & SCF_DO_SUBSTR) {
304ee84b 3709 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 3710 flags &= ~SCF_DO_SUBSTR;
40d049e4 3711 }
8aa23a47
YO
3712 if (data && OP(scan)==ACCEPT) {
3713 data->flags |= SCF_SEEN_ACCEPT;
3714 if (stopmin > min)
3715 stopmin = min;
e2e6a0f1 3716 }
8aa23a47
YO
3717 }
3718 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3719 {
0f5d15d6 3720 if (flags & SCF_DO_SUBSTR) {
304ee84b 3721 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
3722 data->longest = &(data->longest_float);
3723 }
3724 is_inf = is_inf_internal = 1;
653099ff 3725 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
830247a4 3726 cl_anything(pRExC_state, data->start_class);
96776eda 3727 flags &= ~SCF_DO_STCLASS;
8aa23a47 3728 }
58e23c8d 3729 else if (OP(scan) == GPOS) {
bbe252da 3730 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
3731 !(delta || is_inf || (data && data->pos_delta)))
3732 {
bbe252da
YO
3733 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
3734 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
3735 if (RExC_rx->gofs < (U32)min)
3736 RExC_rx->gofs = min;
3737 } else {
bbe252da 3738 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
3739 RExC_rx->gofs = 0;
3740 }
3741 }
786e8c11 3742#ifdef TRIE_STUDY_OPT
40d049e4 3743#ifdef FULL_TRIE_STUDY
8aa23a47
YO
3744 else if (PL_regkind[OP(scan)] == TRIE) {
3745 /* NOTE - There is similar code to this block above for handling
3746 BRANCH nodes on the initial study. If you change stuff here
3747 check there too. */
3748 regnode *trie_node= scan;
3749 regnode *tail= regnext(scan);
f8fc2ecf 3750 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3751 I32 max1 = 0, min1 = I32_MAX;
3752 struct regnode_charclass_class accum;
3753
3754 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 3755 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47
YO
3756 if (flags & SCF_DO_STCLASS)
3757 cl_init_zero(pRExC_state, &accum);
3758
3759 if (!trie->jump) {
3760 min1= trie->minlen;
3761 max1= trie->maxlen;
3762 } else {
3763 const regnode *nextbranch= NULL;
3764 U32 word;
3765
3766 for ( word=1 ; word <= trie->wordcount ; word++)
3767 {
3768 I32 deltanext=0, minnext=0, f = 0, fake;
3769 struct regnode_charclass_class this_class;
3770
3771 data_fake.flags = 0;
3772 if (data) {
3773 data_fake.whilem_c = data->whilem_c;
3774 data_fake.last_closep = data->last_closep;
3775 }
3776 else
3777 data_fake.last_closep = &fake;
58e23c8d 3778 data_fake.pos_delta = delta;
8aa23a47
YO
3779 if (flags & SCF_DO_STCLASS) {
3780 cl_init(pRExC_state, &this_class);
3781 data_fake.start_class = &this_class;
3782 f = SCF_DO_STCLASS_AND;
3783 }
3784 if (flags & SCF_WHILEM_VISITED_POS)
3785 f |= SCF_WHILEM_VISITED_POS;
3786
3787 if (trie->jump[word]) {
3788 if (!nextbranch)
3789 nextbranch = trie_node + trie->jump[0];
3790 scan= trie_node + trie->jump[word];
3791 /* We go from the jump point to the branch that follows
3792 it. Note this means we need the vestigal unused branches
3793 even though they arent otherwise used.
3794 */
3795 minnext = study_chunk(pRExC_state, &scan, minlenp,
3796 &deltanext, (regnode *)nextbranch, &data_fake,
3797 stopparen, recursed, NULL, f,depth+1);
3798 }
3799 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
3800 nextbranch= regnext((regnode*)nextbranch);
3801
3802 if (min1 > (I32)(minnext + trie->minlen))
3803 min1 = minnext + trie->minlen;
3804 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
3805 max1 = minnext + deltanext + trie->maxlen;
3806 if (deltanext == I32_MAX)
3807 is_inf = is_inf_internal = 1;
3808
3809 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3810 pars++;
3811 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3812 if ( stopmin > min + min1)
3813 stopmin = min + min1;
3814 flags &= ~SCF_DO_SUBSTR;
3815 if (data)
3816 data->flags |= SCF_SEEN_ACCEPT;
3817 }
3818 if (data) {
3819 if (data_fake.flags & SF_HAS_EVAL)
3820 data->flags |= SF_HAS_EVAL;
3821 data->whilem_c = data_fake.whilem_c;
3822 }
3823 if (flags & SCF_DO_STCLASS)
3824 cl_or(pRExC_state, &accum, &this_class);
3825 }
3826 }
3827 if (flags & SCF_DO_SUBSTR) {
3828 data->pos_min += min1;
3829 data->pos_delta += max1 - min1;
3830 if (max1 != min1 || is_inf)
3831 data->longest = &(data->longest_float);
3832 }
3833 min += min1;
3834 delta += max1 - min1;
3835 if (flags & SCF_DO_STCLASS_OR) {
3836 cl_or(pRExC_state, data->start_class, &accum);
3837 if (min1) {
3838 cl_and(data->start_class, and_withp);
3839 flags &= ~SCF_DO_STCLASS;
3840 }
3841 }
3842 else if (flags & SCF_DO_STCLASS_AND) {
3843 if (min1) {
3844 cl_and(data->start_class, &accum);
3845 flags &= ~SCF_DO_STCLASS;
3846 }
3847 else {
3848 /* Switch to OR mode: cache the old value of
3849 * data->start_class */
3850 INIT_AND_WITHP;
3851 StructCopy(data->start_class, and_withp,
3852 struct regnode_charclass_class);
3853 flags &= ~SCF_DO_STCLASS_AND;
3854 StructCopy(&accum, data->start_class,
3855 struct regnode_charclass_class);
3856 flags |= SCF_DO_STCLASS_OR;
3857 data->start_class->flags |= ANYOF_EOS;
3858 }
3859 }
3860 scan= tail;
3861 continue;
3862 }
786e8c11 3863#else
8aa23a47 3864 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 3865 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
3866 U8*bang=NULL;
3867
3868 min += trie->minlen;
3869 delta += (trie->maxlen - trie->minlen);
3870 flags &= ~SCF_DO_STCLASS; /* xxx */
3871 if (flags & SCF_DO_SUBSTR) {
304ee84b 3872 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3873 data->pos_min += trie->minlen;
3874 data->pos_delta += (trie->maxlen - trie->minlen);
3875 if (trie->maxlen != trie->minlen)
3876 data->longest = &(data->longest_float);
3877 }
3878 if (trie->jump) /* no more substrings -- for now /grr*/
3879 flags &= ~SCF_DO_SUBSTR;
b515a41d 3880 }
8aa23a47
YO
3881#endif /* old or new */
3882#endif /* TRIE_STUDY_OPT */
3883 /* Else: zero-length, ignore. */
3884 scan = regnext(scan);
3885 }
3886 if (frame) {
3887 last = frame->last;
3888 scan = frame->next;
3889 stopparen = frame->stop;
3890 frame = frame->prev;
3891 goto fake_study_recurse;
c277df42
IZ
3892 }
3893
3894 finish:
8aa23a47 3895 assert(!frame);
304ee84b 3896 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 3897
c277df42 3898 *scanp = scan;
aca2d497 3899 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 3900 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 3901 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 3902 if (is_par > (I32)U8_MAX)
c277df42
IZ
3903 is_par = 0;
3904 if (is_par && pars==1 && data) {
3905 data->flags |= SF_IN_PAR;
3906 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
3907 }
3908 else if (pars && data) {
c277df42
IZ
3909 data->flags |= SF_HAS_PAR;
3910 data->flags &= ~SF_IN_PAR;
3911 }
653099ff 3912 if (flags & SCF_DO_STCLASS_OR)
40d049e4 3913 cl_and(data->start_class, and_withp);
786e8c11
YO
3914 if (flags & SCF_TRIE_RESTUDY)
3915 data->flags |= SCF_TRIE_RESTUDY;
1de06328 3916
304ee84b 3917 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 3918
e2e6a0f1 3919 return min < stopmin ? min : stopmin;
c277df42
IZ
3920}
3921
2eccd3b2
NC
3922STATIC U32
3923S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 3924{
4a4e7719
NC
3925 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
3926
3927 Renewc(RExC_rxi->data,
3928 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
3929 char, struct reg_data);
3930 if(count)
f8fc2ecf 3931 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 3932 else
f8fc2ecf 3933 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
3934 RExC_rxi->data->count = count + n;
3935 Copy(s, RExC_rxi->data->what + count, n, U8);
3936 return count;
c277df42
IZ
3937}
3938
f8149455 3939/*XXX: todo make this not included in a non debugging perl */
76234dfb 3940#ifndef PERL_IN_XSUB_RE
d88dccdf 3941void
864dbfa3 3942Perl_reginitcolors(pTHX)
d88dccdf 3943{
97aff369 3944 dVAR;
1df70142 3945 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 3946 if (s) {
1df70142
AL
3947 char *t = savepv(s);
3948 int i = 0;
3949 PL_colors[0] = t;
d88dccdf 3950 while (++i < 6) {
1df70142
AL
3951 t = strchr(t, '\t');
3952 if (t) {
3953 *t = '\0';
3954 PL_colors[i] = ++t;
d88dccdf
IZ
3955 }
3956 else
1df70142 3957 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
3958 }
3959 } else {
1df70142 3960 int i = 0;
b81d288d 3961 while (i < 6)
06b5626a 3962 PL_colors[i++] = (char *)"";
d88dccdf
IZ
3963 }
3964 PL_colorset = 1;
3965}
76234dfb 3966#endif
8615cb43 3967
07be1b83 3968
786e8c11
YO
3969#ifdef TRIE_STUDY_OPT
3970#define CHECK_RESTUDY_GOTO \
3971 if ( \
3972 (data.flags & SCF_TRIE_RESTUDY) \
3973 && ! restudied++ \
3974 ) goto reStudy
3975#else
3976#define CHECK_RESTUDY_GOTO
3977#endif
f9f4320a 3978
a687059c 3979/*
e50aee73 3980 - pregcomp - compile a regular expression into internal code
a687059c
LW
3981 *
3982 * We can't allocate space until we know how big the compiled form will be,
3983 * but we can't compile it (and thus know how big it is) until we've got a
3984 * place to put the code. So we cheat: we compile it twice, once with code
3985 * generation turned off and size counting turned on, and once "for real".
3986 * This also means that we don't allocate space until we are sure that the
3987 * thing really will compile successfully, and we never have to move the
3988 * code and thus invalidate pointers into it. (Note that it has to be in
3989 * one piece because free() must be able to free it all.) [NB: not true in perl]
3990 *
3991 * Beware that the optimization-preparation code in here knows about some
3992 * of the structure of the compiled regexp. [I'll say.]
3993 */
b9b4dddf
YO
3994
3995
3996
f9f4320a 3997#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
3998#define RE_ENGINE_PTR &PL_core_reg_engine
3999#else
f9f4320a
YO
4000extern const struct regexp_engine my_reg_engine;
4001#define RE_ENGINE_PTR &my_reg_engine
4002#endif
6d5c990f
RGS
4003
4004#ifndef PERL_IN_XSUB_RE
a687059c 4005regexp *
864dbfa3 4006Perl_pregcomp(pTHX_ char *exp, char *xend, PMOP *pm)
a687059c 4007{
97aff369 4008 dVAR;
6d5c990f 4009 HV * const table = GvHV(PL_hintgv);
f9f4320a
YO
4010 /* Dispatch a request to compile a regexp to correct
4011 regexp engine. */
f9f4320a
YO
4012 if (table) {
4013 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 4014 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 4015 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
4016 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4017 DEBUG_COMPILE_r({
8d8756e7 4018 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
4019 SvIV(*ptr));
4020 });
f2f78491 4021 return CALLREGCOMP_ENG(eng, exp, xend, pm);
f9f4320a 4022 }
b9b4dddf 4023 }
2a5d9b1d
RGS
4024 return Perl_re_compile(aTHX_ exp, xend, pm);
4025}
6d5c990f 4026#endif
2a5d9b1d
RGS
4027
4028regexp *
4029Perl_re_compile(pTHX_ char *exp, char *xend, PMOP *pm)
4030{
4031 dVAR;
a0d0e21e 4032 register regexp *r;
f8fc2ecf 4033 register regexp_internal *ri;
c277df42 4034 regnode *scan;
c277df42 4035 regnode *first;
a0d0e21e 4036 I32 flags;
a0d0e21e
LW
4037 I32 minlen = 0;
4038 I32 sawplus = 0;
4039 I32 sawopen = 0;
2c2d71f5 4040 scan_data_t data;
830247a4 4041 RExC_state_t RExC_state;
be8e71aa 4042 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83
YO
4043#ifdef TRIE_STUDY_OPT
4044 int restudied= 0;
4045 RExC_state_t copyRExC_state;
4046#endif
2a5d9b1d 4047 GET_RE_DEBUG_FLAGS_DECL;
6d5c990f
RGS
4048 DEBUG_r(if (!PL_colorset) reginitcolors());
4049
a0d0e21e 4050 if (exp == NULL)
c277df42 4051 FAIL("NULL regexp argument");
a0d0e21e 4052
a5961de5 4053 RExC_utf8 = pm->op_pmdynflags & PMdf_CMP_UTF8;
a0ed51b3 4054
5cfc7842 4055 RExC_precomp = exp;
a3621e74 4056 DEBUG_COMPILE_r({
ab3bbdeb
YO
4057 SV *dsv= sv_newmortal();
4058 RE_PV_QUOTED_DECL(s, RExC_utf8,
4059 dsv, RExC_precomp, (xend - exp), 60);
4060 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4061 PL_colors[4],PL_colors[5],s);
a5961de5 4062 });
e2509266 4063 RExC_flags = pm->op_pmflags;
830247a4 4064 RExC_sawback = 0;
bbce6d69 4065
830247a4
IZ
4066 RExC_seen = 0;
4067 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4068 RExC_seen_evals = 0;
4069 RExC_extralen = 0;
c277df42 4070
bbce6d69 4071 /* First pass: determine size, legality. */
830247a4 4072 RExC_parse = exp;
fac92740 4073 RExC_start = exp;
830247a4
IZ
4074 RExC_end = xend;
4075 RExC_naughty = 0;
4076 RExC_npar = 1;
e2e6a0f1 4077 RExC_nestroot = 0;
830247a4
IZ
4078 RExC_size = 0L;
4079 RExC_emit = &PL_regdummy;
4080 RExC_whilem_seen = 0;
fc8cd66c 4081 RExC_charnames = NULL;
40d049e4
YO
4082 RExC_open_parens = NULL;
4083 RExC_close_parens = NULL;
4084 RExC_opend = NULL;
81714fb9 4085 RExC_paren_names = NULL;
1f1031fe
YO
4086#ifdef DEBUGGING
4087 RExC_paren_name_list = NULL;
4088#endif
40d049e4
YO
4089 RExC_recurse = NULL;
4090 RExC_recurse_count = 0;
81714fb9 4091
85ddcde9
JH
4092#if 0 /* REGC() is (currently) a NOP at the first pass.
4093 * Clever compilers notice this and complain. --jhi */
830247a4 4094 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4095#endif
3dab1dad
YO
4096 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n"));
4097 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4098 RExC_precomp = NULL;
a0d0e21e
LW
4099 return(NULL);
4100 }
07be1b83 4101 DEBUG_PARSE_r({
81714fb9
YO
4102 PerlIO_printf(Perl_debug_log,
4103 "Required size %"IVdf" nodes\n"
4104 "Starting second pass (creation)\n",
4105 (IV)RExC_size);
07be1b83
YO
4106 RExC_lastnum=0;
4107 RExC_lastparse=NULL;
4108 });
c277df42
IZ
4109 /* Small enough for pointer-storage convention?
4110 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
4111 if (RExC_size >= 0x10000L && RExC_extralen)
4112 RExC_size += RExC_extralen;
c277df42 4113 else
830247a4
IZ
4114 RExC_extralen = 0;
4115 if (RExC_whilem_seen > 15)
4116 RExC_whilem_seen = 15;
a0d0e21e 4117
e2e6a0f1
YO
4118#ifdef DEBUGGING
4119 /* Make room for a sentinel value at the end of the program */
4120 RExC_size++;
4121#endif
4122
f9f4320a
YO
4123 /* Allocate space and zero-initialize. Note, the two step process
4124 of zeroing when in debug mode, thus anything assigned has to
4125 happen after that */
f8fc2ecf
YO
4126 Newxz(r, 1, regexp);
4127 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
4128 char, regexp_internal);
4129 if ( r == NULL || ri == NULL )
b45f050a 4130 FAIL("Regexp out of space");
0f79a09d
GS
4131#ifdef DEBUGGING
4132 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 4133 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 4134#else
f8fc2ecf
YO
4135 /* bulk initialize base fields with 0. */
4136 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 4137#endif
58e23c8d
YO
4138
4139 /* non-zero initialization begins here */
f8fc2ecf 4140 RXi_SET( r, ri );
f9f4320a 4141 r->engine= RE_ENGINE_PTR;
c277df42 4142 r->refcnt = 1;
bbce6d69 4143 r->prelen = xend - exp;
bbe252da 4144 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME;
bcdf7404
YO
4145 {
4146 bool has_k = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
4147 bool has_minus = ((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD);
4148 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
4149 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD) >> 12);
4150 const char *fptr = STD_PAT_MODS; /*"msix"*/
4151 char *p;
4152 r->wraplen = r->prelen + has_minus + has_k + has_runon
4153 + (sizeof(STD_PAT_MODS) - 1)
4154 + (sizeof("(?:)") - 1);
4155
4156 Newx(r->wrapped, r->wraplen, char );
4157 p = r->wrapped;
4158 *p++='('; *p++='?';
4159 if (has_k)
4160 *p++ = KEEPCOPY_PAT_MOD; /*'k'*/
4161 {
4162 char *r = p + (sizeof(STD_PAT_MODS) - 1) + has_minus - 1;
4163 char *colon = r + 1;
4164 char ch;
4165
4166 while((ch = *fptr++)) {
4167 if(reganch & 1)
4168 *p++ = ch;
4169 else
4170 *r-- = ch;
4171 reganch >>= 1;
4172 }
4173 if(has_minus) {
4174 *r = '-';
4175 p = colon;
4176 }
4177 }
4178
4179 *p++=':';
4180 Copy(RExC_precomp, p, r->prelen, char);
4181 r->precomp = p;
4182 p += r->prelen;
4183 if (has_runon)
4184 *p++='\n';
4185 *p=')';
bcdf7404
YO
4186 }
4187
bbe252da 4188 r->intflags = 0;
830247a4 4189 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 4190
6bda09f9 4191 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
4192 Newxz(RExC_open_parens, RExC_npar,regnode *);
4193 SAVEFREEPV(RExC_open_parens);
4194 Newxz(RExC_close_parens,RExC_npar,regnode *);
4195 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
4196 }
4197
4198 /* Useful during FAIL. */
7122b237
YO
4199#ifdef RE_TRACK_PATTERN_OFFSETS
4200 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 4201 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 4202 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 4203 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 4204 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
4205#endif
4206 SetProgLen(ri,RExC_size);
830247a4 4207 RExC_rx = r;
f8fc2ecf 4208 RExC_rxi = ri;
bbce6d69 4209
4210 /* Second pass: emit code. */
e2509266 4211 RExC_flags = pm->op_pmflags; /* don't let top level (?i) bleed */
830247a4
IZ
4212 RExC_parse = exp;
4213 RExC_end = xend;
4214 RExC_naughty = 0;
4215 RExC_npar = 1;
f8fc2ecf
YO
4216 RExC_emit_start = ri->program;
4217 RExC_emit = ri->program;
e2e6a0f1
YO
4218#ifdef DEBUGGING
4219 /* put a sentinal on the end of the program so we can check for
4220 overwrites */
f8fc2ecf 4221 ri->program[RExC_size].type = 255;
e2e6a0f1 4222#endif
2cd61cdb 4223 /* Store the count of eval-groups for security checks: */
f8149455 4224 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 4225 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
3dab1dad 4226 if (reg(pRExC_state, 0, &flags,1) == NULL)
a0d0e21e 4227 return(NULL);
6bda09f9 4228
07be1b83
YO
4229 /* XXXX To minimize changes to RE engine we always allocate
4230 3-units-long substrs field. */
4231 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
4232 if (RExC_recurse_count) {
4233 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
4234 SAVEFREEPV(RExC_recurse);
4235 }
a0d0e21e 4236
07be1b83 4237reStudy:
1de06328 4238 r->minlen = minlen = sawplus = sawopen = 0;
07be1b83 4239 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 4240
07be1b83
YO
4241#ifdef TRIE_STUDY_OPT
4242 if ( restudied ) {
5d458dd8 4243 U32 seen=RExC_seen;
07be1b83 4244 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
4245
4246 RExC_state = copyRExC_state;
4247 if (seen & REG_TOP_LEVEL_BRANCHES)
4248 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
4249 else
4250 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 4251 if (data.last_found) {
07be1b83 4252 SvREFCNT_dec(data.longest_fixed);
07be1b83 4253 SvREFCNT_dec(data.longest_float);
07be1b83 4254 SvREFCNT_dec(data.last_found);
1de06328 4255 }
40d049e4 4256 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4257 } else {
40d049e4 4258 StructCopy(&zero_scan_data, &data, scan_data_t);
5d458dd8 4259 copyRExC_state = RExC_state;
07be1b83 4260 }
40d049e4
YO
4261#else
4262 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 4263#endif
fc8cd66c 4264
a0d0e21e 4265 /* Dig out information for optimizations. */
bbe252da 4266 r->extflags = pm->op_pmflags & RXf_PMf_COMPILETIME; /* Again? */
e2509266 4267 pm->op_pmflags = RExC_flags;
a0ed51b3 4268 if (UTF)
bbe252da 4269 r->extflags |= RXf_UTF8; /* Unicode in it? */
f8fc2ecf 4270 ri->regstclass = NULL;
830247a4 4271 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 4272 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 4273 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 4274
1de06328
YO
4275 /* testing for BRANCH here tells us whether there is "must appear"
4276 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 4277 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 4278 I32 fake;
c5254dd6 4279 STRLEN longest_float_length, longest_fixed_length;
07be1b83 4280 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 4281 int stclass_flag;
07be1b83 4282 I32 last_close = 0; /* pointed to by data */
a0d0e21e
LW
4283
4284 first = scan;
c277df42 4285 /* Skip introductions and multiplicators >= 1. */
a0d0e21e 4286 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 4287 /* An OR of *one* alternative - should not happen now. */
a0d0e21e 4288 (OP(first) == BRANCH && OP(regnext(first)) != BRANCH) ||
07be1b83
YO
4289 /* for now we can't handle lookbehind IFMATCH*/
4290 (OP(first) == IFMATCH && !first->flags) ||
a0d0e21e
LW
4291 (OP(first) == PLUS) ||
4292 (OP(first) == MINMOD) ||
653099ff 4293 /* An {n,m} with n>0 */
07be1b83
YO
4294 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) )
4295 {
786e8c11 4296
a0d0e21e
LW
4297 if (OP(first) == PLUS)
4298 sawplus = 1;
4299 else
3dab1dad 4300 first += regarglen[OP(first)];
07be1b83
YO
4301 if (OP(first) == IFMATCH) {
4302 first = NEXTOPER(first);
4303 first += EXTRA_STEP_2ARGS;
7c167cea 4304 } else /* XXX possible optimisation for /(?=)/ */
07be1b83 4305 first = NEXTOPER(first);
a687059c
LW
4306 }
4307
a0d0e21e
LW
4308 /* Starting-point info. */
4309 again:
786e8c11 4310 DEBUG_PEEP("first:",first,0);
07be1b83 4311 /* Ignore EXACT as we deal with it later. */
3dab1dad 4312 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 4313 if (OP(first) == EXACT)
6f207bd3 4314 NOOP; /* Empty, get anchored substr later. */
1aa99e6b 4315 else if ((OP(first) == EXACTF || OP(first) == EXACTFL))
f8fc2ecf 4316 ri->regstclass = first;
b3c9acc1 4317 }
07be1b83 4318#ifdef TRIE_STCLASS
786e8c11 4319 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 4320 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 4321 {
786e8c11 4322 regnode *trie_op;
07be1b83 4323 /* this can happen only on restudy */
786e8c11 4324 if ( OP(first) == TRIE ) {
c944940b 4325 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 4326 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
4327 StructCopy(first,trieop,struct regnode_1);
4328 trie_op=(regnode *)trieop;
4329 } else {
c944940b 4330 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 4331 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
4332 StructCopy(first,trieop,struct regnode_charclass);
4333 trie_op=(regnode *)trieop;
4334 }
1de06328 4335 OP(trie_op)+=2;
786e8c11 4336 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 4337 ri->regstclass = trie_op;
07be1b83
YO
4338 }
4339#endif
bfed75c6 4340 else if (strchr((const char*)PL_simple,OP(first)))
f8fc2ecf 4341 ri->regstclass = first;
3dab1dad
YO
4342 else if (PL_regkind[OP(first)] == BOUND ||
4343 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 4344 ri->regstclass = first;
3dab1dad 4345 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
4346 r->extflags |= (OP(first) == MBOL
4347 ? RXf_ANCH_MBOL
cad2e5aa 4348 : (OP(first) == SBOL
bbe252da
YO
4349 ? RXf_ANCH_SBOL
4350 : RXf_ANCH_BOL));
a0d0e21e 4351 first = NEXTOPER(first);
774d564b 4352 goto again;
4353 }
4354 else if (OP(first) == GPOS) {
bbe252da 4355 r->extflags |= RXf_ANCH_GPOS;
774d564b 4356 first = NEXTOPER(first);
4357 goto again;
a0d0e21e 4358 }
cf2a2b69
YO
4359 else if ((!sawopen || !RExC_sawback) &&
4360 (OP(first) == STAR &&
3dab1dad 4361 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 4362 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
4363 {
4364 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
4365 const int type =
4366 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
4367 ? RXf_ANCH_MBOL
4368 : RXf_ANCH_SBOL;
4369 r->extflags |= type;
4370 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 4371 first = NEXTOPER(first);
774d564b 4372 goto again;
a0d0e21e 4373 }
b81d288d 4374 if (sawplus && (!sawopen || !RExC_sawback)
830247a4 4375 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 4376 /* x+ must match at the 1st pos of run of x's */
bbe252da 4377 r->intflags |= PREGf_SKIP;
a0d0e21e 4378
c277df42 4379 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 4380#ifdef TRIE_STUDY_OPT
81714fb9 4381 DEBUG_PARSE_r(
be8e71aa
YO
4382 if (!restudied)
4383 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4384 (IV)(first - scan + 1))
4385 );
4386#else
81714fb9 4387 DEBUG_PARSE_r(
be8e71aa
YO
4388 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
4389 (IV)(first - scan + 1))
4390 );
4391#endif
4392
4393
a0d0e21e
LW
4394 /*
4395 * If there's something expensive in the r.e., find the
4396 * longest literal string that must appear and make it the
4397 * regmust. Resolve ties in favor of later strings, since
4398 * the regstart check works with the beginning of the r.e.
4399 * and avoiding duplication strengthens checking. Not a
4400 * strong reason, but sufficient in the absence of others.
4401 * [Now we resolve ties in favor of the earlier string if
c277df42 4402 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
4403 * earlier string may buy us something the later one won't.]
4404 */
de8c5301 4405
396482e1
GA
4406 data.longest_fixed = newSVpvs("");
4407 data.longest_float = newSVpvs("");
4408 data.last_found = newSVpvs("");
c277df42
IZ
4409 data.longest = &(data.longest_fixed);
4410 first = scan;
f8fc2ecf 4411 if (!ri->regstclass) {
830247a4 4412 cl_init(pRExC_state, &ch_class);
653099ff
GS
4413 data.start_class = &ch_class;
4414 stclass_flag = SCF_DO_STCLASS_AND;
4415 } else /* XXXX Check for BOUND? */
4416 stclass_flag = 0;
cb434fcc 4417 data.last_closep = &last_close;
de8c5301 4418
1de06328 4419 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
4420 &data, -1, NULL, NULL,
4421 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 4422
07be1b83 4423
786e8c11
YO
4424 CHECK_RESTUDY_GOTO;
4425
4426
830247a4 4427 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 4428 && data.last_start_min == 0 && data.last_end > 0
830247a4 4429 && !RExC_seen_zerolen
2bf803e2 4430 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
4431 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
4432 r->extflags |= RXf_CHECK_ALL;
304ee84b 4433 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
4434 SvREFCNT_dec(data.last_found);
4435
1de06328
YO
4436 /* Note that code very similar to this but for anchored string
4437 follows immediately below, changes may need to be made to both.
4438 Be careful.
4439 */
a0ed51b3 4440 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 4441 if (longest_float_length
c277df42
IZ
4442 || (data.flags & SF_FL_BEFORE_EOL
4443 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4444 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4445 {
1182767e 4446 I32 t,ml;
cf93c79d 4447
1de06328 4448 if (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
aca2d497
IZ
4449 && data.offset_fixed == data.offset_float_min
4450 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float))
4451 goto remove_float; /* As in (a)+. */
4452
1de06328
YO
4453 /* copy the information about the longest float from the reg_scan_data
4454 over to the program. */
33b8afdf
JH
4455 if (SvUTF8(data.longest_float)) {
4456 r->float_utf8 = data.longest_float;
c445ea15 4457 r->float_substr = NULL;
33b8afdf
JH
4458 } else {
4459 r->float_substr = data.longest_float;
c445ea15 4460 r->float_utf8 = NULL;
33b8afdf 4461 }
1de06328
YO
4462 /* float_end_shift is how many chars that must be matched that
4463 follow this item. We calculate it ahead of time as once the
4464 lookbehind offset is added in we lose the ability to correctly
4465 calculate it.*/
4466 ml = data.minlen_float ? *(data.minlen_float)
1182767e 4467 : (I32)longest_float_length;
1de06328
YO
4468 r->float_end_shift = ml - data.offset_float_min
4469 - longest_float_length + (SvTAIL(data.longest_float) != 0)
4470 + data.lookbehind_float;
4471 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 4472 r->float_max_offset = data.offset_float_max;
1182767e 4473 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
4474 r->float_max_offset -= data.lookbehind_float;
4475
cf93c79d
IZ
4476 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
4477 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 4478 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4479 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4480 }
4481 else {
aca2d497 4482 remove_float:
c445ea15 4483 r->float_substr = r->float_utf8 = NULL;
c277df42 4484 SvREFCNT_dec(data.longest_float);
c5254dd6 4485 longest_float_length = 0;
a0d0e21e 4486 }
c277df42 4487
1de06328
YO
4488 /* Note that code very similar to this but for floating string
4489 is immediately above, changes may need to be made to both.
4490 Be careful.
4491 */
a0ed51b3 4492 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
c5254dd6 4493 if (longest_fixed_length
c277df42
IZ
4494 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
4495 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4496 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 4497 {
1182767e 4498 I32 t,ml;
cf93c79d 4499
1de06328
YO
4500 /* copy the information about the longest fixed
4501 from the reg_scan_data over to the program. */
33b8afdf
JH
4502 if (SvUTF8(data.longest_fixed)) {
4503 r->anchored_utf8 = data.longest_fixed;
c445ea15 4504 r->anchored_substr = NULL;
33b8afdf
JH
4505 } else {
4506 r->anchored_substr = data.longest_fixed;
c445ea15 4507 r->anchored_utf8 = NULL;
33b8afdf 4508 }
1de06328
YO
4509 /* fixed_end_shift is how many chars that must be matched that
4510 follow this item. We calculate it ahead of time as once the
4511 lookbehind offset is added in we lose the ability to correctly
4512 calculate it.*/
4513 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 4514 : (I32)longest_fixed_length;
1de06328
YO
4515 r->anchored_end_shift = ml - data.offset_fixed
4516 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
4517 + data.lookbehind_fixed;
4518 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
4519
cf93c79d
IZ
4520 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
4521 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 4522 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 4523 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
4524 }
4525 else {
c445ea15 4526 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 4527 SvREFCNT_dec(data.longest_fixed);
c5254dd6 4528 longest_fixed_length = 0;
a0d0e21e 4529 }
f8fc2ecf
YO
4530 if (ri->regstclass
4531 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
4532 ri->regstclass = NULL;
33b8afdf
JH
4533 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
4534 && stclass_flag
653099ff 4535 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4536 && !cl_is_anything(data.start_class))
4537 {
2eccd3b2 4538 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4539
f8fc2ecf 4540 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4541 struct regnode_charclass_class);
4542 StructCopy(data.start_class,
f8fc2ecf 4543 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4544 struct regnode_charclass_class);
f8fc2ecf 4545 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4546 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4547 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 4548 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4549 PerlIO_printf(Perl_debug_log,
a0288114 4550 "synthetic stclass \"%s\".\n",
3f7c398e 4551 SvPVX_const(sv));});
653099ff 4552 }
c277df42
IZ
4553
4554 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 4555 if (longest_fixed_length > longest_float_length) {
1de06328 4556 r->check_end_shift = r->anchored_end_shift;
c277df42 4557 r->check_substr = r->anchored_substr;
33b8afdf 4558 r->check_utf8 = r->anchored_utf8;
c277df42 4559 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
4560 if (r->extflags & RXf_ANCH_SINGLE)
4561 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
4562 }
4563 else {
1de06328 4564 r->check_end_shift = r->float_end_shift;
c277df42 4565 r->check_substr = r->float_substr;
33b8afdf 4566 r->check_utf8 = r->float_utf8;
1de06328
YO
4567 r->check_offset_min = r->float_min_offset;
4568 r->check_offset_max = r->float_max_offset;
a0d0e21e 4569 }
30382c73
IZ
4570 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
4571 This should be changed ASAP! */
bbe252da
YO
4572 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
4573 r->extflags |= RXf_USE_INTUIT;
33b8afdf 4574 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 4575 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 4576 }
1de06328
YO
4577 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
4578 if ( (STRLEN)minlen < longest_float_length )
4579 minlen= longest_float_length;
4580 if ( (STRLEN)minlen < longest_fixed_length )
4581 minlen= longest_fixed_length;
4582 */
a0ed51b3
LW
4583 }
4584 else {
c277df42
IZ
4585 /* Several toplevels. Best we can is to set minlen. */
4586 I32 fake;
653099ff 4587 struct regnode_charclass_class ch_class;
cb434fcc 4588 I32 last_close = 0;
c277df42 4589
5d458dd8 4590 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 4591
f8fc2ecf 4592 scan = ri->program + 1;
830247a4 4593 cl_init(pRExC_state, &ch_class);
653099ff 4594 data.start_class = &ch_class;
cb434fcc 4595 data.last_closep = &last_close;
07be1b83 4596
de8c5301 4597
1de06328 4598 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 4599 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 4600
786e8c11 4601 CHECK_RESTUDY_GOTO;
07be1b83 4602
33b8afdf 4603 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 4604 = r->float_substr = r->float_utf8 = NULL;
653099ff 4605 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
4606 && !cl_is_anything(data.start_class))
4607 {
2eccd3b2 4608 const U32 n = add_data(pRExC_state, 1, "f");
653099ff 4609
f8fc2ecf 4610 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
4611 struct regnode_charclass_class);
4612 StructCopy(data.start_class,
f8fc2ecf 4613 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 4614 struct regnode_charclass_class);
f8fc2ecf 4615 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 4616 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 4617 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 4618 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 4619 PerlIO_printf(Perl_debug_log,
a0288114 4620 "synthetic stclass \"%s\".\n",
3f7c398e 4621 SvPVX_const(sv));});
653099ff 4622 }
a0d0e21e
LW
4623 }
4624
1de06328
YO
4625 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
4626 the "real" pattern. */
cf9788e3
RGS
4627 DEBUG_OPTIMISE_r({
4628 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 4629 (IV)minlen, (IV)r->minlen);
cf9788e3 4630 });
de8c5301 4631 r->minlenret = minlen;
1de06328
YO
4632 if (r->minlen < minlen)
4633 r->minlen = minlen;
4634
b81d288d 4635 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 4636 r->extflags |= RXf_GPOS_SEEN;
830247a4 4637 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 4638 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 4639 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 4640 r->extflags |= RXf_EVAL_SEEN;
f33976b4 4641 if (RExC_seen & REG_SEEN_CANY)
bbe252da 4642 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 4643 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 4644 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 4645 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 4646 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9
YO
4647 if (RExC_paren_names)
4648 r->paren_names = (HV*)SvREFCNT_inc(RExC_paren_names);
4649 else
4650 r->paren_names = NULL;
e357fc67
YO
4651 if (r->prelen == 3 && strEQ("\\s+", r->precomp))
4652 r->extflags |= RXf_WHITE;
4653 else if (r->prelen == 1 && r->precomp[0] == '^')
4654 r->extflags |= RXf_START_ONLY;
4655
1f1031fe
YO
4656#ifdef DEBUGGING
4657 if (RExC_paren_names) {
4658 ri->name_list_idx = add_data( pRExC_state, 1, "p" );
4659 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
4660 } else
1f1031fe 4661#endif
cde0cee5 4662 ri->name_list_idx = 0;
1f1031fe 4663
40d049e4
YO
4664 if (RExC_recurse_count) {
4665 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
4666 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
4667 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
4668 }
4669 }
a02a5408
JC
4670 Newxz(r->startp, RExC_npar, I32);
4671 Newxz(r->endp, RExC_npar, I32);
c74340f9
YO
4672 /* assume we don't need to swap parens around before we match */
4673
be8e71aa
YO
4674 DEBUG_DUMP_r({
4675 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
4676 regdump(r);
4677 });
7122b237
YO
4678#ifdef RE_TRACK_PATTERN_OFFSETS
4679 DEBUG_OFFSETS_r(if (ri->u.offsets) {
4680 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
4681 U32 i;
4682 GET_RE_DEBUG_FLAGS_DECL;
7122b237 4683 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 4684 for (i = 1; i <= len; i++) {
7122b237 4685 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 4686 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 4687 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
4688 }
4689 PerlIO_printf(Perl_debug_log, "\n");
4690 });
7122b237 4691#endif
a0d0e21e 4692 return(r);
a687059c
LW
4693}
4694
f9f4320a 4695#undef RE_ENGINE_PTR
3dab1dad 4696
93b32b6d 4697
81714fb9 4698SV*
93b32b6d 4699Perl_reg_named_buff_get(pTHX_ const REGEXP * const rx, SV* namesv, U32 flags)
81714fb9 4700{
44a2ac75
YO
4701 AV *retarray = NULL;
4702 SV *ret;
4703 if (flags & 1)
4704 retarray=newAV();
93b32b6d
YO
4705
4706 if (rx && rx->paren_names) {
4707 HE *he_str = hv_fetch_ent( rx->paren_names, namesv, 0, 0 );
4708 if (he_str) {
4709 IV i;
4710 SV* sv_dat=HeVAL(he_str);
4711 I32 *nums=(I32*)SvPVX(sv_dat);
4712 for ( i=0; i<SvIVX(sv_dat); i++ ) {
66e6b4c5
RGS
4713 if ((I32)(rx->nparens) >= nums[i]
4714 && rx->startp[nums[i]] != -1
4715 && rx->endp[nums[i]] != -1)
93b32b6d
YO
4716 {
4717 ret = CALLREG_NUMBUF(rx,nums[i],NULL);
4718 if (!retarray)
4719 return ret;
4720 } else {
4721 ret = newSVsv(&PL_sv_undef);
4722 }
4723 if (retarray) {
4724 SvREFCNT_inc(ret);
4725 av_push(retarray, ret);
81714fb9
YO
4726 }
4727 }
93b32b6d
YO
4728 if (retarray)
4729 return (SV*)retarray;
81714fb9
YO
4730 }
4731 }
44a2ac75
YO
4732 return NULL;
4733}
4734
4735SV*
93b32b6d 4736Perl_reg_numbered_buff_get(pTHX_ const REGEXP * const rx, I32 paren, SV* usesv)
44a2ac75
YO
4737{
4738 char *s = NULL;
a9d504c3 4739 I32 i = 0;
44a2ac75
YO
4740 I32 s1, t1;
4741 SV *sv = usesv ? usesv : newSVpvs("");
4742
cde0cee5
YO
4743 if (!rx->subbeg) {
4744 sv_setsv(sv,&PL_sv_undef);
4745 return sv;
4746 }
4747 else
4748 if (paren == -2 && rx->startp[0] != -1) {
44a2ac75
YO
4749 /* $` */
4750 i = rx->startp[0];
cde0cee5 4751 s = rx->subbeg;
44a2ac75
YO
4752 }
4753 else
cde0cee5 4754 if (paren == -1 && rx->endp[0] != -1) {
44a2ac75
YO
4755 /* $' */
4756 s = rx->subbeg + rx->endp[0];
4757 i = rx->sublen - rx->endp[0];
4758 }
4759 else
4760 if ( 0 <= paren && paren <= (I32)rx->nparens &&
4761 (s1 = rx->startp[paren]) != -1 &&
4762 (t1 = rx->endp[paren]) != -1)
4763 {
4764 /* $& $1 ... */
4765 i = t1 - s1;
4766 s = rx->subbeg + s1;
cde0cee5
YO
4767 } else {
4768 sv_setsv(sv,&PL_sv_undef);
4769 return sv;
4770 }
4771 assert(rx->sublen >= (s - rx->subbeg) + i );
4772 if (i >= 0) {
4773 const int oldtainted = PL_tainted;
4774 TAINT_NOT;
4775 sv_setpvn(sv, s, i);
4776 PL_tainted = oldtainted;
4777 if ( (rx->extflags & RXf_CANY_SEEN)
4778 ? (RX_MATCH_UTF8(rx)
4779 && (!i || is_utf8_string((U8*)s, i)))
4780 : (RX_MATCH_UTF8(rx)) )
4781 {
4782 SvUTF8_on(sv);
4783 }
4784 else
4785 SvUTF8_off(sv);
4786 if (PL_tainting) {
4787 if (RX_MATCH_TAINTED(rx)) {
4788 if (SvTYPE(sv) >= SVt_PVMG) {
4789 MAGIC* const mg = SvMAGIC(sv);
4790 MAGIC* mgt;
4791 PL_tainted = 1;
4792 SvMAGIC_set(sv, mg->mg_moremagic);
4793 SvTAINT(sv);
4794 if ((mgt = SvMAGIC(sv))) {
4795 mg->mg_moremagic = mgt;
4796 SvMAGIC_set(sv, mg);
44a2ac75 4797 }
cde0cee5
YO
4798 } else {
4799 PL_tainted = 1;
4800 SvTAINT(sv);
4801 }
4802 } else
4803 SvTAINTED_off(sv);
44a2ac75 4804 }
81714fb9 4805 } else {
44a2ac75 4806 sv_setsv(sv,&PL_sv_undef);
81714fb9 4807 }
44a2ac75 4808 return sv;
81714fb9 4809}
93b32b6d 4810
0a4db386 4811
894be9b7 4812/* Scans the name of a named buffer from the pattern.
0a4db386
YO
4813 * If flags is REG_RSN_RETURN_NULL returns null.
4814 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
4815 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
4816 * to the parsed name as looked up in the RExC_paren_names hash.
4817 * If there is an error throws a vFAIL().. type exception.
894be9b7 4818 */
0a4db386
YO
4819
4820#define REG_RSN_RETURN_NULL 0
4821#define REG_RSN_RETURN_NAME 1
4822#define REG_RSN_RETURN_DATA 2
4823
894be9b7
YO
4824STATIC SV*
4825S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags) {
4826 char *name_start = RExC_parse;
1f1031fe
YO
4827
4828 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
4829 /* skip IDFIRST by using do...while */
4830 if (UTF)
4831 do {
4832 RExC_parse += UTF8SKIP(RExC_parse);
4833 } while (isALNUM_utf8((U8*)RExC_parse));
4834 else
4835 do {
4836 RExC_parse++;
4837 } while (isALNUM(*RExC_parse));
894be9b7 4838 }
1f1031fe 4839
0a4db386
YO
4840 if ( flags ) {
4841 SV* sv_name = sv_2mortal(Perl_newSVpvn(aTHX_ name_start,
4842 (int)(RExC_parse - name_start)));
894be9b7 4843 if (UTF)
0a4db386
YO
4844 SvUTF8_on(sv_name);
4845 if ( flags == REG_RSN_RETURN_NAME)
4846 return sv_name;
4847 else if (flags==REG_RSN_RETURN_DATA) {
4848 HE *he_str = NULL;
4849 SV *sv_dat = NULL;
4850 if ( ! sv_name ) /* should not happen*/
4851 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
4852 if (RExC_paren_names)
4853 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
4854 if ( he_str )
4855 sv_dat = HeVAL(he_str);
4856 if ( ! sv_dat )
4857 vFAIL("Reference to nonexistent named group");
4858 return sv_dat;
4859 }
4860 else {
4861 Perl_croak(aTHX_ "panic: bad flag in reg_scan_name");
4862 }
4863 /* NOT REACHED */
894be9b7 4864 }
0a4db386 4865 return NULL;
894be9b7
YO
4866}
4867
3dab1dad
YO
4868#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
4869 int rem=(int)(RExC_end - RExC_parse); \
4870 int cut; \
4871 int num; \
4872 int iscut=0; \
4873 if (rem>10) { \
4874 rem=10; \
4875 iscut=1; \
4876 } \
4877 cut=10-rem; \
4878 if (RExC_lastparse!=RExC_parse) \
4879 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
4880 rem, RExC_parse, \
4881 cut + 4, \
4882 iscut ? "..." : "<" \
4883 ); \
4884 else \
4885 PerlIO_printf(Perl_debug_log,"%16s",""); \
4886 \
4887 if (SIZE_ONLY) \
4888 num=RExC_size; \
4889 else \
4890 num=REG_NODE_NUM(RExC_emit); \
4891 if (RExC_lastnum!=num) \
0a4db386 4892 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 4893 else \
0a4db386 4894 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
4895 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
4896 (int)((depth*2)), "", \
3dab1dad
YO
4897 (funcname) \
4898 ); \
4899 RExC_lastnum=num; \
4900 RExC_lastparse=RExC_parse; \
4901})
4902
07be1b83
YO
4903
4904
3dab1dad
YO
4905#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
4906 DEBUG_PARSE_MSG((funcname)); \
4907 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
4908})
6bda09f9
YO
4909#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
4910 DEBUG_PARSE_MSG((funcname)); \
4911 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
4912})
a687059c
LW
4913/*
4914 - reg - regular expression, i.e. main body or parenthesized thing
4915 *
4916 * Caller must absorb opening parenthesis.
4917 *
4918 * Combining parenthesis handling with the base level of regular expression
4919 * is a trifle forced, but the need to tie the tails of the branches to what
4920 * follows makes it hard to avoid.
4921 */
07be1b83
YO
4922#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
4923#ifdef DEBUGGING
4924#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
4925#else
4926#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
4927#endif
3dab1dad 4928
e2e6a0f1
YO
4929/* this idea is borrowed from STR_WITH_LEN in handy.h */
4930#define CHECK_WORD(s,v,l) \
4931 (((sizeof(s)-1)==(l)) && (strnEQ(start_verb, (s ""), (sizeof(s)-1))))
4932
76e3520e 4933STATIC regnode *
3dab1dad 4934S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 4935 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 4936{
27da23d5 4937 dVAR;
c277df42
IZ
4938 register regnode *ret; /* Will be the head of the group. */
4939 register regnode *br;
4940 register regnode *lastbr;
cbbf8932 4941 register regnode *ender = NULL;
a0d0e21e 4942 register I32 parno = 0;
cbbf8932
AL
4943 I32 flags;
4944 const I32 oregflags = RExC_flags;
6136c704
AL
4945 bool have_branch = 0;
4946 bool is_open = 0;
594d7033
YO
4947 I32 freeze_paren = 0;
4948 I32 after_freeze = 0;
9d1d55b5
JP
4949
4950 /* for (?g), (?gc), and (?o) warnings; warning
4951 about (?c) will warn about (?g) -- japhy */
4952
6136c704
AL
4953#define WASTED_O 0x01
4954#define WASTED_G 0x02
4955#define WASTED_C 0x04
4956#define WASTED_GC (0x02|0x04)
cbbf8932 4957 I32 wastedflags = 0x00;
9d1d55b5 4958
fac92740 4959 char * parse_start = RExC_parse; /* MJD */
a28509cc 4960 char * const oregcomp_parse = RExC_parse;
a0d0e21e 4961
3dab1dad
YO
4962 GET_RE_DEBUG_FLAGS_DECL;
4963 DEBUG_PARSE("reg ");
4964
4965
821b33a5 4966 *flagp = 0; /* Tentatively. */
a0d0e21e 4967
9d1d55b5 4968
a0d0e21e
LW
4969 /* Make an OPEN node, if parenthesized. */
4970 if (paren) {
e2e6a0f1
YO
4971 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
4972 char *start_verb = RExC_parse;
4973 STRLEN verb_len = 0;
4974 char *start_arg = NULL;
4975 unsigned char op = 0;
4976 int argok = 1;
4977 int internal_argval = 0; /* internal_argval is only useful if !argok */
4978 while ( *RExC_parse && *RExC_parse != ')' ) {
4979 if ( *RExC_parse == ':' ) {
4980 start_arg = RExC_parse + 1;
4981 break;
4982 }
4983 RExC_parse++;
4984 }
4985 ++start_verb;
4986 verb_len = RExC_parse - start_verb;
4987 if ( start_arg ) {
4988 RExC_parse++;
4989 while ( *RExC_parse && *RExC_parse != ')' )
4990 RExC_parse++;
4991 if ( *RExC_parse != ')' )
4992 vFAIL("Unterminated verb pattern argument");
4993 if ( RExC_parse == start_arg )
4994 start_arg = NULL;
4995 } else {
4996 if ( *RExC_parse != ')' )
4997 vFAIL("Unterminated verb pattern");
4998 }
5d458dd8 4999
e2e6a0f1
YO
5000 switch ( *start_verb ) {
5001 case 'A': /* (*ACCEPT) */
5002 if ( CHECK_WORD("ACCEPT",start_verb,verb_len) ) {
5003 op = ACCEPT;
5004 internal_argval = RExC_nestroot;
5005 }
5006 break;
5007 case 'C': /* (*COMMIT) */
5008 if ( CHECK_WORD("COMMIT",start_verb,verb_len) )
5009 op = COMMIT;
e2e6a0f1
YO
5010 break;
5011 case 'F': /* (*FAIL) */
5012 if ( verb_len==1 || CHECK_WORD("FAIL",start_verb,verb_len) ) {
5013 op = OPFAIL;
5014 argok = 0;
5015 }
5016 break;
5d458dd8
YO
5017 case ':': /* (*:NAME) */
5018 case 'M': /* (*MARK:NAME) */
5019 if ( verb_len==0 || CHECK_WORD("MARK",start_verb,verb_len) ) {
e2e6a0f1 5020 op = MARKPOINT;
5d458dd8
YO
5021 argok = -1;
5022 }
5023 break;
5024 case 'P': /* (*PRUNE) */
5025 if ( CHECK_WORD("PRUNE",start_verb,verb_len) )
5026 op = PRUNE;
e2e6a0f1 5027 break;
5d458dd8
YO
5028 case 'S': /* (*SKIP) */
5029 if ( CHECK_WORD("SKIP",start_verb,verb_len) )
5030 op = SKIP;
5031 break;
5032 case 'T': /* (*THEN) */
5033 /* [19:06] <TimToady> :: is then */
5034 if ( CHECK_WORD("THEN",start_verb,verb_len) ) {
5035 op = CUTGROUP;
5036 RExC_seen |= REG_SEEN_CUTGROUP;
5037 }
e2e6a0f1
YO
5038 break;
5039 }
5040 if ( ! op ) {
5041 RExC_parse++;
5042 vFAIL3("Unknown verb pattern '%.*s'",
5043 verb_len, start_verb);
5044 }
5045 if ( argok ) {
5046 if ( start_arg && internal_argval ) {
5047 vFAIL3("Verb pattern '%.*s' may not have an argument",
5048 verb_len, start_verb);
5049 } else if ( argok < 0 && !start_arg ) {
5050 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
5051 verb_len, start_verb);
5052 } else {
5053 ret = reganode(pRExC_state, op, internal_argval);
5054 if ( ! internal_argval && ! SIZE_ONLY ) {
5055 if (start_arg) {
5056 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
5057 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5058 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
5059 ret->flags = 0;
5060 } else {
5061 ret->flags = 1;
5062 }
5063 }
5064 }
5065 if (!internal_argval)
5066 RExC_seen |= REG_SEEN_VERBARG;
5067 } else if ( start_arg ) {
5068 vFAIL3("Verb pattern '%.*s' may not have an argument",
5069 verb_len, start_verb);
5070 } else {
5071 ret = reg_node(pRExC_state, op);
5072 }
5073 nextchar(pRExC_state);
5074 return ret;
5075 } else
fac92740 5076 if (*RExC_parse == '?') { /* (?...) */
6136c704 5077 bool is_logical = 0;
a28509cc 5078 const char * const seqstart = RExC_parse;
ca9dfc88 5079
830247a4
IZ
5080 RExC_parse++;
5081 paren = *RExC_parse++;
c277df42 5082 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 5083 switch (paren) {
894be9b7 5084
1f1031fe
YO
5085 case 'P': /* (?P...) variants for those used to PCRE/Python */
5086 paren = *RExC_parse++;
5087 if ( paren == '<') /* (?P<...>) named capture */
5088 goto named_capture;
5089 else if (paren == '>') { /* (?P>name) named recursion */
5090 goto named_recursion;
5091 }
5092 else if (paren == '=') { /* (?P=...) named backref */
5093 /* this pretty much dupes the code for \k<NAME> in regatom(), if
5094 you change this make sure you change that */
5095 char* name_start = RExC_parse;
5096 U32 num = 0;
5097 SV *sv_dat = reg_scan_name(pRExC_state,
5098 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5099 if (RExC_parse == name_start || *RExC_parse != ')')
5100 vFAIL2("Sequence %.3s... not terminated",parse_start);
5101
5102 if (!SIZE_ONLY) {
5103 num = add_data( pRExC_state, 1, "S" );
5104 RExC_rxi->data->data[num]=(void*)sv_dat;
5105 SvREFCNT_inc(sv_dat);
5106 }
5107 RExC_sawback = 1;
5108 ret = reganode(pRExC_state,
5109 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
5110 num);
5111 *flagp |= HASWIDTH;
5112
5113 Set_Node_Offset(ret, parse_start+1);
5114 Set_Node_Cur_Length(ret); /* MJD */
5115
5116 nextchar(pRExC_state);
5117 return ret;
5118 }
57b84237
YO
5119 RExC_parse++;
5120 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5121 /*NOTREACHED*/
5122 case '<': /* (?<...) */
b81d288d 5123 if (*RExC_parse == '!')
c277df42 5124 paren = ',';
0a4db386 5125 else if (*RExC_parse != '=')
1f1031fe 5126 named_capture:
0a4db386 5127 { /* (?<...>) */
81714fb9 5128 char *name_start;
894be9b7 5129 SV *svname;
81714fb9
YO
5130 paren= '>';
5131 case '\'': /* (?'...') */
5132 name_start= RExC_parse;
0a4db386
YO
5133 svname = reg_scan_name(pRExC_state,
5134 SIZE_ONLY ? /* reverse test from the others */
5135 REG_RSN_RETURN_NAME :
5136 REG_RSN_RETURN_NULL);
57b84237
YO
5137 if (RExC_parse == name_start) {
5138 RExC_parse++;
5139 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5140 /*NOTREACHED*/
5141 }
81714fb9
YO
5142 if (*RExC_parse != paren)
5143 vFAIL2("Sequence (?%c... not terminated",
5144 paren=='>' ? '<' : paren);
5145 if (SIZE_ONLY) {
e62cc96a
YO
5146 HE *he_str;
5147 SV *sv_dat = NULL;
894be9b7
YO
5148 if (!svname) /* shouldnt happen */
5149 Perl_croak(aTHX_
5150 "panic: reg_scan_name returned NULL");
81714fb9
YO
5151 if (!RExC_paren_names) {
5152 RExC_paren_names= newHV();
5153 sv_2mortal((SV*)RExC_paren_names);
1f1031fe
YO
5154#ifdef DEBUGGING
5155 RExC_paren_name_list= newAV();
5156 sv_2mortal((SV*)RExC_paren_name_list);
5157#endif
81714fb9
YO
5158 }
5159 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 5160 if ( he_str )
81714fb9 5161 sv_dat = HeVAL(he_str);
e62cc96a 5162 if ( ! sv_dat ) {
81714fb9 5163 /* croak baby croak */
e62cc96a
YO
5164 Perl_croak(aTHX_
5165 "panic: paren_name hash element allocation failed");
5166 } else if ( SvPOK(sv_dat) ) {
76a476f9
YO
5167 /* (?|...) can mean we have dupes so scan to check
5168 its already been stored. Maybe a flag indicating
5169 we are inside such a construct would be useful,
5170 but the arrays are likely to be quite small, so
5171 for now we punt -- dmq */
5172 IV count = SvIV(sv_dat);
5173 I32 *pv = (I32*)SvPVX(sv_dat);
5174 IV i;
5175 for ( i = 0 ; i < count ; i++ ) {
5176 if ( pv[i] == RExC_npar ) {
5177 count = 0;
5178 break;
5179 }
5180 }
5181 if ( count ) {
5182 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
5183 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
5184 pv[count] = RExC_npar;
5185 SvIVX(sv_dat)++;
5186 }
81714fb9
YO
5187 } else {
5188 (void)SvUPGRADE(sv_dat,SVt_PVNV);
5189 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
5190 SvIOK_on(sv_dat);
5191 SvIVX(sv_dat)= 1;
e62cc96a 5192 }
1f1031fe
YO
5193#ifdef DEBUGGING
5194 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
5195 SvREFCNT_dec(svname);
5196#endif
e62cc96a 5197
81714fb9
YO
5198 /*sv_dump(sv_dat);*/
5199 }
5200 nextchar(pRExC_state);
5201 paren = 1;
5202 goto capturing_parens;
5203 }
5204 RExC_seen |= REG_SEEN_LOOKBEHIND;
830247a4 5205 RExC_parse++;
fac92740
MJD
5206 case '=': /* (?=...) */
5207 case '!': /* (?!...) */
830247a4 5208 RExC_seen_zerolen++;
e2e6a0f1
YO
5209 if (*RExC_parse == ')') {
5210 ret=reg_node(pRExC_state, OPFAIL);
5211 nextchar(pRExC_state);
5212 return ret;
5213 }
594d7033
YO
5214 break;
5215 case '|': /* (?|...) */
5216 /* branch reset, behave like a (?:...) except that
5217 buffers in alternations share the same numbers */
5218 paren = ':';
5219 after_freeze = freeze_paren = RExC_npar;
5220 break;
fac92740
MJD
5221 case ':': /* (?:...) */
5222 case '>': /* (?>...) */
a0d0e21e 5223 break;
fac92740
MJD
5224 case '$': /* (?$...) */
5225 case '@': /* (?@...) */
8615cb43 5226 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 5227 break;
fac92740 5228 case '#': /* (?#...) */
830247a4
IZ
5229 while (*RExC_parse && *RExC_parse != ')')
5230 RExC_parse++;
5231 if (*RExC_parse != ')')
c277df42 5232 FAIL("Sequence (?#... not terminated");
830247a4 5233 nextchar(pRExC_state);
a0d0e21e
LW
5234 *flagp = TRYAGAIN;
5235 return NULL;
894be9b7
YO
5236 case '0' : /* (?0) */
5237 case 'R' : /* (?R) */
5238 if (*RExC_parse != ')')
6bda09f9 5239 FAIL("Sequence (?R) not terminated");
1a147d38 5240 ret = reg_node(pRExC_state, GOSTART);
a3b492c3 5241 *flagp |= POSTPONED;
7f69552c
YO
5242 nextchar(pRExC_state);
5243 return ret;
5244 /*notreached*/
894be9b7
YO
5245 { /* named and numeric backreferences */
5246 I32 num;
894be9b7
YO
5247 case '&': /* (?&NAME) */
5248 parse_start = RExC_parse - 1;
1f1031fe 5249 named_recursion:
894be9b7 5250 {
0a4db386
YO
5251 SV *sv_dat = reg_scan_name(pRExC_state,
5252 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5253 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
5254 }
5255 goto gen_recurse_regop;
5256 /* NOT REACHED */
542fa716
YO
5257 case '+':
5258 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5259 RExC_parse++;
5260 vFAIL("Illegal pattern");
5261 }
5262 goto parse_recursion;
5263 /* NOT REACHED*/
5264 case '-': /* (?-1) */
5265 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
5266 RExC_parse--; /* rewind to let it be handled later */
5267 goto parse_flags;
5268 }
5269 /*FALLTHROUGH */
6bda09f9
YO
5270 case '1': case '2': case '3': case '4': /* (?1) */
5271 case '5': case '6': case '7': case '8': case '9':
5272 RExC_parse--;
542fa716 5273 parse_recursion:
894be9b7
YO
5274 num = atoi(RExC_parse);
5275 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
5276 if (*RExC_parse == '-')
5277 RExC_parse++;
6bda09f9
YO
5278 while (isDIGIT(*RExC_parse))
5279 RExC_parse++;
5280 if (*RExC_parse!=')')
5281 vFAIL("Expecting close bracket");
894be9b7
YO
5282
5283 gen_recurse_regop:
542fa716
YO
5284 if ( paren == '-' ) {
5285 /*
5286 Diagram of capture buffer numbering.
5287 Top line is the normal capture buffer numbers
5288 Botton line is the negative indexing as from
5289 the X (the (?-2))
5290
5291 + 1 2 3 4 5 X 6 7
5292 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
5293 - 5 4 3 2 1 X x x
5294
5295 */
5296 num = RExC_npar + num;
5297 if (num < 1) {
5298 RExC_parse++;
5299 vFAIL("Reference to nonexistent group");
5300 }
5301 } else if ( paren == '+' ) {
5302 num = RExC_npar + num - 1;
5303 }
5304
1a147d38 5305 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
5306 if (!SIZE_ONLY) {
5307 if (num > (I32)RExC_rx->nparens) {
5308 RExC_parse++;
5309 vFAIL("Reference to nonexistent group");
5310 }
40d049e4 5311 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 5312 RExC_emit++;
226de585 5313 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 5314 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 5315 } else {
6bda09f9 5316 RExC_size++;
6bda09f9 5317 }
0a4db386 5318 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 5319 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
5320 Set_Node_Offset(ret, parse_start); /* MJD */
5321
a3b492c3 5322 *flagp |= POSTPONED;
6bda09f9
YO
5323 nextchar(pRExC_state);
5324 return ret;
894be9b7
YO
5325 } /* named and numeric backreferences */
5326 /* NOT REACHED */
5327
fac92740 5328 case 'p': /* (?p...) */
9014280d 5329 if (SIZE_ONLY && ckWARN2(WARN_DEPRECATED, WARN_REGEXP))
f9373011 5330 vWARNdep(RExC_parse, "(?p{}) is deprecated - use (??{})");
8c8ad484 5331 /* FALL THROUGH*/
fac92740 5332 case '?': /* (??...) */
6136c704 5333 is_logical = 1;
57b84237
YO
5334 if (*RExC_parse != '{') {
5335 RExC_parse++;
5336 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5337 /*NOTREACHED*/
5338 }
a3b492c3 5339 *flagp |= POSTPONED;
830247a4 5340 paren = *RExC_parse++;
0f5d15d6 5341 /* FALL THROUGH */
fac92740 5342 case '{': /* (?{...}) */
c277df42 5343 {
2eccd3b2
NC
5344 I32 count = 1;
5345 U32 n = 0;
c277df42 5346 char c;
830247a4 5347 char *s = RExC_parse;
c277df42 5348
830247a4
IZ
5349 RExC_seen_zerolen++;
5350 RExC_seen |= REG_SEEN_EVAL;
5351 while (count && (c = *RExC_parse)) {
6136c704
AL
5352 if (c == '\\') {
5353 if (RExC_parse[1])
5354 RExC_parse++;
5355 }
b81d288d 5356 else if (c == '{')
c277df42 5357 count++;
b81d288d 5358 else if (c == '}')
c277df42 5359 count--;
830247a4 5360 RExC_parse++;
c277df42 5361 }
6136c704 5362 if (*RExC_parse != ')') {
b81d288d 5363 RExC_parse = s;
b45f050a
JF
5364 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
5365 }
c277df42 5366 if (!SIZE_ONLY) {
f3548bdc 5367 PAD *pad;
6136c704
AL
5368 OP_4tree *sop, *rop;
5369 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 5370
569233ed
SB
5371 ENTER;
5372 Perl_save_re_context(aTHX);
f3548bdc 5373 rop = sv_compile_2op(sv, &sop, "re", &pad);
9b978d73
DM
5374 sop->op_private |= OPpREFCOUNTED;
5375 /* re_dup will OpREFCNT_inc */
5376 OpREFCNT_set(sop, 1);
569233ed 5377 LEAVE;
c277df42 5378
830247a4 5379 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
5380 RExC_rxi->data->data[n] = (void*)rop;
5381 RExC_rxi->data->data[n+1] = (void*)sop;
5382 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 5383 SvREFCNT_dec(sv);
a0ed51b3 5384 }
e24b16f9 5385 else { /* First pass */
830247a4 5386 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 5387 && IN_PERL_RUNTIME)
2cd61cdb
IZ
5388 /* No compiled RE interpolated, has runtime
5389 components ===> unsafe. */
5390 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 5391 if (PL_tainting && PL_tainted)
cc6b7395 5392 FAIL("Eval-group in insecure regular expression");
54df2634 5393#if PERL_VERSION > 8
923e4eb5 5394 if (IN_PERL_COMPILETIME)
b5c19bd7 5395 PL_cv_has_eval = 1;
54df2634 5396#endif
c277df42 5397 }
b5c19bd7 5398
830247a4 5399 nextchar(pRExC_state);
6136c704 5400 if (is_logical) {
830247a4 5401 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5402 if (!SIZE_ONLY)
5403 ret->flags = 2;
3dab1dad 5404 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 5405 /* deal with the length of this later - MJD */
0f5d15d6
IZ
5406 return ret;
5407 }
ccb2c380
MP
5408 ret = reganode(pRExC_state, EVAL, n);
5409 Set_Node_Length(ret, RExC_parse - parse_start + 1);
5410 Set_Node_Offset(ret, parse_start);
5411 return ret;
c277df42 5412 }
fac92740 5413 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 5414 {
0a4db386 5415 int is_define= 0;
fac92740 5416 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
5417 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
5418 || RExC_parse[1] == '<'
830247a4 5419 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42
IZ
5420 I32 flag;
5421
830247a4 5422 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
5423 if (!SIZE_ONLY)
5424 ret->flags = 1;
3dab1dad 5425 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 5426 goto insert_if;
b81d288d 5427 }
a0ed51b3 5428 }
0a4db386
YO
5429 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
5430 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
5431 {
5432 char ch = RExC_parse[0] == '<' ? '>' : '\'';
5433 char *name_start= RExC_parse++;
2eccd3b2 5434 U32 num = 0;
0a4db386
YO
5435 SV *sv_dat=reg_scan_name(pRExC_state,
5436 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5437 if (RExC_parse == name_start || *RExC_parse != ch)
5438 vFAIL2("Sequence (?(%c... not terminated",
5439 (ch == '>' ? '<' : ch));
5440 RExC_parse++;
5441 if (!SIZE_ONLY) {
5442 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 5443 RExC_rxi->data->data[num]=(void*)sv_dat;
0a4db386
YO
5444 SvREFCNT_inc(sv_dat);
5445 }
5446 ret = reganode(pRExC_state,NGROUPP,num);
5447 goto insert_if_check_paren;
5448 }
5449 else if (RExC_parse[0] == 'D' &&
5450 RExC_parse[1] == 'E' &&
5451 RExC_parse[2] == 'F' &&
5452 RExC_parse[3] == 'I' &&
5453 RExC_parse[4] == 'N' &&
5454 RExC_parse[5] == 'E')
5455 {
5456 ret = reganode(pRExC_state,DEFINEP,0);
5457 RExC_parse +=6 ;
5458 is_define = 1;
5459 goto insert_if_check_paren;
5460 }
5461 else if (RExC_parse[0] == 'R') {
5462 RExC_parse++;
5463 parno = 0;
5464 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
5465 parno = atoi(RExC_parse++);
5466 while (isDIGIT(*RExC_parse))
5467 RExC_parse++;
5468 } else if (RExC_parse[0] == '&') {
5469 SV *sv_dat;
5470 RExC_parse++;
5471 sv_dat = reg_scan_name(pRExC_state,
5472 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
5473 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
5474 }
1a147d38 5475 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
5476 goto insert_if_check_paren;
5477 }
830247a4 5478 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 5479 /* (?(1)...) */
6136c704 5480 char c;
830247a4 5481 parno = atoi(RExC_parse++);
c277df42 5482
830247a4
IZ
5483 while (isDIGIT(*RExC_parse))
5484 RExC_parse++;
fac92740 5485 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 5486
0a4db386 5487 insert_if_check_paren:
830247a4 5488 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 5489 vFAIL("Switch condition not recognized");
c277df42 5490 insert_if:
3dab1dad
YO
5491 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
5492 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 5493 if (br == NULL)
830247a4 5494 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 5495 else
3dab1dad 5496 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 5497 c = *nextchar(pRExC_state);
d1b80229
IZ
5498 if (flags&HASWIDTH)
5499 *flagp |= HASWIDTH;
c277df42 5500 if (c == '|') {
0a4db386
YO
5501 if (is_define)
5502 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 5503 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
5504 regbranch(pRExC_state, &flags, 1,depth+1);
5505 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
5506 if (flags&HASWIDTH)
5507 *flagp |= HASWIDTH;
830247a4 5508 c = *nextchar(pRExC_state);
a0ed51b3
LW
5509 }
5510 else
c277df42
IZ
5511 lastbr = NULL;
5512 if (c != ')')
8615cb43 5513 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 5514 ender = reg_node(pRExC_state, TAIL);
3dab1dad 5515 REGTAIL(pRExC_state, br, ender);
c277df42 5516 if (lastbr) {
3dab1dad
YO
5517 REGTAIL(pRExC_state, lastbr, ender);
5518 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
5519 }
5520 else
3dab1dad 5521 REGTAIL(pRExC_state, ret, ender);
c277df42 5522 return ret;
a0ed51b3
LW
5523 }
5524 else {
830247a4 5525 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
5526 }
5527 }
1b1626e4 5528 case 0:
830247a4 5529 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 5530 vFAIL("Sequence (? incomplete");
1b1626e4 5531 break;
a0d0e21e 5532 default:
cde0cee5
YO
5533 --RExC_parse;
5534 parse_flags: /* (?i) */
5535 {
5536 U32 posflags = 0, negflags = 0;
5537 U32 *flagsp = &posflags;
5538
5539 while (*RExC_parse) {
5540 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
5541 /* (?g), (?gc) and (?o) are useless here
5542 and must be globally applied -- japhy */
cde0cee5
YO
5543 switch (*RExC_parse) {
5544 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
5545 case 'o':
5546 case 'g':
9d1d55b5 5547 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 5548 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
5549 if (! (wastedflags & wflagbit) ) {
5550 wastedflags |= wflagbit;
5551 vWARN5(
5552 RExC_parse + 1,
5553 "Useless (%s%c) - %suse /%c modifier",
5554 flagsp == &negflags ? "?-" : "?",
5555 *RExC_parse,
5556 flagsp == &negflags ? "don't " : "",
5557 *RExC_parse
5558 );
5559 }
5560 }
cde0cee5
YO
5561 break;
5562
5563 case 'c':
9d1d55b5 5564 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
5565 if (! (wastedflags & WASTED_C) ) {
5566 wastedflags |= WASTED_GC;
9d1d55b5
JP
5567 vWARN3(
5568 RExC_parse + 1,
5569 "Useless (%sc) - %suse /gc modifier",
5570 flagsp == &negflags ? "?-" : "?",
5571 flagsp == &negflags ? "don't " : ""
5572 );
5573 }
5574 }
cde0cee5
YO
5575 break;
5576 case 'k':
5577 if (flagsp == &negflags) {
5578 if (SIZE_ONLY && ckWARN(WARN_REGEXP))
5579 vWARN(RExC_parse + 1,"Useless use of (?-k)");
5580 } else {
5581 *flagsp |= RXf_PMf_KEEPCOPY;
5582 }
5583 break;
5584 case '-':
57b84237
YO
5585 if (flagsp == &negflags) {
5586 RExC_parse++;
5587 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5588 /*NOTREACHED*/
5589 }
cde0cee5
YO
5590 flagsp = &negflags;
5591 wastedflags = 0; /* reset so (?g-c) warns twice */
5592 break;
5593 case ':':
5594 paren = ':';
5595 /*FALLTHROUGH*/
5596 case ')':
5597 RExC_flags |= posflags;
5598 RExC_flags &= ~negflags;
5599 nextchar(pRExC_state);
5600 if (paren != ':') {
5601 *flagp = TRYAGAIN;
5602 return NULL;
5603 } else {
5604 ret = NULL;
5605 goto parse_rest;
5606 }
5607 /*NOTREACHED*/
5608 default:
cde0cee5
YO
5609 RExC_parse++;
5610 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
5611 /*NOTREACHED*/
5612 }
830247a4 5613 ++RExC_parse;
48c036b1 5614 }
cde0cee5 5615 }} /* one for the default block, one for the switch */
a0d0e21e 5616 }
fac92740 5617 else { /* (...) */
81714fb9 5618 capturing_parens:
830247a4
IZ
5619 parno = RExC_npar;
5620 RExC_npar++;
e2e6a0f1 5621
830247a4 5622 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
5623 if (!SIZE_ONLY ){
5624 if (!RExC_nestroot)
5625 RExC_nestroot = parno;
c009da3d
YO
5626 if (RExC_seen & REG_SEEN_RECURSE
5627 && !RExC_open_parens[parno-1])
5628 {
e2e6a0f1 5629 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
5630 "Setting open paren #%"IVdf" to %d\n",
5631 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
5632 RExC_open_parens[parno-1]= ret;
5633 }
6bda09f9 5634 }
fac92740
MJD
5635 Set_Node_Length(ret, 1); /* MJD */
5636 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 5637 is_open = 1;
a0d0e21e 5638 }
a0ed51b3 5639 }
fac92740 5640 else /* ! paren */
a0d0e21e 5641 ret = NULL;
cde0cee5
YO
5642
5643 parse_rest:
a0d0e21e 5644 /* Pick up the branches, linking them together. */
fac92740 5645 parse_start = RExC_parse; /* MJD */
3dab1dad 5646 br = regbranch(pRExC_state, &flags, 1,depth+1);
fac92740 5647 /* branch_len = (paren != 0); */
2af232bd 5648
a0d0e21e
LW
5649 if (br == NULL)
5650 return(NULL);
830247a4
IZ
5651 if (*RExC_parse == '|') {
5652 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 5653 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 5654 }
fac92740 5655 else { /* MJD */
6bda09f9 5656 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
5657 Set_Node_Length(br, paren != 0);
5658 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
5659 }
c277df42
IZ
5660 have_branch = 1;
5661 if (SIZE_ONLY)
830247a4 5662 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
5663 }
5664 else if (paren == ':') {
c277df42
IZ
5665 *flagp |= flags&SIMPLE;
5666 }
6136c704 5667 if (is_open) { /* Starts with OPEN. */
3dab1dad 5668 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
5669 }
5670 else if (paren != '?') /* Not Conditional */
a0d0e21e 5671 ret = br;
8ae10a67 5672 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
c277df42 5673 lastbr = br;
830247a4
IZ
5674 while (*RExC_parse == '|') {
5675 if (!SIZE_ONLY && RExC_extralen) {
5676 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 5677 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
5678 }
5679 if (SIZE_ONLY)
830247a4
IZ
5680 RExC_extralen += 2; /* Account for LONGJMP. */
5681 nextchar(pRExC_state);
594d7033
YO
5682 if (freeze_paren) {
5683 if (RExC_npar > after_freeze)
5684 after_freeze = RExC_npar;
5685 RExC_npar = freeze_paren;
5686 }
3dab1dad 5687 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 5688
a687059c 5689 if (br == NULL)
a0d0e21e 5690 return(NULL);
3dab1dad 5691 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 5692 lastbr = br;
8ae10a67 5693 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
a0d0e21e
LW
5694 }
5695
c277df42
IZ
5696 if (have_branch || paren != ':') {
5697 /* Make a closing node, and hook it on the end. */
5698 switch (paren) {
5699 case ':':
830247a4 5700 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
5701 break;
5702 case 1:
830247a4 5703 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
5704 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
5705 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
5706 "Setting close paren #%"IVdf" to %d\n",
5707 (IV)parno, REG_NODE_NUM(ender)));
5708 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
5709 if (RExC_nestroot == parno)
5710 RExC_nestroot = 0;
40d049e4 5711 }
fac92740
MJD
5712 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
5713 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
5714 break;
5715 case '<':
c277df42
IZ
5716 case ',':
5717 case '=':
5718 case '!':
c277df42 5719 *flagp &= ~HASWIDTH;
821b33a5
IZ
5720 /* FALL THROUGH */
5721 case '>':
830247a4 5722 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
5723 break;
5724 case 0:
830247a4 5725 ender = reg_node(pRExC_state, END);
40d049e4
YO
5726 if (!SIZE_ONLY) {
5727 assert(!RExC_opend); /* there can only be one! */
5728 RExC_opend = ender;
5729 }
c277df42
IZ
5730 break;
5731 }
eaf3ca90 5732 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 5733
9674d46a 5734 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
5735 if (depth==1)
5736 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5737
c277df42 5738 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
5739 for (br = ret; br; br = regnext(br)) {
5740 const U8 op = PL_regkind[OP(br)];
5741 if (op == BRANCH) {
07be1b83 5742 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
5743 }
5744 else if (op == BRANCHJ) {
07be1b83 5745 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 5746 }
c277df42
IZ
5747 }
5748 }
a0d0e21e 5749 }
c277df42
IZ
5750
5751 {
e1ec3a88
AL
5752 const char *p;
5753 static const char parens[] = "=!<,>";
c277df42
IZ
5754
5755 if (paren && (p = strchr(parens, paren))) {
eb160463 5756 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
5757 int flag = (p - parens) > 1;
5758
5759 if (paren == '>')
5760 node = SUSPEND, flag = 0;
6bda09f9 5761 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
5762 Set_Node_Cur_Length(ret);
5763 Set_Node_Offset(ret, parse_start + 1);
c277df42 5764 ret->flags = flag;
07be1b83 5765 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 5766 }
a0d0e21e
LW
5767 }
5768
5769 /* Check for proper termination. */
ce3e6498 5770 if (paren) {
e2509266 5771 RExC_flags = oregflags;
830247a4
IZ
5772 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
5773 RExC_parse = oregcomp_parse;
380a0633 5774 vFAIL("Unmatched (");
ce3e6498 5775 }
a0ed51b3 5776 }
830247a4
IZ
5777 else if (!paren && RExC_parse < RExC_end) {
5778 if (*RExC_parse == ')') {
5779 RExC_parse++;
380a0633 5780 vFAIL("Unmatched )");
a0ed51b3
LW
5781 }
5782 else
b45f050a 5783 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
5784 /* NOTREACHED */
5785 }
594d7033
YO
5786 if (after_freeze)
5787 RExC_npar = after_freeze;
a0d0e21e 5788 return(ret);
a687059c
LW
5789}
5790
5791/*
5792 - regbranch - one alternative of an | operator
5793 *
5794 * Implements the concatenation operator.
5795 */
76e3520e 5796STATIC regnode *
3dab1dad 5797S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 5798{
97aff369 5799 dVAR;
c277df42
IZ
5800 register regnode *ret;
5801 register regnode *chain = NULL;
5802 register regnode *latest;
5803 I32 flags = 0, c = 0;
3dab1dad
YO
5804 GET_RE_DEBUG_FLAGS_DECL;
5805 DEBUG_PARSE("brnc");
b81d288d 5806 if (first)
c277df42
IZ
5807 ret = NULL;
5808 else {
b81d288d 5809 if (!SIZE_ONLY && RExC_extralen)
830247a4 5810 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 5811 else {
830247a4 5812 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
5813 Set_Node_Length(ret, 1);
5814 }
c277df42
IZ
5815 }
5816
b81d288d 5817 if (!first && SIZE_ONLY)
830247a4 5818 RExC_extralen += 1; /* BRANCHJ */
b81d288d 5819
c277df42 5820 *flagp = WORST; /* Tentatively. */
a0d0e21e 5821
830247a4
IZ
5822 RExC_parse--;
5823 nextchar(pRExC_state);
5824 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 5825 flags &= ~TRYAGAIN;
3dab1dad 5826 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5827 if (latest == NULL) {
5828 if (flags & TRYAGAIN)
5829 continue;
5830 return(NULL);
a0ed51b3
LW
5831 }
5832 else if (ret == NULL)
c277df42 5833 ret = latest;
8ae10a67 5834 *flagp |= flags&(HASWIDTH|POSTPONED);
c277df42 5835 if (chain == NULL) /* First piece. */
a0d0e21e
LW
5836 *flagp |= flags&SPSTART;
5837 else {
830247a4 5838 RExC_naughty++;
3dab1dad 5839 REGTAIL(pRExC_state, chain, latest);
a687059c 5840 }
a0d0e21e 5841 chain = latest;
c277df42
IZ
5842 c++;
5843 }
5844 if (chain == NULL) { /* Loop ran zero times. */
830247a4 5845 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
5846 if (ret == NULL)
5847 ret = chain;
5848 }
5849 if (c == 1) {
5850 *flagp |= flags&SIMPLE;
a0d0e21e 5851 }
a687059c 5852
d4c19fe8 5853 return ret;
a687059c
LW
5854}
5855
5856/*
5857 - regpiece - something followed by possible [*+?]
5858 *
5859 * Note that the branching code sequences used for ? and the general cases
5860 * of * and + are somewhat optimized: they use the same NOTHING node as
5861 * both the endmarker for their branch list and the body of the last branch.
5862 * It might seem that this node could be dispensed with entirely, but the
5863 * endmarker role is not redundant.
5864 */
76e3520e 5865STATIC regnode *
3dab1dad 5866S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 5867{
97aff369 5868 dVAR;
c277df42 5869 register regnode *ret;
a0d0e21e
LW
5870 register char op;
5871 register char *next;
5872 I32 flags;
1df70142 5873 const char * const origparse = RExC_parse;
a0d0e21e 5874 I32 min;
c277df42 5875 I32 max = REG_INFTY;
fac92740 5876 char *parse_start;
10edeb5d 5877 const char *maxpos = NULL;
3dab1dad
YO
5878 GET_RE_DEBUG_FLAGS_DECL;
5879 DEBUG_PARSE("piec");
a0d0e21e 5880
3dab1dad 5881 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
5882 if (ret == NULL) {
5883 if (flags & TRYAGAIN)
5884 *flagp |= TRYAGAIN;
5885 return(NULL);
5886 }
5887
830247a4 5888 op = *RExC_parse;
a0d0e21e 5889
830247a4 5890 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 5891 maxpos = NULL;
fac92740 5892 parse_start = RExC_parse; /* MJD */
830247a4 5893 next = RExC_parse + 1;
a0d0e21e
LW
5894 while (isDIGIT(*next) || *next == ',') {
5895 if (*next == ',') {
5896 if (maxpos)
5897 break;
5898 else
5899 maxpos = next;
a687059c 5900 }
a0d0e21e
LW
5901 next++;
5902 }
5903 if (*next == '}') { /* got one */
5904 if (!maxpos)
5905 maxpos = next;
830247a4
IZ
5906 RExC_parse++;
5907 min = atoi(RExC_parse);
a0d0e21e
LW
5908 if (*maxpos == ',')
5909 maxpos++;
5910 else
830247a4 5911 maxpos = RExC_parse;
a0d0e21e
LW
5912 max = atoi(maxpos);
5913 if (!max && *maxpos != '0')
c277df42
IZ
5914 max = REG_INFTY; /* meaning "infinity" */
5915 else if (max >= REG_INFTY)
8615cb43 5916 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
5917 RExC_parse = next;
5918 nextchar(pRExC_state);
a0d0e21e
LW
5919
5920 do_curly:
5921 if ((flags&SIMPLE)) {
830247a4 5922 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 5923 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
5924 Set_Node_Offset(ret, parse_start+1); /* MJD */
5925 Set_Node_Cur_Length(ret);
a0d0e21e
LW
5926 }
5927 else {
3dab1dad 5928 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
5929
5930 w->flags = 0;
3dab1dad 5931 REGTAIL(pRExC_state, ret, w);
830247a4 5932 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
5933 reginsert(pRExC_state, LONGJMP,ret, depth+1);
5934 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
5935 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
5936 }
6bda09f9 5937 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
5938 /* MJD hk */
5939 Set_Node_Offset(ret, parse_start+1);
2af232bd 5940 Set_Node_Length(ret,
fac92740 5941 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 5942
830247a4 5943 if (!SIZE_ONLY && RExC_extralen)
c277df42 5944 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 5945 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 5946 if (SIZE_ONLY)
830247a4
IZ
5947 RExC_whilem_seen++, RExC_extralen += 3;
5948 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 5949 }
c277df42 5950 ret->flags = 0;
a0d0e21e
LW
5951
5952 if (min > 0)
821b33a5
IZ
5953 *flagp = WORST;
5954 if (max > 0)
5955 *flagp |= HASWIDTH;
a0d0e21e 5956 if (max && max < min)
8615cb43 5957 vFAIL("Can't do {n,m} with n > m");
c277df42 5958 if (!SIZE_ONLY) {
eb160463
GS
5959 ARG1_SET(ret, (U16)min);
5960 ARG2_SET(ret, (U16)max);
a687059c 5961 }
a687059c 5962
a0d0e21e 5963 goto nest_check;
a687059c 5964 }
a0d0e21e 5965 }
a687059c 5966
a0d0e21e
LW
5967 if (!ISMULT1(op)) {
5968 *flagp = flags;
a687059c 5969 return(ret);
a0d0e21e 5970 }
bb20fd44 5971
c277df42 5972#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
5973
5974 /* if this is reinstated, don't forget to put this back into perldiag:
5975
5976 =item Regexp *+ operand could be empty at {#} in regex m/%s/
5977
5978 (F) The part of the regexp subject to either the * or + quantifier
5979 could match an empty string. The {#} shows in the regular
5980 expression about where the problem was discovered.
5981
5982 */
5983
bb20fd44 5984 if (!(flags&HASWIDTH) && op != '?')
b45f050a 5985 vFAIL("Regexp *+ operand could be empty");
b81d288d 5986#endif
bb20fd44 5987
fac92740 5988 parse_start = RExC_parse;
830247a4 5989 nextchar(pRExC_state);
a0d0e21e 5990
821b33a5 5991 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
5992
5993 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 5994 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 5995 ret->flags = 0;
830247a4 5996 RExC_naughty += 4;
a0d0e21e
LW
5997 }
5998 else if (op == '*') {
5999 min = 0;
6000 goto do_curly;
a0ed51b3
LW
6001 }
6002 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 6003 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 6004 ret->flags = 0;
830247a4 6005 RExC_naughty += 3;
a0d0e21e
LW
6006 }
6007 else if (op == '+') {
6008 min = 1;
6009 goto do_curly;
a0ed51b3
LW
6010 }
6011 else if (op == '?') {
a0d0e21e
LW
6012 min = 0; max = 1;
6013 goto do_curly;
6014 }
6015 nest_check:
a3b492c3 6016 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3 && ckWARN(WARN_REGEXP)) {
830247a4 6017 vWARN3(RExC_parse,
b45f050a 6018 "%.*s matches null string many times",
afd78fd5 6019 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
b45f050a 6020 origparse);
a0d0e21e
LW
6021 }
6022
b9b4dddf 6023 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 6024 nextchar(pRExC_state);
6bda09f9 6025 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 6026 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 6027 }
b9b4dddf
YO
6028#ifndef REG_ALLOW_MINMOD_SUSPEND
6029 else
6030#endif
6031 if (RExC_parse < RExC_end && *RExC_parse == '+') {
6032 regnode *ender;
6033 nextchar(pRExC_state);
6034 ender = reg_node(pRExC_state, SUCCEED);
6035 REGTAIL(pRExC_state, ret, ender);
6036 reginsert(pRExC_state, SUSPEND, ret, depth+1);
6037 ret->flags = 0;
6038 ender = reg_node(pRExC_state, TAIL);
6039 REGTAIL(pRExC_state, ret, ender);
6040 /*ret= ender;*/
6041 }
6042
6043 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 6044 RExC_parse++;
b45f050a
JF
6045 vFAIL("Nested quantifiers");
6046 }
a0d0e21e
LW
6047
6048 return(ret);
a687059c
LW
6049}
6050
fc8cd66c
YO
6051
6052/* reg_namedseq(pRExC_state,UVp)
6053
6054 This is expected to be called by a parser routine that has
6055 recognized'\N' and needs to handle the rest. RExC_parse is
6056 expected to point at the first char following the N at the time
6057 of the call.
6058
6059 If valuep is non-null then it is assumed that we are parsing inside
6060 of a charclass definition and the first codepoint in the resolved
6061 string is returned via *valuep and the routine will return NULL.
6062 In this mode if a multichar string is returned from the charnames
6063 handler a warning will be issued, and only the first char in the
6064 sequence will be examined. If the string returned is zero length
6065 then the value of *valuep is undefined and NON-NULL will
6066 be returned to indicate failure. (This will NOT be a valid pointer
6067 to a regnode.)
6068
6069 If value is null then it is assumed that we are parsing normal text
6070 and inserts a new EXACT node into the program containing the resolved
6071 string and returns a pointer to the new node. If the string is
6072 zerolength a NOTHING node is emitted.
6073
6074 On success RExC_parse is set to the char following the endbrace.
6075 Parsing failures will generate a fatal errorvia vFAIL(...)
6076
6077 NOTE: We cache all results from the charnames handler locally in
6078 the RExC_charnames hash (created on first use) to prevent a charnames
6079 handler from playing silly-buggers and returning a short string and
6080 then a long string for a given pattern. Since the regexp program
6081 size is calculated during an initial parse this would result
6082 in a buffer overrun so we cache to prevent the charname result from
6083 changing during the course of the parse.
6084
6085 */
6086STATIC regnode *
6087S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep)
6088{
6089 char * name; /* start of the content of the name */
6090 char * endbrace; /* endbrace following the name */
6091 SV *sv_str = NULL;
6092 SV *sv_name = NULL;
6093 STRLEN len; /* this has various purposes throughout the code */
6094 bool cached = 0; /* if this is true then we shouldn't refcount dev sv_str */
6095 regnode *ret = NULL;
6096
6097 if (*RExC_parse != '{') {
6098 vFAIL("Missing braces on \\N{}");
6099 }
6100 name = RExC_parse+1;
6101 endbrace = strchr(RExC_parse, '}');
6102 if ( ! endbrace ) {
6103 RExC_parse++;
6104 vFAIL("Missing right brace on \\N{}");
6105 }
6106 RExC_parse = endbrace + 1;
6107
6108
6109 /* RExC_parse points at the beginning brace,
6110 endbrace points at the last */
6111 if ( name[0]=='U' && name[1]=='+' ) {
6112 /* its a "unicode hex" notation {U+89AB} */
6113 I32 fl = PERL_SCAN_ALLOW_UNDERSCORES
6114 | PERL_SCAN_DISALLOW_PREFIX
6115 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
6116 UV cp;
196f1508 6117 len = (STRLEN)(endbrace - name - 2);
fc8cd66c 6118 cp = grok_hex(name + 2, &len, &fl, NULL);
196f1508 6119 if ( len != (STRLEN)(endbrace - name - 2) ) {
fc8cd66c
YO
6120 cp = 0xFFFD;
6121 }
6122 if (cp > 0xff)
6123 RExC_utf8 = 1;
6124 if ( valuep ) {
6125 *valuep = cp;
6126 return NULL;
6127 }
6128 sv_str= Perl_newSVpvf_nocontext("%c",(int)cp);
6129 } else {
6130 /* fetch the charnames handler for this scope */
6131 HV * const table = GvHV(PL_hintgv);
6132 SV **cvp= table ?
6133 hv_fetchs(table, "charnames", FALSE) :
6134 NULL;
6135 SV *cv= cvp ? *cvp : NULL;
6136 HE *he_str;
6137 int count;
6138 /* create an SV with the name as argument */
6139 sv_name = newSVpvn(name, endbrace - name);
6140
6141 if (!table || !(PL_hints & HINT_LOCALIZE_HH)) {
6142 vFAIL2("Constant(\\N{%s}) unknown: "
6143 "(possibly a missing \"use charnames ...\")",
6144 SvPVX(sv_name));
6145 }
6146 if (!cvp || !SvOK(*cvp)) { /* when $^H{charnames} = undef; */
6147 vFAIL2("Constant(\\N{%s}): "
6148 "$^H{charnames} is not defined",SvPVX(sv_name));
6149 }
6150
6151
6152
6153 if (!RExC_charnames) {
6154 /* make sure our cache is allocated */
6155 RExC_charnames = newHV();
6bda09f9 6156 sv_2mortal((SV*)RExC_charnames);
fc8cd66c
YO
6157 }
6158 /* see if we have looked this one up before */
6159 he_str = hv_fetch_ent( RExC_charnames, sv_name, 0, 0 );
6160 if ( he_str ) {
6161 sv_str = HeVAL(he_str);
6162 cached = 1;
6163 } else {
6164 dSP ;
6165
6166 ENTER ;
6167 SAVETMPS ;
6168 PUSHMARK(SP) ;
6169
6170 XPUSHs(sv_name);
6171
6172 PUTBACK ;
6173
6174 count= call_sv(cv, G_SCALAR);
6175
6176 if (count == 1) { /* XXXX is this right? dmq */
6177 sv_str = POPs;
6178 SvREFCNT_inc_simple_void(sv_str);
6179 }
6180
6181 SPAGAIN ;
6182 PUTBACK ;
6183 FREETMPS ;
6184 LEAVE ;
6185
6186 if ( !sv_str || !SvOK(sv_str) ) {
6187 vFAIL2("Constant(\\N{%s}): Call to &{$^H{charnames}} "
6188 "did not return a defined value",SvPVX(sv_name));
6189 }
6190 if (hv_store_ent( RExC_charnames, sv_name, sv_str, 0))
6191 cached = 1;
6192 }
6193 }
6194 if (valuep) {
6195 char *p = SvPV(sv_str, len);
6196 if (len) {
6197 STRLEN numlen = 1;
6198 if ( SvUTF8(sv_str) ) {
196f1508 6199 *valuep = utf8_to_uvchr((U8*)p, &numlen);
fc8cd66c
YO
6200 if (*valuep > 0x7F)
6201 RExC_utf8 = 1;
6202 /* XXXX
6203 We have to turn on utf8 for high bit chars otherwise
6204 we get failures with
6205
6206 "ss" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6207 "SS" =~ /[\N{LATIN SMALL LETTER SHARP S}]/i
6208
6209 This is different from what \x{} would do with the same
6210 codepoint, where the condition is > 0xFF.
6211 - dmq
6212 */
6213
6214
6215 } else {
6216 *valuep = (UV)*p;
6217 /* warn if we havent used the whole string? */
6218 }
6219 if (numlen<len && SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6220 vWARN2(RExC_parse,
6221 "Ignoring excess chars from \\N{%s} in character class",
6222 SvPVX(sv_name)
6223 );
6224 }
6225 } else if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6226 vWARN2(RExC_parse,
6227 "Ignoring zero length \\N{%s} in character class",
6228 SvPVX(sv_name)
6229 );
6230 }
6231 if (sv_name)
6232 SvREFCNT_dec(sv_name);
6233 if (!cached)
6234 SvREFCNT_dec(sv_str);
6235 return len ? NULL : (regnode *)&len;
6236 } else if(SvCUR(sv_str)) {
6237
6238 char *s;
6239 char *p, *pend;
6240 STRLEN charlen = 1;
d008bc60 6241#ifdef DEBUGGING
fc8cd66c 6242 char * parse_start = name-3; /* needed for the offsets */
d008bc60 6243#endif
fc8cd66c
YO
6244 GET_RE_DEBUG_FLAGS_DECL; /* needed for the offsets */
6245
6246 ret = reg_node(pRExC_state,
6247 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
6248 s= STRING(ret);
6249
6250 if ( RExC_utf8 && !SvUTF8(sv_str) ) {
6251 sv_utf8_upgrade(sv_str);
6252 } else if ( !RExC_utf8 && SvUTF8(sv_str) ) {
6253 RExC_utf8= 1;
6254 }
6255
6256 p = SvPV(sv_str, len);
6257 pend = p + len;
6258 /* len is the length written, charlen is the size the char read */
6259 for ( len = 0; p < pend; p += charlen ) {
6260 if (UTF) {
196f1508 6261 UV uvc = utf8_to_uvchr((U8*)p, &charlen);
fc8cd66c
YO
6262 if (FOLD) {
6263 STRLEN foldlen,numlen;
6264 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
6265 uvc = toFOLD_uni(uvc, tmpbuf, &foldlen);
6266 /* Emit all the Unicode characters. */
6267
6268 for (foldbuf = tmpbuf;
6269 foldlen;
6270 foldlen -= numlen)
6271 {
6272 uvc = utf8_to_uvchr(foldbuf, &numlen);
6273 if (numlen > 0) {
6274 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6275 s += unilen;
6276 len += unilen;
6277 /* In EBCDIC the numlen
6278 * and unilen can differ. */
6279 foldbuf += numlen;
6280 if (numlen >= foldlen)
6281 break;
6282 }
6283 else
6284 break; /* "Can't happen." */
6285 }
6286 } else {
6287 const STRLEN unilen = reguni(pRExC_state, uvc, s);
6288 if (unilen > 0) {
6289 s += unilen;
6290 len += unilen;
6291 }
6292 }
6293 } else {
6294 len++;
6295 REGC(*p, s++);
6296 }
6297 }
6298 if (SIZE_ONLY) {
6299 RExC_size += STR_SZ(len);
6300 } else {
6301 STR_LEN(ret) = len;
6302 RExC_emit += STR_SZ(len);
6303 }
6304 Set_Node_Cur_Length(ret); /* MJD */
6305 RExC_parse--;
6306 nextchar(pRExC_state);
6307 } else {
6308 ret = reg_node(pRExC_state,NOTHING);
6309 }
6310 if (!cached) {
6311 SvREFCNT_dec(sv_str);
6312 }
6313 if (sv_name) {
6314 SvREFCNT_dec(sv_name);
6315 }
6316 return ret;
6317
6318}
6319
6320
9e08bc66
TS
6321/*
6322 * reg_recode
6323 *
6324 * It returns the code point in utf8 for the value in *encp.
6325 * value: a code value in the source encoding
6326 * encp: a pointer to an Encode object
6327 *
6328 * If the result from Encode is not a single character,
6329 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
6330 */
6331STATIC UV
6332S_reg_recode(pTHX_ const char value, SV **encp)
6333{
6334 STRLEN numlen = 1;
6335 SV * const sv = sv_2mortal(newSVpvn(&value, numlen));
6336 const char * const s = encp && *encp ? sv_recode_to_utf8(sv, *encp)
6337 : SvPVX(sv);
6338 const STRLEN newlen = SvCUR(sv);
6339 UV uv = UNICODE_REPLACEMENT;
6340
6341 if (newlen)
6342 uv = SvUTF8(sv)
6343 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
6344 : *(U8*)s;
6345
6346 if (!newlen || numlen != newlen) {
6347 uv = UNICODE_REPLACEMENT;
6348 if (encp)
6349 *encp = NULL;
6350 }
6351 return uv;
6352}
6353
fc8cd66c 6354
a687059c
LW
6355/*
6356 - regatom - the lowest level
ee9b8eae
YO
6357
6358 Try to identify anything special at the start of the pattern. If there
6359 is, then handle it as required. This may involve generating a single regop,
6360 such as for an assertion; or it may involve recursing, such as to
6361 handle a () structure.
6362
6363 If the string doesn't start with something special then we gobble up
6364 as much literal text as we can.
6365
6366 Once we have been able to handle whatever type of thing started the
6367 sequence, we return.
6368
6369 Note: we have to be careful with escapes, as they can be both literal
6370 and special, and in the case of \10 and friends can either, depending
6371 on context. Specifically there are two seperate switches for handling
6372 escape sequences, with the one for handling literal escapes requiring
6373 a dummy entry for all of the special escapes that are actually handled
6374 by the other.
6375*/
6376
76e3520e 6377STATIC regnode *
3dab1dad 6378S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 6379{
97aff369 6380 dVAR;
cbbf8932 6381 register regnode *ret = NULL;
a0d0e21e 6382 I32 flags;
45948336 6383 char *parse_start = RExC_parse;
3dab1dad
YO
6384 GET_RE_DEBUG_FLAGS_DECL;
6385 DEBUG_PARSE("atom");
a0d0e21e
LW
6386 *flagp = WORST; /* Tentatively. */
6387
ee9b8eae 6388
a0d0e21e 6389tryagain:
830247a4 6390 switch (*RExC_parse) {
a0d0e21e 6391 case '^':
830247a4
IZ
6392 RExC_seen_zerolen++;
6393 nextchar(pRExC_state);
bbe252da 6394 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6395 ret = reg_node(pRExC_state, MBOL);
bbe252da 6396 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6397 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6398 else
830247a4 6399 ret = reg_node(pRExC_state, BOL);
fac92740 6400 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6401 break;
6402 case '$':
830247a4 6403 nextchar(pRExC_state);
b81d288d 6404 if (*RExC_parse)
830247a4 6405 RExC_seen_zerolen++;
bbe252da 6406 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 6407 ret = reg_node(pRExC_state, MEOL);
bbe252da 6408 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 6409 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6410 else
830247a4 6411 ret = reg_node(pRExC_state, EOL);
fac92740 6412 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6413 break;
6414 case '.':
830247a4 6415 nextchar(pRExC_state);
bbe252da 6416 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
6417 ret = reg_node(pRExC_state, SANY);
6418 else
6419 ret = reg_node(pRExC_state, REG_ANY);
6420 *flagp |= HASWIDTH|SIMPLE;
830247a4 6421 RExC_naughty++;
fac92740 6422 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
6423 break;
6424 case '[':
b45f050a 6425 {
3dab1dad
YO
6426 char * const oregcomp_parse = ++RExC_parse;
6427 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
6428 if (*RExC_parse != ']') {
6429 RExC_parse = oregcomp_parse;
b45f050a
JF
6430 vFAIL("Unmatched [");
6431 }
830247a4 6432 nextchar(pRExC_state);
a0d0e21e 6433 *flagp |= HASWIDTH|SIMPLE;
fac92740 6434 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 6435 break;
b45f050a 6436 }
a0d0e21e 6437 case '(':
830247a4 6438 nextchar(pRExC_state);
3dab1dad 6439 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 6440 if (ret == NULL) {
bf93d4cc 6441 if (flags & TRYAGAIN) {
830247a4 6442 if (RExC_parse == RExC_end) {
bf93d4cc
GS
6443 /* Make parent create an empty node if needed. */
6444 *flagp |= TRYAGAIN;
6445 return(NULL);
6446 }
a0d0e21e 6447 goto tryagain;
bf93d4cc 6448 }
a0d0e21e
LW
6449 return(NULL);
6450 }
a3b492c3 6451 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
a0d0e21e
LW
6452 break;
6453 case '|':
6454 case ')':
6455 if (flags & TRYAGAIN) {
6456 *flagp |= TRYAGAIN;
6457 return NULL;
6458 }
b45f050a 6459 vFAIL("Internal urp");
a0d0e21e
LW
6460 /* Supposed to be caught earlier. */
6461 break;
85afd4ae 6462 case '{':
830247a4
IZ
6463 if (!regcurly(RExC_parse)) {
6464 RExC_parse++;
85afd4ae
CS
6465 goto defchar;
6466 }
6467 /* FALL THROUGH */
a0d0e21e
LW
6468 case '?':
6469 case '+':
6470 case '*':
830247a4 6471 RExC_parse++;
b45f050a 6472 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
6473 break;
6474 case '\\':
ee9b8eae
YO
6475 /* Special Escapes
6476
6477 This switch handles escape sequences that resolve to some kind
6478 of special regop and not to literal text. Escape sequnces that
6479 resolve to literal text are handled below in the switch marked
6480 "Literal Escapes".
6481
6482 Every entry in this switch *must* have a corresponding entry
6483 in the literal escape switch. However, the opposite is not
6484 required, as the default for this switch is to jump to the
6485 literal text handling code.
6486 */
830247a4 6487 switch (*++RExC_parse) {
ee9b8eae 6488 /* Special Escapes */
a0d0e21e 6489 case 'A':
830247a4
IZ
6490 RExC_seen_zerolen++;
6491 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 6492 *flagp |= SIMPLE;
ee9b8eae 6493 goto finish_meta_pat;
a0d0e21e 6494 case 'G':
830247a4
IZ
6495 ret = reg_node(pRExC_state, GPOS);
6496 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 6497 *flagp |= SIMPLE;
ee9b8eae
YO
6498 goto finish_meta_pat;
6499 case 'K':
6500 RExC_seen_zerolen++;
6501 ret = reg_node(pRExC_state, KEEPS);
6502 *flagp |= SIMPLE;
6503 goto finish_meta_pat;
a0d0e21e 6504 case 'Z':
830247a4 6505 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 6506 *flagp |= SIMPLE;
a1917ab9 6507 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 6508 goto finish_meta_pat;
b85d18e9 6509 case 'z':
830247a4 6510 ret = reg_node(pRExC_state, EOS);
b85d18e9 6511 *flagp |= SIMPLE;
830247a4 6512 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 6513 goto finish_meta_pat;
4a2d328f 6514 case 'C':
f33976b4
DB
6515 ret = reg_node(pRExC_state, CANY);
6516 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 6517 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6518 goto finish_meta_pat;
a0ed51b3 6519 case 'X':
830247a4 6520 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 6521 *flagp |= HASWIDTH;
ee9b8eae 6522 goto finish_meta_pat;
a0d0e21e 6523 case 'w':
eb160463 6524 ret = reg_node(pRExC_state, (U8)(LOC ? ALNUML : ALNUM));
a0d0e21e 6525 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6526 goto finish_meta_pat;
a0d0e21e 6527 case 'W':
eb160463 6528 ret = reg_node(pRExC_state, (U8)(LOC ? NALNUML : NALNUM));
a0d0e21e 6529 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6530 goto finish_meta_pat;
a0d0e21e 6531 case 'b':
830247a4
IZ
6532 RExC_seen_zerolen++;
6533 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6534 ret = reg_node(pRExC_state, (U8)(LOC ? BOUNDL : BOUND));
a0d0e21e 6535 *flagp |= SIMPLE;
ee9b8eae 6536 goto finish_meta_pat;
a0d0e21e 6537 case 'B':
830247a4
IZ
6538 RExC_seen_zerolen++;
6539 RExC_seen |= REG_SEEN_LOOKBEHIND;
eb160463 6540 ret = reg_node(pRExC_state, (U8)(LOC ? NBOUNDL : NBOUND));
a0d0e21e 6541 *flagp |= SIMPLE;
ee9b8eae 6542 goto finish_meta_pat;
a0d0e21e 6543 case 's':
eb160463 6544 ret = reg_node(pRExC_state, (U8)(LOC ? SPACEL : SPACE));
a0d0e21e 6545 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6546 goto finish_meta_pat;
a0d0e21e 6547 case 'S':
eb160463 6548 ret = reg_node(pRExC_state, (U8)(LOC ? NSPACEL : NSPACE));
a0d0e21e 6549 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6550 goto finish_meta_pat;
a0d0e21e 6551 case 'd':
ffc61ed2 6552 ret = reg_node(pRExC_state, DIGIT);
a0d0e21e 6553 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 6554 goto finish_meta_pat;
a0d0e21e 6555 case 'D':
ffc61ed2 6556 ret = reg_node(pRExC_state, NDIGIT);
a0d0e21e 6557 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
6558 goto finish_meta_pat;
6559 case 'v':
6560 ret = reganode(pRExC_state, PRUNE, 0);
6561 ret->flags = 1;
6562 *flagp |= SIMPLE;
6563 goto finish_meta_pat;
6564 case 'V':
6565 ret = reganode(pRExC_state, SKIP, 0);
6566 ret->flags = 1;
6567 *flagp |= SIMPLE;
6568 finish_meta_pat:
830247a4 6569 nextchar(pRExC_state);
fac92740 6570 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 6571 break;
a14b48bc
LW
6572 case 'p':
6573 case 'P':
3568d838 6574 {
3dab1dad 6575 char* const oldregxend = RExC_end;
d008bc60 6576#ifdef DEBUGGING
ccb2c380 6577 char* parse_start = RExC_parse - 2;
d008bc60 6578#endif
a14b48bc 6579
830247a4 6580 if (RExC_parse[1] == '{') {
3568d838 6581 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
6582 RExC_end = strchr(RExC_parse, '}');
6583 if (!RExC_end) {
3dab1dad 6584 const U8 c = (U8)*RExC_parse;
830247a4
IZ
6585 RExC_parse += 2;
6586 RExC_end = oldregxend;
0da60cf5 6587 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 6588 }
830247a4 6589 RExC_end++;
a14b48bc 6590 }
af6f566e 6591 else {
830247a4 6592 RExC_end = RExC_parse + 2;
af6f566e
HS
6593 if (RExC_end > oldregxend)
6594 RExC_end = oldregxend;
6595 }
830247a4 6596 RExC_parse--;
a14b48bc 6597
3dab1dad 6598 ret = regclass(pRExC_state,depth+1);
a14b48bc 6599
830247a4
IZ
6600 RExC_end = oldregxend;
6601 RExC_parse--;
ccb2c380
MP
6602
6603 Set_Node_Offset(ret, parse_start + 2);
6604 Set_Node_Cur_Length(ret);
830247a4 6605 nextchar(pRExC_state);
a14b48bc
LW
6606 *flagp |= HASWIDTH|SIMPLE;
6607 }
6608 break;
fc8cd66c
YO
6609 case 'N':
6610 /* Handle \N{NAME} here and not below because it can be
6611 multicharacter. join_exact() will join them up later on.
6612 Also this makes sure that things like /\N{BLAH}+/ and
6613 \N{BLAH} being multi char Just Happen. dmq*/
6614 ++RExC_parse;
6615 ret= reg_namedseq(pRExC_state, NULL);
6616 break;
0a4db386 6617 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 6618 parse_named_seq:
81714fb9
YO
6619 {
6620 char ch= RExC_parse[1];
1f1031fe
YO
6621 if (ch != '<' && ch != '\'' && ch != '{') {
6622 RExC_parse++;
6623 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 6624 } else {
1f1031fe
YO
6625 /* this pretty much dupes the code for (?P=...) in reg(), if
6626 you change this make sure you change that */
81714fb9 6627 char* name_start = (RExC_parse += 2);
2eccd3b2 6628 U32 num = 0;
0a4db386
YO
6629 SV *sv_dat = reg_scan_name(pRExC_state,
6630 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 6631 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 6632 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
6633 vFAIL2("Sequence %.3s... not terminated",parse_start);
6634
6635 if (!SIZE_ONLY) {
6636 num = add_data( pRExC_state, 1, "S" );
6637 RExC_rxi->data->data[num]=(void*)sv_dat;
6638 SvREFCNT_inc(sv_dat);
6639 }
6640
81714fb9
YO
6641 RExC_sawback = 1;
6642 ret = reganode(pRExC_state,
6643 (U8)(FOLD ? (LOC ? NREFFL : NREFF) : NREF),
6644 num);
6645 *flagp |= HASWIDTH;
1f1031fe 6646
81714fb9
YO
6647 /* override incorrect value set in reganode MJD */
6648 Set_Node_Offset(ret, parse_start+1);
6649 Set_Node_Cur_Length(ret); /* MJD */
6650 nextchar(pRExC_state);
1f1031fe 6651
81714fb9
YO
6652 }
6653 break;
1f1031fe 6654 }
2bf803e2 6655 case 'g':
a0d0e21e
LW
6656 case '1': case '2': case '3': case '4':
6657 case '5': case '6': case '7': case '8': case '9':
6658 {
c74340f9 6659 I32 num;
2bf803e2
YO
6660 bool isg = *RExC_parse == 'g';
6661 bool isrel = 0;
6662 bool hasbrace = 0;
6663 if (isg) {
c74340f9 6664 RExC_parse++;
2bf803e2
YO
6665 if (*RExC_parse == '{') {
6666 RExC_parse++;
6667 hasbrace = 1;
6668 }
6669 if (*RExC_parse == '-') {
6670 RExC_parse++;
6671 isrel = 1;
6672 }
1f1031fe
YO
6673 if (hasbrace && !isDIGIT(*RExC_parse)) {
6674 if (isrel) RExC_parse--;
6675 RExC_parse -= 2;
6676 goto parse_named_seq;
6677 } }
c74340f9
YO
6678 num = atoi(RExC_parse);
6679 if (isrel) {
5624f11d 6680 num = RExC_npar - num;
c74340f9
YO
6681 if (num < 1)
6682 vFAIL("Reference to nonexistent or unclosed group");
6683 }
2bf803e2 6684 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
6685 goto defchar;
6686 else {
3dab1dad 6687 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
6688 while (isDIGIT(*RExC_parse))
6689 RExC_parse++;
1f1031fe
YO
6690 if (parse_start == RExC_parse - 1)
6691 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
6692 if (hasbrace) {
6693 if (*RExC_parse != '}')
6694 vFAIL("Unterminated \\g{...} pattern");
6695 RExC_parse++;
6696 }
c74340f9
YO
6697 if (!SIZE_ONLY) {
6698 if (num > (I32)RExC_rx->nparens)
6699 vFAIL("Reference to nonexistent group");
c74340f9 6700 }
830247a4 6701 RExC_sawback = 1;
eb160463
GS
6702 ret = reganode(pRExC_state,
6703 (U8)(FOLD ? (LOC ? REFFL : REFF) : REF),
6704 num);
a0d0e21e 6705 *flagp |= HASWIDTH;
2af232bd 6706
fac92740 6707 /* override incorrect value set in reganode MJD */
2af232bd 6708 Set_Node_Offset(ret, parse_start+1);
fac92740 6709 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
6710 RExC_parse--;
6711 nextchar(pRExC_state);
a0d0e21e
LW
6712 }
6713 }
6714 break;
6715 case '\0':
830247a4 6716 if (RExC_parse >= RExC_end)
b45f050a 6717 FAIL("Trailing \\");
a0d0e21e
LW
6718 /* FALL THROUGH */
6719 default:
a0288114 6720 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 6721 back into the quick-grab loop below */
45948336 6722 parse_start--;
a0d0e21e
LW
6723 goto defchar;
6724 }
6725 break;
4633a7c4
LW
6726
6727 case '#':
bbe252da 6728 if (RExC_flags & RXf_PMf_EXTENDED) {
bcdf7404 6729 if ( reg_skipcomment( pRExC_state ) )
4633a7c4
LW
6730 goto tryagain;
6731 }
6732 /* FALL THROUGH */
6733
a0d0e21e 6734 default: {
ba210ebe 6735 register STRLEN len;
58ae7d3f 6736 register UV ender;
a0d0e21e 6737 register char *p;
3dab1dad 6738 char *s;
80aecb99 6739 STRLEN foldlen;
89ebb4a3 6740 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
f06dbbb7
JH
6741
6742 parse_start = RExC_parse - 1;
a0d0e21e 6743
830247a4 6744 RExC_parse++;
a0d0e21e
LW
6745
6746 defchar:
58ae7d3f 6747 ender = 0;
eb160463
GS
6748 ret = reg_node(pRExC_state,
6749 (U8)(FOLD ? (LOC ? EXACTFL : EXACTF) : EXACT));
cd439c50 6750 s = STRING(ret);
830247a4
IZ
6751 for (len = 0, p = RExC_parse - 1;
6752 len < 127 && p < RExC_end;
a0d0e21e
LW
6753 len++)
6754 {
3dab1dad 6755 char * const oldp = p;
5b5a24f7 6756
bbe252da 6757 if (RExC_flags & RXf_PMf_EXTENDED)
bcdf7404 6758 p = regwhite( pRExC_state, p );
a0d0e21e
LW
6759 switch (*p) {
6760 case '^':
6761 case '$':
6762 case '.':
6763 case '[':
6764 case '(':
6765 case ')':
6766 case '|':
6767 goto loopdone;
6768 case '\\':
ee9b8eae
YO
6769 /* Literal Escapes Switch
6770
6771 This switch is meant to handle escape sequences that
6772 resolve to a literal character.
6773
6774 Every escape sequence that represents something
6775 else, like an assertion or a char class, is handled
6776 in the switch marked 'Special Escapes' above in this
6777 routine, but also has an entry here as anything that
6778 isn't explicitly mentioned here will be treated as
6779 an unescaped equivalent literal.
6780 */
6781
a0d0e21e 6782 switch (*++p) {
ee9b8eae
YO
6783 /* These are all the special escapes. */
6784 case 'A': /* Start assertion */
6785 case 'b': case 'B': /* Word-boundary assertion*/
6786 case 'C': /* Single char !DANGEROUS! */
6787 case 'd': case 'D': /* digit class */
6788 case 'g': case 'G': /* generic-backref, pos assertion */
6789 case 'k': case 'K': /* named backref, keep marker */
6790 case 'N': /* named char sequence */
6791 case 'p': case 'P': /* unicode property */
6792 case 's': case 'S': /* space class */
6793 case 'v': case 'V': /* (*PRUNE) and (*SKIP) */
6794 case 'w': case 'W': /* word class */
6795 case 'X': /* eXtended Unicode "combining character sequence" */
6796 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
6797 --p;
6798 goto loopdone;
ee9b8eae
YO
6799
6800 /* Anything after here is an escape that resolves to a
6801 literal. (Except digits, which may or may not)
6802 */
a0d0e21e
LW
6803 case 'n':
6804 ender = '\n';
6805 p++;
a687059c 6806 break;
a0d0e21e
LW
6807 case 'r':
6808 ender = '\r';
6809 p++;
a687059c 6810 break;
a0d0e21e
LW
6811 case 't':
6812 ender = '\t';
6813 p++;
a687059c 6814 break;
a0d0e21e
LW
6815 case 'f':
6816 ender = '\f';
6817 p++;
a687059c 6818 break;
a0d0e21e 6819 case 'e':
c7f1f016 6820 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 6821 p++;
a687059c 6822 break;
a0d0e21e 6823 case 'a':
c7f1f016 6824 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 6825 p++;
a687059c 6826 break;
a0d0e21e 6827 case 'x':
a0ed51b3 6828 if (*++p == '{') {
1df70142 6829 char* const e = strchr(p, '}');
b81d288d 6830
b45f050a 6831 if (!e) {
830247a4 6832 RExC_parse = p + 1;
b45f050a
JF
6833 vFAIL("Missing right brace on \\x{}");
6834 }
de5f0749 6835 else {
a4c04bdc
NC
6836 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
6837 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 6838 STRLEN numlen = e - p - 1;
53305cf1 6839 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028
JH
6840 if (ender > 0xff)
6841 RExC_utf8 = 1;
a0ed51b3
LW
6842 p = e + 1;
6843 }
a0ed51b3
LW
6844 }
6845 else {
a4c04bdc 6846 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 6847 STRLEN numlen = 2;
53305cf1 6848 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
6849 p += numlen;
6850 }
9e08bc66
TS
6851 if (PL_encoding && ender < 0x100)
6852 goto recode_encoding;
a687059c 6853 break;
a0d0e21e
LW
6854 case 'c':
6855 p++;
bbce6d69 6856 ender = UCHARAT(p++);
6857 ender = toCTRL(ender);
a687059c 6858 break;
a0d0e21e
LW
6859 case '0': case '1': case '2': case '3':case '4':
6860 case '5': case '6': case '7': case '8':case '9':
6861 if (*p == '0' ||
830247a4 6862 (isDIGIT(p[1]) && atoi(p) >= RExC_npar) ) {
53305cf1 6863 I32 flags = 0;
1df70142 6864 STRLEN numlen = 3;
53305cf1 6865 ender = grok_oct(p, &numlen, &flags, NULL);
a0d0e21e
LW
6866 p += numlen;
6867 }
6868 else {
6869 --p;
6870 goto loopdone;
a687059c 6871 }
9e08bc66
TS
6872 if (PL_encoding && ender < 0x100)
6873 goto recode_encoding;
6874 break;
6875 recode_encoding:
6876 {
6877 SV* enc = PL_encoding;
6878 ender = reg_recode((const char)(U8)ender, &enc);
6879 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
6880 vWARN(p, "Invalid escape in the specified encoding");
6881 RExC_utf8 = 1;
6882 }
a687059c 6883 break;
a0d0e21e 6884 case '\0':
830247a4 6885 if (p >= RExC_end)
b45f050a 6886 FAIL("Trailing \\");
a687059c 6887 /* FALL THROUGH */
a0d0e21e 6888 default:
041457d9 6889 if (!SIZE_ONLY&& isALPHA(*p) && ckWARN(WARN_REGEXP))
4193bef7 6890 vWARN2(p + 1, "Unrecognized escape \\%c passed through", UCHARAT(p));
a0ed51b3 6891 goto normal_default;
a0d0e21e
LW
6892 }
6893 break;
a687059c 6894 default:
a0ed51b3 6895 normal_default:
fd400ab9 6896 if (UTF8_IS_START(*p) && UTF) {
1df70142 6897 STRLEN numlen;
5e12f4fb 6898 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 6899 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
6900 p += numlen;
6901 }
6902 else
6903 ender = *p++;
a0d0e21e 6904 break;
a687059c 6905 }
bcdf7404
YO
6906 if ( RExC_flags & RXf_PMf_EXTENDED)
6907 p = regwhite( pRExC_state, p );
60a8b682
JH
6908 if (UTF && FOLD) {
6909 /* Prime the casefolded buffer. */
ac7e0132 6910 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
60a8b682 6911 }
bcdf7404 6912 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
a0d0e21e
LW
6913 if (len)
6914 p = oldp;
16ea2a2e 6915 else if (UTF) {
80aecb99 6916 if (FOLD) {
60a8b682 6917 /* Emit all the Unicode characters. */
1df70142 6918 STRLEN numlen;
80aecb99
JH
6919 for (foldbuf = tmpbuf;
6920 foldlen;
6921 foldlen -= numlen) {
6922 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6923 if (numlen > 0) {
71207a34 6924 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6925 s += unilen;
6926 len += unilen;
6927 /* In EBCDIC the numlen
6928 * and unilen can differ. */
9dc45d57 6929 foldbuf += numlen;
47654450
JH
6930 if (numlen >= foldlen)
6931 break;
9dc45d57
JH
6932 }
6933 else
6934 break; /* "Can't happen." */
80aecb99
JH
6935 }
6936 }
6937 else {
71207a34 6938 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6939 if (unilen > 0) {
0ebc6274
JH
6940 s += unilen;
6941 len += unilen;
9dc45d57 6942 }
80aecb99 6943 }
a0ed51b3 6944 }
a0d0e21e
LW
6945 else {
6946 len++;
eb160463 6947 REGC((char)ender, s++);
a0d0e21e
LW
6948 }
6949 break;
a687059c 6950 }
16ea2a2e 6951 if (UTF) {
80aecb99 6952 if (FOLD) {
60a8b682 6953 /* Emit all the Unicode characters. */
1df70142 6954 STRLEN numlen;
80aecb99
JH
6955 for (foldbuf = tmpbuf;
6956 foldlen;
6957 foldlen -= numlen) {
6958 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 6959 if (numlen > 0) {
71207a34 6960 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
6961 len += unilen;
6962 s += unilen;
6963 /* In EBCDIC the numlen
6964 * and unilen can differ. */
9dc45d57 6965 foldbuf += numlen;
47654450
JH
6966 if (numlen >= foldlen)
6967 break;
9dc45d57
JH
6968 }
6969 else
6970 break;
80aecb99
JH
6971 }
6972 }
6973 else {
71207a34 6974 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 6975 if (unilen > 0) {
0ebc6274
JH
6976 s += unilen;
6977 len += unilen;
9dc45d57 6978 }
80aecb99
JH
6979 }
6980 len--;
a0ed51b3
LW
6981 }
6982 else
eb160463 6983 REGC((char)ender, s++);
a0d0e21e
LW
6984 }
6985 loopdone:
830247a4 6986 RExC_parse = p - 1;
fac92740 6987 Set_Node_Cur_Length(ret); /* MJD */
830247a4 6988 nextchar(pRExC_state);
793db0cb
JH
6989 {
6990 /* len is STRLEN which is unsigned, need to copy to signed */
6991 IV iv = len;
6992 if (iv < 0)
6993 vFAIL("Internal disaster");
6994 }
a0d0e21e
LW
6995 if (len > 0)
6996 *flagp |= HASWIDTH;
090f7165 6997 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 6998 *flagp |= SIMPLE;
3dab1dad 6999
cd439c50 7000 if (SIZE_ONLY)
830247a4 7001 RExC_size += STR_SZ(len);
3dab1dad
YO
7002 else {
7003 STR_LEN(ret) = len;
830247a4 7004 RExC_emit += STR_SZ(len);
07be1b83 7005 }
3dab1dad 7006 }
a0d0e21e
LW
7007 break;
7008 }
a687059c 7009
a0d0e21e 7010 return(ret);
a687059c
LW
7011}
7012
873ef191 7013STATIC char *
bcdf7404 7014S_regwhite( RExC_state_t *pRExC_state, char *p )
5b5a24f7 7015{
bcdf7404 7016 const char *e = RExC_end;
5b5a24f7
CS
7017 while (p < e) {
7018 if (isSPACE(*p))
7019 ++p;
7020 else if (*p == '#') {
bcdf7404 7021 bool ended = 0;
5b5a24f7 7022 do {
bcdf7404
YO
7023 if (*p++ == '\n') {
7024 ended = 1;
7025 break;
7026 }
7027 } while (p < e);
7028 if (!ended)
7029 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
5b5a24f7
CS
7030 }
7031 else
7032 break;
7033 }
7034 return p;
7035}
7036
b8c5462f
JH
7037/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
7038 Character classes ([:foo:]) can also be negated ([:^foo:]).
7039 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
7040 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 7041 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
7042
7043#define POSIXCC_DONE(c) ((c) == ':')
7044#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
7045#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
7046
b8c5462f 7047STATIC I32
830247a4 7048S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 7049{
97aff369 7050 dVAR;
936ed897 7051 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 7052
830247a4 7053 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 7054 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 7055 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 7056 const char c = UCHARAT(RExC_parse);
097eb12c 7057 char* const s = RExC_parse++;
b81d288d 7058
9a86a77b 7059 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
7060 RExC_parse++;
7061 if (RExC_parse == RExC_end)
620e46c5 7062 /* Grandfather lone [:, [=, [. */
830247a4 7063 RExC_parse = s;
620e46c5 7064 else {
3dab1dad 7065 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
7066 assert(*t == c);
7067
9a86a77b 7068 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 7069 const char *posixcc = s + 1;
830247a4 7070 RExC_parse++; /* skip over the ending ] */
3dab1dad 7071
b8c5462f 7072 if (*s == ':') {
1df70142
AL
7073 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
7074 const I32 skip = t - posixcc;
80916619
NC
7075
7076 /* Initially switch on the length of the name. */
7077 switch (skip) {
7078 case 4:
3dab1dad
YO
7079 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
7080 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 7081 break;
80916619
NC
7082 case 5:
7083 /* Names all of length 5. */
7084 /* alnum alpha ascii blank cntrl digit graph lower
7085 print punct space upper */
7086 /* Offset 4 gives the best switch position. */
7087 switch (posixcc[4]) {
7088 case 'a':
3dab1dad
YO
7089 if (memEQ(posixcc, "alph", 4)) /* alpha */
7090 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
7091 break;
7092 case 'e':
3dab1dad
YO
7093 if (memEQ(posixcc, "spac", 4)) /* space */
7094 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
7095 break;
7096 case 'h':
3dab1dad
YO
7097 if (memEQ(posixcc, "grap", 4)) /* graph */
7098 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
7099 break;
7100 case 'i':
3dab1dad
YO
7101 if (memEQ(posixcc, "asci", 4)) /* ascii */
7102 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
7103 break;
7104 case 'k':
3dab1dad
YO
7105 if (memEQ(posixcc, "blan", 4)) /* blank */
7106 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
7107 break;
7108 case 'l':
3dab1dad
YO
7109 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
7110 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
7111 break;
7112 case 'm':
3dab1dad
YO
7113 if (memEQ(posixcc, "alnu", 4)) /* alnum */
7114 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
7115 break;
7116 case 'r':
3dab1dad
YO
7117 if (memEQ(posixcc, "lowe", 4)) /* lower */
7118 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
7119 else if (memEQ(posixcc, "uppe", 4)) /* upper */
7120 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
7121 break;
7122 case 't':
3dab1dad
YO
7123 if (memEQ(posixcc, "digi", 4)) /* digit */
7124 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
7125 else if (memEQ(posixcc, "prin", 4)) /* print */
7126 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
7127 else if (memEQ(posixcc, "punc", 4)) /* punct */
7128 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 7129 break;
b8c5462f
JH
7130 }
7131 break;
80916619 7132 case 6:
3dab1dad
YO
7133 if (memEQ(posixcc, "xdigit", 6))
7134 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
7135 break;
7136 }
80916619
NC
7137
7138 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
7139 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
7140 t - s - 1, s + 1);
80916619
NC
7141 assert (posixcc[skip] == ':');
7142 assert (posixcc[skip+1] == ']');
b45f050a 7143 } else if (!SIZE_ONLY) {
b8c5462f 7144 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 7145
830247a4 7146 /* adjust RExC_parse so the warning shows after
b45f050a 7147 the class closes */
9a86a77b 7148 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 7149 RExC_parse++;
b45f050a
JF
7150 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7151 }
b8c5462f
JH
7152 } else {
7153 /* Maternal grandfather:
7154 * "[:" ending in ":" but not in ":]" */
830247a4 7155 RExC_parse = s;
767d463e 7156 }
620e46c5
JH
7157 }
7158 }
7159
b8c5462f
JH
7160 return namedclass;
7161}
7162
7163STATIC void
830247a4 7164S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 7165{
97aff369 7166 dVAR;
3dab1dad 7167 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
7168 const char *s = RExC_parse;
7169 const char c = *s++;
b8c5462f 7170
3dab1dad 7171 while (isALNUM(*s))
b8c5462f
JH
7172 s++;
7173 if (*s && c == *s && s[1] == ']') {
cd84f5b2
RGS
7174 if (ckWARN(WARN_REGEXP))
7175 vWARN3(s+2,
7176 "POSIX syntax [%c %c] belongs inside character classes",
7177 c, c);
b45f050a
JF
7178
7179 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 7180 if (POSIXCC_NOTYET(c)) {
830247a4 7181 /* adjust RExC_parse so the error shows after
b45f050a 7182 the class closes */
9a86a77b 7183 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 7184 NOOP;
b45f050a
JF
7185 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
7186 }
b8c5462f
JH
7187 }
7188 }
620e46c5
JH
7189}
7190
7f6f358c 7191
89836f1f
YO
7192#define _C_C_T_(NAME,TEST,WORD) \
7193ANYOF_##NAME: \
7194 if (LOC) \
7195 ANYOF_CLASS_SET(ret, ANYOF_##NAME); \
7196 else { \
7197 for (value = 0; value < 256; value++) \
7198 if (TEST) \
7199 ANYOF_BITMAP_SET(ret, value); \
7200 } \
7201 yesno = '+'; \
7202 what = WORD; \
7203 break; \
7204case ANYOF_N##NAME: \
7205 if (LOC) \
7206 ANYOF_CLASS_SET(ret, ANYOF_N##NAME); \
7207 else { \
7208 for (value = 0; value < 256; value++) \
7209 if (!TEST) \
7210 ANYOF_BITMAP_SET(ret, value); \
7211 } \
7212 yesno = '!'; \
7213 what = WORD; \
7214 break
7215
7216
7f6f358c
YO
7217/*
7218 parse a class specification and produce either an ANYOF node that
89836f1f
YO
7219 matches the pattern or if the pattern matches a single char only and
7220 that char is < 256 and we are case insensitive then we produce an
7221 EXACT node instead.
7f6f358c 7222*/
89836f1f 7223
76e3520e 7224STATIC regnode *
3dab1dad 7225S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 7226{
97aff369 7227 dVAR;
9ef43ace 7228 register UV value = 0;
9a86a77b 7229 register UV nextvalue;
3568d838 7230 register IV prevvalue = OOB_UNICODE;
ffc61ed2 7231 register IV range = 0;
c277df42 7232 register regnode *ret;
ba210ebe 7233 STRLEN numlen;
ffc61ed2 7234 IV namedclass;
cbbf8932 7235 char *rangebegin = NULL;
936ed897 7236 bool need_class = 0;
c445ea15 7237 SV *listsv = NULL;
ffc61ed2 7238 UV n;
9e55ce06 7239 bool optimize_invert = TRUE;
cbbf8932 7240 AV* unicode_alternate = NULL;
1b2d223b
JH
7241#ifdef EBCDIC
7242 UV literal_endpoint = 0;
7243#endif
7f6f358c 7244 UV stored = 0; /* number of chars stored in the class */
ffc61ed2 7245
3dab1dad 7246 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 7247 case we need to change the emitted regop to an EXACT. */
07be1b83 7248 const char * orig_parse = RExC_parse;
72f13be8 7249 GET_RE_DEBUG_FLAGS_DECL;
76e84362
SH
7250#ifndef DEBUGGING
7251 PERL_UNUSED_ARG(depth);
7252#endif
72f13be8 7253
3dab1dad 7254 DEBUG_PARSE("clas");
7f6f358c
YO
7255
7256 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
7257 ret = reganode(pRExC_state, ANYOF, 0);
7258
7259 if (!SIZE_ONLY)
7260 ANYOF_FLAGS(ret) = 0;
7261
9a86a77b 7262 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
7263 RExC_naughty++;
7264 RExC_parse++;
7265 if (!SIZE_ONLY)
7266 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
7267 }
a0d0e21e 7268
73060fc4 7269 if (SIZE_ONLY) {
830247a4 7270 RExC_size += ANYOF_SKIP;
73060fc4
JH
7271 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
7272 }
936ed897 7273 else {
830247a4 7274 RExC_emit += ANYOF_SKIP;
936ed897
IZ
7275 if (FOLD)
7276 ANYOF_FLAGS(ret) |= ANYOF_FOLD;
7277 if (LOC)
7278 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
ffc61ed2 7279 ANYOF_BITMAP_ZERO(ret);
396482e1 7280 listsv = newSVpvs("# comment\n");
a0d0e21e 7281 }
b8c5462f 7282
9a86a77b
JH
7283 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7284
b938889d 7285 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 7286 checkposixcc(pRExC_state);
b8c5462f 7287
f064b6ad
HS
7288 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
7289 if (UCHARAT(RExC_parse) == ']')
7290 goto charclassloop;
ffc61ed2 7291
fc8cd66c 7292parseit:
9a86a77b 7293 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
7294
7295 charclassloop:
7296
7297 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
7298
73b437c8 7299 if (!range)
830247a4 7300 rangebegin = RExC_parse;
ffc61ed2 7301 if (UTF) {
5e12f4fb 7302 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 7303 RExC_end - RExC_parse,
9f7f3913 7304 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
7305 RExC_parse += numlen;
7306 }
7307 else
7308 value = UCHARAT(RExC_parse++);
7f6f358c 7309
9a86a77b
JH
7310 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
7311 if (value == '[' && POSIXCC(nextvalue))
830247a4 7312 namedclass = regpposixcc(pRExC_state, value);
620e46c5 7313 else if (value == '\\') {
ffc61ed2 7314 if (UTF) {
5e12f4fb 7315 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 7316 RExC_end - RExC_parse,
9f7f3913 7317 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
7318 RExC_parse += numlen;
7319 }
7320 else
7321 value = UCHARAT(RExC_parse++);
470c3474 7322 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 7323 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
7324 * be a problem later if we want switch on Unicode.
7325 * A similar issue a little bit later when switching on
7326 * namedclass. --jhi */
ffc61ed2 7327 switch ((I32)value) {
b8c5462f
JH
7328 case 'w': namedclass = ANYOF_ALNUM; break;
7329 case 'W': namedclass = ANYOF_NALNUM; break;
7330 case 's': namedclass = ANYOF_SPACE; break;
7331 case 'S': namedclass = ANYOF_NSPACE; break;
7332 case 'd': namedclass = ANYOF_DIGIT; break;
7333 case 'D': namedclass = ANYOF_NDIGIT; break;
fc8cd66c
YO
7334 case 'N': /* Handle \N{NAME} in class */
7335 {
7336 /* We only pay attention to the first char of
7337 multichar strings being returned. I kinda wonder
7338 if this makes sense as it does change the behaviour
7339 from earlier versions, OTOH that behaviour was broken
7340 as well. */
7341 UV v; /* value is register so we cant & it /grrr */
7342 if (reg_namedseq(pRExC_state, &v)) {
7343 goto parseit;
7344 }
7345 value= v;
7346 }
7347 break;
ffc61ed2
JH
7348 case 'p':
7349 case 'P':
3dab1dad
YO
7350 {
7351 char *e;
af6f566e 7352 if (RExC_parse >= RExC_end)
2a4859cd 7353 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 7354 if (*RExC_parse == '{') {
1df70142 7355 const U8 c = (U8)value;
ffc61ed2
JH
7356 e = strchr(RExC_parse++, '}');
7357 if (!e)
0da60cf5 7358 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
7359 while (isSPACE(UCHARAT(RExC_parse)))
7360 RExC_parse++;
7361 if (e == RExC_parse)
0da60cf5 7362 vFAIL2("Empty \\%c{}", c);
ffc61ed2 7363 n = e - RExC_parse;
ab13f0c7
JH
7364 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
7365 n--;
ffc61ed2
JH
7366 }
7367 else {
7368 e = RExC_parse;
7369 n = 1;
7370 }
7371 if (!SIZE_ONLY) {
ab13f0c7
JH
7372 if (UCHARAT(RExC_parse) == '^') {
7373 RExC_parse++;
7374 n--;
7375 value = value == 'p' ? 'P' : 'p'; /* toggle */
7376 while (isSPACE(UCHARAT(RExC_parse))) {
7377 RExC_parse++;
7378 n--;
7379 }
7380 }
097eb12c
AL
7381 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%.*s\n",
7382 (value=='p' ? '+' : '!'), (int)n, RExC_parse);
ffc61ed2
JH
7383 }
7384 RExC_parse = e + 1;
7385 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
f81125e2 7386 namedclass = ANYOF_MAX; /* no official name, but it's named */
3dab1dad 7387 }
f81125e2 7388 break;
b8c5462f
JH
7389 case 'n': value = '\n'; break;
7390 case 'r': value = '\r'; break;
7391 case 't': value = '\t'; break;
7392 case 'f': value = '\f'; break;
7393 case 'b': value = '\b'; break;
c7f1f016
NIS
7394 case 'e': value = ASCII_TO_NATIVE('\033');break;
7395 case 'a': value = ASCII_TO_NATIVE('\007');break;
b8c5462f 7396 case 'x':
ffc61ed2 7397 if (*RExC_parse == '{') {
a4c04bdc
NC
7398 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
7399 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 7400 char * const e = strchr(RExC_parse++, '}');
b81d288d 7401 if (!e)
ffc61ed2 7402 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
7403
7404 numlen = e - RExC_parse;
7405 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7406 RExC_parse = e + 1;
7407 }
7408 else {
a4c04bdc 7409 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
7410 numlen = 2;
7411 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
7412 RExC_parse += numlen;
7413 }
9e08bc66
TS
7414 if (PL_encoding && value < 0x100)
7415 goto recode_encoding;
b8c5462f
JH
7416 break;
7417 case 'c':
830247a4 7418 value = UCHARAT(RExC_parse++);
b8c5462f
JH
7419 value = toCTRL(value);
7420 break;
7421 case '0': case '1': case '2': case '3': case '4':
7422 case '5': case '6': case '7': case '8': case '9':
9e08bc66
TS
7423 {
7424 I32 flags = 0;
7425 numlen = 3;
7426 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
7427 RExC_parse += numlen;
7428 if (PL_encoding && value < 0x100)
7429 goto recode_encoding;
7430 break;
7431 }
7432 recode_encoding:
7433 {
7434 SV* enc = PL_encoding;
7435 value = reg_recode((const char)(U8)value, &enc);
7436 if (!enc && SIZE_ONLY && ckWARN(WARN_REGEXP))
7437 vWARN(RExC_parse,
7438 "Invalid escape in the specified encoding");
7439 break;
7440 }
1028017a 7441 default:
041457d9 7442 if (!SIZE_ONLY && isALPHA(value) && ckWARN(WARN_REGEXP))
ffc61ed2
JH
7443 vWARN2(RExC_parse,
7444 "Unrecognized escape \\%c in character class passed through",
7445 (int)value);
1028017a 7446 break;
b8c5462f 7447 }
ffc61ed2 7448 } /* end of \blah */
1b2d223b
JH
7449#ifdef EBCDIC
7450 else
7451 literal_endpoint++;
7452#endif
ffc61ed2
JH
7453
7454 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
7455
7456 if (!SIZE_ONLY && !need_class)
936ed897 7457 ANYOF_CLASS_ZERO(ret);
ffc61ed2 7458
936ed897 7459 need_class = 1;
ffc61ed2
JH
7460
7461 /* a bad range like a-\d, a-[:digit:] ? */
7462 if (range) {
73b437c8 7463 if (!SIZE_ONLY) {
afd78fd5 7464 if (ckWARN(WARN_REGEXP)) {
097eb12c 7465 const int w =
afd78fd5
JH
7466 RExC_parse >= rangebegin ?
7467 RExC_parse - rangebegin : 0;
830247a4 7468 vWARN4(RExC_parse,
b45f050a 7469 "False [] range \"%*.*s\"",
097eb12c 7470 w, w, rangebegin);
afd78fd5 7471 }
3568d838
JH
7472 if (prevvalue < 256) {
7473 ANYOF_BITMAP_SET(ret, prevvalue);
ffc61ed2
JH
7474 ANYOF_BITMAP_SET(ret, '-');
7475 }
7476 else {
7477 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
7478 Perl_sv_catpvf(aTHX_ listsv,
3568d838 7479 "%04"UVxf"\n%04"UVxf"\n", (UV)prevvalue, (UV) '-');
ffc61ed2 7480 }
b8c5462f 7481 }
ffc61ed2
JH
7482
7483 range = 0; /* this was not a true range */
73b437c8 7484 }
ffc61ed2 7485
89836f1f
YO
7486
7487
73b437c8 7488 if (!SIZE_ONLY) {
c49a72a9
NC
7489 const char *what = NULL;
7490 char yesno = 0;
7491
3568d838
JH
7492 if (namedclass > OOB_NAMEDCLASS)
7493 optimize_invert = FALSE;
e2962f66
JH
7494 /* Possible truncation here but in some 64-bit environments
7495 * the compiler gets heartburn about switch on 64-bit values.
7496 * A similar issue a little earlier when switching on value.
98f323fa 7497 * --jhi */
e2962f66 7498 switch ((I32)namedclass) {
89836f1f
YO
7499 case _C_C_T_(ALNUM, isALNUM(value), "Word");
7500 case _C_C_T_(ALNUMC, isALNUMC(value), "Alnum");
7501 case _C_C_T_(ALPHA, isALPHA(value), "Alpha");
7502 case _C_C_T_(BLANK, isBLANK(value), "Blank");
7503 case _C_C_T_(CNTRL, isCNTRL(value), "Cntrl");
7504 case _C_C_T_(GRAPH, isGRAPH(value), "Graph");
7505 case _C_C_T_(LOWER, isLOWER(value), "Lower");
7506 case _C_C_T_(PRINT, isPRINT(value), "Print");
7507 case _C_C_T_(PSXSPC, isPSXSPC(value), "Space");
7508 case _C_C_T_(PUNCT, isPUNCT(value), "Punct");
7509 case _C_C_T_(SPACE, isSPACE(value), "SpacePerl");
7510 case _C_C_T_(UPPER, isUPPER(value), "Upper");
7511 case _C_C_T_(XDIGIT, isXDIGIT(value), "XDigit");
73b437c8
JH
7512 case ANYOF_ASCII:
7513 if (LOC)
936ed897 7514 ANYOF_CLASS_SET(ret, ANYOF_ASCII);
73b437c8 7515 else {
c7f1f016 7516#ifndef EBCDIC
1ba5c669
JH
7517 for (value = 0; value < 128; value++)
7518 ANYOF_BITMAP_SET(ret, value);
7519#else /* EBCDIC */
ffbc6a93 7520 for (value = 0; value < 256; value++) {
3a3c4447
JH
7521 if (isASCII(value))
7522 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7523 }
1ba5c669 7524#endif /* EBCDIC */
73b437c8 7525 }
c49a72a9
NC
7526 yesno = '+';
7527 what = "ASCII";
73b437c8
JH
7528 break;
7529 case ANYOF_NASCII:
7530 if (LOC)
936ed897 7531 ANYOF_CLASS_SET(ret, ANYOF_NASCII);
73b437c8 7532 else {
c7f1f016 7533#ifndef EBCDIC
1ba5c669
JH
7534 for (value = 128; value < 256; value++)
7535 ANYOF_BITMAP_SET(ret, value);
7536#else /* EBCDIC */
ffbc6a93 7537 for (value = 0; value < 256; value++) {
3a3c4447
JH
7538 if (!isASCII(value))
7539 ANYOF_BITMAP_SET(ret, value);
ffbc6a93 7540 }
1ba5c669 7541#endif /* EBCDIC */
73b437c8 7542 }
c49a72a9
NC
7543 yesno = '!';
7544 what = "ASCII";
89836f1f 7545 break;
ffc61ed2
JH
7546 case ANYOF_DIGIT:
7547 if (LOC)
7548 ANYOF_CLASS_SET(ret, ANYOF_DIGIT);
7549 else {
7550 /* consecutive digits assumed */
7551 for (value = '0'; value <= '9'; value++)
7552 ANYOF_BITMAP_SET(ret, value);
7553 }
c49a72a9
NC
7554 yesno = '+';
7555 what = "Digit";
ffc61ed2
JH
7556 break;
7557 case ANYOF_NDIGIT:
7558 if (LOC)
7559 ANYOF_CLASS_SET(ret, ANYOF_NDIGIT);
7560 else {
7561 /* consecutive digits assumed */
7562 for (value = 0; value < '0'; value++)
7563 ANYOF_BITMAP_SET(ret, value);
7564 for (value = '9' + 1; value < 256; value++)
7565 ANYOF_BITMAP_SET(ret, value);
7566 }
c49a72a9
NC
7567 yesno = '!';
7568 what = "Digit";
89836f1f 7569 break;
f81125e2
JP
7570 case ANYOF_MAX:
7571 /* this is to handle \p and \P */
7572 break;
73b437c8 7573 default:
b45f050a 7574 vFAIL("Invalid [::] class");
73b437c8 7575 break;
b8c5462f 7576 }
c49a72a9
NC
7577 if (what) {
7578 /* Strings such as "+utf8::isWord\n" */
7579 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::Is%s\n", yesno, what);
7580 }
b8c5462f 7581 if (LOC)
936ed897 7582 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
73b437c8 7583 continue;
a0d0e21e 7584 }
ffc61ed2
JH
7585 } /* end of namedclass \blah */
7586
a0d0e21e 7587 if (range) {
eb160463 7588 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
7589 const int w = RExC_parse - rangebegin;
7590 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 7591 range = 0; /* not a valid range */
73b437c8 7592 }
a0d0e21e
LW
7593 }
7594 else {
3568d838 7595 prevvalue = value; /* save the beginning of the range */
830247a4
IZ
7596 if (*RExC_parse == '-' && RExC_parse+1 < RExC_end &&
7597 RExC_parse[1] != ']') {
7598 RExC_parse++;
ffc61ed2
JH
7599
7600 /* a bad range like \w-, [:word:]- ? */
7601 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 7602 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 7603 const int w =
afd78fd5
JH
7604 RExC_parse >= rangebegin ?
7605 RExC_parse - rangebegin : 0;
830247a4 7606 vWARN4(RExC_parse,
b45f050a 7607 "False [] range \"%*.*s\"",
097eb12c 7608 w, w, rangebegin);
afd78fd5 7609 }
73b437c8 7610 if (!SIZE_ONLY)
936ed897 7611 ANYOF_BITMAP_SET(ret, '-');
73b437c8 7612 } else
ffc61ed2
JH
7613 range = 1; /* yeah, it's a range! */
7614 continue; /* but do it the next time */
a0d0e21e 7615 }
a687059c 7616 }
ffc61ed2 7617
93733859 7618 /* now is the next time */
07be1b83 7619 /*stored += (value - prevvalue + 1);*/
ae5c130c 7620 if (!SIZE_ONLY) {
3568d838 7621 if (prevvalue < 256) {
1df70142 7622 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 7623 IV i;
3568d838 7624#ifdef EBCDIC
1b2d223b
JH
7625 /* In EBCDIC [\x89-\x91] should include
7626 * the \x8e but [i-j] should not. */
7627 if (literal_endpoint == 2 &&
7628 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
7629 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 7630 {
3568d838
JH
7631 if (isLOWER(prevvalue)) {
7632 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7633 if (isLOWER(i))
7634 ANYOF_BITMAP_SET(ret, i);
7635 } else {
3568d838 7636 for (i = prevvalue; i <= ceilvalue; i++)
ffc61ed2
JH
7637 if (isUPPER(i))
7638 ANYOF_BITMAP_SET(ret, i);
7639 }
8ada0baa 7640 }
ffc61ed2 7641 else
8ada0baa 7642#endif
07be1b83
YO
7643 for (i = prevvalue; i <= ceilvalue; i++) {
7644 if (!ANYOF_BITMAP_TEST(ret,i)) {
7645 stored++;
7646 ANYOF_BITMAP_SET(ret, i);
7647 }
7648 }
3568d838 7649 }
a5961de5 7650 if (value > 255 || UTF) {
1df70142
AL
7651 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
7652 const UV natvalue = NATIVE_TO_UNI(value);
07be1b83 7653 stored+=2; /* can't optimize this class */
ffc61ed2 7654 ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
b08decb7 7655 if (prevnatvalue < natvalue) { /* what about > ? */
ffc61ed2 7656 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
b08decb7
JH
7657 prevnatvalue, natvalue);
7658 }
7659 else if (prevnatvalue == natvalue) {
7660 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", natvalue);
09091399 7661 if (FOLD) {
89ebb4a3 7662 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
254ba52a 7663 STRLEN foldlen;
1df70142 7664 const UV f = to_uni_fold(natvalue, foldbuf, &foldlen);
254ba52a 7665
e294cc5d
JH
7666#ifdef EBCDIC /* RD t/uni/fold ff and 6b */
7667 if (RExC_precomp[0] == ':' &&
7668 RExC_precomp[1] == '[' &&
7669 (f == 0xDF || f == 0x92)) {
7670 f = NATIVE_TO_UNI(f);
7671 }
7672#endif
c840d2a2
JH
7673 /* If folding and foldable and a single
7674 * character, insert also the folded version
7675 * to the charclass. */
9e55ce06 7676 if (f != value) {
e294cc5d
JH
7677#ifdef EBCDIC /* RD tunifold ligatures s,t fb05, fb06 */
7678 if ((RExC_precomp[0] == ':' &&
7679 RExC_precomp[1] == '[' &&
7680 (f == 0xA2 &&
7681 (value == 0xFB05 || value == 0xFB06))) ?
7682 foldlen == ((STRLEN)UNISKIP(f) - 1) :
7683 foldlen == (STRLEN)UNISKIP(f) )
7684#else
eb160463 7685 if (foldlen == (STRLEN)UNISKIP(f))
e294cc5d 7686#endif
9e55ce06
JH
7687 Perl_sv_catpvf(aTHX_ listsv,
7688 "%04"UVxf"\n", f);
7689 else {
7690 /* Any multicharacter foldings
7691 * require the following transform:
7692 * [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst)
7693 * where E folds into "pq" and F folds
7694 * into "rst", all other characters
7695 * fold to single characters. We save
7696 * away these multicharacter foldings,
7697 * to be later saved as part of the
7698 * additional "s" data. */
7699 SV *sv;
7700
7701 if (!unicode_alternate)
7702 unicode_alternate = newAV();
7703 sv = newSVpvn((char*)foldbuf, foldlen);
7704 SvUTF8_on(sv);
7705 av_push(unicode_alternate, sv);
7706 }
7707 }
254ba52a 7708
60a8b682
JH
7709 /* If folding and the value is one of the Greek
7710 * sigmas insert a few more sigmas to make the
7711 * folding rules of the sigmas to work right.
7712 * Note that not all the possible combinations
7713 * are handled here: some of them are handled
9e55ce06
JH
7714 * by the standard folding rules, and some of
7715 * them (literal or EXACTF cases) are handled
7716 * during runtime in regexec.c:S_find_byclass(). */
09091399
JH
7717 if (value == UNICODE_GREEK_SMALL_LETTER_FINAL_SIGMA) {
7718 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7719 (UV)UNICODE_GREEK_CAPITAL_LETTER_SIGMA);
09091399 7720 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7721 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7722 }
7723 else if (value == UNICODE_GREEK_CAPITAL_LETTER_SIGMA)
7724 Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n",
eb050b28 7725 (UV)UNICODE_GREEK_SMALL_LETTER_SIGMA);
09091399
JH
7726 }
7727 }
ffc61ed2 7728 }
1b2d223b
JH
7729#ifdef EBCDIC
7730 literal_endpoint = 0;
7731#endif
8ada0baa 7732 }
ffc61ed2
JH
7733
7734 range = 0; /* this range (if it was one) is done now */
a0d0e21e 7735 }
ffc61ed2 7736
936ed897 7737 if (need_class) {
4f66b38d 7738 ANYOF_FLAGS(ret) |= ANYOF_LARGE;
936ed897 7739 if (SIZE_ONLY)
830247a4 7740 RExC_size += ANYOF_CLASS_ADD_SKIP;
936ed897 7741 else
830247a4 7742 RExC_emit += ANYOF_CLASS_ADD_SKIP;
936ed897 7743 }
ffc61ed2 7744
7f6f358c
YO
7745
7746 if (SIZE_ONLY)
7747 return ret;
7748 /****** !SIZE_ONLY AFTER HERE *********/
7749
7750 if( stored == 1 && value < 256
7751 && !( ANYOF_FLAGS(ret) & ( ANYOF_FLAGS_ALL ^ ANYOF_FOLD ) )
7752 ) {
7753 /* optimize single char class to an EXACT node
7754 but *only* when its not a UTF/high char */
07be1b83
YO
7755 const char * cur_parse= RExC_parse;
7756 RExC_emit = (regnode *)orig_emit;
7757 RExC_parse = (char *)orig_parse;
7f6f358c
YO
7758 ret = reg_node(pRExC_state,
7759 (U8)((ANYOF_FLAGS(ret) & ANYOF_FOLD) ? EXACTF : EXACT));
07be1b83 7760 RExC_parse = (char *)cur_parse;
7f6f358c
YO
7761 *STRING(ret)= (char)value;
7762 STR_LEN(ret)= 1;
7763 RExC_emit += STR_SZ(1);
7764 return ret;
7765 }
ae5c130c 7766 /* optimize case-insensitive simple patterns (e.g. /[a-z]/i) */
7f6f358c 7767 if ( /* If the only flag is folding (plus possibly inversion). */
516a5887
JH
7768 ((ANYOF_FLAGS(ret) & (ANYOF_FLAGS_ALL ^ ANYOF_INVERT)) == ANYOF_FOLD)
7769 ) {
a0ed51b3 7770 for (value = 0; value < 256; ++value) {
936ed897 7771 if (ANYOF_BITMAP_TEST(ret, value)) {
eb160463 7772 UV fold = PL_fold[value];
ffc61ed2
JH
7773
7774 if (fold != value)
7775 ANYOF_BITMAP_SET(ret, fold);
ae5c130c
GS
7776 }
7777 }
936ed897 7778 ANYOF_FLAGS(ret) &= ~ANYOF_FOLD;
ae5c130c 7779 }
ffc61ed2 7780
ae5c130c 7781 /* optimize inverted simple patterns (e.g. [^a-z]) */
7f6f358c 7782 if (optimize_invert &&
ffc61ed2
JH
7783 /* If the only flag is inversion. */
7784 (ANYOF_FLAGS(ret) & ANYOF_FLAGS_ALL) == ANYOF_INVERT) {
b8c5462f 7785 for (value = 0; value < ANYOF_BITMAP_SIZE; ++value)
936ed897 7786 ANYOF_BITMAP(ret)[value] ^= ANYOF_FLAGS_ALL;
1aa99e6b 7787 ANYOF_FLAGS(ret) = ANYOF_UNICODE_ALL;
ae5c130c 7788 }
7f6f358c 7789 {
097eb12c 7790 AV * const av = newAV();
ffc61ed2 7791 SV *rv;
9e55ce06 7792 /* The 0th element stores the character class description
6a0407ee 7793 * in its textual form: used later (regexec.c:Perl_regclass_swash())
9e55ce06
JH
7794 * to initialize the appropriate swash (which gets stored in
7795 * the 1st element), and also useful for dumping the regnode.
7796 * The 2nd element stores the multicharacter foldings,
6a0407ee 7797 * used later (regexec.c:S_reginclass()). */
ffc61ed2
JH
7798 av_store(av, 0, listsv);
7799 av_store(av, 1, NULL);
9e55ce06 7800 av_store(av, 2, (SV*)unicode_alternate);
ffc61ed2 7801 rv = newRV_noinc((SV*)av);
19860706 7802 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 7803 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 7804 ARG_SET(ret, n);
a0ed51b3 7805 }
a0ed51b3
LW
7806 return ret;
7807}
89836f1f
YO
7808#undef _C_C_T_
7809
a0ed51b3 7810
bcdf7404
YO
7811/* reg_skipcomment()
7812
7813 Absorbs an /x style # comments from the input stream.
7814 Returns true if there is more text remaining in the stream.
7815 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
7816 terminates the pattern without including a newline.
7817
7818 Note its the callers responsibility to ensure that we are
7819 actually in /x mode
7820
7821*/
7822
7823STATIC bool
7824S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
7825{
7826 bool ended = 0;
7827 while (RExC_parse < RExC_end)
7828 if (*RExC_parse++ == '\n') {
7829 ended = 1;
7830 break;
7831 }
7832 if (!ended) {
7833 /* we ran off the end of the pattern without ending
7834 the comment, so we have to add an \n when wrapping */
7835 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
7836 return 0;
7837 } else
7838 return 1;
7839}
7840
7841/* nextchar()
7842
7843 Advance that parse position, and optionall absorbs
7844 "whitespace" from the inputstream.
7845
7846 Without /x "whitespace" means (?#...) style comments only,
7847 with /x this means (?#...) and # comments and whitespace proper.
7848
7849 Returns the RExC_parse point from BEFORE the scan occurs.
7850
7851 This is the /x friendly way of saying RExC_parse++.
7852*/
7853
76e3520e 7854STATIC char*
830247a4 7855S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 7856{
097eb12c 7857 char* const retval = RExC_parse++;
a0d0e21e 7858
4633a7c4 7859 for (;;) {
830247a4
IZ
7860 if (*RExC_parse == '(' && RExC_parse[1] == '?' &&
7861 RExC_parse[2] == '#') {
e994fd66
AE
7862 while (*RExC_parse != ')') {
7863 if (RExC_parse == RExC_end)
7864 FAIL("Sequence (?#... not terminated");
830247a4 7865 RExC_parse++;
e994fd66 7866 }
830247a4 7867 RExC_parse++;
4633a7c4
LW
7868 continue;
7869 }
bbe252da 7870 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
7871 if (isSPACE(*RExC_parse)) {
7872 RExC_parse++;
748a9306
LW
7873 continue;
7874 }
830247a4 7875 else if (*RExC_parse == '#') {
bcdf7404
YO
7876 if ( reg_skipcomment( pRExC_state ) )
7877 continue;
748a9306 7878 }
748a9306 7879 }
4633a7c4 7880 return retval;
a0d0e21e 7881 }
a687059c
LW
7882}
7883
7884/*
c277df42 7885- reg_node - emit a node
a0d0e21e 7886*/
76e3520e 7887STATIC regnode * /* Location. */
830247a4 7888S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 7889{
97aff369 7890 dVAR;
c277df42 7891 register regnode *ptr;
504618e9 7892 regnode * const ret = RExC_emit;
07be1b83 7893 GET_RE_DEBUG_FLAGS_DECL;
a687059c 7894
c277df42 7895 if (SIZE_ONLY) {
830247a4
IZ
7896 SIZE_ALIGN(RExC_size);
7897 RExC_size += 1;
a0d0e21e
LW
7898 return(ret);
7899 }
e2e6a0f1
YO
7900#ifdef DEBUGGING
7901 if (OP(RExC_emit) == 255)
7902 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %s: %d ",
7903 reg_name[op], OP(RExC_emit));
7904#endif
c277df42 7905 NODE_ALIGN_FILL(ret);
a0d0e21e 7906 ptr = ret;
c277df42 7907 FILL_ADVANCE_NODE(ptr, op);
7122b237 7908#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 7909 if (RExC_offsets) { /* MJD */
07be1b83 7910 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740
MJD
7911 "reg_node", __LINE__,
7912 reg_name[op],
07be1b83
YO
7913 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
7914 ? "Overwriting end of array!\n" : "OK",
7915 (UV)(RExC_emit - RExC_emit_start),
7916 (UV)(RExC_parse - RExC_start),
7917 (UV)RExC_offsets[0]));
ccb2c380 7918 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 7919 }
7122b237 7920#endif
830247a4 7921 RExC_emit = ptr;
a0d0e21e 7922 return(ret);
a687059c
LW
7923}
7924
7925/*
a0d0e21e
LW
7926- reganode - emit a node with an argument
7927*/
76e3520e 7928STATIC regnode * /* Location. */
830247a4 7929S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 7930{
97aff369 7931 dVAR;
c277df42 7932 register regnode *ptr;
504618e9 7933 regnode * const ret = RExC_emit;
07be1b83 7934 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 7935
c277df42 7936 if (SIZE_ONLY) {
830247a4
IZ
7937 SIZE_ALIGN(RExC_size);
7938 RExC_size += 2;
6bda09f9
YO
7939 /*
7940 We can't do this:
7941
7942 assert(2==regarglen[op]+1);
7943
7944 Anything larger than this has to allocate the extra amount.
7945 If we changed this to be:
7946
7947 RExC_size += (1 + regarglen[op]);
7948
7949 then it wouldn't matter. Its not clear what side effect
7950 might come from that so its not done so far.
7951 -- dmq
7952 */
a0d0e21e
LW
7953 return(ret);
7954 }
e2e6a0f1
YO
7955#ifdef DEBUGGING
7956 if (OP(RExC_emit) == 255)
7957 Perl_croak(aTHX_ "panic: reganode overwriting end of allocated program space");
7958#endif
c277df42 7959 NODE_ALIGN_FILL(ret);
a0d0e21e 7960 ptr = ret;
c277df42 7961 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 7962#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 7963 if (RExC_offsets) { /* MJD */
07be1b83 7964 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 7965 "reganode",
ccb2c380
MP
7966 __LINE__,
7967 reg_name[op],
07be1b83 7968 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 7969 "Overwriting end of array!\n" : "OK",
07be1b83
YO
7970 (UV)(RExC_emit - RExC_emit_start),
7971 (UV)(RExC_parse - RExC_start),
7972 (UV)RExC_offsets[0]));
ccb2c380 7973 Set_Cur_Node_Offset;
fac92740 7974 }
7122b237 7975#endif
830247a4 7976 RExC_emit = ptr;
a0d0e21e 7977 return(ret);
fe14fcc3
LW
7978}
7979
7980/*
cd439c50 7981- reguni - emit (if appropriate) a Unicode character
a0ed51b3 7982*/
71207a34
AL
7983STATIC STRLEN
7984S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 7985{
97aff369 7986 dVAR;
71207a34 7987 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
7988}
7989
7990/*
a0d0e21e
LW
7991- reginsert - insert an operator in front of already-emitted operand
7992*
7993* Means relocating the operand.
7994*/
76e3520e 7995STATIC void
6bda09f9 7996S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 7997{
97aff369 7998 dVAR;
c277df42
IZ
7999 register regnode *src;
8000 register regnode *dst;
8001 register regnode *place;
504618e9 8002 const int offset = regarglen[(U8)op];
6bda09f9 8003 const int size = NODE_STEP_REGNODE + offset;
07be1b83 8004 GET_RE_DEBUG_FLAGS_DECL;
def51078 8005 PERL_UNUSED_ARG(depth);
22c35a8c 8006/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
6bda09f9 8007 DEBUG_PARSE_FMT("inst"," - %s",reg_name[op]);
c277df42 8008 if (SIZE_ONLY) {
6bda09f9 8009 RExC_size += size;
a0d0e21e
LW
8010 return;
8011 }
a687059c 8012
830247a4 8013 src = RExC_emit;
6bda09f9 8014 RExC_emit += size;
830247a4 8015 dst = RExC_emit;
40d049e4 8016 if (RExC_open_parens) {
6bda09f9 8017 int paren;
6d99fb9b 8018 DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);
6bda09f9 8019 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4
YO
8020 if ( RExC_open_parens[paren] >= opnd ) {
8021 DEBUG_PARSE_FMT("open"," - %d",size);
8022 RExC_open_parens[paren] += size;
8023 } else {
8024 DEBUG_PARSE_FMT("open"," - %s","ok");
8025 }
8026 if ( RExC_close_parens[paren] >= opnd ) {
8027 DEBUG_PARSE_FMT("close"," - %d",size);
8028 RExC_close_parens[paren] += size;
8029 } else {
8030 DEBUG_PARSE_FMT("close"," - %s","ok");
8031 }
8032 }
6bda09f9 8033 }
40d049e4 8034
fac92740 8035 while (src > opnd) {
c277df42 8036 StructCopy(--src, --dst, regnode);
7122b237 8037#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8038 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 8039 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 8040 "reg_insert",
ccb2c380
MP
8041 __LINE__,
8042 reg_name[op],
07be1b83
YO
8043 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
8044 ? "Overwriting end of array!\n" : "OK",
8045 (UV)(src - RExC_emit_start),
8046 (UV)(dst - RExC_emit_start),
8047 (UV)RExC_offsets[0]));
ccb2c380
MP
8048 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
8049 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 8050 }
7122b237 8051#endif
fac92740
MJD
8052 }
8053
a0d0e21e
LW
8054
8055 place = opnd; /* Op node, where operand used to be. */
7122b237 8056#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8057 if (RExC_offsets) { /* MJD */
07be1b83 8058 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 8059 "reginsert",
ccb2c380
MP
8060 __LINE__,
8061 reg_name[op],
07be1b83 8062 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 8063 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
8064 (UV)(place - RExC_emit_start),
8065 (UV)(RExC_parse - RExC_start),
786e8c11 8066 (UV)RExC_offsets[0]));
ccb2c380 8067 Set_Node_Offset(place, RExC_parse);
45948336 8068 Set_Node_Length(place, 1);
fac92740 8069 }
7122b237 8070#endif
c277df42
IZ
8071 src = NEXTOPER(place);
8072 FILL_ADVANCE_NODE(place, op);
8073 Zero(src, offset, regnode);
a687059c
LW
8074}
8075
8076/*
c277df42 8077- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 8078- SEE ALSO: regtail_study
a0d0e21e 8079*/
097eb12c 8080/* TODO: All three parms should be const */
76e3520e 8081STATIC void
3dab1dad 8082S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 8083{
97aff369 8084 dVAR;
c277df42 8085 register regnode *scan;
72f13be8 8086 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1
SP
8087#ifndef DEBUGGING
8088 PERL_UNUSED_ARG(depth);
8089#endif
a0d0e21e 8090
c277df42 8091 if (SIZE_ONLY)
a0d0e21e
LW
8092 return;
8093
8094 /* Find last node. */
8095 scan = p;
8096 for (;;) {
504618e9 8097 regnode * const temp = regnext(scan);
3dab1dad
YO
8098 DEBUG_PARSE_r({
8099 SV * const mysv=sv_newmortal();
8100 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
8101 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
8102 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
8103 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
8104 (temp == NULL ? "->" : ""),
8105 (temp == NULL ? reg_name[OP(val)] : "")
8106 );
3dab1dad
YO
8107 });
8108 if (temp == NULL)
8109 break;
8110 scan = temp;
8111 }
8112
8113 if (reg_off_by_arg[OP(scan)]) {
8114 ARG_SET(scan, val - scan);
8115 }
8116 else {
8117 NEXT_OFF(scan) = val - scan;
8118 }
8119}
8120
07be1b83 8121#ifdef DEBUGGING
3dab1dad
YO
8122/*
8123- regtail_study - set the next-pointer at the end of a node chain of p to val.
8124- Look for optimizable sequences at the same time.
8125- currently only looks for EXACT chains.
07be1b83
YO
8126
8127This is expermental code. The idea is to use this routine to perform
8128in place optimizations on branches and groups as they are constructed,
8129with the long term intention of removing optimization from study_chunk so
8130that it is purely analytical.
8131
8132Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
8133to control which is which.
8134
3dab1dad
YO
8135*/
8136/* TODO: All four parms should be const */
07be1b83 8137
3dab1dad
YO
8138STATIC U8
8139S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
8140{
8141 dVAR;
8142 register regnode *scan;
07be1b83
YO
8143 U8 exact = PSEUDO;
8144#ifdef EXPERIMENTAL_INPLACESCAN
8145 I32 min = 0;
8146#endif
8147
3dab1dad
YO
8148 GET_RE_DEBUG_FLAGS_DECL;
8149
07be1b83 8150
3dab1dad
YO
8151 if (SIZE_ONLY)
8152 return exact;
8153
8154 /* Find last node. */
8155
8156 scan = p;
8157 for (;;) {
8158 regnode * const temp = regnext(scan);
07be1b83
YO
8159#ifdef EXPERIMENTAL_INPLACESCAN
8160 if (PL_regkind[OP(scan)] == EXACT)
8161 if (join_exact(pRExC_state,scan,&min,1,val,depth+1))
8162 return EXACT;
8163#endif
3dab1dad
YO
8164 if ( exact ) {
8165 switch (OP(scan)) {
8166 case EXACT:
8167 case EXACTF:
8168 case EXACTFL:
8169 if( exact == PSEUDO )
8170 exact= OP(scan);
07be1b83
YO
8171 else if ( exact != OP(scan) )
8172 exact= 0;
3dab1dad
YO
8173 case NOTHING:
8174 break;
8175 default:
8176 exact= 0;
8177 }
8178 }
8179 DEBUG_PARSE_r({
8180 SV * const mysv=sv_newmortal();
8181 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
8182 regprop(RExC_rx, mysv, scan);
eaf3ca90 8183 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 8184 SvPV_nolen_const(mysv),
eaf3ca90
YO
8185 REG_NODE_NUM(scan),
8186 reg_name[exact]);
3dab1dad 8187 });
a0d0e21e
LW
8188 if (temp == NULL)
8189 break;
8190 scan = temp;
8191 }
07be1b83
YO
8192 DEBUG_PARSE_r({
8193 SV * const mysv_val=sv_newmortal();
8194 DEBUG_PARSE_MSG("");
8195 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
8196 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
8197 SvPV_nolen_const(mysv_val),
8198 (IV)REG_NODE_NUM(val),
8199 (IV)(val - scan)
07be1b83
YO
8200 );
8201 });
c277df42
IZ
8202 if (reg_off_by_arg[OP(scan)]) {
8203 ARG_SET(scan, val - scan);
a0ed51b3
LW
8204 }
8205 else {
c277df42
IZ
8206 NEXT_OFF(scan) = val - scan;
8207 }
3dab1dad
YO
8208
8209 return exact;
a687059c 8210}
07be1b83 8211#endif
a687059c
LW
8212
8213/*
a687059c
LW
8214 - regcurly - a little FSA that accepts {\d+,?\d*}
8215 */
79072805 8216STATIC I32
5f66b61c 8217S_regcurly(register const char *s)
a687059c
LW
8218{
8219 if (*s++ != '{')
8220 return FALSE;
f0fcb552 8221 if (!isDIGIT(*s))
a687059c 8222 return FALSE;
f0fcb552 8223 while (isDIGIT(*s))
a687059c
LW
8224 s++;
8225 if (*s == ',')
8226 s++;
f0fcb552 8227 while (isDIGIT(*s))
a687059c
LW
8228 s++;
8229 if (*s != '}')
8230 return FALSE;
8231 return TRUE;
8232}
8233
a687059c
LW
8234
8235/*
fd181c75 8236 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c
LW
8237 */
8238void
097eb12c 8239Perl_regdump(pTHX_ const regexp *r)
a687059c 8240{
35ff7856 8241#ifdef DEBUGGING
97aff369 8242 dVAR;
c445ea15 8243 SV * const sv = sv_newmortal();
ab3bbdeb 8244 SV *dsv= sv_newmortal();
f8fc2ecf 8245 RXi_GET_DECL(r,ri);
a687059c 8246
f8fc2ecf 8247 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
8248
8249 /* Header fields of interest. */
ab3bbdeb
YO
8250 if (r->anchored_substr) {
8251 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
8252 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 8253 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8254 "anchored %s%s at %"IVdf" ",
8255 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 8256 (IV)r->anchored_offset);
ab3bbdeb
YO
8257 } else if (r->anchored_utf8) {
8258 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
8259 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 8260 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8261 "anchored utf8 %s%s at %"IVdf" ",
8262 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 8263 (IV)r->anchored_offset);
ab3bbdeb
YO
8264 }
8265 if (r->float_substr) {
8266 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
8267 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 8268 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8269 "floating %s%s at %"IVdf"..%"UVuf" ",
8270 s, RE_SV_TAIL(r->float_substr),
7b0972df 8271 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
8272 } else if (r->float_utf8) {
8273 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
8274 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 8275 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
8276 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
8277 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 8278 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 8279 }
33b8afdf 8280 if (r->check_substr || r->check_utf8)
b81d288d 8281 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
8282 (const char *)
8283 (r->check_substr == r->float_substr
8284 && r->check_utf8 == r->float_utf8
8285 ? "(checking floating" : "(checking anchored"));
bbe252da 8286 if (r->extflags & RXf_NOSCAN)
c277df42 8287 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 8288 if (r->extflags & RXf_CHECK_ALL)
c277df42 8289 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 8290 if (r->check_substr || r->check_utf8)
c277df42
IZ
8291 PerlIO_printf(Perl_debug_log, ") ");
8292
f8fc2ecf
YO
8293 if (ri->regstclass) {
8294 regprop(r, sv, ri->regstclass);
1de06328 8295 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 8296 }
bbe252da 8297 if (r->extflags & RXf_ANCH) {
774d564b 8298 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 8299 if (r->extflags & RXf_ANCH_BOL)
774d564b 8300 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 8301 if (r->extflags & RXf_ANCH_MBOL)
c277df42 8302 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 8303 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 8304 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 8305 if (r->extflags & RXf_ANCH_GPOS)
774d564b 8306 PerlIO_printf(Perl_debug_log, "(GPOS)");
8307 PerlIO_putc(Perl_debug_log, ' ');
8308 }
bbe252da 8309 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 8310 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 8311 if (r->intflags & PREGf_SKIP)
760ac839 8312 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 8313 if (r->intflags & PREGf_IMPLICIT)
760ac839 8314 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 8315 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 8316 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 8317 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 8318 PerlIO_printf(Perl_debug_log, "\n");
65e66c80 8319#else
96a5add6 8320 PERL_UNUSED_CONTEXT;
65e66c80 8321 PERL_UNUSED_ARG(r);
17c3b450 8322#endif /* DEBUGGING */
a687059c
LW
8323}
8324
8325/*
a0d0e21e
LW
8326- regprop - printable representation of opcode
8327*/
46fc3d4c 8328void
32fc9b6a 8329Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 8330{
35ff7856 8331#ifdef DEBUGGING
97aff369 8332 dVAR;
9b155405 8333 register int k;
f8fc2ecf 8334 RXi_GET_DECL(prog,progi);
1de06328 8335 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 8336
a0d0e21e 8337
54dc92de 8338 sv_setpvn(sv, "", 0);
8aa23a47 8339
03363afd 8340 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
8341 /* It would be nice to FAIL() here, but this may be called from
8342 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 8343 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
bfed75c6 8344 sv_catpv(sv, reg_name[OP(o)]); /* Take off const! */
9b155405 8345
3dab1dad 8346 k = PL_regkind[OP(o)];
9b155405 8347
2a782b5b 8348 if (k == EXACT) {
396482e1 8349 SV * const dsv = sv_2mortal(newSVpvs(""));
ab3bbdeb
YO
8350 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
8351 * is a crude hack but it may be the best for now since
8352 * we have no flag "this EXACTish node was UTF-8"
8353 * --jhi */
8354 const char * const s =
ddc5bc0f 8355 pv_pretty(dsv, STRING(o), STR_LEN(o), 60,
ab3bbdeb
YO
8356 PL_colors[0], PL_colors[1],
8357 PERL_PV_ESCAPE_UNI_DETECT |
8358 PERL_PV_PRETTY_ELIPSES |
8359 PERL_PV_PRETTY_LTGT
8360 );
8361 Perl_sv_catpvf(aTHX_ sv, " %s", s );
bb263b4e 8362 } else if (k == TRIE) {
3dab1dad 8363 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 8364 * progi->data isn't available here */
1de06328 8365 const char op = OP(o);
647f639f 8366 const U32 n = ARG(o);
1de06328 8367 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 8368 (reg_ac_data *)progi->data->data[n] :
1de06328 8369 NULL;
3251b653
NC
8370 const reg_trie_data * const trie
8371 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328
YO
8372
8373 Perl_sv_catpvf(aTHX_ sv, "-%s",reg_name[o->flags]);
8374 DEBUG_TRIE_COMPILE_r(
8375 Perl_sv_catpvf(aTHX_ sv,
8376 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
8377 (UV)trie->startstate,
1e2e3d02 8378 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
8379 (UV)trie->wordcount,
8380 (UV)trie->minlen,
8381 (UV)trie->maxlen,
8382 (UV)TRIE_CHARCOUNT(trie),
8383 (UV)trie->uniquecharcount
8384 )
8385 );
8386 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
8387 int i;
8388 int rangestart = -1;
f46cb337 8389 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
1de06328
YO
8390 Perl_sv_catpvf(aTHX_ sv, "[");
8391 for (i = 0; i <= 256; i++) {
8392 if (i < 256 && BITMAP_TEST(bitmap,i)) {
8393 if (rangestart == -1)
8394 rangestart = i;
8395 } else if (rangestart != -1) {
8396 if (i <= rangestart + 3)
8397 for (; rangestart < i; rangestart++)
8398 put_byte(sv, rangestart);
8399 else {
8400 put_byte(sv, rangestart);
8401 sv_catpvs(sv, "-");
8402 put_byte(sv, i - 1);
8403 }
8404 rangestart = -1;
8405 }
8406 }
8407 Perl_sv_catpvf(aTHX_ sv, "]");
8408 }
8409
a3621e74 8410 } else if (k == CURLY) {
cb434fcc 8411 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
8412 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
8413 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 8414 }
2c2d71f5
JH
8415 else if (k == WHILEM && o->flags) /* Ordinal/of */
8416 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 8417 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 8418 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
1f1031fe 8419 if ( prog->paren_names ) {
ee9b8eae
YO
8420 if ( k != REF || OP(o) < NREF) {
8421 AV *list= (AV *)progi->data->data[progi->name_list_idx];
8422 SV **name= av_fetch(list, ARG(o), 0 );
8423 if (name)
8424 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
8425 }
8426 else {
8427 AV *list= (AV *)progi->data->data[ progi->name_list_idx ];
8428 SV *sv_dat=(SV*)progi->data->data[ ARG( o ) ];
8429 I32 *nums=(I32*)SvPVX(sv_dat);
8430 SV **name= av_fetch(list, nums[0], 0 );
8431 I32 n;
8432 if (name) {
8433 for ( n=0; n<SvIVX(sv_dat); n++ ) {
8434 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
8435 (n ? "," : ""), (IV)nums[n]);
8436 }
8437 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 8438 }
1f1031fe 8439 }
ee9b8eae 8440 }
1f1031fe 8441 } else if (k == GOSUB)
6bda09f9 8442 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
8443 else if (k == VERB) {
8444 if (!o->flags)
8445 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
be2597df 8446 SVfARG((SV*)progi->data->data[ ARG( o ) ]));
e2e6a0f1 8447 } else if (k == LOGICAL)
04ebc1ab 8448 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
8449 else if (k == ANYOF) {
8450 int i, rangestart = -1;
2d03de9c 8451 const U8 flags = ANYOF_FLAGS(o);
0bd48802
AL
8452
8453 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
8454 static const char * const anyofs[] = {
653099ff
GS
8455 "\\w",
8456 "\\W",
8457 "\\s",
8458 "\\S",
8459 "\\d",
8460 "\\D",
8461 "[:alnum:]",
8462 "[:^alnum:]",
8463 "[:alpha:]",
8464 "[:^alpha:]",
8465 "[:ascii:]",
8466 "[:^ascii:]",
8467 "[:ctrl:]",
8468 "[:^ctrl:]",
8469 "[:graph:]",
8470 "[:^graph:]",
8471 "[:lower:]",
8472 "[:^lower:]",
8473 "[:print:]",
8474 "[:^print:]",
8475 "[:punct:]",
8476 "[:^punct:]",
8477 "[:upper:]",
aaa51d5e 8478 "[:^upper:]",
653099ff 8479 "[:xdigit:]",
aaa51d5e
JF
8480 "[:^xdigit:]",
8481 "[:space:]",
8482 "[:^space:]",
8483 "[:blank:]",
8484 "[:^blank:]"
653099ff
GS
8485 };
8486
19860706 8487 if (flags & ANYOF_LOCALE)
396482e1 8488 sv_catpvs(sv, "{loc}");
19860706 8489 if (flags & ANYOF_FOLD)
396482e1 8490 sv_catpvs(sv, "{i}");
653099ff 8491 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 8492 if (flags & ANYOF_INVERT)
396482e1 8493 sv_catpvs(sv, "^");
ffc61ed2
JH
8494 for (i = 0; i <= 256; i++) {
8495 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
8496 if (rangestart == -1)
8497 rangestart = i;
8498 } else if (rangestart != -1) {
8499 if (i <= rangestart + 3)
8500 for (; rangestart < i; rangestart++)
653099ff 8501 put_byte(sv, rangestart);
ffc61ed2
JH
8502 else {
8503 put_byte(sv, rangestart);
396482e1 8504 sv_catpvs(sv, "-");
ffc61ed2 8505 put_byte(sv, i - 1);
653099ff 8506 }
ffc61ed2 8507 rangestart = -1;
653099ff 8508 }
847a199f 8509 }
ffc61ed2
JH
8510
8511 if (o->flags & ANYOF_CLASS)
bb7a0f54 8512 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
ffc61ed2
JH
8513 if (ANYOF_CLASS_TEST(o,i))
8514 sv_catpv(sv, anyofs[i]);
8515
8516 if (flags & ANYOF_UNICODE)
396482e1 8517 sv_catpvs(sv, "{unicode}");
1aa99e6b 8518 else if (flags & ANYOF_UNICODE_ALL)
396482e1 8519 sv_catpvs(sv, "{unicode_all}");
ffc61ed2
JH
8520
8521 {
8522 SV *lv;
32fc9b6a 8523 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
b81d288d 8524
ffc61ed2
JH
8525 if (lv) {
8526 if (sw) {
89ebb4a3 8527 U8 s[UTF8_MAXBYTES_CASE+1];
b81d288d 8528
ffc61ed2 8529 for (i = 0; i <= 256; i++) { /* just the first 256 */
1df70142 8530 uvchr_to_utf8(s, i);
ffc61ed2 8531
3568d838 8532 if (i < 256 && swash_fetch(sw, s, TRUE)) {
ffc61ed2
JH
8533 if (rangestart == -1)
8534 rangestart = i;
8535 } else if (rangestart != -1) {
ffc61ed2
JH
8536 if (i <= rangestart + 3)
8537 for (; rangestart < i; rangestart++) {
2d03de9c
AL
8538 const U8 * const e = uvchr_to_utf8(s,rangestart);
8539 U8 *p;
8540 for(p = s; p < e; p++)
ffc61ed2
JH
8541 put_byte(sv, *p);
8542 }
8543 else {
2d03de9c
AL
8544 const U8 *e = uvchr_to_utf8(s,rangestart);
8545 U8 *p;
8546 for (p = s; p < e; p++)
ffc61ed2 8547 put_byte(sv, *p);
396482e1 8548 sv_catpvs(sv, "-");
2d03de9c
AL
8549 e = uvchr_to_utf8(s, i-1);
8550 for (p = s; p < e; p++)
1df70142 8551 put_byte(sv, *p);
ffc61ed2
JH
8552 }
8553 rangestart = -1;
8554 }
19860706 8555 }
ffc61ed2 8556
396482e1 8557 sv_catpvs(sv, "..."); /* et cetera */
19860706 8558 }
fde631ed 8559
ffc61ed2 8560 {
2e0de35c 8561 char *s = savesvpv(lv);
c445ea15 8562 char * const origs = s;
b81d288d 8563
3dab1dad
YO
8564 while (*s && *s != '\n')
8565 s++;
b81d288d 8566
ffc61ed2 8567 if (*s == '\n') {
2d03de9c 8568 const char * const t = ++s;
ffc61ed2
JH
8569
8570 while (*s) {
8571 if (*s == '\n')
8572 *s = ' ';
8573 s++;
8574 }
8575 if (s[-1] == ' ')
8576 s[-1] = 0;
8577
8578 sv_catpv(sv, t);
fde631ed 8579 }
b81d288d 8580
ffc61ed2 8581 Safefree(origs);
fde631ed
JH
8582 }
8583 }
653099ff 8584 }
ffc61ed2 8585
653099ff
GS
8586 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
8587 }
9b155405 8588 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 8589 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 8590#else
96a5add6 8591 PERL_UNUSED_CONTEXT;
65e66c80
SP
8592 PERL_UNUSED_ARG(sv);
8593 PERL_UNUSED_ARG(o);
f9049ba1 8594 PERL_UNUSED_ARG(prog);
17c3b450 8595#endif /* DEBUGGING */
35ff7856 8596}
a687059c 8597
cad2e5aa
JH
8598SV *
8599Perl_re_intuit_string(pTHX_ regexp *prog)
8600{ /* Assume that RE_INTUIT is set */
97aff369 8601 dVAR;
a3621e74 8602 GET_RE_DEBUG_FLAGS_DECL;
96a5add6
AL
8603 PERL_UNUSED_CONTEXT;
8604
a3621e74 8605 DEBUG_COMPILE_r(
cfd0369c 8606 {
2d03de9c 8607 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 8608 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
8609
8610 if (!PL_colorset) reginitcolors();
8611 PerlIO_printf(Perl_debug_log,
a0288114 8612 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
8613 PL_colors[4],
8614 prog->check_substr ? "" : "utf8 ",
8615 PL_colors[5],PL_colors[0],
cad2e5aa
JH
8616 s,
8617 PL_colors[1],
8618 (strlen(s) > 60 ? "..." : ""));
8619 } );
8620
33b8afdf 8621 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
8622}
8623
84da74a7 8624/*
f8149455 8625 pregfree()
84da74a7 8626
f8149455
YO
8627 handles refcounting and freeing the perl core regexp structure. When
8628 it is necessary to actually free the structure the first thing it
8629 does is call the 'free' method of the regexp_engine associated to to
8630 the regexp, allowing the handling of the void *pprivate; member
8631 first. (This routine is not overridable by extensions, which is why
8632 the extensions free is called first.)
8633
8634 See regdupe and regdupe_internal if you change anything here.
84da74a7 8635*/
f8149455 8636#ifndef PERL_IN_XSUB_RE
2b69d0c2 8637void
864dbfa3 8638Perl_pregfree(pTHX_ struct regexp *r)
a687059c 8639{
27da23d5 8640 dVAR;
fc32ee4a 8641 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 8642
7821416a
IZ
8643 if (!r || (--r->refcnt > 0))
8644 return;
f8149455
YO
8645
8646 CALLREGFREE_PVT(r); /* free the private data */
ed252734 8647 RX_MATCH_COPY_FREE(r);
f8c7b90f 8648#ifdef PERL_OLD_COPY_ON_WRITE
ed252734
NC
8649 if (r->saved_copy)
8650 SvREFCNT_dec(r->saved_copy);
8651#endif
a193d654
GS
8652 if (r->substrs) {
8653 if (r->anchored_substr)
8654 SvREFCNT_dec(r->anchored_substr);
33b8afdf
JH
8655 if (r->anchored_utf8)
8656 SvREFCNT_dec(r->anchored_utf8);
a193d654
GS
8657 if (r->float_substr)
8658 SvREFCNT_dec(r->float_substr);
33b8afdf
JH
8659 if (r->float_utf8)
8660 SvREFCNT_dec(r->float_utf8);
2779dcf1 8661 Safefree(r->substrs);
a193d654 8662 }
81714fb9 8663 if (r->paren_names)
bcdf7404
YO
8664 SvREFCNT_dec(r->paren_names);
8665 Safefree(r->wrapped);
f8149455
YO
8666 Safefree(r->startp);
8667 Safefree(r->endp);
8668 Safefree(r);
8669}
8670#endif
8671
8672/* regfree_internal()
8673
8674 Free the private data in a regexp. This is overloadable by
8675 extensions. Perl takes care of the regexp structure in pregfree(),
8676 this covers the *pprivate pointer which technically perldoesnt
8677 know about, however of course we have to handle the
8678 regexp_internal structure when no extension is in use.
8679
8680 Note this is called before freeing anything in the regexp
8681 structure.
8682 */
8683
8684void
8685Perl_regfree_internal(pTHX_ struct regexp *r)
8686{
8687 dVAR;
8688 RXi_GET_DECL(r,ri);
8689 GET_RE_DEBUG_FLAGS_DECL;
8690
8691 DEBUG_COMPILE_r({
8692 if (!PL_colorset)
8693 reginitcolors();
8694 {
8695 SV *dsv= sv_newmortal();
8696 RE_PV_QUOTED_DECL(s, (r->extflags & RXf_UTF8),
8697 dsv, r->precomp, r->prelen, 60);
8698 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
8699 PL_colors[4],PL_colors[5],s);
8700 }
8701 });
7122b237
YO
8702#ifdef RE_TRACK_PATTERN_OFFSETS
8703 if (ri->u.offsets)
8704 Safefree(ri->u.offsets); /* 20010421 MJD */
8705#endif
f8fc2ecf
YO
8706 if (ri->data) {
8707 int n = ri->data->count;
f3548bdc
DM
8708 PAD* new_comppad = NULL;
8709 PAD* old_comppad;
4026c95a 8710 PADOFFSET refcnt;
dfad63ad 8711
c277df42 8712 while (--n >= 0) {
261faec3 8713 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 8714 switch (ri->data->what[n]) {
c277df42 8715 case 's':
81714fb9 8716 case 'S':
55eed653 8717 case 'u':
f8fc2ecf 8718 SvREFCNT_dec((SV*)ri->data->data[n]);
c277df42 8719 break;
653099ff 8720 case 'f':
f8fc2ecf 8721 Safefree(ri->data->data[n]);
653099ff 8722 break;
dfad63ad 8723 case 'p':
f8fc2ecf 8724 new_comppad = (AV*)ri->data->data[n];
dfad63ad 8725 break;
c277df42 8726 case 'o':
dfad63ad 8727 if (new_comppad == NULL)
cea2e8a9 8728 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
8729 PAD_SAVE_LOCAL(old_comppad,
8730 /* Watch out for global destruction's random ordering. */
c445ea15 8731 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 8732 );
b34c0dd4 8733 OP_REFCNT_LOCK;
f8fc2ecf 8734 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
8735 OP_REFCNT_UNLOCK;
8736 if (!refcnt)
f8fc2ecf 8737 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 8738
f3548bdc 8739 PAD_RESTORE_LOCAL(old_comppad);
dfad63ad
HS
8740 SvREFCNT_dec((SV*)new_comppad);
8741 new_comppad = NULL;
c277df42
IZ
8742 break;
8743 case 'n':
9e55ce06 8744 break;
07be1b83 8745 case 'T':
be8e71aa
YO
8746 { /* Aho Corasick add-on structure for a trie node.
8747 Used in stclass optimization only */
07be1b83 8748 U32 refcount;
f8fc2ecf 8749 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
8750 OP_REFCNT_LOCK;
8751 refcount = --aho->refcount;
8752 OP_REFCNT_UNLOCK;
8753 if ( !refcount ) {
446bd890
NC
8754 PerlMemShared_free(aho->states);
8755 PerlMemShared_free(aho->fail);
446bd890
NC
8756 /* do this last!!!! */
8757 PerlMemShared_free(ri->data->data[n]);
8758 PerlMemShared_free(ri->regstclass);
07be1b83
YO
8759 }
8760 }
8761 break;
a3621e74 8762 case 't':
07be1b83 8763 {
be8e71aa 8764 /* trie structure. */
07be1b83 8765 U32 refcount;
f8fc2ecf 8766 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
8767 OP_REFCNT_LOCK;
8768 refcount = --trie->refcount;
8769 OP_REFCNT_UNLOCK;
8770 if ( !refcount ) {
446bd890 8771 PerlMemShared_free(trie->charmap);
446bd890
NC
8772 PerlMemShared_free(trie->states);
8773 PerlMemShared_free(trie->trans);
07be1b83 8774 if (trie->bitmap)
446bd890 8775 PerlMemShared_free(trie->bitmap);
07be1b83 8776 if (trie->wordlen)
446bd890 8777 PerlMemShared_free(trie->wordlen);
786e8c11 8778 if (trie->jump)
446bd890 8779 PerlMemShared_free(trie->jump);
786e8c11 8780 if (trie->nextword)
446bd890 8781 PerlMemShared_free(trie->nextword);
446bd890
NC
8782 /* do this last!!!! */
8783 PerlMemShared_free(ri->data->data[n]);
a3621e74 8784 }
07be1b83
YO
8785 }
8786 break;
c277df42 8787 default:
f8fc2ecf 8788 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
8789 }
8790 }
f8fc2ecf
YO
8791 Safefree(ri->data->what);
8792 Safefree(ri->data);
a0d0e21e 8793 }
f8fc2ecf
YO
8794 if (ri->swap) {
8795 Safefree(ri->swap->startp);
8796 Safefree(ri->swap->endp);
8797 Safefree(ri->swap);
c74340f9 8798 }
f8fc2ecf 8799 Safefree(ri);
a687059c 8800}
c277df42 8801
84da74a7
YO
8802#define sv_dup_inc(s,t) SvREFCNT_inc(sv_dup(s,t))
8803#define av_dup_inc(s,t) (AV*)SvREFCNT_inc(sv_dup((SV*)s,t))
81714fb9 8804#define hv_dup_inc(s,t) (HV*)SvREFCNT_inc(sv_dup((SV*)s,t))
84da74a7
YO
8805#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
8806
8807/*
8808 regdupe - duplicate a regexp.
8809
8810 This routine is called by sv.c's re_dup and is expected to clone a
8811 given regexp structure. It is a no-op when not under USE_ITHREADS.
8812 (Originally this *was* re_dup() for change history see sv.c)
8813
f8149455
YO
8814 After all of the core data stored in struct regexp is duplicated
8815 the regexp_engine.dupe method is used to copy any private data
8816 stored in the *pprivate pointer. This allows extensions to handle
8817 any duplication it needs to do.
8818
8819 See pregfree() and regfree_internal() if you change anything here.
84da74a7 8820*/
a3c0e9ca 8821#if defined(USE_ITHREADS)
f8149455 8822#ifndef PERL_IN_XSUB_RE
84da74a7 8823regexp *
f8149455 8824Perl_re_dup(pTHX_ const regexp *r, CLONE_PARAMS *param)
84da74a7 8825{
84da74a7 8826 dVAR;
f8fc2ecf 8827 regexp *ret;
f8149455 8828 int i, npar;
84da74a7 8829 struct reg_substr_datum *s;
644c02aa 8830
84da74a7
YO
8831 if (!r)
8832 return (REGEXP *)NULL;
8833
8834 if ((ret = (REGEXP *)ptr_table_fetch(PL_ptr_table, r)))
8835 return ret;
8836
f8149455 8837
84da74a7 8838 npar = r->nparens+1;
f8fc2ecf 8839 Newxz(ret, 1, regexp);
84da74a7
YO
8840 Newx(ret->startp, npar, I32);
8841 Copy(r->startp, ret->startp, npar, I32);
8842 Newx(ret->endp, npar, I32);
f8149455 8843 Copy(r->endp, ret->endp, npar, I32);
84da74a7 8844
c945c181 8845 if (r->substrs) {
785a26d5
YO
8846 Newx(ret->substrs, 1, struct reg_substr_data);
8847 for (s = ret->substrs->data, i = 0; i < 3; i++, s++) {
8848 s->min_offset = r->substrs->data[i].min_offset;
8849 s->max_offset = r->substrs->data[i].max_offset;
8850 s->end_shift = r->substrs->data[i].end_shift;
8851 s->substr = sv_dup_inc(r->substrs->data[i].substr, param);
8852 s->utf8_substr = sv_dup_inc(r->substrs->data[i].utf8_substr, param);
8853 }
8854 } else
8855 ret->substrs = NULL;
f8149455 8856
bcdf7404
YO
8857 ret->wrapped = SAVEPVN(r->wrapped, r->wraplen);
8858 ret->precomp = ret->wrapped + (r->precomp - r->wrapped);
8859 ret->prelen = r->prelen;
8860 ret->wraplen = r->wraplen;
8861
f8149455
YO
8862 ret->refcnt = r->refcnt;
8863 ret->minlen = r->minlen;
8864 ret->minlenret = r->minlenret;
f8149455
YO
8865 ret->nparens = r->nparens;
8866 ret->lastparen = r->lastparen;
8867 ret->lastcloseparen = r->lastcloseparen;
8868 ret->intflags = r->intflags;
8869 ret->extflags = r->extflags;
8870
8871 ret->sublen = r->sublen;
8872
8873 ret->engine = r->engine;
8874
8875 ret->paren_names = hv_dup_inc(r->paren_names, param);
8876
8877 if (RX_MATCH_COPIED(ret))
8878 ret->subbeg = SAVEPVN(r->subbeg, r->sublen);
8879 else
8880 ret->subbeg = NULL;
8881#ifdef PERL_OLD_COPY_ON_WRITE
8882 ret->saved_copy = NULL;
8883#endif
8884
8885 ret->pprivate = r->pprivate;
785a26d5
YO
8886 if (ret->pprivate)
8887 RXi_SET(ret,CALLREGDUPE_PVT(ret,param));
f8149455
YO
8888
8889 ptr_table_store(PL_ptr_table, r, ret);
8890 return ret;
8891}
8892#endif /* PERL_IN_XSUB_RE */
8893
8894/*
8895 regdupe_internal()
8896
8897 This is the internal complement to regdupe() which is used to copy
8898 the structure pointed to by the *pprivate pointer in the regexp.
8899 This is the core version of the extension overridable cloning hook.
8900 The regexp structure being duplicated will be copied by perl prior
8901 to this and will be provided as the regexp *r argument, however
8902 with the /old/ structures pprivate pointer value. Thus this routine
8903 may override any copying normally done by perl.
8904
8905 It returns a pointer to the new regexp_internal structure.
8906*/
8907
8908void *
8909Perl_regdupe_internal(pTHX_ const regexp *r, CLONE_PARAMS *param)
8910{
8911 dVAR;
8912 regexp_internal *reti;
8913 int len, npar;
8914 RXi_GET_DECL(r,ri);
8915
8916 npar = r->nparens+1;
7122b237 8917 len = ProgLen(ri);
f8149455
YO
8918
8919 Newxc(reti, sizeof(regexp_internal) + (len+1)*sizeof(regnode), char, regexp_internal);
8920 Copy(ri->program, reti->program, len+1, regnode);
8921
8922 if(ri->swap) {
8923 Newx(reti->swap, 1, regexp_paren_ofs);
8924 /* no need to copy these */
8925 Newx(reti->swap->startp, npar, I32);
8926 Newx(reti->swap->endp, npar, I32);
8927 } else {
8928 reti->swap = NULL;
8929 }
8930
f8fc2ecf 8931 reti->regstclass = NULL;
bcdf7404 8932
f8fc2ecf 8933 if (ri->data) {
84da74a7 8934 struct reg_data *d;
f8fc2ecf 8935 const int count = ri->data->count;
84da74a7
YO
8936 int i;
8937
8938 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
8939 char, struct reg_data);
8940 Newx(d->what, count, U8);
8941
8942 d->count = count;
8943 for (i = 0; i < count; i++) {
f8fc2ecf 8944 d->what[i] = ri->data->what[i];
84da74a7 8945 switch (d->what[i]) {
55eed653 8946 /* legal options are one of: sSfpontTu
84da74a7
YO
8947 see also regcomp.h and pregfree() */
8948 case 's':
81714fb9 8949 case 'S':
0536c0a7 8950 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 8951 case 'u': /* actually an HV, but the dup function is identical. */
f8fc2ecf 8952 d->data[i] = sv_dup_inc((SV *)ri->data->data[i], param);
84da74a7 8953 break;
84da74a7
YO
8954 case 'f':
8955 /* This is cheating. */
8956 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 8957 StructCopy(ri->data->data[i], d->data[i],
84da74a7 8958 struct regnode_charclass_class);
f8fc2ecf 8959 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
8960 break;
8961 case 'o':
bbe252da
YO
8962 /* Compiled op trees are readonly and in shared memory,
8963 and can thus be shared without duplication. */
84da74a7 8964 OP_REFCNT_LOCK;
f8fc2ecf 8965 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
8966 OP_REFCNT_UNLOCK;
8967 break;
23eab42c
NC
8968 case 'T':
8969 /* Trie stclasses are readonly and can thus be shared
8970 * without duplication. We free the stclass in pregfree
8971 * when the corresponding reg_ac_data struct is freed.
8972 */
8973 reti->regstclass= ri->regstclass;
8974 /* Fall through */
84da74a7 8975 case 't':
84da74a7 8976 OP_REFCNT_LOCK;
0536c0a7 8977 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 8978 OP_REFCNT_UNLOCK;
0536c0a7
NC
8979 /* Fall through */
8980 case 'n':
8981 d->data[i] = ri->data->data[i];
84da74a7 8982 break;
84da74a7 8983 default:
f8fc2ecf 8984 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
8985 }
8986 }
8987
f8fc2ecf 8988 reti->data = d;
84da74a7
YO
8989 }
8990 else
f8fc2ecf 8991 reti->data = NULL;
84da74a7 8992
cde0cee5
YO
8993 reti->name_list_idx = ri->name_list_idx;
8994
7122b237
YO
8995#ifdef RE_TRACK_PATTERN_OFFSETS
8996 if (ri->u.offsets) {
8997 Newx(reti->u.offsets, 2*len+1, U32);
8998 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
8999 }
9000#else
9001 SetProgLen(reti,len);
9002#endif
9003
f8149455 9004 return (void*)reti;
84da74a7 9005}
f8149455
YO
9006
9007#endif /* USE_ITHREADS */
84da74a7 9008
de8c5301
YO
9009/*
9010 reg_stringify()
9011
9012 converts a regexp embedded in a MAGIC struct to its stringified form,
9013 caching the converted form in the struct and returns the cached
9014 string.
9015
9016 If lp is nonnull then it is used to return the length of the
9017 resulting string
9018
9019 If flags is nonnull and the returned string contains UTF8 then
f8149455 9020 (*flags & 1) will be true.
de8c5301
YO
9021
9022 If haseval is nonnull then it is used to return whether the pattern
9023 contains evals.
9024
9025 Normally called via macro:
9026
f8149455 9027 CALLREG_STRINGIFY(mg,&len,&utf8);
de8c5301
YO
9028
9029 And internally with
9030
f8149455 9031 CALLREG_AS_STR(mg,&lp,&flags,&haseval)
de8c5301
YO
9032
9033 See sv_2pv_flags() in sv.c for an example of internal usage.
9034
9035 */
f8149455 9036#ifndef PERL_IN_XSUB_RE
bcdf7404 9037
de8c5301
YO
9038char *
9039Perl_reg_stringify(pTHX_ MAGIC *mg, STRLEN *lp, U32 *flags, I32 *haseval ) {
9040 dVAR;
9041 const regexp * const re = (regexp *)mg->mg_obj;
de8c5301 9042 if (haseval)
f8149455 9043 *haseval = re->seen_evals;
de8c5301 9044 if (flags)
bbe252da 9045 *flags = ((re->extflags & RXf_UTF8) ? 1 : 0);
de8c5301 9046 if (lp)
bcdf7404
YO
9047 *lp = re->wraplen;
9048 return re->wrapped;
de8c5301
YO
9049}
9050
c277df42
IZ
9051/*
9052 - regnext - dig the "next" pointer out of a node
c277df42
IZ
9053 */
9054regnode *
864dbfa3 9055Perl_regnext(pTHX_ register regnode *p)
c277df42 9056{
97aff369 9057 dVAR;
c277df42
IZ
9058 register I32 offset;
9059
f8fc2ecf 9060 if (!p)
c277df42
IZ
9061 return(NULL);
9062
9063 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
9064 if (offset == 0)
9065 return(NULL);
9066
c277df42 9067 return(p+offset);
c277df42 9068}
76234dfb 9069#endif
c277df42 9070
01f988be 9071STATIC void
cea2e8a9 9072S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
9073{
9074 va_list args;
9075 STRLEN l1 = strlen(pat1);
9076 STRLEN l2 = strlen(pat2);
9077 char buf[512];
06bf62c7 9078 SV *msv;
73d840c0 9079 const char *message;
c277df42
IZ
9080
9081 if (l1 > 510)
9082 l1 = 510;
9083 if (l1 + l2 > 510)
9084 l2 = 510 - l1;
9085 Copy(pat1, buf, l1 , char);
9086 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
9087 buf[l1 + l2] = '\n';
9088 buf[l1 + l2 + 1] = '\0';
8736538c
AS
9089#ifdef I_STDARG
9090 /* ANSI variant takes additional second argument */
c277df42 9091 va_start(args, pat2);
8736538c
AS
9092#else
9093 va_start(args);
9094#endif
5a844595 9095 msv = vmess(buf, &args);
c277df42 9096 va_end(args);
cfd0369c 9097 message = SvPV_const(msv,l1);
c277df42
IZ
9098 if (l1 > 512)
9099 l1 = 512;
9100 Copy(message, buf, l1 , char);
197cf9b9 9101 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 9102 Perl_croak(aTHX_ "%s", buf);
c277df42 9103}
a0ed51b3
LW
9104
9105/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
9106
76234dfb 9107#ifndef PERL_IN_XSUB_RE
a0ed51b3 9108void
864dbfa3 9109Perl_save_re_context(pTHX)
b81d288d 9110{
97aff369 9111 dVAR;
1ade1aa1
NC
9112
9113 struct re_save_state *state;
9114
9115 SAVEVPTR(PL_curcop);
9116 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
9117
9118 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
9119 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
9120 SSPUSHINT(SAVEt_RE_STATE);
9121
46ab3289 9122 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 9123
a0ed51b3 9124 PL_reg_start_tmp = 0;
a0ed51b3 9125 PL_reg_start_tmpl = 0;
c445ea15 9126 PL_reg_oldsaved = NULL;
a5db57d6 9127 PL_reg_oldsavedlen = 0;
a5db57d6 9128 PL_reg_maxiter = 0;
a5db57d6 9129 PL_reg_leftiter = 0;
c445ea15 9130 PL_reg_poscache = NULL;
a5db57d6 9131 PL_reg_poscache_size = 0;
1ade1aa1
NC
9132#ifdef PERL_OLD_COPY_ON_WRITE
9133 PL_nrs = NULL;
9134#endif
ada6e8a9 9135
c445ea15
AL
9136 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
9137 if (PL_curpm) {
9138 const REGEXP * const rx = PM_GETRE(PL_curpm);
9139 if (rx) {
1df70142 9140 U32 i;
ada6e8a9 9141 for (i = 1; i <= rx->nparens; i++) {
1df70142 9142 char digits[TYPE_CHARS(long)];
d9fad198 9143 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
9144 GV *const *const gvp
9145 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
9146
b37c2d43
AL
9147 if (gvp) {
9148 GV * const gv = *gvp;
9149 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
9150 save_scalar(gv);
49f27e4b 9151 }
ada6e8a9
AMS
9152 }
9153 }
9154 }
a0ed51b3 9155}
76234dfb 9156#endif
51371543 9157
51371543 9158static void
acfe0abc 9159clear_re(pTHX_ void *r)
51371543 9160{
97aff369 9161 dVAR;
51371543
GS
9162 ReREFCNT_dec((regexp *)r);
9163}
ffbc6a93 9164
a28509cc
AL
9165#ifdef DEBUGGING
9166
9167STATIC void
9168S_put_byte(pTHX_ SV *sv, int c)
9169{
9170 if (isCNTRL(c) || c == 255 || !isPRINT(c))
9171 Perl_sv_catpvf(aTHX_ sv, "\\%o", c);
9172 else if (c == '-' || c == ']' || c == '\\' || c == '^')
9173 Perl_sv_catpvf(aTHX_ sv, "\\%c", c);
9174 else
9175 Perl_sv_catpvf(aTHX_ sv, "%c", c);
9176}
9177
786e8c11 9178
3dab1dad
YO
9179#define CLEAR_OPTSTART \
9180 if (optstart) STMT_START { \
70685ca0 9181 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
9182 optstart=NULL; \
9183 } STMT_END
9184
786e8c11 9185#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 9186
b5a2f8d8
NC
9187STATIC const regnode *
9188S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
9189 const regnode *last, const regnode *plast,
9190 SV* sv, I32 indent, U32 depth)
a28509cc 9191{
97aff369 9192 dVAR;
786e8c11 9193 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 9194 register const regnode *next;
3dab1dad 9195 const regnode *optstart= NULL;
1f1031fe 9196
f8fc2ecf 9197 RXi_GET_DECL(r,ri);
3dab1dad 9198 GET_RE_DEBUG_FLAGS_DECL;
1f1031fe 9199
786e8c11
YO
9200#ifdef DEBUG_DUMPUNTIL
9201 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
9202 last ? last-start : 0,plast ? plast-start : 0);
9203#endif
9204
9205 if (plast && plast < last)
9206 last= plast;
9207
9208 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 9209 /* While that wasn't END last time... */
a28509cc
AL
9210 NODE_ALIGN(node);
9211 op = OP(node);
de734bd5 9212 if (op == CLOSE || op == WHILEM)
786e8c11 9213 indent--;
b5a2f8d8 9214 next = regnext((regnode *)node);
1f1031fe 9215
a28509cc 9216 /* Where, what. */
8e11feef 9217 if (OP(node) == OPTIMIZED) {
e68ec53f 9218 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 9219 optstart = node;
3dab1dad 9220 else
8e11feef 9221 goto after_print;
3dab1dad
YO
9222 } else
9223 CLEAR_OPTSTART;
1f1031fe 9224
32fc9b6a 9225 regprop(r, sv, node);
a28509cc 9226 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 9227 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
9228
9229 if (OP(node) != OPTIMIZED) {
9230 if (next == NULL) /* Next ptr. */
9231 PerlIO_printf(Perl_debug_log, " (0)");
9232 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
9233 PerlIO_printf(Perl_debug_log, " (FAIL)");
9234 else
9235 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
9236 (void)PerlIO_putc(Perl_debug_log, '\n');
9237 }
9238
a28509cc
AL
9239 after_print:
9240 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
9241 assert(next);
9242 {
9243 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
9244 ? regnext((regnode *)next)
9245 : next);
be8e71aa
YO
9246 if (last && nnode > last)
9247 nnode = last;
786e8c11 9248 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 9249 }
a28509cc
AL
9250 }
9251 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 9252 assert(next);
786e8c11 9253 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
9254 }
9255 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 9256 const regnode *this_trie = node;
1de06328 9257 const char op = OP(node);
647f639f 9258 const U32 n = ARG(node);
1de06328 9259 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 9260 (reg_ac_data *)ri->data->data[n] :
1de06328 9261 NULL;
3251b653
NC
9262 const reg_trie_data * const trie =
9263 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781
NC
9264#ifdef DEBUGGING
9265 AV *const trie_words = (AV *) ri->data->data[n + TRIE_WORDS_OFFSET];
9266#endif
786e8c11 9267 const regnode *nextbranch= NULL;
a28509cc 9268 I32 word_idx;
1de06328 9269 sv_setpvn(sv, "", 0);
786e8c11 9270 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 9271 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
786e8c11
YO
9272
9273 PerlIO_printf(Perl_debug_log, "%*s%s ",
9274 (int)(2*(indent+3)), "",
9275 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
9276 PL_colors[0], PL_colors[1],
9277 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
9278 PERL_PV_PRETTY_ELIPSES |
7f69552c 9279 PERL_PV_PRETTY_LTGT
786e8c11
YO
9280 )
9281 : "???"
9282 );
9283 if (trie->jump) {
40d049e4 9284 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
9285 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
9286 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
9287 if (dist) {
9288 if (!nextbranch)
24b23f37 9289 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
9290 DUMPUNTIL(this_trie + dist, nextbranch);
9291 }
786e8c11
YO
9292 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
9293 nextbranch= regnext((regnode *)nextbranch);
9294 } else {
9295 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 9296 }
786e8c11
YO
9297 }
9298 if (last && next > last)
9299 node= last;
9300 else
9301 node= next;
a28509cc 9302 }
786e8c11
YO
9303 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
9304 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
9305 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
9306 }
9307 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 9308 assert(next);
786e8c11 9309 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
9310 }
9311 else if ( op == PLUS || op == STAR) {
786e8c11 9312 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc
AL
9313 }
9314 else if (op == ANYOF) {
9315 /* arglen 1 + class block */
9316 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_LARGE)
9317 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
9318 node = NEXTOPER(node);
9319 }
9320 else if (PL_regkind[(U8)op] == EXACT) {
9321 /* Literal string, where present. */
9322 node += NODE_SZ_STR(node) - 1;
9323 node = NEXTOPER(node);
9324 }
9325 else {
9326 node = NEXTOPER(node);
9327 node += regarglen[(U8)op];
9328 }
9329 if (op == CURLYX || op == OPEN)
786e8c11 9330 indent++;
a28509cc 9331 }
3dab1dad 9332 CLEAR_OPTSTART;
786e8c11 9333#ifdef DEBUG_DUMPUNTIL
70685ca0 9334 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 9335#endif
1de06328 9336 return node;
a28509cc
AL
9337}
9338
9339#endif /* DEBUGGING */
9340
241d1a3b
NC
9341/*
9342 * Local variables:
9343 * c-indentation-style: bsd
9344 * c-basic-offset: 4
9345 * indent-tabs-mode: t
9346 * End:
9347 *
37442d52
RGS
9348 * ex: set ts=8 sts=4 sw=4 noet:
9349 */