This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Skip stat.t readability test on VMS.
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
84#else
85# include "regcomp.h"
86#endif
a687059c 87
04e98a4d 88#include "dquote_static.c"
a3e1f3a6 89#include "charclass_invlists.h"
04e98a4d 90
d4cce5f1 91#ifdef op
11343788 92#undef op
d4cce5f1 93#endif /* op */
11343788 94
fe14fcc3 95#ifdef MSDOS
7e4e8c89 96# if defined(BUGGY_MSC6)
fe14fcc3 97 /* MSC 6.00A breaks on op/regexp.t test 85 unless we turn this off */
7e4e8c89 98# pragma optimize("a",off)
fe14fcc3 99 /* But MSC 6.00A is happy with 'w', for aliases only across function calls*/
7e4e8c89
NC
100# pragma optimize("w",on )
101# endif /* BUGGY_MSC6 */
fe14fcc3
LW
102#endif /* MSDOS */
103
a687059c
LW
104#ifndef STATIC
105#define STATIC static
106#endif
107
830247a4 108typedef struct RExC_state_t {
e2509266 109 U32 flags; /* are we folding, multilining? */
830247a4 110 char *precomp; /* uncompiled string. */
288b8c02 111 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
112 regexp *rx; /* perl core regexp structure */
113 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 114 char *start; /* Start of input for compile */
830247a4
IZ
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
117 I32 whilem_seen; /* number of WHILEM in this expr */
fac92740 118 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 119 regnode *emit_bound; /* First regnode outside of the allocated space */
ffc61ed2 120 regnode *emit; /* Code-emit pointer; &regdummy = don't = compiling */
830247a4
IZ
121 I32 naughty; /* How bad is this pattern? */
122 I32 sawback; /* Did we see \1, ...? */
123 U32 seen;
124 I32 size; /* Code size. */
c74340f9
YO
125 I32 npar; /* Capture buffer count, (OPEN). */
126 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 127 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
128 I32 extralen;
129 I32 seen_zerolen;
130 I32 seen_evals;
40d049e4
YO
131 regnode **open_parens; /* pointers to open parens */
132 regnode **close_parens; /* pointers to close parens */
133 regnode *opend; /* END node in program */
02daf0ab
YO
134 I32 utf8; /* whether the pattern is utf8 or not */
135 I32 orig_utf8; /* whether the pattern was originally in utf8 */
136 /* XXX use this for future optimisation of case
137 * where pattern must be upgraded to utf8. */
e40e74fe
KW
138 I32 uni_semantics; /* If a d charset modifier should use unicode
139 rules, even if the pattern is not in
140 utf8 */
81714fb9 141 HV *paren_names; /* Paren names */
1f1031fe 142
40d049e4
YO
143 regnode **recurse; /* Recurse regops */
144 I32 recurse_count; /* Number of recurse regops */
b57e4118 145 I32 in_lookbehind;
4624b182 146 I32 contains_locale;
bb3f3ed2 147 I32 override_recoding;
830247a4
IZ
148#if ADD_TO_REGEXEC
149 char *starttry; /* -Dr: where regtry was called. */
150#define RExC_starttry (pRExC_state->starttry)
151#endif
3dab1dad 152#ifdef DEBUGGING
be8e71aa 153 const char *lastparse;
3dab1dad 154 I32 lastnum;
1f1031fe 155 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
156#define RExC_lastparse (pRExC_state->lastparse)
157#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 158#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 159#endif
830247a4
IZ
160} RExC_state_t;
161
e2509266 162#define RExC_flags (pRExC_state->flags)
830247a4 163#define RExC_precomp (pRExC_state->precomp)
288b8c02 164#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 165#define RExC_rx (pRExC_state->rx)
f8fc2ecf 166#define RExC_rxi (pRExC_state->rxi)
fac92740 167#define RExC_start (pRExC_state->start)
830247a4
IZ
168#define RExC_end (pRExC_state->end)
169#define RExC_parse (pRExC_state->parse)
170#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
171#ifdef RE_TRACK_PATTERN_OFFSETS
172#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
173#endif
830247a4 174#define RExC_emit (pRExC_state->emit)
fac92740 175#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 176#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
177#define RExC_naughty (pRExC_state->naughty)
178#define RExC_sawback (pRExC_state->sawback)
179#define RExC_seen (pRExC_state->seen)
180#define RExC_size (pRExC_state->size)
181#define RExC_npar (pRExC_state->npar)
e2e6a0f1 182#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
183#define RExC_extralen (pRExC_state->extralen)
184#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
185#define RExC_seen_evals (pRExC_state->seen_evals)
1aa99e6b 186#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 187#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 188#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
189#define RExC_open_parens (pRExC_state->open_parens)
190#define RExC_close_parens (pRExC_state->close_parens)
191#define RExC_opend (pRExC_state->opend)
81714fb9 192#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
193#define RExC_recurse (pRExC_state->recurse)
194#define RExC_recurse_count (pRExC_state->recurse_count)
b57e4118 195#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 196#define RExC_contains_locale (pRExC_state->contains_locale)
bb3f3ed2 197#define RExC_override_recoding (pRExC_state->override_recoding)
830247a4 198
cde0cee5 199
a687059c
LW
200#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
201#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
202 ((*s) == '{' && regcurly(s)))
a687059c 203
35c8bce7
LW
204#ifdef SPSTART
205#undef SPSTART /* dratted cpp namespace... */
206#endif
a687059c
LW
207/*
208 * Flags to be passed up and down.
209 */
a687059c 210#define WORST 0 /* Worst case. */
a3b492c3 211#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee
KW
212
213/* Simple enough to be STAR/PLUS operand, in an EXACT node must be a single
d7b56a3c 214 * character, and if utf8, must be invariant. Note that this is not the same thing as REGNODE_SIMPLE */
fda99bee 215#define SIMPLE 0x02
a3b492c3
YO
216#define SPSTART 0x04 /* Starts with * or +. */
217#define TRYAGAIN 0x08 /* Weeded out a declaration. */
218#define POSTPONED 0x10 /* (?1),(?&name), (??{...}) or similar */
a687059c 219
3dab1dad
YO
220#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
221
07be1b83
YO
222/* whether trie related optimizations are enabled */
223#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
224#define TRIE_STUDY_OPT
786e8c11 225#define FULL_TRIE_STUDY
07be1b83
YO
226#define TRIE_STCLASS
227#endif
1de06328
YO
228
229
40d049e4
YO
230
231#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
232#define PBITVAL(paren) (1 << ((paren) & 7))
233#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
234#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
235#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
236
bbd61b5f
KW
237/* If not already in utf8, do a longjmp back to the beginning */
238#define UTF8_LONGJMP 42 /* Choose a value not likely to ever conflict */
239#define REQUIRE_UTF8 STMT_START { \
240 if (! UTF) JMPENV_JUMP(UTF8_LONGJMP); \
241 } STMT_END
40d049e4 242
1de06328
YO
243/* About scan_data_t.
244
245 During optimisation we recurse through the regexp program performing
246 various inplace (keyhole style) optimisations. In addition study_chunk
247 and scan_commit populate this data structure with information about
248 what strings MUST appear in the pattern. We look for the longest
3b753521 249 string that must appear at a fixed location, and we look for the
1de06328
YO
250 longest string that may appear at a floating location. So for instance
251 in the pattern:
252
253 /FOO[xX]A.*B[xX]BAR/
254
255 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
256 strings (because they follow a .* construct). study_chunk will identify
257 both FOO and BAR as being the longest fixed and floating strings respectively.
258
259 The strings can be composites, for instance
260
261 /(f)(o)(o)/
262
263 will result in a composite fixed substring 'foo'.
264
265 For each string some basic information is maintained:
266
267 - offset or min_offset
268 This is the position the string must appear at, or not before.
269 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
270 characters must match before the string we are searching for.
271 Likewise when combined with minlenp and the length of the string it
1de06328
YO
272 tells us how many characters must appear after the string we have
273 found.
274
275 - max_offset
276 Only used for floating strings. This is the rightmost point that
3b753521 277 the string can appear at. If set to I32 max it indicates that the
1de06328
YO
278 string can occur infinitely far to the right.
279
280 - minlenp
281 A pointer to the minimum length of the pattern that the string
282 was found inside. This is important as in the case of positive
283 lookahead or positive lookbehind we can have multiple patterns
284 involved. Consider
285
286 /(?=FOO).*F/
287
288 The minimum length of the pattern overall is 3, the minimum length
289 of the lookahead part is 3, but the minimum length of the part that
290 will actually match is 1. So 'FOO's minimum length is 3, but the
291 minimum length for the F is 1. This is important as the minimum length
292 is used to determine offsets in front of and behind the string being
293 looked for. Since strings can be composites this is the length of the
486ec47a 294 pattern at the time it was committed with a scan_commit. Note that
1de06328
YO
295 the length is calculated by study_chunk, so that the minimum lengths
296 are not known until the full pattern has been compiled, thus the
297 pointer to the value.
298
299 - lookbehind
300
301 In the case of lookbehind the string being searched for can be
302 offset past the start point of the final matching string.
303 If this value was just blithely removed from the min_offset it would
304 invalidate some of the calculations for how many chars must match
305 before or after (as they are derived from min_offset and minlen and
306 the length of the string being searched for).
307 When the final pattern is compiled and the data is moved from the
308 scan_data_t structure into the regexp structure the information
309 about lookbehind is factored in, with the information that would
310 have been lost precalculated in the end_shift field for the
311 associated string.
312
313 The fields pos_min and pos_delta are used to store the minimum offset
314 and the delta to the maximum offset at the current point in the pattern.
315
316*/
2c2d71f5
JH
317
318typedef struct scan_data_t {
1de06328
YO
319 /*I32 len_min; unused */
320 /*I32 len_delta; unused */
2c2d71f5
JH
321 I32 pos_min;
322 I32 pos_delta;
323 SV *last_found;
1de06328 324 I32 last_end; /* min value, <0 unless valid. */
2c2d71f5
JH
325 I32 last_start_min;
326 I32 last_start_max;
1de06328
YO
327 SV **longest; /* Either &l_fixed, or &l_float. */
328 SV *longest_fixed; /* longest fixed string found in pattern */
329 I32 offset_fixed; /* offset where it starts */
486ec47a 330 I32 *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
331 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
332 SV *longest_float; /* longest floating string found in pattern */
333 I32 offset_float_min; /* earliest point in string it can appear */
334 I32 offset_float_max; /* latest point in string it can appear */
486ec47a 335 I32 *minlen_float; /* pointer to the minlen relevant to the string */
1de06328 336 I32 lookbehind_float; /* is the position of the string modified by LB */
2c2d71f5
JH
337 I32 flags;
338 I32 whilem_c;
cb434fcc 339 I32 *last_closep;
653099ff 340 struct regnode_charclass_class *start_class;
2c2d71f5
JH
341} scan_data_t;
342
a687059c 343/*
e50aee73 344 * Forward declarations for pregcomp()'s friends.
a687059c 345 */
a0d0e21e 346
27da23d5 347static const scan_data_t zero_scan_data =
1de06328 348 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
349
350#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
351#define SF_BEFORE_SEOL 0x0001
352#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
353#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
354#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
355
09b7f37c
CB
356#ifdef NO_UNARY_PLUS
357# define SF_FIX_SHIFT_EOL (0+2)
358# define SF_FL_SHIFT_EOL (0+4)
359#else
360# define SF_FIX_SHIFT_EOL (+2)
361# define SF_FL_SHIFT_EOL (+4)
362#endif
c277df42
IZ
363
364#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
365#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
366
367#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
368#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
369#define SF_IS_INF 0x0040
370#define SF_HAS_PAR 0x0080
371#define SF_IN_PAR 0x0100
372#define SF_HAS_EVAL 0x0200
373#define SCF_DO_SUBSTR 0x0400
653099ff
GS
374#define SCF_DO_STCLASS_AND 0x0800
375#define SCF_DO_STCLASS_OR 0x1000
376#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 377#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 378
786e8c11 379#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 380#define SCF_SEEN_ACCEPT 0x8000
07be1b83 381
43fead97 382#define UTF cBOOL(RExC_utf8)
a62b1201
KW
383#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
384#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
cfaf538b
KW
385#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
386#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
387#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
2f7f8cb1
KW
388#define MORE_ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
389#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
a62b1201 390
43fead97 391#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 392
ffc61ed2 393#define OOB_UNICODE 12345678
93733859 394#define OOB_NAMEDCLASS -1
b8c5462f 395
a0ed51b3
LW
396#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
397#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
398
8615cb43 399
b45f050a
JF
400/* length of regex to show in messages that don't mark a position within */
401#define RegexLengthToShowInErrorMessages 127
402
403/*
404 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
405 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
406 * op/pragma/warn/regcomp.
407 */
7253e4e3
RK
408#define MARKER1 "<-- HERE" /* marker as it appears in the description */
409#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 410
7253e4e3 411#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%.*s" MARKER2 "%s/"
b45f050a
JF
412
413/*
414 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
415 * arg. Show regex, up to a maximum length. If it's too long, chop and add
416 * "...".
417 */
58e23c8d 418#define _FAIL(code) STMT_START { \
bfed75c6 419 const char *ellipses = ""; \
ccb2c380
MP
420 IV len = RExC_end - RExC_precomp; \
421 \
422 if (!SIZE_ONLY) \
288b8c02 423 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
424 if (len > RegexLengthToShowInErrorMessages) { \
425 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
426 len = RegexLengthToShowInErrorMessages - 10; \
427 ellipses = "..."; \
428 } \
58e23c8d 429 code; \
ccb2c380 430} STMT_END
8615cb43 431
58e23c8d
YO
432#define FAIL(msg) _FAIL( \
433 Perl_croak(aTHX_ "%s in regex m/%.*s%s/", \
434 msg, (int)len, RExC_precomp, ellipses))
435
436#define FAIL2(msg,arg) _FAIL( \
437 Perl_croak(aTHX_ msg " in regex m/%.*s%s/", \
438 arg, (int)len, RExC_precomp, ellipses))
439
b45f050a 440/*
b45f050a
JF
441 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
442 */
ccb2c380 443#define Simple_vFAIL(m) STMT_START { \
a28509cc 444 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
445 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
446 m, (int)offset, RExC_precomp, RExC_precomp + offset); \
447} STMT_END
b45f050a
JF
448
449/*
450 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
451 */
ccb2c380
MP
452#define vFAIL(m) STMT_START { \
453 if (!SIZE_ONLY) \
288b8c02 454 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
455 Simple_vFAIL(m); \
456} STMT_END
b45f050a
JF
457
458/*
459 * Like Simple_vFAIL(), but accepts two arguments.
460 */
ccb2c380 461#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 462 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
463 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, \
464 (int)offset, RExC_precomp, RExC_precomp + offset); \
465} STMT_END
b45f050a
JF
466
467/*
468 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
469 */
ccb2c380
MP
470#define vFAIL2(m,a1) STMT_START { \
471 if (!SIZE_ONLY) \
288b8c02 472 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
473 Simple_vFAIL2(m, a1); \
474} STMT_END
b45f050a
JF
475
476
477/*
478 * Like Simple_vFAIL(), but accepts three arguments.
479 */
ccb2c380 480#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 481 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
482 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, \
483 (int)offset, RExC_precomp, RExC_precomp + offset); \
484} STMT_END
b45f050a
JF
485
486/*
487 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
488 */
ccb2c380
MP
489#define vFAIL3(m,a1,a2) STMT_START { \
490 if (!SIZE_ONLY) \
288b8c02 491 SAVEDESTRUCTOR_X(clear_re,(void*)RExC_rx_sv); \
ccb2c380
MP
492 Simple_vFAIL3(m, a1, a2); \
493} STMT_END
b45f050a
JF
494
495/*
496 * Like Simple_vFAIL(), but accepts four arguments.
497 */
ccb2c380 498#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 499 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380
MP
500 S_re_croak2(aTHX_ m, REPORT_LOCATION, a1, a2, a3, \
501 (int)offset, RExC_precomp, RExC_precomp + offset); \
502} STMT_END
b45f050a 503
668c081a 504#define ckWARNreg(loc,m) STMT_START { \
a28509cc 505 const IV offset = loc - RExC_precomp; \
f10f4c18
NC
506 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
507 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
508} STMT_END
509
668c081a 510#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 511 const IV offset = loc - RExC_precomp; \
d1d15184 512 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18
NC
513 m REPORT_LOCATION, \
514 (int)offset, RExC_precomp, RExC_precomp + offset); \
ccb2c380
MP
515} STMT_END
516
2335b3d3
KW
517#define ckWARN2regdep(loc,m, a1) STMT_START { \
518 const IV offset = loc - RExC_precomp; \
519 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
520 m REPORT_LOCATION, \
521 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
522} STMT_END
523
668c081a 524#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 525 const IV offset = loc - RExC_precomp; \
668c081a 526 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
ccb2c380
MP
527 a1, (int)offset, RExC_precomp, RExC_precomp + offset); \
528} STMT_END
529
530#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 531 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
532 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
533 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
534} STMT_END
535
668c081a
NC
536#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
537 const IV offset = loc - RExC_precomp; \
538 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
539 a1, a2, (int)offset, RExC_precomp, RExC_precomp + offset); \
540} STMT_END
541
ccb2c380 542#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 543 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
544 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
545 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
546} STMT_END
547
668c081a
NC
548#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
549 const IV offset = loc - RExC_precomp; \
550 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
551 a1, a2, a3, (int)offset, RExC_precomp, RExC_precomp + offset); \
552} STMT_END
553
ccb2c380 554#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 555 const IV offset = loc - RExC_precomp; \
ccb2c380
MP
556 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
557 a1, a2, a3, a4, (int)offset, RExC_precomp, RExC_precomp + offset); \
558} STMT_END
9d1d55b5 559
8615cb43 560
cd439c50 561/* Allow for side effects in s */
ccb2c380
MP
562#define REGC(c,s) STMT_START { \
563 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
564} STMT_END
cd439c50 565
fac92740
MJD
566/* Macros for recording node offsets. 20001227 mjd@plover.com
567 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
568 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
569 * Element 0 holds the number n.
07be1b83 570 * Position is 1 indexed.
fac92740 571 */
7122b237
YO
572#ifndef RE_TRACK_PATTERN_OFFSETS
573#define Set_Node_Offset_To_R(node,byte)
574#define Set_Node_Offset(node,byte)
575#define Set_Cur_Node_Offset
576#define Set_Node_Length_To_R(node,len)
577#define Set_Node_Length(node,len)
578#define Set_Node_Cur_Length(node)
579#define Node_Offset(n)
580#define Node_Length(n)
581#define Set_Node_Offset_Length(node,offset,len)
582#define ProgLen(ri) ri->u.proglen
583#define SetProgLen(ri,x) ri->u.proglen = x
584#else
585#define ProgLen(ri) ri->u.offsets[0]
586#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
587#define Set_Node_Offset_To_R(node,byte) STMT_START { \
588 if (! SIZE_ONLY) { \
589 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 590 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 591 if((node) < 0) { \
551405c4 592 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
593 } else { \
594 RExC_offsets[2*(node)-1] = (byte); \
595 } \
596 } \
597} STMT_END
598
599#define Set_Node_Offset(node,byte) \
600 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
601#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
602
603#define Set_Node_Length_To_R(node,len) STMT_START { \
604 if (! SIZE_ONLY) { \
605 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 606 __LINE__, (int)(node), (int)(len))); \
ccb2c380 607 if((node) < 0) { \
551405c4 608 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
609 } else { \
610 RExC_offsets[2*(node)] = (len); \
611 } \
612 } \
613} STMT_END
614
615#define Set_Node_Length(node,len) \
616 Set_Node_Length_To_R((node)-RExC_emit_start, len)
617#define Set_Cur_Node_Length(len) Set_Node_Length(RExC_emit, len)
618#define Set_Node_Cur_Length(node) \
619 Set_Node_Length(node, RExC_parse - parse_start)
fac92740
MJD
620
621/* Get offsets and lengths */
622#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
623#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
624
07be1b83
YO
625#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
626 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
627 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
628} STMT_END
7122b237 629#endif
07be1b83
YO
630
631#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
632#define EXPERIMENTAL_INPLACESCAN
f427392e 633#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 634
304ee84b
YO
635#define DEBUG_STUDYDATA(str,data,depth) \
636DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 637 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
638 "%*s" str "Pos:%"IVdf"/%"IVdf \
639 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
640 (int)(depth)*2, "", \
641 (IV)((data)->pos_min), \
642 (IV)((data)->pos_delta), \
304ee84b 643 (UV)((data)->flags), \
1de06328 644 (IV)((data)->whilem_c), \
304ee84b
YO
645 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
646 is_inf ? "INF " : "" \
1de06328
YO
647 ); \
648 if ((data)->last_found) \
649 PerlIO_printf(Perl_debug_log, \
650 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
651 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
652 SvPVX_const((data)->last_found), \
653 (IV)((data)->last_end), \
654 (IV)((data)->last_start_min), \
655 (IV)((data)->last_start_max), \
656 ((data)->longest && \
657 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
658 SvPVX_const((data)->longest_fixed), \
659 (IV)((data)->offset_fixed), \
660 ((data)->longest && \
661 (data)->longest==&((data)->longest_float)) ? "*" : "", \
662 SvPVX_const((data)->longest_float), \
663 (IV)((data)->offset_float_min), \
664 (IV)((data)->offset_float_max) \
665 ); \
666 PerlIO_printf(Perl_debug_log,"\n"); \
667});
668
acfe0abc 669static void clear_re(pTHX_ void *r);
4327152a 670
653099ff 671/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 672 Update the longest found anchored substring and the longest found
653099ff
GS
673 floating substrings if needed. */
674
4327152a 675STATIC void
304ee84b 676S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data, I32 *minlenp, int is_inf)
c277df42 677{
e1ec3a88
AL
678 const STRLEN l = CHR_SVLEN(data->last_found);
679 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 680 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 681
7918f24d
NC
682 PERL_ARGS_ASSERT_SCAN_COMMIT;
683
c277df42 684 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 685 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
686 if (*data->longest == data->longest_fixed) {
687 data->offset_fixed = l ? data->last_start_min : data->pos_min;
688 if (data->flags & SF_BEFORE_EOL)
b81d288d 689 data->flags
c277df42
IZ
690 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
691 else
692 data->flags &= ~SF_FIX_BEFORE_EOL;
686b73d4 693 data->minlen_fixed=minlenp;
1de06328 694 data->lookbehind_fixed=0;
a0ed51b3 695 }
304ee84b 696 else { /* *data->longest == data->longest_float */
c277df42 697 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
698 data->offset_float_max = (l
699 ? data->last_start_max
c277df42 700 : data->pos_min + data->pos_delta);
304ee84b 701 if (is_inf || (U32)data->offset_float_max > (U32)I32_MAX)
9051bda5 702 data->offset_float_max = I32_MAX;
c277df42 703 if (data->flags & SF_BEFORE_EOL)
b81d288d 704 data->flags
c277df42
IZ
705 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
706 else
707 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
708 data->minlen_float=minlenp;
709 data->lookbehind_float=0;
c277df42
IZ
710 }
711 }
712 SvCUR_set(data->last_found, 0);
0eda9292 713 {
a28509cc 714 SV * const sv = data->last_found;
097eb12c
AL
715 if (SvUTF8(sv) && SvMAGICAL(sv)) {
716 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
717 if (mg)
718 mg->mg_len = 0;
719 }
0eda9292 720 }
c277df42
IZ
721 data->last_end = -1;
722 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 723 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
724}
725
653099ff
GS
726/* Can match anything (initialization) */
727STATIC void
3fffb88a 728S_cl_anything(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 729{
7918f24d
NC
730 PERL_ARGS_ASSERT_CL_ANYTHING;
731
f8bef550 732 ANYOF_BITMAP_SETALL(cl);
dd58aee1 733 cl->flags = ANYOF_CLASS|ANYOF_EOS|ANYOF_UNICODE_ALL
3ad98780 734 |ANYOF_LOC_NONBITMAP_FOLD|ANYOF_NON_UTF8_LATIN1_ALL;
3fffb88a
KW
735
736 /* If any portion of the regex is to operate under locale rules,
737 * initialization includes it. The reason this isn't done for all regexes
738 * is that the optimizer was written under the assumption that locale was
739 * all-or-nothing. Given the complexity and lack of documentation in the
740 * optimizer, and that there are inadequate test cases for locale, so many
741 * parts of it may not work properly, it is safest to avoid locale unless
742 * necessary. */
743 if (RExC_contains_locale) {
9d7a1e63 744 ANYOF_CLASS_SETALL(cl); /* /l uses class */
3fffb88a
KW
745 cl->flags |= ANYOF_LOCALE;
746 }
9d7a1e63
KW
747 else {
748 ANYOF_CLASS_ZERO(cl); /* Only /l uses class now */
749 }
653099ff
GS
750}
751
752/* Can match anything (initialization) */
753STATIC int
5f66b61c 754S_cl_is_anything(const struct regnode_charclass_class *cl)
653099ff
GS
755{
756 int value;
757
7918f24d
NC
758 PERL_ARGS_ASSERT_CL_IS_ANYTHING;
759
aaa51d5e 760 for (value = 0; value <= ANYOF_MAX; value += 2)
653099ff
GS
761 if (ANYOF_CLASS_TEST(cl, value) && ANYOF_CLASS_TEST(cl, value + 1))
762 return 1;
1aa99e6b
IH
763 if (!(cl->flags & ANYOF_UNICODE_ALL))
764 return 0;
10edeb5d 765 if (!ANYOF_BITMAP_TESTALLSET((const void*)cl))
f8bef550 766 return 0;
653099ff
GS
767 return 1;
768}
769
770/* Can match anything (initialization) */
771STATIC void
e755fd73 772S_cl_init(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl)
653099ff 773{
7918f24d
NC
774 PERL_ARGS_ASSERT_CL_INIT;
775
8ecf7187 776 Zero(cl, 1, struct regnode_charclass_class);
653099ff 777 cl->type = ANYOF;
3fffb88a 778 cl_anything(pRExC_state, cl);
1411dba4 779 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
653099ff
GS
780}
781
1051e1c4
KW
782/* These two functions currently do the exact same thing */
783#define cl_init_zero S_cl_init
653099ff 784
dd58aee1
KW
785/* 'AND' a given class with another one. Can create false positives. 'cl'
786 * should not be inverted. 'and_with->flags & ANYOF_CLASS' should be 0 if
787 * 'and_with' is a regnode_charclass instead of a regnode_charclass_class. */
653099ff 788STATIC void
5f66b61c 789S_cl_and(struct regnode_charclass_class *cl,
a28509cc 790 const struct regnode_charclass_class *and_with)
653099ff 791{
7918f24d 792 PERL_ARGS_ASSERT_CL_AND;
40d049e4
YO
793
794 assert(and_with->type == ANYOF);
1e6ade67 795
c6b76537 796 /* I (khw) am not sure all these restrictions are necessary XXX */
1e6ade67
KW
797 if (!(ANYOF_CLASS_TEST_ANY_SET(and_with))
798 && !(ANYOF_CLASS_TEST_ANY_SET(cl))
653099ff 799 && (and_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
800 && !(and_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
801 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) {
653099ff
GS
802 int i;
803
804 if (and_with->flags & ANYOF_INVERT)
805 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
806 cl->bitmap[i] &= ~and_with->bitmap[i];
807 else
808 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
809 cl->bitmap[i] &= and_with->bitmap[i];
810 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
1aa99e6b 811
c6b76537 812 if (and_with->flags & ANYOF_INVERT) {
8951c461 813
c6b76537
KW
814 /* Here, the and'ed node is inverted. Get the AND of the flags that
815 * aren't affected by the inversion. Those that are affected are
816 * handled individually below */
817 U8 affected_flags = cl->flags & ~INVERSION_UNAFFECTED_FLAGS;
818 cl->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
819 cl->flags |= affected_flags;
820
821 /* We currently don't know how to deal with things that aren't in the
822 * bitmap, but we know that the intersection is no greater than what
823 * is already in cl, so let there be false positives that get sorted
824 * out after the synthetic start class succeeds, and the node is
825 * matched for real. */
826
827 /* The inversion of these two flags indicate that the resulting
828 * intersection doesn't have them */
829 if (and_with->flags & ANYOF_UNICODE_ALL) {
4713bfe1
KW
830 cl->flags &= ~ANYOF_UNICODE_ALL;
831 }
c6b76537
KW
832 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
833 cl->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
137165a6 834 }
1aa99e6b 835 }
c6b76537 836 else { /* and'd node is not inverted */
3ad98780
KW
837 U8 outside_bitmap_but_not_utf8; /* Temp variable */
838
137165a6 839 if (! ANYOF_NONBITMAP(and_with)) {
c6b76537
KW
840
841 /* Here 'and_with' doesn't match anything outside the bitmap
842 * (except possibly ANYOF_UNICODE_ALL), which means the
843 * intersection can't either, except for ANYOF_UNICODE_ALL, in
844 * which case we don't know what the intersection is, but it's no
845 * greater than what cl already has, so can just leave it alone,
846 * with possible false positives */
847 if (! (and_with->flags & ANYOF_UNICODE_ALL)) {
848 ARG_SET(cl, ANYOF_NONBITMAP_EMPTY);
871d0d1a 849 cl->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
c6b76537 850 }
137165a6 851 }
c6b76537
KW
852 else if (! ANYOF_NONBITMAP(cl)) {
853
854 /* Here, 'and_with' does match something outside the bitmap, and cl
855 * doesn't have a list of things to match outside the bitmap. If
856 * cl can match all code points above 255, the intersection will
3ad98780
KW
857 * be those above-255 code points that 'and_with' matches. If cl
858 * can't match all Unicode code points, it means that it can't
859 * match anything outside the bitmap (since the 'if' that got us
860 * into this block tested for that), so we leave the bitmap empty.
861 */
c6b76537
KW
862 if (cl->flags & ANYOF_UNICODE_ALL) {
863 ARG_SET(cl, ARG(and_with));
3ad98780
KW
864
865 /* and_with's ARG may match things that don't require UTF8.
866 * And now cl's will too, in spite of this being an 'and'. See
867 * the comments below about the kludge */
868 cl->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
c6b76537
KW
869 }
870 }
871 else {
872 /* Here, both 'and_with' and cl match something outside the
873 * bitmap. Currently we do not do the intersection, so just match
874 * whatever cl had at the beginning. */
875 }
876
877
3ad98780
KW
878 /* Take the intersection of the two sets of flags. However, the
879 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
880 * kludge around the fact that this flag is not treated like the others
881 * which are initialized in cl_anything(). The way the optimizer works
882 * is that the synthetic start class (SSC) is initialized to match
883 * anything, and then the first time a real node is encountered, its
884 * values are AND'd with the SSC's with the result being the values of
885 * the real node. However, there are paths through the optimizer where
886 * the AND never gets called, so those initialized bits are set
887 * inappropriately, which is not usually a big deal, as they just cause
888 * false positives in the SSC, which will just mean a probably
889 * imperceptible slow down in execution. However this bit has a
890 * higher false positive consequence in that it can cause utf8.pm,
891 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
892 * bigger slowdown and also causes significant extra memory to be used.
893 * In order to prevent this, the code now takes a different tack. The
894 * bit isn't set unless some part of the regular expression needs it,
895 * but once set it won't get cleared. This means that these extra
896 * modules won't get loaded unless there was some path through the
897 * pattern that would have required them anyway, and so any false
898 * positives that occur by not ANDing them out when they could be
899 * aren't as severe as they would be if we treated this bit like all
900 * the others */
901 outside_bitmap_but_not_utf8 = (cl->flags | and_with->flags)
902 & ANYOF_NONBITMAP_NON_UTF8;
c6b76537 903 cl->flags &= and_with->flags;
3ad98780 904 cl->flags |= outside_bitmap_but_not_utf8;
137165a6 905 }
653099ff
GS
906}
907
dd58aee1
KW
908/* 'OR' a given class with another one. Can create false positives. 'cl'
909 * should not be inverted. 'or_with->flags & ANYOF_CLASS' should be 0 if
910 * 'or_with' is a regnode_charclass instead of a regnode_charclass_class. */
653099ff 911STATIC void
3fffb88a 912S_cl_or(const RExC_state_t *pRExC_state, struct regnode_charclass_class *cl, const struct regnode_charclass_class *or_with)
653099ff 913{
7918f24d
NC
914 PERL_ARGS_ASSERT_CL_OR;
915
653099ff 916 if (or_with->flags & ANYOF_INVERT) {
c6b76537
KW
917
918 /* Here, the or'd node is to be inverted. This means we take the
919 * complement of everything not in the bitmap, but currently we don't
920 * know what that is, so give up and match anything */
921 if (ANYOF_NONBITMAP(or_with)) {
3fffb88a 922 cl_anything(pRExC_state, cl);
c6b76537 923 }
653099ff
GS
924 /* We do not use
925 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
926 * <= (B1 | !B2) | (CL1 | !CL2)
927 * which is wasteful if CL2 is small, but we ignore CL2:
928 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
929 * XXXX Can we handle case-fold? Unclear:
930 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
931 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
932 */
c6b76537 933 else if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
934 && !(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
935 && !(cl->flags & ANYOF_LOC_NONBITMAP_FOLD) ) {
653099ff
GS
936 int i;
937
938 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
939 cl->bitmap[i] |= ~or_with->bitmap[i];
940 } /* XXXX: logic is complicated otherwise */
941 else {
3fffb88a 942 cl_anything(pRExC_state, cl);
653099ff 943 }
c6b76537
KW
944
945 /* And, we can just take the union of the flags that aren't affected
946 * by the inversion */
947 cl->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
948
949 /* For the remaining flags:
950 ANYOF_UNICODE_ALL and inverted means to not match anything above
951 255, which means that the union with cl should just be
952 what cl has in it, so can ignore this flag
953 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
954 is 127-255 to match them, but then invert that, so the
955 union with cl should just be what cl has in it, so can
956 ignore this flag
957 */
958 } else { /* 'or_with' is not inverted */
653099ff
GS
959 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
960 if ( (or_with->flags & ANYOF_LOCALE) == (cl->flags & ANYOF_LOCALE)
39065660
KW
961 && (!(or_with->flags & ANYOF_LOC_NONBITMAP_FOLD)
962 || (cl->flags & ANYOF_LOC_NONBITMAP_FOLD)) ) {
653099ff
GS
963 int i;
964
965 /* OR char bitmap and class bitmap separately */
966 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
967 cl->bitmap[i] |= or_with->bitmap[i];
1e6ade67 968 if (ANYOF_CLASS_TEST_ANY_SET(or_with)) {
653099ff
GS
969 for (i = 0; i < ANYOF_CLASSBITMAP_SIZE; i++)
970 cl->classflags[i] |= or_with->classflags[i];
971 cl->flags |= ANYOF_CLASS;
972 }
973 }
974 else { /* XXXX: logic is complicated, leave it along for a moment. */
3fffb88a 975 cl_anything(pRExC_state, cl);
653099ff 976 }
9826f543 977
c6b76537
KW
978 if (ANYOF_NONBITMAP(or_with)) {
979
980 /* Use the added node's outside-the-bit-map match if there isn't a
981 * conflict. If there is a conflict (both nodes match something
982 * outside the bitmap, but what they match outside is not the same
983 * pointer, and hence not easily compared until XXX we extend
984 * inversion lists this far), give up and allow the start class to
d94b1d13
KW
985 * match everything outside the bitmap. If that stuff is all above
986 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
c6b76537
KW
987 if (! ANYOF_NONBITMAP(cl)) {
988 ARG_SET(cl, ARG(or_with));
989 }
990 else if (ARG(cl) != ARG(or_with)) {
d94b1d13
KW
991
992 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
993 cl_anything(pRExC_state, cl);
994 }
995 else {
996 cl->flags |= ANYOF_UNICODE_ALL;
997 }
c6b76537 998 }
4c34a693 999 }
0b9668ee
KW
1000
1001 /* Take the union */
1002 cl->flags |= or_with->flags;
1aa99e6b 1003 }
653099ff
GS
1004}
1005
a3621e74
YO
1006#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1007#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1008#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1009#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1010
3dab1dad
YO
1011
1012#ifdef DEBUGGING
07be1b83 1013/*
2b8b4781
NC
1014 dump_trie(trie,widecharmap,revcharmap)
1015 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1016 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
1017
1018 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
1019 The _interim_ variants are used for debugging the interim
1020 tables that are used to generate the final compressed
1021 representation which is what dump_trie expects.
1022
486ec47a 1023 Part of the reason for their existence is to provide a form
3dab1dad 1024 of documentation as to how the different representations function.
07be1b83
YO
1025
1026*/
3dab1dad
YO
1027
1028/*
3dab1dad
YO
1029 Dumps the final compressed table form of the trie to Perl_debug_log.
1030 Used for debugging make_trie().
1031*/
b9a59e08 1032
3dab1dad 1033STATIC void
2b8b4781
NC
1034S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1035 AV *revcharmap, U32 depth)
3dab1dad
YO
1036{
1037 U32 state;
ab3bbdeb 1038 SV *sv=sv_newmortal();
55eed653 1039 int colwidth= widecharmap ? 6 : 4;
2e64971a 1040 U16 word;
3dab1dad
YO
1041 GET_RE_DEBUG_FLAGS_DECL;
1042
7918f24d 1043 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 1044
3dab1dad
YO
1045 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1046 (int)depth * 2 + 2,"",
1047 "Match","Base","Ofs" );
1048
1049 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 1050 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 1051 if ( tmp ) {
ab3bbdeb
YO
1052 PerlIO_printf( Perl_debug_log, "%*s",
1053 colwidth,
ddc5bc0f 1054 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1055 PL_colors[0], PL_colors[1],
1056 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1057 PERL_PV_ESCAPE_FIRSTCHAR
1058 )
1059 );
3dab1dad
YO
1060 }
1061 }
1062 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1063 (int)depth * 2 + 2,"");
1064
1065 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 1066 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
1067 PerlIO_printf( Perl_debug_log, "\n");
1068
1e2e3d02 1069 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 1070 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
1071
1072 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1073
1074 if ( trie->states[ state ].wordnum ) {
1075 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1076 } else {
1077 PerlIO_printf( Perl_debug_log, "%6s", "" );
1078 }
1079
1080 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1081
1082 if ( base ) {
1083 U32 ofs = 0;
1084
1085 while( ( base + ofs < trie->uniquecharcount ) ||
1086 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1087 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1088 ofs++;
1089
1090 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1091
1092 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1093 if ( ( base + ofs >= trie->uniquecharcount ) &&
1094 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1095 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1096 {
ab3bbdeb
YO
1097 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1098 colwidth,
3dab1dad
YO
1099 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1100 } else {
ab3bbdeb 1101 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
1102 }
1103 }
1104
1105 PerlIO_printf( Perl_debug_log, "]");
1106
1107 }
1108 PerlIO_printf( Perl_debug_log, "\n" );
1109 }
2e64971a
DM
1110 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1111 for (word=1; word <= trie->wordcount; word++) {
1112 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1113 (int)word, (int)(trie->wordinfo[word].prev),
1114 (int)(trie->wordinfo[word].len));
1115 }
1116 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
1117}
1118/*
3dab1dad
YO
1119 Dumps a fully constructed but uncompressed trie in list form.
1120 List tries normally only are used for construction when the number of
1121 possible chars (trie->uniquecharcount) is very high.
1122 Used for debugging make_trie().
1123*/
1124STATIC void
55eed653 1125S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1126 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1127 U32 depth)
3dab1dad
YO
1128{
1129 U32 state;
ab3bbdeb 1130 SV *sv=sv_newmortal();
55eed653 1131 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1132 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1133
1134 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1135
3dab1dad 1136 /* print out the table precompression. */
ab3bbdeb
YO
1137 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1138 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1139 "------:-----+-----------------\n" );
3dab1dad
YO
1140
1141 for( state=1 ; state < next_alloc ; state ++ ) {
1142 U16 charid;
1143
ab3bbdeb 1144 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1145 (int)depth * 2 + 2,"", (UV)state );
1146 if ( ! trie->states[ state ].wordnum ) {
1147 PerlIO_printf( Perl_debug_log, "%5s| ","");
1148 } else {
1149 PerlIO_printf( Perl_debug_log, "W%4x| ",
1150 trie->states[ state ].wordnum
1151 );
1152 }
1153 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1154 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1155 if ( tmp ) {
1156 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1157 colwidth,
ddc5bc0f 1158 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1159 PL_colors[0], PL_colors[1],
1160 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1161 PERL_PV_ESCAPE_FIRSTCHAR
1162 ) ,
1e2e3d02
YO
1163 TRIE_LIST_ITEM(state,charid).forid,
1164 (UV)TRIE_LIST_ITEM(state,charid).newstate
1165 );
1166 if (!(charid % 10))
664e119d
RGS
1167 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1168 (int)((depth * 2) + 14), "");
1e2e3d02 1169 }
ab3bbdeb
YO
1170 }
1171 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1172 }
1173}
1174
1175/*
3dab1dad
YO
1176 Dumps a fully constructed but uncompressed trie in table form.
1177 This is the normal DFA style state transition table, with a few
1178 twists to facilitate compression later.
1179 Used for debugging make_trie().
1180*/
1181STATIC void
55eed653 1182S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1183 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1184 U32 depth)
3dab1dad
YO
1185{
1186 U32 state;
1187 U16 charid;
ab3bbdeb 1188 SV *sv=sv_newmortal();
55eed653 1189 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1190 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1191
1192 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1193
1194 /*
1195 print out the table precompression so that we can do a visual check
1196 that they are identical.
1197 */
1198
1199 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1200
1201 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1202 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1203 if ( tmp ) {
ab3bbdeb
YO
1204 PerlIO_printf( Perl_debug_log, "%*s",
1205 colwidth,
ddc5bc0f 1206 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1207 PL_colors[0], PL_colors[1],
1208 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1209 PERL_PV_ESCAPE_FIRSTCHAR
1210 )
1211 );
3dab1dad
YO
1212 }
1213 }
1214
1215 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1216
1217 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1218 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1219 }
1220
1221 PerlIO_printf( Perl_debug_log, "\n" );
1222
1223 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1224
1225 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1226 (int)depth * 2 + 2,"",
1227 (UV)TRIE_NODENUM( state ) );
1228
1229 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1230 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1231 if (v)
1232 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1233 else
1234 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1235 }
1236 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1237 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1238 } else {
1239 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1240 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1241 }
1242 }
07be1b83 1243}
3dab1dad
YO
1244
1245#endif
1246
2e64971a 1247
786e8c11
YO
1248/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1249 startbranch: the first branch in the whole branch sequence
1250 first : start branch of sequence of branch-exact nodes.
1251 May be the same as startbranch
1252 last : Thing following the last branch.
1253 May be the same as tail.
1254 tail : item following the branch sequence
1255 count : words in the sequence
1256 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1257 depth : indent depth
3dab1dad 1258
786e8c11 1259Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1260
786e8c11
YO
1261A trie is an N'ary tree where the branches are determined by digital
1262decomposition of the key. IE, at the root node you look up the 1st character and
1263follow that branch repeat until you find the end of the branches. Nodes can be
1264marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1265
786e8c11 1266 /he|she|his|hers/
72f13be8 1267
786e8c11
YO
1268would convert into the following structure. Numbers represent states, letters
1269following numbers represent valid transitions on the letter from that state, if
1270the number is in square brackets it represents an accepting state, otherwise it
1271will be in parenthesis.
07be1b83 1272
786e8c11
YO
1273 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1274 | |
1275 | (2)
1276 | |
1277 (1) +-i->(6)-+-s->[7]
1278 |
1279 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1280
786e8c11
YO
1281 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1282
1283This shows that when matching against the string 'hers' we will begin at state 1
1284read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1285then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1286is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1287single traverse. We store a mapping from accepting to state to which word was
1288matched, and then when we have multiple possibilities we try to complete the
1289rest of the regex in the order in which they occured in the alternation.
1290
1291The only prior NFA like behaviour that would be changed by the TRIE support is
1292the silent ignoring of duplicate alternations which are of the form:
1293
1294 / (DUPE|DUPE) X? (?{ ... }) Y /x
1295
4b714af6 1296Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 1297and without the optimisation. With the optimisations dupes will be silently
486ec47a 1298ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
1299the following demonstrates:
1300
1301 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1302
1303which prints out 'word' three times, but
1304
1305 'words'=~/(word|word|word)(?{ print $1 })S/
1306
1307which doesnt print it out at all. This is due to other optimisations kicking in.
1308
1309Example of what happens on a structural level:
1310
486ec47a 1311The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
1312
1313 1: CURLYM[1] {1,32767}(18)
1314 5: BRANCH(8)
1315 6: EXACT <ac>(16)
1316 8: BRANCH(11)
1317 9: EXACT <ad>(16)
1318 11: BRANCH(14)
1319 12: EXACT <ab>(16)
1320 16: SUCCEED(0)
1321 17: NOTHING(18)
1322 18: END(0)
1323
1324This would be optimizable with startbranch=5, first=5, last=16, tail=16
1325and should turn into:
1326
1327 1: CURLYM[1] {1,32767}(18)
1328 5: TRIE(16)
1329 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1330 <ac>
1331 <ad>
1332 <ab>
1333 16: SUCCEED(0)
1334 17: NOTHING(18)
1335 18: END(0)
1336
1337Cases where tail != last would be like /(?foo|bar)baz/:
1338
1339 1: BRANCH(4)
1340 2: EXACT <foo>(8)
1341 4: BRANCH(7)
1342 5: EXACT <bar>(8)
1343 7: TAIL(8)
1344 8: EXACT <baz>(10)
1345 10: END(0)
1346
1347which would be optimizable with startbranch=1, first=1, last=7, tail=8
1348and would end up looking like:
1349
1350 1: TRIE(8)
1351 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1352 <foo>
1353 <bar>
1354 7: TAIL(8)
1355 8: EXACT <baz>(10)
1356 10: END(0)
1357
1358 d = uvuni_to_utf8_flags(d, uv, 0);
1359
1360is the recommended Unicode-aware way of saying
1361
1362 *(d++) = uv;
1363*/
1364
1e2e3d02 1365#define TRIE_STORE_REVCHAR \
786e8c11 1366 STMT_START { \
73031816
NC
1367 if (UTF) { \
1368 SV *zlopp = newSV(2); \
88c9ea1e
CB
1369 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1370 unsigned const char *const kapow = uvuni_to_utf8(flrbbbbb, uvc & 0xFF); \
73031816
NC
1371 SvCUR_set(zlopp, kapow - flrbbbbb); \
1372 SvPOK_on(zlopp); \
1373 SvUTF8_on(zlopp); \
1374 av_push(revcharmap, zlopp); \
1375 } else { \
6bdeddd2 1376 char ooooff = (char)uvc; \
73031816
NC
1377 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1378 } \
1379 } STMT_END
786e8c11
YO
1380
1381#define TRIE_READ_CHAR STMT_START { \
1382 wordlen++; \
1383 if ( UTF ) { \
1384 if ( folder ) { \
1385 if ( foldlen > 0 ) { \
1386 uvc = utf8n_to_uvuni( scan, UTF8_MAXLEN, &len, uniflags ); \
1387 foldlen -= len; \
1388 scan += len; \
1389 len = 0; \
1390 } else { \
c81f2f9e
KW
1391 len = UTF8SKIP(uc);\
1392 uvc = to_utf8_fold( uc, foldbuf, &foldlen); \
786e8c11
YO
1393 foldlen -= UNISKIP( uvc ); \
1394 scan = foldbuf + UNISKIP( uvc ); \
1395 } \
1396 } else { \
1397 uvc = utf8n_to_uvuni( (const U8*)uc, UTF8_MAXLEN, &len, uniflags);\
1398 } \
1399 } else { \
1400 uvc = (U32)*uc; \
1401 len = 1; \
1402 } \
1403} STMT_END
1404
1405
1406
1407#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1408 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1409 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1410 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1411 } \
1412 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1413 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1414 TRIE_LIST_CUR( state )++; \
1415} STMT_END
07be1b83 1416
786e8c11
YO
1417#define TRIE_LIST_NEW(state) STMT_START { \
1418 Newxz( trie->states[ state ].trans.list, \
1419 4, reg_trie_trans_le ); \
1420 TRIE_LIST_CUR( state ) = 1; \
1421 TRIE_LIST_LEN( state ) = 4; \
1422} STMT_END
07be1b83 1423
786e8c11
YO
1424#define TRIE_HANDLE_WORD(state) STMT_START { \
1425 U16 dupe= trie->states[ state ].wordnum; \
1426 regnode * const noper_next = regnext( noper ); \
1427 \
786e8c11
YO
1428 DEBUG_r({ \
1429 /* store the word for dumping */ \
1430 SV* tmp; \
1431 if (OP(noper) != NOTHING) \
740cce10 1432 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1433 else \
740cce10 1434 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1435 av_push( trie_words, tmp ); \
786e8c11
YO
1436 }); \
1437 \
1438 curword++; \
2e64971a
DM
1439 trie->wordinfo[curword].prev = 0; \
1440 trie->wordinfo[curword].len = wordlen; \
1441 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1442 \
1443 if ( noper_next < tail ) { \
1444 if (!trie->jump) \
c944940b 1445 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1446 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1447 if (!jumper) \
1448 jumper = noper_next; \
1449 if (!nextbranch) \
1450 nextbranch= regnext(cur); \
1451 } \
1452 \
1453 if ( dupe ) { \
2e64971a
DM
1454 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1455 /* chain, so that when the bits of chain are later */\
1456 /* linked together, the dups appear in the chain */\
1457 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1458 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1459 } else { \
1460 /* we haven't inserted this word yet. */ \
1461 trie->states[ state ].wordnum = curword; \
1462 } \
1463} STMT_END
07be1b83 1464
3dab1dad 1465
786e8c11
YO
1466#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1467 ( ( base + charid >= ucharcount \
1468 && base + charid < ubound \
1469 && state == trie->trans[ base - ucharcount + charid ].check \
1470 && trie->trans[ base - ucharcount + charid ].next ) \
1471 ? trie->trans[ base - ucharcount + charid ].next \
1472 : ( state==1 ? special : 0 ) \
1473 )
3dab1dad 1474
786e8c11
YO
1475#define MADE_TRIE 1
1476#define MADE_JUMP_TRIE 2
1477#define MADE_EXACT_TRIE 4
3dab1dad 1478
a3621e74 1479STATIC I32
786e8c11 1480S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1481{
27da23d5 1482 dVAR;
a3621e74
YO
1483 /* first pass, loop through and scan words */
1484 reg_trie_data *trie;
55eed653 1485 HV *widecharmap = NULL;
2b8b4781 1486 AV *revcharmap = newAV();
a3621e74 1487 regnode *cur;
9f7f3913 1488 const U32 uniflags = UTF8_ALLOW_DEFAULT;
a3621e74
YO
1489 STRLEN len = 0;
1490 UV uvc = 0;
1491 U16 curword = 0;
1492 U32 next_alloc = 0;
786e8c11
YO
1493 regnode *jumper = NULL;
1494 regnode *nextbranch = NULL;
7f69552c 1495 regnode *convert = NULL;
2e64971a 1496 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1497 /* we just use folder as a flag in utf8 */
1e696034 1498 const U8 * folder = NULL;
a3621e74 1499
2b8b4781
NC
1500#ifdef DEBUGGING
1501 const U32 data_slot = add_data( pRExC_state, 4, "tuuu" );
1502 AV *trie_words = NULL;
1503 /* along with revcharmap, this only used during construction but both are
1504 * useful during debugging so we store them in the struct when debugging.
8e11feef 1505 */
2b8b4781
NC
1506#else
1507 const U32 data_slot = add_data( pRExC_state, 2, "tu" );
3dab1dad 1508 STRLEN trie_charcount=0;
3dab1dad 1509#endif
2b8b4781 1510 SV *re_trie_maxbuff;
a3621e74 1511 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1512
1513 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1514#ifndef DEBUGGING
1515 PERL_UNUSED_ARG(depth);
1516#endif
a3621e74 1517
1e696034 1518 switch (flags) {
c46d03cf 1519 case EXACT: break;
2f7f8cb1 1520 case EXACTFA:
1e696034
KW
1521 case EXACTFU: folder = PL_fold_latin1; break;
1522 case EXACTF: folder = PL_fold; break;
1523 case EXACTFL: folder = PL_fold_locale; break;
c46d03cf 1524 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u", (unsigned) flags );
1e696034
KW
1525 }
1526
c944940b 1527 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1528 trie->refcount = 1;
3dab1dad 1529 trie->startstate = 1;
786e8c11 1530 trie->wordcount = word_count;
f8fc2ecf 1531 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1532 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3dab1dad 1533 if (!(UTF && folder))
c944940b 1534 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1535 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1536 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1537
a3621e74 1538 DEBUG_r({
2b8b4781 1539 trie_words = newAV();
a3621e74 1540 });
a3621e74 1541
0111c4fd 1542 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1543 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1544 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1545 }
3dab1dad
YO
1546 DEBUG_OPTIMISE_r({
1547 PerlIO_printf( Perl_debug_log,
786e8c11 1548 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1549 (int)depth * 2 + 2, "",
1550 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1551 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1552 (int)depth);
3dab1dad 1553 });
7f69552c
YO
1554
1555 /* Find the node we are going to overwrite */
1556 if ( first == startbranch && OP( last ) != BRANCH ) {
1557 /* whole branch chain */
1558 convert = first;
1559 } else {
1560 /* branch sub-chain */
1561 convert = NEXTOPER( first );
1562 }
1563
a3621e74
YO
1564 /* -- First loop and Setup --
1565
1566 We first traverse the branches and scan each word to determine if it
1567 contains widechars, and how many unique chars there are, this is
1568 important as we have to build a table with at least as many columns as we
1569 have unique chars.
1570
1571 We use an array of integers to represent the character codes 0..255
38a44b82 1572 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1573 native representation of the character value as the key and IV's for the
1574 coded index.
1575
1576 *TODO* If we keep track of how many times each character is used we can
1577 remap the columns so that the table compression later on is more
3b753521 1578 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
1579 middle and the least common are on the outside. IMO this would be better
1580 than a most to least common mapping as theres a decent chance the most
1581 common letter will share a node with the least common, meaning the node
486ec47a 1582 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
1583 case is when we have the least common nodes twice.
1584
1585 */
1586
a3621e74 1587 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
c445ea15 1588 regnode * const noper = NEXTOPER( cur );
e1ec3a88 1589 const U8 *uc = (U8*)STRING( noper );
a28509cc 1590 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1591 STRLEN foldlen = 0;
1592 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
2af232bd 1593 const U8 *scan = (U8*)NULL;
07be1b83 1594 U32 wordlen = 0; /* required init */
02daf0ab
YO
1595 STRLEN chars = 0;
1596 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1597
3dab1dad
YO
1598 if (OP(noper) == NOTHING) {
1599 trie->minlen= 0;
1600 continue;
1601 }
02daf0ab
YO
1602 if ( set_bit ) /* bitmap only alloced when !(UTF&&Folding) */
1603 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1604 regardless of encoding */
1605
a3621e74 1606 for ( ; uc < e ; uc += len ) {
3dab1dad 1607 TRIE_CHARCOUNT(trie)++;
a3621e74 1608 TRIE_READ_CHAR;
3dab1dad 1609 chars++;
a3621e74
YO
1610 if ( uvc < 256 ) {
1611 if ( !trie->charmap[ uvc ] ) {
1612 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
1613 if ( folder )
1614 trie->charmap[ folder[ uvc ] ] = trie->charmap[ uvc ];
3dab1dad 1615 TRIE_STORE_REVCHAR;
a3621e74 1616 }
02daf0ab 1617 if ( set_bit ) {
62012aee
KW
1618 /* store the codepoint in the bitmap, and its folded
1619 * equivalent. */
02daf0ab 1620 TRIE_BITMAP_SET(trie,uvc);
0921ee73
T
1621
1622 /* store the folded codepoint */
1623 if ( folder ) TRIE_BITMAP_SET(trie,folder[ uvc ]);
1624
1625 if ( !UTF ) {
1626 /* store first byte of utf8 representation of
acdf4139
KW
1627 variant codepoints */
1628 if (! UNI_IS_INVARIANT(uvc)) {
1629 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
1630 }
1631 }
02daf0ab
YO
1632 set_bit = 0; /* We've done our bit :-) */
1633 }
a3621e74
YO
1634 } else {
1635 SV** svpp;
55eed653
NC
1636 if ( !widecharmap )
1637 widecharmap = newHV();
a3621e74 1638
55eed653 1639 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1640
1641 if ( !svpp )
e4584336 1642 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1643
1644 if ( !SvTRUE( *svpp ) ) {
1645 sv_setiv( *svpp, ++trie->uniquecharcount );
3dab1dad 1646 TRIE_STORE_REVCHAR;
a3621e74
YO
1647 }
1648 }
1649 }
3dab1dad
YO
1650 if( cur == first ) {
1651 trie->minlen=chars;
1652 trie->maxlen=chars;
1653 } else if (chars < trie->minlen) {
1654 trie->minlen=chars;
1655 } else if (chars > trie->maxlen) {
1656 trie->maxlen=chars;
1657 }
1658
a3621e74
YO
1659 } /* end first pass */
1660 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1661 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1662 (int)depth * 2 + 2,"",
55eed653 1663 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1664 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1665 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1666 );
a3621e74
YO
1667
1668 /*
1669 We now know what we are dealing with in terms of unique chars and
1670 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1671 representation using a flat table will take. If it's over a reasonable
1672 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1673 conservative but potentially much slower representation using an array
1674 of lists.
1675
1676 At the end we convert both representations into the same compressed
1677 form that will be used in regexec.c for matching with. The latter
1678 is a form that cannot be used to construct with but has memory
1679 properties similar to the list form and access properties similar
1680 to the table form making it both suitable for fast searches and
1681 small enough that its feasable to store for the duration of a program.
1682
1683 See the comment in the code where the compressed table is produced
1684 inplace from the flat tabe representation for an explanation of how
1685 the compression works.
1686
1687 */
1688
1689
2e64971a
DM
1690 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1691 prev_states[1] = 0;
1692
3dab1dad 1693 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1694 /*
1695 Second Pass -- Array Of Lists Representation
1696
1697 Each state will be represented by a list of charid:state records
1698 (reg_trie_trans_le) the first such element holds the CUR and LEN
1699 points of the allocated array. (See defines above).
1700
1701 We build the initial structure using the lists, and then convert
1702 it into the compressed table form which allows faster lookups
1703 (but cant be modified once converted).
a3621e74
YO
1704 */
1705
a3621e74
YO
1706 STRLEN transcount = 1;
1707
1e2e3d02
YO
1708 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1709 "%*sCompiling trie using list compiler\n",
1710 (int)depth * 2 + 2, ""));
686b73d4 1711
c944940b
JH
1712 trie->states = (reg_trie_state *)
1713 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1714 sizeof(reg_trie_state) );
a3621e74
YO
1715 TRIE_LIST_NEW(1);
1716 next_alloc = 2;
1717
1718 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1719
c445ea15
AL
1720 regnode * const noper = NEXTOPER( cur );
1721 U8 *uc = (U8*)STRING( noper );
1722 const U8 * const e = uc + STR_LEN( noper );
1723 U32 state = 1; /* required init */
1724 U16 charid = 0; /* sanity init */
1725 U8 *scan = (U8*)NULL; /* sanity init */
1726 STRLEN foldlen = 0; /* required init */
07be1b83 1727 U32 wordlen = 0; /* required init */
c445ea15
AL
1728 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1729
3dab1dad 1730 if (OP(noper) != NOTHING) {
786e8c11 1731 for ( ; uc < e ; uc += len ) {
c445ea15 1732
786e8c11 1733 TRIE_READ_CHAR;
c445ea15 1734
786e8c11
YO
1735 if ( uvc < 256 ) {
1736 charid = trie->charmap[ uvc ];
c445ea15 1737 } else {
55eed653 1738 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1739 if ( !svpp ) {
1740 charid = 0;
1741 } else {
1742 charid=(U16)SvIV( *svpp );
1743 }
c445ea15 1744 }
786e8c11
YO
1745 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1746 if ( charid ) {
a3621e74 1747
786e8c11
YO
1748 U16 check;
1749 U32 newstate = 0;
a3621e74 1750
786e8c11
YO
1751 charid--;
1752 if ( !trie->states[ state ].trans.list ) {
1753 TRIE_LIST_NEW( state );
c445ea15 1754 }
786e8c11
YO
1755 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1756 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1757 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1758 break;
1759 }
1760 }
1761 if ( ! newstate ) {
1762 newstate = next_alloc++;
2e64971a 1763 prev_states[newstate] = state;
786e8c11
YO
1764 TRIE_LIST_PUSH( state, charid, newstate );
1765 transcount++;
1766 }
1767 state = newstate;
1768 } else {
1769 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1770 }
a28509cc 1771 }
c445ea15 1772 }
3dab1dad 1773 TRIE_HANDLE_WORD(state);
a3621e74
YO
1774
1775 } /* end second pass */
1776
1e2e3d02
YO
1777 /* next alloc is the NEXT state to be allocated */
1778 trie->statecount = next_alloc;
c944940b
JH
1779 trie->states = (reg_trie_state *)
1780 PerlMemShared_realloc( trie->states,
1781 next_alloc
1782 * sizeof(reg_trie_state) );
a3621e74 1783
3dab1dad 1784 /* and now dump it out before we compress it */
2b8b4781
NC
1785 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1786 revcharmap, next_alloc,
1787 depth+1)
1e2e3d02 1788 );
a3621e74 1789
c944940b
JH
1790 trie->trans = (reg_trie_trans *)
1791 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1792 {
1793 U32 state;
a3621e74
YO
1794 U32 tp = 0;
1795 U32 zp = 0;
1796
1797
1798 for( state=1 ; state < next_alloc ; state ++ ) {
1799 U32 base=0;
1800
1801 /*
1802 DEBUG_TRIE_COMPILE_MORE_r(
1803 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1804 );
1805 */
1806
1807 if (trie->states[state].trans.list) {
1808 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1809 U16 maxid=minid;
a28509cc 1810 U16 idx;
a3621e74
YO
1811
1812 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1813 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1814 if ( forid < minid ) {
1815 minid=forid;
1816 } else if ( forid > maxid ) {
1817 maxid=forid;
1818 }
a3621e74
YO
1819 }
1820 if ( transcount < tp + maxid - minid + 1) {
1821 transcount *= 2;
c944940b
JH
1822 trie->trans = (reg_trie_trans *)
1823 PerlMemShared_realloc( trie->trans,
446bd890
NC
1824 transcount
1825 * sizeof(reg_trie_trans) );
a3621e74
YO
1826 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1827 }
1828 base = trie->uniquecharcount + tp - minid;
1829 if ( maxid == minid ) {
1830 U32 set = 0;
1831 for ( ; zp < tp ; zp++ ) {
1832 if ( ! trie->trans[ zp ].next ) {
1833 base = trie->uniquecharcount + zp - minid;
1834 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1835 trie->trans[ zp ].check = state;
1836 set = 1;
1837 break;
1838 }
1839 }
1840 if ( !set ) {
1841 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1842 trie->trans[ tp ].check = state;
1843 tp++;
1844 zp = tp;
1845 }
1846 } else {
1847 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 1848 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
1849 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
1850 trie->trans[ tid ].check = state;
1851 }
1852 tp += ( maxid - minid + 1 );
1853 }
1854 Safefree(trie->states[ state ].trans.list);
1855 }
1856 /*
1857 DEBUG_TRIE_COMPILE_MORE_r(
1858 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
1859 );
1860 */
1861 trie->states[ state ].trans.base=base;
1862 }
cc601c31 1863 trie->lasttrans = tp + 1;
a3621e74
YO
1864 }
1865 } else {
1866 /*
1867 Second Pass -- Flat Table Representation.
1868
1869 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
1870 We know that we will need Charcount+1 trans at most to store the data
1871 (one row per char at worst case) So we preallocate both structures
1872 assuming worst case.
1873
1874 We then construct the trie using only the .next slots of the entry
1875 structs.
1876
3b753521 1877 We use the .check field of the first entry of the node temporarily to
a3621e74
YO
1878 make compression both faster and easier by keeping track of how many non
1879 zero fields are in the node.
1880
1881 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
1882 transition.
1883
1884 There are two terms at use here: state as a TRIE_NODEIDX() which is a
1885 number representing the first entry of the node, and state as a
1886 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
1887 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
1888 are 2 entrys per node. eg:
1889
1890 A B A B
1891 1. 2 4 1. 3 7
1892 2. 0 3 3. 0 5
1893 3. 0 0 5. 0 0
1894 4. 0 0 7. 0 0
1895
1896 The table is internally in the right hand, idx form. However as we also
1897 have to deal with the states array which is indexed by nodenum we have to
1898 use TRIE_NODENUM() to convert.
1899
1900 */
1e2e3d02
YO
1901 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1902 "%*sCompiling trie using table compiler\n",
1903 (int)depth * 2 + 2, ""));
3dab1dad 1904
c944940b
JH
1905 trie->trans = (reg_trie_trans *)
1906 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
1907 * trie->uniquecharcount + 1,
1908 sizeof(reg_trie_trans) );
1909 trie->states = (reg_trie_state *)
1910 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1911 sizeof(reg_trie_state) );
a3621e74
YO
1912 next_alloc = trie->uniquecharcount + 1;
1913
3dab1dad 1914
a3621e74
YO
1915 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1916
c445ea15 1917 regnode * const noper = NEXTOPER( cur );
a28509cc
AL
1918 const U8 *uc = (U8*)STRING( noper );
1919 const U8 * const e = uc + STR_LEN( noper );
a3621e74
YO
1920
1921 U32 state = 1; /* required init */
1922
1923 U16 charid = 0; /* sanity init */
1924 U32 accept_state = 0; /* sanity init */
1925 U8 *scan = (U8*)NULL; /* sanity init */
1926
1927 STRLEN foldlen = 0; /* required init */
07be1b83 1928 U32 wordlen = 0; /* required init */
a3621e74
YO
1929 U8 foldbuf[ UTF8_MAXBYTES_CASE + 1 ];
1930
3dab1dad 1931 if ( OP(noper) != NOTHING ) {
786e8c11 1932 for ( ; uc < e ; uc += len ) {
a3621e74 1933
786e8c11 1934 TRIE_READ_CHAR;
a3621e74 1935
786e8c11
YO
1936 if ( uvc < 256 ) {
1937 charid = trie->charmap[ uvc ];
1938 } else {
55eed653 1939 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 1940 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 1941 }
786e8c11
YO
1942 if ( charid ) {
1943 charid--;
1944 if ( !trie->trans[ state + charid ].next ) {
1945 trie->trans[ state + charid ].next = next_alloc;
1946 trie->trans[ state ].check++;
2e64971a
DM
1947 prev_states[TRIE_NODENUM(next_alloc)]
1948 = TRIE_NODENUM(state);
786e8c11
YO
1949 next_alloc += trie->uniquecharcount;
1950 }
1951 state = trie->trans[ state + charid ].next;
1952 } else {
1953 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
1954 }
1955 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 1956 }
a3621e74 1957 }
3dab1dad
YO
1958 accept_state = TRIE_NODENUM( state );
1959 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
1960
1961 } /* end second pass */
1962
3dab1dad 1963 /* and now dump it out before we compress it */
2b8b4781
NC
1964 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
1965 revcharmap,
1966 next_alloc, depth+1));
a3621e74 1967
a3621e74
YO
1968 {
1969 /*
1970 * Inplace compress the table.*
1971
1972 For sparse data sets the table constructed by the trie algorithm will
1973 be mostly 0/FAIL transitions or to put it another way mostly empty.
1974 (Note that leaf nodes will not contain any transitions.)
1975
1976 This algorithm compresses the tables by eliminating most such
1977 transitions, at the cost of a modest bit of extra work during lookup:
1978
1979 - Each states[] entry contains a .base field which indicates the
1980 index in the state[] array wheres its transition data is stored.
1981
3b753521 1982 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
1983
1984 - If .base is nonzero then charid is added to it to find an entry in
1985 the trans array.
1986
1987 -If trans[states[state].base+charid].check!=state then the
1988 transition is taken to be a 0/Fail transition. Thus if there are fail
1989 transitions at the front of the node then the .base offset will point
1990 somewhere inside the previous nodes data (or maybe even into a node
1991 even earlier), but the .check field determines if the transition is
1992 valid.
1993
786e8c11 1994 XXX - wrong maybe?
a3621e74 1995 The following process inplace converts the table to the compressed
3b753521 1996 table: We first do not compress the root node 1,and mark all its
a3621e74 1997 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
1998 allows us to do a DFA construction from the compressed table later,
1999 and ensures that any .base pointers we calculate later are greater
2000 than 0.
a3621e74
YO
2001
2002 - We set 'pos' to indicate the first entry of the second node.
2003
2004 - We then iterate over the columns of the node, finding the first and
2005 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2006 and set the .check pointers accordingly, and advance pos
2007 appropriately and repreat for the next node. Note that when we copy
2008 the next pointers we have to convert them from the original
2009 NODEIDX form to NODENUM form as the former is not valid post
2010 compression.
2011
2012 - If a node has no transitions used we mark its base as 0 and do not
2013 advance the pos pointer.
2014
2015 - If a node only has one transition we use a second pointer into the
2016 structure to fill in allocated fail transitions from other states.
2017 This pointer is independent of the main pointer and scans forward
2018 looking for null transitions that are allocated to a state. When it
2019 finds one it writes the single transition into the "hole". If the
786e8c11 2020 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
2021
2022 - Once compressed we can Renew/realloc the structures to release the
2023 excess space.
2024
2025 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2026 specifically Fig 3.47 and the associated pseudocode.
2027
2028 demq
2029 */
a3b680e6 2030 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 2031 U32 state, charid;
a3621e74 2032 U32 pos = 0, zp=0;
1e2e3d02 2033 trie->statecount = laststate;
a3621e74
YO
2034
2035 for ( state = 1 ; state < laststate ; state++ ) {
2036 U8 flag = 0;
a28509cc
AL
2037 const U32 stateidx = TRIE_NODEIDX( state );
2038 const U32 o_used = trie->trans[ stateidx ].check;
2039 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
2040 trie->trans[ stateidx ].check = 0;
2041
2042 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2043 if ( flag || trie->trans[ stateidx + charid ].next ) {
2044 if ( trie->trans[ stateidx + charid ].next ) {
2045 if (o_used == 1) {
2046 for ( ; zp < pos ; zp++ ) {
2047 if ( ! trie->trans[ zp ].next ) {
2048 break;
2049 }
2050 }
2051 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2052 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2053 trie->trans[ zp ].check = state;
2054 if ( ++zp > pos ) pos = zp;
2055 break;
2056 }
2057 used--;
2058 }
2059 if ( !flag ) {
2060 flag = 1;
2061 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2062 }
2063 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2064 trie->trans[ pos ].check = state;
2065 pos++;
2066 }
2067 }
2068 }
cc601c31 2069 trie->lasttrans = pos + 1;
c944940b
JH
2070 trie->states = (reg_trie_state *)
2071 PerlMemShared_realloc( trie->states, laststate
2072 * sizeof(reg_trie_state) );
a3621e74 2073 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 2074 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2075 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2076 (int)depth * 2 + 2,"",
2077 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
2078 (IV)next_alloc,
2079 (IV)pos,
a3621e74
YO
2080 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2081 );
2082
2083 } /* end table compress */
2084 }
1e2e3d02
YO
2085 DEBUG_TRIE_COMPILE_MORE_r(
2086 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2087 (int)depth * 2 + 2, "",
2088 (UV)trie->statecount,
2089 (UV)trie->lasttrans)
2090 );
cc601c31 2091 /* resize the trans array to remove unused space */
c944940b
JH
2092 trie->trans = (reg_trie_trans *)
2093 PerlMemShared_realloc( trie->trans, trie->lasttrans
2094 * sizeof(reg_trie_trans) );
a3621e74 2095
3b753521 2096 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
2097 U8 nodetype =(U8)(flags & 0xFF);
2098 char *str=NULL;
786e8c11 2099
07be1b83 2100#ifdef DEBUGGING
e62cc96a 2101 regnode *optimize = NULL;
7122b237
YO
2102#ifdef RE_TRACK_PATTERN_OFFSETS
2103
b57a0404
JH
2104 U32 mjd_offset = 0;
2105 U32 mjd_nodelen = 0;
7122b237
YO
2106#endif /* RE_TRACK_PATTERN_OFFSETS */
2107#endif /* DEBUGGING */
a3621e74 2108 /*
3dab1dad
YO
2109 This means we convert either the first branch or the first Exact,
2110 depending on whether the thing following (in 'last') is a branch
2111 or not and whther first is the startbranch (ie is it a sub part of
2112 the alternation or is it the whole thing.)
3b753521 2113 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 2114 the whole branch sequence, including the first.
a3621e74 2115 */
3dab1dad 2116 /* Find the node we are going to overwrite */
7f69552c 2117 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 2118 /* branch sub-chain */
3dab1dad 2119 NEXT_OFF( first ) = (U16)(last - first);
7122b237 2120#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
2121 DEBUG_r({
2122 mjd_offset= Node_Offset((convert));
2123 mjd_nodelen= Node_Length((convert));
2124 });
7122b237 2125#endif
7f69552c 2126 /* whole branch chain */
7122b237
YO
2127 }
2128#ifdef RE_TRACK_PATTERN_OFFSETS
2129 else {
7f69552c
YO
2130 DEBUG_r({
2131 const regnode *nop = NEXTOPER( convert );
2132 mjd_offset= Node_Offset((nop));
2133 mjd_nodelen= Node_Length((nop));
2134 });
07be1b83
YO
2135 }
2136 DEBUG_OPTIMISE_r(
2137 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2138 (int)depth * 2 + 2, "",
786e8c11 2139 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2140 );
7122b237 2141#endif
3dab1dad
YO
2142 /* But first we check to see if there is a common prefix we can
2143 split out as an EXACT and put in front of the TRIE node. */
2144 trie->startstate= 1;
55eed653 2145 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2146 U32 state;
1e2e3d02 2147 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2148 U32 ofs = 0;
8e11feef
RGS
2149 I32 idx = -1;
2150 U32 count = 0;
2151 const U32 base = trie->states[ state ].trans.base;
a3621e74 2152
3dab1dad 2153 if ( trie->states[state].wordnum )
8e11feef 2154 count = 1;
a3621e74 2155
8e11feef 2156 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2157 if ( ( base + ofs >= trie->uniquecharcount ) &&
2158 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2159 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2160 {
3dab1dad 2161 if ( ++count > 1 ) {
2b8b4781 2162 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2163 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2164 if ( state == 1 ) break;
3dab1dad
YO
2165 if ( count == 2 ) {
2166 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2167 DEBUG_OPTIMISE_r(
8e11feef
RGS
2168 PerlIO_printf(Perl_debug_log,
2169 "%*sNew Start State=%"UVuf" Class: [",
2170 (int)depth * 2 + 2, "",
786e8c11 2171 (UV)state));
be8e71aa 2172 if (idx >= 0) {
2b8b4781 2173 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2174 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2175
3dab1dad 2176 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2177 if ( folder )
2178 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2179 DEBUG_OPTIMISE_r(
f1f66076 2180 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2181 );
8e11feef
RGS
2182 }
2183 }
2184 TRIE_BITMAP_SET(trie,*ch);
2185 if ( folder )
2186 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2187 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2188 }
2189 idx = ofs;
2190 }
3dab1dad
YO
2191 }
2192 if ( count == 1 ) {
2b8b4781 2193 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2194 STRLEN len;
2195 char *ch = SvPV( *tmp, len );
de734bd5
A
2196 DEBUG_OPTIMISE_r({
2197 SV *sv=sv_newmortal();
8e11feef
RGS
2198 PerlIO_printf( Perl_debug_log,
2199 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2200 (int)depth * 2 + 2, "",
de734bd5
A
2201 (UV)state, (UV)idx,
2202 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2203 PL_colors[0], PL_colors[1],
2204 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2205 PERL_PV_ESCAPE_FIRSTCHAR
2206 )
2207 );
2208 });
3dab1dad
YO
2209 if ( state==1 ) {
2210 OP( convert ) = nodetype;
2211 str=STRING(convert);
2212 STR_LEN(convert)=0;
2213 }
c490c714
YO
2214 STR_LEN(convert) += len;
2215 while (len--)
de734bd5 2216 *str++ = *ch++;
8e11feef 2217 } else {
f9049ba1 2218#ifdef DEBUGGING
8e11feef
RGS
2219 if (state>1)
2220 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2221#endif
8e11feef
RGS
2222 break;
2223 }
2224 }
2e64971a 2225 trie->prefixlen = (state-1);
3dab1dad 2226 if (str) {
8e11feef 2227 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2228 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2229 trie->startstate = state;
07be1b83
YO
2230 trie->minlen -= (state - 1);
2231 trie->maxlen -= (state - 1);
33809eae
JH
2232#ifdef DEBUGGING
2233 /* At least the UNICOS C compiler choked on this
2234 * being argument to DEBUG_r(), so let's just have
2235 * it right here. */
2236 if (
2237#ifdef PERL_EXT_RE_BUILD
2238 1
2239#else
2240 DEBUG_r_TEST
2241#endif
2242 ) {
2243 regnode *fix = convert;
2244 U32 word = trie->wordcount;
2245 mjd_nodelen++;
2246 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2247 while( ++fix < n ) {
2248 Set_Node_Offset_Length(fix, 0, 0);
2249 }
2250 while (word--) {
2251 SV ** const tmp = av_fetch( trie_words, word, 0 );
2252 if (tmp) {
2253 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2254 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2255 else
2256 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2257 }
2258 }
2259 }
2260#endif
8e11feef
RGS
2261 if (trie->maxlen) {
2262 convert = n;
2263 } else {
3dab1dad 2264 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2265 DEBUG_r(optimize= n);
3dab1dad
YO
2266 }
2267 }
2268 }
a5ca303d
YO
2269 if (!jumper)
2270 jumper = last;
3dab1dad 2271 if ( trie->maxlen ) {
8e11feef
RGS
2272 NEXT_OFF( convert ) = (U16)(tail - convert);
2273 ARG_SET( convert, data_slot );
786e8c11
YO
2274 /* Store the offset to the first unabsorbed branch in
2275 jump[0], which is otherwise unused by the jump logic.
2276 We use this when dumping a trie and during optimisation. */
2277 if (trie->jump)
7f69552c 2278 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2279
6c48061a
YO
2280 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2281 * and there is a bitmap
2282 * and the first "jump target" node we found leaves enough room
2283 * then convert the TRIE node into a TRIEC node, with the bitmap
2284 * embedded inline in the opcode - this is hypothetically faster.
2285 */
2286 if ( !trie->states[trie->startstate].wordnum
2287 && trie->bitmap
2288 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2289 {
2290 OP( convert ) = TRIEC;
2291 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2292 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2293 trie->bitmap= NULL;
2294 } else
2295 OP( convert ) = TRIE;
a3621e74 2296
3dab1dad
YO
2297 /* store the type in the flags */
2298 convert->flags = nodetype;
a5ca303d
YO
2299 DEBUG_r({
2300 optimize = convert
2301 + NODE_STEP_REGNODE
2302 + regarglen[ OP( convert ) ];
2303 });
2304 /* XXX We really should free up the resource in trie now,
2305 as we won't use them - (which resources?) dmq */
3dab1dad 2306 }
a3621e74 2307 /* needed for dumping*/
e62cc96a 2308 DEBUG_r(if (optimize) {
07be1b83 2309 regnode *opt = convert;
bcdf7404 2310
e62cc96a 2311 while ( ++opt < optimize) {
07be1b83
YO
2312 Set_Node_Offset_Length(opt,0,0);
2313 }
786e8c11
YO
2314 /*
2315 Try to clean up some of the debris left after the
2316 optimisation.
a3621e74 2317 */
786e8c11 2318 while( optimize < jumper ) {
07be1b83 2319 mjd_nodelen += Node_Length((optimize));
a3621e74 2320 OP( optimize ) = OPTIMIZED;
07be1b83 2321 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2322 optimize++;
2323 }
07be1b83 2324 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2325 });
2326 } /* end node insert */
2e64971a
DM
2327
2328 /* Finish populating the prev field of the wordinfo array. Walk back
2329 * from each accept state until we find another accept state, and if
2330 * so, point the first word's .prev field at the second word. If the
2331 * second already has a .prev field set, stop now. This will be the
2332 * case either if we've already processed that word's accept state,
3b753521
FN
2333 * or that state had multiple words, and the overspill words were
2334 * already linked up earlier.
2e64971a
DM
2335 */
2336 {
2337 U16 word;
2338 U32 state;
2339 U16 prev;
2340
2341 for (word=1; word <= trie->wordcount; word++) {
2342 prev = 0;
2343 if (trie->wordinfo[word].prev)
2344 continue;
2345 state = trie->wordinfo[word].accept;
2346 while (state) {
2347 state = prev_states[state];
2348 if (!state)
2349 break;
2350 prev = trie->states[state].wordnum;
2351 if (prev)
2352 break;
2353 }
2354 trie->wordinfo[word].prev = prev;
2355 }
2356 Safefree(prev_states);
2357 }
2358
2359
2360 /* and now dump out the compressed format */
2361 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2362
55eed653 2363 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2364#ifdef DEBUGGING
2365 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2366 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2367#else
2368 SvREFCNT_dec(revcharmap);
07be1b83 2369#endif
786e8c11
YO
2370 return trie->jump
2371 ? MADE_JUMP_TRIE
2372 : trie->startstate>1
2373 ? MADE_EXACT_TRIE
2374 : MADE_TRIE;
2375}
2376
2377STATIC void
2378S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2379{
3b753521 2380/* The Trie is constructed and compressed now so we can build a fail array if it's needed
786e8c11
YO
2381
2382 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2383 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2384 ISBN 0-201-10088-6
2385
2386 We find the fail state for each state in the trie, this state is the longest proper
3b753521
FN
2387 suffix of the current state's 'word' that is also a proper prefix of another word in our
2388 trie. State 1 represents the word '' and is thus the default fail state. This allows
786e8c11
YO
2389 the DFA not to have to restart after its tried and failed a word at a given point, it
2390 simply continues as though it had been matching the other word in the first place.
2391 Consider
2392 'abcdgu'=~/abcdefg|cdgu/
2393 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
3b753521
FN
2394 fail, which would bring us to the state representing 'd' in the second word where we would
2395 try 'g' and succeed, proceeding to match 'cdgu'.
786e8c11
YO
2396 */
2397 /* add a fail transition */
3251b653
NC
2398 const U32 trie_offset = ARG(source);
2399 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2400 U32 *q;
2401 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2402 const U32 numstates = trie->statecount;
786e8c11
YO
2403 const U32 ubound = trie->lasttrans + ucharcount;
2404 U32 q_read = 0;
2405 U32 q_write = 0;
2406 U32 charid;
2407 U32 base = trie->states[ 1 ].trans.base;
2408 U32 *fail;
2409 reg_ac_data *aho;
2410 const U32 data_slot = add_data( pRExC_state, 1, "T" );
2411 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2412
2413 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2414#ifndef DEBUGGING
2415 PERL_UNUSED_ARG(depth);
2416#endif
2417
2418
2419 ARG_SET( stclass, data_slot );
c944940b 2420 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2421 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2422 aho->trie=trie_offset;
446bd890
NC
2423 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2424 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2425 Newxz( q, numstates, U32);
c944940b 2426 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2427 aho->refcount = 1;
2428 fail = aho->fail;
2429 /* initialize fail[0..1] to be 1 so that we always have
2430 a valid final fail state */
2431 fail[ 0 ] = fail[ 1 ] = 1;
2432
2433 for ( charid = 0; charid < ucharcount ; charid++ ) {
2434 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2435 if ( newstate ) {
2436 q[ q_write ] = newstate;
2437 /* set to point at the root */
2438 fail[ q[ q_write++ ] ]=1;
2439 }
2440 }
2441 while ( q_read < q_write) {
2442 const U32 cur = q[ q_read++ % numstates ];
2443 base = trie->states[ cur ].trans.base;
2444
2445 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2446 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2447 if (ch_state) {
2448 U32 fail_state = cur;
2449 U32 fail_base;
2450 do {
2451 fail_state = fail[ fail_state ];
2452 fail_base = aho->states[ fail_state ].trans.base;
2453 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2454
2455 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2456 fail[ ch_state ] = fail_state;
2457 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2458 {
2459 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2460 }
2461 q[ q_write++ % numstates] = ch_state;
2462 }
2463 }
2464 }
2465 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2466 when we fail in state 1, this allows us to use the
2467 charclass scan to find a valid start char. This is based on the principle
2468 that theres a good chance the string being searched contains lots of stuff
2469 that cant be a start char.
2470 */
2471 fail[ 0 ] = fail[ 1 ] = 0;
2472 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2473 PerlIO_printf(Perl_debug_log,
2474 "%*sStclass Failtable (%"UVuf" states): 0",
2475 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2476 );
786e8c11
YO
2477 for( q_read=1; q_read<numstates; q_read++ ) {
2478 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2479 }
2480 PerlIO_printf(Perl_debug_log, "\n");
2481 });
2482 Safefree(q);
2483 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2484}
2485
786e8c11 2486
a3621e74 2487/*
5d1c421c
JH
2488 * There are strange code-generation bugs caused on sparc64 by gcc-2.95.2.
2489 * These need to be revisited when a newer toolchain becomes available.
2490 */
2491#if defined(__sparc64__) && defined(__GNUC__)
2492# if __GNUC__ < 2 || (__GNUC__ == 2 && __GNUC_MINOR__ < 96)
2493# undef SPARC64_GCC_WORKAROUND
2494# define SPARC64_GCC_WORKAROUND 1
2495# endif
2496#endif
2497
07be1b83 2498#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2499 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2500 SV * const mysv=sv_newmortal(); \
2501 regnode *Next = regnext(scan); \
2502 regprop(RExC_rx, mysv, scan); \
7f69552c 2503 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2504 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2505 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2506 }});
07be1b83 2507
1de06328 2508
bb914485
KW
2509/* The below joins as many adjacent EXACTish nodes as possible into a single
2510 * one, and looks for problematic sequences of characters whose folds vs.
2511 * non-folds have sufficiently different lengths, that the optimizer would be
2512 * fooled into rejecting legitimate matches of them, and the trie construction
2513 * code can't cope with them. The joining is only done if:
2514 * 1) there is room in the current conglomerated node to entirely contain the
2515 * next one.
2516 * 2) they are the exact same node type
2517 *
2518 * The adjacent nodes actually may be separated by NOTHING kind nodes, and
2519 * these get optimized out
2520 *
9d071ca8
KW
2521 * If there are problematic code sequences, *min_subtract is set to the delta
2522 * that the minimum size of the node can be less than its actual size. And,
2523 * the node type of the result is changed to reflect that it contains these
bb914485
KW
2524 * sequences.
2525 *
a0c4c608
KW
2526 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2527 * and contains LATIN SMALL LETTER SHARP S
f758bddf 2528 *
bb914485
KW
2529 * This is as good a place as any to discuss the design of handling these
2530 * problematic sequences. It's been wrong in Perl for a very long time. There
2531 * are three code points in Unicode whose folded lengths differ so much from
2532 * the un-folded lengths that it causes problems for the optimizer and trie
2533 * construction. Why only these are problematic, and not others where lengths
2534 * also differ is something I (khw) do not understand. New versions of Unicode
2535 * might add more such code points. Hopefully the logic in fold_grind.t that
287722f3 2536 * figures out what to test (in part by verifying that each size-combination
bb914485 2537 * gets tested) will catch any that do come along, so they can be added to the
287722f3
KW
2538 * special handling below. The chances of new ones are actually rather small,
2539 * as most, if not all, of the world's scripts that have casefolding have
2540 * already been encoded by Unicode. Also, a number of Unicode's decisions were
2541 * made to allow compatibility with pre-existing standards, and almost all of
2542 * those have already been dealt with. These would otherwise be the most
2543 * likely candidates for generating further tricky sequences. In other words,
2544 * Unicode by itself is unlikely to add new ones unless it is for compatibility
a0c4c608 2545 * with pre-existing standards, and there aren't many of those left.
bb914485
KW
2546 *
2547 * The previous designs for dealing with these involved assigning a special
2548 * node for them. This approach doesn't work, as evidenced by this example:
a0c4c608 2549 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
bb914485
KW
2550 * Both these fold to "sss", but if the pattern is parsed to create a node of
2551 * that would match just the \xDF, it won't be able to handle the case where a
2552 * successful match would have to cross the node's boundary. The new approach
2553 * that hopefully generally solves the problem generates an EXACTFU_SS node
2554 * that is "sss".
2555 *
2556 * There are a number of components to the approach (a lot of work for just
2557 * three code points!):
2558 * 1) This routine examines each EXACTFish node that could contain the
9d071ca8
KW
2559 * problematic sequences. It returns in *min_subtract how much to
2560 * subtract from the the actual length of the string to get a real minimum
2561 * for one that could match it. This number is usually 0 except for the
2562 * problematic sequences. This delta is used by the caller to adjust the
2563 * min length of the match, and the delta between min and max, so that the
2564 * optimizer doesn't reject these possibilities based on size constraints.
bb914485
KW
2565 * 2) These sequences are not currently correctly handled by the trie code
2566 * either, so it changes the joined node type to ops that are not handled
2567 * by trie's, those new ops being EXACTFU_SS and EXACTFU_NO_TRIE.
2568 * 3) This is sufficient for the two Greek sequences (described below), but
2569 * the one involving the Sharp s (\xDF) needs more. The node type
2570 * EXACTFU_SS is used for an EXACTFU node that contains at least one "ss"
2571 * sequence in it. For non-UTF-8 patterns and strings, this is the only
2572 * case where there is a possible fold length change. That means that a
2573 * regular EXACTFU node without UTF-8 involvement doesn't have to concern
2574 * itself with length changes, and so can be processed faster. regexec.c
2575 * takes advantage of this. Generally, an EXACTFish node that is in UTF-8
2576 * is pre-folded by regcomp.c. This saves effort in regex matching.
2577 * However, probably mostly for historical reasons, the pre-folding isn't
a0c4c608
KW
2578 * done for non-UTF8 patterns (and it can't be for EXACTF and EXACTFL
2579 * nodes, as what they fold to isn't known until runtime.) The fold
2580 * possibilities for the non-UTF8 patterns are quite simple, except for
2581 * the sharp s. All the ones that don't involve a UTF-8 target string
2582 * are members of a fold-pair, and arrays are set up for all of them
2583 * that quickly find the other member of the pair. It might actually
2584 * be faster to pre-fold these, but it isn't currently done, except for
2585 * the sharp s. Code elsewhere in this file makes sure that it gets
2586 * folded to 'ss', even if the pattern isn't UTF-8. This avoids the
2587 * issues described in the next item.
bb914485
KW
2588 * 4) A problem remains for the sharp s in EXACTF nodes. Whether it matches
2589 * 'ss' or not is not knowable at compile time. It will match iff the
2590 * target string is in UTF-8, unlike the EXACTFU nodes, where it always
2591 * matches; and the EXACTFL and EXACTFA nodes where it never does. Thus
2592 * it can't be folded to "ss" at compile time, unlike EXACTFU does as
2593 * described in item 3). An assumption that the optimizer part of
2594 * regexec.c (probably unwittingly) makes is that a character in the
2595 * pattern corresponds to at most a single character in the target string.
2596 * (And I do mean character, and not byte here, unlike other parts of the
2597 * documentation that have never been updated to account for multibyte
2598 * Unicode.) This assumption is wrong only in this case, as all other
2599 * cases are either 1-1 folds when no UTF-8 is involved; or is true by
2600 * virtue of having this file pre-fold UTF-8 patterns. I'm
2601 * reluctant to try to change this assumption, so instead the code punts.
9d071ca8
KW
2602 * This routine examines EXACTF nodes for the sharp s, and returns a
2603 * boolean indicating whether or not the node is an EXACTF node that
2604 * contains a sharp s. When it is true, the caller sets a flag that later
2605 * causes the optimizer in this file to not set values for the floating
2606 * and fixed string lengths, and thus avoids the optimizer code in
2607 * regexec.c that makes the invalid assumption. Thus, there is no
2608 * optimization based on string lengths for EXACTF nodes that contain the
2609 * sharp s. This only happens for /id rules (which means the pattern
2610 * isn't in UTF-8).
bb914485 2611 */
1de06328 2612
9d071ca8 2613#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
07be1b83 2614 if (PL_regkind[OP(scan)] == EXACT) \
9d071ca8 2615 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
07be1b83 2616
be8e71aa 2617STATIC U32
9d071ca8 2618S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
07be1b83
YO
2619 /* Merge several consecutive EXACTish nodes into one. */
2620 regnode *n = regnext(scan);
2621 U32 stringok = 1;
2622 regnode *next = scan + NODE_SZ_STR(scan);
2623 U32 merged = 0;
2624 U32 stopnow = 0;
2625#ifdef DEBUGGING
2626 regnode *stop = scan;
72f13be8 2627 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2628#else
d47053eb
RGS
2629 PERL_UNUSED_ARG(depth);
2630#endif
7918f24d
NC
2631
2632 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2633#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2634 PERL_UNUSED_ARG(flags);
2635 PERL_UNUSED_ARG(val);
07be1b83 2636#endif
07be1b83 2637 DEBUG_PEEP("join",scan,depth);
bb914485 2638
3f410cf6
KW
2639 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2640 * EXACT ones that are mergeable to the current one. */
2641 while (n
2642 && (PL_regkind[OP(n)] == NOTHING
2643 || (stringok && OP(n) == OP(scan)))
07be1b83 2644 && NEXT_OFF(n)
3f410cf6
KW
2645 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2646 {
07be1b83
YO
2647
2648 if (OP(n) == TAIL || n > next)
2649 stringok = 0;
2650 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2651 DEBUG_PEEP("skip:",n,depth);
2652 NEXT_OFF(scan) += NEXT_OFF(n);
2653 next = n + NODE_STEP_REGNODE;
2654#ifdef DEBUGGING
2655 if (stringok)
2656 stop = n;
2657#endif
2658 n = regnext(n);
2659 }
2660 else if (stringok) {
786e8c11 2661 const unsigned int oldl = STR_LEN(scan);
07be1b83 2662 regnode * const nnext = regnext(n);
b2230d39
KW
2663
2664 if (oldl + STR_LEN(n) > U8_MAX)
2665 break;
07be1b83
YO
2666
2667 DEBUG_PEEP("merg",n,depth);
07be1b83 2668 merged++;
b2230d39 2669
07be1b83
YO
2670 NEXT_OFF(scan) += NEXT_OFF(n);
2671 STR_LEN(scan) += STR_LEN(n);
2672 next = n + NODE_SZ_STR(n);
2673 /* Now we can overwrite *n : */
2674 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2675#ifdef DEBUGGING
2676 stop = next - 1;
2677#endif
2678 n = nnext;
2679 if (stopnow) break;
2680 }
2681
d47053eb
RGS
2682#ifdef EXPERIMENTAL_INPLACESCAN
2683 if (flags && !NEXT_OFF(n)) {
2684 DEBUG_PEEP("atch", val, depth);
2685 if (reg_off_by_arg[OP(n)]) {
2686 ARG_SET(n, val - n);
2687 }
2688 else {
2689 NEXT_OFF(n) = val - n;
2690 }
2691 stopnow = 1;
2692 }
07be1b83
YO
2693#endif
2694 }
2c2b7f86 2695
9d071ca8 2696 *min_subtract = 0;
f758bddf 2697 *has_exactf_sharp_s = FALSE;
f646642f 2698
3f410cf6
KW
2699 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2700 * can now analyze for sequences of problematic code points. (Prior to
2701 * this final joining, sequences could have been split over boundaries, and
a0c4c608
KW
2702 * hence missed). The sequences only happen in folding, hence for any
2703 * non-EXACT EXACTish node */
86d6fcad 2704 if (OP(scan) != EXACT) {
f758bddf
KW
2705 U8 *s;
2706 U8 * s0 = (U8*) STRING(scan);
2707 U8 * const s_end = s0 + STR_LEN(scan);
2708
2709 /* The below is perhaps overboard, but this allows us to save a test
2710 * each time through the loop at the expense of a mask. This is
2711 * because on both EBCDIC and ASCII machines, 'S' and 's' differ by a
2712 * single bit. On ASCII they are 32 apart; on EBCDIC, they are 64.
2713 * This uses an exclusive 'or' to find that bit and then inverts it to
2714 * form a mask, with just a single 0, in the bit position where 'S' and
2715 * 's' differ. */
2716 const U8 S_or_s_mask = ~ ('S' ^ 's');
2717 const U8 s_masked = 's' & S_or_s_mask;
2718
2719 /* One pass is made over the node's string looking for all the
2720 * possibilities. to avoid some tests in the loop, there are two main
2721 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2722 * non-UTF-8 */
2723 if (UTF) {
86d6fcad 2724
f758bddf
KW
2725 /* There are two problematic Greek code points in Unicode
2726 * casefolding
86d6fcad
KW
2727 *
2728 * U+0390 - GREEK SMALL LETTER IOTA WITH DIALYTIKA AND TONOS
2729 * U+03B0 - GREEK SMALL LETTER UPSILON WITH DIALYTIKA AND TONOS
2730 *
2731 * which casefold to
2732 *
2733 * Unicode UTF-8
2734 *
2735 * U+03B9 U+0308 U+0301 0xCE 0xB9 0xCC 0x88 0xCC 0x81
2736 * U+03C5 U+0308 U+0301 0xCF 0x85 0xCC 0x88 0xCC 0x81
2737 *
2738 * This means that in case-insensitive matching (or "loose
2739 * matching", as Unicode calls it), an EXACTF of length six (the
2740 * UTF-8 encoded byte length of the above casefolded versions) can
2741 * match a target string of length two (the byte length of UTF-8
2742 * encoded U+0390 or U+03B0). This would rather mess up the
2743 * minimum length computation. (there are other code points that
2744 * also fold to these two sequences, but the delta is smaller)
2745 *
f758bddf
KW
2746 * If these sequences are found, the minimum length is decreased by
2747 * four (six minus two).
86d6fcad 2748 *
f758bddf
KW
2749 * Similarly, 'ss' may match the single char and byte LATIN SMALL
2750 * LETTER SHARP S. We decrease the min length by 1 for each
2751 * occurrence of 'ss' found */
3f410cf6 2752
e294cc5d 2753#ifdef EBCDIC /* RD tunifold greek 0390 and 03B0 */
f758bddf
KW
2754# define U390_first_byte 0xb4
2755 const U8 U390_tail[] = "\x68\xaf\x49\xaf\x42";
2756# define U3B0_first_byte 0xb5
2757 const U8 U3B0_tail[] = "\x46\xaf\x49\xaf\x42";
e294cc5d 2758#else
f758bddf
KW
2759# define U390_first_byte 0xce
2760 const U8 U390_tail[] = "\xb9\xcc\x88\xcc\x81";
2761# define U3B0_first_byte 0xcf
2762 const U8 U3B0_tail[] = "\x85\xcc\x88\xcc\x81";
e294cc5d 2763#endif
f758bddf
KW
2764 const U8 len = sizeof(U390_tail); /* (-1 for NUL; +1 for 1st byte;
2765 yields a net of 0 */
2766 /* Examine the string for one of the problematic sequences */
2767 for (s = s0;
2768 s < s_end - 1; /* Can stop 1 before the end, as minimum length
2769 * sequence we are looking for is 2 */
2770 s += UTF8SKIP(s))
86d6fcad 2771 {
bb914485 2772
f758bddf
KW
2773 /* Look for the first byte in each problematic sequence */
2774 switch (*s) {
2775 /* We don't have to worry about other things that fold to
2776 * 's' (such as the long s, U+017F), as all above-latin1
2777 * code points have been pre-folded */
2778 case 's':
2779 case 'S':
2780
a0c4c608
KW
2781 /* Current character is an 's' or 'S'. If next one is
2782 * as well, we have the dreaded sequence */
f758bddf
KW
2783 if (((*(s+1) & S_or_s_mask) == s_masked)
2784 /* These two node types don't have special handling
2785 * for 'ss' */
2786 && OP(scan) != EXACTFL && OP(scan) != EXACTFA)
2787 {
9d071ca8 2788 *min_subtract += 1;
f758bddf
KW
2789 OP(scan) = EXACTFU_SS;
2790 s++; /* No need to look at this character again */
2791 }
2792 break;
2793
2794 case U390_first_byte:
2795 if (s_end - s >= len
2796
2797 /* The 1's are because are skipping comparing the
2798 * first byte */
2799 && memEQ(s + 1, U390_tail, len - 1))
2800 {
2801 goto greek_sequence;
2802 }
2803 break;
2804
2805 case U3B0_first_byte:
2806 if (! (s_end - s >= len
2807 && memEQ(s + 1, U3B0_tail, len - 1)))
2808 {
2809 break;
2810 }
2811 greek_sequence:
9d071ca8 2812 *min_subtract += 4;
f758bddf
KW
2813
2814 /* This can't currently be handled by trie's, so change
2815 * the node type to indicate this. If EXACTFA and
2816 * EXACTFL were ever to be handled by trie's, this
2817 * would have to be changed. If this node has already
2818 * been changed to EXACTFU_SS in this loop, leave it as
2819 * is. (I (khw) think it doesn't matter in regexec.c
2820 * for UTF patterns, but no need to change it */
2821 if (OP(scan) == EXACTFU) {
2822 OP(scan) = EXACTFU_NO_TRIE;
2823 }
2824 s += 6; /* We already know what this sequence is. Skip
2825 the rest of it */
2826 break;
bb914485
KW
2827 }
2828 }
2829 }
f758bddf 2830 else if (OP(scan) != EXACTFL && OP(scan) != EXACTFA) {
bb914485 2831
f758bddf
KW
2832 /* Here, the pattern is not UTF-8. We need to look only for the
2833 * 'ss' sequence, and in the EXACTF case, the sharp s, which can be
2834 * in the final position. Otherwise we can stop looking 1 byte
2835 * earlier because have to find both the first and second 's' */
2836 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2837
2838 for (s = s0; s < upper; s++) {
2839 switch (*s) {
2840 case 'S':
2841 case 's':
2842 if (s_end - s > 1
2843 && ((*(s+1) & S_or_s_mask) == s_masked))
2844 {
9d071ca8 2845 *min_subtract += 1;
f758bddf
KW
2846
2847 /* EXACTF nodes need to know that the minimum
2848 * length changed so that a sharp s in the string
2849 * can match this ss in the pattern, but they
2850 * remain EXACTF nodes, as they are not trie'able,
2851 * so don't have to invent a new node type to
2852 * exclude them from the trie code */
2853 if (OP(scan) != EXACTF) {
2854 OP(scan) = EXACTFU_SS;
2855 }
2856 s++;
2857 }
2858 break;
2859 case LATIN_SMALL_LETTER_SHARP_S:
2860 if (OP(scan) == EXACTF) {
2861 *has_exactf_sharp_s = TRUE;
2862 }
2863 break;
86d6fcad
KW
2864 }
2865 }
2866 }
07be1b83 2867 }
3f410cf6 2868
07be1b83 2869#ifdef DEBUGGING
bb789b09
DM
2870 /* Allow dumping but overwriting the collection of skipped
2871 * ops and/or strings with fake optimized ops */
07be1b83
YO
2872 n = scan + NODE_SZ_STR(scan);
2873 while (n <= stop) {
bb789b09
DM
2874 OP(n) = OPTIMIZED;
2875 FLAGS(n) = 0;
2876 NEXT_OFF(n) = 0;
07be1b83
YO
2877 n++;
2878 }
2879#endif
2880 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
2881 return stopnow;
2882}
2883
486ec47a 2884/* REx optimizer. Converts nodes into quicker variants "in place".
653099ff
GS
2885 Finds fixed substrings. */
2886
a0288114 2887/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
2888 to the position after last scanned or to NULL. */
2889
40d049e4
YO
2890#define INIT_AND_WITHP \
2891 assert(!and_withp); \
2892 Newx(and_withp,1,struct regnode_charclass_class); \
2893 SAVEFREEPV(and_withp)
07be1b83 2894
b515a41d 2895/* this is a chain of data about sub patterns we are processing that
486ec47a 2896 need to be handled separately/specially in study_chunk. Its so
b515a41d
YO
2897 we can simulate recursion without losing state. */
2898struct scan_frame;
2899typedef struct scan_frame {
2900 regnode *last; /* last node to process in this frame */
2901 regnode *next; /* next node to process when last is reached */
2902 struct scan_frame *prev; /*previous frame*/
2903 I32 stop; /* what stopparen do we use */
2904} scan_frame;
2905
304ee84b
YO
2906
2907#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
2908
e1d1eefb
YO
2909#define CASE_SYNST_FNC(nAmE) \
2910case nAmE: \
2911 if (flags & SCF_DO_STCLASS_AND) { \
2912 for (value = 0; value < 256; value++) \
2913 if (!is_ ## nAmE ## _cp(value)) \
2914 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2915 } \
2916 else { \
2917 for (value = 0; value < 256; value++) \
2918 if (is_ ## nAmE ## _cp(value)) \
2919 ANYOF_BITMAP_SET(data->start_class, value); \
2920 } \
2921 break; \
2922case N ## nAmE: \
2923 if (flags & SCF_DO_STCLASS_AND) { \
2924 for (value = 0; value < 256; value++) \
2925 if (is_ ## nAmE ## _cp(value)) \
2926 ANYOF_BITMAP_CLEAR(data->start_class, value); \
2927 } \
2928 else { \
2929 for (value = 0; value < 256; value++) \
2930 if (!is_ ## nAmE ## _cp(value)) \
2931 ANYOF_BITMAP_SET(data->start_class, value); \
2932 } \
2933 break
2934
2935
2936
76e3520e 2937STATIC I32
40d049e4 2938S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
1de06328 2939 I32 *minlenp, I32 *deltap,
40d049e4
YO
2940 regnode *last,
2941 scan_data_t *data,
2942 I32 stopparen,
2943 U8* recursed,
2944 struct regnode_charclass_class *and_withp,
2945 U32 flags, U32 depth)
c277df42
IZ
2946 /* scanp: Start here (read-write). */
2947 /* deltap: Write maxlen-minlen here. */
2948 /* last: Stop before this one. */
40d049e4
YO
2949 /* data: string data about the pattern */
2950 /* stopparen: treat close N as END */
2951 /* recursed: which subroutines have we recursed into */
2952 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 2953{
97aff369 2954 dVAR;
c277df42
IZ
2955 I32 min = 0, pars = 0, code;
2956 regnode *scan = *scanp, *next;
2957 I32 delta = 0;
2958 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 2959 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
2960 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
2961 scan_data_t data_fake;
a3621e74 2962 SV *re_trie_maxbuff = NULL;
786e8c11 2963 regnode *first_non_open = scan;
e2e6a0f1 2964 I32 stopmin = I32_MAX;
8aa23a47 2965 scan_frame *frame = NULL;
a3621e74 2966 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 2967
7918f24d
NC
2968 PERL_ARGS_ASSERT_STUDY_CHUNK;
2969
13a24bad 2970#ifdef DEBUGGING
40d049e4 2971 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 2972#endif
40d049e4 2973
786e8c11 2974 if ( depth == 0 ) {
40d049e4 2975 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
2976 first_non_open=regnext(first_non_open);
2977 }
2978
b81d288d 2979
8aa23a47
YO
2980 fake_study_recurse:
2981 while ( scan && OP(scan) != END && scan < last ){
9d071ca8
KW
2982 UV min_subtract = 0; /* How much to subtract from the minimum node
2983 length to get a real minimum (because the
2984 folded version may be shorter) */
f758bddf 2985 bool has_exactf_sharp_s = FALSE;
8aa23a47 2986 /* Peephole optimizer: */
304ee84b 2987 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47 2988 DEBUG_PEEP("Peep",scan,depth);
a0c4c608
KW
2989
2990 /* Its not clear to khw or hv why this is done here, and not in the
2991 * clauses that deal with EXACT nodes. khw's guess is that it's
2992 * because of a previous design */
9d071ca8 2993 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
8aa23a47
YO
2994
2995 /* Follow the next-chain of the current node and optimize
2996 away all the NOTHINGs from it. */
2997 if (OP(scan) != CURLYX) {
2998 const int max = (reg_off_by_arg[OP(scan)]
2999 ? I32_MAX
3000 /* I32 may be smaller than U16 on CRAYs! */
3001 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3002 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3003 int noff;
3004 regnode *n = scan;
686b73d4 3005
8aa23a47
YO
3006 /* Skip NOTHING and LONGJMP. */
3007 while ((n = regnext(n))
3008 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3009 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3010 && off + noff < max)
3011 off += noff;
3012 if (reg_off_by_arg[OP(scan)])
3013 ARG(scan) = off;
3014 else
3015 NEXT_OFF(scan) = off;
3016 }
a3621e74 3017
c277df42 3018
8aa23a47
YO
3019
3020 /* The principal pseudo-switch. Cannot be a switch, since we
3021 look into several different things. */
3022 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3023 || OP(scan) == IFTHEN) {
3024 next = regnext(scan);
3025 code = OP(scan);
3026 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
686b73d4 3027
8aa23a47
YO
3028 if (OP(next) == code || code == IFTHEN) {
3029 /* NOTE - There is similar code to this block below for handling
3030 TRIE nodes on a re-study. If you change stuff here check there
3031 too. */
3032 I32 max1 = 0, min1 = I32_MAX, num = 0;
3033 struct regnode_charclass_class accum;
3034 regnode * const startbranch=scan;
686b73d4 3035
8aa23a47 3036 if (flags & SCF_DO_SUBSTR)
304ee84b 3037 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47 3038 if (flags & SCF_DO_STCLASS)
e755fd73 3039 cl_init_zero(pRExC_state, &accum);
8aa23a47
YO
3040
3041 while (OP(scan) == code) {
3042 I32 deltanext, minnext, f = 0, fake;
3043 struct regnode_charclass_class this_class;
3044
3045 num++;
3046 data_fake.flags = 0;
3047 if (data) {
3048 data_fake.whilem_c = data->whilem_c;
3049 data_fake.last_closep = data->last_closep;
3050 }
3051 else
3052 data_fake.last_closep = &fake;
58e23c8d
YO
3053
3054 data_fake.pos_delta = delta;
8aa23a47
YO
3055 next = regnext(scan);
3056 scan = NEXTOPER(scan);
3057 if (code != BRANCH)
c277df42 3058 scan = NEXTOPER(scan);
8aa23a47 3059 if (flags & SCF_DO_STCLASS) {
e755fd73 3060 cl_init(pRExC_state, &this_class);
8aa23a47
YO
3061 data_fake.start_class = &this_class;
3062 f = SCF_DO_STCLASS_AND;
58e23c8d 3063 }
8aa23a47
YO
3064 if (flags & SCF_WHILEM_VISITED_POS)
3065 f |= SCF_WHILEM_VISITED_POS;
3066
3067 /* we suppose the run is continuous, last=next...*/
3068 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3069 next, &data_fake,
3070 stopparen, recursed, NULL, f,depth+1);
3071 if (min1 > minnext)
3072 min1 = minnext;
3073 if (max1 < minnext + deltanext)
3074 max1 = minnext + deltanext;
3075 if (deltanext == I32_MAX)
3076 is_inf = is_inf_internal = 1;
3077 scan = next;
3078 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3079 pars++;
3080 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3081 if ( stopmin > minnext)
3082 stopmin = min + min1;
3083 flags &= ~SCF_DO_SUBSTR;
3084 if (data)
3085 data->flags |= SCF_SEEN_ACCEPT;
3086 }
3087 if (data) {
3088 if (data_fake.flags & SF_HAS_EVAL)
3089 data->flags |= SF_HAS_EVAL;
3090 data->whilem_c = data_fake.whilem_c;
3dab1dad 3091 }
8aa23a47 3092 if (flags & SCF_DO_STCLASS)
3fffb88a 3093 cl_or(pRExC_state, &accum, &this_class);
8aa23a47
YO
3094 }
3095 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3096 min1 = 0;
3097 if (flags & SCF_DO_SUBSTR) {
3098 data->pos_min += min1;
3099 data->pos_delta += max1 - min1;
3100 if (max1 != min1 || is_inf)
3101 data->longest = &(data->longest_float);
3102 }
3103 min += min1;
3104 delta += max1 - min1;
3105 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 3106 cl_or(pRExC_state, data->start_class, &accum);
8aa23a47
YO
3107 if (min1) {
3108 cl_and(data->start_class, and_withp);
3109 flags &= ~SCF_DO_STCLASS;
653099ff 3110 }
8aa23a47
YO
3111 }
3112 else if (flags & SCF_DO_STCLASS_AND) {
3113 if (min1) {
3114 cl_and(data->start_class, &accum);
3115 flags &= ~SCF_DO_STCLASS;
de0c8cb8 3116 }
8aa23a47
YO
3117 else {
3118 /* Switch to OR mode: cache the old value of
3119 * data->start_class */
3120 INIT_AND_WITHP;
3121 StructCopy(data->start_class, and_withp,
3122 struct regnode_charclass_class);
3123 flags &= ~SCF_DO_STCLASS_AND;
3124 StructCopy(&accum, data->start_class,
3125 struct regnode_charclass_class);
3126 flags |= SCF_DO_STCLASS_OR;
3127 data->start_class->flags |= ANYOF_EOS;
de0c8cb8 3128 }
8aa23a47 3129 }
a3621e74 3130
8aa23a47
YO
3131 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3132 /* demq.
a3621e74 3133
8aa23a47
YO
3134 Assuming this was/is a branch we are dealing with: 'scan' now
3135 points at the item that follows the branch sequence, whatever
3136 it is. We now start at the beginning of the sequence and look
3137 for subsequences of
a3621e74 3138
8aa23a47
YO
3139 BRANCH->EXACT=>x1
3140 BRANCH->EXACT=>x2
3141 tail
a3621e74 3142
8aa23a47 3143 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 3144
486ec47a 3145 If we can find such a subsequence we need to turn the first
8aa23a47
YO
3146 element into a trie and then add the subsequent branch exact
3147 strings to the trie.
a3621e74 3148
8aa23a47 3149 We have two cases
a3621e74 3150
3b753521 3151 1. patterns where the whole set of branches can be converted.
a3621e74 3152
8aa23a47 3153 2. patterns where only a subset can be converted.
a3621e74 3154
8aa23a47
YO
3155 In case 1 we can replace the whole set with a single regop
3156 for the trie. In case 2 we need to keep the start and end
3b753521 3157 branches so
a3621e74 3158
8aa23a47
YO
3159 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3160 becomes BRANCH TRIE; BRANCH X;
786e8c11 3161
8aa23a47
YO
3162 There is an additional case, that being where there is a
3163 common prefix, which gets split out into an EXACT like node
3164 preceding the TRIE node.
a3621e74 3165
8aa23a47
YO
3166 If x(1..n)==tail then we can do a simple trie, if not we make
3167 a "jump" trie, such that when we match the appropriate word
486ec47a 3168 we "jump" to the appropriate tail node. Essentially we turn
8aa23a47 3169 a nested if into a case structure of sorts.
b515a41d 3170
8aa23a47 3171 */
686b73d4 3172
8aa23a47
YO
3173 int made=0;
3174 if (!re_trie_maxbuff) {
3175 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3176 if (!SvIOK(re_trie_maxbuff))
3177 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3178 }
3179 if ( SvIV(re_trie_maxbuff)>=0 ) {
3180 regnode *cur;
3181 regnode *first = (regnode *)NULL;
3182 regnode *last = (regnode *)NULL;
3183 regnode *tail = scan;
3184 U8 optype = 0;
3185 U32 count=0;
a3621e74
YO
3186
3187#ifdef DEBUGGING
8aa23a47 3188 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 3189#endif
8aa23a47
YO
3190 /* var tail is used because there may be a TAIL
3191 regop in the way. Ie, the exacts will point to the
3192 thing following the TAIL, but the last branch will
3193 point at the TAIL. So we advance tail. If we
3194 have nested (?:) we may have to move through several
3195 tails.
3196 */
3197
3198 while ( OP( tail ) == TAIL ) {
3199 /* this is the TAIL generated by (?:) */
3200 tail = regnext( tail );
3201 }
a3621e74 3202
8aa23a47
YO
3203
3204 DEBUG_OPTIMISE_r({
3205 regprop(RExC_rx, mysv, tail );
3206 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3207 (int)depth * 2 + 2, "",
3208 "Looking for TRIE'able sequences. Tail node is: ",
3209 SvPV_nolen_const( mysv )
3210 );
3211 });
3212
3213 /*
3214
3215 step through the branches, cur represents each
3216 branch, noper is the first thing to be matched
3217 as part of that branch and noper_next is the
3218 regnext() of that node. if noper is an EXACT
3219 and noper_next is the same as scan (our current
3220 position in the regex) then the EXACT branch is
3221 a possible optimization target. Once we have
486ec47a 3222 two or more consecutive such branches we can
8aa23a47
YO
3223 create a trie of the EXACT's contents and stich
3224 it in place. If the sequence represents all of
3225 the branches we eliminate the whole thing and
3226 replace it with a single TRIE. If it is a
3227 subsequence then we need to stitch it in. This
3228 means the first branch has to remain, and needs
3229 to be repointed at the item on the branch chain
3230 following the last branch optimized. This could
3231 be either a BRANCH, in which case the
3232 subsequence is internal, or it could be the
3233 item following the branch sequence in which
3234 case the subsequence is at the end.
3235
3236 */
3237
3238 /* dont use tail as the end marker for this traverse */
3239 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3240 regnode * const noper = NEXTOPER( cur );
b515a41d 3241#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 3242 regnode * const noper_next = regnext( noper );
b515a41d
YO
3243#endif
3244
8aa23a47
YO
3245 DEBUG_OPTIMISE_r({
3246 regprop(RExC_rx, mysv, cur);
3247 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3248 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3249
3250 regprop(RExC_rx, mysv, noper);
3251 PerlIO_printf( Perl_debug_log, " -> %s",
3252 SvPV_nolen_const(mysv));
3253
3254 if ( noper_next ) {
3255 regprop(RExC_rx, mysv, noper_next );
3256 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3257 SvPV_nolen_const(mysv));
3258 }
3259 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d)\n",
3260 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur) );
3261 });
3262 if ( (((first && optype!=NOTHING) ? OP( noper ) == optype
3263 : PL_regkind[ OP( noper ) ] == EXACT )
3264 || OP(noper) == NOTHING )
786e8c11 3265#ifdef NOJUMPTRIE
8aa23a47 3266 && noper_next == tail
786e8c11 3267#endif
8aa23a47
YO
3268 && count < U16_MAX)
3269 {
3270 count++;
3271 if ( !first || optype == NOTHING ) {
3272 if (!first) first = cur;
3273 optype = OP( noper );
3274 } else {
3275 last = cur;
3276 }
3277 } else {
a0a388a1 3278/*
fbebf34e
KW
3279 Currently the trie logic handles case insensitive matching properly only
3280 when the pattern is UTF-8 and the node is EXACTFU (thus forcing unicode
3281 semantics).
0abd0d78
YO
3282
3283 If/when this is fixed the following define can be swapped
3284 in below to fully enable trie logic.
3285
a0a388a1 3286#define TRIE_TYPE_IS_SAFE 1
0abd0d78 3287
a0c4c608
KW
3288Note that join_exact() assumes that the other types of EXACTFish nodes are not
3289used in tries, so that would have to be updated if this changed
3290
0abd0d78 3291*/
fbebf34e 3292#define TRIE_TYPE_IS_SAFE ((UTF && optype == EXACTFU) || optype==EXACT)
0abd0d78 3293
a0a388a1 3294 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47
YO
3295 make_trie( pRExC_state,
3296 startbranch, first, cur, tail, count,
3297 optype, depth+1 );
3298 }
3299 if ( PL_regkind[ OP( noper ) ] == EXACT
786e8c11 3300#ifdef NOJUMPTRIE
8aa23a47 3301 && noper_next == tail
786e8c11 3302#endif
8aa23a47
YO
3303 ){
3304 count = 1;
3305 first = cur;
3306 optype = OP( noper );
3307 } else {
3308 count = 0;
3309 first = NULL;
3310 optype = 0;
3311 }
3312 last = NULL;
3313 }
3314 }
3315 DEBUG_OPTIMISE_r({
3316 regprop(RExC_rx, mysv, cur);
3317 PerlIO_printf( Perl_debug_log,
3318 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3319 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3320
3321 });
a0a388a1
YO
3322
3323 if ( last && TRIE_TYPE_IS_SAFE ) {
8aa23a47 3324 made= make_trie( pRExC_state, startbranch, first, scan, tail, count, optype, depth+1 );
686b73d4 3325#ifdef TRIE_STUDY_OPT
8aa23a47
YO
3326 if ( ((made == MADE_EXACT_TRIE &&
3327 startbranch == first)
3328 || ( first_non_open == first )) &&
3329 depth==0 ) {
3330 flags |= SCF_TRIE_RESTUDY;
3331 if ( startbranch == first
3332 && scan == tail )
3333 {
3334 RExC_seen &=~REG_TOP_LEVEL_BRANCHES;
3335 }
3336 }
3dab1dad 3337#endif
8aa23a47
YO
3338 }
3339 }
3340
3341 } /* do trie */
3342
653099ff 3343 }
8aa23a47
YO
3344 else if ( code == BRANCHJ ) { /* single branch is optimized. */
3345 scan = NEXTOPER(NEXTOPER(scan));
3346 } else /* single branch is optimized. */
3347 scan = NEXTOPER(scan);
3348 continue;
3349 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB || OP(scan) == GOSTART) {
3350 scan_frame *newframe = NULL;
3351 I32 paren;
3352 regnode *start;
3353 regnode *end;
3354
3355 if (OP(scan) != SUSPEND) {
3356 /* set the pointer */
3357 if (OP(scan) == GOSUB) {
3358 paren = ARG(scan);
3359 RExC_recurse[ARG2L(scan)] = scan;
3360 start = RExC_open_parens[paren-1];
3361 end = RExC_close_parens[paren-1];
3362 } else {
3363 paren = 0;
f8fc2ecf 3364 start = RExC_rxi->program + 1;
8aa23a47
YO
3365 end = RExC_opend;
3366 }
3367 if (!recursed) {
3368 Newxz(recursed, (((RExC_npar)>>3) +1), U8);
3369 SAVEFREEPV(recursed);
3370 }
3371 if (!PAREN_TEST(recursed,paren+1)) {
3372 PAREN_SET(recursed,paren+1);
3373 Newx(newframe,1,scan_frame);
3374 } else {
3375 if (flags & SCF_DO_SUBSTR) {
304ee84b 3376 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3377 data->longest = &(data->longest_float);
3378 }
3379 is_inf = is_inf_internal = 1;
3380 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3fffb88a 3381 cl_anything(pRExC_state, data->start_class);
8aa23a47
YO
3382 flags &= ~SCF_DO_STCLASS;
3383 }
3384 } else {
3385 Newx(newframe,1,scan_frame);
3386 paren = stopparen;
3387 start = scan+2;
3388 end = regnext(scan);
3389 }
3390 if (newframe) {
3391 assert(start);
3392 assert(end);
3393 SAVEFREEPV(newframe);
3394 newframe->next = regnext(scan);
3395 newframe->last = last;
3396 newframe->stop = stopparen;
3397 newframe->prev = frame;
3398
3399 frame = newframe;
3400 scan = start;
3401 stopparen = paren;
3402 last = end;
3403
3404 continue;
3405 }
3406 }
3407 else if (OP(scan) == EXACT) {
3408 I32 l = STR_LEN(scan);
3409 UV uc;
3410 if (UTF) {
3411 const U8 * const s = (U8*)STRING(scan);
3412 l = utf8_length(s, s + l);
3413 uc = utf8_to_uvchr(s, NULL);
3414 } else {
3415 uc = *((U8*)STRING(scan));
3416 }
3417 min += l;
3418 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
3419 /* The code below prefers earlier match for fixed
3420 offset, later match for variable offset. */
3421 if (data->last_end == -1) { /* Update the start info. */
3422 data->last_start_min = data->pos_min;
3423 data->last_start_max = is_inf
3424 ? I32_MAX : data->pos_min + data->pos_delta;
b515a41d 3425 }
8aa23a47
YO
3426 sv_catpvn(data->last_found, STRING(scan), STR_LEN(scan));
3427 if (UTF)
3428 SvUTF8_on(data->last_found);
3429 {
3430 SV * const sv = data->last_found;
3431 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3432 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3433 if (mg && mg->mg_len >= 0)
3434 mg->mg_len += utf8_length((U8*)STRING(scan),
3435 (U8*)STRING(scan)+STR_LEN(scan));
b515a41d 3436 }
8aa23a47
YO
3437 data->last_end = data->pos_min + l;
3438 data->pos_min += l; /* As in the first entry. */
3439 data->flags &= ~SF_BEFORE_EOL;
3440 }
3441 if (flags & SCF_DO_STCLASS_AND) {
3442 /* Check whether it is compatible with what we know already! */
3443 int compat = 1;
3444
54251c2e 3445
486ec47a 3446 /* If compatible, we or it in below. It is compatible if is
54251c2e
KW
3447 * in the bitmp and either 1) its bit or its fold is set, or 2)
3448 * it's for a locale. Even if there isn't unicode semantics
3449 * here, at runtime there may be because of matching against a
3450 * utf8 string, so accept a possible false positive for
3451 * latin1-range folds */
8aa23a47
YO
3452 if (uc >= 0x100 ||
3453 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3454 && !ANYOF_BITMAP_TEST(data->start_class, uc)
39065660 3455 && (!(data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD)
54251c2e 3456 || !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
8aa23a47 3457 )
d18bf9dc 3458 {
8aa23a47 3459 compat = 0;
d18bf9dc 3460 }
8aa23a47
YO
3461 ANYOF_CLASS_ZERO(data->start_class);
3462 ANYOF_BITMAP_ZERO(data->start_class);
3463 if (compat)
3464 ANYOF_BITMAP_SET(data->start_class, uc);
d18bf9dc
KW
3465 else if (uc >= 0x100) {
3466 int i;
3467
3468 /* Some Unicode code points fold to the Latin1 range; as
3469 * XXX temporary code, instead of figuring out if this is
3470 * one, just assume it is and set all the start class bits
3471 * that could be some such above 255 code point's fold
3472 * which will generate fals positives. As the code
3473 * elsewhere that does compute the fold settles down, it
3474 * can be extracted out and re-used here */
3475 for (i = 0; i < 256; i++){
3476 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3477 ANYOF_BITMAP_SET(data->start_class, i);
3478 }
3479 }
3480 }
8aa23a47
YO
3481 data->start_class->flags &= ~ANYOF_EOS;
3482 if (uc < 0x100)
3483 data->start_class->flags &= ~ANYOF_UNICODE_ALL;
3484 }
3485 else if (flags & SCF_DO_STCLASS_OR) {
3486 /* false positive possible if the class is case-folded */
3487 if (uc < 0x100)
3488 ANYOF_BITMAP_SET(data->start_class, uc);
3489 else
3490 data->start_class->flags |= ANYOF_UNICODE_ALL;
3491 data->start_class->flags &= ~ANYOF_EOS;
3492 cl_and(data->start_class, and_withp);
3493 }
3494 flags &= ~SCF_DO_STCLASS;
3495 }
3496 else if (PL_regkind[OP(scan)] == EXACT) { /* But OP != EXACT! */
3497 I32 l = STR_LEN(scan);
3498 UV uc = *((U8*)STRING(scan));
3499
3500 /* Search for fixed substrings supports EXACT only. */
3501 if (flags & SCF_DO_SUBSTR) {
3502 assert(data);
304ee84b 3503 SCAN_COMMIT(pRExC_state, data, minlenp);
8aa23a47
YO
3504 }
3505 if (UTF) {
3506 const U8 * const s = (U8 *)STRING(scan);
3507 l = utf8_length(s, s + l);
3508 uc = utf8_to_uvchr(s, NULL);
3509 }
f758bddf
KW
3510 else if (has_exactf_sharp_s) {
3511 RExC_seen |= REG_SEEN_EXACTF_SHARP_S;
bb914485 3512 }
9d071ca8 3513 min += l - min_subtract;
f646642f
KW
3514 if (min < 0) {
3515 min = 0;
3516 }
9d071ca8 3517 delta += min_subtract;
f646642f 3518 if (flags & SCF_DO_SUBSTR) {
9d071ca8 3519 data->pos_min += l - min_subtract;
f646642f
KW
3520 if (data->pos_min < 0) {
3521 data->pos_min = 0;
3522 }
9d071ca8
KW
3523 data->pos_delta += min_subtract;
3524 if (min_subtract) {
d2197104
KW
3525 data->longest = &(data->longest_float);
3526 }
f646642f 3527 }
8aa23a47
YO
3528 if (flags & SCF_DO_STCLASS_AND) {
3529 /* Check whether it is compatible with what we know already! */
3530 int compat = 1;
8aa23a47 3531 if (uc >= 0x100 ||
54251c2e
KW
3532 (!(data->start_class->flags & (ANYOF_CLASS | ANYOF_LOCALE))
3533 && !ANYOF_BITMAP_TEST(data->start_class, uc)
3534 && !ANYOF_BITMAP_TEST(data->start_class, PL_fold_latin1[uc])))
3535 {
8aa23a47 3536 compat = 0;
54251c2e 3537 }
8aa23a47
YO
3538 ANYOF_CLASS_ZERO(data->start_class);
3539 ANYOF_BITMAP_ZERO(data->start_class);
3540 if (compat) {
3541 ANYOF_BITMAP_SET(data->start_class, uc);
653099ff 3542 data->start_class->flags &= ~ANYOF_EOS;
39065660 3543 data->start_class->flags |= ANYOF_LOC_NONBITMAP_FOLD;
970c8436 3544 if (OP(scan) == EXACTFL) {
af302e7f
KW
3545 /* XXX This set is probably no longer necessary, and
3546 * probably wrong as LOCALE now is on in the initial
3547 * state */
8aa23a47 3548 data->start_class->flags |= ANYOF_LOCALE;
970c8436
KW
3549 }
3550 else {
3551
54251c2e
KW
3552 /* Also set the other member of the fold pair. In case
3553 * that unicode semantics is called for at runtime, use
3554 * the full latin1 fold. (Can't do this for locale,
a0c4c608 3555 * because not known until runtime) */
54251c2e 3556 ANYOF_BITMAP_SET(data->start_class, PL_fold_latin1[uc]);
e22b340a 3557
a0c4c608
KW
3558 /* All other (EXACTFL handled above) folds except under
3559 * /iaa that include s, S, and sharp_s also may include
3560 * the others */
e22b340a
KW
3561 if (OP(scan) != EXACTFA) {
3562 if (uc == 's' || uc == 'S') {
3563 ANYOF_BITMAP_SET(data->start_class,
3564 LATIN_SMALL_LETTER_SHARP_S);
3565 }
3566 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3567 ANYOF_BITMAP_SET(data->start_class, 's');
3568 ANYOF_BITMAP_SET(data->start_class, 'S');
3569 }
3570 }
970c8436 3571 }
653099ff 3572 }
d18bf9dc
KW
3573 else if (uc >= 0x100) {
3574 int i;
3575 for (i = 0; i < 256; i++){
3576 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)) {
3577 ANYOF_BITMAP_SET(data->start_class, i);
3578 }
3579 }
3580 }
8aa23a47
YO
3581 }
3582 else if (flags & SCF_DO_STCLASS_OR) {
39065660 3583 if (data->start_class->flags & ANYOF_LOC_NONBITMAP_FOLD) {
8aa23a47
YO
3584 /* false positive possible if the class is case-folded.
3585 Assume that the locale settings are the same... */
970c8436 3586 if (uc < 0x100) {
1aa99e6b 3587 ANYOF_BITMAP_SET(data->start_class, uc);
970c8436
KW
3588 if (OP(scan) != EXACTFL) {
3589
3590 /* And set the other member of the fold pair, but
3591 * can't do that in locale because not known until
3592 * run-time */
3593 ANYOF_BITMAP_SET(data->start_class,
54251c2e 3594 PL_fold_latin1[uc]);
e22b340a
KW
3595
3596 /* All folds except under /iaa that include s, S,
3597 * and sharp_s also may include the others */
3598 if (OP(scan) != EXACTFA) {
3599 if (uc == 's' || uc == 'S') {
3600 ANYOF_BITMAP_SET(data->start_class,
3601 LATIN_SMALL_LETTER_SHARP_S);
3602 }
3603 else if (uc == LATIN_SMALL_LETTER_SHARP_S) {
3604 ANYOF_BITMAP_SET(data->start_class, 's');
3605 ANYOF_BITMAP_SET(data->start_class, 'S');
3606 }
3607 }
970c8436
KW
3608 }
3609 }
653099ff
GS
3610 data->start_class->flags &= ~ANYOF_EOS;
3611 }
8aa23a47 3612 cl_and(data->start_class, and_withp);
653099ff 3613 }
8aa23a47
YO
3614 flags &= ~SCF_DO_STCLASS;
3615 }
e52fc539 3616 else if (REGNODE_VARIES(OP(scan))) {
8aa23a47
YO
3617 I32 mincount, maxcount, minnext, deltanext, fl = 0;
3618 I32 f = flags, pos_before = 0;
3619 regnode * const oscan = scan;
3620 struct regnode_charclass_class this_class;
3621 struct regnode_charclass_class *oclass = NULL;
3622 I32 next_is_eval = 0;
3623
3624 switch (PL_regkind[OP(scan)]) {
3625 case WHILEM: /* End of (?:...)* . */
3626 scan = NEXTOPER(scan);
3627 goto finish;
3628 case PLUS:
3629 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
3630 next = NEXTOPER(scan);
3631 if (OP(next) == EXACT || (flags & SCF_DO_STCLASS)) {
3632 mincount = 1;
3633 maxcount = REG_INFTY;
3634 next = regnext(scan);
3635 scan = NEXTOPER(scan);
3636 goto do_curly;
3637 }
3638 }
3639 if (flags & SCF_DO_SUBSTR)
3640 data->pos_min++;
3641 min++;
3642 /* Fall through. */
3643 case STAR:
3644 if (flags & SCF_DO_STCLASS) {
3645 mincount = 0;
3646 maxcount = REG_INFTY;
3647 next = regnext(scan);
3648 scan = NEXTOPER(scan);
3649 goto do_curly;
3650 }
3651 is_inf = is_inf_internal = 1;
3652 scan = regnext(scan);
c277df42 3653 if (flags & SCF_DO_SUBSTR) {
304ee84b 3654 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot extend fixed substrings */
8aa23a47 3655 data->longest = &(data->longest_float);
c277df42 3656 }
8aa23a47
YO
3657 goto optimize_curly_tail;
3658 case CURLY:
3659 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
3660 && (scan->flags == stopparen))
3661 {
3662 mincount = 1;
3663 maxcount = 1;
3664 } else {
3665 mincount = ARG1(scan);
3666 maxcount = ARG2(scan);
653099ff 3667 }
8aa23a47
YO
3668 next = regnext(scan);
3669 if (OP(scan) == CURLYX) {
3670 I32 lp = (data ? *(data->last_closep) : 0);
3671 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
653099ff 3672 }
8aa23a47
YO
3673 scan = NEXTOPER(scan) + EXTRA_STEP_2ARGS;
3674 next_is_eval = (OP(scan) == EVAL);
3675 do_curly:
3676 if (flags & SCF_DO_SUBSTR) {
304ee84b 3677 if (mincount == 0) SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot extend fixed substrings */
8aa23a47 3678 pos_before = data->pos_min;
b45f050a 3679 }
8aa23a47
YO
3680 if (data) {
3681 fl = data->flags;
3682 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
3683 if (is_inf)
3684 data->flags |= SF_IS_INF;
3685 }
3686 if (flags & SCF_DO_STCLASS) {
e755fd73 3687 cl_init(pRExC_state, &this_class);
8aa23a47
YO
3688 oclass = data->start_class;
3689 data->start_class = &this_class;
3690 f |= SCF_DO_STCLASS_AND;
3691 f &= ~SCF_DO_STCLASS_OR;
3692 }
779bcb7d
NC
3693 /* Exclude from super-linear cache processing any {n,m}
3694 regops for which the combination of input pos and regex
3695 pos is not enough information to determine if a match
3696 will be possible.
3697
3698 For example, in the regex /foo(bar\s*){4,8}baz/ with the
3699 regex pos at the \s*, the prospects for a match depend not
3700 only on the input position but also on how many (bar\s*)
3701 repeats into the {4,8} we are. */
3702 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
8aa23a47 3703 f &= ~SCF_WHILEM_VISITED_POS;
b45f050a 3704
8aa23a47
YO
3705 /* This will finish on WHILEM, setting scan, or on NULL: */
3706 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3707 last, data, stopparen, recursed, NULL,
3708 (mincount == 0
3709 ? (f & ~SCF_DO_SUBSTR) : f),depth+1);
b515a41d 3710
8aa23a47
YO
3711 if (flags & SCF_DO_STCLASS)
3712 data->start_class = oclass;
3713 if (mincount == 0 || minnext == 0) {
3714 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 3715 cl_or(pRExC_state, data->start_class, &this_class);
8aa23a47
YO
3716 }
3717 else if (flags & SCF_DO_STCLASS_AND) {
3718 /* Switch to OR mode: cache the old value of
3719 * data->start_class */
3720 INIT_AND_WITHP;
3721 StructCopy(data->start_class, and_withp,
3722 struct regnode_charclass_class);
3723 flags &= ~SCF_DO_STCLASS_AND;
3724 StructCopy(&this_class, data->start_class,
3725 struct regnode_charclass_class);
3726 flags |= SCF_DO_STCLASS_OR;
3727 data->start_class->flags |= ANYOF_EOS;
3728 }
3729 } else { /* Non-zero len */
3730 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 3731 cl_or(pRExC_state, data->start_class, &this_class);
8aa23a47
YO
3732 cl_and(data->start_class, and_withp);
3733 }
3734 else if (flags & SCF_DO_STCLASS_AND)
3735 cl_and(data->start_class, &this_class);
3736 flags &= ~SCF_DO_STCLASS;
3737 }
3738 if (!scan) /* It was not CURLYX, but CURLY. */
3739 scan = next;
3740 if ( /* ? quantifier ok, except for (?{ ... }) */
3741 (next_is_eval || !(mincount == 0 && maxcount == 1))
3742 && (minnext == 0) && (deltanext == 0)
3743 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
668c081a 3744 && maxcount <= REG_INFTY/3) /* Complement check for big count */
8aa23a47 3745 {
668c081a
NC
3746 ckWARNreg(RExC_parse,
3747 "Quantifier unexpected on zero-length expression");
8aa23a47
YO
3748 }
3749
3750 min += minnext * mincount;
3751 is_inf_internal |= ((maxcount == REG_INFTY
3752 && (minnext + deltanext) > 0)
3753 || deltanext == I32_MAX);
3754 is_inf |= is_inf_internal;
3755 delta += (minnext + deltanext) * maxcount - minnext * mincount;
3756
3757 /* Try powerful optimization CURLYX => CURLYN. */
3758 if ( OP(oscan) == CURLYX && data
3759 && data->flags & SF_IN_PAR
3760 && !(data->flags & SF_HAS_EVAL)
3761 && !deltanext && minnext == 1 ) {
3762 /* Try to optimize to CURLYN. */
3763 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS;
3764 regnode * const nxt1 = nxt;
497b47a8 3765#ifdef DEBUGGING
8aa23a47 3766 regnode *nxt2;
497b47a8 3767#endif
c277df42 3768
8aa23a47
YO
3769 /* Skip open. */
3770 nxt = regnext(nxt);
e52fc539 3771 if (!REGNODE_SIMPLE(OP(nxt))
8aa23a47
YO
3772 && !(PL_regkind[OP(nxt)] == EXACT
3773 && STR_LEN(nxt) == 1))
3774 goto nogo;
497b47a8 3775#ifdef DEBUGGING
8aa23a47 3776 nxt2 = nxt;
497b47a8 3777#endif
8aa23a47
YO
3778 nxt = regnext(nxt);
3779 if (OP(nxt) != CLOSE)
3780 goto nogo;
3781 if (RExC_open_parens) {
3782 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3783 RExC_close_parens[ARG(nxt1)-1]=nxt+2; /*close->while*/
3784 }
3785 /* Now we know that nxt2 is the only contents: */
3786 oscan->flags = (U8)ARG(nxt);
3787 OP(oscan) = CURLYN;
3788 OP(nxt1) = NOTHING; /* was OPEN. */
40d049e4 3789
c277df42 3790#ifdef DEBUGGING
8aa23a47 3791 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
fda99bee
KW
3792 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
3793 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
8aa23a47
YO
3794 OP(nxt) = OPTIMIZED; /* was CLOSE. */
3795 OP(nxt + 1) = OPTIMIZED; /* was count. */
fda99bee 3796 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
b81d288d 3797#endif
8aa23a47
YO
3798 }
3799 nogo:
3800
3801 /* Try optimization CURLYX => CURLYM. */
3802 if ( OP(oscan) == CURLYX && data
3803 && !(data->flags & SF_HAS_PAR)
3804 && !(data->flags & SF_HAS_EVAL)
3805 && !deltanext /* atom is fixed width */
3806 && minnext != 0 /* CURLYM can't handle zero width */
3807 ) {
3808 /* XXXX How to optimize if data == 0? */
3809 /* Optimize to a simpler form. */
3810 regnode *nxt = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN */
3811 regnode *nxt2;
3812
3813 OP(oscan) = CURLYM;
3814 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
3815 && (OP(nxt2) != WHILEM))
3816 nxt = nxt2;
3817 OP(nxt2) = SUCCEED; /* Whas WHILEM */
3818 /* Need to optimize away parenths. */
b3c0965f 3819 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
8aa23a47
YO
3820 /* Set the parenth number. */
3821 regnode *nxt1 = NEXTOPER(oscan) + EXTRA_STEP_2ARGS; /* OPEN*/
3822
8aa23a47
YO
3823 oscan->flags = (U8)ARG(nxt);
3824 if (RExC_open_parens) {
3825 RExC_open_parens[ARG(nxt1)-1]=oscan; /*open->CURLYM*/
3826 RExC_close_parens[ARG(nxt1)-1]=nxt2+1; /*close->NOTHING*/
40d049e4 3827 }
8aa23a47
YO
3828 OP(nxt1) = OPTIMIZED; /* was OPEN. */
3829 OP(nxt) = OPTIMIZED; /* was CLOSE. */
40d049e4 3830
c277df42 3831#ifdef DEBUGGING
8aa23a47
YO
3832 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
3833 OP(nxt + 1) = OPTIMIZED; /* was count. */
486ec47a
PA
3834 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
3835 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
b81d288d 3836#endif
c277df42 3837#if 0
8aa23a47
YO
3838 while ( nxt1 && (OP(nxt1) != WHILEM)) {
3839 regnode *nnxt = regnext(nxt1);
8aa23a47
YO
3840 if (nnxt == nxt) {
3841 if (reg_off_by_arg[OP(nxt1)])
3842 ARG_SET(nxt1, nxt2 - nxt1);
3843 else if (nxt2 - nxt1 < U16_MAX)
3844 NEXT_OFF(nxt1) = nxt2 - nxt1;
3845 else
3846 OP(nxt) = NOTHING; /* Cannot beautify */
c277df42 3847 }
8aa23a47 3848 nxt1 = nnxt;
c277df42 3849 }
5d1c421c 3850#endif
8aa23a47
YO
3851 /* Optimize again: */
3852 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
3853 NULL, stopparen, recursed, NULL, 0,depth+1);
3854 }
3855 else
3856 oscan->flags = 0;
3857 }
3858 else if ((OP(oscan) == CURLYX)
3859 && (flags & SCF_WHILEM_VISITED_POS)
3860 /* See the comment on a similar expression above.
3b753521 3861 However, this time it's not a subexpression
8aa23a47
YO
3862 we care about, but the expression itself. */
3863 && (maxcount == REG_INFTY)
3864 && data && ++data->whilem_c < 16) {
3865 /* This stays as CURLYX, we can put the count/of pair. */
3866 /* Find WHILEM (as in regexec.c) */
3867 regnode *nxt = oscan + NEXT_OFF(oscan);
3868
3869 if (OP(PREVOPER(nxt)) == NOTHING) /* LONGJMP */
3870 nxt += ARG(nxt);
3871 PREVOPER(nxt)->flags = (U8)(data->whilem_c
3872 | (RExC_whilem_seen << 4)); /* On WHILEM */
3873 }
3874 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
3875 pars++;
3876 if (flags & SCF_DO_SUBSTR) {
3877 SV *last_str = NULL;
3878 int counted = mincount != 0;
a0ed51b3 3879
8aa23a47
YO
3880 if (data->last_end > 0 && mincount != 0) { /* Ends with a string. */
3881#if defined(SPARC64_GCC_WORKAROUND)
3882 I32 b = 0;
3883 STRLEN l = 0;
3884 const char *s = NULL;
3885 I32 old = 0;
b515a41d 3886
8aa23a47
YO
3887 if (pos_before >= data->last_start_min)
3888 b = pos_before;
3889 else
3890 b = data->last_start_min;
b515a41d 3891
8aa23a47
YO
3892 l = 0;
3893 s = SvPV_const(data->last_found, l);
3894 old = b - data->last_start_min;
3895
3896#else
3897 I32 b = pos_before >= data->last_start_min
3898 ? pos_before : data->last_start_min;
3899 STRLEN l;
3900 const char * const s = SvPV_const(data->last_found, l);
3901 I32 old = b - data->last_start_min;
3902#endif
3903
3904 if (UTF)
3905 old = utf8_hop((U8*)s, old) - (U8*)s;
8aa23a47
YO
3906 l -= old;
3907 /* Get the added string: */
740cce10 3908 last_str = newSVpvn_utf8(s + old, l, UTF);
8aa23a47
YO
3909 if (deltanext == 0 && pos_before == b) {
3910 /* What was added is a constant string */
3911 if (mincount > 1) {
3912 SvGROW(last_str, (mincount * l) + 1);
3913 repeatcpy(SvPVX(last_str) + l,
3914 SvPVX_const(last_str), l, mincount - 1);
3915 SvCUR_set(last_str, SvCUR(last_str) * mincount);
3916 /* Add additional parts. */
3917 SvCUR_set(data->last_found,
3918 SvCUR(data->last_found) - l);
3919 sv_catsv(data->last_found, last_str);
3920 {
3921 SV * sv = data->last_found;
3922 MAGIC *mg =
3923 SvUTF8(sv) && SvMAGICAL(sv) ?
3924 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3925 if (mg && mg->mg_len >= 0)
bd94e887 3926 mg->mg_len += CHR_SVLEN(last_str) - l;
b515a41d 3927 }
8aa23a47 3928 data->last_end += l * (mincount - 1);
b515a41d 3929 }
8aa23a47
YO
3930 } else {
3931 /* start offset must point into the last copy */
3932 data->last_start_min += minnext * (mincount - 1);
3933 data->last_start_max += is_inf ? I32_MAX
3934 : (maxcount - 1) * (minnext + data->pos_delta);
3935 }
c277df42 3936 }
8aa23a47
YO
3937 /* It is counted once already... */
3938 data->pos_min += minnext * (mincount - counted);
3939 data->pos_delta += - counted * deltanext +
3940 (minnext + deltanext) * maxcount - minnext * mincount;
3941 if (mincount != maxcount) {
3942 /* Cannot extend fixed substrings found inside
3943 the group. */
304ee84b 3944 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
3945 if (mincount && last_str) {
3946 SV * const sv = data->last_found;
3947 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
3948 mg_find(sv, PERL_MAGIC_utf8) : NULL;
3949
3950 if (mg)
3951 mg->mg_len = -1;
3952 sv_setsv(sv, last_str);
3953 data->last_end = data->pos_min;
3954 data->last_start_min =
3955 data->pos_min - CHR_SVLEN(last_str);
3956 data->last_start_max = is_inf
3957 ? I32_MAX
3958 : data->pos_min + data->pos_delta
3959 - CHR_SVLEN(last_str);
3960 }
3961 data->longest = &(data->longest_float);
3962 }
3963 SvREFCNT_dec(last_str);
c277df42 3964 }
8aa23a47
YO
3965 if (data && (fl & SF_HAS_EVAL))
3966 data->flags |= SF_HAS_EVAL;
3967 optimize_curly_tail:
3968 if (OP(oscan) != CURLYX) {
3969 while (PL_regkind[OP(next = regnext(oscan))] == NOTHING
3970 && NEXT_OFF(next))
3971 NEXT_OFF(oscan) += NEXT_OFF(next);
3972 }
3973 continue;
f56b6394 3974 default: /* REF, ANYOFV, and CLUMP only? */
8aa23a47 3975 if (flags & SCF_DO_SUBSTR) {
304ee84b 3976 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
3977 data->longest = &(data->longest_float);
3978 }
3979 is_inf = is_inf_internal = 1;
3980 if (flags & SCF_DO_STCLASS_OR)
3fffb88a 3981 cl_anything(pRExC_state, data->start_class);
8aa23a47
YO
3982 flags &= ~SCF_DO_STCLASS;
3983 break;
c277df42 3984 }
8aa23a47 3985 }
e1d1eefb
YO
3986 else if (OP(scan) == LNBREAK) {
3987 if (flags & SCF_DO_STCLASS) {
3988 int value = 0;
3989 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
3990 if (flags & SCF_DO_STCLASS_AND) {
3991 for (value = 0; value < 256; value++)
e64b1bd1 3992 if (!is_VERTWS_cp(value))
b9a59e08
KW
3993 ANYOF_BITMAP_CLEAR(data->start_class, value);
3994 }
3995 else {
e1d1eefb 3996 for (value = 0; value < 256; value++)
e64b1bd1 3997 if (is_VERTWS_cp(value))
b9a59e08
KW
3998 ANYOF_BITMAP_SET(data->start_class, value);
3999 }
e1d1eefb
YO
4000 if (flags & SCF_DO_STCLASS_OR)
4001 cl_and(data->start_class, and_withp);
4002 flags &= ~SCF_DO_STCLASS;
4003 }
4004 min += 1;
f9a79580 4005 delta += 1;
e1d1eefb
YO
4006 if (flags & SCF_DO_SUBSTR) {
4007 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
4008 data->pos_min += 1;
f9a79580 4009 data->pos_delta += 1;
e1d1eefb
YO
4010 data->longest = &(data->longest_float);
4011 }
e1d1eefb 4012 }
e52fc539 4013 else if (REGNODE_SIMPLE(OP(scan))) {
8aa23a47 4014 int value = 0;
653099ff 4015
8aa23a47 4016 if (flags & SCF_DO_SUBSTR) {
304ee84b 4017 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47
YO
4018 data->pos_min++;
4019 }
4020 min++;
4021 if (flags & SCF_DO_STCLASS) {
4022 data->start_class->flags &= ~ANYOF_EOS; /* No match on empty */
b515a41d 4023
8aa23a47
YO
4024 /* Some of the logic below assumes that switching
4025 locale on will only add false positives. */
4026 switch (PL_regkind[OP(scan)]) {
4027 case SANY:
4028 default:
4029 do_default:
4030 /* Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d", OP(scan)); */
4031 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3fffb88a 4032 cl_anything(pRExC_state, data->start_class);
8aa23a47
YO
4033 break;
4034 case REG_ANY:
4035 if (OP(scan) == SANY)
4036 goto do_default;
4037 if (flags & SCF_DO_STCLASS_OR) { /* Everything but \n */
4038 value = (ANYOF_BITMAP_TEST(data->start_class,'\n')
3a15e693 4039 || ANYOF_CLASS_TEST_ANY_SET(data->start_class));
3fffb88a 4040 cl_anything(pRExC_state, data->start_class);
653099ff 4041 }
8aa23a47
YO
4042 if (flags & SCF_DO_STCLASS_AND || !value)
4043 ANYOF_BITMAP_CLEAR(data->start_class,'\n');
4044 break;
4045 case ANYOF:
4046 if (flags & SCF_DO_STCLASS_AND)
4047 cl_and(data->start_class,
4048 (struct regnode_charclass_class*)scan);
653099ff 4049 else
3fffb88a 4050 cl_or(pRExC_state, data->start_class,
8aa23a47
YO
4051 (struct regnode_charclass_class*)scan);
4052 break;
4053 case ALNUM:
4054 if (flags & SCF_DO_STCLASS_AND) {
4055 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4056 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NALNUM);
980866de 4057 if (OP(scan) == ALNUMU) {
a12cf05f
KW
4058 for (value = 0; value < 256; value++) {
4059 if (!isWORDCHAR_L1(value)) {
4060 ANYOF_BITMAP_CLEAR(data->start_class, value);
4061 }
4062 }
4063 } else {
4064 for (value = 0; value < 256; value++) {
4065 if (!isALNUM(value)) {
4066 ANYOF_BITMAP_CLEAR(data->start_class, value);
4067 }
4068 }
4069 }
8aa23a47 4070 }
653099ff 4071 }
8aa23a47
YO
4072 else {
4073 if (data->start_class->flags & ANYOF_LOCALE)
4074 ANYOF_CLASS_SET(data->start_class,ANYOF_ALNUM);
af302e7f
KW
4075
4076 /* Even if under locale, set the bits for non-locale
4077 * in case it isn't a true locale-node. This will
4078 * create false positives if it truly is locale */
4079 if (OP(scan) == ALNUMU) {
a12cf05f
KW
4080 for (value = 0; value < 256; value++) {
4081 if (isWORDCHAR_L1(value)) {
4082 ANYOF_BITMAP_SET(data->start_class, value);
4083 }
4084 }
4085 } else {
4086 for (value = 0; value < 256; value++) {
4087 if (isALNUM(value)) {
4088 ANYOF_BITMAP_SET(data->start_class, value);
4089 }
4090 }
4091 }
8aa23a47
YO
4092 }
4093 break;
8aa23a47
YO
4094 case NALNUM:
4095 if (flags & SCF_DO_STCLASS_AND) {
4096 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4097 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_ALNUM);
980866de 4098 if (OP(scan) == NALNUMU) {
a12cf05f
KW
4099 for (value = 0; value < 256; value++) {
4100 if (isWORDCHAR_L1(value)) {
4101 ANYOF_BITMAP_CLEAR(data->start_class, value);
4102 }
4103 }
4104 } else {
4105 for (value = 0; value < 256; value++) {
4106 if (isALNUM(value)) {
4107 ANYOF_BITMAP_CLEAR(data->start_class, value);
4108 }
4109 }
4110 }
653099ff
GS
4111 }
4112 }
8aa23a47
YO
4113 else {
4114 if (data->start_class->flags & ANYOF_LOCALE)
4115 ANYOF_CLASS_SET(data->start_class,ANYOF_NALNUM);
af302e7f 4116
75950e1c
KW
4117 /* Even if under locale, set the bits for non-locale in
4118 * case it isn't a true locale-node. This will create
4119 * false positives if it truly is locale */
4120 if (OP(scan) == NALNUMU) {
4121 for (value = 0; value < 256; value++) {
4122 if (! isWORDCHAR_L1(value)) {
4123 ANYOF_BITMAP_SET(data->start_class, value);
4124 }
e9a9c1bc 4125 }
75950e1c
KW
4126 } else {
4127 for (value = 0; value < 256; value++) {
4128 if (! isALNUM(value)) {
4129 ANYOF_BITMAP_SET(data->start_class, value);
4130 }
4131 }
4132 }
653099ff 4133 }
8aa23a47 4134 break;
8aa23a47
YO
4135 case SPACE:
4136 if (flags & SCF_DO_STCLASS_AND) {
4137 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4138 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NSPACE);
980866de 4139 if (OP(scan) == SPACEU) {
a12cf05f
KW
4140 for (value = 0; value < 256; value++) {
4141 if (!isSPACE_L1(value)) {
4142 ANYOF_BITMAP_CLEAR(data->start_class, value);
4143 }
4144 }
4145 } else {
4146 for (value = 0; value < 256; value++) {
4147 if (!isSPACE(value)) {
4148 ANYOF_BITMAP_CLEAR(data->start_class, value);
4149 }
4150 }
4151 }
653099ff
GS
4152 }
4153 }
8aa23a47 4154 else {
a12cf05f 4155 if (data->start_class->flags & ANYOF_LOCALE) {
8aa23a47 4156 ANYOF_CLASS_SET(data->start_class,ANYOF_SPACE);
a12cf05f 4157 }
af302e7f 4158 if (OP(scan) == SPACEU) {
a12cf05f
KW
4159 for (value = 0; value < 256; value++) {
4160 if (isSPACE_L1(value)) {
4161 ANYOF_BITMAP_SET(data->start_class, value);
4162 }
4163 }
4164 } else {
4165 for (value = 0; value < 256; value++) {
4166 if (isSPACE(value)) {
4167 ANYOF_BITMAP_SET(data->start_class, value);
4168 }
4169 }
8aa23a47 4170 }
653099ff 4171 }
8aa23a47 4172 break;
8aa23a47
YO
4173 case NSPACE:
4174 if (flags & SCF_DO_STCLASS_AND) {
4175 if (!(data->start_class->flags & ANYOF_LOCALE)) {
4176 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_SPACE);
980866de 4177 if (OP(scan) == NSPACEU) {
a12cf05f
KW
4178 for (value = 0; value < 256; value++) {
4179 if (isSPACE_L1(value)) {
4180 ANYOF_BITMAP_CLEAR(data->start_class, value);
4181 }
4182 }
4183 } else {
4184 for (value = 0; value < 256; value++) {
4185 if (isSPACE(value)) {
4186 ANYOF_BITMAP_CLEAR(data->start_class, value);
4187 }
4188 }
4189 }
653099ff 4190 }
8aa23a47
YO
4191 }
4192 else {
4193 if (data->start_class->flags & ANYOF_LOCALE)
4194 ANYOF_CLASS_SET(data->start_class,ANYOF_NSPACE);
af302e7f 4195 if (OP(scan) == NSPACEU) {
a12cf05f
KW
4196 for (value = 0; value < 256; value++) {
4197 if (!isSPACE_L1(value)) {
4198 ANYOF_BITMAP_SET(data->start_class, value);
4199 }
4200 }
4201 }
4202 else {
4203 for (value = 0; value < 256; value++) {
4204 if (!isSPACE(value)) {
4205 ANYOF_BITMAP_SET(data->start_class, value);
4206 }
4207 }
4208 }
653099ff 4209 }
8aa23a47 4210 break;
8aa23a47
YO
4211 case DIGIT:
4212 if (flags & SCF_DO_STCLASS_AND) {
bcc0256f 4213 if (!(data->start_class->flags & ANYOF_LOCALE)) {
bf3c5c06
KW
4214 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_NDIGIT);
4215 for (value = 0; value < 256; value++)
4216 if (!isDIGIT(value))
4217 ANYOF_BITMAP_CLEAR(data->start_class, value);
bcc0256f 4218 }
8aa23a47
YO
4219 }
4220 else {
4221 if (data->start_class->flags & ANYOF_LOCALE)
4222 ANYOF_CLASS_SET(data->start_class,ANYOF_DIGIT);
75950e1c
KW
4223 for (value = 0; value < 256; value++)
4224 if (isDIGIT(value))
4225 ANYOF_BITMAP_SET(data->start_class, value);
8aa23a47
YO
4226 }
4227 break;
4228 case NDIGIT:
4229 if (flags & SCF_DO_STCLASS_AND) {
bcc0256f 4230 if (!(data->start_class->flags & ANYOF_LOCALE))
bf3c5c06 4231 ANYOF_CLASS_CLEAR(data->start_class,ANYOF_DIGIT);
8aa23a47
YO
4232 for (value = 0; value < 256; value++)
4233 if (isDIGIT(value))
4234 ANYOF_BITMAP_CLEAR(data->start_class, value);
4235 }
4236 else {
4237 if (data->start_class->flags & ANYOF_LOCALE)
4238 ANYOF_CLASS_SET(data->start_class,ANYOF_NDIGIT);
75950e1c
KW
4239 for (value = 0; value < 256; value++)
4240 if (!isDIGIT(value))
4241 ANYOF_BITMAP_SET(data->start_class, value);
653099ff 4242 }
8aa23a47 4243 break;
e1d1eefb
YO
4244 CASE_SYNST_FNC(VERTWS);
4245 CASE_SYNST_FNC(HORIZWS);
686b73d4 4246
8aa23a47
YO
4247 }
4248 if (flags & SCF_DO_STCLASS_OR)
4249 cl_and(data->start_class, and_withp);
4250 flags &= ~SCF_DO_STCLASS;
4251 }
4252 }
4253 else if (PL_regkind[OP(scan)] == EOL && flags & SCF_DO_SUBSTR) {
4254 data->flags |= (OP(scan) == MEOL
4255 ? SF_BEFORE_MEOL
4256 : SF_BEFORE_SEOL);
4257 }
4258 else if ( PL_regkind[OP(scan)] == BRANCHJ
4259 /* Lookbehind, or need to calculate parens/evals/stclass: */
4260 && (scan->flags || data || (flags & SCF_DO_STCLASS))
4261 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM)) {
4262 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4263 || OP(scan) == UNLESSM )
4264 {
4265 /* Negative Lookahead/lookbehind
4266 In this case we can't do fixed string optimisation.
4267 */
1de06328 4268
8aa23a47
YO
4269 I32 deltanext, minnext, fake = 0;
4270 regnode *nscan;
4271 struct regnode_charclass_class intrnl;
4272 int f = 0;
1de06328 4273
8aa23a47
YO
4274 data_fake.flags = 0;
4275 if (data) {
4276 data_fake.whilem_c = data->whilem_c;
4277 data_fake.last_closep = data->last_closep;
c277df42 4278 }
8aa23a47
YO
4279 else
4280 data_fake.last_closep = &fake;
58e23c8d 4281 data_fake.pos_delta = delta;
8aa23a47
YO
4282 if ( flags & SCF_DO_STCLASS && !scan->flags
4283 && OP(scan) == IFMATCH ) { /* Lookahead */
e755fd73 4284 cl_init(pRExC_state, &intrnl);
8aa23a47
YO
4285 data_fake.start_class = &intrnl;
4286 f |= SCF_DO_STCLASS_AND;
4287 }
4288 if (flags & SCF_WHILEM_VISITED_POS)
4289 f |= SCF_WHILEM_VISITED_POS;
4290 next = regnext(scan);
4291 nscan = NEXTOPER(NEXTOPER(scan));
4292 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
4293 last, &data_fake, stopparen, recursed, NULL, f, depth+1);
4294 if (scan->flags) {
4295 if (deltanext) {
58e23c8d 4296 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
4297 }
4298 else if (minnext > (I32)U8_MAX) {
58e23c8d 4299 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
4300 }
4301 scan->flags = (U8)minnext;
4302 }
4303 if (data) {
4304 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4305 pars++;
4306 if (data_fake.flags & SF_HAS_EVAL)
4307 data->flags |= SF_HAS_EVAL;
4308 data->whilem_c = data_fake.whilem_c;
4309 }
4310 if (f & SCF_DO_STCLASS_AND) {
906cdd2b
HS
4311 if (flags & SCF_DO_STCLASS_OR) {
4312 /* OR before, AND after: ideally we would recurse with
4313 * data_fake to get the AND applied by study of the
4314 * remainder of the pattern, and then derecurse;
4315 * *** HACK *** for now just treat as "no information".
4316 * See [perl #56690].
4317 */
e755fd73 4318 cl_init(pRExC_state, data->start_class);
906cdd2b
HS
4319 } else {
4320 /* AND before and after: combine and continue */
4321 const int was = (data->start_class->flags & ANYOF_EOS);
4322
4323 cl_and(data->start_class, &intrnl);
4324 if (was)
4325 data->start_class->flags |= ANYOF_EOS;
4326 }
8aa23a47 4327 }
cb434fcc 4328 }
8aa23a47
YO
4329#if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
4330 else {
4331 /* Positive Lookahead/lookbehind
4332 In this case we can do fixed string optimisation,
4333 but we must be careful about it. Note in the case of
4334 lookbehind the positions will be offset by the minimum
4335 length of the pattern, something we won't know about
4336 until after the recurse.
4337 */
4338 I32 deltanext, fake = 0;
4339 regnode *nscan;
4340 struct regnode_charclass_class intrnl;
4341 int f = 0;
4342 /* We use SAVEFREEPV so that when the full compile
4343 is finished perl will clean up the allocated
3b753521 4344 minlens when it's all done. This way we don't
8aa23a47
YO
4345 have to worry about freeing them when we know
4346 they wont be used, which would be a pain.
4347 */
4348 I32 *minnextp;
4349 Newx( minnextp, 1, I32 );
4350 SAVEFREEPV(minnextp);
4351
4352 if (data) {
4353 StructCopy(data, &data_fake, scan_data_t);
4354 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
4355 f |= SCF_DO_SUBSTR;
4356 if (scan->flags)
304ee84b 4357 SCAN_COMMIT(pRExC_state, &data_fake,minlenp);
8aa23a47
YO
4358 data_fake.last_found=newSVsv(data->last_found);
4359 }
4360 }
4361 else
4362 data_fake.last_closep = &fake;
4363 data_fake.flags = 0;
58e23c8d 4364 data_fake.pos_delta = delta;
8aa23a47
YO
4365 if (is_inf)
4366 data_fake.flags |= SF_IS_INF;
4367 if ( flags & SCF_DO_STCLASS && !scan->flags
4368 && OP(scan) == IFMATCH ) { /* Lookahead */
e755fd73 4369 cl_init(pRExC_state, &intrnl);
8aa23a47
YO
4370 data_fake.start_class = &intrnl;
4371 f |= SCF_DO_STCLASS_AND;
4372 }
4373 if (flags & SCF_WHILEM_VISITED_POS)
4374 f |= SCF_WHILEM_VISITED_POS;
4375 next = regnext(scan);
4376 nscan = NEXTOPER(NEXTOPER(scan));
4377
4378 *minnextp = study_chunk(pRExC_state, &nscan, minnextp, &deltanext,
4379 last, &data_fake, stopparen, recursed, NULL, f,depth+1);
4380 if (scan->flags) {
4381 if (deltanext) {
58e23c8d 4382 FAIL("Variable length lookbehind not implemented");
8aa23a47
YO
4383 }
4384 else if (*minnextp > (I32)U8_MAX) {
58e23c8d 4385 FAIL2("Lookbehind longer than %"UVuf" not implemented", (UV)U8_MAX);
8aa23a47
YO
4386 }
4387 scan->flags = (U8)*minnextp;
4388 }
4389
4390 *minnextp += min;
4391
4392 if (f & SCF_DO_STCLASS_AND) {
4393 const int was = (data->start_class->flags & ANYOF_EOS);
4394
4395 cl_and(data->start_class, &intrnl);
4396 if (was)
4397 data->start_class->flags |= ANYOF_EOS;
4398 }
4399 if (data) {
4400 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4401 pars++;
4402 if (data_fake.flags & SF_HAS_EVAL)
4403 data->flags |= SF_HAS_EVAL;
4404 data->whilem_c = data_fake.whilem_c;
4405 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
4406 if (RExC_rx->minlen<*minnextp)
4407 RExC_rx->minlen=*minnextp;
304ee84b 4408 SCAN_COMMIT(pRExC_state, &data_fake, minnextp);
8aa23a47
YO
4409 SvREFCNT_dec(data_fake.last_found);
4410
4411 if ( data_fake.minlen_fixed != minlenp )
4412 {
4413 data->offset_fixed= data_fake.offset_fixed;
4414 data->minlen_fixed= data_fake.minlen_fixed;
4415 data->lookbehind_fixed+= scan->flags;
4416 }
4417 if ( data_fake.minlen_float != minlenp )
4418 {
4419 data->minlen_float= data_fake.minlen_float;
4420 data->offset_float_min=data_fake.offset_float_min;
4421 data->offset_float_max=data_fake.offset_float_max;
4422 data->lookbehind_float+= scan->flags;
4423 }
4424 }
4425 }
4426
4427
40d049e4 4428 }
8aa23a47
YO
4429#endif
4430 }
4431 else if (OP(scan) == OPEN) {
4432 if (stopparen != (I32)ARG(scan))
4433 pars++;
4434 }
4435 else if (OP(scan) == CLOSE) {
4436 if (stopparen == (I32)ARG(scan)) {
4437 break;
4438 }
4439 if ((I32)ARG(scan) == is_par) {
4440 next = regnext(scan);
b515a41d 4441
8aa23a47
YO
4442 if ( next && (OP(next) != WHILEM) && next < last)
4443 is_par = 0; /* Disable optimization */
40d049e4 4444 }
8aa23a47
YO
4445 if (data)
4446 *(data->last_closep) = ARG(scan);
4447 }
4448 else if (OP(scan) == EVAL) {
c277df42
IZ
4449 if (data)
4450 data->flags |= SF_HAS_EVAL;
8aa23a47
YO
4451 }
4452 else if ( PL_regkind[OP(scan)] == ENDLIKE ) {
4453 if (flags & SCF_DO_SUBSTR) {
304ee84b 4454 SCAN_COMMIT(pRExC_state,data,minlenp);
8aa23a47 4455 flags &= ~SCF_DO_SUBSTR;
40d049e4 4456 }
8aa23a47
YO
4457 if (data && OP(scan)==ACCEPT) {
4458 data->flags |= SCF_SEEN_ACCEPT;
4459 if (stopmin > min)
4460 stopmin = min;
e2e6a0f1 4461 }
8aa23a47
YO
4462 }
4463 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
4464 {
0f5d15d6 4465 if (flags & SCF_DO_SUBSTR) {
304ee84b 4466 SCAN_COMMIT(pRExC_state,data,minlenp);
0f5d15d6
IZ
4467 data->longest = &(data->longest_float);
4468 }
4469 is_inf = is_inf_internal = 1;
653099ff 4470 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3fffb88a 4471 cl_anything(pRExC_state, data->start_class);
96776eda 4472 flags &= ~SCF_DO_STCLASS;
8aa23a47 4473 }
58e23c8d 4474 else if (OP(scan) == GPOS) {
bbe252da 4475 if (!(RExC_rx->extflags & RXf_GPOS_FLOAT) &&
58e23c8d
YO
4476 !(delta || is_inf || (data && data->pos_delta)))
4477 {
bbe252da
YO
4478 if (!(RExC_rx->extflags & RXf_ANCH) && (flags & SCF_DO_SUBSTR))
4479 RExC_rx->extflags |= RXf_ANCH_GPOS;
58e23c8d
YO
4480 if (RExC_rx->gofs < (U32)min)
4481 RExC_rx->gofs = min;
4482 } else {
bbe252da 4483 RExC_rx->extflags |= RXf_GPOS_FLOAT;
58e23c8d
YO
4484 RExC_rx->gofs = 0;
4485 }
4486 }
786e8c11 4487#ifdef TRIE_STUDY_OPT
40d049e4 4488#ifdef FULL_TRIE_STUDY
8aa23a47
YO
4489 else if (PL_regkind[OP(scan)] == TRIE) {
4490 /* NOTE - There is similar code to this block above for handling
4491 BRANCH nodes on the initial study. If you change stuff here
4492 check there too. */
4493 regnode *trie_node= scan;
4494 regnode *tail= regnext(scan);
f8fc2ecf 4495 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4496 I32 max1 = 0, min1 = I32_MAX;
4497 struct regnode_charclass_class accum;
4498
4499 if (flags & SCF_DO_SUBSTR) /* XXXX Add !SUSPEND? */
304ee84b 4500 SCAN_COMMIT(pRExC_state, data,minlenp); /* Cannot merge strings after this. */
8aa23a47 4501 if (flags & SCF_DO_STCLASS)
e755fd73 4502 cl_init_zero(pRExC_state, &accum);
8aa23a47
YO
4503
4504 if (!trie->jump) {
4505 min1= trie->minlen;
4506 max1= trie->maxlen;
4507 } else {
4508 const regnode *nextbranch= NULL;
4509 U32 word;
4510
4511 for ( word=1 ; word <= trie->wordcount ; word++)
4512 {
4513 I32 deltanext=0, minnext=0, f = 0, fake;
4514 struct regnode_charclass_class this_class;
4515
4516 data_fake.flags = 0;
4517 if (data) {
4518 data_fake.whilem_c = data->whilem_c;
4519 data_fake.last_closep = data->last_closep;
4520 }
4521 else
4522 data_fake.last_closep = &fake;
58e23c8d 4523 data_fake.pos_delta = delta;
8aa23a47 4524 if (flags & SCF_DO_STCLASS) {
e755fd73 4525 cl_init(pRExC_state, &this_class);
8aa23a47
YO
4526 data_fake.start_class = &this_class;
4527 f = SCF_DO_STCLASS_AND;
4528 }
4529 if (flags & SCF_WHILEM_VISITED_POS)
4530 f |= SCF_WHILEM_VISITED_POS;
4531
4532 if (trie->jump[word]) {
4533 if (!nextbranch)
4534 nextbranch = trie_node + trie->jump[0];
4535 scan= trie_node + trie->jump[word];
4536 /* We go from the jump point to the branch that follows
4537 it. Note this means we need the vestigal unused branches
4538 even though they arent otherwise used.
4539 */
4540 minnext = study_chunk(pRExC_state, &scan, minlenp,
4541 &deltanext, (regnode *)nextbranch, &data_fake,
4542 stopparen, recursed, NULL, f,depth+1);
4543 }
4544 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
4545 nextbranch= regnext((regnode*)nextbranch);
4546
4547 if (min1 > (I32)(minnext + trie->minlen))
4548 min1 = minnext + trie->minlen;
4549 if (max1 < (I32)(minnext + deltanext + trie->maxlen))
4550 max1 = minnext + deltanext + trie->maxlen;
4551 if (deltanext == I32_MAX)
4552 is_inf = is_inf_internal = 1;
4553
4554 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
4555 pars++;
4556 if (data_fake.flags & SCF_SEEN_ACCEPT) {
4557 if ( stopmin > min + min1)
4558 stopmin = min + min1;
4559 flags &= ~SCF_DO_SUBSTR;
4560 if (data)
4561 data->flags |= SCF_SEEN_ACCEPT;
4562 }
4563 if (data) {
4564 if (data_fake.flags & SF_HAS_EVAL)
4565 data->flags |= SF_HAS_EVAL;
4566 data->whilem_c = data_fake.whilem_c;
4567 }
4568 if (flags & SCF_DO_STCLASS)
3fffb88a 4569 cl_or(pRExC_state, &accum, &this_class);
8aa23a47
YO
4570 }
4571 }
4572 if (flags & SCF_DO_SUBSTR) {
4573 data->pos_min += min1;
4574 data->pos_delta += max1 - min1;
4575 if (max1 != min1 || is_inf)
4576 data->longest = &(data->longest_float);
4577 }
4578 min += min1;
4579 delta += max1 - min1;
4580 if (flags & SCF_DO_STCLASS_OR) {
3fffb88a 4581 cl_or(pRExC_state, data->start_class, &accum);
8aa23a47
YO
4582 if (min1) {
4583 cl_and(data->start_class, and_withp);
4584 flags &= ~SCF_DO_STCLASS;
4585 }
4586 }
4587 else if (flags & SCF_DO_STCLASS_AND) {
4588 if (min1) {
4589 cl_and(data->start_class, &accum);
4590 flags &= ~SCF_DO_STCLASS;
4591 }
4592 else {
4593 /* Switch to OR mode: cache the old value of
4594 * data->start_class */
4595 INIT_AND_WITHP;
4596 StructCopy(data->start_class, and_withp,
4597 struct regnode_charclass_class);
4598 flags &= ~SCF_DO_STCLASS_AND;
4599 StructCopy(&accum, data->start_class,
4600 struct regnode_charclass_class);
4601 flags |= SCF_DO_STCLASS_OR;
4602 data->start_class->flags |= ANYOF_EOS;
4603 }
4604 }
4605 scan= tail;
4606 continue;
4607 }
786e8c11 4608#else
8aa23a47 4609 else if (PL_regkind[OP(scan)] == TRIE) {
f8fc2ecf 4610 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
8aa23a47
YO
4611 U8*bang=NULL;
4612
4613 min += trie->minlen;
4614 delta += (trie->maxlen - trie->minlen);
4615 flags &= ~SCF_DO_STCLASS; /* xxx */
4616 if (flags & SCF_DO_SUBSTR) {
304ee84b 4617 SCAN_COMMIT(pRExC_state,data,minlenp); /* Cannot expect anything... */
8aa23a47
YO
4618 data->pos_min += trie->minlen;
4619 data->pos_delta += (trie->maxlen - trie->minlen);
4620 if (trie->maxlen != trie->minlen)
4621 data->longest = &(data->longest_float);
4622 }
4623 if (trie->jump) /* no more substrings -- for now /grr*/
4624 flags &= ~SCF_DO_SUBSTR;
b515a41d 4625 }
8aa23a47 4626#endif /* old or new */
686b73d4 4627#endif /* TRIE_STUDY_OPT */
e1d1eefb 4628
8aa23a47
YO
4629 /* Else: zero-length, ignore. */
4630 scan = regnext(scan);
4631 }
4632 if (frame) {
4633 last = frame->last;
4634 scan = frame->next;
4635 stopparen = frame->stop;
4636 frame = frame->prev;
4637 goto fake_study_recurse;
c277df42
IZ
4638 }
4639
4640 finish:
8aa23a47 4641 assert(!frame);
304ee84b 4642 DEBUG_STUDYDATA("pre-fin:",data,depth);
8aa23a47 4643
c277df42 4644 *scanp = scan;
aca2d497 4645 *deltap = is_inf_internal ? I32_MAX : delta;
b81d288d 4646 if (flags & SCF_DO_SUBSTR && is_inf)
c277df42 4647 data->pos_delta = I32_MAX - data->pos_min;
786e8c11 4648 if (is_par > (I32)U8_MAX)
c277df42
IZ
4649 is_par = 0;
4650 if (is_par && pars==1 && data) {
4651 data->flags |= SF_IN_PAR;
4652 data->flags &= ~SF_HAS_PAR;
a0ed51b3
LW
4653 }
4654 else if (pars && data) {
c277df42
IZ
4655 data->flags |= SF_HAS_PAR;
4656 data->flags &= ~SF_IN_PAR;
4657 }
653099ff 4658 if (flags & SCF_DO_STCLASS_OR)
40d049e4 4659 cl_and(data->start_class, and_withp);
786e8c11
YO
4660 if (flags & SCF_TRIE_RESTUDY)
4661 data->flags |= SCF_TRIE_RESTUDY;
1de06328 4662
304ee84b 4663 DEBUG_STUDYDATA("post-fin:",data,depth);
1de06328 4664
e2e6a0f1 4665 return min < stopmin ? min : stopmin;
c277df42
IZ
4666}
4667
2eccd3b2
NC
4668STATIC U32
4669S_add_data(RExC_state_t *pRExC_state, U32 n, const char *s)
c277df42 4670{
4a4e7719
NC
4671 U32 count = RExC_rxi->data ? RExC_rxi->data->count : 0;
4672
7918f24d
NC
4673 PERL_ARGS_ASSERT_ADD_DATA;
4674
4a4e7719
NC
4675 Renewc(RExC_rxi->data,
4676 sizeof(*RExC_rxi->data) + sizeof(void*) * (count + n - 1),
4677 char, struct reg_data);
4678 if(count)
f8fc2ecf 4679 Renew(RExC_rxi->data->what, count + n, U8);
4a4e7719 4680 else
f8fc2ecf 4681 Newx(RExC_rxi->data->what, n, U8);
4a4e7719
NC
4682 RExC_rxi->data->count = count + n;
4683 Copy(s, RExC_rxi->data->what + count, n, U8);
4684 return count;
c277df42
IZ
4685}
4686
f8149455 4687/*XXX: todo make this not included in a non debugging perl */
76234dfb 4688#ifndef PERL_IN_XSUB_RE
d88dccdf 4689void
864dbfa3 4690Perl_reginitcolors(pTHX)
d88dccdf 4691{
97aff369 4692 dVAR;
1df70142 4693 const char * const s = PerlEnv_getenv("PERL_RE_COLORS");
d88dccdf 4694 if (s) {
1df70142
AL
4695 char *t = savepv(s);
4696 int i = 0;
4697 PL_colors[0] = t;
d88dccdf 4698 while (++i < 6) {
1df70142
AL
4699 t = strchr(t, '\t');
4700 if (t) {
4701 *t = '\0';
4702 PL_colors[i] = ++t;
d88dccdf
IZ
4703 }
4704 else
1df70142 4705 PL_colors[i] = t = (char *)"";
d88dccdf
IZ
4706 }
4707 } else {
1df70142 4708 int i = 0;
b81d288d 4709 while (i < 6)
06b5626a 4710 PL_colors[i++] = (char *)"";
d88dccdf
IZ
4711 }
4712 PL_colorset = 1;
4713}
76234dfb 4714#endif
8615cb43 4715
07be1b83 4716
786e8c11
YO
4717#ifdef TRIE_STUDY_OPT
4718#define CHECK_RESTUDY_GOTO \
4719 if ( \
4720 (data.flags & SCF_TRIE_RESTUDY) \
4721 && ! restudied++ \
4722 ) goto reStudy
4723#else
4724#define CHECK_RESTUDY_GOTO
4725#endif
f9f4320a 4726
a687059c 4727/*
e50aee73 4728 - pregcomp - compile a regular expression into internal code
a687059c
LW
4729 *
4730 * We can't allocate space until we know how big the compiled form will be,
4731 * but we can't compile it (and thus know how big it is) until we've got a
4732 * place to put the code. So we cheat: we compile it twice, once with code
4733 * generation turned off and size counting turned on, and once "for real".
4734 * This also means that we don't allocate space until we are sure that the
4735 * thing really will compile successfully, and we never have to move the
4736 * code and thus invalidate pointers into it. (Note that it has to be in
4737 * one piece because free() must be able to free it all.) [NB: not true in perl]
4738 *
4739 * Beware that the optimization-preparation code in here knows about some
4740 * of the structure of the compiled regexp. [I'll say.]
4741 */
b9b4dddf
YO
4742
4743
4744
f9f4320a 4745#ifndef PERL_IN_XSUB_RE
f9f4320a
YO
4746#define RE_ENGINE_PTR &PL_core_reg_engine
4747#else
f9f4320a
YO
4748extern const struct regexp_engine my_reg_engine;
4749#define RE_ENGINE_PTR &my_reg_engine
4750#endif
6d5c990f
RGS
4751
4752#ifndef PERL_IN_XSUB_RE
3ab4a224 4753REGEXP *
1593ad57 4754Perl_pregcomp(pTHX_ SV * const pattern, const U32 flags)
a687059c 4755{
97aff369 4756 dVAR;
6d5c990f 4757 HV * const table = GvHV(PL_hintgv);
7918f24d
NC
4758
4759 PERL_ARGS_ASSERT_PREGCOMP;
4760
f9f4320a
YO
4761 /* Dispatch a request to compile a regexp to correct
4762 regexp engine. */
f9f4320a
YO
4763 if (table) {
4764 SV **ptr= hv_fetchs(table, "regcomp", FALSE);
6d5c990f 4765 GET_RE_DEBUG_FLAGS_DECL;
1e2e3d02 4766 if (ptr && SvIOK(*ptr) && SvIV(*ptr)) {
f9f4320a
YO
4767 const regexp_engine *eng=INT2PTR(regexp_engine*,SvIV(*ptr));
4768 DEBUG_COMPILE_r({
8d8756e7 4769 PerlIO_printf(Perl_debug_log, "Using engine %"UVxf"\n",
f9f4320a
YO
4770 SvIV(*ptr));
4771 });
3ab4a224 4772 return CALLREGCOMP_ENG(eng, pattern, flags);
f9f4320a 4773 }
b9b4dddf 4774 }
3ab4a224 4775 return Perl_re_compile(aTHX_ pattern, flags);
2a5d9b1d 4776}
6d5c990f 4777#endif
2a5d9b1d 4778
3ab4a224 4779REGEXP *
29b09c41 4780Perl_re_compile(pTHX_ SV * const pattern, U32 orig_pm_flags)
2a5d9b1d
RGS
4781{
4782 dVAR;
288b8c02
NC
4783 REGEXP *rx;
4784 struct regexp *r;
f8fc2ecf 4785 register regexp_internal *ri;
3ab4a224 4786 STRLEN plen;
4d6b2893 4787 char* VOL exp;
5d51ce98 4788 char* xend;
c277df42 4789 regnode *scan;
a0d0e21e 4790 I32 flags;
a0d0e21e 4791 I32 minlen = 0;
29b09c41 4792 U32 pm_flags;
e7f38d0f
YO
4793
4794 /* these are all flags - maybe they should be turned
4795 * into a single int with different bit masks */
4796 I32 sawlookahead = 0;
a0d0e21e
LW
4797 I32 sawplus = 0;
4798 I32 sawopen = 0;
29b09c41 4799 bool used_setjump = FALSE;
4624b182 4800 regex_charset initial_charset = get_regex_charset(orig_pm_flags);
e7f38d0f 4801
bbd61b5f
KW
4802 U8 jump_ret = 0;
4803 dJMPENV;
2c2d71f5 4804 scan_data_t data;
830247a4 4805 RExC_state_t RExC_state;
be8e71aa 4806 RExC_state_t * const pRExC_state = &RExC_state;
07be1b83 4807#ifdef TRIE_STUDY_OPT
5d51ce98 4808 int restudied;
07be1b83
YO
4809 RExC_state_t copyRExC_state;
4810#endif
2a5d9b1d 4811 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
4812
4813 PERL_ARGS_ASSERT_RE_COMPILE;
4814
6d5c990f 4815 DEBUG_r(if (!PL_colorset) reginitcolors());
a0d0e21e 4816
a3e1f3a6
KW
4817 /* Initialize these here instead of as-needed, as is quick and avoids
4818 * having to test them each time otherwise */
4819 if (! PL_AboveLatin1) {
4820 PL_AboveLatin1 = _new_invlist_C_array(AboveLatin1_invlist);
4821 PL_ASCII = _new_invlist_C_array(ASCII_invlist);
4822 PL_Latin1 = _new_invlist_C_array(Latin1_invlist);
3f427fd9
KW
4823
4824 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
4825 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
4826
4827 PL_L1PosixAlpha = _new_invlist_C_array(L1PosixAlpha_invlist);
4828 PL_PosixAlpha = _new_invlist_C_array(PosixAlpha_invlist);
4829
4830 PL_PosixBlank = _new_invlist_C_array(PosixBlank_invlist);
4831 PL_XPosixBlank = _new_invlist_C_array(XPosixBlank_invlist);
4832
dab0c3e7
KW
4833 PL_L1Cased = _new_invlist_C_array(L1Cased_invlist);
4834
3f427fd9
KW
4835 PL_PosixCntrl = _new_invlist_C_array(PosixCntrl_invlist);
4836 PL_XPosixCntrl = _new_invlist_C_array(XPosixCntrl_invlist);
4837
4838 PL_PosixDigit = _new_invlist_C_array(PosixDigit_invlist);
4839
4840 PL_L1PosixGraph = _new_invlist_C_array(L1PosixGraph_invlist);
4841 PL_PosixGraph = _new_invlist_C_array(PosixGraph_invlist);
4842
4843 PL_L1PosixAlnum = _new_invlist_C_array(L1PosixAlnum_invlist);
4844 PL_PosixAlnum = _new_invlist_C_array(PosixAlnum_invlist);
4845
3f427fd9
KW
4846 PL_L1PosixLower = _new_invlist_C_array(L1PosixLower_invlist);
4847 PL_PosixLower = _new_invlist_C_array(PosixLower_invlist);
4848
4849 PL_L1PosixPrint = _new_invlist_C_array(L1PosixPrint_invlist);
4850 PL_PosixPrint = _new_invlist_C_array(PosixPrint_invlist);
4851
4852 PL_L1PosixPunct = _new_invlist_C_array(L1PosixPunct_invlist);
4853 PL_PosixPunct = _new_invlist_C_array(PosixPunct_invlist);
4854
4855 PL_PerlSpace = _new_invlist_C_array(PerlSpace_invlist);
4856 PL_XPerlSpace = _new_invlist_C_array(XPerlSpace_invlist);
4857
4858 PL_PosixSpace = _new_invlist_C_array(PosixSpace_invlist);
4859 PL_XPosixSpace = _new_invlist_C_array(XPosixSpace_invlist);
4860
4861 PL_L1PosixUpper = _new_invlist_C_array(L1PosixUpper_invlist);
4862 PL_PosixUpper = _new_invlist_C_array(PosixUpper_invlist);
4863
4864 PL_VertSpace = _new_invlist_C_array(VertSpace_invlist);
4865
4866 PL_PosixWord = _new_invlist_C_array(PosixWord_invlist);
4867 PL_L1PosixWord = _new_invlist_C_array(L1PosixWord_invlist);
4868
4869 PL_PosixXDigit = _new_invlist_C_array(PosixXDigit_invlist);
4870 PL_XPosixXDigit = _new_invlist_C_array(XPosixXDigit_invlist);
a3e1f3a6
KW
4871 }
4872
11951bcb
KW
4873 exp = SvPV(pattern, plen);
4874
4875 if (plen == 0) { /* ignore the utf8ness if the pattern is 0 length */
4876 RExC_utf8 = RExC_orig_utf8 = 0;
4877 }
4878 else {
4879 RExC_utf8 = RExC_orig_utf8 = SvUTF8(pattern);
4880 }
e40e74fe 4881 RExC_uni_semantics = 0;
4624b182 4882 RExC_contains_locale = 0;
7b597bb8 4883
d6bd454d 4884 /****************** LONG JUMP TARGET HERE***********************/
bbd61b5f
KW
4885 /* Longjmp back to here if have to switch in midstream to utf8 */
4886 if (! RExC_orig_utf8) {
4887 JMPENV_PUSH(jump_ret);
29b09c41 4888 used_setjump = TRUE;
bbd61b5f
KW
4889 }
4890
5d51ce98 4891 if (jump_ret == 0) { /* First time through */
29b09c41 4892 xend = exp + plen;
29b09c41 4893
5d51ce98
KW
4894 DEBUG_COMPILE_r({
4895 SV *dsv= sv_newmortal();
4896 RE_PV_QUOTED_DECL(s, RExC_utf8,
4897 dsv, exp, plen, 60);
4898 PerlIO_printf(Perl_debug_log, "%sCompiling REx%s %s\n",
4899 PL_colors[4],PL_colors[5],s);
4900 });
4901 }
4902 else { /* longjumped back */
bbd61b5f
KW
4903 STRLEN len = plen;
4904
5d51ce98
KW
4905 /* If the cause for the longjmp was other than changing to utf8, pop
4906 * our own setjmp, and longjmp to the correct handler */
bbd61b5f
KW
4907 if (jump_ret != UTF8_LONGJMP) {
4908 JMPENV_POP;
4909 JMPENV_JUMP(jump_ret);
4910 }
4911
595598ee
KW
4912 GET_RE_DEBUG_FLAGS;
4913
bbd61b5f
KW
4914 /* It's possible to write a regexp in ascii that represents Unicode
4915 codepoints outside of the byte range, such as via \x{100}. If we
4916 detect such a sequence we have to convert the entire pattern to utf8
4917 and then recompile, as our sizing calculation will have been based
4918 on 1 byte == 1 character, but we will need to use utf8 to encode
4919 at least some part of the pattern, and therefore must convert the whole
4920 thing.
4921 -- dmq */
4922 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log,
4923 "UTF8 mismatch! Converting to utf8 for resizing and compile\n"));
3e0b93e8
KW
4924 exp = (char*)Perl_bytes_to_utf8(aTHX_
4925 (U8*)SvPV_nomg(pattern, plen),
4926 &len);
bbd61b5f
KW
4927 xend = exp + len;
4928 RExC_orig_utf8 = RExC_utf8 = 1;
4929 SAVEFREEPV(exp);
4930 }
4931
5d51ce98
KW
4932#ifdef TRIE_STUDY_OPT
4933 restudied = 0;
4934#endif
4935
29b09c41 4936 pm_flags = orig_pm_flags;
a62b1201 4937
4624b182
KW
4938 if (initial_charset == REGEX_LOCALE_CHARSET) {
4939 RExC_contains_locale = 1;
4940 }
4941 else if (RExC_utf8 && initial_charset == REGEX_DEPENDS_CHARSET) {
4942
4943 /* Set to use unicode semantics if the pattern is in utf8 and has the
4944 * 'depends' charset specified, as it means unicode when utf8 */
a62b1201 4945 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
29b09c41
KW
4946 }
4947
02daf0ab 4948 RExC_precomp = exp;
c737faaf 4949 RExC_flags = pm_flags;
830247a4 4950 RExC_sawback = 0;
bbce6d69 4951
830247a4 4952 RExC_seen = 0;
b57e4118 4953 RExC_in_lookbehind = 0;
830247a4
IZ
4954 RExC_seen_zerolen = *exp == '^' ? -1 : 0;
4955 RExC_seen_evals = 0;
4956 RExC_extralen = 0;
e2a7e165 4957 RExC_override_recoding = 0;
c277df42 4958
bbce6d69 4959 /* First pass: determine size, legality. */
830247a4 4960 RExC_parse = exp;
fac92740 4961 RExC_start = exp;
830247a4
IZ
4962 RExC_end = xend;
4963 RExC_naughty = 0;
4964 RExC_npar = 1;
e2e6a0f1 4965 RExC_nestroot = 0;
830247a4
IZ
4966 RExC_size = 0L;
4967 RExC_emit = &PL_regdummy;
4968 RExC_whilem_seen = 0;
40d049e4
YO
4969 RExC_open_parens = NULL;
4970 RExC_close_parens = NULL;
4971 RExC_opend = NULL;
81714fb9 4972 RExC_paren_names = NULL;
1f1031fe
YO
4973#ifdef DEBUGGING
4974 RExC_paren_name_list = NULL;
4975#endif
40d049e4
YO
4976 RExC_recurse = NULL;
4977 RExC_recurse_count = 0;
81714fb9 4978
85ddcde9
JH
4979#if 0 /* REGC() is (currently) a NOP at the first pass.
4980 * Clever compilers notice this and complain. --jhi */
830247a4 4981 REGC((U8)REG_MAGIC, (char*)RExC_emit);
85ddcde9 4982#endif
44bed856
KW
4983 DEBUG_PARSE_r(
4984 PerlIO_printf(Perl_debug_log, "Starting first pass (sizing)\n");
4985 RExC_lastnum=0;
4986 RExC_lastparse=NULL;
4987 );
3dab1dad 4988 if (reg(pRExC_state, 0, &flags,1) == NULL) {
c445ea15 4989 RExC_precomp = NULL;
a0d0e21e
LW
4990 return(NULL);
4991 }
bbd61b5f 4992
29b09c41
KW
4993 /* Here, finished first pass. Get rid of any added setjmp */
4994 if (used_setjump) {
bbd61b5f 4995 JMPENV_POP;
02daf0ab 4996 }
e40e74fe 4997
07be1b83 4998 DEBUG_PARSE_r({
81714fb9
YO
4999 PerlIO_printf(Perl_debug_log,
5000 "Required size %"IVdf" nodes\n"
5001 "Starting second pass (creation)\n",
5002 (IV)RExC_size);
07be1b83
YO
5003 RExC_lastnum=0;
5004 RExC_lastparse=NULL;
5005 });
e40e74fe
KW
5006
5007 /* The first pass could have found things that force Unicode semantics */
5008 if ((RExC_utf8 || RExC_uni_semantics)
5009 && get_regex_charset(pm_flags) == REGEX_DEPENDS_CHARSET)
5010 {
5011 set_regex_charset(&pm_flags, REGEX_UNICODE_CHARSET);
5012 }
5013
c277df42
IZ
5014 /* Small enough for pointer-storage convention?
5015 If extralen==0, this means that we will not need long jumps. */
830247a4
IZ
5016 if (RExC_size >= 0x10000L && RExC_extralen)
5017 RExC_size += RExC_extralen;
c277df42 5018 else
830247a4
IZ
5019 RExC_extralen = 0;
5020 if (RExC_whilem_seen > 15)
5021 RExC_whilem_seen = 15;
a0d0e21e 5022
f9f4320a
YO
5023 /* Allocate space and zero-initialize. Note, the two step process
5024 of zeroing when in debug mode, thus anything assigned has to
5025 happen after that */
d2f13c59 5026 rx = (REGEXP*) newSV_type(SVt_REGEXP);
288b8c02 5027 r = (struct regexp*)SvANY(rx);
f8fc2ecf
YO
5028 Newxc(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode),
5029 char, regexp_internal);
5030 if ( r == NULL || ri == NULL )
b45f050a 5031 FAIL("Regexp out of space");
0f79a09d
GS
5032#ifdef DEBUGGING
5033 /* avoid reading uninitialized memory in DEBUGGING code in study_chunk() */
f8fc2ecf 5034 Zero(ri, sizeof(regexp_internal) + (unsigned)RExC_size * sizeof(regnode), char);
58e23c8d 5035#else
f8fc2ecf
YO
5036 /* bulk initialize base fields with 0. */
5037 Zero(ri, sizeof(regexp_internal), char);
0f79a09d 5038#endif
58e23c8d
YO
5039
5040 /* non-zero initialization begins here */
f8fc2ecf 5041 RXi_SET( r, ri );
f9f4320a 5042 r->engine= RE_ENGINE_PTR;
c737faaf 5043 r->extflags = pm_flags;
bcdf7404 5044 {
f7819f85 5045 bool has_p = ((r->extflags & RXf_PMf_KEEPCOPY) == RXf_PMf_KEEPCOPY);
a62b1201 5046 bool has_charset = (get_regex_charset(r->extflags) != REGEX_DEPENDS_CHARSET);
c5ea2ffa
KW
5047
5048 /* The caret is output if there are any defaults: if not all the STD
5049 * flags are set, or if no character set specifier is needed */
5050 bool has_default =
5051 (((r->extflags & RXf_PMf_STD_PMMOD) != RXf_PMf_STD_PMMOD)
5052 || ! has_charset);
bcdf7404 5053 bool has_runon = ((RExC_seen & REG_SEEN_RUN_ON_COMMENT)==REG_SEEN_RUN_ON_COMMENT);
14f3b9f2
NC
5054 U16 reganch = (U16)((r->extflags & RXf_PMf_STD_PMMOD)
5055 >> RXf_PMf_STD_PMMOD_SHIFT);
bcdf7404
YO
5056 const char *fptr = STD_PAT_MODS; /*"msix"*/
5057 char *p;
fb85c044 5058 /* Allocate for the worst case, which is all the std flags are turned
c5ea2ffa
KW
5059 * on. If more precision is desired, we could do a population count of
5060 * the flags set. This could be done with a small lookup table, or by
5061 * shifting, masking and adding, or even, when available, assembly
5062 * language for a machine-language population count.
5063 * We never output a minus, as all those are defaults, so are
5064 * covered by the caret */
fb85c044 5065 const STRLEN wraplen = plen + has_p + has_runon
c5ea2ffa 5066 + has_default /* If needs a caret */
a62b1201
KW
5067
5068 /* If needs a character set specifier */
5069 + ((has_charset) ? MAX_CHARSET_NAME_LENGTH : 0)
bcdf7404
YO
5070 + (sizeof(STD_PAT_MODS) - 1)
5071 + (sizeof("(?:)") - 1);
5072
c5ea2ffa 5073 p = sv_grow(MUTABLE_SV(rx), wraplen + 1); /* +1 for the ending NUL */
f7c278bf 5074 SvPOK_on(rx);
8f6ae13c 5075 SvFLAGS(rx) |= SvUTF8(pattern);
bcdf7404 5076 *p++='('; *p++='?';
9de15fec
KW
5077
5078 /* If a default, cover it using the caret */
c5ea2ffa 5079 if (has_default) {
85508812 5080 *p++= DEFAULT_PAT_MOD;
fb85c044 5081 }
c5ea2ffa 5082 if (has_charset) {
a62b1201
KW
5083 STRLEN len;
5084 const char* const name = get_regex_charset_name(r->extflags, &len);
5085 Copy(name, p, len, char);
5086 p += len;
9de15fec 5087 }
f7819f85
A
5088 if (has_p)
5089 *p++ = KEEPCOPY_PAT_MOD; /*'p'*/
bcdf7404 5090 {
bcdf7404 5091 char ch;
bcdf7404
YO
5092 while((ch = *fptr++)) {
5093 if(reganch & 1)
5094 *p++ = ch;
bcdf7404
YO
5095 reganch >>= 1;
5096 }
bcdf7404
YO
5097 }
5098
28d8d7f4 5099 *p++ = ':';
bb661a58 5100 Copy(RExC_precomp, p, plen, char);
efd26800
NC
5101 assert ((RX_WRAPPED(rx) - p) < 16);
5102 r->pre_prefix = p - RX_WRAPPED(rx);
bb661a58 5103 p += plen;
bcdf7404 5104 if (has_runon)
28d8d7f4
YO
5105 *p++ = '\n';
5106 *p++ = ')';
5107 *p = 0;
fb85c044 5108 SvCUR_set(rx, p - SvPVX_const(rx));
bcdf7404
YO
5109 }
5110
bbe252da 5111 r->intflags = 0;
830247a4 5112 r->nparens = RExC_npar - 1; /* set early to validate backrefs */
81714fb9 5113
6bda09f9 5114 if (RExC_seen & REG_SEEN_RECURSE) {
40d049e4
YO
5115 Newxz(RExC_open_parens, RExC_npar,regnode *);
5116 SAVEFREEPV(RExC_open_parens);
5117 Newxz(RExC_close_parens,RExC_npar,regnode *);
5118 SAVEFREEPV(RExC_close_parens);
6bda09f9
YO
5119 }
5120
5121 /* Useful during FAIL. */
7122b237
YO
5122#ifdef RE_TRACK_PATTERN_OFFSETS
5123 Newxz(ri->u.offsets, 2*RExC_size+1, U32); /* MJD 20001228 */
a3621e74 5124 DEBUG_OFFSETS_r(PerlIO_printf(Perl_debug_log,
2af232bd 5125 "%s %"UVuf" bytes for offset annotations.\n",
7122b237 5126 ri->u.offsets ? "Got" : "Couldn't get",
392fbf5d 5127 (UV)((2*RExC_size+1) * sizeof(U32))));
7122b237
YO
5128#endif
5129 SetProgLen(ri,RExC_size);
288b8c02 5130 RExC_rx_sv = rx;
830247a4 5131 RExC_rx = r;
f8fc2ecf 5132 RExC_rxi = ri;
bbce6d69 5133
5134 /* Second pass: emit code. */
c737faaf 5135 RExC_flags = pm_flags; /* don't let top level (?i) bleed */
830247a4
IZ
5136 RExC_parse = exp;
5137 RExC_end = xend;
5138 RExC_naughty = 0;
5139 RExC_npar = 1;
f8fc2ecf
YO
5140 RExC_emit_start = ri->program;
5141 RExC_emit = ri->program;
3b57cd43
YO
5142 RExC_emit_bound = ri->program + RExC_size + 1;
5143
2cd61cdb 5144 /* Store the count of eval-groups for security checks: */
f8149455 5145 RExC_rx->seen_evals = RExC_seen_evals;
830247a4 5146 REGC((U8)REG_MAGIC, (char*) RExC_emit++);
80757612 5147 if (reg(pRExC_state, 0, &flags,1) == NULL) {
288b8c02 5148 ReREFCNT_dec(rx);
a0d0e21e 5149 return(NULL);
80757612 5150 }
07be1b83
YO
5151 /* XXXX To minimize changes to RE engine we always allocate
5152 3-units-long substrs field. */
5153 Newx(r->substrs, 1, struct reg_substr_data);
40d049e4
YO
5154 if (RExC_recurse_count) {
5155 Newxz(RExC_recurse,RExC_recurse_count,regnode *);
5156 SAVEFREEPV(RExC_recurse);
5157 }
a0d0e21e 5158
07be1b83 5159reStudy:
e7f38d0f 5160 r->minlen = minlen = sawlookahead = sawplus = sawopen = 0;
07be1b83 5161 Zero(r->substrs, 1, struct reg_substr_data);
a3621e74 5162
07be1b83 5163#ifdef TRIE_STUDY_OPT
0934c9d9
SH
5164 if (!restudied) {
5165 StructCopy(&zero_scan_data, &data, scan_data_t);
5166 copyRExC_state = RExC_state;
5167 } else {
5d458dd8 5168 U32 seen=RExC_seen;
07be1b83 5169 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log,"Restudying\n"));
5d458dd8
YO
5170
5171 RExC_state = copyRExC_state;
5172 if (seen & REG_TOP_LEVEL_BRANCHES)
5173 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
5174 else
5175 RExC_seen &= ~REG_TOP_LEVEL_BRANCHES;
1de06328 5176 if (data.last_found) {
07be1b83 5177 SvREFCNT_dec(data.longest_fixed);
07be1b83 5178 SvREFCNT_dec(data.longest_float);
07be1b83 5179 SvREFCNT_dec(data.last_found);
1de06328 5180 }
40d049e4 5181 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 5182 }
40d049e4
YO
5183#else
5184 StructCopy(&zero_scan_data, &data, scan_data_t);
07be1b83 5185#endif
fc8cd66c 5186
a0d0e21e 5187 /* Dig out information for optimizations. */
f7819f85 5188 r->extflags = RExC_flags; /* was pm_op */
c737faaf
YO
5189 /*dmq: removed as part of de-PMOP: pm->op_pmflags = RExC_flags; */
5190
a0ed51b3 5191 if (UTF)
8f6ae13c 5192 SvUTF8_on(rx); /* Unicode in it? */
f8fc2ecf 5193 ri->regstclass = NULL;
830247a4 5194 if (RExC_naughty >= 10) /* Probably an expensive pattern. */
bbe252da 5195 r->intflags |= PREGf_NAUGHTY;
f8fc2ecf 5196 scan = ri->program + 1; /* First BRANCH. */
2779dcf1 5197
1de06328
YO
5198 /* testing for BRANCH here tells us whether there is "must appear"
5199 data in the pattern. If there is then we can use it for optimisations */
eaf3ca90 5200 if (!(RExC_seen & REG_TOP_LEVEL_BRANCHES)) { /* Only one top-level choice. */
c277df42 5201 I32 fake;
c5254dd6 5202 STRLEN longest_float_length, longest_fixed_length;
07be1b83 5203 struct regnode_charclass_class ch_class; /* pointed to by data */
653099ff 5204 int stclass_flag;
07be1b83 5205 I32 last_close = 0; /* pointed to by data */
5339e136
YO
5206 regnode *first= scan;
5207 regnode *first_next= regnext(first);
639081d6
YO
5208 /*
5209 * Skip introductions and multiplicators >= 1
5210 * so that we can extract the 'meat' of the pattern that must
5211 * match in the large if() sequence following.
5212 * NOTE that EXACT is NOT covered here, as it is normally
5213 * picked up by the optimiser separately.
5214 *
5215 * This is unfortunate as the optimiser isnt handling lookahead
5216 * properly currently.
5217 *
5218 */
a0d0e21e 5219 while ((OP(first) == OPEN && (sawopen = 1)) ||
653099ff 5220 /* An OR of *one* alternative - should not happen now. */
5339e136 5221 (OP(first) == BRANCH && OP(first_next) != BRANCH) ||
07be1b83 5222 /* for now we can't handle lookbehind IFMATCH*/
e7f38d0f 5223 (OP(first) == IFMATCH && !first->flags && (sawlookahead = 1)) ||
a0d0e21e
LW
5224 (OP(first) == PLUS) ||
5225 (OP(first) == MINMOD) ||
653099ff 5226 /* An {n,m} with n>0 */
5339e136
YO
5227 (PL_regkind[OP(first)] == CURLY && ARG1(first) > 0) ||
5228 (OP(first) == NOTHING && PL_regkind[OP(first_next)] != END ))
07be1b83 5229 {
639081d6
YO
5230 /*
5231 * the only op that could be a regnode is PLUS, all the rest
5232 * will be regnode_1 or regnode_2.
5233 *
5234 */
a0d0e21e
LW
5235 if (OP(first) == PLUS)
5236 sawplus = 1;
5237 else
3dab1dad 5238 first += regarglen[OP(first)];
686b73d4 5239
639081d6 5240 first = NEXTOPER(first);
5339e136 5241 first_next= regnext(first);
a687059c
LW
5242 }
5243
a0d0e21e
LW
5244 /* Starting-point info. */
5245 again:
786e8c11 5246 DEBUG_PEEP("first:",first,0);
07be1b83 5247 /* Ignore EXACT as we deal with it later. */
3dab1dad 5248 if (PL_regkind[OP(first)] == EXACT) {
1aa99e6b 5249 if (OP(first) == EXACT)
6f207bd3 5250 NOOP; /* Empty, get anchored substr later. */
e5fbd0ff 5251 else
f8fc2ecf 5252 ri->regstclass = first;
b3c9acc1 5253 }
686b73d4 5254#ifdef TRIE_STCLASS
786e8c11 5255 else if (PL_regkind[OP(first)] == TRIE &&
f8fc2ecf 5256 ((reg_trie_data *)ri->data->data[ ARG(first) ])->minlen>0)
07be1b83 5257 {
786e8c11 5258 regnode *trie_op;
07be1b83 5259 /* this can happen only on restudy */
786e8c11 5260 if ( OP(first) == TRIE ) {
c944940b 5261 struct regnode_1 *trieop = (struct regnode_1 *)
446bd890 5262 PerlMemShared_calloc(1, sizeof(struct regnode_1));
786e8c11
YO
5263 StructCopy(first,trieop,struct regnode_1);
5264 trie_op=(regnode *)trieop;
5265 } else {
c944940b 5266 struct regnode_charclass *trieop = (struct regnode_charclass *)
446bd890 5267 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
786e8c11
YO
5268 StructCopy(first,trieop,struct regnode_charclass);
5269 trie_op=(regnode *)trieop;
5270 }
1de06328 5271 OP(trie_op)+=2;
786e8c11 5272 make_trie_failtable(pRExC_state, (regnode *)first, trie_op, 0);
f8fc2ecf 5273 ri->regstclass = trie_op;
07be1b83 5274 }
686b73d4 5275#endif
e52fc539 5276 else if (REGNODE_SIMPLE(OP(first)))
f8fc2ecf 5277 ri->regstclass = first;
3dab1dad
YO
5278 else if (PL_regkind[OP(first)] == BOUND ||
5279 PL_regkind[OP(first)] == NBOUND)
f8fc2ecf 5280 ri->regstclass = first;
3dab1dad 5281 else if (PL_regkind[OP(first)] == BOL) {
bbe252da
YO
5282 r->extflags |= (OP(first) == MBOL
5283 ? RXf_ANCH_MBOL
cad2e5aa 5284 : (OP(first) == SBOL
bbe252da
YO
5285 ? RXf_ANCH_SBOL
5286 : RXf_ANCH_BOL));
a0d0e21e 5287 first = NEXTOPER(first);
774d564b 5288 goto again;
5289 }
5290 else if (OP(first) == GPOS) {
bbe252da 5291 r->extflags |= RXf_ANCH_GPOS;
774d564b 5292 first = NEXTOPER(first);
5293 goto again;
a0d0e21e 5294 }
cf2a2b69
YO
5295 else if ((!sawopen || !RExC_sawback) &&
5296 (OP(first) == STAR &&
3dab1dad 5297 PL_regkind[OP(NEXTOPER(first))] == REG_ANY) &&
bbe252da 5298 !(r->extflags & RXf_ANCH) && !(RExC_seen & REG_SEEN_EVAL))
a0d0e21e
LW
5299 {
5300 /* turn .* into ^.* with an implied $*=1 */
1df70142
AL
5301 const int type =
5302 (OP(NEXTOPER(first)) == REG_ANY)
bbe252da
YO
5303 ? RXf_ANCH_MBOL
5304 : RXf_ANCH_SBOL;
5305 r->extflags |= type;
5306 r->intflags |= PREGf_IMPLICIT;
a0d0e21e 5307 first = NEXTOPER(first);
774d564b 5308 goto again;
a0d0e21e 5309 }
e7f38d0f 5310 if (sawplus && !sawlookahead && (!sawopen || !RExC_sawback)
830247a4 5311 && !(RExC_seen & REG_SEEN_EVAL)) /* May examine pos and $& */
cad2e5aa 5312 /* x+ must match at the 1st pos of run of x's */
bbe252da 5313 r->intflags |= PREGf_SKIP;
a0d0e21e 5314
c277df42 5315 /* Scan is after the zeroth branch, first is atomic matcher. */
be8e71aa 5316#ifdef TRIE_STUDY_OPT
81714fb9 5317 DEBUG_PARSE_r(
be8e71aa
YO
5318 if (!restudied)
5319 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5320 (IV)(first - scan + 1))
5321 );
5322#else
81714fb9 5323 DEBUG_PARSE_r(
be8e71aa
YO
5324 PerlIO_printf(Perl_debug_log, "first at %"IVdf"\n",
5325 (IV)(first - scan + 1))
5326 );
5327#endif
5328
5329
a0d0e21e
LW
5330 /*
5331 * If there's something expensive in the r.e., find the
5332 * longest literal string that must appear and make it the
5333 * regmust. Resolve ties in favor of later strings, since
5334 * the regstart check works with the beginning of the r.e.
5335 * and avoiding duplication strengthens checking. Not a
5336 * strong reason, but sufficient in the absence of others.
5337 * [Now we resolve ties in favor of the earlier string if
c277df42 5338 * it happens that c_offset_min has been invalidated, since the
a0d0e21e
LW
5339 * earlier string may buy us something the later one won't.]
5340 */
686b73d4 5341
396482e1
GA
5342 data.longest_fixed = newSVpvs("");
5343 data.longest_float = newSVpvs("");
5344 data.last_found = newSVpvs("");
c277df42
IZ
5345 data.longest = &(data.longest_fixed);
5346 first = scan;
f8fc2ecf 5347 if (!ri->regstclass) {
e755fd73 5348 cl_init(pRExC_state, &ch_class);
653099ff
GS
5349 data.start_class = &ch_class;
5350 stclass_flag = SCF_DO_STCLASS_AND;
5351 } else /* XXXX Check for BOUND? */
5352 stclass_flag = 0;
cb434fcc 5353 data.last_closep = &last_close;
de8c5301 5354
1de06328 5355 minlen = study_chunk(pRExC_state, &first, &minlen, &fake, scan + RExC_size, /* Up to end */
40d049e4
YO
5356 &data, -1, NULL, NULL,
5357 SCF_DO_SUBSTR | SCF_WHILEM_VISITED_POS | stclass_flag,0);
07be1b83 5358
686b73d4 5359
786e8c11
YO
5360 CHECK_RESTUDY_GOTO;
5361
5362
830247a4 5363 if ( RExC_npar == 1 && data.longest == &(data.longest_fixed)
b81d288d 5364 && data.last_start_min == 0 && data.last_end > 0
830247a4 5365 && !RExC_seen_zerolen
2bf803e2 5366 && !(RExC_seen & REG_SEEN_VERBARG)
bbe252da
YO
5367 && (!(RExC_seen & REG_SEEN_GPOS) || (r->extflags & RXf_ANCH_GPOS)))
5368 r->extflags |= RXf_CHECK_ALL;
304ee84b 5369 scan_commit(pRExC_state, &data,&minlen,0);
c277df42
IZ
5370 SvREFCNT_dec(data.last_found);
5371
1de06328
YO
5372 /* Note that code very similar to this but for anchored string
5373 follows immediately below, changes may need to be made to both.
5374 Be careful.
5375 */
a0ed51b3 5376 longest_float_length = CHR_SVLEN(data.longest_float);
c5254dd6 5377 if (longest_float_length
c277df42
IZ
5378 || (data.flags & SF_FL_BEFORE_EOL
5379 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 5380 || (RExC_flags & RXf_PMf_MULTILINE))))
1de06328 5381 {
1182767e 5382 I32 t,ml;
cf93c79d 5383
a0c4c608 5384 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
bb914485
KW
5385 if ((RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5386 || (SvCUR(data.longest_fixed) /* ok to leave SvCUR */
5387 && data.offset_fixed == data.offset_float_min
5388 && SvCUR(data.longest_fixed) == SvCUR(data.longest_float)))
aca2d497
IZ
5389 goto remove_float; /* As in (a)+. */
5390
1de06328
YO
5391 /* copy the information about the longest float from the reg_scan_data
5392 over to the program. */
33b8afdf
JH
5393 if (SvUTF8(data.longest_float)) {
5394 r->float_utf8 = data.longest_float;
c445ea15 5395 r->float_substr = NULL;
33b8afdf
JH
5396 } else {
5397 r->float_substr = data.longest_float;
c445ea15 5398 r->float_utf8 = NULL;
33b8afdf 5399 }
1de06328
YO
5400 /* float_end_shift is how many chars that must be matched that
5401 follow this item. We calculate it ahead of time as once the
5402 lookbehind offset is added in we lose the ability to correctly
5403 calculate it.*/
5404 ml = data.minlen_float ? *(data.minlen_float)
1182767e 5405 : (I32)longest_float_length;
1de06328
YO
5406 r->float_end_shift = ml - data.offset_float_min
5407 - longest_float_length + (SvTAIL(data.longest_float) != 0)
5408 + data.lookbehind_float;
5409 r->float_min_offset = data.offset_float_min - data.lookbehind_float;
c277df42 5410 r->float_max_offset = data.offset_float_max;
1182767e 5411 if (data.offset_float_max < I32_MAX) /* Don't offset infinity */
1de06328
YO
5412 r->float_max_offset -= data.lookbehind_float;
5413
cf93c79d
IZ
5414 t = (data.flags & SF_FL_BEFORE_EOL /* Can't have SEOL and MULTI */
5415 && (!(data.flags & SF_FL_BEFORE_MEOL)
bbe252da 5416 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 5417 fbm_compile(data.longest_float, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
5418 }
5419 else {
aca2d497 5420 remove_float:
c445ea15 5421 r->float_substr = r->float_utf8 = NULL;
c277df42 5422 SvREFCNT_dec(data.longest_float);
c5254dd6 5423 longest_float_length = 0;
a0d0e21e 5424 }
c277df42 5425
1de06328
YO
5426 /* Note that code very similar to this but for floating string
5427 is immediately above, changes may need to be made to both.
5428 Be careful.
5429 */
a0ed51b3 5430 longest_fixed_length = CHR_SVLEN(data.longest_fixed);
a0c4c608
KW
5431
5432 /* See comments for join_exact for why REG_SEEN_EXACTF_SHARP_S */
bb914485
KW
5433 if (! (RExC_seen & REG_SEEN_EXACTF_SHARP_S)
5434 && (longest_fixed_length
5435 || (data.flags & SF_FIX_BEFORE_EOL /* Cannot have SEOL and MULTI */
5436 && (!(data.flags & SF_FIX_BEFORE_MEOL)
5437 || (RExC_flags & RXf_PMf_MULTILINE)))) )
1de06328 5438 {
1182767e 5439 I32 t,ml;
cf93c79d 5440
1de06328
YO
5441 /* copy the information about the longest fixed
5442 from the reg_scan_data over to the program. */
33b8afdf
JH
5443 if (SvUTF8(data.longest_fixed)) {
5444 r->anchored_utf8 = data.longest_fixed;
c445ea15 5445 r->anchored_substr = NULL;
33b8afdf
JH
5446 } else {
5447 r->anchored_substr = data.longest_fixed;
c445ea15 5448 r->anchored_utf8 = NULL;
33b8afdf 5449 }
1de06328
YO
5450 /* fixed_end_shift is how many chars that must be matched that
5451 follow this item. We calculate it ahead of time as once the
5452 lookbehind offset is added in we lose the ability to correctly
5453 calculate it.*/
5454 ml = data.minlen_fixed ? *(data.minlen_fixed)
1182767e 5455 : (I32)longest_fixed_length;
1de06328
YO
5456 r->anchored_end_shift = ml - data.offset_fixed
5457 - longest_fixed_length + (SvTAIL(data.longest_fixed) != 0)
5458 + data.lookbehind_fixed;
5459 r->anchored_offset = data.offset_fixed - data.lookbehind_fixed;
5460
cf93c79d
IZ
5461 t = (data.flags & SF_FIX_BEFORE_EOL /* Can't have SEOL and MULTI */
5462 && (!(data.flags & SF_FIX_BEFORE_MEOL)
bbe252da 5463 || (RExC_flags & RXf_PMf_MULTILINE)));
33b8afdf 5464 fbm_compile(data.longest_fixed, t ? FBMcf_TAIL : 0);
a0ed51b3
LW
5465 }
5466 else {
c445ea15 5467 r->anchored_substr = r->anchored_utf8 = NULL;
c277df42 5468 SvREFCNT_dec(data.longest_fixed);
c5254dd6 5469 longest_fixed_length = 0;
a0d0e21e 5470 }
f8fc2ecf
YO
5471 if (ri->regstclass
5472 && (OP(ri->regstclass) == REG_ANY || OP(ri->regstclass) == SANY))
5473 ri->regstclass = NULL;
f4244008 5474
33b8afdf
JH
5475 if ((!(r->anchored_substr || r->anchored_utf8) || r->anchored_offset)
5476 && stclass_flag
653099ff 5477 && !(data.start_class->flags & ANYOF_EOS)
eb160463
GS
5478 && !cl_is_anything(data.start_class))
5479 {
2eccd3b2 5480 const U32 n = add_data(pRExC_state, 1, "f");
c613755a 5481 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
653099ff 5482
f8fc2ecf 5483 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
5484 struct regnode_charclass_class);
5485 StructCopy(data.start_class,
f8fc2ecf 5486 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 5487 struct regnode_charclass_class);
f8fc2ecf 5488 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 5489 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 5490 DEBUG_COMPILE_r({ SV *sv = sv_newmortal();
32fc9b6a 5491 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 5492 PerlIO_printf(Perl_debug_log,
a0288114 5493 "synthetic stclass \"%s\".\n",
3f7c398e 5494 SvPVX_const(sv));});
653099ff 5495 }
c277df42
IZ
5496
5497 /* A temporary algorithm prefers floated substr to fixed one to dig more info. */
c5254dd6 5498 if (longest_fixed_length > longest_float_length) {
1de06328 5499 r->check_end_shift = r->anchored_end_shift;
c277df42 5500 r->check_substr = r->anchored_substr;
33b8afdf 5501 r->check_utf8 = r->anchored_utf8;
c277df42 5502 r->check_offset_min = r->check_offset_max = r->anchored_offset;
bbe252da
YO
5503 if (r->extflags & RXf_ANCH_SINGLE)
5504 r->extflags |= RXf_NOSCAN;
a0ed51b3
LW
5505 }
5506 else {
1de06328 5507 r->check_end_shift = r->float_end_shift;
c277df42 5508 r->check_substr = r->float_substr;
33b8afdf 5509 r->check_utf8 = r->float_utf8;
1de06328
YO
5510 r->check_offset_min = r->float_min_offset;
5511 r->check_offset_max = r->float_max_offset;
a0d0e21e 5512 }
30382c73
IZ
5513 /* XXXX Currently intuiting is not compatible with ANCH_GPOS.
5514 This should be changed ASAP! */
bbe252da
YO
5515 if ((r->check_substr || r->check_utf8) && !(r->extflags & RXf_ANCH_GPOS)) {
5516 r->extflags |= RXf_USE_INTUIT;
33b8afdf 5517 if (SvTAIL(r->check_substr ? r->check_substr : r->check_utf8))
bbe252da 5518 r->extflags |= RXf_INTUIT_TAIL;
cad2e5aa 5519 }
1de06328
YO
5520 /* XXX Unneeded? dmq (shouldn't as this is handled elsewhere)
5521 if ( (STRLEN)minlen < longest_float_length )
5522 minlen= longest_float_length;
5523 if ( (STRLEN)minlen < longest_fixed_length )
5524 minlen= longest_fixed_length;
5525 */
a0ed51b3
LW
5526 }
5527 else {
c277df42
IZ
5528 /* Several toplevels. Best we can is to set minlen. */
5529 I32 fake;
653099ff 5530 struct regnode_charclass_class ch_class;
cb434fcc 5531 I32 last_close = 0;
686b73d4 5532
5d458dd8 5533 DEBUG_PARSE_r(PerlIO_printf(Perl_debug_log, "\nMulti Top Level\n"));
07be1b83 5534
f8fc2ecf 5535 scan = ri->program + 1;
e755fd73 5536 cl_init(pRExC_state, &ch_class);
653099ff 5537 data.start_class = &ch_class;
cb434fcc 5538 data.last_closep = &last_close;
07be1b83 5539
de8c5301 5540
1de06328 5541 minlen = study_chunk(pRExC_state, &scan, &minlen, &fake, scan + RExC_size,
40d049e4 5542 &data, -1, NULL, NULL, SCF_DO_STCLASS_AND|SCF_WHILEM_VISITED_POS,0);
de8c5301 5543
786e8c11 5544 CHECK_RESTUDY_GOTO;
07be1b83 5545
33b8afdf 5546 r->check_substr = r->check_utf8 = r->anchored_substr = r->anchored_utf8
c445ea15 5547 = r->float_substr = r->float_utf8 = NULL;
f4244008 5548
653099ff 5549 if (!(data.start_class->flags & ANYOF_EOS)
eb160463
GS
5550 && !cl_is_anything(data.start_class))
5551 {
2eccd3b2 5552 const U32 n = add_data(pRExC_state, 1, "f");
c613755a 5553 data.start_class->flags |= ANYOF_IS_SYNTHETIC;
653099ff 5554
f8fc2ecf 5555 Newx(RExC_rxi->data->data[n], 1,
653099ff
GS
5556 struct regnode_charclass_class);
5557 StructCopy(data.start_class,
f8fc2ecf 5558 (struct regnode_charclass_class*)RExC_rxi->data->data[n],
653099ff 5559 struct regnode_charclass_class);
f8fc2ecf 5560 ri->regstclass = (regnode*)RExC_rxi->data->data[n];
bbe252da 5561 r->intflags &= ~PREGf_SKIP; /* Used in find_byclass(). */
a3621e74 5562 DEBUG_COMPILE_r({ SV* sv = sv_newmortal();
32fc9b6a 5563 regprop(r, sv, (regnode*)data.start_class);
9c5ffd7c 5564 PerlIO_printf(Perl_debug_log,
a0288114 5565 "synthetic stclass \"%s\".\n",
3f7c398e 5566 SvPVX_const(sv));});
653099ff 5567 }
a0d0e21e
LW
5568 }
5569
1de06328
YO
5570 /* Guard against an embedded (?=) or (?<=) with a longer minlen than
5571 the "real" pattern. */
cf9788e3
RGS
5572 DEBUG_OPTIMISE_r({
5573 PerlIO_printf(Perl_debug_log,"minlen: %"IVdf" r->minlen:%"IVdf"\n",
70685ca0 5574 (IV)minlen, (IV)r->minlen);
cf9788e3 5575 });
de8c5301 5576 r->minlenret = minlen;
1de06328
YO
5577 if (r->minlen < minlen)
5578 r->minlen = minlen;
5579
b81d288d 5580 if (RExC_seen & REG_SEEN_GPOS)
bbe252da 5581 r->extflags |= RXf_GPOS_SEEN;
830247a4 5582 if (RExC_seen & REG_SEEN_LOOKBEHIND)
bbe252da 5583 r->extflags |= RXf_LOOKBEHIND_SEEN;
830247a4 5584 if (RExC_seen & REG_SEEN_EVAL)
bbe252da 5585 r->extflags |= RXf_EVAL_SEEN;
f33976b4 5586 if (RExC_seen & REG_SEEN_CANY)
bbe252da 5587 r->extflags |= RXf_CANY_SEEN;
e2e6a0f1 5588 if (RExC_seen & REG_SEEN_VERBARG)
bbe252da 5589 r->intflags |= PREGf_VERBARG_SEEN;
5d458dd8 5590 if (RExC_seen & REG_SEEN_CUTGROUP)
bbe252da 5591 r->intflags |= PREGf_CUTGROUP_SEEN;
81714fb9 5592 if (RExC_paren_names)
85fbaab2 5593 RXp_PAREN_NAMES(r) = MUTABLE_HV(SvREFCNT_inc(RExC_paren_names));
81714fb9 5594 else
5daac39c 5595 RXp_PAREN_NAMES(r) = NULL;
0ac6acae 5596
7bd1e614 5597#ifdef STUPID_PATTERN_CHECKS
5509d87a 5598 if (RX_PRELEN(rx) == 0)
640f820d 5599 r->extflags |= RXf_NULL;
5509d87a 5600 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
0ac6acae
AB
5601 /* XXX: this should happen BEFORE we compile */
5602 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5509d87a 5603 else if (RX_PRELEN(rx) == 3 && memEQ("\\s+", RX_PRECOMP(rx), 3))
0ac6acae 5604 r->extflags |= RXf_WHITE;
5509d87a 5605 else if (RX_PRELEN(rx) == 1 && RXp_PRECOMP(rx)[0] == '^')
e357fc67 5606 r->extflags |= RXf_START_ONLY;
f1b875a0 5607#else
5509d87a 5608 if (r->extflags & RXf_SPLIT && RX_PRELEN(rx) == 1 && RX_PRECOMP(rx)[0] == ' ')
7bd1e614
YO
5609 /* XXX: this should happen BEFORE we compile */
5610 r->extflags |= (RXf_SKIPWHITE|RXf_WHITE);
5611 else {
5612 regnode *first = ri->program + 1;
39aa8307 5613 U8 fop = OP(first);
f6d9469c
DM
5614
5615 if (PL_regkind[fop] == NOTHING && OP(NEXTOPER(first)) == END)
640f820d 5616 r->extflags |= RXf_NULL;
f6d9469c 5617 else if (PL_regkind[fop] == BOL && OP(NEXTOPER(first)) == END)
7bd1e614 5618 r->extflags |= RXf_START_ONLY;
f6d9469c
DM
5619 else if (fop == PLUS && OP(NEXTOPER(first)) == SPACE
5620 && OP(regnext(first)) == END)
7bd1e614
YO
5621 r->extflags |= RXf_WHITE;
5622 }
f1b875a0 5623#endif
1f1031fe
YO
5624#ifdef DEBUGGING
5625 if (RExC_paren_names) {
af534a04 5626 ri->name_list_idx = add_data( pRExC_state, 1, "a" );
1f1031fe
YO
5627 ri->data->data[ri->name_list_idx] = (void*)SvREFCNT_inc(RExC_paren_name_list);
5628 } else
1f1031fe 5629#endif
cde0cee5 5630 ri->name_list_idx = 0;
1f1031fe 5631
40d049e4
YO
5632 if (RExC_recurse_count) {
5633 for ( ; RExC_recurse_count ; RExC_recurse_count-- ) {
5634 const regnode *scan = RExC_recurse[RExC_recurse_count-1];
5635 ARG2L_SET( scan, RExC_open_parens[ARG(scan)-1] - scan );
5636 }
5637 }
f0ab9afb 5638 Newxz(r->offs, RExC_npar, regexp_paren_pair);
c74340f9
YO
5639 /* assume we don't need to swap parens around before we match */
5640
be8e71aa
YO
5641 DEBUG_DUMP_r({
5642 PerlIO_printf(Perl_debug_log,"Final program:\n");
3dab1dad
YO
5643 regdump(r);
5644 });
7122b237
YO
5645#ifdef RE_TRACK_PATTERN_OFFSETS
5646 DEBUG_OFFSETS_r(if (ri->u.offsets) {
5647 const U32 len = ri->u.offsets[0];
8e9a8a48
YO
5648 U32 i;
5649 GET_RE_DEBUG_FLAGS_DECL;
7122b237 5650 PerlIO_printf(Perl_debug_log, "Offsets: [%"UVuf"]\n\t", (UV)ri->u.offsets[0]);
8e9a8a48 5651 for (i = 1; i <= len; i++) {
7122b237 5652 if (ri->u.offsets[i*2-1] || ri->u.offsets[i*2])
8e9a8a48 5653 PerlIO_printf(Perl_debug_log, "%"UVuf":%"UVuf"[%"UVuf"] ",
7122b237 5654 (UV)i, (UV)ri->u.offsets[i*2-1], (UV)ri->u.offsets[i*2]);
8e9a8a48
YO
5655 }
5656 PerlIO_printf(Perl_debug_log, "\n");
5657 });
7122b237 5658#endif
288b8c02 5659 return rx;
a687059c
LW
5660}
5661
f9f4320a 5662#undef RE_ENGINE_PTR
3dab1dad 5663
93b32b6d 5664
81714fb9 5665SV*
192b9cd1
AB
5666Perl_reg_named_buff(pTHX_ REGEXP * const rx, SV * const key, SV * const value,
5667 const U32 flags)
5668{
7918f24d
NC
5669 PERL_ARGS_ASSERT_REG_NAMED_BUFF;
5670
192b9cd1
AB
5671 PERL_UNUSED_ARG(value);
5672
f1b875a0 5673 if (flags & RXapif_FETCH) {
192b9cd1 5674 return reg_named_buff_fetch(rx, key, flags);
f1b875a0 5675 } else if (flags & (RXapif_STORE | RXapif_DELETE | RXapif_CLEAR)) {
6ad8f254 5676 Perl_croak_no_modify(aTHX);
192b9cd1 5677 return NULL;
f1b875a0 5678 } else if (flags & RXapif_EXISTS) {
192b9cd1
AB
5679 return reg_named_buff_exists(rx, key, flags)
5680 ? &PL_sv_yes
5681 : &PL_sv_no;
f1b875a0 5682 } else if (flags & RXapif_REGNAMES) {
192b9cd1 5683 return reg_named_buff_all(rx, flags);
f1b875a0 5684 } else if (flags & (RXapif_SCALAR | RXapif_REGNAMES_COUNT)) {
192b9cd1
AB
5685 return reg_named_buff_scalar(rx, flags);
5686 } else {
5687 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff", (int)flags);
5688 return NULL;
5689 }
5690}
5691
5692SV*
5693Perl_reg_named_buff_iter(pTHX_ REGEXP * const rx, const SV * const lastkey,
5694 const U32 flags)
5695{
7918f24d 5696 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ITER;
192b9cd1
AB
5697 PERL_UNUSED_ARG(lastkey);
5698
f1b875a0 5699 if (flags & RXapif_FIRSTKEY)
192b9cd1 5700 return reg_named_buff_firstkey(rx, flags);
f1b875a0 5701 else if (flags & RXapif_NEXTKEY)
192b9cd1
AB
5702 return reg_named_buff_nextkey(rx, flags);
5703 else {
5704 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_iter", (int)flags);
5705 return NULL;
5706 }
5707}
5708
5709SV*
288b8c02
NC
5710Perl_reg_named_buff_fetch(pTHX_ REGEXP * const r, SV * const namesv,
5711 const U32 flags)
81714fb9 5712{
44a2ac75
YO
5713 AV *retarray = NULL;
5714 SV *ret;
288b8c02 5715 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5716
5717 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FETCH;
5718
f1b875a0 5719 if (flags & RXapif_ALL)
44a2ac75 5720 retarray=newAV();
93b32b6d 5721
5daac39c
NC
5722 if (rx && RXp_PAREN_NAMES(rx)) {
5723 HE *he_str = hv_fetch_ent( RXp_PAREN_NAMES(rx), namesv, 0, 0 );
93b32b6d
YO
5724 if (he_str) {
5725 IV i;
5726 SV* sv_dat=HeVAL(he_str);
5727 I32 *nums=(I32*)SvPVX(sv_dat);
5728 for ( i=0; i<SvIVX(sv_dat); i++ ) {
192b9cd1
AB
5729 if ((I32)(rx->nparens) >= nums[i]
5730 && rx->offs[nums[i]].start != -1
5731 && rx->offs[nums[i]].end != -1)
93b32b6d 5732 {
49d7dfbc 5733 ret = newSVpvs("");
288b8c02 5734 CALLREG_NUMBUF_FETCH(r,nums[i],ret);
93b32b6d
YO
5735 if (!retarray)
5736 return ret;
5737 } else {
7402016d
AB
5738 if (retarray)
5739 ret = newSVsv(&PL_sv_undef);
93b32b6d 5740 }
ec83ea38 5741 if (retarray)
93b32b6d 5742 av_push(retarray, ret);
81714fb9 5743 }
93b32b6d 5744 if (retarray)
ad64d0ec 5745 return newRV_noinc(MUTABLE_SV(retarray));
192b9cd1
AB
5746 }
5747 }
5748 return NULL;
5749}
5750
5751bool
288b8c02 5752Perl_reg_named_buff_exists(pTHX_ REGEXP * const r, SV * const key,
192b9cd1
AB
5753 const U32 flags)
5754{
288b8c02 5755 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5756
5757 PERL_ARGS_ASSERT_REG_NAMED_BUFF_EXISTS;
5758
5daac39c 5759 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5760 if (flags & RXapif_ALL) {
5daac39c 5761 return hv_exists_ent(RXp_PAREN_NAMES(rx), key, 0);
192b9cd1 5762 } else {
288b8c02 5763 SV *sv = CALLREG_NAMED_BUFF_FETCH(r, key, flags);
6499cc01
RGS
5764 if (sv) {
5765 SvREFCNT_dec(sv);
192b9cd1
AB
5766 return TRUE;
5767 } else {
5768 return FALSE;
5769 }
5770 }
5771 } else {
5772 return FALSE;
5773 }
5774}
5775
5776SV*
288b8c02 5777Perl_reg_named_buff_firstkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5778{
288b8c02 5779 struct regexp *const rx = (struct regexp *)SvANY(r);
7918f24d
NC
5780
5781 PERL_ARGS_ASSERT_REG_NAMED_BUFF_FIRSTKEY;
5782
5daac39c
NC
5783 if ( rx && RXp_PAREN_NAMES(rx) ) {
5784 (void)hv_iterinit(RXp_PAREN_NAMES(rx));
192b9cd1 5785
288b8c02 5786 return CALLREG_NAMED_BUFF_NEXTKEY(r, NULL, flags & ~RXapif_FIRSTKEY);
1e1d4b91
JJ
5787 } else {
5788 return FALSE;
5789 }
192b9cd1
AB
5790}
5791
5792SV*
288b8c02 5793Perl_reg_named_buff_nextkey(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5794{
288b8c02 5795 struct regexp *const rx = (struct regexp *)SvANY(r);
250257bb 5796 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
5797
5798 PERL_ARGS_ASSERT_REG_NAMED_BUFF_NEXTKEY;
5799
5daac39c
NC
5800 if (rx && RXp_PAREN_NAMES(rx)) {
5801 HV *hv = RXp_PAREN_NAMES(rx);
192b9cd1
AB
5802 HE *temphe;
5803 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5804 IV i;
5805 IV parno = 0;
5806 SV* sv_dat = HeVAL(temphe);
5807 I32 *nums = (I32*)SvPVX(sv_dat);
5808 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5809 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5810 rx->offs[nums[i]].start != -1 &&
5811 rx->offs[nums[i]].end != -1)
5812 {
5813 parno = nums[i];
5814 break;
5815 }
5816 }
f1b875a0 5817 if (parno || flags & RXapif_ALL) {
a663657d 5818 return newSVhek(HeKEY_hek(temphe));
192b9cd1 5819 }
81714fb9
YO
5820 }
5821 }
44a2ac75
YO
5822 return NULL;
5823}
5824
192b9cd1 5825SV*
288b8c02 5826Perl_reg_named_buff_scalar(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1
AB
5827{
5828 SV *ret;
5829 AV *av;
5830 I32 length;
288b8c02 5831 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1 5832
7918f24d
NC
5833 PERL_ARGS_ASSERT_REG_NAMED_BUFF_SCALAR;
5834
5daac39c 5835 if (rx && RXp_PAREN_NAMES(rx)) {
f1b875a0 5836 if (flags & (RXapif_ALL | RXapif_REGNAMES_COUNT)) {
5daac39c 5837 return newSViv(HvTOTALKEYS(RXp_PAREN_NAMES(rx)));
f1b875a0 5838 } else if (flags & RXapif_ONE) {
288b8c02 5839 ret = CALLREG_NAMED_BUFF_ALL(r, (flags | RXapif_REGNAMES));
502c6561 5840 av = MUTABLE_AV(SvRV(ret));
192b9cd1 5841 length = av_len(av);
ec83ea38 5842 SvREFCNT_dec(ret);
192b9cd1
AB
5843 return newSViv(length + 1);
5844 } else {
5845 Perl_croak(aTHX_ "panic: Unknown flags %d in named_buff_scalar", (int)flags);
5846 return NULL;
5847 }
5848 }
5849 return &PL_sv_undef;
5850}
5851
5852SV*
288b8c02 5853Perl_reg_named_buff_all(pTHX_ REGEXP * const r, const U32 flags)
192b9cd1 5854{
288b8c02 5855 struct regexp *const rx = (struct regexp *)SvANY(r);
192b9cd1
AB
5856 AV *av = newAV();
5857
7918f24d
NC
5858 PERL_ARGS_ASSERT_REG_NAMED_BUFF_ALL;
5859
5daac39c
NC
5860 if (rx && RXp_PAREN_NAMES(rx)) {
5861 HV *hv= RXp_PAREN_NAMES(rx);
192b9cd1
AB
5862 HE *temphe;
5863 (void)hv_iterinit(hv);
5864 while ( (temphe = hv_iternext_flags(hv,0)) ) {
5865 IV i;
5866 IV parno = 0;
5867 SV* sv_dat = HeVAL(temphe);
5868 I32 *nums = (I32*)SvPVX(sv_dat);
5869 for ( i = 0; i < SvIVX(sv_dat); i++ ) {
250257bb 5870 if ((I32)(rx->lastparen) >= nums[i] &&
192b9cd1
AB
5871 rx->offs[nums[i]].start != -1 &&
5872 rx->offs[nums[i]].end != -1)
5873 {
5874 parno = nums[i];
5875 break;
5876 }
5877 }
f1b875a0 5878 if (parno || flags & RXapif_ALL) {
a663657d 5879 av_push(av, newSVhek(HeKEY_hek(temphe)));
192b9cd1
AB
5880 }
5881 }
5882 }
5883
ad64d0ec 5884 return newRV_noinc(MUTABLE_SV(av));
192b9cd1
AB
5885}
5886
49d7dfbc 5887void
288b8c02
NC
5888Perl_reg_numbered_buff_fetch(pTHX_ REGEXP * const r, const I32 paren,
5889 SV * const sv)
44a2ac75 5890{
288b8c02 5891 struct regexp *const rx = (struct regexp *)SvANY(r);
44a2ac75 5892 char *s = NULL;
a9d504c3 5893 I32 i = 0;
44a2ac75 5894 I32 s1, t1;
7918f24d
NC
5895
5896 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_FETCH;
44a2ac75 5897
cde0cee5
YO
5898 if (!rx->subbeg) {
5899 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5900 return;
cde0cee5
YO
5901 }
5902 else
f1b875a0 5903 if (paren == RX_BUFF_IDX_PREMATCH && rx->offs[0].start != -1) {
44a2ac75 5904 /* $` */
f0ab9afb 5905 i = rx->offs[0].start;
cde0cee5 5906 s = rx->subbeg;
44a2ac75
YO
5907 }
5908 else
f1b875a0 5909 if (paren == RX_BUFF_IDX_POSTMATCH && rx->offs[0].end != -1) {
44a2ac75 5910 /* $' */
f0ab9afb
NC
5911 s = rx->subbeg + rx->offs[0].end;
5912 i = rx->sublen - rx->offs[0].end;
44a2ac75
YO
5913 }
5914 else
5915 if ( 0 <= paren && paren <= (I32)rx->nparens &&
f0ab9afb
NC
5916 (s1 = rx->offs[paren].start) != -1 &&
5917 (t1 = rx->offs[paren].end) != -1)
44a2ac75
YO
5918 {
5919 /* $& $1 ... */
5920 i = t1 - s1;
5921 s = rx->subbeg + s1;
cde0cee5
YO
5922 } else {
5923 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5924 return;
cde0cee5
YO
5925 }
5926 assert(rx->sublen >= (s - rx->subbeg) + i );
5927 if (i >= 0) {
5928 const int oldtainted = PL_tainted;
5929 TAINT_NOT;
5930 sv_setpvn(sv, s, i);
5931 PL_tainted = oldtainted;
5932 if ( (rx->extflags & RXf_CANY_SEEN)
07bc277f 5933 ? (RXp_MATCH_UTF8(rx)
cde0cee5 5934 && (!i || is_utf8_string((U8*)s, i)))
07bc277f 5935 : (RXp_MATCH_UTF8(rx)) )
cde0cee5
YO
5936 {
5937 SvUTF8_on(sv);
5938 }
5939 else
5940 SvUTF8_off(sv);
5941 if (PL_tainting) {
07bc277f 5942 if (RXp_MATCH_TAINTED(rx)) {
cde0cee5
YO
5943 if (SvTYPE(sv) >= SVt_PVMG) {
5944 MAGIC* const mg = SvMAGIC(sv);
5945 MAGIC* mgt;
5946 PL_tainted = 1;
5947 SvMAGIC_set(sv, mg->mg_moremagic);
5948 SvTAINT(sv);
5949 if ((mgt = SvMAGIC(sv))) {
5950 mg->mg_moremagic = mgt;
5951 SvMAGIC_set(sv, mg);
44a2ac75 5952 }
cde0cee5
YO
5953 } else {
5954 PL_tainted = 1;
5955 SvTAINT(sv);
5956 }
5957 } else
5958 SvTAINTED_off(sv);
44a2ac75 5959 }
81714fb9 5960 } else {
44a2ac75 5961 sv_setsv(sv,&PL_sv_undef);
49d7dfbc 5962 return;
81714fb9
YO
5963 }
5964}
93b32b6d 5965
2fdbfb4d
AB
5966void
5967Perl_reg_numbered_buff_store(pTHX_ REGEXP * const rx, const I32 paren,
5968 SV const * const value)
5969{
7918f24d
NC
5970 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_STORE;
5971
2fdbfb4d
AB
5972 PERL_UNUSED_ARG(rx);
5973 PERL_UNUSED_ARG(paren);
5974 PERL_UNUSED_ARG(value);
5975
5976 if (!PL_localizing)
6ad8f254 5977 Perl_croak_no_modify(aTHX);
2fdbfb4d
AB
5978}
5979
5980I32
288b8c02 5981Perl_reg_numbered_buff_length(pTHX_ REGEXP * const r, const SV * const sv,
2fdbfb4d
AB
5982 const I32 paren)
5983{
288b8c02 5984 struct regexp *const rx = (struct regexp *)SvANY(r);
2fdbfb4d
AB
5985 I32 i;
5986 I32 s1, t1;
5987
7918f24d
NC
5988 PERL_ARGS_ASSERT_REG_NUMBERED_BUFF_LENGTH;
5989
2fdbfb4d
AB
5990 /* Some of this code was originally in C<Perl_magic_len> in F<mg.c> */
5991 switch (paren) {
192b9cd1 5992 /* $` / ${^PREMATCH} */
f1b875a0 5993 case RX_BUFF_IDX_PREMATCH:
2fdbfb4d
AB
5994 if (rx->offs[0].start != -1) {
5995 i = rx->offs[0].start;
5996 if (i > 0) {
5997 s1 = 0;
5998 t1 = i;
5999 goto getlen;
6000 }
6001 }
6002 return 0;
192b9cd1 6003 /* $' / ${^POSTMATCH} */
f1b875a0 6004 case RX_BUFF_IDX_POSTMATCH:
2fdbfb4d
AB
6005 if (rx->offs[0].end != -1) {
6006 i = rx->sublen - rx->offs[0].end;
6007 if (i > 0) {
6008 s1 = rx->offs[0].end;
6009 t1 = rx->sublen;
6010 goto getlen;
6011 }
6012 }
6013 return 0;
192b9cd1
AB
6014 /* $& / ${^MATCH}, $1, $2, ... */
6015 default:
2fdbfb4d
AB
6016 if (paren <= (I32)rx->nparens &&
6017 (s1 = rx->offs[paren].start) != -1 &&
6018 (t1 = rx->offs[paren].end) != -1)
6019 {
6020 i = t1 - s1;
6021 goto getlen;
6022 } else {
6023 if (ckWARN(WARN_UNINITIALIZED))
ad64d0ec 6024 report_uninit((const SV *)sv);
2fdbfb4d
AB
6025 return 0;
6026 }
6027 }
6028 getlen:
07bc277f 6029 if (i > 0 && RXp_MATCH_UTF8(rx)) {
2fdbfb4d
AB
6030 const char * const s = rx->subbeg + s1;
6031 const U8 *ep;
6032 STRLEN el;
6033
6034 i = t1 - s1;
6035 if (is_utf8_string_loclen((U8*)s, i, &ep, &el))
6036 i = el;
6037 }
6038 return i;
6039}
6040
fe578d7f 6041SV*
49d7dfbc 6042Perl_reg_qr_package(pTHX_ REGEXP * const rx)
fe578d7f 6043{
7918f24d 6044 PERL_ARGS_ASSERT_REG_QR_PACKAGE;
fe578d7f 6045 PERL_UNUSED_ARG(rx);
0fc92fc6
YO
6046 if (0)
6047 return NULL;
6048 else
6049 return newSVpvs("Regexp");
fe578d7f 6050}
0a4db386 6051
894be9b7 6052/* Scans the name of a named buffer from the pattern.
0a4db386
YO
6053 * If flags is REG_RSN_RETURN_NULL returns null.
6054 * If flags is REG_RSN_RETURN_NAME returns an SV* containing the name
6055 * If flags is REG_RSN_RETURN_DATA returns the data SV* corresponding
6056 * to the parsed name as looked up in the RExC_paren_names hash.
6057 * If there is an error throws a vFAIL().. type exception.
894be9b7 6058 */
0a4db386
YO
6059
6060#define REG_RSN_RETURN_NULL 0
6061#define REG_RSN_RETURN_NAME 1
6062#define REG_RSN_RETURN_DATA 2
6063
894be9b7 6064STATIC SV*
7918f24d
NC
6065S_reg_scan_name(pTHX_ RExC_state_t *pRExC_state, U32 flags)
6066{
894be9b7 6067 char *name_start = RExC_parse;
1f1031fe 6068
7918f24d
NC
6069 PERL_ARGS_ASSERT_REG_SCAN_NAME;
6070
1f1031fe
YO
6071 if (isIDFIRST_lazy_if(RExC_parse, UTF)) {
6072 /* skip IDFIRST by using do...while */
6073 if (UTF)
6074 do {
6075 RExC_parse += UTF8SKIP(RExC_parse);
6076 } while (isALNUM_utf8((U8*)RExC_parse));
6077 else
6078 do {
6079 RExC_parse++;
6080 } while (isALNUM(*RExC_parse));
894be9b7 6081 }
1f1031fe 6082
0a4db386 6083 if ( flags ) {
59cd0e26
NC
6084 SV* sv_name
6085 = newSVpvn_flags(name_start, (int)(RExC_parse - name_start),
6086 SVs_TEMP | (UTF ? SVf_UTF8 : 0));
0a4db386
YO
6087 if ( flags == REG_RSN_RETURN_NAME)
6088 return sv_name;
6089 else if (flags==REG_RSN_RETURN_DATA) {
6090 HE *he_str = NULL;
6091 SV *sv_dat = NULL;
6092 if ( ! sv_name ) /* should not happen*/
6093 Perl_croak(aTHX_ "panic: no svname in reg_scan_name");
6094 if (RExC_paren_names)
6095 he_str = hv_fetch_ent( RExC_paren_names, sv_name, 0, 0 );
6096 if ( he_str )
6097 sv_dat = HeVAL(he_str);
6098 if ( ! sv_dat )
6099 vFAIL("Reference to nonexistent named group");
6100 return sv_dat;
6101 }
6102 else {
5637ef5b
NC
6103 Perl_croak(aTHX_ "panic: bad flag %lx in reg_scan_name",
6104 (unsigned long) flags);
0a4db386
YO
6105 }
6106 /* NOT REACHED */
894be9b7 6107 }
0a4db386 6108 return NULL;
894be9b7
YO
6109}
6110
3dab1dad
YO
6111#define DEBUG_PARSE_MSG(funcname) DEBUG_PARSE_r({ \
6112 int rem=(int)(RExC_end - RExC_parse); \
6113 int cut; \
6114 int num; \
6115 int iscut=0; \
6116 if (rem>10) { \
6117 rem=10; \
6118 iscut=1; \
6119 } \
6120 cut=10-rem; \
6121 if (RExC_lastparse!=RExC_parse) \
6122 PerlIO_printf(Perl_debug_log," >%.*s%-*s", \
6123 rem, RExC_parse, \
6124 cut + 4, \
6125 iscut ? "..." : "<" \
6126 ); \
6127 else \
6128 PerlIO_printf(Perl_debug_log,"%16s",""); \
6129 \
6130 if (SIZE_ONLY) \
3b57cd43 6131 num = RExC_size + 1; \
3dab1dad
YO
6132 else \
6133 num=REG_NODE_NUM(RExC_emit); \
6134 if (RExC_lastnum!=num) \
0a4db386 6135 PerlIO_printf(Perl_debug_log,"|%4d",num); \
3dab1dad 6136 else \
0a4db386 6137 PerlIO_printf(Perl_debug_log,"|%4s",""); \
be8e71aa
YO
6138 PerlIO_printf(Perl_debug_log,"|%*s%-4s", \
6139 (int)((depth*2)), "", \
3dab1dad
YO
6140 (funcname) \
6141 ); \
6142 RExC_lastnum=num; \
6143 RExC_lastparse=RExC_parse; \
6144})
6145
07be1b83
YO
6146
6147
3dab1dad
YO
6148#define DEBUG_PARSE(funcname) DEBUG_PARSE_r({ \
6149 DEBUG_PARSE_MSG((funcname)); \
6150 PerlIO_printf(Perl_debug_log,"%4s","\n"); \
6151})
6bda09f9
YO
6152#define DEBUG_PARSE_FMT(funcname,fmt,args) DEBUG_PARSE_r({ \
6153 DEBUG_PARSE_MSG((funcname)); \
6154 PerlIO_printf(Perl_debug_log,fmt "\n",args); \
6155})
d764b54e
KW
6156
6157/* This section of code defines the inversion list object and its methods. The
6158 * interfaces are highly subject to change, so as much as possible is static to
fa2d2a23
KW
6159 * this file. An inversion list is here implemented as a malloc'd C UV array
6160 * with some added info that is placed as UVs at the beginning in a header
6161 * portion. An inversion list for Unicode is an array of code points, sorted
6162 * by ordinal number. The zeroth element is the first code point in the list.
6163 * The 1th element is the first element beyond that not in the list. In other
6164 * words, the first range is
6165 * invlist[0]..(invlist[1]-1)
dbe7a391
KW
6166 * The other ranges follow. Thus every element whose index is divisible by two
6167 * marks the beginning of a range that is in the list, and every element not
fa2d2a23
KW
6168 * divisible by two marks the beginning of a range not in the list. A single
6169 * element inversion list that contains the single code point N generally
6170 * consists of two elements
6171 * invlist[0] == N
6172 * invlist[1] == N+1
6173 * (The exception is when N is the highest representable value on the
6174 * machine, in which case the list containing just it would be a single
6175 * element, itself. By extension, if the last range in the list extends to
6176 * infinity, then the first element of that range will be in the inversion list
6177 * at a position that is divisible by two, and is the final element in the
6178 * list.)
f1b67122
KW
6179 * Taking the complement (inverting) an inversion list is quite simple, if the
6180 * first element is 0, remove it; otherwise add a 0 element at the beginning.
6181 * This implementation reserves an element at the beginning of each inversion list
6182 * to contain 0 when the list contains 0, and contains 1 otherwise. The actual
6183 * beginning of the list is either that element if 0, or the next one if 1.
6184 *
fa2d2a23
KW
6185 * More about inversion lists can be found in "Unicode Demystified"
6186 * Chapter 13 by Richard Gillam, published by Addison-Wesley.
97b14ce7 6187 * More will be coming when functionality is added later.
d764b54e 6188 *
fa2d2a23
KW
6189 * The inversion list data structure is currently implemented as an SV pointing
6190 * to an array of UVs that the SV thinks are bytes. This allows us to have an
6191 * array of UV whose memory management is automatically handled by the existing
6192 * facilities for SV's.
62672576 6193 *
d764b54e
KW
6194 * Some of the methods should always be private to the implementation, and some
6195 * should eventually be made public */
6196
fa2d2a23
KW
6197#define INVLIST_LEN_OFFSET 0 /* Number of elements in the inversion list */
6198#define INVLIST_ITER_OFFSET 1 /* Current iteration position */
6199
f59ff194
KW
6200/* This is a combination of a version and data structure type, so that one
6201 * being passed in can be validated to be an inversion list of the correct
6202 * vintage. When the structure of the header is changed, a new random number
6203 * in the range 2**31-1 should be generated and the new() method changed to
6204 * insert that at this location. Then, if an auxiliary program doesn't change
6205 * correspondingly, it will be discovered immediately */
6206#define INVLIST_VERSION_ID_OFFSET 2
6207#define INVLIST_VERSION_ID 1064334010
6208
6209/* For safety, when adding new elements, remember to #undef them at the end of
6210 * the inversion list code section */
6211
6212#define INVLIST_ZERO_OFFSET 3 /* 0 or 1; must be last element in header */
f1b67122
KW
6213/* The UV at position ZERO contains either 0 or 1. If 0, the inversion list
6214 * contains the code point U+00000, and begins here. If 1, the inversion list
6215 * doesn't contain U+0000, and it begins at the next UV in the array.
6216 * Inverting an inversion list consists of adding or removing the 0 at the
6217 * beginning of it. By reserving a space for that 0, inversion can be made
6218 * very fast */
6219
6220#define HEADER_LENGTH (INVLIST_ZERO_OFFSET + 1)
97b14ce7
KW
6221
6222/* Internally things are UVs */
6223#define TO_INTERNAL_SIZE(x) ((x + HEADER_LENGTH) * sizeof(UV))
6224#define FROM_INTERNAL_SIZE(x) ((x / sizeof(UV)) - HEADER_LENGTH)
6225
d764b54e 6226#define INVLIST_INITIAL_LEN 10
d764b54e
KW
6227
6228PERL_STATIC_INLINE UV*
f1b67122
KW
6229S__invlist_array_init(pTHX_ SV* const invlist, const bool will_have_0)
6230{
6231 /* Returns a pointer to the first element in the inversion list's array.
6232 * This is called upon initialization of an inversion list. Where the
6233 * array begins depends on whether the list has the code point U+0000
6234 * in it or not. The other parameter tells it whether the code that
6235 * follows this call is about to put a 0 in the inversion list or not.
6236 * The first element is either the element with 0, if 0, or the next one,
6237 * if 1 */
6238
6239 UV* zero = get_invlist_zero_addr(invlist);
6240
6241 PERL_ARGS_ASSERT__INVLIST_ARRAY_INIT;
6242
6243 /* Must be empty */
6244 assert(! *get_invlist_len_addr(invlist));
6245
6246 /* 1^1 = 0; 1^0 = 1 */
6247 *zero = 1 ^ will_have_0;
6248 return zero + *zero;
6249}
6250
6251PERL_STATIC_INLINE UV*
a25abddc 6252S_invlist_array(pTHX_ SV* const invlist)
d764b54e
KW
6253{
6254 /* Returns the pointer to the inversion list's array. Every time the
6255 * length changes, this needs to be called in case malloc or realloc moved
6256 * it */
6257
d764b54e
KW
6258 PERL_ARGS_ASSERT_INVLIST_ARRAY;
6259
dbe7a391
KW
6260 /* Must not be empty. If these fail, you probably didn't check for <len>
6261 * being non-zero before trying to get the array */
f1b67122
KW
6262 assert(*get_invlist_len_addr(invlist));
6263 assert(*get_invlist_zero_addr(invlist) == 0
6264 || *get_invlist_zero_addr(invlist) == 1);
6265
6266 /* The array begins either at the element reserved for zero if the
6267 * list contains 0 (that element will be set to 0), or otherwise the next
6268 * element (in which case the reserved element will be set to 1). */
6269 return (UV *) (get_invlist_zero_addr(invlist)
6270 + *get_invlist_zero_addr(invlist));
d764b54e
KW
6271}
6272
61bdbf38
KW
6273PERL_STATIC_INLINE UV*
6274S_get_invlist_len_addr(pTHX_ SV* invlist)
6275{
6276 /* Return the address of the UV that contains the current number
6277 * of used elements in the inversion list */
6278
6279 PERL_ARGS_ASSERT_GET_INVLIST_LEN_ADDR;
6280
6281 return (UV *) (SvPVX(invlist) + (INVLIST_LEN_OFFSET * sizeof (UV)));
6282}
6283
d764b54e 6284PERL_STATIC_INLINE UV
a25abddc 6285S_invlist_len(pTHX_ SV* const invlist)
d764b54e 6286{
dbe7a391
KW
6287 /* Returns the current number of elements stored in the inversion list's
6288 * array */
d764b54e 6289
d764b54e
KW
6290 PERL_ARGS_ASSERT_INVLIST_LEN;
6291
61bdbf38 6292 return *get_invlist_len_addr(invlist);
d764b54e
KW
6293}
6294
c56a880b
KW
6295PERL_STATIC_INLINE void
6296S_invlist_set_len(pTHX_ SV* const invlist, const UV len)
6297{
6298 /* Sets the current number of elements stored in the inversion list */
6299
6300 PERL_ARGS_ASSERT_INVLIST_SET_LEN;
6301
c56a880b 6302 *get_invlist_len_addr(invlist) = len;
f1b67122 6303
32f89ef6
KW
6304 assert(len <= SvLEN(invlist));
6305
f1b67122
KW
6306 SvCUR_set(invlist, TO_INTERNAL_SIZE(len));
6307 /* If the list contains U+0000, that element is part of the header,
6308 * and should not be counted as part of the array. It will contain
6309 * 0 in that case, and 1 otherwise. So we could flop 0=>1, 1=>0 and
6310 * subtract:
6311 * SvCUR_set(invlist,
6312 * TO_INTERNAL_SIZE(len
6313 * - (*get_invlist_zero_addr(inv_list) ^ 1)));
6314 * But, this is only valid if len is not 0. The consequences of not doing
9479a769
KW
6315 * this is that the memory allocation code may think that 1 more UV is
6316 * being used than actually is, and so might do an unnecessary grow. That
6317 * seems worth not bothering to make this the precise amount.
25e94a65
KW
6318 *
6319 * Note that when inverting, SvCUR shouldn't change */
c56a880b
KW
6320}
6321
d764b54e 6322PERL_STATIC_INLINE UV
a25abddc 6323S_invlist_max(pTHX_ SV* const invlist)
d764b54e
KW
6324{
6325 /* Returns the maximum number of elements storable in the inversion list's
6326 * array, without having to realloc() */
6327
d764b54e
KW
6328 PERL_ARGS_ASSERT_INVLIST_MAX;
6329
005b65ed 6330 return FROM_INTERNAL_SIZE(SvLEN(invlist));
d764b54e
KW
6331}
6332
f1b67122
KW
6333PERL_STATIC_INLINE UV*
6334S_get_invlist_zero_addr(pTHX_ SV* invlist)
6335{
6336 /* Return the address of the UV that is reserved to hold 0 if the inversion
6337 * list contains 0. This has to be the last element of the heading, as the
6338 * list proper starts with either it if 0, or the next element if not.
6339 * (But we force it to contain either 0 or 1) */
6340
6341 PERL_ARGS_ASSERT_GET_INVLIST_ZERO_ADDR;
6342
6343 return (UV *) (SvPVX(invlist) + (INVLIST_ZERO_OFFSET * sizeof (UV)));
6344}
d764b54e 6345
8d69a883 6346#ifndef PERL_IN_XSUB_RE
a25abddc 6347SV*
d764b54e
KW
6348Perl__new_invlist(pTHX_ IV initial_size)
6349{
6350
6351 /* Return a pointer to a newly constructed inversion list, with enough
6352 * space to store 'initial_size' elements. If that number is negative, a
6353 * system default is used instead */
6354
97b14ce7
KW
6355 SV* new_list;
6356
d764b54e
KW
6357 if (initial_size < 0) {
6358 initial_size = INVLIST_INITIAL_LEN;
6359 }
6360
6361 /* Allocate the initial space */
97b14ce7
KW
6362 new_list = newSV(TO_INTERNAL_SIZE(initial_size));
6363 invlist_set_len(new_list, 0);
6364
f3dc70d1
KW
6365 /* Force iterinit() to be used to get iteration to work */
6366 *get_invlist_iter_addr(new_list) = UV_MAX;
6367
f1b67122
KW
6368 /* This should force a segfault if a method doesn't initialize this
6369 * properly */
6370 *get_invlist_zero_addr(new_list) = UV_MAX;
6371
f59ff194
KW
6372 *get_invlist_version_id_addr(new_list) = INVLIST_VERSION_ID;
6373#if HEADER_LENGTH != 4
6374# error Need to regenerate VERSION_ID by running perl -E 'say int(rand 2**31-1)', and then changing the #if to the new length
6375#endif
6376
97b14ce7 6377 return new_list;
d764b54e 6378}
8d69a883 6379#endif
d764b54e 6380
f59ff194
KW
6381STATIC SV*
6382S__new_invlist_C_array(pTHX_ UV* list)
6383{
6384 /* Return a pointer to a newly constructed inversion list, initialized to
6385 * point to <list>, which has to be in the exact correct inversion list
6386 * form, including internal fields. Thus this is a dangerous routine that
6387 * should not be used in the wrong hands */
6388
6389 SV* invlist = newSV_type(SVt_PV);
6390
6391 PERL_ARGS_ASSERT__NEW_INVLIST_C_ARRAY;
6392
6393 SvPV_set(invlist, (char *) list);
6394 SvLEN_set(invlist, 0); /* Means we own the contents, and the system
6395 shouldn't touch it */
6396 SvCUR_set(invlist, TO_INTERNAL_SIZE(invlist_len(invlist)));
6397
6398 if (*get_invlist_version_id_addr(invlist) != INVLIST_VERSION_ID) {
6399 Perl_croak(aTHX_ "panic: Incorrect version for previously generated inversion list");
6400 }
6401
6402 return invlist;
6403}
6404
d764b54e 6405STATIC void
a25abddc 6406S_invlist_extend(pTHX_ SV* const invlist, const UV new_max)
d764b54e 6407{
62672576 6408 /* Grow the maximum size of an inversion list */
d764b54e
KW
6409
6410 PERL_ARGS_ASSERT_INVLIST_EXTEND;
6411
005b65ed 6412 SvGROW((SV *)invlist, TO_INTERNAL_SIZE(new_max));
d764b54e
KW
6413}
6414
6415PERL_STATIC_INLINE void
a25abddc 6416S_invlist_trim(pTHX_ SV* const invlist)
d764b54e
KW
6417{
6418 PERL_ARGS_ASSERT_INVLIST_TRIM;
6419
6420 /* Change the length of the inversion list to how many entries it currently
6421 * has */
6422
62672576 6423 SvPV_shrink_to_cur((SV *) invlist);
d764b54e
KW
6424}
6425
6426/* An element is in an inversion list iff its index is even numbered: 0, 2, 4,
6427 * etc */
4096c37b
KW
6428#define ELEMENT_RANGE_MATCHES_INVLIST(i) (! ((i) & 1))
6429#define PREV_RANGE_MATCHES_INVLIST(i) (! ELEMENT_RANGE_MATCHES_INVLIST(i))
d764b54e 6430
8dc9348a
KW
6431#define _invlist_union_complement_2nd(a, b, output) _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
6432
8d69a883 6433#ifndef PERL_IN_XSUB_RE
d764b54e 6434void
a25abddc 6435Perl__append_range_to_invlist(pTHX_ SV* const invlist, const UV start, const UV end)
d764b54e
KW
6436{
6437 /* Subject to change or removal. Append the range from 'start' to 'end' at
6438 * the end of the inversion list. The range must be above any existing
6439 * ones. */
6440
f1b67122 6441 UV* array;
d764b54e
KW
6442 UV max = invlist_max(invlist);
6443 UV len = invlist_len(invlist);
6444
6445 PERL_ARGS_ASSERT__APPEND_RANGE_TO_INVLIST;
6446
f1b67122
KW
6447 if (len == 0) { /* Empty lists must be initialized */
6448 array = _invlist_array_init(invlist, start == 0);
6449 }
6450 else {
d764b54e
KW
6451 /* Here, the existing list is non-empty. The current max entry in the
6452 * list is generally the first value not in the set, except when the
6453 * set extends to the end of permissible values, in which case it is
6454 * the first entry in that final set, and so this call is an attempt to
6455 * append out-of-order */
6456
6457 UV final_element = len - 1;
f1b67122 6458 array = invlist_array(invlist);
d764b54e 6459 if (array[final_element] > start
4096c37b 6460 || ELEMENT_RANGE_MATCHES_INVLIST(final_element))
d764b54e 6461 {
5637ef5b
NC
6462 Perl_croak(aTHX_ "panic: attempting to append to an inversion list, but wasn't at the end of the list, final=%"UVuf", start=%"UVuf", match=%c",
6463 array[final_element], start,
6464 ELEMENT_RANGE_MATCHES_INVLIST(final_element) ? 't' : 'f');
d764b54e
KW
6465 }
6466
6467 /* Here, it is a legal append. If the new range begins with the first
6468 * value not in the set, it is extending the set, so the new first
6469 * value not in the set is one greater than the newly extended range.
6470 * */
6471 if (array[final_element] == start) {
6472 if (end != UV_MAX) {
6473 array[final_element] = end + 1;
6474 }
6475 else {
6476 /* But if the end is the maximum representable on the machine,
dbe7a391 6477 * just let the range that this would extend to have no end */
d764b54e
KW
6478 invlist_set_len(invlist, len - 1);
6479 }
6480 return;
6481 }
6482 }
6483
6484 /* Here the new range doesn't extend any existing set. Add it */
6485
6486 len += 2; /* Includes an element each for the start and end of range */
6487
6488 /* If overflows the existing space, extend, which may cause the array to be
6489 * moved */
6490 if (max < len) {
6491 invlist_extend(invlist, len);
f1b67122
KW
6492 invlist_set_len(invlist, len); /* Have to set len here to avoid assert
6493 failure in invlist_array() */
d764b54e
KW
6494 array = invlist_array(invlist);
6495 }
f1b67122
KW
6496 else {
6497 invlist_set_len(invlist, len);
6498 }
d764b54e
KW
6499
6500 /* The next item on the list starts the range, the one after that is
6501 * one past the new range. */
6502 array[len - 2] = start;
6503 if (end != UV_MAX) {
6504 array[len - 1] = end + 1;
6505 }
6506 else {
6507 /* But if the end is the maximum representable on the machine, just let
6508 * the range have no end */
6509 invlist_set_len(invlist, len - 1);
6510 }
6511}
6512
d5e82ecc
KW
6513STATIC IV
6514S_invlist_search(pTHX_ SV* const invlist, const UV cp)
6515{
6516 /* Searches the inversion list for the entry that contains the input code
6517 * point <cp>. If <cp> is not in the list, -1 is returned. Otherwise, the
6518 * return value is the index into the list's array of the range that
6519 * contains <cp> */
6520
6521 IV low = 0;
6522 IV high = invlist_len(invlist);
6523 const UV * const array = invlist_array(invlist);
6524
6525 PERL_ARGS_ASSERT_INVLIST_SEARCH;
6526
6527 /* If list is empty or the code point is before the first element, return
6528 * failure. */
6529 if (high == 0 || cp < array[0]) {
6530 return -1;
6531 }
6532
6533 /* Binary search. What we are looking for is <i> such that
6534 * array[i] <= cp < array[i+1]
6535 * The loop below converges on the i+1. */
6536 while (low < high) {
6537 IV mid = (low + high) / 2;
6538 if (array[mid] <= cp) {
6539 low = mid + 1;
6540
6541 /* We could do this extra test to exit the loop early.
6542 if (cp < array[low]) {
6543 return mid;
6544 }
6545 */
6546 }
6547 else { /* cp < array[mid] */
6548 high = mid;
6549 }
6550 }
6551
6552 return high - 1;
6553}
6554
86f766ab 6555void
b6a0ff33
KW
6556Perl__invlist_populate_swatch(pTHX_ SV* const invlist, const UV start, const UV end, U8* swatch)
6557{
6558 /* populates a swatch of a swash the same way swatch_get() does in utf8.c,
6559 * but is used when the swash has an inversion list. This makes this much
6560 * faster, as it uses a binary search instead of a linear one. This is
6561 * intimately tied to that function, and perhaps should be in utf8.c,
6562 * except it is intimately tied to inversion lists as well. It assumes
6563 * that <swatch> is all 0's on input */
6564
6565 UV current = start;
6566 const IV len = invlist_len(invlist);
6567 IV i;
6568 const UV * array;
6569
6570 PERL_ARGS_ASSERT__INVLIST_POPULATE_SWATCH;
6571
6572 if (len == 0) { /* Empty inversion list */
6573 return;
6574 }
6575
6576 array = invlist_array(invlist);
6577
6578 /* Find which element it is */
6579 i = invlist_search(invlist, start);
6580
6581 /* We populate from <start> to <end> */
6582 while (current < end) {
6583 UV upper;
6584
6585 /* The inversion list gives the results for every possible code point
6586 * after the first one in the list. Only those ranges whose index is
6587 * even are ones that the inversion list matches. For the odd ones,
6588 * and if the initial code point is not in the list, we have to skip
6589 * forward to the next element */
6590 if (i == -1 || ! ELEMENT_RANGE_MATCHES_INVLIST(i)) {
6591 i++;
6592 if (i >= len) { /* Finished if beyond the end of the array */
6593 return;
6594 }
6595 current = array[i];
6596 if (current >= end) { /* Finished if beyond the end of what we
6597 are populating */
6598 return;
6599 }
6600 }
6601 assert(current >= start);
6602
6603 /* The current range ends one below the next one, except don't go past
6604 * <end> */
6605 i++;
6606 upper = (i < len && array[i] < end) ? array[i] : end;
6607
6608 /* Here we are in a range that matches. Populate a bit in the 3-bit U8
6609 * for each code point in it */
6610 for (; current < upper; current++) {
6611 const STRLEN offset = (STRLEN)(current - start);
6612 swatch[offset >> 3] |= 1 << (offset & 7);
6613 }
6614
6615 /* Quit if at the end of the list */
6616 if (i >= len) {
6617
6618 /* But first, have to deal with the highest possible code point on
6619 * the platform. The previous code assumes that <end> is one
6620 * beyond where we want to populate, but that is impossible at the
6621 * platform's infinity, so have to handle it specially */
6622 if (UNLIKELY(end == UV_MAX && ELEMENT_RANGE_MATCHES_INVLIST(len-1)))
6623 {
6624 const STRLEN offset = (STRLEN)(end - start);
6625 swatch[offset >> 3] |= 1 << (offset & 7);
6626 }
6627 return;
6628 }
6629
6630 /* Advance to the next range, which will be for code points not in the
6631 * inversion list */
6632 current = array[i];
6633 }
6634
6635 return;
6636}
6637
8dc9348a 6638
b6a0ff33 6639void
164173a2 6640Perl__invlist_union_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** output)
d764b54e 6641{
4065ba03
KW
6642 /* Take the union of two inversion lists and point <output> to it. *output
6643 * should be defined upon input, and if it points to one of the two lists,
f45adb79
KW
6644 * the reference count to that list will be decremented. The first list,
6645 * <a>, may be NULL, in which case a copy of the second list is returned.
164173a2
KW
6646 * If <complement_b> is TRUE, the union is taken of the complement
6647 * (inversion) of <b> instead of b itself.
f45adb79 6648 *
d764b54e
KW
6649 * The basis for this comes from "Unicode Demystified" Chapter 13 by
6650 * Richard Gillam, published by Addison-Wesley, and explained at some
6651 * length there. The preface says to incorporate its examples into your
6652 * code at your own risk.
6653 *
6654 * The algorithm is like a merge sort.
6655 *
6656 * XXX A potential performance improvement is to keep track as we go along
6657 * if only one of the inputs contributes to the result, meaning the other
6658 * is a subset of that one. In that case, we can skip the final copy and
a2995b7f
KW
6659 * return the larger of the input lists, but then outside code might need
6660 * to keep track of whether to free the input list or not */
d764b54e 6661
f1b67122
KW
6662 UV* array_a; /* a's array */
6663 UV* array_b;
6664 UV len_a; /* length of a's array */
6665 UV len_b;
d764b54e 6666
a25abddc 6667 SV* u; /* the resulting union */
d764b54e
KW
6668 UV* array_u;
6669 UV len_u;
6670
6671 UV i_a = 0; /* current index into a's array */
6672 UV i_b = 0;
6673 UV i_u = 0;
6674
6675 /* running count, as explained in the algorithm source book; items are
6676 * stopped accumulating and are output when the count changes to/from 0.
6677 * The count is incremented when we start a range that's in the set, and
6678 * decremented when we start a range that's not in the set. So its range
6679 * is 0 to 2. Only when the count is zero is something not in the set.
6680 */
6681 UV count = 0;
6682
164173a2 6683 PERL_ARGS_ASSERT__INVLIST_UNION_MAYBE_COMPLEMENT_2ND;
60825692 6684 assert(a != b);
d764b54e 6685
f1b67122 6686 /* If either one is empty, the union is the other one */
f45adb79 6687 if (a == NULL || ((len_a = invlist_len(a)) == 0)) {
4065ba03 6688 if (*output == a) {
f45adb79
KW
6689 if (a != NULL) {
6690 SvREFCNT_dec(a);
6691 }
f1b67122 6692 }
4065ba03 6693 if (*output != b) {
f1b67122 6694 *output = invlist_clone(b);
164173a2
KW
6695 if (complement_b) {
6696 _invlist_invert(*output);
6697 }
dbe7a391 6698 } /* else *output already = b; */
f1b67122
KW
6699 return;
6700 }
6701 else if ((len_b = invlist_len(b)) == 0) {
4065ba03 6702 if (*output == b) {
f1b67122
KW
6703 SvREFCNT_dec(b);
6704 }
164173a2
KW
6705
6706 /* The complement of an empty list is a list that has everything in it,
6707 * so the union with <a> includes everything too */
6708 if (complement_b) {
6709 if (a == *output) {
6710 SvREFCNT_dec(a);
6711 }
6712 *output = _new_invlist(1);
6713 _append_range_to_invlist(*output, 0, UV_MAX);
6714 }
6715 else if (*output != a) {
6716 *output = invlist_clone(a);
6717 }
6718 /* else *output already = a; */
f1b67122
KW
6719 return;
6720 }
6721
6722 /* Here both lists exist and are non-empty */
6723 array_a = invlist_array(a);
6724 array_b = invlist_array(b);
6725
164173a2
KW
6726 /* If are to take the union of 'a' with the complement of b, set it
6727 * up so are looking at b's complement. */
6728 if (complement_b) {
6729
6730 /* To complement, we invert: if the first element is 0, remove it. To
6731 * do this, we just pretend the array starts one later, and clear the
6732 * flag as we don't have to do anything else later */
6733 if (array_b[0] == 0) {
6734 array_b++;
6735 len_b--;
6736 complement_b = FALSE;
6737 }
6738 else {
6739
6740 /* But if the first element is not zero, we unshift a 0 before the
6741 * array. The data structure reserves a space for that 0 (which
6742 * should be a '1' right now), so physical shifting is unneeded,
6743 * but temporarily change that element to 0. Before exiting the
6744 * routine, we must restore the element to '1' */
6745 array_b--;
6746 len_b++;
6747 array_b[0] = 0;
6748 }
6749 }
6750
d764b54e
KW
6751 /* Size the union for the worst case: that the sets are completely
6752 * disjoint */
6753 u = _new_invlist(len_a + len_b);
f1b67122
KW
6754
6755 /* Will contain U+0000 if either component does */
6756 array_u = _invlist_array_init(u, (len_a > 0 && array_a[0] == 0)
6757 || (len_b > 0 && array_b[0] == 0));
d764b54e
KW
6758
6759 /* Go through each list item by item, stopping when exhausted one of
6760 * them */
6761 while (i_a < len_a && i_b < len_b) {
6762 UV cp; /* The element to potentially add to the union's array */
6763 bool cp_in_set; /* is it in the the input list's set or not */
6764
6765 /* We need to take one or the other of the two inputs for the union.
6766 * Since we are merging two sorted lists, we take the smaller of the
6767 * next items. In case of a tie, we take the one that is in its set
6768 * first. If we took one not in the set first, it would decrement the
6769 * count, possibly to 0 which would cause it to be output as ending the
6770 * range, and the next time through we would take the same number, and
6771 * output it again as beginning the next range. By doing it the
6772 * opposite way, there is no possibility that the count will be
6773 * momentarily decremented to 0, and thus the two adjoining ranges will
6774 * be seamlessly merged. (In a tie and both are in the set or both not
6775 * in the set, it doesn't matter which we take first.) */
6776 if (array_a[i_a] < array_b[i_b]
4096c37b
KW
6777 || (array_a[i_a] == array_b[i_b]
6778 && ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
d764b54e 6779 {
4096c37b 6780 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
d764b54e
KW
6781 cp= array_a[i_a++];
6782 }
6783 else {
4096c37b 6784 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
d764b54e
KW
6785 cp= array_b[i_b++];
6786 }
6787
6788 /* Here, have chosen which of the two inputs to look at. Only output
6789 * if the running count changes to/from 0, which marks the
6790 * beginning/end of a range in that's in the set */
6791 if (cp_in_set) {
6792 if (count == 0) {
6793 array_u[i_u++] = cp;
6794 }
6795 count++;
6796 }
6797 else {
6798 count--;
6799 if (count == 0) {
6800 array_u[i_u++] = cp;
6801 }
6802 }
6803 }
6804
6805 /* Here, we are finished going through at least one of the lists, which
6806 * means there is something remaining in at most one. We check if the list
6807 * that hasn't been exhausted is positioned such that we are in the middle
bac5f0ae
KW
6808 * of a range in its set or not. (i_a and i_b point to the element beyond
6809 * the one we care about.) If in the set, we decrement 'count'; if 0, there
6810 * is potentially more to output.
d764b54e
KW
6811 * There are four cases:
6812 * 1) Both weren't in their sets, count is 0, and remains 0. What's left
6813 * in the union is entirely from the non-exhausted set.
6814 * 2) Both were in their sets, count is 2. Nothing further should
6815 * be output, as everything that remains will be in the exhausted
6816 * list's set, hence in the union; decrementing to 1 but not 0 insures
6817 * that
6818 * 3) the exhausted was in its set, non-exhausted isn't, count is 1.
6819 * Nothing further should be output because the union includes
bac5f0ae 6820 * everything from the exhausted set. Not decrementing ensures that.
d764b54e
KW
6821 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1;
6822 * decrementing to 0 insures that we look at the remainder of the
6823 * non-exhausted set */
4096c37b
KW
6824 if ((i_a != len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
6825 || (i_b != len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
d764b54e
KW
6826 {
6827 count--;
6828 }
6829
6830 /* The final length is what we've output so far, plus what else is about to
6831 * be output. (If 'count' is non-zero, then the input list we exhausted
6832 * has everything remaining up to the machine's limit in its set, and hence
6833 * in the union, so there will be no further output. */
6834 len_u = i_u;
6835 if (count == 0) {
6836 /* At most one of the subexpressions will be non-zero */
6837 len_u += (len_a - i_a) + (len_b - i_b);
6838 }
6839
6840 /* Set result to final length, which can change the pointer to array_u, so
6841 * re-find it */
6842 if (len_u != invlist_len(u)) {
6843 invlist_set_len(u, len_u);
6844 invlist_trim(u);
6845 array_u = invlist_array(u);
6846 }
6847
6848 /* When 'count' is 0, the list that was exhausted (if one was shorter than
6849 * the other) ended with everything above it not in its set. That means
6850 * that the remaining part of the union is precisely the same as the
6851 * non-exhausted list, so can just copy it unchanged. (If both list were
6852 * exhausted at the same time, then the operations below will be both 0.)
6853 */
6854 if (count == 0) {
6855 IV copy_count; /* At most one will have a non-zero copy count */
6856 if ((copy_count = len_a - i_a) > 0) {
6857 Copy(array_a + i_a, array_u + i_u, copy_count, UV);
6858 }
6859 else if ((copy_count = len_b - i_b) > 0) {
6860 Copy(array_b + i_b, array_u + i_u, copy_count, UV);
6861 }
6862 }
6863
a2995b7f 6864 /* We may be removing a reference to one of the inputs */
4065ba03 6865 if (a == *output || b == *output) {
a2995b7f
KW
6866 SvREFCNT_dec(*output);
6867 }
6868
164173a2
KW
6869 /* If we've changed b, restore it */
6870 if (complement_b) {
6871 array_b[0] = 1;
6872 }
6873
a2995b7f
KW
6874 *output = u;
6875 return;
d764b54e
KW
6876}
6877
86f766ab 6878void
52ae8f7e 6879Perl__invlist_intersection_maybe_complement_2nd(pTHX_ SV* const a, SV* const b, bool complement_b, SV** i)
d764b54e 6880{
4065ba03
KW
6881 /* Take the intersection of two inversion lists and point <i> to it. *i
6882 * should be defined upon input, and if it points to one of the two lists,
6883 * the reference count to that list will be decremented.
52ae8f7e
KW
6884 * If <complement_b> is TRUE, the result will be the intersection of <a>
6885 * and the complement (or inversion) of <b> instead of <b> directly.
6886 *
a2995b7f
KW
6887 * The basis for this comes from "Unicode Demystified" Chapter 13 by
6888 * Richard Gillam, published by Addison-Wesley, and explained at some
6889 * length there. The preface says to incorporate its examples into your
6890 * code at your own risk. In fact, it had bugs
d764b54e
KW
6891 *
6892 * The algorithm is like a merge sort, and is essentially the same as the
6893 * union above
6894 */
6895
f1b67122
KW
6896 UV* array_a; /* a's array */
6897 UV* array_b;
6898 UV len_a; /* length of a's array */
6899 UV len_b;
d764b54e 6900
a25abddc 6901 SV* r; /* the resulting intersection */
d764b54e
KW
6902 UV* array_r;
6903 UV len_r;
6904
6905 UV i_a = 0; /* current index into a's array */
6906 UV i_b = 0;
6907 UV i_r = 0;
6908
6909 /* running count, as explained in the algorithm source book; items are
6910 * stopped accumulating and are output when the count changes to/from 2.
6911 * The count is incremented when we start a range that's in the set, and
6912 * decremented when we start a range that's not in the set. So its range
6913 * is 0 to 2. Only when the count is 2 is something in the intersection.
6914 */
6915 UV count = 0;
6916
52ae8f7e 6917 PERL_ARGS_ASSERT__INVLIST_INTERSECTION_MAYBE_COMPLEMENT_2ND;
60825692 6918 assert(a != b);
d764b54e 6919
52ae8f7e 6920 /* Special case if either one is empty */
f1b67122
KW
6921 len_a = invlist_len(a);
6922 if ((len_a == 0) || ((len_b = invlist_len(b)) == 0)) {
f1b67122 6923
52ae8f7e
KW
6924 if (len_a != 0 && complement_b) {
6925
6926 /* Here, 'a' is not empty, therefore from the above 'if', 'b' must
6927 * be empty. Here, also we are using 'b's complement, which hence
6928 * must be every possible code point. Thus the intersection is
6929 * simply 'a'. */
6930 if (*i != a) {
6931 *i = invlist_clone(a);
6932
6933 if (*i == b) {
6934 SvREFCNT_dec(b);
6935 }
6936 }
6937 /* else *i is already 'a' */
6938 return;
6939 }
6940
6941 /* Here, 'a' or 'b' is empty and not using the complement of 'b'. The
6942 * intersection must be empty */
4065ba03 6943 if (*i == a) {
f1b67122
KW
6944 SvREFCNT_dec(a);
6945 }
4065ba03 6946 else if (*i == b) {
f1b67122
KW
6947 SvREFCNT_dec(b);
6948 }
2ea86699 6949 *i = _new_invlist(0);
f1b67122
KW
6950 return;
6951 }
6952
6953 /* Here both lists exist and are non-empty */
6954 array_a = invlist_array(a);
6955 array_b = invlist_array(b);
6956
52ae8f7e
KW
6957 /* If are to take the intersection of 'a' with the complement of b, set it
6958 * up so are looking at b's complement. */
6959 if (complement_b) {
6960
6961 /* To complement, we invert: if the first element is 0, remove it. To
6962 * do this, we just pretend the array starts one later, and clear the
6963 * flag as we don't have to do anything else later */
6964 if (array_b[0] == 0) {
6965 array_b++;
6966 len_b--;
6967 complement_b = FALSE;
6968 }
6969 else {
6970
6971 /* But if the first element is not zero, we unshift a 0 before the
6972 * array. The data structure reserves a space for that 0 (which
6973 * should be a '1' right now), so physical shifting is unneeded,
6974 * but temporarily change that element to 0. Before exiting the
6975 * routine, we must restore the element to '1' */
6976 array_b--;
6977 len_b++;
6978 array_b[0] = 0;
6979 }
6980 }
6981
d764b54e
KW
6982 /* Size the intersection for the worst case: that the intersection ends up
6983 * fragmenting everything to be completely disjoint */
6984 r= _new_invlist(len_a + len_b);
f1b67122
KW
6985
6986 /* Will contain U+0000 iff both components do */
6987 array_r = _invlist_array_init(r, len_a > 0 && array_a[0] == 0
6988 && len_b > 0 && array_b[0] == 0);
d764b54e
KW
6989
6990 /* Go through each list item by item, stopping when exhausted one of
6991 * them */
6992 while (i_a < len_a && i_b < len_b) {
6993 UV cp; /* The element to potentially add to the intersection's
6994 array */
6995 bool cp_in_set; /* Is it in the input list's set or not */
6996
c4a30257
KW
6997 /* We need to take one or the other of the two inputs for the
6998 * intersection. Since we are merging two sorted lists, we take the
6999 * smaller of the next items. In case of a tie, we take the one that
7000 * is not in its set first (a difference from the union algorithm). If
7001 * we took one in the set first, it would increment the count, possibly
7002 * to 2 which would cause it to be output as starting a range in the
7003 * intersection, and the next time through we would take that same
7004 * number, and output it again as ending the set. By doing it the
7005 * opposite of this, there is no possibility that the count will be
7006 * momentarily incremented to 2. (In a tie and both are in the set or
7007 * both not in the set, it doesn't matter which we take first.) */
d764b54e 7008 if (array_a[i_a] < array_b[i_b]
4096c37b
KW
7009 || (array_a[i_a] == array_b[i_b]
7010 && ! ELEMENT_RANGE_MATCHES_INVLIST(i_a)))
d764b54e 7011 {
4096c37b 7012 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_a);
d764b54e
KW
7013 cp= array_a[i_a++];
7014 }
7015 else {
4096c37b 7016 cp_in_set = ELEMENT_RANGE_MATCHES_INVLIST(i_b);
d764b54e
KW
7017 cp= array_b[i_b++];
7018 }
7019
7020 /* Here, have chosen which of the two inputs to look at. Only output
7021 * if the running count changes to/from 2, which marks the
7022 * beginning/end of a range that's in the intersection */
7023 if (cp_in_set) {
7024 count++;
7025 if (count == 2) {
7026 array_r[i_r++] = cp;
7027 }
7028 }
7029 else {
7030 if (count == 2) {
7031 array_r[i_r++] = cp;
7032 }
7033 count--;
7034 }
7035 }
7036
c4a30257
KW
7037 /* Here, we are finished going through at least one of the lists, which
7038 * means there is something remaining in at most one. We check if the list
7039 * that has been exhausted is positioned such that we are in the middle
7040 * of a range in its set or not. (i_a and i_b point to elements 1 beyond
7041 * the ones we care about.) There are four cases:
7042 * 1) Both weren't in their sets, count is 0, and remains 0. There's
7043 * nothing left in the intersection.
7044 * 2) Both were in their sets, count is 2 and perhaps is incremented to
7045 * above 2. What should be output is exactly that which is in the
7046 * non-exhausted set, as everything it has is also in the intersection
7047 * set, and everything it doesn't have can't be in the intersection
7048 * 3) The exhausted was in its set, non-exhausted isn't, count is 1, and
7049 * gets incremented to 2. Like the previous case, the intersection is
7050 * everything that remains in the non-exhausted set.
7051 * 4) the exhausted wasn't in its set, non-exhausted is, count is 1, and
7052 * remains 1. And the intersection has nothing more. */
4096c37b
KW
7053 if ((i_a == len_a && PREV_RANGE_MATCHES_INVLIST(i_a))
7054 || (i_b == len_b && PREV_RANGE_MATCHES_INVLIST(i_b)))
d764b54e 7055 {
c4a30257 7056 count++;
d764b54e
KW
7057 }
7058
7059 /* The final length is what we've output so far plus what else is in the
c4a30257 7060 * intersection. At most one of the subexpressions below will be non-zero */
d764b54e 7061 len_r = i_r;
c4a30257 7062 if (count >= 2) {
d764b54e
KW
7063 len_r += (len_a - i_a) + (len_b - i_b);
7064 }
7065
7066 /* Set result to final length, which can change the pointer to array_r, so
7067 * re-find it */
7068 if (len_r != invlist_len(r)) {
7069 invlist_set_len(r, len_r);
7070 invlist_trim(r);
7071 array_r = invlist_array(r);
7072 }
7073
7074 /* Finish outputting any remaining */
c4a30257 7075 if (count >= 2) { /* At most one will have a non-zero copy count */
d764b54e
KW
7076 IV copy_count;
7077 if ((copy_count = len_a - i_a) > 0) {
7078 Copy(array_a + i_a, array_r + i_r, copy_count, UV);
7079 }
7080 else if ((copy_count = len_b - i_b) > 0) {
7081 Copy(array_b + i_b, array_r + i_r, copy_count, UV);
7082 }
7083 }
7084
a2995b7f 7085 /* We may be removing a reference to one of the inputs */
4065ba03 7086 if (a == *i || b == *i) {
a2995b7f
KW
7087 SvREFCNT_dec(*i);
7088 }
7089
52ae8f7e
KW
7090 /* If we've changed b, restore it */
7091 if (complement_b) {
7092 array_b[0] = 1;
7093 }
7094
a2995b7f
KW
7095 *i = r;
7096 return;
d764b54e
KW
7097}
7098
3c234b35
TC
7099#endif
7100
a25abddc
KW
7101STATIC SV*
7102S_add_range_to_invlist(pTHX_ SV* invlist, const UV start, const UV end)
d764b54e
KW
7103{
7104 /* Add the range from 'start' to 'end' inclusive to the inversion list's
7105 * set. A pointer to the inversion list is returned. This may actually be
c52a3e71
KW
7106 * a new list, in which case the passed in one has been destroyed. The
7107 * passed in inversion list can be NULL, in which case a new one is created
7108 * with just the one range in it */
d764b54e 7109
a25abddc 7110 SV* range_invlist;
c52a3e71 7111 UV len;
d764b54e 7112
c52a3e71
KW
7113 if (invlist == NULL) {
7114 invlist = _new_invlist(2);
7115 len = 0;
7116 }
7117 else {
7118 len = invlist_len(invlist);
7119 }
d764b54e
KW
7120
7121 /* If comes after the final entry, can just append it to the end */
7122 if (len == 0
7123 || start >= invlist_array(invlist)
7124 [invlist_len(invlist) - 1])
7125 {
7126 _append_range_to_invlist(invlist, start, end);
7127 return invlist;
7128 }
7129
7130 /* Here, can't just append things, create and return a new inversion list
7131 * which is the union of this range and the existing inversion list */
7132 range_invlist = _new_invlist(2);
7133 _append_range_to_invlist(range_invlist, start, end);
7134
37e85ffe 7135 _invlist_union(invlist, range_invlist, &invlist);
d764b54e 7136
0a89af2f 7137 /* The temporary can be freed */
318c430e 7138 SvREFCNT_dec(range_invlist);
d764b54e 7139
6d63a9fb 7140 return invlist;
d764b54e
KW
7141}
7142
a25abddc
KW
7143PERL_STATIC_INLINE SV*
7144S_add_cp_to_invlist(pTHX_ SV* invlist, const UV cp) {
c229b64c
KW
7145 return add_range_to_invlist(invlist, cp, cp);
7146}
7147
3c234b35 7148#ifndef PERL_IN_XSUB_RE
86f766ab
KW
7149void
7150Perl__invlist_invert(pTHX_ SV* const invlist)
25e94a65
KW
7151{
7152 /* Complement the input inversion list. This adds a 0 if the list didn't
7153 * have a zero; removes it otherwise. As described above, the data
7154 * structure is set up so that this is very efficient */
7155
7156 UV* len_pos = get_invlist_len_addr(invlist);
7157
37e85ffe 7158 PERL_ARGS_ASSERT__INVLIST_INVERT;
25e94a65
KW
7159
7160 /* The inverse of matching nothing is matching everything */
7161 if (*len_pos == 0) {
7162 _append_range_to_invlist(invlist, 0, UV_MAX);
7163 return;
7164 }
7165
7166 /* The exclusive or complents 0 to 1; and 1 to 0. If the result is 1, the
7167 * zero element was a 0, so it is being removed, so the length decrements
7168 * by 1; and vice-versa. SvCUR is unaffected */
7169 if (*get_invlist_zero_addr(invlist) ^= 1) {
7170 (*len_pos)--;
7171 }
7172 else {
7173 (*len_pos)++;
7174 }
7175}
89302fc2
KW
7176
7177void
7178Perl__invlist_invert_prop(pTHX_ SV* const invlist)
7179{
7180 /* Complement the input inversion list (which must be a Unicode property,
7181 * all of which don't match above the Unicode maximum code point.) And
7182 * Perl has chosen to not have the inversion match above that either. This
7183 * adds a 0x110000 if the list didn't end with it, and removes it if it did
7184 */
7185
7186 UV len;
7187 UV* array;
7188
7189 PERL_ARGS_ASSERT__INVLIST_INVERT_PROP;
7190
7191 _invlist_invert(invlist);
7192
7193 len = invlist_len(invlist);
7194
7195 if (len != 0) { /* If empty do nothing */
7196 array = invlist_array(invlist);
7197 if (array[len - 1] != PERL_UNICODE_MAX + 1) {
7198 /* Add 0x110000. First, grow if necessary */
7199 len++;
7200 if (invlist_max(invlist) < len) {
7201 invlist_extend(invlist, len);
7202 array = invlist_array(invlist);
7203 }
7204 invlist_set_len(invlist, len);
7205 array[len - 1] = PERL_UNICODE_MAX + 1;
7206 }
7207 else { /* Remove the 0x110000 */
7208 invlist_set_len(invlist, len - 1);
7209 }
7210 }
7211
7212 return;
7213}
3c234b35 7214#endif
25e94a65
KW
7215
7216PERL_STATIC_INLINE SV*
7217S_invlist_clone(pTHX_ SV* const invlist)
7218{
7219
7220 /* Return a new inversion list that is a copy of the input one, which is
7221 * unchanged */
7222
6c6c83ac
KW
7223 /* Need to allocate extra space to accommodate Perl's addition of a
7224 * trailing NUL to SvPV's, since it thinks they are always strings */
7225 SV* new_invlist = _new_invlist(invlist_len(invlist) + 1);
6d47fb3d 7226 STRLEN length = SvCUR(invlist);
25e94a65
KW
7227
7228 PERL_ARGS_ASSERT_INVLIST_CLONE;
7229
6d47fb3d
KW
7230 SvCUR_set(new_invlist, length); /* This isn't done automatically */
7231 Copy(SvPVX(invlist), SvPVX(new_invlist), length, char);
7232
25e94a65
KW
7233 return new_invlist;
7234}
7235
f3dc70d1
KW
7236PERL_STATIC_INLINE UV*
7237S_get_invlist_iter_addr(pTHX_ SV* invlist)
7238{
7239 /* Return the address of the UV that contains the current iteration
7240 * position */
7241
7242 PERL_ARGS_ASSERT_GET_INVLIST_ITER_ADDR;
7243
7244 return (UV *) (SvPVX(invlist) + (INVLIST_ITER_OFFSET * sizeof (UV)));
7245}
7246
f59ff194
KW
7247PERL_STATIC_INLINE UV*
7248S_get_invlist_version_id_addr(pTHX_ SV* invlist)
7249{
7250 /* Return the address of the UV that contains the version id. */
7251
7252 PERL_ARGS_ASSERT_GET_INVLIST_VERSION_ID_ADDR;
7253
7254 return (UV *) (SvPVX(invlist) + (INVLIST_VERSION_ID_OFFSET * sizeof (UV)));
7255}
7256
f3dc70d1
KW
7257PERL_STATIC_INLINE void
7258S_invlist_iterinit(pTHX_ SV* invlist) /* Initialize iterator for invlist */
7259{
7260 PERL_ARGS_ASSERT_INVLIST_ITERINIT;
7261
7262 *get_invlist_iter_addr(invlist) = 0;
7263}
7264
7265STATIC bool
7266S_invlist_iternext(pTHX_ SV* invlist, UV* start, UV* end)
7267{
dbe7a391
KW
7268 /* An C<invlist_iterinit> call on <invlist> must be used to set this up.
7269 * This call sets in <*start> and <*end>, the next range in <invlist>.
7270 * Returns <TRUE> if successful and the next call will return the next
7271 * range; <FALSE> if was already at the end of the list. If the latter,
7272 * <*start> and <*end> are unchanged, and the next call to this function
7273 * will start over at the beginning of the list */
7274
f3dc70d1
KW
7275 UV* pos = get_invlist_iter_addr(invlist);
7276 UV len = invlist_len(invlist);
7277 UV *array;
7278
7279 PERL_ARGS_ASSERT_INVLIST_ITERNEXT;
7280
7281 if (*pos >= len) {
7282 *pos = UV_MAX; /* Force iternit() to be required next time */
7283 return FALSE;
7284 }
7285
7286 array = invlist_array(invlist);
7287
7288 *start = array[(*pos)++];
7289
7290 if (*pos >= len) {
7291 *end = UV_MAX;
7292 }
7293 else {
7294 *end = array[(*pos)++] - 1;
7295 }
7296
7297 return TRUE;
7298}
7299
b2b97e77
KW
7300#ifndef PERL_IN_XSUB_RE
7301SV *
7302Perl__invlist_contents(pTHX_ SV* const invlist)
7303{
7304 /* Get the contents of an inversion list into a string SV so that they can
7305 * be printed out. It uses the format traditionally done for debug tracing
7306 */
7307
7308 UV start, end;
7309 SV* output = newSVpvs("\n");
7310
7311 PERL_ARGS_ASSERT__INVLIST_CONTENTS;
7312
7313 invlist_iterinit(invlist);
7314 while (invlist_iternext(invlist, &start, &end)) {
7315 if (end == UV_MAX) {
7316 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\tINFINITY\n", start);
7317 }
7318 else if (end != start) {
7319 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\t%04"UVXf"\n",
7320 start, end);
7321 }
7322 else {
7323 Perl_sv_catpvf(aTHX_ output, "%04"UVXf"\n", start);
7324 }
7325 }
7326
7327 return output;
7328}
7329#endif
7330
768318b8
KW
7331#if 0
7332void
7333S_invlist_dump(pTHX_ SV* const invlist, const char * const header)
7334{
7335 /* Dumps out the ranges in an inversion list. The string 'header'
7336 * if present is output on a line before the first range */
7337
7338 UV start, end;
7339
7340 if (header && strlen(header)) {
7341 PerlIO_printf(Perl_debug_log, "%s\n", header);
7342 }
7343 invlist_iterinit(invlist);
7344 while (invlist_iternext(invlist, &start, &end)) {
7345 if (end == UV_MAX) {
7346 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. INFINITY\n", start);
7347 }
7348 else {
7349 PerlIO_printf(Perl_debug_log, "0x%04"UVXf" .. 0x%04"UVXf"\n", start, end);
7350 }
7351 }
7352}
7353#endif
7354
97b14ce7 7355#undef HEADER_LENGTH
060b7a35 7356#undef INVLIST_INITIAL_LENGTH
005b65ed
KW
7357#undef TO_INTERNAL_SIZE
7358#undef FROM_INTERNAL_SIZE
f1b67122
KW
7359#undef INVLIST_LEN_OFFSET
7360#undef INVLIST_ZERO_OFFSET
f3dc70d1 7361#undef INVLIST_ITER_OFFSET
f59ff194 7362#undef INVLIST_VERSION_ID
060b7a35 7363
d764b54e
KW
7364/* End of inversion list object */
7365
a687059c
LW
7366/*
7367 - reg - regular expression, i.e. main body or parenthesized thing
7368 *
7369 * Caller must absorb opening parenthesis.
7370 *
7371 * Combining parenthesis handling with the base level of regular expression
7372 * is a trifle forced, but the need to tie the tails of the branches to what
7373 * follows makes it hard to avoid.
7374 */
07be1b83
YO
7375#define REGTAIL(x,y,z) regtail((x),(y),(z),depth+1)
7376#ifdef DEBUGGING
7377#define REGTAIL_STUDY(x,y,z) regtail_study((x),(y),(z),depth+1)
7378#else
7379#define REGTAIL_STUDY(x,y,z) regtail((x),(y),(z),depth+1)
7380#endif
3dab1dad 7381
76e3520e 7382STATIC regnode *
3dab1dad 7383S_reg(pTHX_ RExC_state_t *pRExC_state, I32 paren, I32 *flagp,U32 depth)
c277df42 7384 /* paren: Parenthesized? 0=top, 1=(, inside: changed to letter. */
a687059c 7385{
27da23d5 7386 dVAR;
c277df42
IZ
7387 register regnode *ret; /* Will be the head of the group. */
7388 register regnode *br;
7389 register regnode *lastbr;
cbbf8932 7390 register regnode *ender = NULL;
a0d0e21e 7391 register I32 parno = 0;
cbbf8932 7392 I32 flags;
f7819f85 7393 U32 oregflags = RExC_flags;
6136c704
AL
7394 bool have_branch = 0;
7395 bool is_open = 0;
594d7033
YO
7396 I32 freeze_paren = 0;
7397 I32 after_freeze = 0;
9d1d55b5
JP
7398
7399 /* for (?g), (?gc), and (?o) warnings; warning
7400 about (?c) will warn about (?g) -- japhy */
7401
6136c704
AL
7402#define WASTED_O 0x01
7403#define WASTED_G 0x02
7404#define WASTED_C 0x04
7405#define WASTED_GC (0x02|0x04)
cbbf8932 7406 I32 wastedflags = 0x00;
9d1d55b5 7407
fac92740 7408 char * parse_start = RExC_parse; /* MJD */
a28509cc 7409 char * const oregcomp_parse = RExC_parse;
a0d0e21e 7410
3dab1dad 7411 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
7412
7413 PERL_ARGS_ASSERT_REG;
3dab1dad
YO
7414 DEBUG_PARSE("reg ");
7415
821b33a5 7416 *flagp = 0; /* Tentatively. */
a0d0e21e 7417
9d1d55b5 7418
a0d0e21e
LW
7419 /* Make an OPEN node, if parenthesized. */
7420 if (paren) {
e2e6a0f1
YO
7421 if ( *RExC_parse == '*') { /* (*VERB:ARG) */
7422 char *start_verb = RExC_parse;
7423 STRLEN verb_len = 0;
7424 char *start_arg = NULL;
7425 unsigned char op = 0;
7426 int argok = 1;
7427 int internal_argval = 0; /* internal_argval is only useful if !argok */
7428 while ( *RExC_parse && *RExC_parse != ')' ) {
7429 if ( *RExC_parse == ':' ) {
7430 start_arg = RExC_parse + 1;
7431 break;
7432 }
7433 RExC_parse++;
7434 }
7435 ++start_verb;
7436 verb_len = RExC_parse - start_verb;
7437 if ( start_arg ) {
7438 RExC_parse++;
7439 while ( *RExC_parse && *RExC_parse != ')' )
7440 RExC_parse++;
7441 if ( *RExC_parse != ')' )
7442 vFAIL("Unterminated verb pattern argument");
7443 if ( RExC_parse == start_arg )
7444 start_arg = NULL;
7445 } else {
7446 if ( *RExC_parse != ')' )
7447 vFAIL("Unterminated verb pattern");
7448 }
5d458dd8 7449
e2e6a0f1
YO
7450 switch ( *start_verb ) {
7451 case 'A': /* (*ACCEPT) */
568a785a 7452 if ( memEQs(start_verb,verb_len,"ACCEPT") ) {
e2e6a0f1
YO
7453 op = ACCEPT;
7454 internal_argval = RExC_nestroot;
7455 }
7456 break;
7457 case 'C': /* (*COMMIT) */
568a785a 7458 if ( memEQs(start_verb,verb_len,"COMMIT") )
e2e6a0f1 7459 op = COMMIT;
e2e6a0f1
YO
7460 break;
7461 case 'F': /* (*FAIL) */
568a785a 7462 if ( verb_len==1 || memEQs(start_verb,verb_len,"FAIL") ) {
e2e6a0f1
YO
7463 op = OPFAIL;
7464 argok = 0;
7465 }
7466 break;
5d458dd8
YO
7467 case ':': /* (*:NAME) */
7468 case 'M': /* (*MARK:NAME) */
568a785a 7469 if ( verb_len==0 || memEQs(start_verb,verb_len,"MARK") ) {
e2e6a0f1 7470 op = MARKPOINT;
5d458dd8
YO
7471 argok = -1;
7472 }
7473 break;
7474 case 'P': /* (*PRUNE) */
568a785a 7475 if ( memEQs(start_verb,verb_len,"PRUNE") )
5d458dd8 7476 op = PRUNE;
e2e6a0f1 7477 break;
5d458dd8 7478 case 'S': /* (*SKIP) */
568a785a 7479 if ( memEQs(start_verb,verb_len,"SKIP") )
5d458dd8
YO
7480 op = SKIP;
7481 break;
7482 case 'T': /* (*THEN) */
7483 /* [19:06] <TimToady> :: is then */
568a785a 7484 if ( memEQs(start_verb,verb_len,"THEN") ) {
5d458dd8
YO
7485 op = CUTGROUP;
7486 RExC_seen |= REG_SEEN_CUTGROUP;
7487 }
e2e6a0f1
YO
7488 break;
7489 }
7490 if ( ! op ) {
7491 RExC_parse++;
7492 vFAIL3("Unknown verb pattern '%.*s'",
7493 verb_len, start_verb);
7494 }
7495 if ( argok ) {
7496 if ( start_arg && internal_argval ) {
7497 vFAIL3("Verb pattern '%.*s' may not have an argument",
7498 verb_len, start_verb);
7499 } else if ( argok < 0 && !start_arg ) {
7500 vFAIL3("Verb pattern '%.*s' has a mandatory argument",
7501 verb_len, start_verb);
7502 } else {
7503 ret = reganode(pRExC_state, op, internal_argval);
7504 if ( ! internal_argval && ! SIZE_ONLY ) {
7505 if (start_arg) {
7506 SV *sv = newSVpvn( start_arg, RExC_parse - start_arg);
7507 ARG(ret) = add_data( pRExC_state, 1, "S" );
f8fc2ecf 7508 RExC_rxi->data->data[ARG(ret)]=(void*)sv;
e2e6a0f1
YO
7509 ret->flags = 0;
7510 } else {
7511 ret->flags = 1;
7512 }
7513 }
7514 }
7515 if (!internal_argval)
7516 RExC_seen |= REG_SEEN_VERBARG;
7517 } else if ( start_arg ) {
7518 vFAIL3("Verb pattern '%.*s' may not have an argument",
7519 verb_len, start_verb);
7520 } else {
7521 ret = reg_node(pRExC_state, op);
7522 }
7523 nextchar(pRExC_state);
7524 return ret;
7525 } else
fac92740 7526 if (*RExC_parse == '?') { /* (?...) */
6136c704 7527 bool is_logical = 0;
a28509cc 7528 const char * const seqstart = RExC_parse;
fb85c044 7529 bool has_use_defaults = FALSE;
ca9dfc88 7530
830247a4
IZ
7531 RExC_parse++;
7532 paren = *RExC_parse++;
c277df42 7533 ret = NULL; /* For look-ahead/behind. */
a0d0e21e 7534 switch (paren) {
894be9b7 7535
1f1031fe
YO
7536 case 'P': /* (?P...) variants for those used to PCRE/Python */
7537 paren = *RExC_parse++;
7538 if ( paren == '<') /* (?P<...>) named capture */
7539 goto named_capture;
7540 else if (paren == '>') { /* (?P>name) named recursion */
7541 goto named_recursion;
7542 }
7543 else if (paren == '=') { /* (?P=...) named backref */
7544 /* this pretty much dupes the code for \k<NAME> in regatom(), if
7545 you change this make sure you change that */
7546 char* name_start = RExC_parse;
7547 U32 num = 0;
7548 SV *sv_dat = reg_scan_name(pRExC_state,
7549 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7550 if (RExC_parse == name_start || *RExC_parse != ')')
7551 vFAIL2("Sequence %.3s... not terminated",parse_start);
7552
7553 if (!SIZE_ONLY) {
7554 num = add_data( pRExC_state, 1, "S" );
7555 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 7556 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
7557 }
7558 RExC_sawback = 1;
4444fd9f
KW
7559 ret = reganode(pRExC_state,
7560 ((! FOLD)
7561 ? NREF
2f7f8cb1
KW
7562 : (MORE_ASCII_RESTRICTED)
7563 ? NREFFA
7564 : (AT_LEAST_UNI_SEMANTICS)
7565 ? NREFFU
7566 : (LOC)
7567 ? NREFFL
7568 : NREFF),
4444fd9f 7569 num);
1f1031fe
YO
7570 *flagp |= HASWIDTH;
7571
7572 Set_Node_Offset(ret, parse_start+1);
7573 Set_Node_Cur_Length(ret); /* MJD */
7574
7575 nextchar(pRExC_state);
7576 return ret;
7577 }
57b84237
YO
7578 RExC_parse++;
7579 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7580 /*NOTREACHED*/
7581 case '<': /* (?<...) */
b81d288d 7582 if (*RExC_parse == '!')
c277df42 7583 paren = ',';
0a4db386 7584 else if (*RExC_parse != '=')
1f1031fe 7585 named_capture:
0a4db386 7586 { /* (?<...>) */
81714fb9 7587 char *name_start;
894be9b7 7588 SV *svname;
81714fb9
YO
7589 paren= '>';
7590 case '\'': /* (?'...') */
7591 name_start= RExC_parse;
0a4db386
YO
7592 svname = reg_scan_name(pRExC_state,
7593 SIZE_ONLY ? /* reverse test from the others */
7594 REG_RSN_RETURN_NAME :
7595 REG_RSN_RETURN_NULL);
57b84237
YO
7596 if (RExC_parse == name_start) {
7597 RExC_parse++;
7598 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7599 /*NOTREACHED*/
7600 }
81714fb9
YO
7601 if (*RExC_parse != paren)
7602 vFAIL2("Sequence (?%c... not terminated",
7603 paren=='>' ? '<' : paren);
7604 if (SIZE_ONLY) {
e62cc96a
YO
7605 HE *he_str;
7606 SV *sv_dat = NULL;
486ec47a 7607 if (!svname) /* shouldn't happen */
894be9b7
YO
7608 Perl_croak(aTHX_
7609 "panic: reg_scan_name returned NULL");
81714fb9
YO
7610 if (!RExC_paren_names) {
7611 RExC_paren_names= newHV();
ad64d0ec 7612 sv_2mortal(MUTABLE_SV(RExC_paren_names));
1f1031fe
YO
7613#ifdef DEBUGGING
7614 RExC_paren_name_list= newAV();
ad64d0ec 7615 sv_2mortal(MUTABLE_SV(RExC_paren_name_list));
1f1031fe 7616#endif
81714fb9
YO
7617 }
7618 he_str = hv_fetch_ent( RExC_paren_names, svname, 1, 0 );
e62cc96a 7619 if ( he_str )
81714fb9 7620 sv_dat = HeVAL(he_str);
e62cc96a 7621 if ( ! sv_dat ) {
81714fb9 7622 /* croak baby croak */
e62cc96a
YO
7623 Perl_croak(aTHX_
7624 "panic: paren_name hash element allocation failed");
7625 } else if ( SvPOK(sv_dat) ) {
76a476f9
YO
7626 /* (?|...) can mean we have dupes so scan to check
7627 its already been stored. Maybe a flag indicating
7628 we are inside such a construct would be useful,
7629 but the arrays are likely to be quite small, so
7630 for now we punt -- dmq */
7631 IV count = SvIV(sv_dat);
7632 I32 *pv = (I32*)SvPVX(sv_dat);
7633 IV i;
7634 for ( i = 0 ; i < count ; i++ ) {
7635 if ( pv[i] == RExC_npar ) {
7636 count = 0;
7637 break;
7638 }
7639 }
7640 if ( count ) {
7641 pv = (I32*)SvGROW(sv_dat, SvCUR(sv_dat) + sizeof(I32)+1);
7642 SvCUR_set(sv_dat, SvCUR(sv_dat) + sizeof(I32));
7643 pv[count] = RExC_npar;
3a92e6ae 7644 SvIV_set(sv_dat, SvIVX(sv_dat) + 1);
76a476f9 7645 }
81714fb9
YO
7646 } else {
7647 (void)SvUPGRADE(sv_dat,SVt_PVNV);
7648 sv_setpvn(sv_dat, (char *)&(RExC_npar), sizeof(I32));
7649 SvIOK_on(sv_dat);
3ec35e0f 7650 SvIV_set(sv_dat, 1);
e62cc96a 7651 }
1f1031fe 7652#ifdef DEBUGGING
17a3c617 7653 /* Yes this does cause a memory leak in debugging Perls */
1f1031fe
YO
7654 if (!av_store(RExC_paren_name_list, RExC_npar, SvREFCNT_inc(svname)))
7655 SvREFCNT_dec(svname);
7656#endif
e62cc96a 7657
81714fb9
YO
7658 /*sv_dump(sv_dat);*/
7659 }
7660 nextchar(pRExC_state);
7661 paren = 1;
7662 goto capturing_parens;
7663 }
7664 RExC_seen |= REG_SEEN_LOOKBEHIND;
b57e4118 7665 RExC_in_lookbehind++;
830247a4 7666 RExC_parse++;
fac92740 7667 case '=': /* (?=...) */
89c6a13e 7668 RExC_seen_zerolen++;
5c3fa2e7 7669 break;
fac92740 7670 case '!': /* (?!...) */
830247a4 7671 RExC_seen_zerolen++;
e2e6a0f1
YO
7672 if (*RExC_parse == ')') {
7673 ret=reg_node(pRExC_state, OPFAIL);
7674 nextchar(pRExC_state);
7675 return ret;
7676 }
594d7033
YO
7677 break;
7678 case '|': /* (?|...) */
7679 /* branch reset, behave like a (?:...) except that
7680 buffers in alternations share the same numbers */
7681 paren = ':';
7682 after_freeze = freeze_paren = RExC_npar;
7683 break;
fac92740
MJD
7684 case ':': /* (?:...) */
7685 case '>': /* (?>...) */
a0d0e21e 7686 break;
fac92740
MJD
7687 case '$': /* (?$...) */
7688 case '@': /* (?@...) */
8615cb43 7689 vFAIL2("Sequence (?%c...) not implemented", (int)paren);
a0d0e21e 7690 break;
fac92740 7691 case '#': /* (?#...) */
830247a4
IZ
7692 while (*RExC_parse && *RExC_parse != ')')
7693 RExC_parse++;
7694 if (*RExC_parse != ')')
c277df42 7695 FAIL("Sequence (?#... not terminated");
830247a4 7696 nextchar(pRExC_state);
a0d0e21e
LW
7697 *flagp = TRYAGAIN;
7698 return NULL;
894be9b7
YO
7699 case '0' : /* (?0) */
7700 case 'R' : /* (?R) */
7701 if (*RExC_parse != ')')
6bda09f9 7702 FAIL("Sequence (?R) not terminated");
1a147d38 7703 ret = reg_node(pRExC_state, GOSTART);
a3b492c3 7704 *flagp |= POSTPONED;
7f69552c
YO
7705 nextchar(pRExC_state);
7706 return ret;
7707 /*notreached*/
894be9b7
YO
7708 { /* named and numeric backreferences */
7709 I32 num;
894be9b7
YO
7710 case '&': /* (?&NAME) */
7711 parse_start = RExC_parse - 1;
1f1031fe 7712 named_recursion:
894be9b7 7713 {
0a4db386
YO
7714 SV *sv_dat = reg_scan_name(pRExC_state,
7715 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7716 num = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
894be9b7
YO
7717 }
7718 goto gen_recurse_regop;
7719 /* NOT REACHED */
542fa716
YO
7720 case '+':
7721 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7722 RExC_parse++;
7723 vFAIL("Illegal pattern");
7724 }
7725 goto parse_recursion;
7726 /* NOT REACHED*/
7727 case '-': /* (?-1) */
7728 if (!(RExC_parse[0] >= '1' && RExC_parse[0] <= '9')) {
7729 RExC_parse--; /* rewind to let it be handled later */
7730 goto parse_flags;
7731 }
7732 /*FALLTHROUGH */
6bda09f9
YO
7733 case '1': case '2': case '3': case '4': /* (?1) */
7734 case '5': case '6': case '7': case '8': case '9':
7735 RExC_parse--;
542fa716 7736 parse_recursion:
894be9b7
YO
7737 num = atoi(RExC_parse);
7738 parse_start = RExC_parse - 1; /* MJD */
542fa716
YO
7739 if (*RExC_parse == '-')
7740 RExC_parse++;
6bda09f9
YO
7741 while (isDIGIT(*RExC_parse))
7742 RExC_parse++;
7743 if (*RExC_parse!=')')
7744 vFAIL("Expecting close bracket");
686b73d4 7745
894be9b7 7746 gen_recurse_regop:
542fa716
YO
7747 if ( paren == '-' ) {
7748 /*
7749 Diagram of capture buffer numbering.
7750 Top line is the normal capture buffer numbers
3b753521 7751 Bottom line is the negative indexing as from
542fa716
YO
7752 the X (the (?-2))
7753
7754 + 1 2 3 4 5 X 6 7
7755 /(a(x)y)(a(b(c(?-2)d)e)f)(g(h))/
7756 - 5 4 3 2 1 X x x
7757
7758 */
7759 num = RExC_npar + num;
7760 if (num < 1) {
7761 RExC_parse++;
7762 vFAIL("Reference to nonexistent group");
7763 }
7764 } else if ( paren == '+' ) {
7765 num = RExC_npar + num - 1;
7766 }
7767
1a147d38 7768 ret = reganode(pRExC_state, GOSUB, num);
6bda09f9
YO
7769 if (!SIZE_ONLY) {
7770 if (num > (I32)RExC_rx->nparens) {
7771 RExC_parse++;
7772 vFAIL("Reference to nonexistent group");
7773 }
40d049e4 7774 ARG2L_SET( ret, RExC_recurse_count++);
6bda09f9 7775 RExC_emit++;
226de585 7776 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
acff02b8 7777 "Recurse #%"UVuf" to %"IVdf"\n", (UV)ARG(ret), (IV)ARG2L(ret)));
894be9b7 7778 } else {
6bda09f9 7779 RExC_size++;
6bda09f9 7780 }
0a4db386 7781 RExC_seen |= REG_SEEN_RECURSE;
6bda09f9 7782 Set_Node_Length(ret, 1 + regarglen[OP(ret)]); /* MJD */
58663417
RGS
7783 Set_Node_Offset(ret, parse_start); /* MJD */
7784
a3b492c3 7785 *flagp |= POSTPONED;
6bda09f9
YO
7786 nextchar(pRExC_state);
7787 return ret;
894be9b7
YO
7788 } /* named and numeric backreferences */
7789 /* NOT REACHED */
7790
fac92740 7791 case '?': /* (??...) */
6136c704 7792 is_logical = 1;
57b84237
YO
7793 if (*RExC_parse != '{') {
7794 RExC_parse++;
7795 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
7796 /*NOTREACHED*/
7797 }
a3b492c3 7798 *flagp |= POSTPONED;
830247a4 7799 paren = *RExC_parse++;
0f5d15d6 7800 /* FALL THROUGH */
fac92740 7801 case '{': /* (?{...}) */
c277df42 7802 {
2eccd3b2
NC
7803 I32 count = 1;
7804 U32 n = 0;
c277df42 7805 char c;
830247a4 7806 char *s = RExC_parse;
c277df42 7807
830247a4
IZ
7808 RExC_seen_zerolen++;
7809 RExC_seen |= REG_SEEN_EVAL;
7810 while (count && (c = *RExC_parse)) {
6136c704
AL
7811 if (c == '\\') {
7812 if (RExC_parse[1])
7813 RExC_parse++;
7814 }
b81d288d 7815 else if (c == '{')
c277df42 7816 count++;
b81d288d 7817 else if (c == '}')
c277df42 7818 count--;
830247a4 7819 RExC_parse++;
c277df42 7820 }
6136c704 7821 if (*RExC_parse != ')') {
686b73d4 7822 RExC_parse = s;
b45f050a
JF
7823 vFAIL("Sequence (?{...}) not terminated or not {}-balanced");
7824 }
c277df42 7825 if (!SIZE_ONLY) {
f3548bdc 7826 PAD *pad;
6136c704
AL
7827 OP_4tree *sop, *rop;
7828 SV * const sv = newSVpvn(s, RExC_parse - 1 - s);
c277df42 7829
569233ed
SB
7830 ENTER;
7831 Perl_save_re_context(aTHX);
d59a8b3e 7832 rop = Perl_sv_compile_2op_is_broken(aTHX_ sv, &sop, "re", &pad);
9b978d73
DM
7833 sop->op_private |= OPpREFCOUNTED;
7834 /* re_dup will OpREFCNT_inc */
7835 OpREFCNT_set(sop, 1);
569233ed 7836 LEAVE;
c277df42 7837
830247a4 7838 n = add_data(pRExC_state, 3, "nop");
f8fc2ecf
YO
7839 RExC_rxi->data->data[n] = (void*)rop;
7840 RExC_rxi->data->data[n+1] = (void*)sop;
7841 RExC_rxi->data->data[n+2] = (void*)pad;
c277df42 7842 SvREFCNT_dec(sv);
a0ed51b3 7843 }
e24b16f9 7844 else { /* First pass */
830247a4 7845 if (PL_reginterp_cnt < ++RExC_seen_evals
923e4eb5 7846 && IN_PERL_RUNTIME)
2cd61cdb
IZ
7847 /* No compiled RE interpolated, has runtime
7848 components ===> unsafe. */
7849 FAIL("Eval-group not allowed at runtime, use re 'eval'");
5b61d3f7 7850 if (PL_tainting && PL_tainted)
cc6b7395 7851 FAIL("Eval-group in insecure regular expression");
54df2634 7852#if PERL_VERSION > 8
923e4eb5 7853 if (IN_PERL_COMPILETIME)
b5c19bd7 7854 PL_cv_has_eval = 1;
54df2634 7855#endif
c277df42 7856 }
b5c19bd7 7857
830247a4 7858 nextchar(pRExC_state);
6136c704 7859 if (is_logical) {
830247a4 7860 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
7861 if (!SIZE_ONLY)
7862 ret->flags = 2;
3dab1dad 7863 REGTAIL(pRExC_state, ret, reganode(pRExC_state, EVAL, n));
fac92740 7864 /* deal with the length of this later - MJD */
0f5d15d6
IZ
7865 return ret;
7866 }
ccb2c380
MP
7867 ret = reganode(pRExC_state, EVAL, n);
7868 Set_Node_Length(ret, RExC_parse - parse_start + 1);
7869 Set_Node_Offset(ret, parse_start);
7870 return ret;
c277df42 7871 }
fac92740 7872 case '(': /* (?(?{...})...) and (?(?=...)...) */
c277df42 7873 {
0a4db386 7874 int is_define= 0;
fac92740 7875 if (RExC_parse[0] == '?') { /* (?(?...)) */
b81d288d
AB
7876 if (RExC_parse[1] == '=' || RExC_parse[1] == '!'
7877 || RExC_parse[1] == '<'
830247a4 7878 || RExC_parse[1] == '{') { /* Lookahead or eval. */
c277df42 7879 I32 flag;
686b73d4 7880
830247a4 7881 ret = reg_node(pRExC_state, LOGICAL);
0f5d15d6
IZ
7882 if (!SIZE_ONLY)
7883 ret->flags = 1;
3dab1dad 7884 REGTAIL(pRExC_state, ret, reg(pRExC_state, 1, &flag,depth+1));
c277df42 7885 goto insert_if;
b81d288d 7886 }
a0ed51b3 7887 }
0a4db386
YO
7888 else if ( RExC_parse[0] == '<' /* (?(<NAME>)...) */
7889 || RExC_parse[0] == '\'' ) /* (?('NAME')...) */
7890 {
7891 char ch = RExC_parse[0] == '<' ? '>' : '\'';
7892 char *name_start= RExC_parse++;
2eccd3b2 7893 U32 num = 0;
0a4db386
YO
7894 SV *sv_dat=reg_scan_name(pRExC_state,
7895 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7896 if (RExC_parse == name_start || *RExC_parse != ch)
7897 vFAIL2("Sequence (?(%c... not terminated",
7898 (ch == '>' ? '<' : ch));
7899 RExC_parse++;
7900 if (!SIZE_ONLY) {
7901 num = add_data( pRExC_state, 1, "S" );
f8fc2ecf 7902 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 7903 SvREFCNT_inc_simple_void(sv_dat);
0a4db386
YO
7904 }
7905 ret = reganode(pRExC_state,NGROUPP,num);
7906 goto insert_if_check_paren;
7907 }
7908 else if (RExC_parse[0] == 'D' &&
7909 RExC_parse[1] == 'E' &&
7910 RExC_parse[2] == 'F' &&
7911 RExC_parse[3] == 'I' &&
7912 RExC_parse[4] == 'N' &&
7913 RExC_parse[5] == 'E')
7914 {
7915 ret = reganode(pRExC_state,DEFINEP,0);
7916 RExC_parse +=6 ;
7917 is_define = 1;
7918 goto insert_if_check_paren;
7919 }
7920 else if (RExC_parse[0] == 'R') {
7921 RExC_parse++;
7922 parno = 0;
7923 if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
7924 parno = atoi(RExC_parse++);
7925 while (isDIGIT(*RExC_parse))
7926 RExC_parse++;
7927 } else if (RExC_parse[0] == '&') {
7928 SV *sv_dat;
7929 RExC_parse++;
7930 sv_dat = reg_scan_name(pRExC_state,
7931 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
7932 parno = sv_dat ? *((I32 *)SvPVX(sv_dat)) : 0;
7933 }
1a147d38 7934 ret = reganode(pRExC_state,INSUBP,parno);
0a4db386
YO
7935 goto insert_if_check_paren;
7936 }
830247a4 7937 else if (RExC_parse[0] >= '1' && RExC_parse[0] <= '9' ) {
fac92740 7938 /* (?(1)...) */
6136c704 7939 char c;
830247a4 7940 parno = atoi(RExC_parse++);
c277df42 7941
830247a4
IZ
7942 while (isDIGIT(*RExC_parse))
7943 RExC_parse++;
fac92740 7944 ret = reganode(pRExC_state, GROUPP, parno);
2af232bd 7945
0a4db386 7946 insert_if_check_paren:
830247a4 7947 if ((c = *nextchar(pRExC_state)) != ')')
b45f050a 7948 vFAIL("Switch condition not recognized");
c277df42 7949 insert_if:
3dab1dad
YO
7950 REGTAIL(pRExC_state, ret, reganode(pRExC_state, IFTHEN, 0));
7951 br = regbranch(pRExC_state, &flags, 1,depth+1);
c277df42 7952 if (br == NULL)
830247a4 7953 br = reganode(pRExC_state, LONGJMP, 0);
c277df42 7954 else
3dab1dad 7955 REGTAIL(pRExC_state, br, reganode(pRExC_state, LONGJMP, 0));
830247a4 7956 c = *nextchar(pRExC_state);
d1b80229
IZ
7957 if (flags&HASWIDTH)
7958 *flagp |= HASWIDTH;
c277df42 7959 if (c == '|') {
0a4db386
YO
7960 if (is_define)
7961 vFAIL("(?(DEFINE)....) does not allow branches");
830247a4 7962 lastbr = reganode(pRExC_state, IFTHEN, 0); /* Fake one for optimizer. */
3dab1dad
YO
7963 regbranch(pRExC_state, &flags, 1,depth+1);
7964 REGTAIL(pRExC_state, ret, lastbr);
d1b80229
IZ
7965 if (flags&HASWIDTH)
7966 *flagp |= HASWIDTH;
830247a4 7967 c = *nextchar(pRExC_state);
a0ed51b3
LW
7968 }
7969 else
c277df42
IZ
7970 lastbr = NULL;
7971 if (c != ')')
8615cb43 7972 vFAIL("Switch (?(condition)... contains too many branches");
830247a4 7973 ender = reg_node(pRExC_state, TAIL);
3dab1dad 7974 REGTAIL(pRExC_state, br, ender);
c277df42 7975 if (lastbr) {
3dab1dad
YO
7976 REGTAIL(pRExC_state, lastbr, ender);
7977 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender);
a0ed51b3
LW
7978 }
7979 else
3dab1dad 7980 REGTAIL(pRExC_state, ret, ender);
3b57cd43
YO
7981 RExC_size++; /* XXX WHY do we need this?!!
7982 For large programs it seems to be required
7983 but I can't figure out why. -- dmq*/
c277df42 7984 return ret;
a0ed51b3
LW
7985 }
7986 else {
830247a4 7987 vFAIL2("Unknown switch condition (?(%.2s", RExC_parse);
c277df42
IZ
7988 }
7989 }
1b1626e4 7990 case 0:
830247a4 7991 RExC_parse--; /* for vFAIL to print correctly */
8615cb43 7992 vFAIL("Sequence (? incomplete");
1b1626e4 7993 break;
85508812
KW
7994 case DEFAULT_PAT_MOD: /* Use default flags with the exceptions
7995 that follow */
fb85c044
KW
7996 has_use_defaults = TRUE;
7997 STD_PMMOD_FLAGS_CLEAR(&RExC_flags);
e40e74fe
KW
7998 set_regex_charset(&RExC_flags, (RExC_utf8 || RExC_uni_semantics)
7999 ? REGEX_UNICODE_CHARSET
8000 : REGEX_DEPENDS_CHARSET);
fb85c044 8001 goto parse_flags;
a0d0e21e 8002 default:
cde0cee5
YO
8003 --RExC_parse;
8004 parse_flags: /* (?i) */
8005 {
8006 U32 posflags = 0, negflags = 0;
8007 U32 *flagsp = &posflags;
f6a766d5 8008 char has_charset_modifier = '\0';
295c2f7d
KW
8009 regex_charset cs = (RExC_utf8 || RExC_uni_semantics)
8010 ? REGEX_UNICODE_CHARSET
8011 : REGEX_DEPENDS_CHARSET;
cde0cee5
YO
8012
8013 while (*RExC_parse) {
8014 /* && strchr("iogcmsx", *RExC_parse) */
9d1d55b5
JP
8015 /* (?g), (?gc) and (?o) are useless here
8016 and must be globally applied -- japhy */
cde0cee5
YO
8017 switch (*RExC_parse) {
8018 CASE_STD_PMMOD_FLAGS_PARSE_SET(flagsp);
9de15fec 8019 case LOCALE_PAT_MOD:
f6a766d5
KW
8020 if (has_charset_modifier) {
8021 goto excess_modifier;
8022 }
8023 else if (flagsp == &negflags) {
9442e3b8 8024 goto neg_modifier;
9de15fec 8025 }
a62b1201 8026 cs = REGEX_LOCALE_CHARSET;
f6a766d5 8027 has_charset_modifier = LOCALE_PAT_MOD;
4624b182 8028 RExC_contains_locale = 1;
9de15fec
KW
8029 break;
8030 case UNICODE_PAT_MOD:
f6a766d5
KW
8031 if (has_charset_modifier) {
8032 goto excess_modifier;
8033 }
8034 else if (flagsp == &negflags) {
9442e3b8 8035 goto neg_modifier;
9de15fec 8036 }
a62b1201 8037 cs = REGEX_UNICODE_CHARSET;
f6a766d5 8038 has_charset_modifier = UNICODE_PAT_MOD;
9de15fec 8039 break;
cfaf538b 8040 case ASCII_RESTRICT_PAT_MOD:
f6a766d5 8041 if (flagsp == &negflags) {
9442e3b8 8042 goto neg_modifier;
cfaf538b 8043 }
f6a766d5
KW
8044 if (has_charset_modifier) {
8045 if (cs != REGEX_ASCII_RESTRICTED_CHARSET) {
8046 goto excess_modifier;
8047 }
2f7f8cb1 8048 /* Doubled modifier implies more restricted */
f6a766d5
KW
8049 cs = REGEX_ASCII_MORE_RESTRICTED_CHARSET;
8050 }
2f7f8cb1
KW
8051 else {
8052 cs = REGEX_ASCII_RESTRICTED_CHARSET;
8053 }
f6a766d5 8054 has_charset_modifier = ASCII_RESTRICT_PAT_MOD;
cfaf538b 8055 break;
50e91148 8056 case DEPENDS_PAT_MOD:
9442e3b8 8057 if (has_use_defaults) {
9de15fec 8058 goto fail_modifiers;
f6a766d5 8059 }
9442e3b8
KW
8060 else if (flagsp == &negflags) {
8061 goto neg_modifier;
8062 }
f6a766d5
KW
8063 else if (has_charset_modifier) {
8064 goto excess_modifier;
9de15fec 8065 }
7b98bc43
KW
8066
8067 /* The dual charset means unicode semantics if the
8068 * pattern (or target, not known until runtime) are
e40e74fe
KW
8069 * utf8, or something in the pattern indicates unicode
8070 * semantics */
8071 cs = (RExC_utf8 || RExC_uni_semantics)
a62b1201
KW
8072 ? REGEX_UNICODE_CHARSET
8073 : REGEX_DEPENDS_CHARSET;
f6a766d5 8074 has_charset_modifier = DEPENDS_PAT_MOD;
9de15fec 8075 break;
f6a766d5
KW
8076 excess_modifier:
8077 RExC_parse++;
8078 if (has_charset_modifier == ASCII_RESTRICT_PAT_MOD) {
0c96c706 8079 vFAIL2("Regexp modifier \"%c\" may appear a maximum of twice", ASCII_RESTRICT_PAT_MOD);
f6a766d5
KW
8080 }
8081 else if (has_charset_modifier == *(RExC_parse - 1)) {
0c96c706 8082 vFAIL2("Regexp modifier \"%c\" may not appear twice", *(RExC_parse - 1));
f6a766d5
KW
8083 }
8084 else {
0c96c706 8085 vFAIL3("Regexp modifiers \"%c\" and \"%c\" are mutually exclusive", has_charset_modifier, *(RExC_parse - 1));
f6a766d5
KW
8086 }
8087 /*NOTREACHED*/
9442e3b8
KW
8088 neg_modifier:
8089 RExC_parse++;
8090 vFAIL2("Regexp modifier \"%c\" may not appear after the \"-\"", *(RExC_parse - 1));
8091 /*NOTREACHED*/
f7819f85
A
8092 case ONCE_PAT_MOD: /* 'o' */
8093 case GLOBAL_PAT_MOD: /* 'g' */
9d1d55b5 8094 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704 8095 const I32 wflagbit = *RExC_parse == 'o' ? WASTED_O : WASTED_G;
9d1d55b5
JP
8096 if (! (wastedflags & wflagbit) ) {
8097 wastedflags |= wflagbit;
8098 vWARN5(
8099 RExC_parse + 1,
8100 "Useless (%s%c) - %suse /%c modifier",
8101 flagsp == &negflags ? "?-" : "?",
8102 *RExC_parse,
8103 flagsp == &negflags ? "don't " : "",
8104 *RExC_parse
8105 );
8106 }
8107 }
cde0cee5
YO
8108 break;
8109
f7819f85 8110 case CONTINUE_PAT_MOD: /* 'c' */
9d1d55b5 8111 if (SIZE_ONLY && ckWARN(WARN_REGEXP)) {
6136c704
AL
8112 if (! (wastedflags & WASTED_C) ) {
8113 wastedflags |= WASTED_GC;
9d1d55b5
JP
8114 vWARN3(
8115 RExC_parse + 1,
8116 "Useless (%sc) - %suse /gc modifier",
8117 flagsp == &negflags ? "?-" : "?",
8118 flagsp == &negflags ? "don't " : ""
8119 );
8120 }
8121 }
cde0cee5 8122 break;
f7819f85 8123 case KEEPCOPY_PAT_MOD: /* 'p' */
cde0cee5 8124 if (flagsp == &negflags) {
668c081a
NC
8125 if (SIZE_ONLY)
8126 ckWARNreg(RExC_parse + 1,"Useless use of (?-p)");
cde0cee5
YO
8127 } else {
8128 *flagsp |= RXf_PMf_KEEPCOPY;
8129 }
8130 break;
8131 case '-':
3b753521 8132 /* A flag is a default iff it is following a minus, so
fb85c044
KW
8133 * if there is a minus, it means will be trying to
8134 * re-specify a default which is an error */
8135 if (has_use_defaults || flagsp == &negflags) {
9de15fec 8136 fail_modifiers:
57b84237
YO
8137 RExC_parse++;
8138 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8139 /*NOTREACHED*/
8140 }
cde0cee5
YO
8141 flagsp = &negflags;
8142 wastedflags = 0; /* reset so (?g-c) warns twice */
8143 break;
8144 case ':':
8145 paren = ':';
8146 /*FALLTHROUGH*/
8147 case ')':
8148 RExC_flags |= posflags;
8149 RExC_flags &= ~negflags;
a62b1201 8150 set_regex_charset(&RExC_flags, cs);
f7819f85
A
8151 if (paren != ':') {
8152 oregflags |= posflags;
8153 oregflags &= ~negflags;
a62b1201 8154 set_regex_charset(&oregflags, cs);
f7819f85 8155 }
cde0cee5
YO
8156 nextchar(pRExC_state);
8157 if (paren != ':') {
8158 *flagp = TRYAGAIN;
8159 return NULL;
8160 } else {
8161 ret = NULL;
8162 goto parse_rest;
8163 }
8164 /*NOTREACHED*/
8165 default:
cde0cee5
YO
8166 RExC_parse++;
8167 vFAIL3("Sequence (%.*s...) not recognized", RExC_parse-seqstart, seqstart);
8168 /*NOTREACHED*/
8169 }
830247a4 8170 ++RExC_parse;
48c036b1 8171 }
cde0cee5 8172 }} /* one for the default block, one for the switch */
a0d0e21e 8173 }
fac92740 8174 else { /* (...) */
81714fb9 8175 capturing_parens:
830247a4
IZ
8176 parno = RExC_npar;
8177 RExC_npar++;
e2e6a0f1 8178
830247a4 8179 ret = reganode(pRExC_state, OPEN, parno);
e2e6a0f1
YO
8180 if (!SIZE_ONLY ){
8181 if (!RExC_nestroot)
8182 RExC_nestroot = parno;
c009da3d
YO
8183 if (RExC_seen & REG_SEEN_RECURSE
8184 && !RExC_open_parens[parno-1])
8185 {
e2e6a0f1 8186 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
40d049e4
YO
8187 "Setting open paren #%"IVdf" to %d\n",
8188 (IV)parno, REG_NODE_NUM(ret)));
e2e6a0f1
YO
8189 RExC_open_parens[parno-1]= ret;
8190 }
6bda09f9 8191 }
fac92740
MJD
8192 Set_Node_Length(ret, 1); /* MJD */
8193 Set_Node_Offset(ret, RExC_parse); /* MJD */
6136c704 8194 is_open = 1;
a0d0e21e 8195 }
a0ed51b3 8196 }
fac92740 8197 else /* ! paren */
a0d0e21e 8198 ret = NULL;
cde0cee5
YO
8199
8200 parse_rest:
a0d0e21e 8201 /* Pick up the branches, linking them together. */
fac92740 8202 parse_start = RExC_parse; /* MJD */
3dab1dad 8203 br = regbranch(pRExC_state, &flags, 1,depth+1);
ee91d26e 8204
fac92740 8205 /* branch_len = (paren != 0); */
2af232bd 8206
a0d0e21e
LW
8207 if (br == NULL)
8208 return(NULL);
830247a4
IZ
8209 if (*RExC_parse == '|') {
8210 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9 8211 reginsert(pRExC_state, BRANCHJ, br, depth+1);
a0ed51b3 8212 }
fac92740 8213 else { /* MJD */
6bda09f9 8214 reginsert(pRExC_state, BRANCH, br, depth+1);
fac92740
MJD
8215 Set_Node_Length(br, paren != 0);
8216 Set_Node_Offset_To_R(br-RExC_emit_start, parse_start-RExC_start);
8217 }
c277df42
IZ
8218 have_branch = 1;
8219 if (SIZE_ONLY)
830247a4 8220 RExC_extralen += 1; /* For BRANCHJ-BRANCH. */
a0ed51b3
LW
8221 }
8222 else if (paren == ':') {
c277df42
IZ
8223 *flagp |= flags&SIMPLE;
8224 }
6136c704 8225 if (is_open) { /* Starts with OPEN. */
3dab1dad 8226 REGTAIL(pRExC_state, ret, br); /* OPEN -> first. */
a0ed51b3
LW
8227 }
8228 else if (paren != '?') /* Not Conditional */
a0d0e21e 8229 ret = br;
8ae10a67 8230 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
c277df42 8231 lastbr = br;
830247a4
IZ
8232 while (*RExC_parse == '|') {
8233 if (!SIZE_ONLY && RExC_extralen) {
8234 ender = reganode(pRExC_state, LONGJMP,0);
3dab1dad 8235 REGTAIL(pRExC_state, NEXTOPER(NEXTOPER(lastbr)), ender); /* Append to the previous. */
c277df42
IZ
8236 }
8237 if (SIZE_ONLY)
830247a4
IZ
8238 RExC_extralen += 2; /* Account for LONGJMP. */
8239 nextchar(pRExC_state);
594d7033
YO
8240 if (freeze_paren) {
8241 if (RExC_npar > after_freeze)
8242 after_freeze = RExC_npar;
8243 RExC_npar = freeze_paren;
8244 }
3dab1dad 8245 br = regbranch(pRExC_state, &flags, 0, depth+1);
2af232bd 8246
a687059c 8247 if (br == NULL)
a0d0e21e 8248 return(NULL);
3dab1dad 8249 REGTAIL(pRExC_state, lastbr, br); /* BRANCH -> BRANCH. */
c277df42 8250 lastbr = br;
8ae10a67 8251 *flagp |= flags & (SPSTART | HASWIDTH | POSTPONED);
a0d0e21e
LW
8252 }
8253
c277df42
IZ
8254 if (have_branch || paren != ':') {
8255 /* Make a closing node, and hook it on the end. */
8256 switch (paren) {
8257 case ':':
830247a4 8258 ender = reg_node(pRExC_state, TAIL);
c277df42
IZ
8259 break;
8260 case 1:
830247a4 8261 ender = reganode(pRExC_state, CLOSE, parno);
40d049e4
YO
8262 if (!SIZE_ONLY && RExC_seen & REG_SEEN_RECURSE) {
8263 DEBUG_OPTIMISE_MORE_r(PerlIO_printf(Perl_debug_log,
8264 "Setting close paren #%"IVdf" to %d\n",
8265 (IV)parno, REG_NODE_NUM(ender)));
8266 RExC_close_parens[parno-1]= ender;
e2e6a0f1
YO
8267 if (RExC_nestroot == parno)
8268 RExC_nestroot = 0;
40d049e4 8269 }
fac92740
MJD
8270 Set_Node_Offset(ender,RExC_parse+1); /* MJD */
8271 Set_Node_Length(ender,1); /* MJD */
c277df42
IZ
8272 break;
8273 case '<':
c277df42
IZ
8274 case ',':
8275 case '=':
8276 case '!':
c277df42 8277 *flagp &= ~HASWIDTH;
821b33a5
IZ
8278 /* FALL THROUGH */
8279 case '>':
830247a4 8280 ender = reg_node(pRExC_state, SUCCEED);
c277df42
IZ
8281 break;
8282 case 0:
830247a4 8283 ender = reg_node(pRExC_state, END);
40d049e4
YO
8284 if (!SIZE_ONLY) {
8285 assert(!RExC_opend); /* there can only be one! */
8286 RExC_opend = ender;
8287 }
c277df42
IZ
8288 break;
8289 }
eaf3ca90 8290 REGTAIL(pRExC_state, lastbr, ender);
a0d0e21e 8291
9674d46a 8292 if (have_branch && !SIZE_ONLY) {
eaf3ca90
YO
8293 if (depth==1)
8294 RExC_seen |= REG_TOP_LEVEL_BRANCHES;
8295
c277df42 8296 /* Hook the tails of the branches to the closing node. */
9674d46a
AL
8297 for (br = ret; br; br = regnext(br)) {
8298 const U8 op = PL_regkind[OP(br)];
8299 if (op == BRANCH) {
07be1b83 8300 REGTAIL_STUDY(pRExC_state, NEXTOPER(br), ender);
9674d46a
AL
8301 }
8302 else if (op == BRANCHJ) {
07be1b83 8303 REGTAIL_STUDY(pRExC_state, NEXTOPER(NEXTOPER(br)), ender);
9674d46a 8304 }
c277df42
IZ
8305 }
8306 }
a0d0e21e 8307 }
c277df42
IZ
8308
8309 {
e1ec3a88
AL
8310 const char *p;
8311 static const char parens[] = "=!<,>";
c277df42
IZ
8312
8313 if (paren && (p = strchr(parens, paren))) {
eb160463 8314 U8 node = ((p - parens) % 2) ? UNLESSM : IFMATCH;
c277df42
IZ
8315 int flag = (p - parens) > 1;
8316
8317 if (paren == '>')
8318 node = SUSPEND, flag = 0;
6bda09f9 8319 reginsert(pRExC_state, node,ret, depth+1);
45948336
EP
8320 Set_Node_Cur_Length(ret);
8321 Set_Node_Offset(ret, parse_start + 1);
c277df42 8322 ret->flags = flag;
07be1b83 8323 REGTAIL_STUDY(pRExC_state, ret, reg_node(pRExC_state, TAIL));
c277df42 8324 }
a0d0e21e
LW
8325 }
8326
8327 /* Check for proper termination. */
ce3e6498 8328 if (paren) {
e2509266 8329 RExC_flags = oregflags;
830247a4
IZ
8330 if (RExC_parse >= RExC_end || *nextchar(pRExC_state) != ')') {
8331 RExC_parse = oregcomp_parse;
380a0633 8332 vFAIL("Unmatched (");
ce3e6498 8333 }
a0ed51b3 8334 }
830247a4
IZ
8335 else if (!paren && RExC_parse < RExC_end) {
8336 if (*RExC_parse == ')') {
8337 RExC_parse++;
380a0633 8338 vFAIL("Unmatched )");
a0ed51b3
LW
8339 }
8340 else
b45f050a 8341 FAIL("Junk on end of regexp"); /* "Can't happen". */
a0d0e21e
LW
8342 /* NOTREACHED */
8343 }
b57e4118
KW
8344
8345 if (RExC_in_lookbehind) {
8346 RExC_in_lookbehind--;
8347 }
fd4be6f0 8348 if (after_freeze > RExC_npar)
594d7033 8349 RExC_npar = after_freeze;
a0d0e21e 8350 return(ret);
a687059c
LW
8351}
8352
8353/*
8354 - regbranch - one alternative of an | operator
8355 *
8356 * Implements the concatenation operator.
8357 */
76e3520e 8358STATIC regnode *
3dab1dad 8359S_regbranch(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, I32 first, U32 depth)
a687059c 8360{
97aff369 8361 dVAR;
c277df42
IZ
8362 register regnode *ret;
8363 register regnode *chain = NULL;
8364 register regnode *latest;
8365 I32 flags = 0, c = 0;
3dab1dad 8366 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8367
8368 PERL_ARGS_ASSERT_REGBRANCH;
8369
3dab1dad 8370 DEBUG_PARSE("brnc");
02daf0ab 8371
b81d288d 8372 if (first)
c277df42
IZ
8373 ret = NULL;
8374 else {
b81d288d 8375 if (!SIZE_ONLY && RExC_extralen)
830247a4 8376 ret = reganode(pRExC_state, BRANCHJ,0);
fac92740 8377 else {
830247a4 8378 ret = reg_node(pRExC_state, BRANCH);
fac92740
MJD
8379 Set_Node_Length(ret, 1);
8380 }
c277df42 8381 }
686b73d4 8382
b81d288d 8383 if (!first && SIZE_ONLY)
830247a4 8384 RExC_extralen += 1; /* BRANCHJ */
b81d288d 8385
c277df42 8386 *flagp = WORST; /* Tentatively. */
a0d0e21e 8387
830247a4
IZ
8388 RExC_parse--;
8389 nextchar(pRExC_state);
8390 while (RExC_parse < RExC_end && *RExC_parse != '|' && *RExC_parse != ')') {
a0d0e21e 8391 flags &= ~TRYAGAIN;
3dab1dad 8392 latest = regpiece(pRExC_state, &flags,depth+1);
a0d0e21e
LW
8393 if (latest == NULL) {
8394 if (flags & TRYAGAIN)
8395 continue;
8396 return(NULL);
a0ed51b3
LW
8397 }
8398 else if (ret == NULL)
c277df42 8399 ret = latest;
8ae10a67 8400 *flagp |= flags&(HASWIDTH|POSTPONED);
c277df42 8401 if (chain == NULL) /* First piece. */
a0d0e21e
LW
8402 *flagp |= flags&SPSTART;
8403 else {
830247a4 8404 RExC_naughty++;
3dab1dad 8405 REGTAIL(pRExC_state, chain, latest);
a687059c 8406 }
a0d0e21e 8407 chain = latest;
c277df42
IZ
8408 c++;
8409 }
8410 if (chain == NULL) { /* Loop ran zero times. */
830247a4 8411 chain = reg_node(pRExC_state, NOTHING);
c277df42
IZ
8412 if (ret == NULL)
8413 ret = chain;
8414 }
8415 if (c == 1) {
8416 *flagp |= flags&SIMPLE;
a0d0e21e 8417 }
a687059c 8418
d4c19fe8 8419 return ret;
a687059c
LW
8420}
8421
8422/*
8423 - regpiece - something followed by possible [*+?]
8424 *
8425 * Note that the branching code sequences used for ? and the general cases
8426 * of * and + are somewhat optimized: they use the same NOTHING node as
8427 * both the endmarker for their branch list and the body of the last branch.
8428 * It might seem that this node could be dispensed with entirely, but the
8429 * endmarker role is not redundant.
8430 */
76e3520e 8431STATIC regnode *
3dab1dad 8432S_regpiece(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 8433{
97aff369 8434 dVAR;
c277df42 8435 register regnode *ret;
a0d0e21e
LW
8436 register char op;
8437 register char *next;
8438 I32 flags;
1df70142 8439 const char * const origparse = RExC_parse;
a0d0e21e 8440 I32 min;
c277df42 8441 I32 max = REG_INFTY;
f19a8d85 8442#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8443 char *parse_start;
f19a8d85 8444#endif
10edeb5d 8445 const char *maxpos = NULL;
3dab1dad 8446 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8447
8448 PERL_ARGS_ASSERT_REGPIECE;
8449
3dab1dad 8450 DEBUG_PARSE("piec");
a0d0e21e 8451
3dab1dad 8452 ret = regatom(pRExC_state, &flags,depth+1);
a0d0e21e
LW
8453 if (ret == NULL) {
8454 if (flags & TRYAGAIN)
8455 *flagp |= TRYAGAIN;
8456 return(NULL);
8457 }
8458
830247a4 8459 op = *RExC_parse;
a0d0e21e 8460
830247a4 8461 if (op == '{' && regcurly(RExC_parse)) {
10edeb5d 8462 maxpos = NULL;
f19a8d85 8463#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8464 parse_start = RExC_parse; /* MJD */
f19a8d85 8465#endif
830247a4 8466 next = RExC_parse + 1;
a0d0e21e
LW
8467 while (isDIGIT(*next) || *next == ',') {
8468 if (*next == ',') {
8469 if (maxpos)
8470 break;
8471 else
8472 maxpos = next;
a687059c 8473 }
a0d0e21e
LW
8474 next++;
8475 }
8476 if (*next == '}') { /* got one */
8477 if (!maxpos)
8478 maxpos = next;
830247a4
IZ
8479 RExC_parse++;
8480 min = atoi(RExC_parse);
a0d0e21e
LW
8481 if (*maxpos == ',')
8482 maxpos++;
8483 else
830247a4 8484 maxpos = RExC_parse;
a0d0e21e
LW
8485 max = atoi(maxpos);
8486 if (!max && *maxpos != '0')
c277df42
IZ
8487 max = REG_INFTY; /* meaning "infinity" */
8488 else if (max >= REG_INFTY)
8615cb43 8489 vFAIL2("Quantifier in {,} bigger than %d", REG_INFTY - 1);
830247a4
IZ
8490 RExC_parse = next;
8491 nextchar(pRExC_state);
a0d0e21e
LW
8492
8493 do_curly:
8494 if ((flags&SIMPLE)) {
830247a4 8495 RExC_naughty += 2 + RExC_naughty / 2;
6bda09f9 8496 reginsert(pRExC_state, CURLY, ret, depth+1);
fac92740
MJD
8497 Set_Node_Offset(ret, parse_start+1); /* MJD */
8498 Set_Node_Cur_Length(ret);
a0d0e21e
LW
8499 }
8500 else {
3dab1dad 8501 regnode * const w = reg_node(pRExC_state, WHILEM);
2c2d71f5
JH
8502
8503 w->flags = 0;
3dab1dad 8504 REGTAIL(pRExC_state, ret, w);
830247a4 8505 if (!SIZE_ONLY && RExC_extralen) {
6bda09f9
YO
8506 reginsert(pRExC_state, LONGJMP,ret, depth+1);
8507 reginsert(pRExC_state, NOTHING,ret, depth+1);
c277df42
IZ
8508 NEXT_OFF(ret) = 3; /* Go over LONGJMP. */
8509 }
6bda09f9 8510 reginsert(pRExC_state, CURLYX,ret, depth+1);
fac92740
MJD
8511 /* MJD hk */
8512 Set_Node_Offset(ret, parse_start+1);
2af232bd 8513 Set_Node_Length(ret,
fac92740 8514 op == '{' ? (RExC_parse - parse_start) : 1);
2af232bd 8515
830247a4 8516 if (!SIZE_ONLY && RExC_extralen)
c277df42 8517 NEXT_OFF(ret) = 3; /* Go over NOTHING to LONGJMP. */
3dab1dad 8518 REGTAIL(pRExC_state, ret, reg_node(pRExC_state, NOTHING));
c277df42 8519 if (SIZE_ONLY)
830247a4
IZ
8520 RExC_whilem_seen++, RExC_extralen += 3;
8521 RExC_naughty += 4 + RExC_naughty; /* compound interest */
a0d0e21e 8522 }
c277df42 8523 ret->flags = 0;
a0d0e21e
LW
8524
8525 if (min > 0)
821b33a5
IZ
8526 *flagp = WORST;
8527 if (max > 0)
8528 *flagp |= HASWIDTH;
8fa23287 8529 if (max < min)
8615cb43 8530 vFAIL("Can't do {n,m} with n > m");
c277df42 8531 if (!SIZE_ONLY) {
eb160463
GS
8532 ARG1_SET(ret, (U16)min);
8533 ARG2_SET(ret, (U16)max);
a687059c 8534 }
a687059c 8535
a0d0e21e 8536 goto nest_check;
a687059c 8537 }
a0d0e21e 8538 }
a687059c 8539
a0d0e21e
LW
8540 if (!ISMULT1(op)) {
8541 *flagp = flags;
a687059c 8542 return(ret);
a0d0e21e 8543 }
bb20fd44 8544
c277df42 8545#if 0 /* Now runtime fix should be reliable. */
b45f050a
JF
8546
8547 /* if this is reinstated, don't forget to put this back into perldiag:
8548
8549 =item Regexp *+ operand could be empty at {#} in regex m/%s/
8550
8551 (F) The part of the regexp subject to either the * or + quantifier
8552 could match an empty string. The {#} shows in the regular
8553 expression about where the problem was discovered.
8554
8555 */
8556
bb20fd44 8557 if (!(flags&HASWIDTH) && op != '?')
b45f050a 8558 vFAIL("Regexp *+ operand could be empty");
b81d288d 8559#endif
bb20fd44 8560
f19a8d85 8561#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 8562 parse_start = RExC_parse;
f19a8d85 8563#endif
830247a4 8564 nextchar(pRExC_state);
a0d0e21e 8565
821b33a5 8566 *flagp = (op != '+') ? (WORST|SPSTART|HASWIDTH) : (WORST|HASWIDTH);
a0d0e21e
LW
8567
8568 if (op == '*' && (flags&SIMPLE)) {
6bda09f9 8569 reginsert(pRExC_state, STAR, ret, depth+1);
c277df42 8570 ret->flags = 0;
830247a4 8571 RExC_naughty += 4;
a0d0e21e
LW
8572 }
8573 else if (op == '*') {
8574 min = 0;
8575 goto do_curly;
a0ed51b3
LW
8576 }
8577 else if (op == '+' && (flags&SIMPLE)) {
6bda09f9 8578 reginsert(pRExC_state, PLUS, ret, depth+1);
c277df42 8579 ret->flags = 0;
830247a4 8580 RExC_naughty += 3;
a0d0e21e
LW
8581 }
8582 else if (op == '+') {
8583 min = 1;
8584 goto do_curly;
a0ed51b3
LW
8585 }
8586 else if (op == '?') {
a0d0e21e
LW
8587 min = 0; max = 1;
8588 goto do_curly;
8589 }
8590 nest_check:
668c081a
NC
8591 if (!SIZE_ONLY && !(flags&(HASWIDTH|POSTPONED)) && max > REG_INFTY/3) {
8592 ckWARN3reg(RExC_parse,
8593 "%.*s matches null string many times",
8594 (int)(RExC_parse >= origparse ? RExC_parse - origparse : 0),
8595 origparse);
a0d0e21e
LW
8596 }
8597
b9b4dddf 8598 if (RExC_parse < RExC_end && *RExC_parse == '?') {
830247a4 8599 nextchar(pRExC_state);
6bda09f9 8600 reginsert(pRExC_state, MINMOD, ret, depth+1);
3dab1dad 8601 REGTAIL(pRExC_state, ret, ret + NODE_STEP_REGNODE);
a0d0e21e 8602 }
b9b4dddf
YO
8603#ifndef REG_ALLOW_MINMOD_SUSPEND
8604 else
8605#endif
8606 if (RExC_parse < RExC_end && *RExC_parse == '+') {
8607 regnode *ender;
8608 nextchar(pRExC_state);
8609 ender = reg_node(pRExC_state, SUCCEED);
8610 REGTAIL(pRExC_state, ret, ender);
8611 reginsert(pRExC_state, SUSPEND, ret, depth+1);
8612 ret->flags = 0;
8613 ender = reg_node(pRExC_state, TAIL);
8614 REGTAIL(pRExC_state, ret, ender);
8615 /*ret= ender;*/
8616 }
8617
8618 if (RExC_parse < RExC_end && ISMULT2(RExC_parse)) {
830247a4 8619 RExC_parse++;
b45f050a
JF
8620 vFAIL("Nested quantifiers");
8621 }
a0d0e21e
LW
8622
8623 return(ret);
a687059c
LW
8624}
8625
fc8cd66c 8626
9d64099b 8627/* reg_namedseq(pRExC_state,UVp, UV depth)
fc8cd66c
YO
8628
8629 This is expected to be called by a parser routine that has
afefe6bf 8630 recognized '\N' and needs to handle the rest. RExC_parse is
fc8cd66c
YO
8631 expected to point at the first char following the N at the time
8632 of the call.
ff3f963a
KW
8633
8634 The \N may be inside (indicated by valuep not being NULL) or outside a
8635 character class.
8636
8637 \N may begin either a named sequence, or if outside a character class, mean
8638 to match a non-newline. For non single-quoted regexes, the tokenizer has
8639 attempted to decide which, and in the case of a named sequence converted it
8640 into one of the forms: \N{} (if the sequence is null), or \N{U+c1.c2...},
8641 where c1... are the characters in the sequence. For single-quoted regexes,
8642 the tokenizer passes the \N sequence through unchanged; this code will not
8643 attempt to determine this nor expand those. The net effect is that if the
8644 beginning of the passed-in pattern isn't '{U+' or there is no '}', it
8645 signals that this \N occurrence means to match a non-newline.
8646
8647 Only the \N{U+...} form should occur in a character class, for the same
8648 reason that '.' inside a character class means to just match a period: it
8649 just doesn't make sense.
fc8cd66c
YO
8650
8651 If valuep is non-null then it is assumed that we are parsing inside
8652 of a charclass definition and the first codepoint in the resolved
8653 string is returned via *valuep and the routine will return NULL.
8654 In this mode if a multichar string is returned from the charnames
ff3f963a 8655 handler, a warning will be issued, and only the first char in the
fc8cd66c
YO
8656 sequence will be examined. If the string returned is zero length
8657 then the value of *valuep is undefined and NON-NULL will
8658 be returned to indicate failure. (This will NOT be a valid pointer
8659 to a regnode.)
8660
ff3f963a
KW
8661 If valuep is null then it is assumed that we are parsing normal text and a
8662 new EXACT node is inserted into the program containing the resolved string,
8663 and a pointer to the new node is returned. But if the string is zero length
8664 a NOTHING node is emitted instead.
afefe6bf 8665
fc8cd66c 8666 On success RExC_parse is set to the char following the endbrace.
ff3f963a 8667 Parsing failures will generate a fatal error via vFAIL(...)
fc8cd66c
YO
8668 */
8669STATIC regnode *
9d64099b 8670S_reg_namedseq(pTHX_ RExC_state_t *pRExC_state, UV *valuep, I32 *flagp, U32 depth)
fc8cd66c 8671{
c3c41406 8672 char * endbrace; /* '}' following the name */
fc8cd66c 8673 regnode *ret = NULL;
c3c41406 8674 char* p;
ff3f963a
KW
8675
8676 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
8677
8678 PERL_ARGS_ASSERT_REG_NAMEDSEQ;
ff3f963a
KW
8679
8680 GET_RE_DEBUG_FLAGS;
c3c41406
KW
8681
8682 /* The [^\n] meaning of \N ignores spaces and comments under the /x
8683 * modifier. The other meaning does not */
8684 p = (RExC_flags & RXf_PMf_EXTENDED)
8685 ? regwhite( pRExC_state, RExC_parse )
8686 : RExC_parse;
7918f24d 8687
ff3f963a 8688 /* Disambiguate between \N meaning a named character versus \N meaning
c3c41406
KW
8689 * [^\n]. The former is assumed when it can't be the latter. */
8690 if (*p != '{' || regcurly(p)) {
8691 RExC_parse = p;
ff3f963a 8692 if (valuep) {
afefe6bf 8693 /* no bare \N in a charclass */
ff3f963a
KW
8694 vFAIL("\\N in a character class must be a named character: \\N{...}");
8695 }
afefe6bf
RGS
8696 nextchar(pRExC_state);
8697 ret = reg_node(pRExC_state, REG_ANY);
8698 *flagp |= HASWIDTH|SIMPLE;
8699 RExC_naughty++;
8700 RExC_parse--;
8701 Set_Node_Length(ret, 1); /* MJD */
8702 return ret;
fc8cd66c 8703 }
a4893424 8704
c3c41406
KW
8705 /* Here, we have decided it should be a named sequence */
8706
8707 /* The test above made sure that the next real character is a '{', but
8708 * under the /x modifier, it could be separated by space (or a comment and
8709 * \n) and this is not allowed (for consistency with \x{...} and the
8710 * tokenizer handling of \N{NAME}). */
8711 if (*RExC_parse != '{') {
8712 vFAIL("Missing braces on \\N{}");
8713 }
8714
ff3f963a 8715 RExC_parse++; /* Skip past the '{' */
c3c41406
KW
8716
8717 if (! (endbrace = strchr(RExC_parse, '}')) /* no trailing brace */
8718 || ! (endbrace == RExC_parse /* nothing between the {} */
8719 || (endbrace - RExC_parse >= 2 /* U+ (bad hex is checked below */
8720 && strnEQ(RExC_parse, "U+", 2)))) /* for a better error msg) */
8721 {
8722 if (endbrace) RExC_parse = endbrace; /* position msg's '<--HERE' */
8723 vFAIL("\\N{NAME} must be resolved by the lexer");
8724 }
8725
ff3f963a
KW
8726 if (endbrace == RExC_parse) { /* empty: \N{} */
8727 if (! valuep) {
8728 RExC_parse = endbrace + 1;
8729 return reg_node(pRExC_state,NOTHING);
a4893424 8730 }
fc8cd66c 8731
ff3f963a
KW
8732 if (SIZE_ONLY) {
8733 ckWARNreg(RExC_parse,
8734 "Ignoring zero length \\N{} in character class"
8735 );
8736 RExC_parse = endbrace + 1;
8737 }
8738 *valuep = 0;
8739 return (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 8740 }
ff3f963a 8741
62fed28b 8742 REQUIRE_UTF8; /* named sequences imply Unicode semantics */
ff3f963a
KW
8743 RExC_parse += 2; /* Skip past the 'U+' */
8744
8745 if (valuep) { /* In a bracketed char class */
8746 /* We only pay attention to the first char of
8747 multichar strings being returned. I kinda wonder
8748 if this makes sense as it does change the behaviour
8749 from earlier versions, OTOH that behaviour was broken
8750 as well. XXX Solution is to recharacterize as
8751 [rest-of-class]|multi1|multi2... */
8752
8753 STRLEN length_of_hex;
8754 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
8755 | PERL_SCAN_DISALLOW_PREFIX
8756 | (SIZE_ONLY ? PERL_SCAN_SILENT_ILLDIGIT : 0);
8757
37820adc
KW
8758 char * endchar = RExC_parse + strcspn(RExC_parse, ".}");
8759 if (endchar < endbrace) {
ff3f963a
KW
8760 ckWARNreg(endchar, "Using just the first character returned by \\N{} in character class");
8761 }
ff3f963a
KW
8762
8763 length_of_hex = (STRLEN)(endchar - RExC_parse);
8764 *valuep = grok_hex(RExC_parse, &length_of_hex, &flags, NULL);
8765
8766 /* The tokenizer should have guaranteed validity, but it's possible to
8767 * bypass it by using single quoting, so check */
c3c41406
KW
8768 if (length_of_hex == 0
8769 || length_of_hex != (STRLEN)(endchar - RExC_parse) )
8770 {
8771 RExC_parse += length_of_hex; /* Includes all the valid */
8772 RExC_parse += (RExC_orig_utf8) /* point to after 1st invalid */
8773 ? UTF8SKIP(RExC_parse)
8774 : 1;
8775 /* Guard against malformed utf8 */
8776 if (RExC_parse >= endchar) RExC_parse = endchar;
8777 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a
KW
8778 }
8779
8780 RExC_parse = endbrace + 1;
8781 if (endchar == endbrace) return NULL;
8782
8783 ret = (regnode *) &RExC_parse; /* Invalid regnode pointer */
fc8cd66c 8784 }
ff3f963a 8785 else { /* Not a char class */
e2a7e165
KW
8786
8787 /* What is done here is to convert this to a sub-pattern of the form
8788 * (?:\x{char1}\x{char2}...)
8789 * and then call reg recursively. That way, it retains its atomicness,
8790 * while not having to worry about special handling that some code
8791 * points may have. toke.c has converted the original Unicode values
8792 * to native, so that we can just pass on the hex values unchanged. We
8793 * do have to set a flag to keep recoding from happening in the
8794 * recursion */
8795
8796 SV * substitute_parse = newSVpvn_flags("?:", 2, SVf_UTF8|SVs_TEMP);
8797 STRLEN len;
ff3f963a
KW
8798 char *endchar; /* Points to '.' or '}' ending cur char in the input
8799 stream */
e2a7e165
KW
8800 char *orig_end = RExC_end;
8801
8802 while (RExC_parse < endbrace) {
ff3f963a
KW
8803
8804 /* Code points are separated by dots. If none, there is only one
8805 * code point, and is terminated by the brace */
37820adc 8806 endchar = RExC_parse + strcspn(RExC_parse, ".}");
ff3f963a 8807
e2a7e165
KW
8808 /* Convert to notation the rest of the code understands */
8809 sv_catpv(substitute_parse, "\\x{");
8810 sv_catpvn(substitute_parse, RExC_parse, endchar - RExC_parse);
8811 sv_catpv(substitute_parse, "}");
ff3f963a
KW
8812
8813 /* Point to the beginning of the next character in the sequence. */
8814 RExC_parse = endchar + 1;
ff3f963a 8815 }
e2a7e165 8816 sv_catpv(substitute_parse, ")");
ff3f963a 8817
e2a7e165 8818 RExC_parse = SvPV(substitute_parse, len);
ff3f963a 8819
e2a7e165
KW
8820 /* Don't allow empty number */
8821 if (len < 8) {
8822 vFAIL("Invalid hexadecimal number in \\N{U+...}");
ff3f963a 8823 }
e2a7e165 8824 RExC_end = RExC_parse + len;
ff3f963a 8825
e2a7e165
KW
8826 /* The values are Unicode, and therefore not subject to recoding */
8827 RExC_override_recoding = 1;
8828
8829 ret = reg(pRExC_state, 1, flagp, depth+1);
8830
8831 RExC_parse = endbrace;
8832 RExC_end = orig_end;
8833 RExC_override_recoding = 0;
ff3f963a 8834
ff3f963a
KW
8835 nextchar(pRExC_state);
8836 }
8837
8838 return ret;
fc8cd66c
YO
8839}
8840
8841
9e08bc66
TS
8842/*
8843 * reg_recode
8844 *
8845 * It returns the code point in utf8 for the value in *encp.
8846 * value: a code value in the source encoding
8847 * encp: a pointer to an Encode object
8848 *
8849 * If the result from Encode is not a single character,
8850 * it returns U+FFFD (Replacement character) and sets *encp to NULL.
8851 */
8852STATIC UV
8853S_reg_recode(pTHX_ const char value, SV **encp)
8854{
8855 STRLEN numlen = 1;
59cd0e26 8856 SV * const sv = newSVpvn_flags(&value, numlen, SVs_TEMP);
c86f7df5 8857 const char * const s = *encp ? sv_recode_to_utf8(sv, *encp) : SvPVX(sv);
9e08bc66
TS
8858 const STRLEN newlen = SvCUR(sv);
8859 UV uv = UNICODE_REPLACEMENT;
8860
7918f24d
NC
8861 PERL_ARGS_ASSERT_REG_RECODE;
8862
9e08bc66
TS
8863 if (newlen)
8864 uv = SvUTF8(sv)
8865 ? utf8n_to_uvchr((U8*)s, newlen, &numlen, UTF8_ALLOW_DEFAULT)
8866 : *(U8*)s;
8867
8868 if (!newlen || numlen != newlen) {
8869 uv = UNICODE_REPLACEMENT;
c86f7df5 8870 *encp = NULL;
9e08bc66
TS
8871 }
8872 return uv;
8873}
8874
fc8cd66c 8875
a687059c
LW
8876/*
8877 - regatom - the lowest level
ee9b8eae
YO
8878
8879 Try to identify anything special at the start of the pattern. If there
8880 is, then handle it as required. This may involve generating a single regop,
8881 such as for an assertion; or it may involve recursing, such as to
8882 handle a () structure.
8883
8884 If the string doesn't start with something special then we gobble up
8885 as much literal text as we can.
8886
8887 Once we have been able to handle whatever type of thing started the
8888 sequence, we return.
8889
8890 Note: we have to be careful with escapes, as they can be both literal
8891 and special, and in the case of \10 and friends can either, depending
486ec47a 8892 on context. Specifically there are two separate switches for handling
ee9b8eae
YO
8893 escape sequences, with the one for handling literal escapes requiring
8894 a dummy entry for all of the special escapes that are actually handled
8895 by the other.
8896*/
8897
76e3520e 8898STATIC regnode *
3dab1dad 8899S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
a687059c 8900{
97aff369 8901 dVAR;
cbbf8932 8902 register regnode *ret = NULL;
a0d0e21e 8903 I32 flags;
45948336 8904 char *parse_start = RExC_parse;
980866de 8905 U8 op;
3dab1dad
YO
8906 GET_RE_DEBUG_FLAGS_DECL;
8907 DEBUG_PARSE("atom");
a0d0e21e
LW
8908 *flagp = WORST; /* Tentatively. */
8909
7918f24d 8910 PERL_ARGS_ASSERT_REGATOM;
ee9b8eae 8911
a0d0e21e 8912tryagain:
f9a79580 8913 switch ((U8)*RExC_parse) {
a0d0e21e 8914 case '^':
830247a4
IZ
8915 RExC_seen_zerolen++;
8916 nextchar(pRExC_state);
bbe252da 8917 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 8918 ret = reg_node(pRExC_state, MBOL);
bbe252da 8919 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 8920 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 8921 else
830247a4 8922 ret = reg_node(pRExC_state, BOL);
fac92740 8923 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
8924 break;
8925 case '$':
830247a4 8926 nextchar(pRExC_state);
b81d288d 8927 if (*RExC_parse)
830247a4 8928 RExC_seen_zerolen++;
bbe252da 8929 if (RExC_flags & RXf_PMf_MULTILINE)
830247a4 8930 ret = reg_node(pRExC_state, MEOL);
bbe252da 8931 else if (RExC_flags & RXf_PMf_SINGLELINE)
830247a4 8932 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 8933 else
830247a4 8934 ret = reg_node(pRExC_state, EOL);
fac92740 8935 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
8936 break;
8937 case '.':
830247a4 8938 nextchar(pRExC_state);
bbe252da 8939 if (RExC_flags & RXf_PMf_SINGLELINE)
ffc61ed2
JH
8940 ret = reg_node(pRExC_state, SANY);
8941 else
8942 ret = reg_node(pRExC_state, REG_ANY);
8943 *flagp |= HASWIDTH|SIMPLE;
830247a4 8944 RExC_naughty++;
fac92740 8945 Set_Node_Length(ret, 1); /* MJD */
a0d0e21e
LW
8946 break;
8947 case '[':
b45f050a 8948 {
3dab1dad
YO
8949 char * const oregcomp_parse = ++RExC_parse;
8950 ret = regclass(pRExC_state,depth+1);
830247a4
IZ
8951 if (*RExC_parse != ']') {
8952 RExC_parse = oregcomp_parse;
b45f050a
JF
8953 vFAIL("Unmatched [");
8954 }
830247a4 8955 nextchar(pRExC_state);
a0d0e21e 8956 *flagp |= HASWIDTH|SIMPLE;
fac92740 8957 Set_Node_Length(ret, RExC_parse - oregcomp_parse + 1); /* MJD */
a0d0e21e 8958 break;
b45f050a 8959 }
a0d0e21e 8960 case '(':
830247a4 8961 nextchar(pRExC_state);
3dab1dad 8962 ret = reg(pRExC_state, 1, &flags,depth+1);
a0d0e21e 8963 if (ret == NULL) {
bf93d4cc 8964 if (flags & TRYAGAIN) {
830247a4 8965 if (RExC_parse == RExC_end) {
bf93d4cc
GS
8966 /* Make parent create an empty node if needed. */
8967 *flagp |= TRYAGAIN;
8968 return(NULL);
8969 }
a0d0e21e 8970 goto tryagain;
bf93d4cc 8971 }
a0d0e21e
LW
8972 return(NULL);
8973 }
a3b492c3 8974 *flagp |= flags&(HASWIDTH|SPSTART|SIMPLE|POSTPONED);
a0d0e21e
LW
8975 break;
8976 case '|':
8977 case ')':
8978 if (flags & TRYAGAIN) {
8979 *flagp |= TRYAGAIN;
8980 return NULL;
8981 }
b45f050a 8982 vFAIL("Internal urp");
a0d0e21e
LW
8983 /* Supposed to be caught earlier. */
8984 break;
85afd4ae 8985 case '{':
830247a4
IZ
8986 if (!regcurly(RExC_parse)) {
8987 RExC_parse++;
85afd4ae
CS
8988 goto defchar;
8989 }
8990 /* FALL THROUGH */
a0d0e21e
LW
8991 case '?':
8992 case '+':
8993 case '*':
830247a4 8994 RExC_parse++;
b45f050a 8995 vFAIL("Quantifier follows nothing");
a0d0e21e
LW
8996 break;
8997 case '\\':
ee9b8eae
YO
8998 /* Special Escapes
8999
9000 This switch handles escape sequences that resolve to some kind
9001 of special regop and not to literal text. Escape sequnces that
9002 resolve to literal text are handled below in the switch marked
9003 "Literal Escapes".
9004
9005 Every entry in this switch *must* have a corresponding entry
9006 in the literal escape switch. However, the opposite is not
9007 required, as the default for this switch is to jump to the
9008 literal text handling code.
9009 */
a0a388a1 9010 switch ((U8)*++RExC_parse) {
ee9b8eae 9011 /* Special Escapes */
a0d0e21e 9012 case 'A':
830247a4
IZ
9013 RExC_seen_zerolen++;
9014 ret = reg_node(pRExC_state, SBOL);
a0d0e21e 9015 *flagp |= SIMPLE;
ee9b8eae 9016 goto finish_meta_pat;
a0d0e21e 9017 case 'G':
830247a4
IZ
9018 ret = reg_node(pRExC_state, GPOS);
9019 RExC_seen |= REG_SEEN_GPOS;
a0d0e21e 9020 *flagp |= SIMPLE;
ee9b8eae
YO
9021 goto finish_meta_pat;
9022 case 'K':
9023 RExC_seen_zerolen++;
9024 ret = reg_node(pRExC_state, KEEPS);
9025 *flagp |= SIMPLE;
37923168
RGS
9026 /* XXX:dmq : disabling in-place substitution seems to
9027 * be necessary here to avoid cases of memory corruption, as
9028 * with: C<$_="x" x 80; s/x\K/y/> -- rgs
9029 */
9030 RExC_seen |= REG_SEEN_LOOKBEHIND;
ee9b8eae 9031 goto finish_meta_pat;
a0d0e21e 9032 case 'Z':
830247a4 9033 ret = reg_node(pRExC_state, SEOL);
a0d0e21e 9034 *flagp |= SIMPLE;
a1917ab9 9035 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 9036 goto finish_meta_pat;
b85d18e9 9037 case 'z':
830247a4 9038 ret = reg_node(pRExC_state, EOS);
b85d18e9 9039 *flagp |= SIMPLE;
830247a4 9040 RExC_seen_zerolen++; /* Do not optimize RE away */
ee9b8eae 9041 goto finish_meta_pat;
4a2d328f 9042 case 'C':
f33976b4
DB
9043 ret = reg_node(pRExC_state, CANY);
9044 RExC_seen |= REG_SEEN_CANY;
a0ed51b3 9045 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 9046 goto finish_meta_pat;
a0ed51b3 9047 case 'X':
830247a4 9048 ret = reg_node(pRExC_state, CLUMP);
a0ed51b3 9049 *flagp |= HASWIDTH;
ee9b8eae 9050 goto finish_meta_pat;
a0d0e21e 9051 case 'w':
980866de
KW
9052 switch (get_regex_charset(RExC_flags)) {
9053 case REGEX_LOCALE_CHARSET:
9054 op = ALNUML;
9055 break;
9056 case REGEX_UNICODE_CHARSET:
9057 op = ALNUMU;
9058 break;
cfaf538b 9059 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 9060 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
9061 op = ALNUMA;
9062 break;
980866de
KW
9063 case REGEX_DEPENDS_CHARSET:
9064 op = ALNUM;
9065 break;
9066 default:
9067 goto bad_charset;
a12cf05f 9068 }
980866de 9069 ret = reg_node(pRExC_state, op);
a0d0e21e 9070 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 9071 goto finish_meta_pat;
a0d0e21e 9072 case 'W':
980866de
KW
9073 switch (get_regex_charset(RExC_flags)) {
9074 case REGEX_LOCALE_CHARSET:
9075 op = NALNUML;
9076 break;
9077 case REGEX_UNICODE_CHARSET:
9078 op = NALNUMU;
9079 break;
cfaf538b 9080 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 9081 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
9082 op = NALNUMA;
9083 break;
980866de
KW
9084 case REGEX_DEPENDS_CHARSET:
9085 op = NALNUM;
9086 break;
9087 default:
9088 goto bad_charset;
a12cf05f 9089 }
980866de 9090 ret = reg_node(pRExC_state, op);
a0d0e21e 9091 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 9092 goto finish_meta_pat;
a0d0e21e 9093 case 'b':
830247a4
IZ
9094 RExC_seen_zerolen++;
9095 RExC_seen |= REG_SEEN_LOOKBEHIND;
63ac0dad
KW
9096 switch (get_regex_charset(RExC_flags)) {
9097 case REGEX_LOCALE_CHARSET:
9098 op = BOUNDL;
9099 break;
9100 case REGEX_UNICODE_CHARSET:
9101 op = BOUNDU;
9102 break;
cfaf538b 9103 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 9104 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
9105 op = BOUNDA;
9106 break;
63ac0dad
KW
9107 case REGEX_DEPENDS_CHARSET:
9108 op = BOUND;
9109 break;
9110 default:
9111 goto bad_charset;
a12cf05f 9112 }
63ac0dad 9113 ret = reg_node(pRExC_state, op);
b988e673 9114 FLAGS(ret) = get_regex_charset(RExC_flags);
a0d0e21e 9115 *flagp |= SIMPLE;
5024bc2d
KW
9116 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
9117 ckWARNregdep(RExC_parse, "\"\\b{\" is deprecated; use \"\\b\\{\" instead");
9118 }
ee9b8eae 9119 goto finish_meta_pat;
a0d0e21e 9120 case 'B':
830247a4
IZ
9121 RExC_seen_zerolen++;
9122 RExC_seen |= REG_SEEN_LOOKBEHIND;
63ac0dad
KW
9123 switch (get_regex_charset(RExC_flags)) {
9124 case REGEX_LOCALE_CHARSET:
9125 op = NBOUNDL;
9126 break;
9127 case REGEX_UNICODE_CHARSET:
9128 op = NBOUNDU;
9129 break;
cfaf538b 9130 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 9131 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
9132 op = NBOUNDA;
9133 break;
63ac0dad
KW
9134 case REGEX_DEPENDS_CHARSET:
9135 op = NBOUND;
9136 break;
9137 default:
9138 goto bad_charset;
a12cf05f 9139 }
63ac0dad 9140 ret = reg_node(pRExC_state, op);
b988e673 9141 FLAGS(ret) = get_regex_charset(RExC_flags);
a0d0e21e 9142 *flagp |= SIMPLE;
5024bc2d
KW
9143 if (! SIZE_ONLY && (U8) *(RExC_parse + 1) == '{') {
9144 ckWARNregdep(RExC_parse, "\"\\B{\" is deprecated; use \"\\B\\{\" instead");
9145 }
ee9b8eae 9146 goto finish_meta_pat;
a0d0e21e 9147 case 's':
980866de
KW
9148 switch (get_regex_charset(RExC_flags)) {
9149 case REGEX_LOCALE_CHARSET:
9150 op = SPACEL;
9151 break;
9152 case REGEX_UNICODE_CHARSET:
9153 op = SPACEU;
9154 break;
cfaf538b 9155 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 9156 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
9157 op = SPACEA;
9158 break;
980866de
KW
9159 case REGEX_DEPENDS_CHARSET:
9160 op = SPACE;
9161 break;
9162 default:
9163 goto bad_charset;
a12cf05f 9164 }
980866de 9165 ret = reg_node(pRExC_state, op);
a0d0e21e 9166 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 9167 goto finish_meta_pat;
a0d0e21e 9168 case 'S':
980866de
KW
9169 switch (get_regex_charset(RExC_flags)) {
9170 case REGEX_LOCALE_CHARSET:
9171 op = NSPACEL;
9172 break;
9173 case REGEX_UNICODE_CHARSET:
9174 op = NSPACEU;
9175 break;
cfaf538b 9176 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 9177 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
9178 op = NSPACEA;
9179 break;
980866de
KW
9180 case REGEX_DEPENDS_CHARSET:
9181 op = NSPACE;
9182 break;
9183 default:
9184 goto bad_charset;
a12cf05f 9185 }
980866de 9186 ret = reg_node(pRExC_state, op);
a0d0e21e 9187 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 9188 goto finish_meta_pat;
a0d0e21e 9189 case 'd':
56ae17b4
KW
9190 switch (get_regex_charset(RExC_flags)) {
9191 case REGEX_LOCALE_CHARSET:
9192 op = DIGITL;
9193 break;
cfaf538b 9194 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 9195 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
9196 op = DIGITA;
9197 break;
56ae17b4
KW
9198 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9199 case REGEX_UNICODE_CHARSET:
9200 op = DIGIT;
9201 break;
9202 default:
9203 goto bad_charset;
6ab9ea91 9204 }
56ae17b4 9205 ret = reg_node(pRExC_state, op);
a0d0e21e 9206 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 9207 goto finish_meta_pat;
a0d0e21e 9208 case 'D':
56ae17b4
KW
9209 switch (get_regex_charset(RExC_flags)) {
9210 case REGEX_LOCALE_CHARSET:
9211 op = NDIGITL;
9212 break;
cfaf538b 9213 case REGEX_ASCII_RESTRICTED_CHARSET:
2f7f8cb1 9214 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
cfaf538b
KW
9215 op = NDIGITA;
9216 break;
56ae17b4
KW
9217 case REGEX_DEPENDS_CHARSET: /* No difference between these */
9218 case REGEX_UNICODE_CHARSET:
9219 op = NDIGIT;
9220 break;
9221 default:
9222 goto bad_charset;
6ab9ea91 9223 }
56ae17b4 9224 ret = reg_node(pRExC_state, op);
a0d0e21e 9225 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 9226 goto finish_meta_pat;
e1d1eefb
YO
9227 case 'R':
9228 ret = reg_node(pRExC_state, LNBREAK);
9229 *flagp |= HASWIDTH|SIMPLE;
9230 goto finish_meta_pat;
9231 case 'h':
9232 ret = reg_node(pRExC_state, HORIZWS);
9233 *flagp |= HASWIDTH|SIMPLE;
9234 goto finish_meta_pat;
9235 case 'H':
9236 ret = reg_node(pRExC_state, NHORIZWS);
9237 *flagp |= HASWIDTH|SIMPLE;
9238 goto finish_meta_pat;
ee9b8eae 9239 case 'v':
e1d1eefb
YO
9240 ret = reg_node(pRExC_state, VERTWS);
9241 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae
YO
9242 goto finish_meta_pat;
9243 case 'V':
e1d1eefb
YO
9244 ret = reg_node(pRExC_state, NVERTWS);
9245 *flagp |= HASWIDTH|SIMPLE;
ee9b8eae 9246 finish_meta_pat:
830247a4 9247 nextchar(pRExC_state);
fac92740 9248 Set_Node_Length(ret, 2); /* MJD */
ee9b8eae 9249 break;
a14b48bc
LW
9250 case 'p':
9251 case 'P':
686b73d4 9252 {
3dab1dad 9253 char* const oldregxend = RExC_end;
d008bc60 9254#ifdef DEBUGGING
ccb2c380 9255 char* parse_start = RExC_parse - 2;
d008bc60 9256#endif
a14b48bc 9257
830247a4 9258 if (RExC_parse[1] == '{') {
3568d838 9259 /* a lovely hack--pretend we saw [\pX] instead */
830247a4
IZ
9260 RExC_end = strchr(RExC_parse, '}');
9261 if (!RExC_end) {
3dab1dad 9262 const U8 c = (U8)*RExC_parse;
830247a4
IZ
9263 RExC_parse += 2;
9264 RExC_end = oldregxend;
0da60cf5 9265 vFAIL2("Missing right brace on \\%c{}", c);
b45f050a 9266 }
830247a4 9267 RExC_end++;
a14b48bc 9268 }
af6f566e 9269 else {
830247a4 9270 RExC_end = RExC_parse + 2;
af6f566e
HS
9271 if (RExC_end > oldregxend)
9272 RExC_end = oldregxend;
9273 }
830247a4 9274 RExC_parse--;
a14b48bc 9275
3dab1dad 9276 ret = regclass(pRExC_state,depth+1);
a14b48bc 9277
830247a4
IZ
9278 RExC_end = oldregxend;
9279 RExC_parse--;
ccb2c380
MP
9280
9281 Set_Node_Offset(ret, parse_start + 2);
9282 Set_Node_Cur_Length(ret);
830247a4 9283 nextchar(pRExC_state);
a14b48bc
LW
9284 *flagp |= HASWIDTH|SIMPLE;
9285 }
9286 break;
fc8cd66c 9287 case 'N':
afefe6bf 9288 /* Handle \N and \N{NAME} here and not below because it can be
fc8cd66c
YO
9289 multicharacter. join_exact() will join them up later on.
9290 Also this makes sure that things like /\N{BLAH}+/ and
9291 \N{BLAH} being multi char Just Happen. dmq*/
9292 ++RExC_parse;
9d64099b 9293 ret= reg_namedseq(pRExC_state, NULL, flagp, depth);
fc8cd66c 9294 break;
0a4db386 9295 case 'k': /* Handle \k<NAME> and \k'NAME' */
1f1031fe 9296 parse_named_seq:
81714fb9
YO
9297 {
9298 char ch= RExC_parse[1];
1f1031fe
YO
9299 if (ch != '<' && ch != '\'' && ch != '{') {
9300 RExC_parse++;
9301 vFAIL2("Sequence %.2s... not terminated",parse_start);
81714fb9 9302 } else {
1f1031fe
YO
9303 /* this pretty much dupes the code for (?P=...) in reg(), if
9304 you change this make sure you change that */
81714fb9 9305 char* name_start = (RExC_parse += 2);
2eccd3b2 9306 U32 num = 0;
0a4db386
YO
9307 SV *sv_dat = reg_scan_name(pRExC_state,
9308 SIZE_ONLY ? REG_RSN_RETURN_NULL : REG_RSN_RETURN_DATA);
1f1031fe 9309 ch= (ch == '<') ? '>' : (ch == '{') ? '}' : '\'';
81714fb9 9310 if (RExC_parse == name_start || *RExC_parse != ch)
1f1031fe
YO
9311 vFAIL2("Sequence %.3s... not terminated",parse_start);
9312
9313 if (!SIZE_ONLY) {
9314 num = add_data( pRExC_state, 1, "S" );
9315 RExC_rxi->data->data[num]=(void*)sv_dat;
5a5094bd 9316 SvREFCNT_inc_simple_void(sv_dat);
1f1031fe
YO
9317 }
9318
81714fb9
YO
9319 RExC_sawback = 1;
9320 ret = reganode(pRExC_state,
4444fd9f
KW
9321 ((! FOLD)
9322 ? NREF
2f7f8cb1
KW
9323 : (MORE_ASCII_RESTRICTED)
9324 ? NREFFA
9325 : (AT_LEAST_UNI_SEMANTICS)
9326 ? NREFFU
9327 : (LOC)
9328 ? NREFFL
9329 : NREFF),
4444fd9f 9330 num);
81714fb9 9331 *flagp |= HASWIDTH;
1f1031fe 9332
81714fb9
YO
9333 /* override incorrect value set in reganode MJD */
9334 Set_Node_Offset(ret, parse_start+1);
9335 Set_Node_Cur_Length(ret); /* MJD */
9336 nextchar(pRExC_state);
1f1031fe 9337
81714fb9
YO
9338 }
9339 break;
1f1031fe 9340 }
2bf803e2 9341 case 'g':
a0d0e21e
LW
9342 case '1': case '2': case '3': case '4':
9343 case '5': case '6': case '7': case '8': case '9':
9344 {
c74340f9 9345 I32 num;
2bf803e2
YO
9346 bool isg = *RExC_parse == 'g';
9347 bool isrel = 0;
9348 bool hasbrace = 0;
9349 if (isg) {
c74340f9 9350 RExC_parse++;
2bf803e2
YO
9351 if (*RExC_parse == '{') {
9352 RExC_parse++;
9353 hasbrace = 1;
9354 }
9355 if (*RExC_parse == '-') {
9356 RExC_parse++;
9357 isrel = 1;
9358 }
1f1031fe
YO
9359 if (hasbrace && !isDIGIT(*RExC_parse)) {
9360 if (isrel) RExC_parse--;
9361 RExC_parse -= 2;
9362 goto parse_named_seq;
9363 } }
c74340f9 9364 num = atoi(RExC_parse);
b72d83b2
RGS
9365 if (isg && num == 0)
9366 vFAIL("Reference to invalid group 0");
c74340f9 9367 if (isrel) {
5624f11d 9368 num = RExC_npar - num;
c74340f9
YO
9369 if (num < 1)
9370 vFAIL("Reference to nonexistent or unclosed group");
9371 }
2bf803e2 9372 if (!isg && num > 9 && num >= RExC_npar)
a0d0e21e
LW
9373 goto defchar;
9374 else {
3dab1dad 9375 char * const parse_start = RExC_parse - 1; /* MJD */
830247a4
IZ
9376 while (isDIGIT(*RExC_parse))
9377 RExC_parse++;
1f1031fe
YO
9378 if (parse_start == RExC_parse - 1)
9379 vFAIL("Unterminated \\g... pattern");
2bf803e2
YO
9380 if (hasbrace) {
9381 if (*RExC_parse != '}')
9382 vFAIL("Unterminated \\g{...} pattern");
9383 RExC_parse++;
9384 }
c74340f9
YO
9385 if (!SIZE_ONLY) {
9386 if (num > (I32)RExC_rx->nparens)
9387 vFAIL("Reference to nonexistent group");
c74340f9 9388 }
830247a4 9389 RExC_sawback = 1;
eb160463 9390 ret = reganode(pRExC_state,
4444fd9f
KW
9391 ((! FOLD)
9392 ? REF
2f7f8cb1
KW
9393 : (MORE_ASCII_RESTRICTED)
9394 ? REFFA
9395 : (AT_LEAST_UNI_SEMANTICS)
9396 ? REFFU
9397 : (LOC)
9398 ? REFFL
9399 : REFF),
4444fd9f 9400 num);
a0d0e21e 9401 *flagp |= HASWIDTH;
2af232bd 9402
fac92740 9403 /* override incorrect value set in reganode MJD */
2af232bd 9404 Set_Node_Offset(ret, parse_start+1);
fac92740 9405 Set_Node_Cur_Length(ret); /* MJD */
830247a4
IZ
9406 RExC_parse--;
9407 nextchar(pRExC_state);
a0d0e21e
LW
9408 }
9409 }
9410 break;
9411 case '\0':
830247a4 9412 if (RExC_parse >= RExC_end)
b45f050a 9413 FAIL("Trailing \\");
a0d0e21e
LW
9414 /* FALL THROUGH */
9415 default:
a0288114 9416 /* Do not generate "unrecognized" warnings here, we fall
c9f97d15 9417 back into the quick-grab loop below */
45948336 9418 parse_start--;
a0d0e21e
LW
9419 goto defchar;
9420 }
9421 break;
4633a7c4
LW
9422
9423 case '#':
bbe252da 9424 if (RExC_flags & RXf_PMf_EXTENDED) {
bcdf7404 9425 if ( reg_skipcomment( pRExC_state ) )
4633a7c4
LW
9426 goto tryagain;
9427 }
9428 /* FALL THROUGH */
9429
f9a79580 9430 default:
561784a5
KW
9431
9432 parse_start = RExC_parse - 1;
9433
9434 RExC_parse++;
9435
9436 defchar: {
ba210ebe 9437 register STRLEN len;
58ae7d3f 9438 register UV ender;
a0d0e21e 9439 register char *p;
3dab1dad 9440 char *s;
80aecb99 9441 STRLEN foldlen;
89ebb4a3 9442 U8 tmpbuf[UTF8_MAXBYTES_CASE+1], *foldbuf;
d82f9944 9443 U8 node_type;
f06dbbb7 9444
bb914485
KW
9445 /* Is this a LATIN LOWER CASE SHARP S in an EXACTFU node? If so,
9446 * it is folded to 'ss' even if not utf8 */
9447 bool is_exactfu_sharp_s;
9448
58ae7d3f 9449 ender = 0;
3f410cf6
KW
9450 node_type = ((! FOLD) ? EXACT
9451 : (LOC)
9452 ? EXACTFL
9453 : (MORE_ASCII_RESTRICTED)
9454 ? EXACTFA
9455 : (AT_LEAST_UNI_SEMANTICS)
9456 ? EXACTFU
9457 : EXACTF);
d82f9944 9458 ret = reg_node(pRExC_state, node_type);
cd439c50 9459 s = STRING(ret);
3f410cf6
KW
9460
9461 /* XXX The node can hold up to 255 bytes, yet this only goes to
9462 * 127. I (khw) do not know why. Keeping it somewhat less than
9463 * 255 allows us to not have to worry about overflow due to
9464 * converting to utf8 and fold expansion, but that value is
9465 * 255-UTF8_MAXBYTES_CASE. join_exact() may join adjacent nodes
9466 * split up by this limit into a single one using the real max of
9467 * 255. Even at 127, this breaks under rare circumstances. If
9468 * folding, we do not want to split a node at a character that is a
9469 * non-final in a multi-char fold, as an input string could just
9470 * happen to want to match across the node boundary. The join
9471 * would solve that problem if the join actually happens. But a
9472 * series of more than two nodes in a row each of 127 would cause
9473 * the first join to succeed to get to 254, but then there wouldn't
9474 * be room for the next one, which could at be one of those split
9475 * multi-char folds. I don't know of any fool-proof solution. One
9476 * could back off to end with only a code point that isn't such a
9477 * non-final, but it is possible for there not to be any in the
9478 * entire node. */
830247a4 9479 for (len = 0, p = RExC_parse - 1;
3f410cf6
KW
9480 len < 127 && p < RExC_end;
9481 len++)
a0d0e21e 9482 {
3dab1dad 9483 char * const oldp = p;
5b5a24f7 9484
bbe252da 9485 if (RExC_flags & RXf_PMf_EXTENDED)
bcdf7404 9486 p = regwhite( pRExC_state, p );
f9a79580 9487 switch ((U8)*p) {
a0d0e21e
LW
9488 case '^':
9489 case '$':
9490 case '.':
9491 case '[':
9492 case '(':
9493 case ')':
9494 case '|':
9495 goto loopdone;
9496 case '\\':
ee9b8eae
YO
9497 /* Literal Escapes Switch
9498
9499 This switch is meant to handle escape sequences that
9500 resolve to a literal character.
9501
9502 Every escape sequence that represents something
9503 else, like an assertion or a char class, is handled
9504 in the switch marked 'Special Escapes' above in this
9505 routine, but also has an entry here as anything that
9506 isn't explicitly mentioned here will be treated as
9507 an unescaped equivalent literal.
9508 */
9509
a0a388a1 9510 switch ((U8)*++p) {
ee9b8eae
YO
9511 /* These are all the special escapes. */
9512 case 'A': /* Start assertion */
9513 case 'b': case 'B': /* Word-boundary assertion*/
9514 case 'C': /* Single char !DANGEROUS! */
9515 case 'd': case 'D': /* digit class */
9516 case 'g': case 'G': /* generic-backref, pos assertion */
e1d1eefb 9517 case 'h': case 'H': /* HORIZWS */
ee9b8eae
YO
9518 case 'k': case 'K': /* named backref, keep marker */
9519 case 'N': /* named char sequence */
38a44b82 9520 case 'p': case 'P': /* Unicode property */
e1d1eefb 9521 case 'R': /* LNBREAK */
ee9b8eae 9522 case 's': case 'S': /* space class */
e1d1eefb 9523 case 'v': case 'V': /* VERTWS */
ee9b8eae
YO
9524 case 'w': case 'W': /* word class */
9525 case 'X': /* eXtended Unicode "combining character sequence" */
9526 case 'z': case 'Z': /* End of line/string assertion */
a0d0e21e
LW
9527 --p;
9528 goto loopdone;
ee9b8eae
YO
9529
9530 /* Anything after here is an escape that resolves to a
9531 literal. (Except digits, which may or may not)
9532 */
a0d0e21e
LW
9533 case 'n':
9534 ender = '\n';
9535 p++;
a687059c 9536 break;
a0d0e21e
LW
9537 case 'r':
9538 ender = '\r';
9539 p++;
a687059c 9540 break;
a0d0e21e
LW
9541 case 't':
9542 ender = '\t';
9543 p++;
a687059c 9544 break;
a0d0e21e
LW
9545 case 'f':
9546 ender = '\f';
9547 p++;
a687059c 9548 break;
a0d0e21e 9549 case 'e':
c7f1f016 9550 ender = ASCII_TO_NATIVE('\033');
a0d0e21e 9551 p++;
a687059c 9552 break;
a0d0e21e 9553 case 'a':
c7f1f016 9554 ender = ASCII_TO_NATIVE('\007');
a0d0e21e 9555 p++;
a687059c 9556 break;
f0a2b745
KW
9557 case 'o':
9558 {
9559 STRLEN brace_len = len;
00c0cb6d 9560 UV result;
454155d9
KW
9561 const char* error_msg;
9562
9563 bool valid = grok_bslash_o(p,
9564 &result,
9565 &brace_len,
9566 &error_msg,
9567 1);
9568 p += brace_len;
9569 if (! valid) {
9570 RExC_parse = p; /* going to die anyway; point
9571 to exact spot of failure */
f0a2b745
KW
9572 vFAIL(error_msg);
9573 }
00c0cb6d
DG
9574 else
9575 {
9576 ender = result;
9577 }
f0a2b745
KW
9578 if (PL_encoding && ender < 0x100) {
9579 goto recode_encoding;
9580 }
9581 if (ender > 0xff) {
62fed28b 9582 REQUIRE_UTF8;
f0a2b745
KW
9583 }
9584 break;
9585 }
a0d0e21e 9586 case 'x':
a0ed51b3 9587 if (*++p == '{') {
1df70142 9588 char* const e = strchr(p, '}');
686b73d4 9589
b45f050a 9590 if (!e) {
830247a4 9591 RExC_parse = p + 1;
b45f050a
JF
9592 vFAIL("Missing right brace on \\x{}");
9593 }
de5f0749 9594 else {
a4c04bdc
NC
9595 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
9596 | PERL_SCAN_DISALLOW_PREFIX;
1df70142 9597 STRLEN numlen = e - p - 1;
53305cf1 9598 ender = grok_hex(p + 1, &numlen, &flags, NULL);
aaa80028 9599 if (ender > 0xff)
62fed28b 9600 REQUIRE_UTF8;
a0ed51b3
LW
9601 p = e + 1;
9602 }
a0ed51b3
LW
9603 }
9604 else {
a4c04bdc 9605 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
1df70142 9606 STRLEN numlen = 2;
53305cf1 9607 ender = grok_hex(p, &numlen, &flags, NULL);
a0ed51b3
LW
9608 p += numlen;
9609 }
9e08bc66
TS
9610 if (PL_encoding && ender < 0x100)
9611 goto recode_encoding;
a687059c 9612 break;
a0d0e21e
LW
9613 case 'c':
9614 p++;
17a3df4c 9615 ender = grok_bslash_c(*p++, UTF, SIZE_ONLY);
a687059c 9616 break;
a0d0e21e
LW
9617 case '0': case '1': case '2': case '3':case '4':
9618 case '5': case '6': case '7': case '8':case '9':
9619 if (*p == '0' ||
ca67da41 9620 (isDIGIT(p[1]) && atoi(p) >= RExC_npar))
c99e91e9
KW
9621 {
9622 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
1df70142 9623 STRLEN numlen = 3;
53305cf1 9624 ender = grok_oct(p, &numlen, &flags, NULL);
fa1639c5 9625 if (ender > 0xff) {
62fed28b 9626 REQUIRE_UTF8;
609122bd 9627 }
a0d0e21e
LW
9628 p += numlen;
9629 }
9630 else {
9631 --p;
9632 goto loopdone;
a687059c 9633 }
9e08bc66
TS
9634 if (PL_encoding && ender < 0x100)
9635 goto recode_encoding;
9636 break;
9637 recode_encoding:
e2a7e165 9638 if (! RExC_override_recoding) {
9e08bc66
TS
9639 SV* enc = PL_encoding;
9640 ender = reg_recode((const char)(U8)ender, &enc);
668c081a
NC
9641 if (!enc && SIZE_ONLY)
9642 ckWARNreg(p, "Invalid escape in the specified encoding");
62fed28b 9643 REQUIRE_UTF8;
9e08bc66 9644 }
a687059c 9645 break;
a0d0e21e 9646 case '\0':
830247a4 9647 if (p >= RExC_end)
b45f050a 9648 FAIL("Trailing \\");
a687059c 9649 /* FALL THROUGH */
a0d0e21e 9650 default:
216bfc0a
KW
9651 if (!SIZE_ONLY&& isALPHA(*p)) {
9652 /* Include any { following the alpha to emphasize
9653 * that it could be part of an escape at some point
9654 * in the future */
9655 int len = (*(p + 1) == '{') ? 2 : 1;
9656 ckWARN3reg(p + len, "Unrecognized escape \\%.*s passed through", len, p);
9657 }
a0ed51b3 9658 goto normal_default;
a0d0e21e
LW
9659 }
9660 break;
a687059c 9661 default:
a0ed51b3 9662 normal_default:
fd400ab9 9663 if (UTF8_IS_START(*p) && UTF) {
1df70142 9664 STRLEN numlen;
5e12f4fb 9665 ender = utf8n_to_uvchr((U8*)p, RExC_end - p,
9f7f3913 9666 &numlen, UTF8_ALLOW_DEFAULT);
a0ed51b3
LW
9667 p += numlen;
9668 }
9669 else
5b67c30a 9670 ender = (U8) *p++;
a0d0e21e 9671 break;
7e2509c1
KW
9672 } /* End of switch on the literal */
9673
bb914485
KW
9674 is_exactfu_sharp_s = (node_type == EXACTFU
9675 && ender == LATIN_SMALL_LETTER_SHARP_S);
bcdf7404
YO
9676 if ( RExC_flags & RXf_PMf_EXTENDED)
9677 p = regwhite( pRExC_state, p );
bb914485 9678 if ((UTF && FOLD) || is_exactfu_sharp_s) {
17580e7a
KW
9679 /* Prime the casefolded buffer. Locale rules, which apply
9680 * only to code points < 256, aren't known until execution,
9681 * so for them, just output the original character using
a0c4c608
KW
9682 * utf8. If we start to fold non-UTF patterns, be sure to
9683 * update join_exact() */
17580e7a
KW
9684 if (LOC && ender < 256) {
9685 if (UNI_IS_INVARIANT(ender)) {
9686 *tmpbuf = (U8) ender;
9687 foldlen = 1;
9688 } else {
9689 *tmpbuf = UTF8_TWO_BYTE_HI(ender);
9690 *(tmpbuf + 1) = UTF8_TWO_BYTE_LO(ender);
9691 foldlen = 2;
9692 }
9693 }
9694 else if (isASCII(ender)) { /* Note: Here can't also be LOC
9695 */
2f7f8cb1 9696 ender = toLOWER(ender);
cd64649c 9697 *tmpbuf = (U8) ender;
2f7f8cb1
KW
9698 foldlen = 1;
9699 }
17580e7a
KW
9700 else if (! MORE_ASCII_RESTRICTED && ! LOC) {
9701
9702 /* Locale and /aa require more selectivity about the
9703 * fold, so are handled below. Otherwise, here, just
9704 * use the fold */
2f7f8cb1
KW
9705 ender = toFOLD_uni(ender, tmpbuf, &foldlen);
9706 }
9707 else {
17580e7a
KW
9708 /* Under locale rules or /aa we are not to mix,
9709 * respectively, ords < 256 or ASCII with non-. So
9710 * reject folds that mix them, using only the
9711 * non-folded code point. So do the fold to a
9712 * temporary, and inspect each character in it. */
2f7f8cb1
KW
9713 U8 trialbuf[UTF8_MAXBYTES_CASE+1];
9714 U8* s = trialbuf;
9715 UV tmpender = toFOLD_uni(ender, trialbuf, &foldlen);
9716 U8* e = s + foldlen;
9717 bool fold_ok = TRUE;
9718
9719 while (s < e) {
17580e7a
KW
9720 if (isASCII(*s)
9721 || (LOC && (UTF8_IS_INVARIANT(*s)
9722 || UTF8_IS_DOWNGRADEABLE_START(*s))))
9723 {
2f7f8cb1
KW
9724 fold_ok = FALSE;
9725 break;
9726 }
9727 s += UTF8SKIP(s);
9728 }
9729 if (fold_ok) {
9730 Copy(trialbuf, tmpbuf, foldlen, U8);
9731 ender = tmpender;
9732 }
9733 else {
9734 uvuni_to_utf8(tmpbuf, ender);
9735 foldlen = UNISKIP(ender);
9736 }
9737 }
60a8b682 9738 }
bcdf7404 9739 if (p < RExC_end && ISMULT2(p)) { /* Back off on ?+*. */
a0d0e21e
LW
9740 if (len)
9741 p = oldp;
bb914485 9742 else if (UTF || is_exactfu_sharp_s) {
80aecb99 9743 if (FOLD) {
60a8b682 9744 /* Emit all the Unicode characters. */
1df70142 9745 STRLEN numlen;
80aecb99
JH
9746 for (foldbuf = tmpbuf;
9747 foldlen;
9748 foldlen -= numlen) {
9749 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 9750 if (numlen > 0) {
71207a34 9751 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
9752 s += unilen;
9753 len += unilen;
9754 /* In EBCDIC the numlen
9755 * and unilen can differ. */
9dc45d57 9756 foldbuf += numlen;
47654450
JH
9757 if (numlen >= foldlen)
9758 break;
9dc45d57
JH
9759 }
9760 else
9761 break; /* "Can't happen." */
80aecb99
JH
9762 }
9763 }
9764 else {
71207a34 9765 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 9766 if (unilen > 0) {
0ebc6274
JH
9767 s += unilen;
9768 len += unilen;
9dc45d57 9769 }
80aecb99 9770 }
a0ed51b3 9771 }
a0d0e21e
LW
9772 else {
9773 len++;
eb160463 9774 REGC((char)ender, s++);
a0d0e21e
LW
9775 }
9776 break;
a687059c 9777 }
bb914485 9778 if (UTF || is_exactfu_sharp_s) {
80aecb99 9779 if (FOLD) {
60a8b682 9780 /* Emit all the Unicode characters. */
1df70142 9781 STRLEN numlen;
80aecb99
JH
9782 for (foldbuf = tmpbuf;
9783 foldlen;
9784 foldlen -= numlen) {
9785 ender = utf8_to_uvchr(foldbuf, &numlen);
9dc45d57 9786 if (numlen > 0) {
71207a34 9787 const STRLEN unilen = reguni(pRExC_state, ender, s);
0ebc6274
JH
9788 len += unilen;
9789 s += unilen;
9790 /* In EBCDIC the numlen
9791 * and unilen can differ. */
9dc45d57 9792 foldbuf += numlen;
47654450
JH
9793 if (numlen >= foldlen)
9794 break;
9dc45d57
JH
9795 }
9796 else
9797 break;
80aecb99
JH
9798 }
9799 }
9800 else {
71207a34 9801 const STRLEN unilen = reguni(pRExC_state, ender, s);
9ede7db1 9802 if (unilen > 0) {
0ebc6274
JH
9803 s += unilen;
9804 len += unilen;
9dc45d57 9805 }
80aecb99
JH
9806 }
9807 len--;
a0ed51b3 9808 }
d669c36c 9809 else {
eb160463 9810 REGC((char)ender, s++);
d669c36c 9811 }
a0d0e21e 9812 }
7e2509c1
KW
9813 loopdone: /* Jumped to when encounters something that shouldn't be in
9814 the node */
830247a4 9815 RExC_parse = p - 1;
fac92740 9816 Set_Node_Cur_Length(ret); /* MJD */
830247a4 9817 nextchar(pRExC_state);
793db0cb
JH
9818 {
9819 /* len is STRLEN which is unsigned, need to copy to signed */
9820 IV iv = len;
9821 if (iv < 0)
9822 vFAIL("Internal disaster");
9823 }
a0d0e21e
LW
9824 if (len > 0)
9825 *flagp |= HASWIDTH;
090f7165 9826 if (len == 1 && UNI_IS_INVARIANT(ender))
a0d0e21e 9827 *flagp |= SIMPLE;
686b73d4 9828
cd439c50 9829 if (SIZE_ONLY)
830247a4 9830 RExC_size += STR_SZ(len);
3dab1dad
YO
9831 else {
9832 STR_LEN(ret) = len;
830247a4 9833 RExC_emit += STR_SZ(len);
07be1b83 9834 }
3dab1dad 9835 }
a0d0e21e
LW
9836 break;
9837 }
a687059c 9838
a0d0e21e 9839 return(ret);
980866de
KW
9840
9841/* Jumped to when an unrecognized character set is encountered */
9842bad_charset:
9843 Perl_croak(aTHX_ "panic: Unknown regex character set encoding: %u", get_regex_charset(RExC_flags));
9844 return(NULL);
a687059c
LW
9845}
9846
873ef191 9847STATIC char *
bcdf7404 9848S_regwhite( RExC_state_t *pRExC_state, char *p )
5b5a24f7 9849{
bcdf7404 9850 const char *e = RExC_end;
7918f24d
NC
9851
9852 PERL_ARGS_ASSERT_REGWHITE;
9853
5b5a24f7
CS
9854 while (p < e) {
9855 if (isSPACE(*p))
9856 ++p;
9857 else if (*p == '#') {
bcdf7404 9858 bool ended = 0;
5b5a24f7 9859 do {
bcdf7404
YO
9860 if (*p++ == '\n') {
9861 ended = 1;
9862 break;
9863 }
9864 } while (p < e);
9865 if (!ended)
9866 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
5b5a24f7
CS
9867 }
9868 else
9869 break;
9870 }
9871 return p;
9872}
9873
b8c5462f
JH
9874/* Parse POSIX character classes: [[:foo:]], [[=foo=]], [[.foo.]].
9875 Character classes ([:foo:]) can also be negated ([:^foo:]).
9876 Returns a named class id (ANYOF_XXX) if successful, -1 otherwise.
9877 Equivalence classes ([=foo=]) and composites ([.foo.]) are parsed,
beeb77fc 9878 but trigger failures because they are currently unimplemented. */
9a86a77b
JH
9879
9880#define POSIXCC_DONE(c) ((c) == ':')
9881#define POSIXCC_NOTYET(c) ((c) == '=' || (c) == '.')
9882#define POSIXCC(c) (POSIXCC_DONE(c) || POSIXCC_NOTYET(c))
9883
b8c5462f 9884STATIC I32
830247a4 9885S_regpposixcc(pTHX_ RExC_state_t *pRExC_state, I32 value)
620e46c5 9886{
97aff369 9887 dVAR;
936ed897 9888 I32 namedclass = OOB_NAMEDCLASS;
620e46c5 9889
7918f24d
NC
9890 PERL_ARGS_ASSERT_REGPPOSIXCC;
9891
830247a4 9892 if (value == '[' && RExC_parse + 1 < RExC_end &&
620e46c5 9893 /* I smell either [: or [= or [. -- POSIX has been here, right? */
9a86a77b 9894 POSIXCC(UCHARAT(RExC_parse))) {
1df70142 9895 const char c = UCHARAT(RExC_parse);
097eb12c 9896 char* const s = RExC_parse++;
686b73d4 9897
9a86a77b 9898 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != c)
830247a4
IZ
9899 RExC_parse++;
9900 if (RExC_parse == RExC_end)
620e46c5 9901 /* Grandfather lone [:, [=, [. */
830247a4 9902 RExC_parse = s;
620e46c5 9903 else {
3dab1dad 9904 const char* const t = RExC_parse++; /* skip over the c */
80916619
NC
9905 assert(*t == c);
9906
9a86a77b 9907 if (UCHARAT(RExC_parse) == ']') {
3dab1dad 9908 const char *posixcc = s + 1;
830247a4 9909 RExC_parse++; /* skip over the ending ] */
3dab1dad 9910
b8c5462f 9911 if (*s == ':') {
1df70142
AL
9912 const I32 complement = *posixcc == '^' ? *posixcc++ : 0;
9913 const I32 skip = t - posixcc;
80916619
NC
9914
9915 /* Initially switch on the length of the name. */
9916 switch (skip) {
9917 case 4:
3dab1dad
YO
9918 if (memEQ(posixcc, "word", 4)) /* this is not POSIX, this is the Perl \w */
9919 namedclass = complement ? ANYOF_NALNUM : ANYOF_ALNUM;
cc4319de 9920 break;
80916619
NC
9921 case 5:
9922 /* Names all of length 5. */
9923 /* alnum alpha ascii blank cntrl digit graph lower
9924 print punct space upper */
9925 /* Offset 4 gives the best switch position. */
9926 switch (posixcc[4]) {
9927 case 'a':
3dab1dad
YO
9928 if (memEQ(posixcc, "alph", 4)) /* alpha */
9929 namedclass = complement ? ANYOF_NALPHA : ANYOF_ALPHA;
80916619
NC
9930 break;
9931 case 'e':
3dab1dad
YO
9932 if (memEQ(posixcc, "spac", 4)) /* space */
9933 namedclass = complement ? ANYOF_NPSXSPC : ANYOF_PSXSPC;
80916619
NC
9934 break;
9935 case 'h':
3dab1dad
YO
9936 if (memEQ(posixcc, "grap", 4)) /* graph */
9937 namedclass = complement ? ANYOF_NGRAPH : ANYOF_GRAPH;
80916619
NC
9938 break;
9939 case 'i':
3dab1dad
YO
9940 if (memEQ(posixcc, "asci", 4)) /* ascii */
9941 namedclass = complement ? ANYOF_NASCII : ANYOF_ASCII;
80916619
NC
9942 break;
9943 case 'k':
3dab1dad
YO
9944 if (memEQ(posixcc, "blan", 4)) /* blank */
9945 namedclass = complement ? ANYOF_NBLANK : ANYOF_BLANK;
80916619
NC
9946 break;
9947 case 'l':
3dab1dad
YO
9948 if (memEQ(posixcc, "cntr", 4)) /* cntrl */
9949 namedclass = complement ? ANYOF_NCNTRL : ANYOF_CNTRL;
80916619
NC
9950 break;
9951 case 'm':
3dab1dad
YO
9952 if (memEQ(posixcc, "alnu", 4)) /* alnum */
9953 namedclass = complement ? ANYOF_NALNUMC : ANYOF_ALNUMC;
80916619
NC
9954 break;
9955 case 'r':
3dab1dad
YO
9956 if (memEQ(posixcc, "lowe", 4)) /* lower */
9957 namedclass = complement ? ANYOF_NLOWER : ANYOF_LOWER;
9958 else if (memEQ(posixcc, "uppe", 4)) /* upper */
9959 namedclass = complement ? ANYOF_NUPPER : ANYOF_UPPER;
80916619
NC
9960 break;
9961 case 't':
3dab1dad
YO
9962 if (memEQ(posixcc, "digi", 4)) /* digit */
9963 namedclass = complement ? ANYOF_NDIGIT : ANYOF_DIGIT;
9964 else if (memEQ(posixcc, "prin", 4)) /* print */
9965 namedclass = complement ? ANYOF_NPRINT : ANYOF_PRINT;
9966 else if (memEQ(posixcc, "punc", 4)) /* punct */
9967 namedclass = complement ? ANYOF_NPUNCT : ANYOF_PUNCT;
80916619 9968 break;
b8c5462f
JH
9969 }
9970 break;
80916619 9971 case 6:
3dab1dad
YO
9972 if (memEQ(posixcc, "xdigit", 6))
9973 namedclass = complement ? ANYOF_NXDIGIT : ANYOF_XDIGIT;
b8c5462f
JH
9974 break;
9975 }
80916619
NC
9976
9977 if (namedclass == OOB_NAMEDCLASS)
b45f050a
JF
9978 Simple_vFAIL3("POSIX class [:%.*s:] unknown",
9979 t - s - 1, s + 1);
80916619
NC
9980 assert (posixcc[skip] == ':');
9981 assert (posixcc[skip+1] == ']');
b45f050a 9982 } else if (!SIZE_ONLY) {
b8c5462f 9983 /* [[=foo=]] and [[.foo.]] are still future. */
b45f050a 9984
830247a4 9985 /* adjust RExC_parse so the warning shows after
b45f050a 9986 the class closes */
9a86a77b 9987 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse) != ']')
830247a4 9988 RExC_parse++;
b45f050a
JF
9989 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
9990 }
b8c5462f
JH
9991 } else {
9992 /* Maternal grandfather:
9993 * "[:" ending in ":" but not in ":]" */
830247a4 9994 RExC_parse = s;
767d463e 9995 }
620e46c5
JH
9996 }
9997 }
9998
b8c5462f
JH
9999 return namedclass;
10000}
10001
10002STATIC void
830247a4 10003S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
b8c5462f 10004{
97aff369 10005 dVAR;
7918f24d
NC
10006
10007 PERL_ARGS_ASSERT_CHECKPOSIXCC;
10008
3dab1dad 10009 if (POSIXCC(UCHARAT(RExC_parse))) {
1df70142
AL
10010 const char *s = RExC_parse;
10011 const char c = *s++;
b8c5462f 10012
3dab1dad 10013 while (isALNUM(*s))
b8c5462f
JH
10014 s++;
10015 if (*s && c == *s && s[1] == ']') {
668c081a
NC
10016 ckWARN3reg(s+2,
10017 "POSIX syntax [%c %c] belongs inside character classes",
10018 c, c);
b45f050a
JF
10019
10020 /* [[=foo=]] and [[.foo.]] are still future. */
9a86a77b 10021 if (POSIXCC_NOTYET(c)) {
830247a4 10022 /* adjust RExC_parse so the error shows after
b45f050a 10023 the class closes */
9a86a77b 10024 while (UCHARAT(RExC_parse) && UCHARAT(RExC_parse++) != ']')
3dab1dad 10025 NOOP;
b45f050a
JF
10026 Simple_vFAIL3("POSIX syntax [%c %c] is reserved for future extensions", c, c);
10027 }
b8c5462f
JH
10028 }
10029 }
620e46c5
JH
10030}
10031
ea317ccb
KW
10032/* Generate the code to add a full posix character <class> to the bracketed
10033 * character class given by <node>. (<node> is needed only under locale rules)
10034 * destlist is the inversion list for non-locale rules that this class is
10035 * to be added to
10036 * sourcelist is the ASCII-range inversion list to add under /a rules
10037 * Xsourcelist is the full Unicode range list to use otherwise. */
10038#define DO_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10039 if (LOC) { \
10040 SV* scratch_list = NULL; \
10041 \
10042 /* Set this class in the node for runtime matching */ \
10043 ANYOF_CLASS_SET(node, class); \
10044 \
10045 /* For above Latin1 code points, we use the full Unicode range */ \
10046 _invlist_intersection(PL_AboveLatin1, \
10047 Xsourcelist, \
10048 &scratch_list); \
10049 /* And set the output to it, adding instead if there already is an \
10050 * output. Checking if <destlist> is NULL first saves an extra \
10051 * clone. Its reference count will be decremented at the next \
10052 * union, etc, or if this is the only instance, at the end of the \
10053 * routine */ \
10054 if (! destlist) { \
10055 destlist = scratch_list; \
10056 } \
10057 else { \
10058 _invlist_union(destlist, scratch_list, &destlist); \
10059 SvREFCNT_dec(scratch_list); \
10060 } \
10061 } \
10062 else { \
10063 /* For non-locale, just add it to any existing list */ \
10064 _invlist_union(destlist, \
10065 (AT_LEAST_ASCII_RESTRICTED) \
10066 ? sourcelist \
10067 : Xsourcelist, \
10068 &destlist); \
10069 }
10070
10071/* Like DO_POSIX, but matches the complement of <sourcelist> and <Xsourcelist>.
10072 */
10073#define DO_N_POSIX(node, class, destlist, sourcelist, Xsourcelist) \
10074 if (LOC) { \
10075 SV* scratch_list = NULL; \
10076 ANYOF_CLASS_SET(node, class); \
10077 _invlist_subtract(PL_AboveLatin1, Xsourcelist, &scratch_list); \
10078 if (! destlist) { \
10079 destlist = scratch_list; \
10080 } \
10081 else { \
10082 _invlist_union(destlist, scratch_list, &destlist); \
10083 SvREFCNT_dec(scratch_list); \
10084 } \
10085 } \
10086 else { \
10087 _invlist_union_complement_2nd(destlist, \
10088 (AT_LEAST_ASCII_RESTRICTED) \
10089 ? sourcelist \
10090 : Xsourcelist, \
10091 &destlist); \
10092 /* Under /d, everything in the upper half of the Latin1 range \
10093 * matches this complement */ \
10094 if (DEPENDS_SEMANTICS) { \
10095 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
10096 } \
10097 }
10098
10099/* Generate the code to add a posix character <class> to the bracketed
10100 * character class given by <node>. (<node> is needed only under locale rules)
10101 * destlist is the inversion list for non-locale rules that this class is
10102 * to be added to
10103 * sourcelist is the ASCII-range inversion list to add under /a rules
10104 * l1_sourcelist is the Latin1 range list to use otherwise.
10105 * Xpropertyname is the name to add to <run_time_list> of the property to
10106 * specify the code points above Latin1 that will have to be
10107 * determined at run-time
10108 * run_time_list is a SV* that contains text names of properties that are to
10109 * be computed at run time. This concatenates <Xpropertyname>
10110 * to it, apppropriately
10111 * This is essentially DO_POSIX, but we know only the Latin1 values at compile
10112 * time */
10113#define DO_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
10114 l1_sourcelist, Xpropertyname, run_time_list) \
10115 /* If not /a matching, there are going to be code points we will have \
10116 * to defer to runtime to look-up */ \
10117 if (! AT_LEAST_ASCII_RESTRICTED) { \
10118 Perl_sv_catpvf(aTHX_ run_time_list, "+utf8::%s\n", Xpropertyname); \
10119 } \
10120 if (LOC) { \
10121 ANYOF_CLASS_SET(node, class); \
10122 } \
10123 else { \
10124 _invlist_union(destlist, \
10125 (AT_LEAST_ASCII_RESTRICTED) \
10126 ? sourcelist \
10127 : l1_sourcelist, \
10128 &destlist); \
10129 }
10130
10131/* Like DO_POSIX_LATIN1_ONLY_KNOWN, but for the complement. A combination of
10132 * this and DO_N_POSIX */
10133#define DO_N_POSIX_LATIN1_ONLY_KNOWN(node, class, destlist, sourcelist, \
10134 l1_sourcelist, Xpropertyname, run_time_list) \
10135 if (AT_LEAST_ASCII_RESTRICTED) { \
10136 _invlist_union_complement_2nd(destlist, sourcelist, &destlist); \
10137 } \
10138 else { \
10139 Perl_sv_catpvf(aTHX_ run_time_list, "!utf8::%s\n", Xpropertyname); \
10140 if (LOC) { \
10141 ANYOF_CLASS_SET(node, namedclass); \
10142 } \
10143 else { \
10144 SV* scratch_list = NULL; \
10145 _invlist_subtract(PL_Latin1, l1_sourcelist, &scratch_list); \
10146 if (! destlist) { \
10147 destlist = scratch_list; \
10148 } \
10149 else { \
10150 _invlist_union(destlist, scratch_list, &destlist); \
10151 SvREFCNT_dec(scratch_list); \
10152 } \
10153 if (DEPENDS_SEMANTICS) { \
10154 ANYOF_FLAGS(node) |= ANYOF_NON_UTF8_LATIN1_ALL; \
10155 } \
10156 } \
10157 }
a12cf05f 10158
2283d326 10159STATIC U8
a25abddc 10160S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
2283d326
KW
10161{
10162
10163 /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
10164 * Locale folding is done at run-time, so this function should not be
10165 * called for nodes that are for locales.
10166 *
d50a4f90 10167 * This function sets the bit corresponding to the fold of the input
2283d326
KW
10168 * 'value', if not already set. The fold of 'f' is 'F', and the fold of
10169 * 'F' is 'f'.
10170 *
d50a4f90
KW
10171 * It also knows about the characters that are in the bitmap that have
10172 * folds that are matchable only outside it, and sets the appropriate lists
10173 * and flags.
10174 *
10175 * It returns the number of bits that actually changed from 0 to 1 */
2283d326
KW
10176
10177 U8 stored = 0;
2283d326
KW
10178 U8 fold;
10179
4c9daa0a
KW
10180 PERL_ARGS_ASSERT_SET_REGCLASS_BIT_FOLD;
10181
cfaf538b 10182 fold = (AT_LEAST_UNI_SEMANTICS) ? PL_fold_latin1[value]
2f7f8cb1 10183 : PL_fold[value];
2283d326
KW
10184
10185 /* It assumes the bit for 'value' has already been set */
10186 if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
10187 ANYOF_BITMAP_SET(node, fold);
10188 stored++;
10189 }
d50a4f90
KW
10190 if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value) && (! isASCII(value) || ! MORE_ASCII_RESTRICTED)) {
10191 /* Certain Latin1 characters have matches outside the bitmap. To get
10192 * here, 'value' is one of those characters. None of these matches is
10193 * valid for ASCII characters under /aa, which have been excluded by
10194 * the 'if' above. The matches fall into three categories:
10195 * 1) They are singly folded-to or -from an above 255 character, as
10196 * LATIN SMALL LETTER Y WITH DIAERESIS and LATIN CAPITAL LETTER Y
10197 * WITH DIAERESIS;
10198 * 2) They are part of a multi-char fold with another character in the
10199 * bitmap, only LATIN SMALL LETTER SHARP S => "ss" fits that bill;
10200 * 3) They are part of a multi-char fold with a character not in the
10201 * bitmap, such as various ligatures.
10202 * We aren't dealing fully with multi-char folds, except we do deal
10203 * with the pattern containing a character that has a multi-char fold
10204 * (not so much the inverse).
10205 * For types 1) and 3), the matches only happen when the target string
10206 * is utf8; that's not true for 2), and we set a flag for it.
10207 *
10208 * The code below adds to the passed in inversion list the single fold
10209 * closures for 'value'. The values are hard-coded here so that an
10210 * innocent-looking character class, like /[ks]/i won't have to go out
10211 * to disk to find the possible matches. XXX It would be better to
10212 * generate these via regen, in case a new version of the Unicode
10213 * standard adds new mappings, though that is not really likely. */
10214 switch (value) {
10215 case 'k':
10216 case 'K':
10217 /* KELVIN SIGN */
10218 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212A);
10219 break;
10220 case 's':
10221 case 'S':
10222 /* LATIN SMALL LETTER LONG S */
10223 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x017F);
10224 break;
10225 case MICRO_SIGN:
10226 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10227 GREEK_SMALL_LETTER_MU);
10228 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10229 GREEK_CAPITAL_LETTER_MU);
10230 break;
10231 case LATIN_CAPITAL_LETTER_A_WITH_RING_ABOVE:
10232 case LATIN_SMALL_LETTER_A_WITH_RING_ABOVE:
10233 /* ANGSTROM SIGN */
10234 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, 0x212B);
10235 if (DEPENDS_SEMANTICS) { /* See DEPENDS comment below */
10236 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10237 PL_fold_latin1[value]);
10238 }
10239 break;
10240 case LATIN_SMALL_LETTER_Y_WITH_DIAERESIS:
10241 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10242 LATIN_CAPITAL_LETTER_Y_WITH_DIAERESIS);
10243 break;
10244 case LATIN_SMALL_LETTER_SHARP_S:
1d4120df
KW
10245 *invlist_ptr = add_cp_to_invlist(*invlist_ptr,
10246 LATIN_CAPITAL_LETTER_SHARP_S);
d50a4f90 10247
419d8974 10248 /* Under /a, /d, and /u, this can match the two chars "ss" */
d50a4f90
KW
10249 if (! MORE_ASCII_RESTRICTED) {
10250 add_alternate(alternate_ptr, (U8 *) "ss", 2);
10251
419d8974
KW
10252 /* And under /u or /a, it can match even if the target is
10253 * not utf8 */
10254 if (AT_LEAST_UNI_SEMANTICS) {
d50a4f90
KW
10255 ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
10256 }
10257 }
10258 break;
10259 case 'F': case 'f':
10260 case 'I': case 'i':
10261 case 'L': case 'l':
10262 case 'T': case 't':
d50a4f90
KW
10263 case 'A': case 'a':
10264 case 'H': case 'h':
10265 case 'J': case 'j':
10266 case 'N': case 'n':
10267 case 'W': case 'w':
10268 case 'Y': case 'y':
f580a93d
KW
10269 /* These all are targets of multi-character folds from code
10270 * points that require UTF8 to express, so they can't match
10271 * unless the target string is in UTF-8, so no action here is
10272 * necessary, as regexec.c properly handles the general case
10273 * for UTF-8 matching */
d50a4f90
KW
10274 break;
10275 default:
10276 /* Use deprecated warning to increase the chances of this
10277 * being output */
10278 ckWARN2regdep(RExC_parse, "Perl folding rules are not up-to-date for 0x%x; please use the perlbug utility to report;", value);
10279 break;
10280 }
10281 }
10282 else if (DEPENDS_SEMANTICS
f56b6394 10283 && ! isASCII(value)
d50a4f90
KW
10284 && PL_fold_latin1[value] != value)
10285 {
10286 /* Under DEPENDS rules, non-ASCII Latin1 characters match their
10287 * folds only when the target string is in UTF-8. We add the fold
10288 * here to the list of things to match outside the bitmap, which
10289 * won't be looked at unless it is UTF8 (or else if something else
10290 * says to look even if not utf8, but those things better not happen
10291 * under DEPENDS semantics. */
10292 *invlist_ptr = add_cp_to_invlist(*invlist_ptr, PL_fold_latin1[value]);
2283d326
KW
10293 }
10294
10295 return stored;
10296}
10297
10298
10299PERL_STATIC_INLINE U8
a25abddc 10300S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, SV** invlist_ptr, AV** alternate_ptr)
2283d326
KW
10301{
10302 /* This inline function sets a bit in the bitmap if not already set, and if
10303 * appropriate, its fold, returning the number of bits that actually
10304 * changed from 0 to 1 */
10305
10306 U8 stored;
10307
4c9daa0a
KW
10308 PERL_ARGS_ASSERT_SET_REGCLASS_BIT;
10309
2283d326
KW
10310 if (ANYOF_BITMAP_TEST(node, value)) { /* Already set */
10311 return 0;
10312 }
10313
10314 ANYOF_BITMAP_SET(node, value);
10315 stored = 1;
10316
10317 if (FOLD && ! LOC) { /* Locale folds aren't known until runtime */
2c6aa593 10318 stored += set_regclass_bit_fold(pRExC_state, node, value, invlist_ptr, alternate_ptr);
2283d326
KW
10319 }
10320
10321 return stored;
10322}
10323
c8453963
KW
10324STATIC void
10325S_add_alternate(pTHX_ AV** alternate_ptr, U8* string, STRLEN len)
10326{
10327 /* Adds input 'string' with length 'len' to the ANYOF node's unicode
10328 * alternate list, pointed to by 'alternate_ptr'. This is an array of
10329 * the multi-character folds of characters in the node */
10330 SV *sv;
10331
10332 PERL_ARGS_ASSERT_ADD_ALTERNATE;
10333
10334 if (! *alternate_ptr) {
10335 *alternate_ptr = newAV();
10336 }
10337 sv = newSVpvn_utf8((char*)string, len, TRUE);
10338 av_push(*alternate_ptr, sv);
10339 return;
10340}
10341
7f6f358c
YO
10342/*
10343 parse a class specification and produce either an ANYOF node that
ddad5e0b 10344 matches the pattern or perhaps will be optimized into an EXACTish node
679d1424
KW
10345 instead. The node contains a bit map for the first 256 characters, with the
10346 corresponding bit set if that character is in the list. For characters
10347 above 255, a range list is used */
89836f1f 10348
76e3520e 10349STATIC regnode *
3dab1dad 10350S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
a687059c 10351{
97aff369 10352 dVAR;
9a86a77b 10353 register UV nextvalue;
3568d838 10354 register IV prevvalue = OOB_UNICODE;
ffc61ed2 10355 register IV range = 0;
e1d1eefb 10356 UV value = 0; /* XXX:dmq: needs to be referenceable (unfortunately) */
c277df42 10357 register regnode *ret;
ba210ebe 10358 STRLEN numlen;
ffc61ed2 10359 IV namedclass;
cbbf8932 10360 char *rangebegin = NULL;
936ed897 10361 bool need_class = 0;
827f5bb8 10362 bool allow_full_fold = TRUE; /* Assume wants multi-char folding */
c445ea15 10363 SV *listsv = NULL;
137165a6
KW
10364 STRLEN initial_listsv_len = 0; /* Kind of a kludge to see if it is more
10365 than just initialized. */
dc415b4a
KW
10366 SV* properties = NULL; /* Code points that match \p{} \P{} */
10367 UV element_count = 0; /* Number of distinct elements in the class.
10368 Optimizations may be possible if this is tiny */
ffc61ed2 10369 UV n;
53742956 10370
dc415b4a 10371 /* Unicode properties are stored in a swash; this holds the current one
88d45d28
KW
10372 * being parsed. If this swash is the only above-latin1 component of the
10373 * character class, an optimization is to pass it directly on to the
10374 * execution engine. Otherwise, it is set to NULL to indicate that there
10375 * are other things in the class that have to be dealt with at execution
10376 * time */
dc415b4a
KW
10377 SV* swash = NULL; /* Code points that match \p{} \P{} */
10378
10379 /* Set if a component of this character class is user-defined; just passed
10380 * on to the engine */
10381 UV has_user_defined_property = 0;
10382
53742956 10383 /* code points this node matches that can't be stored in the bitmap */
a25abddc 10384 SV* nonbitmap = NULL;
53742956
KW
10385
10386 /* The items that are to match that aren't stored in the bitmap, but are a
10387 * result of things that are stored there. This is the fold closure of
10388 * such a character, either because it has DEPENDS semantics and shouldn't
10389 * be matched unless the target string is utf8, or is a code point that is
10390 * too large for the bit map, as for example, the fold of the MICRO SIGN is
10391 * above 255. This all is solely for performance reasons. By having this
10392 * code know the outside-the-bitmap folds that the bitmapped characters are
10393 * involved with, we don't have to go out to disk to find the list of
10394 * matches, unless the character class includes code points that aren't
10395 * storable in the bit map. That means that a character class with an 's'
10396 * in it, for example, doesn't need to go out to disk to find everything
10397 * that matches. A 2nd list is used so that the 'nonbitmap' list is kept
10398 * empty unless there is something whose fold we don't know about, and will
10399 * have to go out to the disk to find. */
a25abddc 10400 SV* l1_fold_invlist = NULL;
53742956
KW
10401
10402 /* List of multi-character folds that are matched by this node */
cbbf8932 10403 AV* unicode_alternate = NULL;
1b2d223b
JH
10404#ifdef EBCDIC
10405 UV literal_endpoint = 0;
10406#endif
ffc130aa 10407 UV stored = 0; /* how many chars stored in the bitmap */
ffc61ed2 10408
3dab1dad 10409 regnode * const orig_emit = RExC_emit; /* Save the original RExC_emit in
7f6f358c 10410 case we need to change the emitted regop to an EXACT. */
07be1b83 10411 const char * orig_parse = RExC_parse;
72f13be8 10412 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
10413
10414 PERL_ARGS_ASSERT_REGCLASS;
76e84362
SH
10415#ifndef DEBUGGING
10416 PERL_UNUSED_ARG(depth);
10417#endif
72f13be8 10418
3dab1dad 10419 DEBUG_PARSE("clas");
7f6f358c
YO
10420
10421 /* Assume we are going to generate an ANYOF node. */
ffc61ed2
JH
10422 ret = reganode(pRExC_state, ANYOF, 0);
10423
56ca34ca
KW
10424
10425 if (!SIZE_ONLY) {
ffc61ed2 10426 ANYOF_FLAGS(ret) = 0;
56ca34ca 10427 }
ffc61ed2 10428
9a86a77b 10429 if (UCHARAT(RExC_parse) == '^') { /* Complement of range. */
ffc61ed2
JH
10430 RExC_naughty++;
10431 RExC_parse++;
10432 if (!SIZE_ONLY)
10433 ANYOF_FLAGS(ret) |= ANYOF_INVERT;
827f5bb8
KW
10434
10435 /* We have decided to not allow multi-char folds in inverted character
ac455f4c
KW
10436 * classes, due to the confusion that can happen, especially with
10437 * classes that are designed for a non-Unicode world: You have the
10438 * peculiar case that:
827f5bb8
KW
10439 "s s" =~ /^[^\xDF]+$/i => Y
10440 "ss" =~ /^[^\xDF]+$/i => N
10441 *
10442 * See [perl #89750] */
10443 allow_full_fold = FALSE;
ffc61ed2 10444 }
a0d0e21e 10445
73060fc4 10446 if (SIZE_ONLY) {
830247a4 10447 RExC_size += ANYOF_SKIP;
73060fc4
JH
10448 listsv = &PL_sv_undef; /* For code scanners: listsv always non-NULL. */
10449 }
936ed897 10450 else {
830247a4 10451 RExC_emit += ANYOF_SKIP;
3a15e693 10452 if (LOC) {
936ed897 10453 ANYOF_FLAGS(ret) |= ANYOF_LOCALE;
3a15e693 10454 }
ffc61ed2 10455 ANYOF_BITMAP_ZERO(ret);
396482e1 10456 listsv = newSVpvs("# comment\n");
137165a6 10457 initial_listsv_len = SvCUR(listsv);
a0d0e21e 10458 }
b8c5462f 10459
9a86a77b
JH
10460 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10461
b938889d 10462 if (!SIZE_ONLY && POSIXCC(nextvalue))
830247a4 10463 checkposixcc(pRExC_state);
b8c5462f 10464
f064b6ad
HS
10465 /* allow 1st char to be ] (allowing it to be - is dealt with later) */
10466 if (UCHARAT(RExC_parse) == ']')
10467 goto charclassloop;
ffc61ed2 10468
fc8cd66c 10469parseit:
9a86a77b 10470 while (RExC_parse < RExC_end && UCHARAT(RExC_parse) != ']') {
ffc61ed2
JH
10471
10472 charclassloop:
10473
10474 namedclass = OOB_NAMEDCLASS; /* initialize as illegal */
10475
dc415b4a 10476 if (!range) {
830247a4 10477 rangebegin = RExC_parse;
dc415b4a
KW
10478 element_count++;
10479 }
ffc61ed2 10480 if (UTF) {
5e12f4fb 10481 value = utf8n_to_uvchr((U8*)RExC_parse,
3568d838 10482 RExC_end - RExC_parse,
9f7f3913 10483 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
10484 RExC_parse += numlen;
10485 }
10486 else
10487 value = UCHARAT(RExC_parse++);
7f6f358c 10488
9a86a77b
JH
10489 nextvalue = RExC_parse < RExC_end ? UCHARAT(RExC_parse) : 0;
10490 if (value == '[' && POSIXCC(nextvalue))
830247a4 10491 namedclass = regpposixcc(pRExC_state, value);
620e46c5 10492 else if (value == '\\') {
ffc61ed2 10493 if (UTF) {
5e12f4fb 10494 value = utf8n_to_uvchr((U8*)RExC_parse,
ffc61ed2 10495 RExC_end - RExC_parse,
9f7f3913 10496 &numlen, UTF8_ALLOW_DEFAULT);
ffc61ed2
JH
10497 RExC_parse += numlen;
10498 }
10499 else
10500 value = UCHARAT(RExC_parse++);
470c3474 10501 /* Some compilers cannot handle switching on 64-bit integer
ffc61ed2 10502 * values, therefore value cannot be an UV. Yes, this will
e2962f66
JH
10503 * be a problem later if we want switch on Unicode.
10504 * A similar issue a little bit later when switching on
10505 * namedclass. --jhi */
ffc61ed2 10506 switch ((I32)value) {
b8c5462f
JH
10507 case 'w': namedclass = ANYOF_ALNUM; break;
10508 case 'W': namedclass = ANYOF_NALNUM; break;
10509 case 's': namedclass = ANYOF_SPACE; break;
10510 case 'S': namedclass = ANYOF_NSPACE; break;
10511 case 'd': namedclass = ANYOF_DIGIT; break;
10512 case 'D': namedclass = ANYOF_NDIGIT; break;
e1d1eefb
YO
10513 case 'v': namedclass = ANYOF_VERTWS; break;
10514 case 'V': namedclass = ANYOF_NVERTWS; break;
10515 case 'h': namedclass = ANYOF_HORIZWS; break;
10516 case 'H': namedclass = ANYOF_NHORIZWS; break;
fc8cd66c
YO
10517 case 'N': /* Handle \N{NAME} in class */
10518 {
10519 /* We only pay attention to the first char of
10520 multichar strings being returned. I kinda wonder
10521 if this makes sense as it does change the behaviour
10522 from earlier versions, OTOH that behaviour was broken
10523 as well. */
10524 UV v; /* value is register so we cant & it /grrr */
9d64099b 10525 if (reg_namedseq(pRExC_state, &v, NULL, depth)) {
fc8cd66c
YO
10526 goto parseit;
10527 }
10528 value= v;
10529 }
10530 break;
ffc61ed2
JH
10531 case 'p':
10532 case 'P':
3dab1dad
YO
10533 {
10534 char *e;
af6f566e 10535 if (RExC_parse >= RExC_end)
2a4859cd 10536 vFAIL2("Empty \\%c{}", (U8)value);
ffc61ed2 10537 if (*RExC_parse == '{') {
1df70142 10538 const U8 c = (U8)value;
ffc61ed2
JH
10539 e = strchr(RExC_parse++, '}');
10540 if (!e)
0da60cf5 10541 vFAIL2("Missing right brace on \\%c{}", c);
ab13f0c7
JH
10542 while (isSPACE(UCHARAT(RExC_parse)))
10543 RExC_parse++;
10544 if (e == RExC_parse)
0da60cf5 10545 vFAIL2("Empty \\%c{}", c);
ffc61ed2 10546 n = e - RExC_parse;
ab13f0c7
JH
10547 while (isSPACE(UCHARAT(RExC_parse + n - 1)))
10548 n--;
ffc61ed2
JH
10549 }
10550 else {
10551 e = RExC_parse;
10552 n = 1;
10553 }
ee410026 10554 if (!SIZE_ONLY) {
dc415b4a
KW
10555 SV** invlistsvp;
10556 SV* invlist;
10557 char* name;
ab13f0c7
JH
10558 if (UCHARAT(RExC_parse) == '^') {
10559 RExC_parse++;
10560 n--;
10561 value = value == 'p' ? 'P' : 'p'; /* toggle */
10562 while (isSPACE(UCHARAT(RExC_parse))) {
10563 RExC_parse++;
10564 n--;
10565 }
10566 }
dc415b4a
KW
10567 /* Try to get the definition of the property into
10568 * <invlist>. If /i is in effect, the effective property
10569 * will have its name be <__NAME_i>. The design is
10570 * discussed in commit
10571 * 2f833f5208e26b208886e51e09e2c072b5eabb46 */
10572 Newx(name, n + sizeof("_i__\n"), char);
10573
10574 sprintf(name, "%s%.*s%s\n",
10575 (FOLD) ? "__" : "",
10576 (int)n,
10577 RExC_parse,
10578 (FOLD) ? "_i" : ""
10579 );
10580
10581 /* Look up the property name, and get its swash and
10582 * inversion list, if the property is found */
dc415b4a
KW
10583 if (swash) {
10584 SvREFCNT_dec(swash);
10585 }
10586 swash = _core_swash_init("utf8", name, &PL_sv_undef,
10587 1, /* binary */
10588 0, /* not tr/// */
10589 TRUE, /* this routine will handle
10590 undefined properties */
10591 NULL, FALSE /* No inversion list */
10592 );
b6c46382 10593 if ( ! swash
dc415b4a
KW
10594 || ! SvROK(swash)
10595 || ! SvTYPE(SvRV(swash)) == SVt_PVHV
10596 || ! (invlistsvp =
10597 hv_fetchs(MUTABLE_HV(SvRV(swash)),
10598 "INVLIST", FALSE))
10599 || ! (invlist = *invlistsvp))
10600 {
10601 if (swash) {
10602 SvREFCNT_dec(swash);
10603 swash = NULL;
10604 }
10605
10606 /* Here didn't find it. It could be a user-defined
10607 * property that will be available at run-time. Add it
10608 * to the list to look up then */
10609 Perl_sv_catpvf(aTHX_ listsv, "%cutf8::%s\n",
10610 (value == 'p' ? '+' : '!'),
10611 name);
10612 has_user_defined_property = 1;
10613
10614 /* We don't know yet, so have to assume that the
10615 * property could match something in the Latin1 range,
10616 * hence something that isn't utf8 */
10617 ANYOF_FLAGS(ret) |= ANYOF_NONBITMAP_NON_UTF8;
10618 }
10619 else {
10620
10621 /* Here, did get the swash and its inversion list. If
10622 * the swash is from a user-defined property, then this
10623 * whole character class should be regarded as such */
10624 SV** user_defined_svp =
10625 hv_fetchs(MUTABLE_HV(SvRV(swash)),
10626 "USER_DEFINED", FALSE);
10627 if (user_defined_svp) {
10628 has_user_defined_property
10629 |= SvUV(*user_defined_svp);
10630 }
10631
10632 /* Invert if asking for the complement */
10633 if (value == 'P') {
8dc9348a 10634 _invlist_union_complement_2nd(properties, invlist, &properties);
2f833f52 10635
dc415b4a
KW
10636 /* The swash can't be used as-is, because we've
10637 * inverted things; delay removing it to here after
10638 * have copied its invlist above */
10639 SvREFCNT_dec(swash);
10640 swash = NULL;
10641 }
10642 else {
112b0fc6 10643 _invlist_union(properties, invlist, &properties);
dc415b4a
KW
10644 }
10645 }
10646 Safefree(name);
ffc61ed2
JH
10647 }
10648 RExC_parse = e + 1;
f81125e2 10649 namedclass = ANYOF_MAX; /* no official name, but it's named */
e40e74fe
KW
10650
10651 /* \p means they want Unicode semantics */
10652 RExC_uni_semantics = 1;
3dab1dad 10653 }
f81125e2 10654 break;
b8c5462f
JH
10655 case 'n': value = '\n'; break;
10656 case 'r': value = '\r'; break;
10657 case 't': value = '\t'; break;
10658 case 'f': value = '\f'; break;
10659 case 'b': value = '\b'; break;
c7f1f016
NIS
10660 case 'e': value = ASCII_TO_NATIVE('\033');break;
10661 case 'a': value = ASCII_TO_NATIVE('\007');break;
f0a2b745
KW
10662 case 'o':
10663 RExC_parse--; /* function expects to be pointed at the 'o' */
454155d9
KW
10664 {
10665 const char* error_msg;
10666 bool valid = grok_bslash_o(RExC_parse,
f0a2b745
KW
10667 &value,
10668 &numlen,
454155d9
KW
10669 &error_msg,
10670 SIZE_ONLY);
10671 RExC_parse += numlen;
10672 if (! valid) {
10673 vFAIL(error_msg);
10674 }
f0a2b745 10675 }
f0a2b745
KW
10676 if (PL_encoding && value < 0x100) {
10677 goto recode_encoding;
10678 }
10679 break;
b8c5462f 10680 case 'x':
ffc61ed2 10681 if (*RExC_parse == '{') {
a4c04bdc
NC
10682 I32 flags = PERL_SCAN_ALLOW_UNDERSCORES
10683 | PERL_SCAN_DISALLOW_PREFIX;
3dab1dad 10684 char * const e = strchr(RExC_parse++, '}');
b81d288d 10685 if (!e)
ffc61ed2 10686 vFAIL("Missing right brace on \\x{}");
53305cf1
NC
10687
10688 numlen = e - RExC_parse;
10689 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
10690 RExC_parse = e + 1;
10691 }
10692 else {
a4c04bdc 10693 I32 flags = PERL_SCAN_DISALLOW_PREFIX;
53305cf1
NC
10694 numlen = 2;
10695 value = grok_hex(RExC_parse, &numlen, &flags, NULL);
ffc61ed2
JH
10696 RExC_parse += numlen;
10697 }
9e08bc66
TS
10698 if (PL_encoding && value < 0x100)
10699 goto recode_encoding;
b8c5462f
JH
10700 break;
10701 case 'c':
17a3df4c 10702 value = grok_bslash_c(*RExC_parse++, UTF, SIZE_ONLY);
b8c5462f
JH
10703 break;
10704 case '0': case '1': case '2': case '3': case '4':
c99e91e9 10705 case '5': case '6': case '7':
9e08bc66 10706 {
c99e91e9
KW
10707 /* Take 1-3 octal digits */
10708 I32 flags = PERL_SCAN_SILENT_ILLDIGIT;
9e08bc66
TS
10709 numlen = 3;
10710 value = grok_oct(--RExC_parse, &numlen, &flags, NULL);
10711 RExC_parse += numlen;
10712 if (PL_encoding && value < 0x100)
10713 goto recode_encoding;
10714 break;
10715 }
10716 recode_encoding:
e2a7e165 10717 if (! RExC_override_recoding) {
9e08bc66
TS
10718 SV* enc = PL_encoding;
10719 value = reg_recode((const char)(U8)value, &enc);
668c081a
NC
10720 if (!enc && SIZE_ONLY)
10721 ckWARNreg(RExC_parse,
10722 "Invalid escape in the specified encoding");
9e08bc66
TS
10723 break;
10724 }
1028017a 10725 default:
c99e91e9
KW
10726 /* Allow \_ to not give an error */
10727 if (!SIZE_ONLY && isALNUM(value) && value != '_') {
668c081a
NC
10728 ckWARN2reg(RExC_parse,
10729 "Unrecognized escape \\%c in character class passed through",
10730 (int)value);
c99e91e9 10731 }
1028017a 10732 break;
b8c5462f 10733 }
ffc61ed2 10734 } /* end of \blah */
1b2d223b
JH
10735#ifdef EBCDIC
10736 else
10737 literal_endpoint++;
10738#endif
ffc61ed2
JH
10739
10740 if (namedclass > OOB_NAMEDCLASS) { /* this is a named class \blah */
10741
2c63ecad
KW
10742 /* What matches in a locale is not known until runtime, so need to
10743 * (one time per class) allocate extra space to pass to regexec.
10744 * The space will contain a bit for each named class that is to be
10745 * matched against. This isn't needed for \p{} and pseudo-classes,
10746 * as they are not affected by locale, and hence are dealt with
10747 * separately */
10748 if (LOC && namedclass < ANYOF_MAX && ! need_class) {
10749 need_class = 1;
10750 if (SIZE_ONLY) {
dd58aee1 10751 RExC_size += ANYOF_CLASS_SKIP - ANYOF_SKIP;
2c63ecad
KW
10752 }
10753 else {
dd58aee1 10754 RExC_emit += ANYOF_CLASS_SKIP - ANYOF_SKIP;
2c63ecad
KW
10755 ANYOF_CLASS_ZERO(ret);
10756 }
9051cfd9 10757 ANYOF_FLAGS(ret) |= ANYOF_CLASS;
2c63ecad 10758 }
ffc61ed2 10759
d5788240 10760 /* a bad range like a-\d, a-[:digit:]. The '-' is taken as a
1d791ab2
KW
10761 * literal, as is the character that began the false range, i.e.
10762 * the 'a' in the examples */
ffc61ed2 10763 if (range) {
73b437c8 10764 if (!SIZE_ONLY) {
668c081a
NC
10765 const int w =
10766 RExC_parse >= rangebegin ?
10767 RExC_parse - rangebegin : 0;
10768 ckWARN4reg(RExC_parse,
b45f050a 10769 "False [] range \"%*.*s\"",
097eb12c 10770 w, w, rangebegin);
668c081a 10771
1d791ab2 10772 stored +=
5bfec14d 10773 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
3568d838 10774 if (prevvalue < 256) {
2283d326 10775 stored +=
5bfec14d 10776 set_regclass_bit(pRExC_state, ret, (U8) prevvalue, &l1_fold_invlist, &unicode_alternate);
ffc61ed2
JH
10777 }
10778 else {
1d791ab2 10779 nonbitmap = add_cp_to_invlist(nonbitmap, prevvalue);
ffc61ed2 10780 }
b8c5462f 10781 }
ffc61ed2
JH
10782
10783 range = 0; /* this was not a true range */
73b437c8 10784 }
ffc61ed2 10785
73b437c8 10786 if (!SIZE_ONLY) {
c49a72a9 10787
e2962f66
JH
10788 /* Possible truncation here but in some 64-bit environments
10789 * the compiler gets heartburn about switch on 64-bit values.
10790 * A similar issue a little earlier when switching on value.
98f323fa 10791 * --jhi */
e2962f66 10792 switch ((I32)namedclass) {
ea317ccb
KW
10793
10794 case ANYOF_ALNUMC: /* C's alnum, in contrast to \w */
10795 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10796 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
10797 break;
10798 case ANYOF_NALNUMC:
10799 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10800 PL_PosixAlnum, PL_L1PosixAlnum, "XPosixAlnum", listsv);
10801 break;
10802 case ANYOF_ALPHA:
10803 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10804 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
10805 break;
10806 case ANYOF_NALPHA:
10807 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10808 PL_PosixAlpha, PL_L1PosixAlpha, "XPosixAlpha", listsv);
10809 break;
73b437c8 10810 case ANYOF_ASCII:
ea317ccb
KW
10811 if (LOC) {
10812 ANYOF_CLASS_SET(ret, namedclass);
73b437c8 10813 }
ea317ccb
KW
10814 else {
10815 _invlist_union(properties, PL_ASCII, &properties);
10816 }
73b437c8
JH
10817 break;
10818 case ANYOF_NASCII:
ea317ccb
KW
10819 if (LOC) {
10820 ANYOF_CLASS_SET(ret, namedclass);
73b437c8 10821 }
ea317ccb
KW
10822 else {
10823 _invlist_union_complement_2nd(properties,
10824 PL_ASCII, &properties);
10825 if (DEPENDS_SEMANTICS) {
10826 ANYOF_FLAGS(ret) |= ANYOF_NON_UTF8_LATIN1_ALL;
10827 }
10828 }
10829 break;
10830 case ANYOF_BLANK:
10831 DO_POSIX(ret, namedclass, properties,
10832 PL_PosixBlank, PL_XPosixBlank);
10833 break;
10834 case ANYOF_NBLANK:
10835 DO_N_POSIX(ret, namedclass, properties,
10836 PL_PosixBlank, PL_XPosixBlank);
10837 break;
10838 case ANYOF_CNTRL:
10839 DO_POSIX(ret, namedclass, properties,
10840 PL_PosixCntrl, PL_XPosixCntrl);
10841 break;
10842 case ANYOF_NCNTRL:
10843 DO_N_POSIX(ret, namedclass, properties,
10844 PL_PosixCntrl, PL_XPosixCntrl);
10845 break;
ffc61ed2 10846 case ANYOF_DIGIT:
ea317ccb
KW
10847 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10848 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
10849 break;
10850 case ANYOF_NDIGIT:
10851 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10852 PL_PosixDigit, PL_PosixDigit, "XPosixDigit", listsv);
10853 break;
10854 case ANYOF_GRAPH:
10855 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10856 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
10857 break;
10858 case ANYOF_NGRAPH:
10859 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10860 PL_PosixGraph, PL_L1PosixGraph, "XPosixGraph", listsv);
10861 break;
10862 case ANYOF_HORIZWS:
10863 /* For these, we use the nonbitmap, as /d doesn't make a
10864 * difference in what these match. There would be problems
10865 * if these characters had folds other than themselves, as
24caacbc
KW
10866 * nonbitmap is subject to folding. It turns out that \h
10867 * is just a synonym for XPosixBlank */
10868 _invlist_union(nonbitmap, PL_XPosixBlank, &nonbitmap);
ea317ccb
KW
10869 break;
10870 case ANYOF_NHORIZWS:
10871 _invlist_union_complement_2nd(nonbitmap,
24caacbc 10872 PL_XPosixBlank, &nonbitmap);
ea317ccb
KW
10873 break;
10874 case ANYOF_LOWER:
10875 case ANYOF_NLOWER:
10876 { /* These require special handling, as they differ under
dab0c3e7
KW
10877 folding, matching Cased there (which in the ASCII range
10878 is the same as Alpha */
ea317ccb
KW
10879
10880 SV* ascii_source;
10881 SV* l1_source;
10882 const char *Xname;
10883
10884 if (FOLD && ! LOC) {
10885 ascii_source = PL_PosixAlpha;
dab0c3e7
KW
10886 l1_source = PL_L1Cased;
10887 Xname = "Cased";
ea317ccb 10888 }
ffc61ed2 10889 else {
ea317ccb
KW
10890 ascii_source = PL_PosixLower;
10891 l1_source = PL_L1PosixLower;
10892 Xname = "XPosixLower";
10893 }
10894 if (namedclass == ANYOF_LOWER) {
10895 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10896 ascii_source, l1_source, Xname, listsv);
10897 }
10898 else {
10899 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
10900 properties, ascii_source, l1_source, Xname, listsv);
ffc61ed2 10901 }
ffc61ed2 10902 break;
ea317ccb
KW
10903 }
10904 case ANYOF_PRINT:
10905 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10906 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
10907 break;
10908 case ANYOF_NPRINT:
10909 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10910 PL_PosixPrint, PL_L1PosixPrint, "XPosixPrint", listsv);
10911 break;
10912 case ANYOF_PUNCT:
10913 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10914 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
10915 break;
10916 case ANYOF_NPUNCT:
10917 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10918 PL_PosixPunct, PL_L1PosixPunct, "XPosixPunct", listsv);
10919 break;
10920 case ANYOF_PSXSPC:
10921 DO_POSIX(ret, namedclass, properties,
10922 PL_PosixSpace, PL_XPosixSpace);
10923 break;
10924 case ANYOF_NPSXSPC:
10925 DO_N_POSIX(ret, namedclass, properties,
10926 PL_PosixSpace, PL_XPosixSpace);
10927 break;
10928 case ANYOF_SPACE:
10929 DO_POSIX(ret, namedclass, properties,
10930 PL_PerlSpace, PL_XPerlSpace);
10931 break;
10932 case ANYOF_NSPACE:
10933 DO_N_POSIX(ret, namedclass, properties,
10934 PL_PerlSpace, PL_XPerlSpace);
10935 break;
10936 case ANYOF_UPPER: /* Same as LOWER, above */
10937 case ANYOF_NUPPER:
10938 {
10939 SV* ascii_source;
10940 SV* l1_source;
10941 const char *Xname;
10942
10943 if (FOLD && ! LOC) {
10944 ascii_source = PL_PosixAlpha;
dab0c3e7
KW
10945 l1_source = PL_L1Cased;
10946 Xname = "Cased";
ea317ccb 10947 }
ffc61ed2 10948 else {
ea317ccb
KW
10949 ascii_source = PL_PosixUpper;
10950 l1_source = PL_L1PosixUpper;
10951 Xname = "XPosixUpper";
ffc61ed2 10952 }
ea317ccb
KW
10953 if (namedclass == ANYOF_UPPER) {
10954 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10955 ascii_source, l1_source, Xname, listsv);
cfaf538b 10956 }
ea317ccb
KW
10957 else {
10958 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass,
10959 properties, ascii_source, l1_source, Xname, listsv);
10960 }
10961 break;
10962 }
10963 case ANYOF_ALNUM: /* Really is 'Word' */
10964 DO_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10965 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
10966 break;
10967 case ANYOF_NALNUM:
10968 DO_N_POSIX_LATIN1_ONLY_KNOWN(ret, namedclass, properties,
10969 PL_PosixWord, PL_L1PosixWord, "XPosixWord", listsv);
10970 break;
10971 case ANYOF_VERTWS:
10972 /* For these, we use the nonbitmap, as /d doesn't make a
10973 * difference in what these match. There would be problems
10974 * if these characters had folds other than themselves, as
10975 * nonbitmap is subject to folding */
10976 _invlist_union(nonbitmap, PL_VertSpace, &nonbitmap);
10977 break;
10978 case ANYOF_NVERTWS:
10979 _invlist_union_complement_2nd(nonbitmap,
10980 PL_VertSpace, &nonbitmap);
10981 break;
10982 case ANYOF_XDIGIT:
10983 DO_POSIX(ret, namedclass, properties,
10984 PL_PosixXDigit, PL_XPosixXDigit);
10985 break;
10986 case ANYOF_NXDIGIT:
10987 DO_N_POSIX(ret, namedclass, properties,
10988 PL_PosixXDigit, PL_XPosixXDigit);
10989 break;
f81125e2
JP
10990 case ANYOF_MAX:
10991 /* this is to handle \p and \P */
10992 break;
73b437c8 10993 default:
b45f050a 10994 vFAIL("Invalid [::] class");
73b437c8 10995 break;
b8c5462f 10996 }
ce1c68b2 10997
73b437c8 10998 continue;
a0d0e21e 10999 }
ffc61ed2
JH
11000 } /* end of namedclass \blah */
11001
a0d0e21e 11002 if (range) {
eb160463 11003 if (prevvalue > (IV)value) /* b-a */ {
d4c19fe8
AL
11004 const int w = RExC_parse - rangebegin;
11005 Simple_vFAIL4("Invalid [] range \"%*.*s\"", w, w, rangebegin);
3568d838 11006 range = 0; /* not a valid range */
73b437c8 11007 }
a0d0e21e
LW
11008 }
11009 else {
3568d838 11010 prevvalue = value; /* save the beginning of the range */
646253b5
KW
11011 if (RExC_parse+1 < RExC_end
11012 && *RExC_parse == '-'
11013 && RExC_parse[1] != ']')
11014 {
830247a4 11015 RExC_parse++;
ffc61ed2
JH
11016
11017 /* a bad range like \w-, [:word:]- ? */
11018 if (namedclass > OOB_NAMEDCLASS) {
afd78fd5 11019 if (ckWARN(WARN_REGEXP)) {
d4c19fe8 11020 const int w =
afd78fd5
JH
11021 RExC_parse >= rangebegin ?
11022 RExC_parse - rangebegin : 0;
830247a4 11023 vWARN4(RExC_parse,
b45f050a 11024 "False [] range \"%*.*s\"",
097eb12c 11025 w, w, rangebegin);
afd78fd5 11026 }
73b437c8 11027 if (!SIZE_ONLY)
2283d326 11028 stored +=
5bfec14d 11029 set_regclass_bit(pRExC_state, ret, '-', &l1_fold_invlist, &unicode_alternate);
73b437c8 11030 } else
ffc61ed2
JH
11031 range = 1; /* yeah, it's a range! */
11032 continue; /* but do it the next time */
a0d0e21e 11033 }
a687059c 11034 }
ffc61ed2 11035
046c4055
KW
11036 /* non-Latin1 code point implies unicode semantics. Must be set in
11037 * pass1 so is there for the whole of pass 2 */
56ca34ca
KW
11038 if (value > 255) {
11039 RExC_uni_semantics = 1;
11040 }
11041
93733859 11042 /* now is the next time */
ae5c130c 11043 if (!SIZE_ONLY) {
3568d838 11044 if (prevvalue < 256) {
1df70142 11045 const IV ceilvalue = value < 256 ? value : 255;
3dab1dad 11046 IV i;
3568d838 11047#ifdef EBCDIC
1b2d223b
JH
11048 /* In EBCDIC [\x89-\x91] should include
11049 * the \x8e but [i-j] should not. */
11050 if (literal_endpoint == 2 &&
11051 ((isLOWER(prevvalue) && isLOWER(ceilvalue)) ||
11052 (isUPPER(prevvalue) && isUPPER(ceilvalue))))
ffc61ed2 11053 {
3568d838
JH
11054 if (isLOWER(prevvalue)) {
11055 for (i = prevvalue; i <= ceilvalue; i++)
2670d666 11056 if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
2283d326 11057 stored +=
5bfec14d 11058 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
2670d666 11059 }
ffc61ed2 11060 } else {
3568d838 11061 for (i = prevvalue; i <= ceilvalue; i++)
2670d666 11062 if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
2283d326 11063 stored +=
5bfec14d 11064 set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
2670d666 11065 }
ffc61ed2 11066 }
8ada0baa 11067 }
ffc61ed2 11068 else
8ada0baa 11069#endif
07be1b83 11070 for (i = prevvalue; i <= ceilvalue; i++) {
5bfec14d 11071 stored += set_regclass_bit(pRExC_state, ret, (U8) i, &l1_fold_invlist, &unicode_alternate);
07be1b83 11072 }
3568d838 11073 }
56ca34ca
KW
11074 if (value > 255) {
11075 const UV prevnatvalue = NATIVE_TO_UNI(prevvalue);
11076 const UV natvalue = NATIVE_TO_UNI(value);
56ca34ca 11077 nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
56ca34ca 11078 }
1b2d223b
JH
11079#ifdef EBCDIC
11080 literal_endpoint = 0;
11081#endif
8ada0baa 11082 }
ffc61ed2
JH
11083
11084 range = 0; /* this range (if it was one) is done now */
a0d0e21e 11085 }
ffc61ed2 11086
ffc61ed2 11087
7f6f358c
YO
11088
11089 if (SIZE_ONLY)
11090 return ret;
11091 /****** !SIZE_ONLY AFTER HERE *********/
11092
0c6e4288
KW
11093 /* If folding and there are code points above 255, we calculate all
11094 * characters that could fold to or from the ones already on the list */
11095 if (FOLD && nonbitmap) {
0d527bf8 11096 UV start, end; /* End points of code point ranges */
56ca34ca 11097
4065ba03 11098 SV* fold_intersection = NULL;
93e5bb1c
KW
11099
11100 /* This is a list of all the characters that participate in folds
11101 * (except marks, etc in multi-char folds */
11102 if (! PL_utf8_foldable) {
11103 SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
11104 PL_utf8_foldable = _swash_to_invlist(swash);
0501f9a8 11105 SvREFCNT_dec(swash);
93e5bb1c 11106 }
56ca34ca 11107
93e5bb1c
KW
11108 /* This is a hash that for a particular fold gives all characters
11109 * that are involved in it */
11110 if (! PL_utf8_foldclosures) {
11111
11112 /* If we were unable to find any folds, then we likely won't be
11113 * able to find the closures. So just create an empty list.
11114 * Folding will effectively be restricted to the non-Unicode rules
11115 * hard-coded into Perl. (This case happens legitimately during
11116 * compilation of Perl itself before the Unicode tables are
11117 * generated) */
11118 if (invlist_len(PL_utf8_foldable) == 0) {
ddc1cd80 11119 PL_utf8_foldclosures = newHV();
93e5bb1c
KW
11120 } else {
11121 /* If the folds haven't been read in, call a fold function
11122 * to force that */
11123 if (! PL_utf8_tofold) {
11124 U8 dummy[UTF8_MAXBYTES+1];
11125 STRLEN dummy_len;
f26f1b9c
KW
11126
11127 /* This particular string is above \xff in both UTF-8 and
11128 * UTFEBCDIC */
11129 to_utf8_fold((U8*) "\xC8\x80", dummy, &dummy_len);
686c8a98 11130 assert(PL_utf8_tofold); /* Verify that worked */
56ca34ca 11131 }
93e5bb1c 11132 PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
56ca34ca 11133 }
93e5bb1c
KW
11134 }
11135
dbe7a391
KW
11136 /* Only the characters in this class that participate in folds need be
11137 * checked. Get the intersection of this class and all the possible
11138 * characters that are foldable. This can quickly narrow down a large
11139 * class */
37e85ffe 11140 _invlist_intersection(PL_utf8_foldable, nonbitmap, &fold_intersection);
93e5bb1c
KW
11141
11142 /* Now look at the foldable characters in this class individually */
0d527bf8
KW
11143 invlist_iterinit(fold_intersection);
11144 while (invlist_iternext(fold_intersection, &start, &end)) {
93e5bb1c
KW
11145 UV j;
11146
93e5bb1c
KW
11147 /* Look at every character in the range */
11148 for (j = start; j <= end; j++) {
11149
11150 /* Get its fold */
11151 U8 foldbuf[UTF8_MAXBYTES_CASE+1];
11152 STRLEN foldlen;
827f5bb8
KW
11153 const UV f =
11154 _to_uni_fold_flags(j, foldbuf, &foldlen, allow_full_fold);
93e5bb1c
KW
11155
11156 if (foldlen > (STRLEN)UNISKIP(f)) {
11157
dbe7a391
KW
11158 /* Any multicharacter foldings (disallowed in lookbehind
11159 * patterns) require the following transform: [ABCDEF] ->
11160 * (?:[ABCabcDEFd]|pq|rst) where E folds into "pq" and F
11161 * folds into "rst", all other characters fold to single
11162 * characters. We save away these multicharacter foldings,
11163 * to be later saved as part of the additional "s" data. */
93e5bb1c
KW
11164 if (! RExC_in_lookbehind) {
11165 U8* loc = foldbuf;
11166 U8* e = foldbuf + foldlen;
11167
dbe7a391
KW
11168 /* If any of the folded characters of this are in the
11169 * Latin1 range, tell the regex engine that this can
11170 * match a non-utf8 target string. The only multi-byte
11171 * fold whose source is in the Latin1 range (U+00DF)
11172 * applies only when the target string is utf8, or
11173 * under unicode rules */
93e5bb1c
KW
11174 if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
11175 while (loc < e) {
11176
11177 /* Can't mix ascii with non- under /aa */
11178 if (MORE_ASCII_RESTRICTED
11179 && (isASCII(*loc) != isASCII(j)))
11180 {
11181 goto end_multi_fold;
11182 }
11183 if (UTF8_IS_INVARIANT(*loc)
11184 || UTF8_IS_DOWNGRADEABLE_START(*loc))
11185 {
dbe7a391
KW
11186 /* Can't mix above and below 256 under LOC
11187 */
93e5bb1c 11188 if (LOC) {
2f7f8cb1
KW
11189 goto end_multi_fold;
11190 }
93e5bb1c
KW
11191 ANYOF_FLAGS(ret)
11192 |= ANYOF_NONBITMAP_NON_UTF8;
11193 break;
8e3094e5 11194 }
93e5bb1c 11195 loc += UTF8SKIP(loc);
8e3094e5 11196 }
56ca34ca 11197 }
17580e7a 11198
93e5bb1c
KW
11199 add_alternate(&unicode_alternate, foldbuf, foldlen);
11200 end_multi_fold: ;
11201 }
14e30abc
KW
11202
11203 /* This is special-cased, as it is the only letter which
11204 * has both a multi-fold and single-fold in Latin1. All
11205 * the other chars that have single and multi-folds are
11206 * always in utf8, and the utf8 folding algorithm catches
11207 * them */
11208 if (! LOC && j == LATIN_CAPITAL_LETTER_SHARP_S) {
11209 stored += set_regclass_bit(pRExC_state,
11210 ret,
11211 LATIN_SMALL_LETTER_SHARP_S,
11212 &l1_fold_invlist, &unicode_alternate);
11213 }
93e5bb1c
KW
11214 }
11215 else {
11216 /* Single character fold. Add everything in its fold
dbe7a391 11217 * closure to the list that this node should match */
93e5bb1c
KW
11218 SV** listp;
11219
dbe7a391
KW
11220 /* The fold closures data structure is a hash with the keys
11221 * being every character that is folded to, like 'k', and
11222 * the values each an array of everything that folds to its
11223 * key. e.g. [ 'k', 'K', KELVIN_SIGN ] */
93e5bb1c
KW
11224 if ((listp = hv_fetch(PL_utf8_foldclosures,
11225 (char *) foldbuf, foldlen, FALSE)))
11226 {
11227 AV* list = (AV*) *listp;
11228 IV k;
11229 for (k = 0; k <= av_len(list); k++) {
11230 SV** c_p = av_fetch(list, k, FALSE);
11231 UV c;
11232 if (c_p == NULL) {
11233 Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
11234 }
11235 c = SvUV(*c_p);
11236
dbe7a391
KW
11237 /* /aa doesn't allow folds between ASCII and non-;
11238 * /l doesn't allow them between above and below
11239 * 256 */
93e5bb1c
KW
11240 if ((MORE_ASCII_RESTRICTED
11241 && (isASCII(c) != isASCII(j)))
11242 || (LOC && ((c < 256) != (j < 256))))
11243 {
11244 continue;
11245 }
56ca34ca 11246
93e5bb1c
KW
11247 if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
11248 stored += set_regclass_bit(pRExC_state,
11249 ret,
11250 (U8) c,
11251 &l1_fold_invlist, &unicode_alternate);
11252 }
dbe7a391
KW
11253 /* It may be that the code point is already in
11254 * this range or already in the bitmap, in
11255 * which case we need do nothing */
93e5bb1c
KW
11256 else if ((c < start || c > end)
11257 && (c > 255
11258 || ! ANYOF_BITMAP_TEST(ret, c)))
11259 {
11260 nonbitmap = add_cp_to_invlist(nonbitmap, c);
56ca34ca
KW
11261 }
11262 }
11263 }
11264 }
11265 }
93e5bb1c 11266 }
318c430e 11267 SvREFCNT_dec(fold_intersection);
56ca34ca
KW
11268 }
11269
53742956
KW
11270 /* Combine the two lists into one. */
11271 if (l1_fold_invlist) {
11272 if (nonbitmap) {
37e85ffe 11273 _invlist_union(nonbitmap, l1_fold_invlist, &nonbitmap);
318c430e 11274 SvREFCNT_dec(l1_fold_invlist);
53742956
KW
11275 }
11276 else {
11277 nonbitmap = l1_fold_invlist;
11278 }
11279 }
11280
dc415b4a
KW
11281 /* And combine the result (if any) with any inversion list from properties.
11282 * The lists are kept separate up to now because we don't want to fold the
11283 * properties */
11284 if (properties) {
11285 if (nonbitmap) {
11286 _invlist_union(nonbitmap, properties, &nonbitmap);
11287 SvREFCNT_dec(properties);
11288 }
11289 else {
11290 nonbitmap = properties;
11291 }
11292 }
11293
e4e94b48
KW
11294 /* Here, <nonbitmap> contains all the code points we can determine at
11295 * compile time that we haven't put into the bitmap. Go through it, and
11296 * for things that belong in the bitmap, put them there, and delete from
11297 * <nonbitmap> */
11298 if (nonbitmap) {
11299
11300 /* Above-ASCII code points in /d have to stay in <nonbitmap>, as they
11301 * possibly only should match when the target string is UTF-8 */
11302 UV max_cp_to_set = (DEPENDS_SEMANTICS) ? 127 : 255;
11303
11304 /* This gets set if we actually need to modify things */
11305 bool change_invlist = FALSE;
11306
11307 UV start, end;
11308
11309 /* Start looking through <nonbitmap> */
11310 invlist_iterinit(nonbitmap);
11311 while (invlist_iternext(nonbitmap, &start, &end)) {
11312 UV high;
11313 int i;
11314
11315 /* Quit if are above what we should change */
11316 if (start > max_cp_to_set) {
11317 break;
11318 }
11319
11320 change_invlist = TRUE;
11321
11322 /* Set all the bits in the range, up to the max that we are doing */
11323 high = (end < max_cp_to_set) ? end : max_cp_to_set;
11324 for (i = start; i <= (int) high; i++) {
11325 if (! ANYOF_BITMAP_TEST(ret, i)) {
11326 ANYOF_BITMAP_SET(ret, i);
11327 stored++;
11328 prevvalue = value;
11329 value = i;
11330 }
11331 }
11332 }
11333
a3e1f3a6
KW
11334 /* Done with loop; remove any code points that are in the bitmap from
11335 * <nonbitmap> */
e4e94b48 11336 if (change_invlist) {
a3e1f3a6
KW
11337 _invlist_subtract(nonbitmap,
11338 (DEPENDS_SEMANTICS)
11339 ? PL_ASCII
11340 : PL_Latin1,
11341 &nonbitmap);
e4e94b48
KW
11342 }
11343
11344 /* If have completely emptied it, remove it completely */
11345 if (invlist_len(nonbitmap) == 0) {
11346 SvREFCNT_dec(nonbitmap);
11347 nonbitmap = NULL;
11348 }
11349 }
dc415b4a 11350
fb9bfbf7 11351 /* Here, we have calculated what code points should be in the character
e4e94b48
KW
11352 * class. <nonbitmap> does not overlap the bitmap except possibly in the
11353 * case of DEPENDS rules.
dbe7a391
KW
11354 *
11355 * Now we can see about various optimizations. Fold calculation (which we
11356 * did above) needs to take place before inversion. Otherwise /[^k]/i
11357 * would invert to include K, which under /i would match k, which it
11358 * shouldn't. */
fb9bfbf7 11359
f56b6394 11360 /* Optimize inverted simple patterns (e.g. [^a-z]). Note that we haven't
dbe7a391 11361 * set the FOLD flag yet, so this does optimize those. It doesn't
40c78556
KW
11362 * optimize locale. Doing so perhaps could be done as long as there is
11363 * nothing like \w in it; some thought also would have to be given to the
11364 * interaction with above 0x100 chars */
dbe7a391
KW
11365 if ((ANYOF_FLAGS(ret) & ANYOF_INVERT)
11366 && ! LOC
137165a6 11367 && ! unicode_alternate
2fde50e1
KW
11368 /* In case of /d, there are some things that should match only when in
11369 * not in the bitmap, i.e., they require UTF8 to match. These are
dbe7a391
KW
11370 * listed in nonbitmap, but if ANYOF_NONBITMAP_NON_UTF8 is set in this
11371 * case, they don't require UTF8, so can invert here */
2fde50e1
KW
11372 && (! nonbitmap
11373 || ! DEPENDS_SEMANTICS
11374 || (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
137165a6
KW
11375 && SvCUR(listsv) == initial_listsv_len)
11376 {
a9746a27 11377 int i;
2fde50e1 11378 if (! nonbitmap) {
a9746a27
KW
11379 for (i = 0; i < 256; ++i) {
11380 if (ANYOF_BITMAP_TEST(ret, i)) {
11381 ANYOF_BITMAP_CLEAR(ret, i);
11382 }
11383 else {
11384 ANYOF_BITMAP_SET(ret, i);
11385 prevvalue = value;
11386 value = i;
11387 }
11388 }
2fde50e1
KW
11389 /* The inversion means that everything above 255 is matched */
11390 ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
11391 }
11392 else {
4004d578
KW
11393 /* Here, also has things outside the bitmap that may overlap with
11394 * the bitmap. We have to sync them up, so that they get inverted
11395 * in both places. Earlier, we removed all overlaps except in the
11396 * case of /d rules, so no syncing is needed except for this case
11397 */
11398 SV *remove_list = NULL;
11399
11400 if (DEPENDS_SEMANTICS) {
11401 UV start, end;
11402
11403 /* Set the bits that correspond to the ones that aren't in the
11404 * bitmap. Otherwise, when we invert, we'll miss these.
11405 * Earlier, we removed from the nonbitmap all code points
11406 * < 128, so there is no extra work here */
11407 invlist_iterinit(nonbitmap);
11408 while (invlist_iternext(nonbitmap, &start, &end)) {
11409 if (start > 255) { /* The bit map goes to 255 */
11410 break;
11411 }
11412 if (end > 255) {
11413 end = 255;
11414 }
11415 for (i = start; i <= (int) end; ++i) {
11416 ANYOF_BITMAP_SET(ret, i);
11417 prevvalue = value;
11418 value = i;
11419 }
11420 }
11421 }
a9746a27
KW
11422
11423 /* Now invert both the bitmap and the nonbitmap. Anything in the
4004d578
KW
11424 * bitmap has to also be removed from the non-bitmap, but again,
11425 * there should not be overlap unless is /d rules. */
37e85ffe 11426 _invlist_invert(nonbitmap);
4004d578 11427
5d103fec
KW
11428 /* Any swash can't be used as-is, because we've inverted things */
11429 if (swash) {
11430 SvREFCNT_dec(swash);
11431 swash = NULL;
11432 }
11433
a9746a27
KW
11434 for (i = 0; i < 256; ++i) {
11435 if (ANYOF_BITMAP_TEST(ret, i)) {
11436 ANYOF_BITMAP_CLEAR(ret, i);
4004d578
KW
11437 if (DEPENDS_SEMANTICS) {
11438 if (! remove_list) {
11439 remove_list = _new_invlist(2);
11440 }
11441 remove_list = add_cp_to_invlist(remove_list, i);
11442 }
2fde50e1
KW
11443 }
11444 else {
a9746a27
KW
11445 ANYOF_BITMAP_SET(ret, i);
11446 prevvalue = value;
11447 value = i;
2fde50e1
KW
11448 }
11449 }
dbe7a391
KW
11450
11451 /* And do the removal */
4004d578
KW
11452 if (DEPENDS_SEMANTICS) {
11453 if (remove_list) {
11454 _invlist_subtract(nonbitmap, remove_list, &nonbitmap);
11455 SvREFCNT_dec(remove_list);
11456 }
11457 }
11458 else {
11459 /* There is no overlap for non-/d, so just delete anything
11460 * below 256 */
a3e1f3a6 11461 _invlist_intersection(nonbitmap, PL_AboveLatin1, &nonbitmap);
4004d578 11462 }
2fde50e1
KW
11463 }
11464
40c78556
KW
11465 stored = 256 - stored;
11466
2fde50e1
KW
11467 /* Clear the invert flag since have just done it here */
11468 ANYOF_FLAGS(ret) &= ~ANYOF_INVERT;
40c78556
KW
11469 }
11470
0222889f
KW
11471 /* Folding in the bitmap is taken care of above, but not for locale (for
11472 * which we have to wait to see what folding is in effect at runtime), and
d9105c95
KW
11473 * for some things not in the bitmap (only the upper latin folds in this
11474 * case, as all other single-char folding has been set above). Set
11475 * run-time fold flag for these */
11476 if (FOLD && (LOC
11477 || (DEPENDS_SEMANTICS
11478 && nonbitmap
11479 && ! (ANYOF_FLAGS(ret) & ANYOF_NONBITMAP_NON_UTF8))
11480 || unicode_alternate))
11481 {
0222889f 11482 ANYOF_FLAGS(ret) |= ANYOF_LOC_NONBITMAP_FOLD;
f56b6394
KW
11483 }
11484
2786be71
KW
11485 /* A single character class can be "optimized" into an EXACTish node.
11486 * Note that since we don't currently count how many characters there are
11487 * outside the bitmap, we are XXX missing optimization possibilities for
11488 * them. This optimization can't happen unless this is a truly single
11489 * character class, which means that it can't be an inversion into a
11490 * many-character class, and there must be no possibility of there being
11491 * things outside the bitmap. 'stored' (only) for locales doesn't include
6da63e10
KW
11492 * \w, etc, so have to make a special test that they aren't present
11493 *
11494 * Similarly A 2-character class of the very special form like [bB] can be
11495 * optimized into an EXACTFish node, but only for non-locales, and for
11496 * characters which only have the two folds; so things like 'fF' and 'Ii'
11497 * wouldn't work because they are part of the fold of 'LATIN SMALL LIGATURE
11498 * FI'. */
137165a6 11499 if (! nonbitmap
53742956 11500 && ! unicode_alternate
137165a6
KW
11501 && SvCUR(listsv) == initial_listsv_len
11502 && ! (ANYOF_FLAGS(ret) & (ANYOF_INVERT|ANYOF_UNICODE_ALL))
6da63e10
KW
11503 && (((stored == 1 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11504 || (! ANYOF_CLASS_TEST_ANY_SET(ret)))))
11505 || (stored == 2 && ((! (ANYOF_FLAGS(ret) & ANYOF_LOCALE))
11506 && (! _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value))
11507 /* If the latest code point has a fold whose
11508 * bit is set, it must be the only other one */
2dcac756 11509 && ((prevvalue = PL_fold_latin1[value]) != (IV)value)
6da63e10 11510 && ANYOF_BITMAP_TEST(ret, prevvalue)))))
2786be71
KW
11511 {
11512 /* Note that the information needed to decide to do this optimization
11513 * is not currently available until the 2nd pass, and that the actually
6da63e10
KW
11514 * used EXACTish node takes less space than the calculated ANYOF node,
11515 * and hence the amount of space calculated in the first pass is larger
2786be71
KW
11516 * than actually used, so this optimization doesn't gain us any space.
11517 * But an EXACT node is faster than an ANYOF node, and can be combined
11518 * with any adjacent EXACT nodes later by the optimizer for further
6da63e10
KW
11519 * gains. The speed of executing an EXACTF is similar to an ANYOF
11520 * node, so the optimization advantage comes from the ability to join
11521 * it to adjacent EXACT nodes */
2786be71 11522
07be1b83 11523 const char * cur_parse= RExC_parse;
6da63e10 11524 U8 op;
07be1b83
YO
11525 RExC_emit = (regnode *)orig_emit;
11526 RExC_parse = (char *)orig_parse;
2786be71 11527
6da63e10
KW
11528 if (stored == 1) {
11529
11530 /* A locale node with one point can be folded; all the other cases
11531 * with folding will have two points, since we calculate them above
11532 */
39065660 11533 if (ANYOF_FLAGS(ret) & ANYOF_LOC_NONBITMAP_FOLD) {
6da63e10
KW
11534 op = EXACTFL;
11535 }
11536 else {
11537 op = EXACT;
11538 }
b36527fc
KW
11539 }
11540 else { /* else 2 chars in the bit map: the folds of each other */
11541
11542 /* Use the folded value, which for the cases where we get here,
11543 * is just the lower case of the current one (which may resolve to
11544 * itself, or to the other one */
11545 value = toLOWER_LATIN1(value);
6da63e10 11546
bf4c00b4
KW
11547 /* To join adjacent nodes, they must be the exact EXACTish type.
11548 * Try to use the most likely type, by using EXACTFA if possible,
11549 * then EXACTFU if the regex calls for it, or is required because
11550 * the character is non-ASCII. (If <value> is ASCII, its fold is
11551 * also ASCII for the cases where we get here.) */
11552 if (MORE_ASCII_RESTRICTED && isASCII(value)) {
11553 op = EXACTFA;
11554 }
11555 else if (AT_LEAST_UNI_SEMANTICS || !isASCII(value)) {
e62862f6
KW
11556 op = EXACTFU;
11557 }
11558 else { /* Otherwise, more likely to be EXACTF type */
11559 op = EXACTF;
11560 }
b36527fc 11561 }
6da63e10
KW
11562
11563 ret = reg_node(pRExC_state, op);
07be1b83 11564 RExC_parse = (char *)cur_parse;
2786be71
KW
11565 if (UTF && ! NATIVE_IS_INVARIANT(value)) {
11566 *STRING(ret)= UTF8_EIGHT_BIT_HI((U8) value);
11567 *(STRING(ret) + 1)= UTF8_EIGHT_BIT_LO((U8) value);
11568 STR_LEN(ret)= 2;
11569 RExC_emit += STR_SZ(2);
11570 }
11571 else {
11572 *STRING(ret)= (char)value;
11573 STR_LEN(ret)= 1;
11574 RExC_emit += STR_SZ(1);
11575 }
ef8d46e8 11576 SvREFCNT_dec(listsv);
7f6f358c
YO
11577 return ret;
11578 }
ffc61ed2 11579
dc415b4a
KW
11580 /* If there is a swash and more than one element, we can't use the swash in
11581 * the optimization below. */
11582 if (swash && element_count > 1) {
11583 SvREFCNT_dec(swash);
11584 swash = NULL;
11585 }
c16787fd
KW
11586 if (! nonbitmap
11587 && SvCUR(listsv) == initial_listsv_len
11588 && ! unicode_alternate)
11589 {
137165a6
KW
11590 ARG_SET(ret, ANYOF_NONBITMAP_EMPTY);
11591 SvREFCNT_dec(listsv);
11592 SvREFCNT_dec(unicode_alternate);
11593 }
11594 else {
0bd1039c
KW
11595 /* av[0] stores the character class description in its textual form:
11596 * used later (regexec.c:Perl_regclass_swash()) to initialize the
11597 * appropriate swash, and is also useful for dumping the regnode.
11598 * av[1] if NULL, is a placeholder to later contain the swash computed
11599 * from av[0]. But if no further computation need be done, the
11600 * swash is stored there now.
11601 * av[2] stores the multicharacter foldings, used later in
11602 * regexec.c:S_reginclass().
11603 * av[3] stores the nonbitmap inversion list for use in addition or
11604 * instead of av[0]; not used if av[1] isn't NULL
11605 * av[4] is set if any component of the class is from a user-defined
11606 * property; not used if av[1] isn't NULL */
097eb12c 11607 AV * const av = newAV();
ffc61ed2 11608 SV *rv;
0bd1039c 11609
c16787fd
KW
11610 av_store(av, 0, (SvCUR(listsv) == initial_listsv_len)
11611 ? &PL_sv_undef
11612 : listsv);
88d45d28
KW
11613 if (swash) {
11614 av_store(av, 1, swash);
11615 SvREFCNT_dec(nonbitmap);
11616 }
11617 else {
11618 av_store(av, 1, NULL);
c16787fd
KW
11619 if (nonbitmap) {
11620 av_store(av, 3, nonbitmap);
dc415b4a 11621 av_store(av, 4, newSVuv(has_user_defined_property));
c16787fd 11622 }
88d45d28 11623 }
827f5bb8
KW
11624
11625 /* Store any computed multi-char folds only if we are allowing
11626 * them */
11627 if (allow_full_fold) {
7b4a7e58
KW
11628 av_store(av, 2, MUTABLE_SV(unicode_alternate));
11629 if (unicode_alternate) { /* This node is variable length */
11630 OP(ret) = ANYOFV;
11631 }
827f5bb8
KW
11632 }
11633 else {
11634 av_store(av, 2, NULL);
11635 }
ad64d0ec 11636 rv = newRV_noinc(MUTABLE_SV(av));
19860706 11637 n = add_data(pRExC_state, 1, "s");
f8fc2ecf 11638 RExC_rxi->data->data[n] = (void*)rv;
ffc61ed2 11639 ARG_SET(ret, n);
a0ed51b3 11640 }
a0ed51b3
LW
11641 return ret;
11642}
89836f1f 11643
a0ed51b3 11644
bcdf7404
YO
11645/* reg_skipcomment()
11646
11647 Absorbs an /x style # comments from the input stream.
11648 Returns true if there is more text remaining in the stream.
11649 Will set the REG_SEEN_RUN_ON_COMMENT flag if the comment
11650 terminates the pattern without including a newline.
11651
11652 Note its the callers responsibility to ensure that we are
11653 actually in /x mode
11654
11655*/
11656
11657STATIC bool
11658S_reg_skipcomment(pTHX_ RExC_state_t *pRExC_state)
11659{
11660 bool ended = 0;
7918f24d
NC
11661
11662 PERL_ARGS_ASSERT_REG_SKIPCOMMENT;
11663
bcdf7404
YO
11664 while (RExC_parse < RExC_end)
11665 if (*RExC_parse++ == '\n') {
11666 ended = 1;
11667 break;
11668 }
11669 if (!ended) {
11670 /* we ran off the end of the pattern without ending
11671 the comment, so we have to add an \n when wrapping */
11672 RExC_seen |= REG_SEEN_RUN_ON_COMMENT;
11673 return 0;
11674 } else
11675 return 1;
11676}
11677
11678/* nextchar()
11679
3b753521 11680 Advances the parse position, and optionally absorbs
bcdf7404
YO
11681 "whitespace" from the inputstream.
11682
11683 Without /x "whitespace" means (?#...) style comments only,
11684 with /x this means (?#...) and # comments and whitespace proper.
11685
11686 Returns the RExC_parse point from BEFORE the scan occurs.
11687
11688 This is the /x friendly way of saying RExC_parse++.
11689*/
11690
76e3520e 11691STATIC char*
830247a4 11692S_nextchar(pTHX_ RExC_state_t *pRExC_state)
a0d0e21e 11693{
097eb12c 11694 char* const retval = RExC_parse++;
a0d0e21e 11695
7918f24d
NC
11696 PERL_ARGS_ASSERT_NEXTCHAR;
11697
4633a7c4 11698 for (;;) {
d224c965
KW
11699 if (RExC_end - RExC_parse >= 3
11700 && *RExC_parse == '('
11701 && RExC_parse[1] == '?'
11702 && RExC_parse[2] == '#')
11703 {
e994fd66
AE
11704 while (*RExC_parse != ')') {
11705 if (RExC_parse == RExC_end)
11706 FAIL("Sequence (?#... not terminated");
830247a4 11707 RExC_parse++;
e994fd66 11708 }
830247a4 11709 RExC_parse++;
4633a7c4
LW
11710 continue;
11711 }
bbe252da 11712 if (RExC_flags & RXf_PMf_EXTENDED) {
830247a4
IZ
11713 if (isSPACE(*RExC_parse)) {
11714 RExC_parse++;
748a9306
LW
11715 continue;
11716 }
830247a4 11717 else if (*RExC_parse == '#') {
bcdf7404
YO
11718 if ( reg_skipcomment( pRExC_state ) )
11719 continue;
748a9306 11720 }
748a9306 11721 }
4633a7c4 11722 return retval;
a0d0e21e 11723 }
a687059c
LW
11724}
11725
11726/*
c277df42 11727- reg_node - emit a node
a0d0e21e 11728*/
76e3520e 11729STATIC regnode * /* Location. */
830247a4 11730S_reg_node(pTHX_ RExC_state_t *pRExC_state, U8 op)
a687059c 11731{
97aff369 11732 dVAR;
c277df42 11733 register regnode *ptr;
504618e9 11734 regnode * const ret = RExC_emit;
07be1b83 11735 GET_RE_DEBUG_FLAGS_DECL;
a687059c 11736
7918f24d
NC
11737 PERL_ARGS_ASSERT_REG_NODE;
11738
c277df42 11739 if (SIZE_ONLY) {
830247a4
IZ
11740 SIZE_ALIGN(RExC_size);
11741 RExC_size += 1;
a0d0e21e
LW
11742 return(ret);
11743 }
3b57cd43 11744 if (RExC_emit >= RExC_emit_bound)
5637ef5b
NC
11745 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11746 op, RExC_emit, RExC_emit_bound);
3b57cd43 11747
c277df42 11748 NODE_ALIGN_FILL(ret);
a0d0e21e 11749 ptr = ret;
c277df42 11750 FILL_ADVANCE_NODE(ptr, op);
7122b237 11751#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 11752 if (RExC_offsets) { /* MJD */
07be1b83 11753 MJD_OFFSET_DEBUG(("%s:%d: (op %s) %s %"UVuf" (len %"UVuf") (max %"UVuf").\n",
fac92740 11754 "reg_node", __LINE__,
13d6edb4 11755 PL_reg_name[op],
07be1b83
YO
11756 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0]
11757 ? "Overwriting end of array!\n" : "OK",
11758 (UV)(RExC_emit - RExC_emit_start),
11759 (UV)(RExC_parse - RExC_start),
11760 (UV)RExC_offsets[0]));
ccb2c380 11761 Set_Node_Offset(RExC_emit, RExC_parse + (op == END));
fac92740 11762 }
7122b237 11763#endif
830247a4 11764 RExC_emit = ptr;
a0d0e21e 11765 return(ret);
a687059c
LW
11766}
11767
11768/*
a0d0e21e
LW
11769- reganode - emit a node with an argument
11770*/
76e3520e 11771STATIC regnode * /* Location. */
830247a4 11772S_reganode(pTHX_ RExC_state_t *pRExC_state, U8 op, U32 arg)
fe14fcc3 11773{
97aff369 11774 dVAR;
c277df42 11775 register regnode *ptr;
504618e9 11776 regnode * const ret = RExC_emit;
07be1b83 11777 GET_RE_DEBUG_FLAGS_DECL;
fe14fcc3 11778
7918f24d
NC
11779 PERL_ARGS_ASSERT_REGANODE;
11780
c277df42 11781 if (SIZE_ONLY) {
830247a4
IZ
11782 SIZE_ALIGN(RExC_size);
11783 RExC_size += 2;
6bda09f9
YO
11784 /*
11785 We can't do this:
11786
11787 assert(2==regarglen[op]+1);
686b73d4 11788
6bda09f9
YO
11789 Anything larger than this has to allocate the extra amount.
11790 If we changed this to be:
11791
11792 RExC_size += (1 + regarglen[op]);
11793
11794 then it wouldn't matter. Its not clear what side effect
11795 might come from that so its not done so far.
11796 -- dmq
11797 */
a0d0e21e
LW
11798 return(ret);
11799 }
3b57cd43 11800 if (RExC_emit >= RExC_emit_bound)
5637ef5b
NC
11801 Perl_croak(aTHX_ "panic: reg_node overrun trying to emit %d, %p>=%p",
11802 op, RExC_emit, RExC_emit_bound);
3b57cd43 11803
c277df42 11804 NODE_ALIGN_FILL(ret);
a0d0e21e 11805 ptr = ret;
c277df42 11806 FILL_ADVANCE_NODE_ARG(ptr, op, arg);
7122b237 11807#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 11808 if (RExC_offsets) { /* MJD */
07be1b83 11809 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 11810 "reganode",
ccb2c380 11811 __LINE__,
13d6edb4 11812 PL_reg_name[op],
07be1b83 11813 (UV)(RExC_emit - RExC_emit_start) > RExC_offsets[0] ?
fac92740 11814 "Overwriting end of array!\n" : "OK",
07be1b83
YO
11815 (UV)(RExC_emit - RExC_emit_start),
11816 (UV)(RExC_parse - RExC_start),
11817 (UV)RExC_offsets[0]));
ccb2c380 11818 Set_Cur_Node_Offset;
fac92740 11819 }
7122b237 11820#endif
830247a4 11821 RExC_emit = ptr;
a0d0e21e 11822 return(ret);
fe14fcc3
LW
11823}
11824
11825/*
cd439c50 11826- reguni - emit (if appropriate) a Unicode character
a0ed51b3 11827*/
71207a34
AL
11828STATIC STRLEN
11829S_reguni(pTHX_ const RExC_state_t *pRExC_state, UV uv, char* s)
a0ed51b3 11830{
97aff369 11831 dVAR;
7918f24d
NC
11832
11833 PERL_ARGS_ASSERT_REGUNI;
11834
71207a34 11835 return SIZE_ONLY ? UNISKIP(uv) : (uvchr_to_utf8((U8*)s, uv) - (U8*)s);
a0ed51b3
LW
11836}
11837
11838/*
a0d0e21e
LW
11839- reginsert - insert an operator in front of already-emitted operand
11840*
11841* Means relocating the operand.
11842*/
76e3520e 11843STATIC void
6bda09f9 11844S_reginsert(pTHX_ RExC_state_t *pRExC_state, U8 op, regnode *opnd, U32 depth)
a687059c 11845{
97aff369 11846 dVAR;
c277df42
IZ
11847 register regnode *src;
11848 register regnode *dst;
11849 register regnode *place;
504618e9 11850 const int offset = regarglen[(U8)op];
6bda09f9 11851 const int size = NODE_STEP_REGNODE + offset;
07be1b83 11852 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
11853
11854 PERL_ARGS_ASSERT_REGINSERT;
def51078 11855 PERL_UNUSED_ARG(depth);
22c35a8c 11856/* (PL_regkind[(U8)op] == CURLY ? EXTRA_STEP_2ARGS : 0); */
13d6edb4 11857 DEBUG_PARSE_FMT("inst"," - %s",PL_reg_name[op]);
c277df42 11858 if (SIZE_ONLY) {
6bda09f9 11859 RExC_size += size;
a0d0e21e
LW
11860 return;
11861 }
a687059c 11862
830247a4 11863 src = RExC_emit;
6bda09f9 11864 RExC_emit += size;
830247a4 11865 dst = RExC_emit;
40d049e4 11866 if (RExC_open_parens) {
6bda09f9 11867 int paren;
3b57cd43 11868 /*DEBUG_PARSE_FMT("inst"," - %"IVdf, (IV)RExC_npar);*/
6bda09f9 11869 for ( paren=0 ; paren < RExC_npar ; paren++ ) {
40d049e4 11870 if ( RExC_open_parens[paren] >= opnd ) {
3b57cd43 11871 /*DEBUG_PARSE_FMT("open"," - %d",size);*/
40d049e4
YO
11872 RExC_open_parens[paren] += size;
11873 } else {
3b57cd43 11874 /*DEBUG_PARSE_FMT("open"," - %s","ok");*/
40d049e4
YO
11875 }
11876 if ( RExC_close_parens[paren] >= opnd ) {
3b57cd43 11877 /*DEBUG_PARSE_FMT("close"," - %d",size);*/
40d049e4
YO
11878 RExC_close_parens[paren] += size;
11879 } else {
3b57cd43 11880 /*DEBUG_PARSE_FMT("close"," - %s","ok");*/
40d049e4
YO
11881 }
11882 }
6bda09f9 11883 }
40d049e4 11884
fac92740 11885 while (src > opnd) {
c277df42 11886 StructCopy(--src, --dst, regnode);
7122b237 11887#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 11888 if (RExC_offsets) { /* MJD 20010112 */
07be1b83 11889 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s copy %"UVuf" -> %"UVuf" (max %"UVuf").\n",
fac92740 11890 "reg_insert",
ccb2c380 11891 __LINE__,
13d6edb4 11892 PL_reg_name[op],
07be1b83
YO
11893 (UV)(dst - RExC_emit_start) > RExC_offsets[0]
11894 ? "Overwriting end of array!\n" : "OK",
11895 (UV)(src - RExC_emit_start),
11896 (UV)(dst - RExC_emit_start),
11897 (UV)RExC_offsets[0]));
ccb2c380
MP
11898 Set_Node_Offset_To_R(dst-RExC_emit_start, Node_Offset(src));
11899 Set_Node_Length_To_R(dst-RExC_emit_start, Node_Length(src));
fac92740 11900 }
7122b237 11901#endif
fac92740
MJD
11902 }
11903
a0d0e21e
LW
11904
11905 place = opnd; /* Op node, where operand used to be. */
7122b237 11906#ifdef RE_TRACK_PATTERN_OFFSETS
fac92740 11907 if (RExC_offsets) { /* MJD */
07be1b83 11908 MJD_OFFSET_DEBUG(("%s(%d): (op %s) %s %"UVuf" <- %"UVuf" (max %"UVuf").\n",
fac92740 11909 "reginsert",
ccb2c380 11910 __LINE__,
13d6edb4 11911 PL_reg_name[op],
07be1b83 11912 (UV)(place - RExC_emit_start) > RExC_offsets[0]
fac92740 11913 ? "Overwriting end of array!\n" : "OK",
07be1b83
YO
11914 (UV)(place - RExC_emit_start),
11915 (UV)(RExC_parse - RExC_start),
786e8c11 11916 (UV)RExC_offsets[0]));
ccb2c380 11917 Set_Node_Offset(place, RExC_parse);
45948336 11918 Set_Node_Length(place, 1);
fac92740 11919 }
7122b237 11920#endif
c277df42
IZ
11921 src = NEXTOPER(place);
11922 FILL_ADVANCE_NODE(place, op);
11923 Zero(src, offset, regnode);
a687059c
LW
11924}
11925
11926/*
c277df42 11927- regtail - set the next-pointer at the end of a node chain of p to val.
3dab1dad 11928- SEE ALSO: regtail_study
a0d0e21e 11929*/
097eb12c 11930/* TODO: All three parms should be const */
76e3520e 11931STATIC void
3dab1dad 11932S_regtail(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
a687059c 11933{
97aff369 11934 dVAR;
c277df42 11935 register regnode *scan;
72f13be8 11936 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
11937
11938 PERL_ARGS_ASSERT_REGTAIL;
f9049ba1
SP
11939#ifndef DEBUGGING
11940 PERL_UNUSED_ARG(depth);
11941#endif
a0d0e21e 11942
c277df42 11943 if (SIZE_ONLY)
a0d0e21e
LW
11944 return;
11945
11946 /* Find last node. */
11947 scan = p;
11948 for (;;) {
504618e9 11949 regnode * const temp = regnext(scan);
3dab1dad
YO
11950 DEBUG_PARSE_r({
11951 SV * const mysv=sv_newmortal();
11952 DEBUG_PARSE_MSG((scan==p ? "tail" : ""));
11953 regprop(RExC_rx, mysv, scan);
eaf3ca90
YO
11954 PerlIO_printf(Perl_debug_log, "~ %s (%d) %s %s\n",
11955 SvPV_nolen_const(mysv), REG_NODE_NUM(scan),
11956 (temp == NULL ? "->" : ""),
13d6edb4 11957 (temp == NULL ? PL_reg_name[OP(val)] : "")
eaf3ca90 11958 );
3dab1dad
YO
11959 });
11960 if (temp == NULL)
11961 break;
11962 scan = temp;
11963 }
11964
11965 if (reg_off_by_arg[OP(scan)]) {
11966 ARG_SET(scan, val - scan);
11967 }
11968 else {
11969 NEXT_OFF(scan) = val - scan;
11970 }
11971}
11972
07be1b83 11973#ifdef DEBUGGING
3dab1dad
YO
11974/*
11975- regtail_study - set the next-pointer at the end of a node chain of p to val.
11976- Look for optimizable sequences at the same time.
11977- currently only looks for EXACT chains.
07be1b83 11978
486ec47a 11979This is experimental code. The idea is to use this routine to perform
07be1b83
YO
11980in place optimizations on branches and groups as they are constructed,
11981with the long term intention of removing optimization from study_chunk so
11982that it is purely analytical.
11983
11984Currently only used when in DEBUG mode. The macro REGTAIL_STUDY() is used
11985to control which is which.
11986
3dab1dad
YO
11987*/
11988/* TODO: All four parms should be const */
07be1b83 11989
3dab1dad
YO
11990STATIC U8
11991S_regtail_study(pTHX_ RExC_state_t *pRExC_state, regnode *p, const regnode *val,U32 depth)
11992{
11993 dVAR;
11994 register regnode *scan;
07be1b83
YO
11995 U8 exact = PSEUDO;
11996#ifdef EXPERIMENTAL_INPLACESCAN
11997 I32 min = 0;
11998#endif
3dab1dad
YO
11999 GET_RE_DEBUG_FLAGS_DECL;
12000
7918f24d
NC
12001 PERL_ARGS_ASSERT_REGTAIL_STUDY;
12002
07be1b83 12003
3dab1dad
YO
12004 if (SIZE_ONLY)
12005 return exact;
12006
12007 /* Find last node. */
12008
12009 scan = p;
12010 for (;;) {
12011 regnode * const temp = regnext(scan);
07be1b83 12012#ifdef EXPERIMENTAL_INPLACESCAN
f758bddf
KW
12013 if (PL_regkind[OP(scan)] == EXACT) {
12014 bool has_exactf_sharp_s; /* Unexamined in this routine */
12015 if (join_exact(pRExC_state,scan,&min, &has_exactf_sharp_s, 1,val,depth+1))
07be1b83 12016 return EXACT;
f758bddf 12017 }
07be1b83 12018#endif
3dab1dad
YO
12019 if ( exact ) {
12020 switch (OP(scan)) {
12021 case EXACT:
12022 case EXACTF:
2f7f8cb1 12023 case EXACTFA:
2c2b7f86 12024 case EXACTFU:
3c760661 12025 case EXACTFU_SS:
ca600955 12026 case EXACTFU_NO_TRIE:
3dab1dad
YO
12027 case EXACTFL:
12028 if( exact == PSEUDO )
12029 exact= OP(scan);
07be1b83
YO
12030 else if ( exact != OP(scan) )
12031 exact= 0;
3dab1dad
YO
12032 case NOTHING:
12033 break;
12034 default:
12035 exact= 0;
12036 }
12037 }
12038 DEBUG_PARSE_r({
12039 SV * const mysv=sv_newmortal();
12040 DEBUG_PARSE_MSG((scan==p ? "tsdy" : ""));
12041 regprop(RExC_rx, mysv, scan);
eaf3ca90 12042 PerlIO_printf(Perl_debug_log, "~ %s (%d) -> %s\n",
3dab1dad 12043 SvPV_nolen_const(mysv),
eaf3ca90 12044 REG_NODE_NUM(scan),
13d6edb4 12045 PL_reg_name[exact]);
3dab1dad 12046 });
a0d0e21e
LW
12047 if (temp == NULL)
12048 break;
12049 scan = temp;
12050 }
07be1b83
YO
12051 DEBUG_PARSE_r({
12052 SV * const mysv_val=sv_newmortal();
12053 DEBUG_PARSE_MSG("");
12054 regprop(RExC_rx, mysv_val, val);
70685ca0
JH
12055 PerlIO_printf(Perl_debug_log, "~ attach to %s (%"IVdf") offset to %"IVdf"\n",
12056 SvPV_nolen_const(mysv_val),
12057 (IV)REG_NODE_NUM(val),
12058 (IV)(val - scan)
07be1b83
YO
12059 );
12060 });
c277df42
IZ
12061 if (reg_off_by_arg[OP(scan)]) {
12062 ARG_SET(scan, val - scan);
a0ed51b3
LW
12063 }
12064 else {
c277df42
IZ
12065 NEXT_OFF(scan) = val - scan;
12066 }
3dab1dad
YO
12067
12068 return exact;
a687059c 12069}
07be1b83 12070#endif
a687059c
LW
12071
12072/*
fd181c75 12073 - regdump - dump a regexp onto Perl_debug_log in vaguely comprehensible form
a687059c 12074 */
f7819f85 12075#ifdef DEBUGGING
c33269f7 12076static void
7918f24d
NC
12077S_regdump_extflags(pTHX_ const char *lead, const U32 flags)
12078{
f7819f85
A
12079 int bit;
12080 int set=0;
a62b1201 12081 regex_charset cs;
7918f24d 12082
f7819f85
A
12083 for (bit=0; bit<32; bit++) {
12084 if (flags & (1<<bit)) {
a62b1201
KW
12085 if ((1<<bit) & RXf_PMf_CHARSET) { /* Output separately, below */
12086 continue;
12087 }
f7819f85
A
12088 if (!set++ && lead)
12089 PerlIO_printf(Perl_debug_log, "%s",lead);
12090 PerlIO_printf(Perl_debug_log, "%s ",PL_reg_extflags_name[bit]);
12091 }
12092 }
a62b1201
KW
12093 if ((cs = get_regex_charset(flags)) != REGEX_DEPENDS_CHARSET) {
12094 if (!set++ && lead) {
12095 PerlIO_printf(Perl_debug_log, "%s",lead);
12096 }
12097 switch (cs) {
12098 case REGEX_UNICODE_CHARSET:
12099 PerlIO_printf(Perl_debug_log, "UNICODE");
12100 break;
12101 case REGEX_LOCALE_CHARSET:
12102 PerlIO_printf(Perl_debug_log, "LOCALE");
12103 break;
cfaf538b
KW
12104 case REGEX_ASCII_RESTRICTED_CHARSET:
12105 PerlIO_printf(Perl_debug_log, "ASCII-RESTRICTED");
12106 break;
2f7f8cb1
KW
12107 case REGEX_ASCII_MORE_RESTRICTED_CHARSET:
12108 PerlIO_printf(Perl_debug_log, "ASCII-MORE_RESTRICTED");
12109 break;
a62b1201
KW
12110 default:
12111 PerlIO_printf(Perl_debug_log, "UNKNOWN CHARACTER SET");
12112 break;
12113 }
12114 }
f7819f85
A
12115 if (lead) {
12116 if (set)
12117 PerlIO_printf(Perl_debug_log, "\n");
12118 else
12119 PerlIO_printf(Perl_debug_log, "%s[none-set]\n",lead);
12120 }
12121}
12122#endif
12123
a687059c 12124void
097eb12c 12125Perl_regdump(pTHX_ const regexp *r)
a687059c 12126{
35ff7856 12127#ifdef DEBUGGING
97aff369 12128 dVAR;
c445ea15 12129 SV * const sv = sv_newmortal();
ab3bbdeb 12130 SV *dsv= sv_newmortal();
f8fc2ecf 12131 RXi_GET_DECL(r,ri);
f7819f85 12132 GET_RE_DEBUG_FLAGS_DECL;
a687059c 12133
7918f24d
NC
12134 PERL_ARGS_ASSERT_REGDUMP;
12135
f8fc2ecf 12136 (void)dumpuntil(r, ri->program, ri->program + 1, NULL, NULL, sv, 0, 0);
a0d0e21e
LW
12137
12138 /* Header fields of interest. */
ab3bbdeb
YO
12139 if (r->anchored_substr) {
12140 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->anchored_substr),
12141 RE_SV_DUMPLEN(r->anchored_substr), 30);
7b0972df 12142 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
12143 "anchored %s%s at %"IVdf" ",
12144 s, RE_SV_TAIL(r->anchored_substr),
7b0972df 12145 (IV)r->anchored_offset);
ab3bbdeb
YO
12146 } else if (r->anchored_utf8) {
12147 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->anchored_utf8),
12148 RE_SV_DUMPLEN(r->anchored_utf8), 30);
33b8afdf 12149 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
12150 "anchored utf8 %s%s at %"IVdf" ",
12151 s, RE_SV_TAIL(r->anchored_utf8),
33b8afdf 12152 (IV)r->anchored_offset);
ab3bbdeb
YO
12153 }
12154 if (r->float_substr) {
12155 RE_PV_QUOTED_DECL(s, 0, dsv, SvPVX_const(r->float_substr),
12156 RE_SV_DUMPLEN(r->float_substr), 30);
7b0972df 12157 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
12158 "floating %s%s at %"IVdf"..%"UVuf" ",
12159 s, RE_SV_TAIL(r->float_substr),
7b0972df 12160 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb
YO
12161 } else if (r->float_utf8) {
12162 RE_PV_QUOTED_DECL(s, 1, dsv, SvPVX_const(r->float_utf8),
12163 RE_SV_DUMPLEN(r->float_utf8), 30);
33b8afdf 12164 PerlIO_printf(Perl_debug_log,
ab3bbdeb
YO
12165 "floating utf8 %s%s at %"IVdf"..%"UVuf" ",
12166 s, RE_SV_TAIL(r->float_utf8),
33b8afdf 12167 (IV)r->float_min_offset, (UV)r->float_max_offset);
ab3bbdeb 12168 }
33b8afdf 12169 if (r->check_substr || r->check_utf8)
b81d288d 12170 PerlIO_printf(Perl_debug_log,
10edeb5d
JH
12171 (const char *)
12172 (r->check_substr == r->float_substr
12173 && r->check_utf8 == r->float_utf8
12174 ? "(checking floating" : "(checking anchored"));
bbe252da 12175 if (r->extflags & RXf_NOSCAN)
c277df42 12176 PerlIO_printf(Perl_debug_log, " noscan");
bbe252da 12177 if (r->extflags & RXf_CHECK_ALL)
c277df42 12178 PerlIO_printf(Perl_debug_log, " isall");
33b8afdf 12179 if (r->check_substr || r->check_utf8)
c277df42
IZ
12180 PerlIO_printf(Perl_debug_log, ") ");
12181
f8fc2ecf
YO
12182 if (ri->regstclass) {
12183 regprop(r, sv, ri->regstclass);
1de06328 12184 PerlIO_printf(Perl_debug_log, "stclass %s ", SvPVX_const(sv));
46fc3d4c 12185 }
bbe252da 12186 if (r->extflags & RXf_ANCH) {
774d564b 12187 PerlIO_printf(Perl_debug_log, "anchored");
bbe252da 12188 if (r->extflags & RXf_ANCH_BOL)
774d564b 12189 PerlIO_printf(Perl_debug_log, "(BOL)");
bbe252da 12190 if (r->extflags & RXf_ANCH_MBOL)
c277df42 12191 PerlIO_printf(Perl_debug_log, "(MBOL)");
bbe252da 12192 if (r->extflags & RXf_ANCH_SBOL)
cad2e5aa 12193 PerlIO_printf(Perl_debug_log, "(SBOL)");
bbe252da 12194 if (r->extflags & RXf_ANCH_GPOS)
774d564b 12195 PerlIO_printf(Perl_debug_log, "(GPOS)");
12196 PerlIO_putc(Perl_debug_log, ' ');
12197 }
bbe252da 12198 if (r->extflags & RXf_GPOS_SEEN)
70685ca0 12199 PerlIO_printf(Perl_debug_log, "GPOS:%"UVuf" ", (UV)r->gofs);
bbe252da 12200 if (r->intflags & PREGf_SKIP)
760ac839 12201 PerlIO_printf(Perl_debug_log, "plus ");
bbe252da 12202 if (r->intflags & PREGf_IMPLICIT)
760ac839 12203 PerlIO_printf(Perl_debug_log, "implicit ");
70685ca0 12204 PerlIO_printf(Perl_debug_log, "minlen %"IVdf" ", (IV)r->minlen);
bbe252da 12205 if (r->extflags & RXf_EVAL_SEEN)
ce862d02 12206 PerlIO_printf(Perl_debug_log, "with eval ");
760ac839 12207 PerlIO_printf(Perl_debug_log, "\n");
f7819f85 12208 DEBUG_FLAGS_r(regdump_extflags("r->extflags: ",r->extflags));
65e66c80 12209#else
7918f24d 12210 PERL_ARGS_ASSERT_REGDUMP;
96a5add6 12211 PERL_UNUSED_CONTEXT;
65e66c80 12212 PERL_UNUSED_ARG(r);
17c3b450 12213#endif /* DEBUGGING */
a687059c
LW
12214}
12215
12216/*
a0d0e21e
LW
12217- regprop - printable representation of opcode
12218*/
3339dfd8
YO
12219#define EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags) \
12220STMT_START { \
12221 if (do_sep) { \
12222 Perl_sv_catpvf(aTHX_ sv,"%s][%s",PL_colors[1],PL_colors[0]); \
12223 if (flags & ANYOF_INVERT) \
12224 /*make sure the invert info is in each */ \
12225 sv_catpvs(sv, "^"); \
12226 do_sep = 0; \
12227 } \
12228} STMT_END
12229
46fc3d4c 12230void
32fc9b6a 12231Perl_regprop(pTHX_ const regexp *prog, SV *sv, const regnode *o)
a687059c 12232{
35ff7856 12233#ifdef DEBUGGING
97aff369 12234 dVAR;
9b155405 12235 register int k;
f8fc2ecf 12236 RXi_GET_DECL(prog,progi);
1de06328 12237 GET_RE_DEBUG_FLAGS_DECL;
f8fc2ecf 12238
7918f24d 12239 PERL_ARGS_ASSERT_REGPROP;
a0d0e21e 12240
76f68e9b 12241 sv_setpvs(sv, "");
8aa23a47 12242
03363afd 12243 if (OP(o) > REGNODE_MAX) /* regnode.type is unsigned */
830247a4
IZ
12244 /* It would be nice to FAIL() here, but this may be called from
12245 regexec.c, and it would be hard to supply pRExC_state. */
a5ca303d 12246 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(o), (int)REGNODE_MAX);
13d6edb4 12247 sv_catpv(sv, PL_reg_name[OP(o)]); /* Take off const! */
9b155405 12248
3dab1dad 12249 k = PL_regkind[OP(o)];
9b155405 12250
2a782b5b 12251 if (k == EXACT) {
f92a2122 12252 sv_catpvs(sv, " ");
ab3bbdeb
YO
12253 /* Using is_utf8_string() (via PERL_PV_UNI_DETECT)
12254 * is a crude hack but it may be the best for now since
12255 * we have no flag "this EXACTish node was UTF-8"
12256 * --jhi */
f92a2122
NC
12257 pv_pretty(sv, STRING(o), STR_LEN(o), 60, PL_colors[0], PL_colors[1],
12258 PERL_PV_ESCAPE_UNI_DETECT |
c89df6cf 12259 PERL_PV_ESCAPE_NONASCII |
f92a2122
NC
12260 PERL_PV_PRETTY_ELLIPSES |
12261 PERL_PV_PRETTY_LTGT |
12262 PERL_PV_PRETTY_NOCLEAR
12263 );
bb263b4e 12264 } else if (k == TRIE) {
3dab1dad 12265 /* print the details of the trie in dumpuntil instead, as
f8fc2ecf 12266 * progi->data isn't available here */
1de06328 12267 const char op = OP(o);
647f639f 12268 const U32 n = ARG(o);
1de06328 12269 const reg_ac_data * const ac = IS_TRIE_AC(op) ?
f8fc2ecf 12270 (reg_ac_data *)progi->data->data[n] :
1de06328 12271 NULL;
3251b653
NC
12272 const reg_trie_data * const trie
12273 = (reg_trie_data*)progi->data->data[!IS_TRIE_AC(op) ? n : ac->trie];
1de06328 12274
13d6edb4 12275 Perl_sv_catpvf(aTHX_ sv, "-%s",PL_reg_name[o->flags]);
1de06328
YO
12276 DEBUG_TRIE_COMPILE_r(
12277 Perl_sv_catpvf(aTHX_ sv,
12278 "<S:%"UVuf"/%"IVdf" W:%"UVuf" L:%"UVuf"/%"UVuf" C:%"UVuf"/%"UVuf">",
12279 (UV)trie->startstate,
1e2e3d02 12280 (IV)trie->statecount-1, /* -1 because of the unused 0 element */
1de06328
YO
12281 (UV)trie->wordcount,
12282 (UV)trie->minlen,
12283 (UV)trie->maxlen,
12284 (UV)TRIE_CHARCOUNT(trie),
12285 (UV)trie->uniquecharcount
12286 )
12287 );
12288 if ( IS_ANYOF_TRIE(op) || trie->bitmap ) {
12289 int i;
12290 int rangestart = -1;
f46cb337 12291 U8* bitmap = IS_ANYOF_TRIE(op) ? (U8*)ANYOF_BITMAP(o) : (U8*)TRIE_BITMAP(trie);
f3a2811a 12292 sv_catpvs(sv, "[");
1de06328
YO
12293 for (i = 0; i <= 256; i++) {
12294 if (i < 256 && BITMAP_TEST(bitmap,i)) {
12295 if (rangestart == -1)
12296 rangestart = i;
12297 } else if (rangestart != -1) {
12298 if (i <= rangestart + 3)
12299 for (; rangestart < i; rangestart++)
12300 put_byte(sv, rangestart);
12301 else {
12302 put_byte(sv, rangestart);
12303 sv_catpvs(sv, "-");
12304 put_byte(sv, i - 1);
12305 }
12306 rangestart = -1;
12307 }
12308 }
f3a2811a 12309 sv_catpvs(sv, "]");
1de06328
YO
12310 }
12311
a3621e74 12312 } else if (k == CURLY) {
cb434fcc 12313 if (OP(o) == CURLYM || OP(o) == CURLYN || OP(o) == CURLYX)
cea2e8a9
GS
12314 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* Parenth number */
12315 Perl_sv_catpvf(aTHX_ sv, " {%d,%d}", ARG1(o), ARG2(o));
a0d0e21e 12316 }
2c2d71f5
JH
12317 else if (k == WHILEM && o->flags) /* Ordinal/of */
12318 Perl_sv_catpvf(aTHX_ sv, "[%d/%d]", o->flags & 0xf, o->flags>>4);
1f1031fe 12319 else if (k == REF || k == OPEN || k == CLOSE || k == GROUPP || OP(o)==ACCEPT) {
894356b3 12320 Perl_sv_catpvf(aTHX_ sv, "%d", (int)ARG(o)); /* Parenth number */
5daac39c 12321 if ( RXp_PAREN_NAMES(prog) ) {
9d6ecd7a 12322 if ( k != REF || (OP(o) < NREF)) {
502c6561 12323 AV *list= MUTABLE_AV(progi->data->data[progi->name_list_idx]);
ee9b8eae
YO
12324 SV **name= av_fetch(list, ARG(o), 0 );
12325 if (name)
12326 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
12327 }
12328 else {
502c6561 12329 AV *list= MUTABLE_AV(progi->data->data[ progi->name_list_idx ]);
ad64d0ec 12330 SV *sv_dat= MUTABLE_SV(progi->data->data[ ARG( o ) ]);
ee9b8eae
YO
12331 I32 *nums=(I32*)SvPVX(sv_dat);
12332 SV **name= av_fetch(list, nums[0], 0 );
12333 I32 n;
12334 if (name) {
12335 for ( n=0; n<SvIVX(sv_dat); n++ ) {
12336 Perl_sv_catpvf(aTHX_ sv, "%s%"IVdf,
12337 (n ? "," : ""), (IV)nums[n]);
12338 }
12339 Perl_sv_catpvf(aTHX_ sv, " '%"SVf"'", SVfARG(*name));
1f1031fe 12340 }
1f1031fe 12341 }
ee9b8eae 12342 }
1f1031fe 12343 } else if (k == GOSUB)
6bda09f9 12344 Perl_sv_catpvf(aTHX_ sv, "%d[%+d]", (int)ARG(o),(int)ARG2L(o)); /* Paren and offset */
e2e6a0f1
YO
12345 else if (k == VERB) {
12346 if (!o->flags)
12347 Perl_sv_catpvf(aTHX_ sv, ":%"SVf,
ad64d0ec 12348 SVfARG((MUTABLE_SV(progi->data->data[ ARG( o ) ]))));
e2e6a0f1 12349 } else if (k == LOGICAL)
04ebc1ab 12350 Perl_sv_catpvf(aTHX_ sv, "[%d]", o->flags); /* 2: embedded, otherwise 1 */
653099ff
GS
12351 else if (k == ANYOF) {
12352 int i, rangestart = -1;
2d03de9c 12353 const U8 flags = ANYOF_FLAGS(o);
24d786f4 12354 int do_sep = 0;
0bd48802
AL
12355
12356 /* Should be synchronized with * ANYOF_ #xdefines in regcomp.h */
12357 static const char * const anyofs[] = {
653099ff
GS
12358 "\\w",
12359 "\\W",
12360 "\\s",
12361 "\\S",
12362 "\\d",
12363 "\\D",
12364 "[:alnum:]",
12365 "[:^alnum:]",
12366 "[:alpha:]",
12367 "[:^alpha:]",
12368 "[:ascii:]",
12369 "[:^ascii:]",
24d786f4
YO
12370 "[:cntrl:]",
12371 "[:^cntrl:]",
653099ff
GS
12372 "[:graph:]",
12373 "[:^graph:]",
12374 "[:lower:]",
12375 "[:^lower:]",
12376 "[:print:]",
12377 "[:^print:]",
12378 "[:punct:]",
12379 "[:^punct:]",
12380 "[:upper:]",
aaa51d5e 12381 "[:^upper:]",
653099ff 12382 "[:xdigit:]",
aaa51d5e
JF
12383 "[:^xdigit:]",
12384 "[:space:]",
12385 "[:^space:]",
12386 "[:blank:]",
12387 "[:^blank:]"
653099ff
GS
12388 };
12389
19860706 12390 if (flags & ANYOF_LOCALE)
396482e1 12391 sv_catpvs(sv, "{loc}");
39065660 12392 if (flags & ANYOF_LOC_NONBITMAP_FOLD)
396482e1 12393 sv_catpvs(sv, "{i}");
653099ff 12394 Perl_sv_catpvf(aTHX_ sv, "[%s", PL_colors[0]);
19860706 12395 if (flags & ANYOF_INVERT)
396482e1 12396 sv_catpvs(sv, "^");
686b73d4 12397
3339dfd8 12398 /* output what the standard cp 0-255 bitmap matches */
ffc61ed2
JH
12399 for (i = 0; i <= 256; i++) {
12400 if (i < 256 && ANYOF_BITMAP_TEST(o,i)) {
12401 if (rangestart == -1)
12402 rangestart = i;
12403 } else if (rangestart != -1) {
12404 if (i <= rangestart + 3)
12405 for (; rangestart < i; rangestart++)
653099ff 12406 put_byte(sv, rangestart);
ffc61ed2
JH
12407 else {
12408 put_byte(sv, rangestart);
396482e1 12409 sv_catpvs(sv, "-");
ffc61ed2 12410 put_byte(sv, i - 1);
653099ff 12411 }
24d786f4 12412 do_sep = 1;
ffc61ed2 12413 rangestart = -1;
653099ff 12414 }
847a199f 12415 }
3339dfd8
YO
12416
12417 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
3a15e693
KW
12418 /* output any special charclass tests (used entirely under use locale) */
12419 if (ANYOF_CLASS_TEST_ANY_SET(o))
bb7a0f54 12420 for (i = 0; i < (int)(sizeof(anyofs)/sizeof(char*)); i++)
24d786f4 12421 if (ANYOF_CLASS_TEST(o,i)) {
ffc61ed2 12422 sv_catpv(sv, anyofs[i]);
24d786f4
YO
12423 do_sep = 1;
12424 }
12425
3339dfd8
YO
12426 EMIT_ANYOF_TEST_SEPARATOR(do_sep,sv,flags);
12427
11454c59
KW
12428 if (flags & ANYOF_NON_UTF8_LATIN1_ALL) {
12429 sv_catpvs(sv, "{non-utf8-latin1-all}");
12430 }
12431
3339dfd8 12432 /* output information about the unicode matching */
ef87b810 12433 if (flags & ANYOF_UNICODE_ALL)
396482e1 12434 sv_catpvs(sv, "{unicode_all}");
137165a6 12435 else if (ANYOF_NONBITMAP(o))
ef87b810 12436 sv_catpvs(sv, "{unicode}");
f5ecd18d 12437 if (flags & ANYOF_NONBITMAP_NON_UTF8)
ef87b810 12438 sv_catpvs(sv, "{outside bitmap}");
ffc61ed2 12439
1aa9930e 12440 if (ANYOF_NONBITMAP(o)) {
dbe7a391 12441 SV *lv; /* Set if there is something outside the bit map */
32fc9b6a 12442 SV * const sw = regclass_swash(prog, o, FALSE, &lv, 0);
f1114c30
KW
12443 bool byte_output = FALSE; /* If something in the bitmap has been
12444 output */
686b73d4 12445
c16787fd 12446 if (lv && lv != &PL_sv_undef) {
ffc61ed2 12447 if (sw) {
89ebb4a3 12448 U8 s[UTF8_MAXBYTES_CASE+1];
24d786f4 12449
dbe7a391 12450 for (i = 0; i <= 256; i++) { /* Look at chars in bitmap */
1df70142 12451 uvchr_to_utf8(s, i);
686b73d4 12452
dcf8909a
KW
12453 if (i < 256
12454 && ! ANYOF_BITMAP_TEST(o, i) /* Don't duplicate
12455 things already
12456 output as part
12457 of the bitmap */
12458 && swash_fetch(sw, s, TRUE))
12459 {
ffc61ed2
JH
12460 if (rangestart == -1)
12461 rangestart = i;
12462 } else if (rangestart != -1) {
f1114c30 12463 byte_output = TRUE;
ffc61ed2
JH
12464 if (i <= rangestart + 3)
12465 for (; rangestart < i; rangestart++) {
7128c099 12466 put_byte(sv, rangestart);
ffc61ed2
JH
12467 }
12468 else {
7128c099 12469 put_byte(sv, rangestart);
396482e1 12470 sv_catpvs(sv, "-");
7128c099 12471 put_byte(sv, i-1);
ffc61ed2 12472 }
e87973a9 12473 rangestart = -1;
19860706 12474 }
e87973a9 12475 }
19860706 12476 }
fde631ed 12477
ffc61ed2 12478 {
2e0de35c 12479 char *s = savesvpv(lv);
c445ea15 12480 char * const origs = s;
686b73d4 12481
3dab1dad
YO
12482 while (*s && *s != '\n')
12483 s++;
686b73d4 12484
ffc61ed2 12485 if (*s == '\n') {
2d03de9c 12486 const char * const t = ++s;
686b73d4 12487
f1114c30
KW
12488 if (byte_output) {
12489 sv_catpvs(sv, " ");
12490 }
12491
ffc61ed2 12492 while (*s) {
c574ffb9
KW
12493 if (*s == '\n') {
12494
12495 /* Truncate very long output */
12496 if (s - origs > 256) {
12497 Perl_sv_catpvf(aTHX_ sv,
12498 "%.*s...",
12499 (int) (s - origs - 1),
12500 t);
12501 goto out_dump;
12502 }
ffc61ed2 12503 *s = ' ';
1a9c8476
KW
12504 }
12505 else if (*s == '\t') {
12506 *s = '-';
12507 }
ffc61ed2
JH
12508 s++;
12509 }
12510 if (s[-1] == ' ')
12511 s[-1] = 0;
686b73d4 12512
ffc61ed2 12513 sv_catpv(sv, t);
fde631ed 12514 }
686b73d4 12515
c574ffb9
KW
12516 out_dump:
12517
ffc61ed2 12518 Safefree(origs);
fde631ed 12519 }
c16787fd 12520 SvREFCNT_dec(lv);
fde631ed 12521 }
653099ff 12522 }
ffc61ed2 12523
653099ff
GS
12524 Perl_sv_catpvf(aTHX_ sv, "%s]", PL_colors[1]);
12525 }
9b155405 12526 else if (k == BRANCHJ && (OP(o) == UNLESSM || OP(o) == IFMATCH))
07be1b83 12527 Perl_sv_catpvf(aTHX_ sv, "[%d]", -(o->flags));
65e66c80 12528#else
96a5add6 12529 PERL_UNUSED_CONTEXT;
65e66c80
SP
12530 PERL_UNUSED_ARG(sv);
12531 PERL_UNUSED_ARG(o);
f9049ba1 12532 PERL_UNUSED_ARG(prog);
17c3b450 12533#endif /* DEBUGGING */
35ff7856 12534}
a687059c 12535
cad2e5aa 12536SV *
288b8c02 12537Perl_re_intuit_string(pTHX_ REGEXP * const r)
cad2e5aa 12538{ /* Assume that RE_INTUIT is set */
97aff369 12539 dVAR;
288b8c02 12540 struct regexp *const prog = (struct regexp *)SvANY(r);
a3621e74 12541 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
12542
12543 PERL_ARGS_ASSERT_RE_INTUIT_STRING;
96a5add6
AL
12544 PERL_UNUSED_CONTEXT;
12545
a3621e74 12546 DEBUG_COMPILE_r(
cfd0369c 12547 {
2d03de9c 12548 const char * const s = SvPV_nolen_const(prog->check_substr
cfd0369c 12549 ? prog->check_substr : prog->check_utf8);
cad2e5aa
JH
12550
12551 if (!PL_colorset) reginitcolors();
12552 PerlIO_printf(Perl_debug_log,
a0288114 12553 "%sUsing REx %ssubstr:%s \"%s%.60s%s%s\"\n",
33b8afdf
JH
12554 PL_colors[4],
12555 prog->check_substr ? "" : "utf8 ",
12556 PL_colors[5],PL_colors[0],
cad2e5aa
JH
12557 s,
12558 PL_colors[1],
12559 (strlen(s) > 60 ? "..." : ""));
12560 } );
12561
33b8afdf 12562 return prog->check_substr ? prog->check_substr : prog->check_utf8;
cad2e5aa
JH
12563}
12564
84da74a7 12565/*
f8149455 12566 pregfree()
84da74a7 12567
f8149455
YO
12568 handles refcounting and freeing the perl core regexp structure. When
12569 it is necessary to actually free the structure the first thing it
3b753521 12570 does is call the 'free' method of the regexp_engine associated to
f8149455
YO
12571 the regexp, allowing the handling of the void *pprivate; member
12572 first. (This routine is not overridable by extensions, which is why
12573 the extensions free is called first.)
12574
12575 See regdupe and regdupe_internal if you change anything here.
84da74a7 12576*/
f8149455 12577#ifndef PERL_IN_XSUB_RE
2b69d0c2 12578void
84679df5 12579Perl_pregfree(pTHX_ REGEXP *r)
a687059c 12580{
288b8c02
NC
12581 SvREFCNT_dec(r);
12582}
12583
12584void
12585Perl_pregfree2(pTHX_ REGEXP *rx)
12586{
27da23d5 12587 dVAR;
288b8c02 12588 struct regexp *const r = (struct regexp *)SvANY(rx);
fc32ee4a 12589 GET_RE_DEBUG_FLAGS_DECL;
a3621e74 12590
7918f24d
NC
12591 PERL_ARGS_ASSERT_PREGFREE2;
12592
28d8d7f4
YO
12593 if (r->mother_re) {
12594 ReREFCNT_dec(r->mother_re);
12595 } else {
288b8c02 12596 CALLREGFREE_PVT(rx); /* free the private data */
ef8d46e8 12597 SvREFCNT_dec(RXp_PAREN_NAMES(r));
28d8d7f4
YO
12598 }
12599 if (r->substrs) {
ef8d46e8
VP
12600 SvREFCNT_dec(r->anchored_substr);
12601 SvREFCNT_dec(r->anchored_utf8);
12602 SvREFCNT_dec(r->float_substr);
12603 SvREFCNT_dec(r->float_utf8);
28d8d7f4
YO
12604 Safefree(r->substrs);
12605 }
288b8c02 12606 RX_MATCH_COPY_FREE(rx);
f8c7b90f 12607#ifdef PERL_OLD_COPY_ON_WRITE
ef8d46e8 12608 SvREFCNT_dec(r->saved_copy);
ed252734 12609#endif
f0ab9afb 12610 Safefree(r->offs);
f8149455 12611}
28d8d7f4
YO
12612
12613/* reg_temp_copy()
12614
12615 This is a hacky workaround to the structural issue of match results
12616 being stored in the regexp structure which is in turn stored in
12617 PL_curpm/PL_reg_curpm. The problem is that due to qr// the pattern
12618 could be PL_curpm in multiple contexts, and could require multiple
12619 result sets being associated with the pattern simultaneously, such
12620 as when doing a recursive match with (??{$qr})
12621
12622 The solution is to make a lightweight copy of the regexp structure
12623 when a qr// is returned from the code executed by (??{$qr}) this
486ec47a 12624 lightweight copy doesn't actually own any of its data except for
28d8d7f4
YO
12625 the starp/end and the actual regexp structure itself.
12626
12627*/
12628
12629
84679df5 12630REGEXP *
f0826785 12631Perl_reg_temp_copy (pTHX_ REGEXP *ret_x, REGEXP *rx)
7918f24d 12632{
f0826785 12633 struct regexp *ret;
288b8c02 12634 struct regexp *const r = (struct regexp *)SvANY(rx);
28d8d7f4 12635 register const I32 npar = r->nparens+1;
7918f24d
NC
12636
12637 PERL_ARGS_ASSERT_REG_TEMP_COPY;
12638
f0826785
BM
12639 if (!ret_x)
12640 ret_x = (REGEXP*) newSV_type(SVt_REGEXP);
12641 ret = (struct regexp *)SvANY(ret_x);
12642
288b8c02 12643 (void)ReREFCNT_inc(rx);
f7c278bf
NC
12644 /* We can take advantage of the existing "copied buffer" mechanism in SVs
12645 by pointing directly at the buffer, but flagging that the allocated
12646 space in the copy is zero. As we've just done a struct copy, it's now
12647 a case of zero-ing that, rather than copying the current length. */
12648 SvPV_set(ret_x, RX_WRAPPED(rx));
8f6ae13c 12649 SvFLAGS(ret_x) |= SvFLAGS(rx) & (SVf_POK|SVp_POK|SVf_UTF8);
b6f60916
NC
12650 memcpy(&(ret->xpv_cur), &(r->xpv_cur),
12651 sizeof(regexp) - STRUCT_OFFSET(regexp, xpv_cur));
f7c278bf 12652 SvLEN_set(ret_x, 0);
b9ad13ac 12653 SvSTASH_set(ret_x, NULL);
703c388d 12654 SvMAGIC_set(ret_x, NULL);
f0ab9afb
NC
12655 Newx(ret->offs, npar, regexp_paren_pair);
12656 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
28d8d7f4 12657 if (r->substrs) {
28d8d7f4 12658 Newx(ret->substrs, 1, struct reg_substr_data);
6ab65676
NC
12659 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12660
12661 SvREFCNT_inc_void(ret->anchored_substr);
12662 SvREFCNT_inc_void(ret->anchored_utf8);
12663 SvREFCNT_inc_void(ret->float_substr);
12664 SvREFCNT_inc_void(ret->float_utf8);
12665
12666 /* check_substr and check_utf8, if non-NULL, point to either their
12667 anchored or float namesakes, and don't hold a second reference. */
486913e4 12668 }
288b8c02 12669 RX_MATCH_COPIED_off(ret_x);
28d8d7f4 12670#ifdef PERL_OLD_COPY_ON_WRITE
b89b0c6f 12671 ret->saved_copy = NULL;
28d8d7f4 12672#endif
288b8c02 12673 ret->mother_re = rx;
28d8d7f4 12674
288b8c02 12675 return ret_x;
28d8d7f4 12676}
f8149455
YO
12677#endif
12678
12679/* regfree_internal()
12680
12681 Free the private data in a regexp. This is overloadable by
12682 extensions. Perl takes care of the regexp structure in pregfree(),
3b753521 12683 this covers the *pprivate pointer which technically perl doesn't
f8149455
YO
12684 know about, however of course we have to handle the
12685 regexp_internal structure when no extension is in use.
12686
12687 Note this is called before freeing anything in the regexp
12688 structure.
12689 */
12690
12691void
288b8c02 12692Perl_regfree_internal(pTHX_ REGEXP * const rx)
f8149455
YO
12693{
12694 dVAR;
288b8c02 12695 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455
YO
12696 RXi_GET_DECL(r,ri);
12697 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
12698
12699 PERL_ARGS_ASSERT_REGFREE_INTERNAL;
12700
f8149455
YO
12701 DEBUG_COMPILE_r({
12702 if (!PL_colorset)
12703 reginitcolors();
12704 {
12705 SV *dsv= sv_newmortal();
3c8556c3 12706 RE_PV_QUOTED_DECL(s, RX_UTF8(rx),
5509d87a 12707 dsv, RX_PRECOMP(rx), RX_PRELEN(rx), 60);
f8149455
YO
12708 PerlIO_printf(Perl_debug_log,"%sFreeing REx:%s %s\n",
12709 PL_colors[4],PL_colors[5],s);
12710 }
12711 });
7122b237
YO
12712#ifdef RE_TRACK_PATTERN_OFFSETS
12713 if (ri->u.offsets)
12714 Safefree(ri->u.offsets); /* 20010421 MJD */
12715#endif
f8fc2ecf
YO
12716 if (ri->data) {
12717 int n = ri->data->count;
f3548bdc
DM
12718 PAD* new_comppad = NULL;
12719 PAD* old_comppad;
4026c95a 12720 PADOFFSET refcnt;
dfad63ad 12721
c277df42 12722 while (--n >= 0) {
261faec3 12723 /* If you add a ->what type here, update the comment in regcomp.h */
f8fc2ecf 12724 switch (ri->data->what[n]) {
af534a04 12725 case 'a':
c277df42 12726 case 's':
81714fb9 12727 case 'S':
55eed653 12728 case 'u':
ad64d0ec 12729 SvREFCNT_dec(MUTABLE_SV(ri->data->data[n]));
c277df42 12730 break;
653099ff 12731 case 'f':
f8fc2ecf 12732 Safefree(ri->data->data[n]);
653099ff 12733 break;
dfad63ad 12734 case 'p':
502c6561 12735 new_comppad = MUTABLE_AV(ri->data->data[n]);
dfad63ad 12736 break;
c277df42 12737 case 'o':
dfad63ad 12738 if (new_comppad == NULL)
cea2e8a9 12739 Perl_croak(aTHX_ "panic: pregfree comppad");
f3548bdc
DM
12740 PAD_SAVE_LOCAL(old_comppad,
12741 /* Watch out for global destruction's random ordering. */
c445ea15 12742 (SvTYPE(new_comppad) == SVt_PVAV) ? new_comppad : NULL
f3548bdc 12743 );
b34c0dd4 12744 OP_REFCNT_LOCK;
f8fc2ecf 12745 refcnt = OpREFCNT_dec((OP_4tree*)ri->data->data[n]);
4026c95a
SH
12746 OP_REFCNT_UNLOCK;
12747 if (!refcnt)
f8fc2ecf 12748 op_free((OP_4tree*)ri->data->data[n]);
9b978d73 12749
f3548bdc 12750 PAD_RESTORE_LOCAL(old_comppad);
ad64d0ec 12751 SvREFCNT_dec(MUTABLE_SV(new_comppad));
dfad63ad 12752 new_comppad = NULL;
c277df42
IZ
12753 break;
12754 case 'n':
9e55ce06 12755 break;
07be1b83 12756 case 'T':
be8e71aa
YO
12757 { /* Aho Corasick add-on structure for a trie node.
12758 Used in stclass optimization only */
07be1b83 12759 U32 refcount;
f8fc2ecf 12760 reg_ac_data *aho=(reg_ac_data*)ri->data->data[n];
07be1b83
YO
12761 OP_REFCNT_LOCK;
12762 refcount = --aho->refcount;
12763 OP_REFCNT_UNLOCK;
12764 if ( !refcount ) {
446bd890
NC
12765 PerlMemShared_free(aho->states);
12766 PerlMemShared_free(aho->fail);
446bd890
NC
12767 /* do this last!!!! */
12768 PerlMemShared_free(ri->data->data[n]);
12769 PerlMemShared_free(ri->regstclass);
07be1b83
YO
12770 }
12771 }
12772 break;
a3621e74 12773 case 't':
07be1b83 12774 {
be8e71aa 12775 /* trie structure. */
07be1b83 12776 U32 refcount;
f8fc2ecf 12777 reg_trie_data *trie=(reg_trie_data*)ri->data->data[n];
07be1b83
YO
12778 OP_REFCNT_LOCK;
12779 refcount = --trie->refcount;
12780 OP_REFCNT_UNLOCK;
12781 if ( !refcount ) {
446bd890 12782 PerlMemShared_free(trie->charmap);
446bd890
NC
12783 PerlMemShared_free(trie->states);
12784 PerlMemShared_free(trie->trans);
07be1b83 12785 if (trie->bitmap)
446bd890 12786 PerlMemShared_free(trie->bitmap);
786e8c11 12787 if (trie->jump)
446bd890 12788 PerlMemShared_free(trie->jump);
2e64971a 12789 PerlMemShared_free(trie->wordinfo);
446bd890
NC
12790 /* do this last!!!! */
12791 PerlMemShared_free(ri->data->data[n]);
a3621e74 12792 }
07be1b83
YO
12793 }
12794 break;
c277df42 12795 default:
f8fc2ecf 12796 Perl_croak(aTHX_ "panic: regfree data code '%c'", ri->data->what[n]);
c277df42
IZ
12797 }
12798 }
f8fc2ecf
YO
12799 Safefree(ri->data->what);
12800 Safefree(ri->data);
a0d0e21e 12801 }
28d8d7f4 12802
f8fc2ecf 12803 Safefree(ri);
a687059c 12804}
c277df42 12805
a09252eb
NC
12806#define av_dup_inc(s,t) MUTABLE_AV(sv_dup_inc((const SV *)s,t))
12807#define hv_dup_inc(s,t) MUTABLE_HV(sv_dup_inc((const SV *)s,t))
84da74a7
YO
12808#define SAVEPVN(p,n) ((p) ? savepvn(p,n) : NULL)
12809
12810/*
32cd70f6 12811 re_dup - duplicate a regexp.
84da74a7 12812
8233f606
DM
12813 This routine is expected to clone a given regexp structure. It is only
12814 compiled under USE_ITHREADS.
32cd70f6 12815
f8149455
YO
12816 After all of the core data stored in struct regexp is duplicated
12817 the regexp_engine.dupe method is used to copy any private data
12818 stored in the *pprivate pointer. This allows extensions to handle
12819 any duplication it needs to do.
12820
12821 See pregfree() and regfree_internal() if you change anything here.
84da74a7 12822*/
a3c0e9ca 12823#if defined(USE_ITHREADS)
f8149455 12824#ifndef PERL_IN_XSUB_RE
288b8c02
NC
12825void
12826Perl_re_dup_guts(pTHX_ const REGEXP *sstr, REGEXP *dstr, CLONE_PARAMS *param)
84da74a7 12827{
84da74a7 12828 dVAR;
a86a1ca7 12829 I32 npar;
288b8c02
NC
12830 const struct regexp *r = (const struct regexp *)SvANY(sstr);
12831 struct regexp *ret = (struct regexp *)SvANY(dstr);
f8149455 12832
7918f24d
NC
12833 PERL_ARGS_ASSERT_RE_DUP_GUTS;
12834
84da74a7 12835 npar = r->nparens+1;
f0ab9afb
NC
12836 Newx(ret->offs, npar, regexp_paren_pair);
12837 Copy(r->offs, ret->offs, npar, regexp_paren_pair);
6057429f 12838 if(ret->swap) {
28d8d7f4 12839 /* no need to copy these */
f0ab9afb 12840 Newx(ret->swap, npar, regexp_paren_pair);
28d8d7f4 12841 }
84da74a7 12842
6057429f 12843 if (ret->substrs) {
32cd70f6
NC
12844 /* Do it this way to avoid reading from *r after the StructCopy().
12845 That way, if any of the sv_dup_inc()s dislodge *r from the L1
12846 cache, it doesn't matter. */
66b1de87
NC
12847 const bool anchored = r->check_substr
12848 ? r->check_substr == r->anchored_substr
12849 : r->check_utf8 == r->anchored_utf8;
785a26d5 12850 Newx(ret->substrs, 1, struct reg_substr_data);
a86a1ca7
NC
12851 StructCopy(r->substrs, ret->substrs, struct reg_substr_data);
12852
32cd70f6
NC
12853 ret->anchored_substr = sv_dup_inc(ret->anchored_substr, param);
12854 ret->anchored_utf8 = sv_dup_inc(ret->anchored_utf8, param);
12855 ret->float_substr = sv_dup_inc(ret->float_substr, param);
12856 ret->float_utf8 = sv_dup_inc(ret->float_utf8, param);
a86a1ca7 12857
32cd70f6
NC
12858 /* check_substr and check_utf8, if non-NULL, point to either their
12859 anchored or float namesakes, and don't hold a second reference. */
12860
12861 if (ret->check_substr) {
12862 if (anchored) {
12863 assert(r->check_utf8 == r->anchored_utf8);
12864 ret->check_substr = ret->anchored_substr;
12865 ret->check_utf8 = ret->anchored_utf8;
12866 } else {
12867 assert(r->check_substr == r->float_substr);
12868 assert(r->check_utf8 == r->float_utf8);
12869 ret->check_substr = ret->float_substr;
12870 ret->check_utf8 = ret->float_utf8;
12871 }
66b1de87
NC
12872 } else if (ret->check_utf8) {
12873 if (anchored) {
12874 ret->check_utf8 = ret->anchored_utf8;
12875 } else {
12876 ret->check_utf8 = ret->float_utf8;
12877 }
32cd70f6 12878 }
6057429f 12879 }
f8149455 12880
5daac39c 12881 RXp_PAREN_NAMES(ret) = hv_dup_inc(RXp_PAREN_NAMES(ret), param);
bcdf7404 12882
6057429f 12883 if (ret->pprivate)
288b8c02 12884 RXi_SET(ret,CALLREGDUPE_PVT(dstr,param));
f8149455 12885
288b8c02 12886 if (RX_MATCH_COPIED(dstr))
6057429f 12887 ret->subbeg = SAVEPVN(ret->subbeg, ret->sublen);
f8149455
YO
12888 else
12889 ret->subbeg = NULL;
12890#ifdef PERL_OLD_COPY_ON_WRITE
12891 ret->saved_copy = NULL;
12892#endif
6057429f 12893
c2123ae3
NC
12894 if (ret->mother_re) {
12895 if (SvPVX_const(dstr) == SvPVX_const(ret->mother_re)) {
12896 /* Our storage points directly to our mother regexp, but that's
12897 1: a buffer in a different thread
12898 2: something we no longer hold a reference on
12899 so we need to copy it locally. */
d5aafdca
FC
12900 /* Note we need to use SvCUR(), rather than
12901 SvLEN(), on our mother_re, because it, in
c2123ae3
NC
12902 turn, may well be pointing to its own mother_re. */
12903 SvPV_set(dstr, SAVEPVN(SvPVX_const(ret->mother_re),
12904 SvCUR(ret->mother_re)+1));
12905 SvLEN_set(dstr, SvCUR(ret->mother_re)+1);
12906 }
12907 ret->mother_re = NULL;
12908 }
6057429f 12909 ret->gofs = 0;
f8149455
YO
12910}
12911#endif /* PERL_IN_XSUB_RE */
12912
12913/*
12914 regdupe_internal()
12915
12916 This is the internal complement to regdupe() which is used to copy
12917 the structure pointed to by the *pprivate pointer in the regexp.
12918 This is the core version of the extension overridable cloning hook.
12919 The regexp structure being duplicated will be copied by perl prior
12920 to this and will be provided as the regexp *r argument, however
12921 with the /old/ structures pprivate pointer value. Thus this routine
12922 may override any copying normally done by perl.
12923
12924 It returns a pointer to the new regexp_internal structure.
12925*/
12926
12927void *
288b8c02 12928Perl_regdupe_internal(pTHX_ REGEXP * const rx, CLONE_PARAMS *param)
f8149455
YO
12929{
12930 dVAR;
288b8c02 12931 struct regexp *const r = (struct regexp *)SvANY(rx);
f8149455 12932 regexp_internal *reti;
0780bc72 12933 int len;
f8149455 12934 RXi_GET_DECL(r,ri);
7918f24d
NC
12935
12936 PERL_ARGS_ASSERT_REGDUPE_INTERNAL;
f8149455 12937
7122b237 12938 len = ProgLen(ri);
f8149455 12939
45cf4570 12940 Newxc(reti, sizeof(regexp_internal) + len*sizeof(regnode), char, regexp_internal);
f8149455
YO
12941 Copy(ri->program, reti->program, len+1, regnode);
12942
f8149455 12943
f8fc2ecf 12944 reti->regstclass = NULL;
bcdf7404 12945
f8fc2ecf 12946 if (ri->data) {
84da74a7 12947 struct reg_data *d;
f8fc2ecf 12948 const int count = ri->data->count;
84da74a7
YO
12949 int i;
12950
12951 Newxc(d, sizeof(struct reg_data) + count*sizeof(void *),
12952 char, struct reg_data);
12953 Newx(d->what, count, U8);
12954
12955 d->count = count;
12956 for (i = 0; i < count; i++) {
f8fc2ecf 12957 d->what[i] = ri->data->what[i];
84da74a7 12958 switch (d->what[i]) {
af534a04 12959 /* legal options are one of: sSfpontTua
84da74a7 12960 see also regcomp.h and pregfree() */
af534a04 12961 case 'a': /* actually an AV, but the dup function is identical. */
84da74a7 12962 case 's':
81714fb9 12963 case 'S':
0536c0a7 12964 case 'p': /* actually an AV, but the dup function is identical. */
55eed653 12965 case 'u': /* actually an HV, but the dup function is identical. */
ad64d0ec 12966 d->data[i] = sv_dup_inc((const SV *)ri->data->data[i], param);
84da74a7 12967 break;
84da74a7
YO
12968 case 'f':
12969 /* This is cheating. */
12970 Newx(d->data[i], 1, struct regnode_charclass_class);
f8fc2ecf 12971 StructCopy(ri->data->data[i], d->data[i],
84da74a7 12972 struct regnode_charclass_class);
f8fc2ecf 12973 reti->regstclass = (regnode*)d->data[i];
84da74a7
YO
12974 break;
12975 case 'o':
bbe252da
YO
12976 /* Compiled op trees are readonly and in shared memory,
12977 and can thus be shared without duplication. */
84da74a7 12978 OP_REFCNT_LOCK;
f8fc2ecf 12979 d->data[i] = (void*)OpREFCNT_inc((OP*)ri->data->data[i]);
84da74a7
YO
12980 OP_REFCNT_UNLOCK;
12981 break;
23eab42c
NC
12982 case 'T':
12983 /* Trie stclasses are readonly and can thus be shared
12984 * without duplication. We free the stclass in pregfree
12985 * when the corresponding reg_ac_data struct is freed.
12986 */
12987 reti->regstclass= ri->regstclass;
12988 /* Fall through */
84da74a7 12989 case 't':
84da74a7 12990 OP_REFCNT_LOCK;
0536c0a7 12991 ((reg_trie_data*)ri->data->data[i])->refcount++;
84da74a7 12992 OP_REFCNT_UNLOCK;
0536c0a7
NC
12993 /* Fall through */
12994 case 'n':
12995 d->data[i] = ri->data->data[i];
84da74a7 12996 break;
84da74a7 12997 default:
f8fc2ecf 12998 Perl_croak(aTHX_ "panic: re_dup unknown data code '%c'", ri->data->what[i]);
84da74a7
YO
12999 }
13000 }
13001
f8fc2ecf 13002 reti->data = d;
84da74a7
YO
13003 }
13004 else
f8fc2ecf 13005 reti->data = NULL;
84da74a7 13006
cde0cee5
YO
13007 reti->name_list_idx = ri->name_list_idx;
13008
7122b237
YO
13009#ifdef RE_TRACK_PATTERN_OFFSETS
13010 if (ri->u.offsets) {
13011 Newx(reti->u.offsets, 2*len+1, U32);
13012 Copy(ri->u.offsets, reti->u.offsets, 2*len+1, U32);
13013 }
13014#else
13015 SetProgLen(reti,len);
13016#endif
13017
f8149455 13018 return (void*)reti;
84da74a7 13019}
f8149455
YO
13020
13021#endif /* USE_ITHREADS */
84da74a7 13022
f8149455 13023#ifndef PERL_IN_XSUB_RE
bcdf7404 13024
c277df42
IZ
13025/*
13026 - regnext - dig the "next" pointer out of a node
c277df42
IZ
13027 */
13028regnode *
864dbfa3 13029Perl_regnext(pTHX_ register regnode *p)
c277df42 13030{
97aff369 13031 dVAR;
c277df42
IZ
13032 register I32 offset;
13033
f8fc2ecf 13034 if (!p)
c277df42
IZ
13035 return(NULL);
13036
35db910f
KW
13037 if (OP(p) > REGNODE_MAX) { /* regnode.type is unsigned */
13038 Perl_croak(aTHX_ "Corrupted regexp opcode %d > %d", (int)OP(p), (int)REGNODE_MAX);
13039 }
13040
c277df42
IZ
13041 offset = (reg_off_by_arg[OP(p)] ? ARG(p) : NEXT_OFF(p));
13042 if (offset == 0)
13043 return(NULL);
13044
c277df42 13045 return(p+offset);
c277df42 13046}
76234dfb 13047#endif
c277df42 13048
686b73d4 13049STATIC void
cea2e8a9 13050S_re_croak2(pTHX_ const char* pat1,const char* pat2,...)
c277df42
IZ
13051{
13052 va_list args;
13053 STRLEN l1 = strlen(pat1);
13054 STRLEN l2 = strlen(pat2);
13055 char buf[512];
06bf62c7 13056 SV *msv;
73d840c0 13057 const char *message;
c277df42 13058
7918f24d
NC
13059 PERL_ARGS_ASSERT_RE_CROAK2;
13060
c277df42
IZ
13061 if (l1 > 510)
13062 l1 = 510;
13063 if (l1 + l2 > 510)
13064 l2 = 510 - l1;
13065 Copy(pat1, buf, l1 , char);
13066 Copy(pat2, buf + l1, l2 , char);
3b818b81
GS
13067 buf[l1 + l2] = '\n';
13068 buf[l1 + l2 + 1] = '\0';
8736538c
AS
13069#ifdef I_STDARG
13070 /* ANSI variant takes additional second argument */
c277df42 13071 va_start(args, pat2);
8736538c
AS
13072#else
13073 va_start(args);
13074#endif
5a844595 13075 msv = vmess(buf, &args);
c277df42 13076 va_end(args);
cfd0369c 13077 message = SvPV_const(msv,l1);
c277df42
IZ
13078 if (l1 > 512)
13079 l1 = 512;
13080 Copy(message, buf, l1 , char);
197cf9b9 13081 buf[l1-1] = '\0'; /* Overwrite \n */
cea2e8a9 13082 Perl_croak(aTHX_ "%s", buf);
c277df42 13083}
a0ed51b3
LW
13084
13085/* XXX Here's a total kludge. But we need to re-enter for swash routines. */
13086
76234dfb 13087#ifndef PERL_IN_XSUB_RE
a0ed51b3 13088void
864dbfa3 13089Perl_save_re_context(pTHX)
b81d288d 13090{
97aff369 13091 dVAR;
1ade1aa1
NC
13092
13093 struct re_save_state *state;
13094
13095 SAVEVPTR(PL_curcop);
13096 SSGROW(SAVESTACK_ALLOC_FOR_RE_SAVE_STATE + 1);
13097
13098 state = (struct re_save_state *)(PL_savestack + PL_savestack_ix);
13099 PL_savestack_ix += SAVESTACK_ALLOC_FOR_RE_SAVE_STATE;
c6bf6a65 13100 SSPUSHUV(SAVEt_RE_STATE);
1ade1aa1 13101
46ab3289 13102 Copy(&PL_reg_state, state, 1, struct re_save_state);
1ade1aa1 13103
a0ed51b3 13104 PL_reg_start_tmp = 0;
a0ed51b3 13105 PL_reg_start_tmpl = 0;
c445ea15 13106 PL_reg_oldsaved = NULL;
a5db57d6 13107 PL_reg_oldsavedlen = 0;
a5db57d6 13108 PL_reg_maxiter = 0;
a5db57d6 13109 PL_reg_leftiter = 0;
c445ea15 13110 PL_reg_poscache = NULL;
a5db57d6 13111 PL_reg_poscache_size = 0;
1ade1aa1
NC
13112#ifdef PERL_OLD_COPY_ON_WRITE
13113 PL_nrs = NULL;
13114#endif
ada6e8a9 13115
c445ea15
AL
13116 /* Save $1..$n (#18107: UTF-8 s/(\w+)/uc($1)/e); AMS 20021106. */
13117 if (PL_curpm) {
13118 const REGEXP * const rx = PM_GETRE(PL_curpm);
13119 if (rx) {
1df70142 13120 U32 i;
07bc277f 13121 for (i = 1; i <= RX_NPARENS(rx); i++) {
1df70142 13122 char digits[TYPE_CHARS(long)];
d9fad198 13123 const STRLEN len = my_snprintf(digits, sizeof(digits), "%lu", (long)i);
49f27e4b
NC
13124 GV *const *const gvp
13125 = (GV**)hv_fetch(PL_defstash, digits, len, 0);
13126
b37c2d43
AL
13127 if (gvp) {
13128 GV * const gv = *gvp;
13129 if (SvTYPE(gv) == SVt_PVGV && GvSV(gv))
13130 save_scalar(gv);
49f27e4b 13131 }
ada6e8a9
AMS
13132 }
13133 }
13134 }
a0ed51b3 13135}
76234dfb 13136#endif
51371543 13137
51371543 13138static void
acfe0abc 13139clear_re(pTHX_ void *r)
51371543 13140{
97aff369 13141 dVAR;
84679df5 13142 ReREFCNT_dec((REGEXP *)r);
51371543 13143}
ffbc6a93 13144
a28509cc
AL
13145#ifdef DEBUGGING
13146
13147STATIC void
13148S_put_byte(pTHX_ SV *sv, int c)
13149{
7918f24d
NC
13150 PERL_ARGS_ASSERT_PUT_BYTE;
13151
7fddd944
NC
13152 /* Our definition of isPRINT() ignores locales, so only bytes that are
13153 not part of UTF-8 are considered printable. I assume that the same
13154 holds for UTF-EBCDIC.
13155 Also, code point 255 is not printable in either (it's E0 in EBCDIC,
13156 which Wikipedia says:
13157
13158 EO, or Eight Ones, is an 8-bit EBCDIC character code represented as all
13159 ones (binary 1111 1111, hexadecimal FF). It is similar, but not
13160 identical, to the ASCII delete (DEL) or rubout control character.
13161 ) So the old condition can be simplified to !isPRINT(c) */
9ce2357e
KW
13162 if (!isPRINT(c)) {
13163 if (c < 256) {
13164 Perl_sv_catpvf(aTHX_ sv, "\\x%02x", c);
13165 }
13166 else {
13167 Perl_sv_catpvf(aTHX_ sv, "\\x{%x}", c);
13168 }
13169 }
5e7aa789 13170 else {
88c9ea1e 13171 const char string = c;
5e7aa789
NC
13172 if (c == '-' || c == ']' || c == '\\' || c == '^')
13173 sv_catpvs(sv, "\\");
13174 sv_catpvn(sv, &string, 1);
13175 }
a28509cc
AL
13176}
13177
786e8c11 13178
3dab1dad
YO
13179#define CLEAR_OPTSTART \
13180 if (optstart) STMT_START { \
70685ca0 13181 DEBUG_OPTIMISE_r(PerlIO_printf(Perl_debug_log, " (%"IVdf" nodes)\n", (IV)(node - optstart))); \
3dab1dad
YO
13182 optstart=NULL; \
13183 } STMT_END
13184
786e8c11 13185#define DUMPUNTIL(b,e) CLEAR_OPTSTART; node=dumpuntil(r,start,(b),(e),last,sv,indent+1,depth+1);
3dab1dad 13186
b5a2f8d8
NC
13187STATIC const regnode *
13188S_dumpuntil(pTHX_ const regexp *r, const regnode *start, const regnode *node,
786e8c11
YO
13189 const regnode *last, const regnode *plast,
13190 SV* sv, I32 indent, U32 depth)
a28509cc 13191{
97aff369 13192 dVAR;
786e8c11 13193 register U8 op = PSEUDO; /* Arbitrary non-END op. */
b5a2f8d8 13194 register const regnode *next;
3dab1dad 13195 const regnode *optstart= NULL;
1f1031fe 13196
f8fc2ecf 13197 RXi_GET_DECL(r,ri);
3dab1dad 13198 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
13199
13200 PERL_ARGS_ASSERT_DUMPUNTIL;
13201
786e8c11
YO
13202#ifdef DEBUG_DUMPUNTIL
13203 PerlIO_printf(Perl_debug_log, "--- %d : %d - %d - %d\n",indent,node-start,
13204 last ? last-start : 0,plast ? plast-start : 0);
13205#endif
13206
13207 if (plast && plast < last)
13208 last= plast;
13209
13210 while (PL_regkind[op] != END && (!last || node < last)) {
a28509cc 13211 /* While that wasn't END last time... */
a28509cc
AL
13212 NODE_ALIGN(node);
13213 op = OP(node);
de734bd5 13214 if (op == CLOSE || op == WHILEM)
786e8c11 13215 indent--;
b5a2f8d8 13216 next = regnext((regnode *)node);
1f1031fe 13217
a28509cc 13218 /* Where, what. */
8e11feef 13219 if (OP(node) == OPTIMIZED) {
e68ec53f 13220 if (!optstart && RE_DEBUG_FLAG(RE_DEBUG_COMPILE_OPTIMISE))
8e11feef 13221 optstart = node;
3dab1dad 13222 else
8e11feef 13223 goto after_print;
3dab1dad
YO
13224 } else
13225 CLEAR_OPTSTART;
686b73d4 13226
32fc9b6a 13227 regprop(r, sv, node);
a28509cc 13228 PerlIO_printf(Perl_debug_log, "%4"IVdf":%*s%s", (IV)(node - start),
786e8c11 13229 (int)(2*indent + 1), "", SvPVX_const(sv));
1f1031fe
YO
13230
13231 if (OP(node) != OPTIMIZED) {
13232 if (next == NULL) /* Next ptr. */
13233 PerlIO_printf(Perl_debug_log, " (0)");
13234 else if (PL_regkind[(U8)op] == BRANCH && PL_regkind[OP(next)] != BRANCH )
13235 PerlIO_printf(Perl_debug_log, " (FAIL)");
13236 else
13237 PerlIO_printf(Perl_debug_log, " (%"IVdf")", (IV)(next - start));
13238 (void)PerlIO_putc(Perl_debug_log, '\n');
13239 }
13240
a28509cc
AL
13241 after_print:
13242 if (PL_regkind[(U8)op] == BRANCHJ) {
be8e71aa
YO
13243 assert(next);
13244 {
13245 register const regnode *nnode = (OP(next) == LONGJMP
b5a2f8d8
NC
13246 ? regnext((regnode *)next)
13247 : next);
be8e71aa
YO
13248 if (last && nnode > last)
13249 nnode = last;
786e8c11 13250 DUMPUNTIL(NEXTOPER(NEXTOPER(node)), nnode);
be8e71aa 13251 }
a28509cc
AL
13252 }
13253 else if (PL_regkind[(U8)op] == BRANCH) {
be8e71aa 13254 assert(next);
786e8c11 13255 DUMPUNTIL(NEXTOPER(node), next);
a28509cc
AL
13256 }
13257 else if ( PL_regkind[(U8)op] == TRIE ) {
7f69552c 13258 const regnode *this_trie = node;
1de06328 13259 const char op = OP(node);
647f639f 13260 const U32 n = ARG(node);
1de06328 13261 const reg_ac_data * const ac = op>=AHOCORASICK ?
f8fc2ecf 13262 (reg_ac_data *)ri->data->data[n] :
1de06328 13263 NULL;
3251b653
NC
13264 const reg_trie_data * const trie =
13265 (reg_trie_data*)ri->data->data[op<AHOCORASICK ? n : ac->trie];
2b8b4781 13266#ifdef DEBUGGING
502c6561 13267 AV *const trie_words = MUTABLE_AV(ri->data->data[n + TRIE_WORDS_OFFSET]);
2b8b4781 13268#endif
786e8c11 13269 const regnode *nextbranch= NULL;
a28509cc 13270 I32 word_idx;
76f68e9b 13271 sv_setpvs(sv, "");
786e8c11 13272 for (word_idx= 0; word_idx < (I32)trie->wordcount; word_idx++) {
2b8b4781 13273 SV ** const elem_ptr = av_fetch(trie_words,word_idx,0);
686b73d4 13274
786e8c11
YO
13275 PerlIO_printf(Perl_debug_log, "%*s%s ",
13276 (int)(2*(indent+3)), "",
13277 elem_ptr ? pv_pretty(sv, SvPV_nolen_const(*elem_ptr), SvCUR(*elem_ptr), 60,
ab3bbdeb
YO
13278 PL_colors[0], PL_colors[1],
13279 (SvUTF8(*elem_ptr) ? PERL_PV_ESCAPE_UNI : 0) |
95b611b0 13280 PERL_PV_PRETTY_ELLIPSES |
7f69552c 13281 PERL_PV_PRETTY_LTGT
786e8c11
YO
13282 )
13283 : "???"
13284 );
13285 if (trie->jump) {
40d049e4 13286 U16 dist= trie->jump[word_idx+1];
70685ca0
JH
13287 PerlIO_printf(Perl_debug_log, "(%"UVuf")\n",
13288 (UV)((dist ? this_trie + dist : next) - start));
786e8c11
YO
13289 if (dist) {
13290 if (!nextbranch)
24b23f37 13291 nextbranch= this_trie + trie->jump[0];
7f69552c
YO
13292 DUMPUNTIL(this_trie + dist, nextbranch);
13293 }
786e8c11
YO
13294 if (nextbranch && PL_regkind[OP(nextbranch)]==BRANCH)
13295 nextbranch= regnext((regnode *)nextbranch);
13296 } else {
13297 PerlIO_printf(Perl_debug_log, "\n");
a28509cc 13298 }
786e8c11
YO
13299 }
13300 if (last && next > last)
13301 node= last;
13302 else
13303 node= next;
a28509cc 13304 }
786e8c11
YO
13305 else if ( op == CURLY ) { /* "next" might be very big: optimizer */
13306 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS,
13307 NEXTOPER(node) + EXTRA_STEP_2ARGS + 1);
a28509cc
AL
13308 }
13309 else if (PL_regkind[(U8)op] == CURLY && op != CURLYX) {
be8e71aa 13310 assert(next);
786e8c11 13311 DUMPUNTIL(NEXTOPER(node) + EXTRA_STEP_2ARGS, next);
a28509cc
AL
13312 }
13313 else if ( op == PLUS || op == STAR) {
786e8c11 13314 DUMPUNTIL(NEXTOPER(node), NEXTOPER(node) + 1);
a28509cc 13315 }
f56b6394 13316 else if (PL_regkind[(U8)op] == ANYOF) {
a28509cc 13317 /* arglen 1 + class block */
4a3ee7a8 13318 node += 1 + ((ANYOF_FLAGS(node) & ANYOF_CLASS)
a28509cc
AL
13319 ? ANYOF_CLASS_SKIP : ANYOF_SKIP);
13320 node = NEXTOPER(node);
13321 }
13322 else if (PL_regkind[(U8)op] == EXACT) {
13323 /* Literal string, where present. */
13324 node += NODE_SZ_STR(node) - 1;
13325 node = NEXTOPER(node);
13326 }
13327 else {
13328 node = NEXTOPER(node);
13329 node += regarglen[(U8)op];
13330 }
13331 if (op == CURLYX || op == OPEN)
786e8c11 13332 indent++;
a28509cc 13333 }
3dab1dad 13334 CLEAR_OPTSTART;
786e8c11 13335#ifdef DEBUG_DUMPUNTIL
70685ca0 13336 PerlIO_printf(Perl_debug_log, "--- %d\n", (int)indent);
786e8c11 13337#endif
1de06328 13338 return node;
a28509cc
AL
13339}
13340
13341#endif /* DEBUGGING */
13342
241d1a3b
NC
13343/*
13344 * Local variables:
13345 * c-indentation-style: bsd
13346 * c-basic-offset: 4
13347 * indent-tabs-mode: t
13348 * End:
13349 *
37442d52
RGS
13350 * ex: set ts=8 sts=4 sw=4 noet:
13351 */