This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Use macro accessor uniformly
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
40b6423c 84extern const struct regexp_engine my_reg_engine;
54df2634
NC
85#else
86# include "regcomp.h"
87#endif
a687059c 88
04e98a4d 89#include "dquote_static.c"
26faadbd 90#include "charclass_invlists.h"
81e983c1 91#include "inline_invlist.c"
1b0f46bf 92#include "unicode_constants.h"
04e98a4d 93
94dc5c2d 94#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 95#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 96#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
b1603ef8 102
09b2b2e6 103struct RExC_state_t {
514a91f1
DM
104 U32 flags; /* RXf_* are we folding, multilining? */
105 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 106 char *precomp; /* uncompiled string. */
288b8c02 107 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
108 regexp *rx; /* perl core regexp structure */
109 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 110 char *start; /* Start of input for compile */
830247a4
IZ
111 char *end; /* End of input for compile */
112 char *parse; /* Input-scan pointer. */
ea3daa5d 113 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 114 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 115 regnode *emit_bound; /* First regnode outside of the allocated space */
f7c7e32a
DM
116 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
117 implies compiling, so don't emit */
9a81a976
KW
118 regnode_ssc emit_dummy; /* placeholder for emit to point to;
119 large enough for the largest
120 non-EXACTish node, so can use it as
121 scratch in pass1 */
830247a4
IZ
122 I32 naughty; /* How bad is this pattern? */
123 I32 sawback; /* Did we see \1, ...? */
124 U32 seen;
ea3daa5d 125 SSize_t size; /* Code size. */
c74340f9
YO
126 I32 npar; /* Capture buffer count, (OPEN). */
127 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 128 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
129 I32 extralen;
130 I32 seen_zerolen;
40d049e4
YO
131 regnode **open_parens; /* pointers to open parens */
132 regnode **close_parens; /* pointers to close parens */
133 regnode *opend; /* END node in program */
02daf0ab
YO
134 I32 utf8; /* whether the pattern is utf8 or not */
135 I32 orig_utf8; /* whether the pattern was originally in utf8 */
136 /* XXX use this for future optimisation of case
137 * where pattern must be upgraded to utf8. */
e40e74fe
KW
138 I32 uni_semantics; /* If a d charset modifier should use unicode
139 rules, even if the pattern is not in
140 utf8 */
81714fb9 141 HV *paren_names; /* Paren names */
1f1031fe 142
40d049e4
YO
143 regnode **recurse; /* Recurse regops */
144 I32 recurse_count; /* Number of recurse regops */
b57e4118 145 I32 in_lookbehind;
4624b182 146 I32 contains_locale;
bb3f3ed2 147 I32 override_recoding;
9d53c457 148 I32 in_multi_char_class;
3d2bd50a 149 struct reg_code_block *code_blocks; /* positions of literal (?{})
68e2671b 150 within pattern */
b1603ef8
DM
151 int num_code_blocks; /* size of code_blocks[] */
152 int code_index; /* next code_blocks[] slot */
830247a4
IZ
153#if ADD_TO_REGEXEC
154 char *starttry; /* -Dr: where regtry was called. */
155#define RExC_starttry (pRExC_state->starttry)
156#endif
d24ca0c5 157 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 158#ifdef DEBUGGING
be8e71aa 159 const char *lastparse;
3dab1dad 160 I32 lastnum;
1f1031fe 161 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
162#define RExC_lastparse (pRExC_state->lastparse)
163#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 164#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 165#endif
09b2b2e6 166};
830247a4 167
e2509266 168#define RExC_flags (pRExC_state->flags)
514a91f1 169#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 170#define RExC_precomp (pRExC_state->precomp)
288b8c02 171#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 172#define RExC_rx (pRExC_state->rx)
f8fc2ecf 173#define RExC_rxi (pRExC_state->rxi)
fac92740 174#define RExC_start (pRExC_state->start)
830247a4
IZ
175#define RExC_end (pRExC_state->end)
176#define RExC_parse (pRExC_state->parse)
177#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
178#ifdef RE_TRACK_PATTERN_OFFSETS
179#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
180#endif
830247a4 181#define RExC_emit (pRExC_state->emit)
f7c7e32a 182#define RExC_emit_dummy (pRExC_state->emit_dummy)
fac92740 183#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 184#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
185#define RExC_naughty (pRExC_state->naughty)
186#define RExC_sawback (pRExC_state->sawback)
187#define RExC_seen (pRExC_state->seen)
188#define RExC_size (pRExC_state->size)
189#define RExC_npar (pRExC_state->npar)
e2e6a0f1 190#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
191#define RExC_extralen (pRExC_state->extralen)
192#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 193#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 194#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 195#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
196#define RExC_open_parens (pRExC_state->open_parens)
197#define RExC_close_parens (pRExC_state->close_parens)
198#define RExC_opend (pRExC_state->opend)
81714fb9 199#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
200#define RExC_recurse (pRExC_state->recurse)
201#define RExC_recurse_count (pRExC_state->recurse_count)
b57e4118 202#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 203#define RExC_contains_locale (pRExC_state->contains_locale)
9d53c457
KW
204#define RExC_override_recoding (pRExC_state->override_recoding)
205#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
830247a4 206
cde0cee5 207
a687059c
LW
208#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
209#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
4d68ffa0 210 ((*s) == '{' && regcurly(s, FALSE)))
a687059c
LW
211
212/*
213 * Flags to be passed up and down.
214 */
a687059c 215#define WORST 0 /* Worst case. */
a3b492c3 216#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee 217
e64f369d 218/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
219 * character. (There needs to be a case: in the switch statement in regexec.c
220 * for any node marked SIMPLE.) Note that this is not the same thing as
221 * REGNODE_SIMPLE */
fda99bee 222#define SIMPLE 0x02
e64f369d 223#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
224#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
225#define TRYAGAIN 0x10 /* Weeded out a declaration. */
226#define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
a687059c 227
3dab1dad
YO
228#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
229
07be1b83
YO
230/* whether trie related optimizations are enabled */
231#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
232#define TRIE_STUDY_OPT
786e8c11 233#define FULL_TRIE_STUDY
07be1b83
YO
234#define TRIE_STCLASS
235#endif
1de06328
YO
236
237
40d049e4
YO
238
239#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
240#define PBITVAL(paren) (1 << ((paren) & 7))
241#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
242#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
243#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
244
bbd61b5f 245#define REQUIRE_UTF8 STMT_START { \
8d9c2815
NC
246 if (!UTF) { \
247 *flagp = RESTART_UTF8; \
248 return NULL; \
249 } \
bbd61b5f 250 } STMT_END
40d049e4 251
f19b1a63
KW
252/* This converts the named class defined in regcomp.h to its equivalent class
253 * number defined in handy.h. */
254#define namedclass_to_classnum(class) ((int) ((class) / 2))
255#define classnum_to_namedclass(classnum) ((classnum) * 2)
256
de92f5e6
KW
257#define _invlist_union_complement_2nd(a, b, output) \
258 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
259#define _invlist_intersection_complement_2nd(a, b, output) \
260 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
261
1de06328
YO
262/* About scan_data_t.
263
264 During optimisation we recurse through the regexp program performing
265 various inplace (keyhole style) optimisations. In addition study_chunk
266 and scan_commit populate this data structure with information about
267 what strings MUST appear in the pattern. We look for the longest
3b753521 268 string that must appear at a fixed location, and we look for the
1de06328
YO
269 longest string that may appear at a floating location. So for instance
270 in the pattern:
271
272 /FOO[xX]A.*B[xX]BAR/
273
274 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
275 strings (because they follow a .* construct). study_chunk will identify
276 both FOO and BAR as being the longest fixed and floating strings respectively.
277
278 The strings can be composites, for instance
279
280 /(f)(o)(o)/
281
282 will result in a composite fixed substring 'foo'.
283
284 For each string some basic information is maintained:
285
286 - offset or min_offset
287 This is the position the string must appear at, or not before.
288 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
289 characters must match before the string we are searching for.
290 Likewise when combined with minlenp and the length of the string it
1de06328
YO
291 tells us how many characters must appear after the string we have
292 found.
293
294 - max_offset
295 Only used for floating strings. This is the rightmost point that
ea3daa5d 296 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328
YO
297 string can occur infinitely far to the right.
298
299 - minlenp
2d608413
KW
300 A pointer to the minimum number of characters of the pattern that the
301 string was found inside. This is important as in the case of positive
1de06328
YO
302 lookahead or positive lookbehind we can have multiple patterns
303 involved. Consider
304
305 /(?=FOO).*F/
306
307 The minimum length of the pattern overall is 3, the minimum length
308 of the lookahead part is 3, but the minimum length of the part that
309 will actually match is 1. So 'FOO's minimum length is 3, but the
310 minimum length for the F is 1. This is important as the minimum length
311 is used to determine offsets in front of and behind the string being
312 looked for. Since strings can be composites this is the length of the
486ec47a 313 pattern at the time it was committed with a scan_commit. Note that
1de06328
YO
314 the length is calculated by study_chunk, so that the minimum lengths
315 are not known until the full pattern has been compiled, thus the
316 pointer to the value.
317
318 - lookbehind
319
320 In the case of lookbehind the string being searched for can be
321 offset past the start point of the final matching string.
322 If this value was just blithely removed from the min_offset it would
323 invalidate some of the calculations for how many chars must match
324 before or after (as they are derived from min_offset and minlen and
325 the length of the string being searched for).
326 When the final pattern is compiled and the data is moved from the
327 scan_data_t structure into the regexp structure the information
328 about lookbehind is factored in, with the information that would
329 have been lost precalculated in the end_shift field for the
330 associated string.
331
332 The fields pos_min and pos_delta are used to store the minimum offset
333 and the delta to the maximum offset at the current point in the pattern.
334
335*/
2c2d71f5
JH
336
337typedef struct scan_data_t {
1de06328
YO
338 /*I32 len_min; unused */
339 /*I32 len_delta; unused */
49f55535 340 SSize_t pos_min;
ea3daa5d 341 SSize_t pos_delta;
2c2d71f5 342 SV *last_found;
ea3daa5d 343 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 344 SSize_t last_start_min;
ea3daa5d 345 SSize_t last_start_max;
1de06328
YO
346 SV **longest; /* Either &l_fixed, or &l_float. */
347 SV *longest_fixed; /* longest fixed string found in pattern */
49f55535 348 SSize_t offset_fixed; /* offset where it starts */
ea3daa5d 349 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
350 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
351 SV *longest_float; /* longest floating string found in pattern */
49f55535 352 SSize_t offset_float_min; /* earliest point in string it can appear */
ea3daa5d
FC
353 SSize_t offset_float_max; /* latest point in string it can appear */
354 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
49f55535 355 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
2c2d71f5
JH
356 I32 flags;
357 I32 whilem_c;
49f55535 358 SSize_t *last_closep;
b8f7bb16 359 regnode_ssc *start_class;
2c2d71f5
JH
360} scan_data_t;
361
c02c3054
KW
362/* The below is perhaps overboard, but this allows us to save a test at the
363 * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
364 * and 'a' differ by a single bit; the same with the upper and lower case of
365 * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
366 * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
367 * then inverts it to form a mask, with just a single 0, in the bit position
368 * where the upper- and lowercase differ. XXX There are about 40 other
369 * instances in the Perl core where this micro-optimization could be used.
370 * Should decide if maintenance cost is worse, before changing those
371 *
372 * Returns a boolean as to whether or not 'v' is either a lowercase or
373 * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
374 * compile-time constant, the generated code is better than some optimizing
375 * compilers figure out, amounting to a mask and test. The results are
376 * meaningless if 'c' is not one of [A-Za-z] */
377#define isARG2_lower_or_UPPER_ARG1(c, v) \
378 (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
379
a687059c 380/*
e50aee73 381 * Forward declarations for pregcomp()'s friends.
a687059c 382 */
a0d0e21e 383
27da23d5 384static const scan_data_t zero_scan_data =
1de06328 385 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
386
387#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
388#define SF_BEFORE_SEOL 0x0001
389#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
390#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
391#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
392
44e3dfd2
BF
393#define SF_FIX_SHIFT_EOL (+2)
394#define SF_FL_SHIFT_EOL (+4)
c277df42
IZ
395
396#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
397#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
398
399#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
400#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
401#define SF_IS_INF 0x0040
402#define SF_HAS_PAR 0x0080
403#define SF_IN_PAR 0x0100
404#define SF_HAS_EVAL 0x0200
405#define SCF_DO_SUBSTR 0x0400
653099ff
GS
406#define SCF_DO_STCLASS_AND 0x0800
407#define SCF_DO_STCLASS_OR 0x1000
408#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 409#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 410
786e8c11 411#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 412#define SCF_SEEN_ACCEPT 0x8000
688e0391 413#define SCF_TRIE_DOING_RESTUDY 0x10000
07be1b83 414
43fead97 415#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
416
417/* The enums for all these are ordered so things work out correctly */
a62b1201 418#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
cfaf538b 419#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
00b27cfc 420#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
cfaf538b
KW
421#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
422#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
2f7f8cb1 423#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
a725e29c 424#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 425
43fead97 426#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 427
93733859 428#define OOB_NAMEDCLASS -1
b8c5462f 429
8e661ac5
KW
430/* There is no code point that is out-of-bounds, so this is problematic. But
431 * its only current use is to initialize a variable that is always set before
432 * looked at. */
433#define OOB_UNICODE 0xDEADBEEF
434
a0ed51b3
LW
435#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
436#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
437
8615cb43 438
b45f050a
JF
439/* length of regex to show in messages that don't mark a position within */
440#define RegexLengthToShowInErrorMessages 127
441
442/*
443 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
444 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
445 * op/pragma/warn/regcomp.
446 */
7253e4e3
RK
447#define MARKER1 "<-- HERE" /* marker as it appears in the description */
448#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 449
c1d900c3 450#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
b45f050a 451
c1d900c3
BF
452#define REPORT_LOCATION_ARGS(offset) \
453 UTF8fARG(UTF, offset, RExC_precomp), \
454 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
455
b45f050a
JF
456/*
457 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
458 * arg. Show regex, up to a maximum length. If it's too long, chop and add
459 * "...".
460 */
58e23c8d 461#define _FAIL(code) STMT_START { \
bfed75c6 462 const char *ellipses = ""; \
ccb2c380
MP
463 IV len = RExC_end - RExC_precomp; \
464 \
465 if (!SIZE_ONLY) \
a5e7bc51 466 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
467 if (len > RegexLengthToShowInErrorMessages) { \
468 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
469 len = RegexLengthToShowInErrorMessages - 10; \
470 ellipses = "..."; \
471 } \
58e23c8d 472 code; \
ccb2c380 473} STMT_END
8615cb43 474
58e23c8d 475#define FAIL(msg) _FAIL( \
c1d900c3
BF
476 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
477 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
478
479#define FAIL2(msg,arg) _FAIL( \
c1d900c3
BF
480 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
481 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 482
b45f050a 483/*
b45f050a
JF
484 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
485 */
ccb2c380 486#define Simple_vFAIL(m) STMT_START { \
a28509cc 487 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380 488 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
c1d900c3 489 m, REPORT_LOCATION_ARGS(offset)); \
ccb2c380 490} STMT_END
b45f050a
JF
491
492/*
493 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
494 */
ccb2c380
MP
495#define vFAIL(m) STMT_START { \
496 if (!SIZE_ONLY) \
a5e7bc51 497 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
498 Simple_vFAIL(m); \
499} STMT_END
b45f050a
JF
500
501/*
502 * Like Simple_vFAIL(), but accepts two arguments.
503 */
ccb2c380 504#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 505 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
506 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
507 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 508} STMT_END
b45f050a
JF
509
510/*
511 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
512 */
ccb2c380
MP
513#define vFAIL2(m,a1) STMT_START { \
514 if (!SIZE_ONLY) \
a5e7bc51 515 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
516 Simple_vFAIL2(m, a1); \
517} STMT_END
b45f050a
JF
518
519
520/*
521 * Like Simple_vFAIL(), but accepts three arguments.
522 */
ccb2c380 523#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 524 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
525 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
526 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 527} STMT_END
b45f050a
JF
528
529/*
530 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
531 */
ccb2c380
MP
532#define vFAIL3(m,a1,a2) STMT_START { \
533 if (!SIZE_ONLY) \
a5e7bc51 534 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
535 Simple_vFAIL3(m, a1, a2); \
536} STMT_END
b45f050a
JF
537
538/*
539 * Like Simple_vFAIL(), but accepts four arguments.
540 */
ccb2c380 541#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 542 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
543 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
544 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 545} STMT_END
b45f050a 546
95db3ffa
KW
547#define vFAIL4(m,a1,a2,a3) STMT_START { \
548 if (!SIZE_ONLY) \
549 SAVEFREESV(RExC_rx_sv); \
550 Simple_vFAIL4(m, a1, a2, a3); \
551} STMT_END
552
946095af
BF
553/* A specialized version of vFAIL2 that works with UTF8f */
554#define vFAIL2utf8f(m, a1) STMT_START { \
ef3f731d
BF
555 const IV offset = RExC_parse - RExC_precomp; \
556 if (!SIZE_ONLY) \
557 SAVEFREESV(RExC_rx_sv); \
558 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
559 REPORT_LOCATION_ARGS(offset)); \
946095af
BF
560} STMT_END
561
562
5e0a247b
KW
563/* m is not necessarily a "literal string", in this macro */
564#define reg_warn_non_literal_string(loc, m) STMT_START { \
565 const IV offset = loc - RExC_precomp; \
566 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
c1d900c3 567 m, REPORT_LOCATION_ARGS(offset)); \
5e0a247b
KW
568} STMT_END
569
668c081a 570#define ckWARNreg(loc,m) STMT_START { \
a28509cc 571 const IV offset = loc - RExC_precomp; \
f10f4c18 572 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 573 REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
574} STMT_END
575
0d6106aa
KW
576#define vWARN_dep(loc, m) STMT_START { \
577 const IV offset = loc - RExC_precomp; \
578 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
c1d900c3 579 REPORT_LOCATION_ARGS(offset)); \
0d6106aa
KW
580} STMT_END
581
147508a2
KW
582#define ckWARNdep(loc,m) STMT_START { \
583 const IV offset = loc - RExC_precomp; \
584 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
585 m REPORT_LOCATION, \
c1d900c3 586 REPORT_LOCATION_ARGS(offset)); \
147508a2
KW
587} STMT_END
588
668c081a 589#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 590 const IV offset = loc - RExC_precomp; \
d1d15184 591 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18 592 m REPORT_LOCATION, \
c1d900c3 593 REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
594} STMT_END
595
b23eb183 596#define ckWARN2reg_d(loc,m, a1) STMT_START { \
2335b3d3 597 const IV offset = loc - RExC_precomp; \
b23eb183 598 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
2335b3d3 599 m REPORT_LOCATION, \
c1d900c3 600 a1, REPORT_LOCATION_ARGS(offset)); \
2335b3d3
KW
601} STMT_END
602
668c081a 603#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 604 const IV offset = loc - RExC_precomp; \
668c081a 605 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 606 a1, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
607} STMT_END
608
609#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 610 const IV offset = loc - RExC_precomp; \
ccb2c380 611 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 612 a1, a2, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
613} STMT_END
614
668c081a
NC
615#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
616 const IV offset = loc - RExC_precomp; \
617 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 618 a1, a2, REPORT_LOCATION_ARGS(offset)); \
668c081a
NC
619} STMT_END
620
ccb2c380 621#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 622 const IV offset = loc - RExC_precomp; \
ccb2c380 623 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 624 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
625} STMT_END
626
668c081a
NC
627#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
628 const IV offset = loc - RExC_precomp; \
629 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 630 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
668c081a
NC
631} STMT_END
632
ccb2c380 633#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 634 const IV offset = loc - RExC_precomp; \
ccb2c380 635 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 636 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
ccb2c380 637} STMT_END
9d1d55b5 638
8615cb43 639
cd439c50 640/* Allow for side effects in s */
ccb2c380
MP
641#define REGC(c,s) STMT_START { \
642 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
643} STMT_END
cd439c50 644
fac92740
MJD
645/* Macros for recording node offsets. 20001227 mjd@plover.com
646 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
647 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
648 * Element 0 holds the number n.
07be1b83 649 * Position is 1 indexed.
fac92740 650 */
7122b237
YO
651#ifndef RE_TRACK_PATTERN_OFFSETS
652#define Set_Node_Offset_To_R(node,byte)
653#define Set_Node_Offset(node,byte)
654#define Set_Cur_Node_Offset
655#define Set_Node_Length_To_R(node,len)
656#define Set_Node_Length(node,len)
6a86c6ad 657#define Set_Node_Cur_Length(node,start)
7122b237
YO
658#define Node_Offset(n)
659#define Node_Length(n)
660#define Set_Node_Offset_Length(node,offset,len)
661#define ProgLen(ri) ri->u.proglen
662#define SetProgLen(ri,x) ri->u.proglen = x
663#else
664#define ProgLen(ri) ri->u.offsets[0]
665#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
666#define Set_Node_Offset_To_R(node,byte) STMT_START { \
667 if (! SIZE_ONLY) { \
668 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 669 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 670 if((node) < 0) { \
551405c4 671 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
672 } else { \
673 RExC_offsets[2*(node)-1] = (byte); \
674 } \
675 } \
676} STMT_END
677
678#define Set_Node_Offset(node,byte) \
679 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
680#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
681
682#define Set_Node_Length_To_R(node,len) STMT_START { \
683 if (! SIZE_ONLY) { \
684 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 685 __LINE__, (int)(node), (int)(len))); \
ccb2c380 686 if((node) < 0) { \
551405c4 687 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
688 } else { \
689 RExC_offsets[2*(node)] = (len); \
690 } \
691 } \
692} STMT_END
693
694#define Set_Node_Length(node,len) \
695 Set_Node_Length_To_R((node)-RExC_emit_start, len)
6a86c6ad
NC
696#define Set_Node_Cur_Length(node, start) \
697 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
698
699/* Get offsets and lengths */
700#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
701#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
702
07be1b83
YO
703#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
704 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
705 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
706} STMT_END
7122b237 707#endif
07be1b83
YO
708
709#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
710#define EXPERIMENTAL_INPLACESCAN
f427392e 711#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 712
304ee84b
YO
713#define DEBUG_STUDYDATA(str,data,depth) \
714DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 715 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
716 "%*s" str "Pos:%"IVdf"/%"IVdf \
717 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
718 (int)(depth)*2, "", \
719 (IV)((data)->pos_min), \
720 (IV)((data)->pos_delta), \
304ee84b 721 (UV)((data)->flags), \
1de06328 722 (IV)((data)->whilem_c), \
304ee84b
YO
723 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
724 is_inf ? "INF " : "" \
1de06328
YO
725 ); \
726 if ((data)->last_found) \
727 PerlIO_printf(Perl_debug_log, \
728 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
729 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
730 SvPVX_const((data)->last_found), \
731 (IV)((data)->last_end), \
732 (IV)((data)->last_start_min), \
733 (IV)((data)->last_start_max), \
734 ((data)->longest && \
735 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
736 SvPVX_const((data)->longest_fixed), \
737 (IV)((data)->offset_fixed), \
738 ((data)->longest && \
739 (data)->longest==&((data)->longest_float)) ? "*" : "", \
740 SvPVX_const((data)->longest_float), \
741 (IV)((data)->offset_float_min), \
742 (IV)((data)->offset_float_max) \
743 ); \
744 PerlIO_printf(Perl_debug_log,"\n"); \
745});
746
653099ff 747/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 748 Update the longest found anchored substring and the longest found
653099ff
GS
749 floating substrings if needed. */
750
4327152a 751STATIC void
ea3daa5d
FC
752S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
753 SSize_t *minlenp, int is_inf)
c277df42 754{
e1ec3a88
AL
755 const STRLEN l = CHR_SVLEN(data->last_found);
756 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 757 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 758
7918f24d
NC
759 PERL_ARGS_ASSERT_SCAN_COMMIT;
760
c277df42 761 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 762 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
763 if (*data->longest == data->longest_fixed) {
764 data->offset_fixed = l ? data->last_start_min : data->pos_min;
765 if (data->flags & SF_BEFORE_EOL)
b81d288d 766 data->flags
c277df42
IZ
767 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
768 else
769 data->flags &= ~SF_FIX_BEFORE_EOL;
686b73d4 770 data->minlen_fixed=minlenp;
1de06328 771 data->lookbehind_fixed=0;
a0ed51b3 772 }
304ee84b 773 else { /* *data->longest == data->longest_float */
c277df42 774 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
775 data->offset_float_max = (l
776 ? data->last_start_max
ea3daa5d
FC
777 : (data->pos_delta == SSize_t_MAX
778 ? SSize_t_MAX
779 : data->pos_min + data->pos_delta));
780 if (is_inf
781 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
782 data->offset_float_max = SSize_t_MAX;
c277df42 783 if (data->flags & SF_BEFORE_EOL)
b81d288d 784 data->flags
c277df42
IZ
785 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
786 else
787 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
788 data->minlen_float=minlenp;
789 data->lookbehind_float=0;
c277df42
IZ
790 }
791 }
792 SvCUR_set(data->last_found, 0);
0eda9292 793 {
a28509cc 794 SV * const sv = data->last_found;
097eb12c
AL
795 if (SvUTF8(sv) && SvMAGICAL(sv)) {
796 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
797 if (mg)
798 mg->mg_len = 0;
799 }
0eda9292 800 }
c277df42
IZ
801 data->last_end = -1;
802 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 803 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
804}
805
899d20b9
KW
806/* These macros set, clear and test whether the synthetic start class ('ssc',
807 * given by the parameter) matches an empty string (EOS). This uses the
808 * 'next_off' field in the node, to save a bit in the flags field. The ssc
809 * stands alone, so there is never a next_off, so this field is otherwise
810 * unused. The EOS information is used only for compilation, but theoretically
811 * it could be passed on to the execution code. This could be used to store
812 * more than one bit of information, but only this one is currently used. */
813#define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END
814#define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
815#define TEST_SSC_EOS(node) cBOOL((node)->next_off)
816
653099ff
GS
817/* Can match anything (initialization) */
818STATIC void
557bd3fb 819S_ssc_anything(const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 820{
557bd3fb 821 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 822
557bd3fb 823 ANYOF_BITMAP_SETALL(ssc);
1867fb0b 824 ANYOF_FLAGS(ssc) = ANYOF_ABOVE_LATIN1_ALL;
557bd3fb 825 SET_SSC_EOS(ssc);
3fffb88a
KW
826
827 /* If any portion of the regex is to operate under locale rules,
828 * initialization includes it. The reason this isn't done for all regexes
829 * is that the optimizer was written under the assumption that locale was
830 * all-or-nothing. Given the complexity and lack of documentation in the
831 * optimizer, and that there are inadequate test cases for locale, so many
832 * parts of it may not work properly, it is safest to avoid locale unless
833 * necessary. */
834 if (RExC_contains_locale) {
8efd3f97 835 ANYOF_POSIXL_SETALL(ssc);
1867fb0b 836 ANYOF_FLAGS(ssc) |= ANYOF_LOCALE|ANYOF_POSIXL|ANYOF_LOC_FOLD;
3fffb88a 837 }
9d7a1e63 838 else {
8efd3f97 839 ANYOF_POSIXL_ZERO(ssc);
9d7a1e63 840 }
653099ff
GS
841}
842
843/* Can match anything (initialization) */
844STATIC int
557bd3fb 845S_ssc_is_anything(const regnode_ssc *ssc)
653099ff
GS
846{
847 int value;
848
557bd3fb 849 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 850
8efd3f97
KW
851 for (value = 0; value < ANYOF_POSIXL_MAX; value += 2)
852 if (ANYOF_POSIXL_TEST(ssc, value) && ANYOF_POSIXL_TEST(ssc, value + 1))
653099ff 853 return 1;
1867fb0b 854 if (!(ANYOF_FLAGS(ssc) & ANYOF_ABOVE_LATIN1_ALL))
1aa99e6b 855 return 0;
557bd3fb 856 if (!ANYOF_BITMAP_TESTALLSET((const void*)ssc))
f8bef550 857 return 0;
653099ff
GS
858 return 1;
859}
860
861/* Can match anything (initialization) */
862STATIC void
557bd3fb 863S_ssc_init(const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 864{
557bd3fb 865 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 866
557bd3fb
KW
867 Zero(ssc, 1, regnode_ssc);
868 ssc->type = ANYOF;
869 ssc_anything(pRExC_state, ssc);
870 ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
871 OP(ssc) = ANYOF_SYNTHETIC;
653099ff
GS
872}
873
1051e1c4 874/* These two functions currently do the exact same thing */
557bd3fb 875#define ssc_init_zero ssc_init
653099ff 876
557bd3fb 877/* 'AND' a given class with another one. Can create false positives. 'ssc'
8efd3f97 878 * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if
b8f7bb16 879 * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
653099ff 880STATIC void
1ca93ef1 881S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, const regnode_ssc *and_with)
653099ff 882{
557bd3fb 883 PERL_ARGS_ASSERT_SSC_AND;
40d049e4 884
954a2af6 885 assert(PL_regkind[and_with->type] == ANYOF);
1e6ade67 886
c6b76537 887 /* I (khw) am not sure all these restrictions are necessary XXX */
8efd3f97
KW
888 if (!(ANYOF_POSIXL_TEST_ANY_SET(and_with))
889 && !(ANYOF_POSIXL_TEST_ANY_SET(ssc))
1867fb0b
KW
890 && (ANYOF_FLAGS(and_with) & ANYOF_LOCALE) == (ANYOF_FLAGS(ssc) & ANYOF_LOCALE)
891 && !(ANYOF_FLAGS(and_with) & ANYOF_LOC_FOLD)
892 && !(ANYOF_FLAGS(ssc) & ANYOF_LOC_FOLD)) {
653099ff
GS
893 int i;
894
1867fb0b 895 if (ANYOF_FLAGS(and_with) & ANYOF_INVERT)
653099ff 896 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557bd3fb 897 ssc->bitmap[i] &= ~and_with->bitmap[i];
653099ff
GS
898 else
899 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557bd3fb 900 ssc->bitmap[i] &= and_with->bitmap[i];
653099ff 901 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
1aa99e6b 902
1867fb0b 903 if (ANYOF_FLAGS(and_with) & ANYOF_INVERT) {
8951c461 904
c6b76537
KW
905 /* Here, the and'ed node is inverted. Get the AND of the flags that
906 * aren't affected by the inversion. Those that are affected are
907 * handled individually below */
557bd3fb
KW
908 U8 affected_flags = ssc->flags & ~INVERSION_UNAFFECTED_FLAGS;
909 ssc->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
910 ssc->flags |= affected_flags;
c6b76537
KW
911
912 /* We currently don't know how to deal with things that aren't in the
913 * bitmap, but we know that the intersection is no greater than what
557bd3fb 914 * is already in ssc, so let there be false positives that get sorted
c6b76537
KW
915 * out after the synthetic start class succeeds, and the node is
916 * matched for real. */
917
918 /* The inversion of these two flags indicate that the resulting
919 * intersection doesn't have them */
1867fb0b
KW
920 if (ANYOF_FLAGS(and_with) & ANYOF_ABOVE_LATIN1_ALL) {
921 ANYOF_FLAGS(ssc) &= ~ANYOF_ABOVE_LATIN1_ALL;
4713bfe1 922 }
1867fb0b
KW
923 if (ANYOF_FLAGS(and_with) & ANYOF_NON_UTF8_LATIN1_ALL) {
924 ANYOF_FLAGS(ssc) &= ~ANYOF_NON_UTF8_LATIN1_ALL;
137165a6 925 }
1aa99e6b 926 }
c6b76537 927 else { /* and'd node is not inverted */
3ad98780
KW
928 U8 outside_bitmap_but_not_utf8; /* Temp variable */
929
137165a6 930 if (! ANYOF_NONBITMAP(and_with)) {
c6b76537
KW
931
932 /* Here 'and_with' doesn't match anything outside the bitmap
9b1eb067
KW
933 * (except possibly ANYOF_ABOVE_LATIN1_ALL), which means the
934 * intersection can't either, except for ANYOF_ABOVE_LATIN1_ALL, in
c6b76537 935 * which case we don't know what the intersection is, but it's no
557bd3fb 936 * greater than what ssc already has, so can just leave it alone,
c6b76537 937 * with possible false positives */
1867fb0b 938 if (! (ANYOF_FLAGS(and_with) & ANYOF_ABOVE_LATIN1_ALL)) {
557bd3fb 939 ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
1867fb0b 940 ANYOF_FLAGS(ssc) &= ~ANYOF_NONBITMAP_NON_UTF8;
c6b76537 941 }
137165a6 942 }
557bd3fb 943 else if (! ANYOF_NONBITMAP(ssc)) {
c6b76537 944
557bd3fb 945 /* Here, 'and_with' does match something outside the bitmap, and ssc
c6b76537 946 * doesn't have a list of things to match outside the bitmap. If
557bd3fb
KW
947 * ssc can match all code points above 255, the intersection will
948 * be those above-255 code points that 'and_with' matches. If ssc
3ad98780
KW
949 * can't match all Unicode code points, it means that it can't
950 * match anything outside the bitmap (since the 'if' that got us
951 * into this block tested for that), so we leave the bitmap empty.
952 */
1867fb0b 953 if (ANYOF_FLAGS(ssc) & ANYOF_ABOVE_LATIN1_ALL) {
557bd3fb 954 ARG_SET(ssc, ARG(and_with));
3ad98780
KW
955
956 /* and_with's ARG may match things that don't require UTF8.
557bd3fb 957 * And now ssc's will too, in spite of this being an 'and'. See
3ad98780 958 * the comments below about the kludge */
1867fb0b 959 ANYOF_FLAGS(ssc) |= ANYOF_FLAGS(and_with) & ANYOF_NONBITMAP_NON_UTF8;
c6b76537
KW
960 }
961 }
962 else {
557bd3fb 963 /* Here, both 'and_with' and ssc match something outside the
c6b76537 964 * bitmap. Currently we do not do the intersection, so just match
557bd3fb 965 * whatever ssc had at the beginning. */
c6b76537
KW
966 }
967
968
3ad98780
KW
969 /* Take the intersection of the two sets of flags. However, the
970 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
971 * kludge around the fact that this flag is not treated like the others
557bd3fb 972 * which are initialized in ssc_anything(). The way the optimizer works
3ad98780
KW
973 * is that the synthetic start class (SSC) is initialized to match
974 * anything, and then the first time a real node is encountered, its
975 * values are AND'd with the SSC's with the result being the values of
976 * the real node. However, there are paths through the optimizer where
977 * the AND never gets called, so those initialized bits are set
978 * inappropriately, which is not usually a big deal, as they just cause
979 * false positives in the SSC, which will just mean a probably
980 * imperceptible slow down in execution. However this bit has a
981 * higher false positive consequence in that it can cause utf8.pm,
982 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
983 * bigger slowdown and also causes significant extra memory to be used.
984 * In order to prevent this, the code now takes a different tack. The
985 * bit isn't set unless some part of the regular expression needs it,
986 * but once set it won't get cleared. This means that these extra
987 * modules won't get loaded unless there was some path through the
988 * pattern that would have required them anyway, and so any false
989 * positives that occur by not ANDing them out when they could be
990 * aren't as severe as they would be if we treated this bit like all
991 * the others */
557bd3fb 992 outside_bitmap_but_not_utf8 = (ssc->flags | and_with->flags)
3ad98780 993 & ANYOF_NONBITMAP_NON_UTF8;
557bd3fb
KW
994 ssc->flags &= and_with->flags;
995 ssc->flags |= outside_bitmap_but_not_utf8;
137165a6 996 }
653099ff
GS
997}
998
557bd3fb 999/* 'OR' a given class with another one. Can create false positives. 'ssc'
8efd3f97 1000 * should not be inverted. 'or_with->flags & ANYOF_POSIXL' should be 0 if
b8f7bb16 1001 * 'or_with' is a regnode_charclass instead of a regnode_ssc. */
653099ff 1002STATIC void
557bd3fb 1003S_ssc_or(const RExC_state_t *pRExC_state, regnode_ssc *ssc, const regnode_ssc *or_with)
653099ff 1004{
557bd3fb 1005 PERL_ARGS_ASSERT_SSC_OR;
7918f24d 1006
1867fb0b 1007 if (ANYOF_FLAGS(or_with) & ANYOF_INVERT) {
c6b76537
KW
1008
1009 /* Here, the or'd node is to be inverted. This means we take the
1010 * complement of everything not in the bitmap, but currently we don't
1011 * know what that is, so give up and match anything */
1012 if (ANYOF_NONBITMAP(or_with)) {
557bd3fb 1013 ssc_anything(pRExC_state, ssc);
c6b76537 1014 }
653099ff
GS
1015 /* We do not use
1016 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
1017 * <= (B1 | !B2) | (CL1 | !CL2)
1018 * which is wasteful if CL2 is small, but we ignore CL2:
1019 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
1020 * XXXX Can we handle case-fold? Unclear:
1021 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
1022 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
1023 */
1867fb0b
KW
1024 else if ( (ANYOF_FLAGS(or_with) & ANYOF_LOCALE) == (ANYOF_FLAGS(ssc) & ANYOF_LOCALE)
1025 && !(ANYOF_FLAGS(or_with) & ANYOF_LOC_FOLD)
1026 && !(ANYOF_FLAGS(ssc) & ANYOF_LOC_FOLD) ) {
653099ff
GS
1027 int i;
1028
1029 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557bd3fb 1030 ssc->bitmap[i] |= ~or_with->bitmap[i];
653099ff
GS
1031 } /* XXXX: logic is complicated otherwise */
1032 else {
557bd3fb 1033 ssc_anything(pRExC_state, ssc);
653099ff 1034 }
c6b76537
KW
1035
1036 /* And, we can just take the union of the flags that aren't affected
1037 * by the inversion */
557bd3fb 1038 ssc->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
c6b76537
KW
1039
1040 /* For the remaining flags:
9b1eb067 1041 ANYOF_ABOVE_LATIN1_ALL and inverted means to not match anything above
557bd3fb
KW
1042 255, which means that the union with ssc should just be
1043 what ssc has in it, so can ignore this flag
c6b76537 1044 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
557bd3fb
KW
1045 is 127-255 to match them, but then invert that, so the
1046 union with ssc should just be what ssc has in it, so can
c6b76537
KW
1047 ignore this flag
1048 */
1049 } else { /* 'or_with' is not inverted */
653099ff 1050 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
1867fb0b
KW
1051 if ( (ANYOF_FLAGS(or_with) & ANYOF_LOCALE) == (ANYOF_FLAGS(ssc) & ANYOF_LOCALE)
1052 && (!(ANYOF_FLAGS(or_with) & ANYOF_LOC_FOLD)
1053 || (ANYOF_FLAGS(ssc) & ANYOF_LOC_FOLD)) ) {
653099ff
GS
1054 int i;
1055
1056 /* OR char bitmap and class bitmap separately */
1057 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557bd3fb 1058 ssc->bitmap[i] |= or_with->bitmap[i];
1867fb0b 1059 if (ANYOF_FLAGS(or_with) & ANYOF_POSIXL) {
8efd3f97 1060 ANYOF_POSIXL_OR(or_with, ssc);
c96939e4 1061 }
653099ff
GS
1062 }
1063 else { /* XXXX: logic is complicated, leave it along for a moment. */
557bd3fb 1064 ssc_anything(pRExC_state, ssc);
653099ff 1065 }
9826f543 1066
c6b76537
KW
1067 if (ANYOF_NONBITMAP(or_with)) {
1068
1069 /* Use the added node's outside-the-bit-map match if there isn't a
1070 * conflict. If there is a conflict (both nodes match something
1071 * outside the bitmap, but what they match outside is not the same
1072 * pointer, and hence not easily compared until XXX we extend
1073 * inversion lists this far), give up and allow the start class to
d94b1d13
KW
1074 * match everything outside the bitmap. If that stuff is all above
1075 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
557bd3fb
KW
1076 if (! ANYOF_NONBITMAP(ssc)) {
1077 ARG_SET(ssc, ARG(or_with));
c6b76537 1078 }
557bd3fb 1079 else if (ARG(ssc) != ARG(or_with)) {
d94b1d13 1080
1867fb0b 1081 if ((ANYOF_FLAGS(or_with) & ANYOF_NONBITMAP_NON_UTF8)) {
557bd3fb 1082 ssc_anything(pRExC_state, ssc);
d94b1d13
KW
1083 }
1084 else {
1867fb0b 1085 ANYOF_FLAGS(ssc) |= ANYOF_ABOVE_LATIN1_ALL;
d94b1d13 1086 }
c6b76537 1087 }
4c34a693 1088 }
0b9668ee
KW
1089
1090 /* Take the union */
557bd3fb 1091 ssc->flags |= or_with->flags;
1aa99e6b 1092 }
653099ff
GS
1093}
1094
a3621e74
YO
1095#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1096#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1097#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1098#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1099
3dab1dad
YO
1100
1101#ifdef DEBUGGING
07be1b83 1102/*
2b8b4781
NC
1103 dump_trie(trie,widecharmap,revcharmap)
1104 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1105 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
1106
1107 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
1108 The _interim_ variants are used for debugging the interim
1109 tables that are used to generate the final compressed
1110 representation which is what dump_trie expects.
1111
486ec47a 1112 Part of the reason for their existence is to provide a form
3dab1dad 1113 of documentation as to how the different representations function.
07be1b83
YO
1114
1115*/
3dab1dad
YO
1116
1117/*
3dab1dad
YO
1118 Dumps the final compressed table form of the trie to Perl_debug_log.
1119 Used for debugging make_trie().
1120*/
b9a59e08 1121
3dab1dad 1122STATIC void
2b8b4781
NC
1123S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1124 AV *revcharmap, U32 depth)
3dab1dad
YO
1125{
1126 U32 state;
ab3bbdeb 1127 SV *sv=sv_newmortal();
55eed653 1128 int colwidth= widecharmap ? 6 : 4;
2e64971a 1129 U16 word;
3dab1dad
YO
1130 GET_RE_DEBUG_FLAGS_DECL;
1131
7918f24d 1132 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 1133
3dab1dad
YO
1134 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1135 (int)depth * 2 + 2,"",
1136 "Match","Base","Ofs" );
1137
1138 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 1139 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 1140 if ( tmp ) {
ab3bbdeb
YO
1141 PerlIO_printf( Perl_debug_log, "%*s",
1142 colwidth,
ddc5bc0f 1143 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1144 PL_colors[0], PL_colors[1],
1145 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1146 PERL_PV_ESCAPE_FIRSTCHAR
1147 )
1148 );
3dab1dad
YO
1149 }
1150 }
1151 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1152 (int)depth * 2 + 2,"");
1153
1154 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 1155 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
1156 PerlIO_printf( Perl_debug_log, "\n");
1157
1e2e3d02 1158 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 1159 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
1160
1161 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1162
1163 if ( trie->states[ state ].wordnum ) {
1164 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1165 } else {
1166 PerlIO_printf( Perl_debug_log, "%6s", "" );
1167 }
1168
1169 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1170
1171 if ( base ) {
1172 U32 ofs = 0;
1173
1174 while( ( base + ofs < trie->uniquecharcount ) ||
1175 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1176 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1177 ofs++;
1178
1179 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1180
1181 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1182 if ( ( base + ofs >= trie->uniquecharcount ) &&
1183 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1184 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1185 {
ab3bbdeb
YO
1186 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1187 colwidth,
3dab1dad
YO
1188 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1189 } else {
ab3bbdeb 1190 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
1191 }
1192 }
1193
1194 PerlIO_printf( Perl_debug_log, "]");
1195
1196 }
1197 PerlIO_printf( Perl_debug_log, "\n" );
1198 }
2e64971a
DM
1199 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1200 for (word=1; word <= trie->wordcount; word++) {
1201 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1202 (int)word, (int)(trie->wordinfo[word].prev),
1203 (int)(trie->wordinfo[word].len));
1204 }
1205 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
1206}
1207/*
3dab1dad
YO
1208 Dumps a fully constructed but uncompressed trie in list form.
1209 List tries normally only are used for construction when the number of
1210 possible chars (trie->uniquecharcount) is very high.
1211 Used for debugging make_trie().
1212*/
1213STATIC void
55eed653 1214S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1215 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1216 U32 depth)
3dab1dad
YO
1217{
1218 U32 state;
ab3bbdeb 1219 SV *sv=sv_newmortal();
55eed653 1220 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1221 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1222
1223 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1224
3dab1dad 1225 /* print out the table precompression. */
ab3bbdeb
YO
1226 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1227 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1228 "------:-----+-----------------\n" );
3dab1dad
YO
1229
1230 for( state=1 ; state < next_alloc ; state ++ ) {
1231 U16 charid;
1232
ab3bbdeb 1233 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1234 (int)depth * 2 + 2,"", (UV)state );
1235 if ( ! trie->states[ state ].wordnum ) {
1236 PerlIO_printf( Perl_debug_log, "%5s| ","");
1237 } else {
1238 PerlIO_printf( Perl_debug_log, "W%4x| ",
1239 trie->states[ state ].wordnum
1240 );
1241 }
1242 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1243 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1244 if ( tmp ) {
1245 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1246 colwidth,
ddc5bc0f 1247 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1248 PL_colors[0], PL_colors[1],
1249 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1250 PERL_PV_ESCAPE_FIRSTCHAR
1251 ) ,
1e2e3d02
YO
1252 TRIE_LIST_ITEM(state,charid).forid,
1253 (UV)TRIE_LIST_ITEM(state,charid).newstate
1254 );
1255 if (!(charid % 10))
664e119d
RGS
1256 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1257 (int)((depth * 2) + 14), "");
1e2e3d02 1258 }
ab3bbdeb
YO
1259 }
1260 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1261 }
1262}
1263
1264/*
3dab1dad
YO
1265 Dumps a fully constructed but uncompressed trie in table form.
1266 This is the normal DFA style state transition table, with a few
1267 twists to facilitate compression later.
1268 Used for debugging make_trie().
1269*/
1270STATIC void
55eed653 1271S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1272 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1273 U32 depth)
3dab1dad
YO
1274{
1275 U32 state;
1276 U16 charid;
ab3bbdeb 1277 SV *sv=sv_newmortal();
55eed653 1278 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1279 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1280
1281 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1282
1283 /*
1284 print out the table precompression so that we can do a visual check
1285 that they are identical.
1286 */
1287
1288 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1289
1290 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1291 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1292 if ( tmp ) {
ab3bbdeb
YO
1293 PerlIO_printf( Perl_debug_log, "%*s",
1294 colwidth,
ddc5bc0f 1295 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1296 PL_colors[0], PL_colors[1],
1297 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1298 PERL_PV_ESCAPE_FIRSTCHAR
1299 )
1300 );
3dab1dad
YO
1301 }
1302 }
1303
1304 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1305
1306 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1307 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1308 }
1309
1310 PerlIO_printf( Perl_debug_log, "\n" );
1311
1312 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1313
1314 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1315 (int)depth * 2 + 2,"",
1316 (UV)TRIE_NODENUM( state ) );
1317
1318 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1319 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1320 if (v)
1321 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1322 else
1323 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1324 }
1325 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1326 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1327 } else {
1328 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1329 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1330 }
1331 }
07be1b83 1332}
3dab1dad
YO
1333
1334#endif
1335
2e64971a 1336
786e8c11
YO
1337/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1338 startbranch: the first branch in the whole branch sequence
1339 first : start branch of sequence of branch-exact nodes.
1340 May be the same as startbranch
1341 last : Thing following the last branch.
1342 May be the same as tail.
1343 tail : item following the branch sequence
1344 count : words in the sequence
1345 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1346 depth : indent depth
3dab1dad 1347
786e8c11 1348Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1349
786e8c11
YO
1350A trie is an N'ary tree where the branches are determined by digital
1351decomposition of the key. IE, at the root node you look up the 1st character and
1352follow that branch repeat until you find the end of the branches. Nodes can be
1353marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1354
786e8c11 1355 /he|she|his|hers/
72f13be8 1356
786e8c11
YO
1357would convert into the following structure. Numbers represent states, letters
1358following numbers represent valid transitions on the letter from that state, if
1359the number is in square brackets it represents an accepting state, otherwise it
1360will be in parenthesis.
07be1b83 1361
786e8c11
YO
1362 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1363 | |
1364 | (2)
1365 | |
1366 (1) +-i->(6)-+-s->[7]
1367 |
1368 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1369
786e8c11
YO
1370 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1371
1372This shows that when matching against the string 'hers' we will begin at state 1
1373read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1374then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1375is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1376single traverse. We store a mapping from accepting to state to which word was
1377matched, and then when we have multiple possibilities we try to complete the
1378rest of the regex in the order in which they occured in the alternation.
1379
1380The only prior NFA like behaviour that would be changed by the TRIE support is
1381the silent ignoring of duplicate alternations which are of the form:
1382
1383 / (DUPE|DUPE) X? (?{ ... }) Y /x
1384
4b714af6 1385Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 1386and without the optimisation. With the optimisations dupes will be silently
486ec47a 1387ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
1388the following demonstrates:
1389
1390 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1391
1392which prints out 'word' three times, but
1393
1394 'words'=~/(word|word|word)(?{ print $1 })S/
1395
1396which doesnt print it out at all. This is due to other optimisations kicking in.
1397
1398Example of what happens on a structural level:
1399
486ec47a 1400The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
1401
1402 1: CURLYM[1] {1,32767}(18)
1403 5: BRANCH(8)
1404 6: EXACT <ac>(16)
1405 8: BRANCH(11)
1406 9: EXACT <ad>(16)
1407 11: BRANCH(14)
1408 12: EXACT <ab>(16)
1409 16: SUCCEED(0)
1410 17: NOTHING(18)
1411 18: END(0)
1412
1413This would be optimizable with startbranch=5, first=5, last=16, tail=16
1414and should turn into:
1415
1416 1: CURLYM[1] {1,32767}(18)
1417 5: TRIE(16)
1418 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1419 <ac>
1420 <ad>
1421 <ab>
1422 16: SUCCEED(0)
1423 17: NOTHING(18)
1424 18: END(0)
1425
1426Cases where tail != last would be like /(?foo|bar)baz/:
1427
1428 1: BRANCH(4)
1429 2: EXACT <foo>(8)
1430 4: BRANCH(7)
1431 5: EXACT <bar>(8)
1432 7: TAIL(8)
1433 8: EXACT <baz>(10)
1434 10: END(0)
1435
1436which would be optimizable with startbranch=1, first=1, last=7, tail=8
1437and would end up looking like:
1438
1439 1: TRIE(8)
1440 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1441 <foo>
1442 <bar>
1443 7: TAIL(8)
1444 8: EXACT <baz>(10)
1445 10: END(0)
1446
c80e42f3 1447 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
1448
1449is the recommended Unicode-aware way of saying
1450
1451 *(d++) = uv;
1452*/
1453
fab2782b 1454#define TRIE_STORE_REVCHAR(val) \
786e8c11 1455 STMT_START { \
73031816 1456 if (UTF) { \
fab2782b 1457 SV *zlopp = newSV(7); /* XXX: optimize me */ \
88c9ea1e 1458 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
c80e42f3 1459 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
73031816
NC
1460 SvCUR_set(zlopp, kapow - flrbbbbb); \
1461 SvPOK_on(zlopp); \
1462 SvUTF8_on(zlopp); \
1463 av_push(revcharmap, zlopp); \
1464 } else { \
fab2782b 1465 char ooooff = (char)val; \
73031816
NC
1466 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1467 } \
1468 } STMT_END
786e8c11 1469
914a25d5
KW
1470/* This gets the next character from the input, folding it if not already
1471 * folded. */
1472#define TRIE_READ_CHAR STMT_START { \
1473 wordlen++; \
1474 if ( UTF ) { \
1475 /* if it is UTF then it is either already folded, or does not need \
1476 * folding */ \
1c1d615a 1477 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
1478 } \
1479 else if (folder == PL_fold_latin1) { \
7d006b13
KW
1480 /* This folder implies Unicode rules, which in the range expressible \
1481 * by not UTF is the lower case, with the two exceptions, one of \
1482 * which should have been taken care of before calling this */ \
1483 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1484 uvc = toLOWER_L1(*uc); \
1485 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1486 len = 1; \
914a25d5
KW
1487 } else { \
1488 /* raw data, will be folded later if needed */ \
1489 uvc = (U32)*uc; \
1490 len = 1; \
1491 } \
786e8c11
YO
1492} STMT_END
1493
1494
1495
1496#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1497 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1498 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1499 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1500 } \
1501 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1502 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1503 TRIE_LIST_CUR( state )++; \
1504} STMT_END
07be1b83 1505
786e8c11
YO
1506#define TRIE_LIST_NEW(state) STMT_START { \
1507 Newxz( trie->states[ state ].trans.list, \
1508 4, reg_trie_trans_le ); \
1509 TRIE_LIST_CUR( state ) = 1; \
1510 TRIE_LIST_LEN( state ) = 4; \
1511} STMT_END
07be1b83 1512
786e8c11
YO
1513#define TRIE_HANDLE_WORD(state) STMT_START { \
1514 U16 dupe= trie->states[ state ].wordnum; \
1515 regnode * const noper_next = regnext( noper ); \
1516 \
786e8c11
YO
1517 DEBUG_r({ \
1518 /* store the word for dumping */ \
1519 SV* tmp; \
1520 if (OP(noper) != NOTHING) \
740cce10 1521 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1522 else \
740cce10 1523 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1524 av_push( trie_words, tmp ); \
786e8c11
YO
1525 }); \
1526 \
1527 curword++; \
2e64971a
DM
1528 trie->wordinfo[curword].prev = 0; \
1529 trie->wordinfo[curword].len = wordlen; \
1530 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1531 \
1532 if ( noper_next < tail ) { \
1533 if (!trie->jump) \
c944940b 1534 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1535 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1536 if (!jumper) \
1537 jumper = noper_next; \
1538 if (!nextbranch) \
1539 nextbranch= regnext(cur); \
1540 } \
1541 \
1542 if ( dupe ) { \
2e64971a
DM
1543 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1544 /* chain, so that when the bits of chain are later */\
1545 /* linked together, the dups appear in the chain */\
1546 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1547 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1548 } else { \
1549 /* we haven't inserted this word yet. */ \
1550 trie->states[ state ].wordnum = curword; \
1551 } \
1552} STMT_END
07be1b83 1553
3dab1dad 1554
786e8c11
YO
1555#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1556 ( ( base + charid >= ucharcount \
1557 && base + charid < ubound \
1558 && state == trie->trans[ base - ucharcount + charid ].check \
1559 && trie->trans[ base - ucharcount + charid ].next ) \
1560 ? trie->trans[ base - ucharcount + charid ].next \
1561 : ( state==1 ? special : 0 ) \
1562 )
3dab1dad 1563
786e8c11
YO
1564#define MADE_TRIE 1
1565#define MADE_JUMP_TRIE 2
1566#define MADE_EXACT_TRIE 4
3dab1dad 1567
a3621e74 1568STATIC I32
786e8c11 1569S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1570{
27da23d5 1571 dVAR;
a3621e74
YO
1572 /* first pass, loop through and scan words */
1573 reg_trie_data *trie;
55eed653 1574 HV *widecharmap = NULL;
2b8b4781 1575 AV *revcharmap = newAV();
a3621e74 1576 regnode *cur;
a3621e74
YO
1577 STRLEN len = 0;
1578 UV uvc = 0;
1579 U16 curword = 0;
1580 U32 next_alloc = 0;
786e8c11
YO
1581 regnode *jumper = NULL;
1582 regnode *nextbranch = NULL;
7f69552c 1583 regnode *convert = NULL;
2e64971a 1584 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1585 /* we just use folder as a flag in utf8 */
1e696034 1586 const U8 * folder = NULL;
a3621e74 1587
2b8b4781 1588#ifdef DEBUGGING
cf78de0b 1589 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2b8b4781
NC
1590 AV *trie_words = NULL;
1591 /* along with revcharmap, this only used during construction but both are
1592 * useful during debugging so we store them in the struct when debugging.
8e11feef 1593 */
2b8b4781 1594#else
cf78de0b 1595 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 1596 STRLEN trie_charcount=0;
3dab1dad 1597#endif
2b8b4781 1598 SV *re_trie_maxbuff;
a3621e74 1599 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1600
1601 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1602#ifndef DEBUGGING
1603 PERL_UNUSED_ARG(depth);
1604#endif
a3621e74 1605
1e696034 1606 switch (flags) {
79a81a6e 1607 case EXACT: break;
2f7f8cb1 1608 case EXACTFA:
fab2782b 1609 case EXACTFU_SS:
1e696034
KW
1610 case EXACTFU: folder = PL_fold_latin1; break;
1611 case EXACTF: folder = PL_fold; break;
1612 case EXACTFL: folder = PL_fold_locale; break;
fab2782b 1613 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
1614 }
1615
c944940b 1616 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1617 trie->refcount = 1;
3dab1dad 1618 trie->startstate = 1;
786e8c11 1619 trie->wordcount = word_count;
f8fc2ecf 1620 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1621 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
fab2782b 1622 if (flags == EXACT)
c944940b 1623 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1624 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1625 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1626
a3621e74 1627 DEBUG_r({
2b8b4781 1628 trie_words = newAV();
a3621e74 1629 });
a3621e74 1630
0111c4fd 1631 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1632 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1633 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1634 }
df826430 1635 DEBUG_TRIE_COMPILE_r({
3dab1dad 1636 PerlIO_printf( Perl_debug_log,
786e8c11 1637 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1638 (int)depth * 2 + 2, "",
1639 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1640 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1641 (int)depth);
3dab1dad 1642 });
7f69552c
YO
1643
1644 /* Find the node we are going to overwrite */
1645 if ( first == startbranch && OP( last ) != BRANCH ) {
1646 /* whole branch chain */
1647 convert = first;
1648 } else {
1649 /* branch sub-chain */
1650 convert = NEXTOPER( first );
1651 }
1652
a3621e74
YO
1653 /* -- First loop and Setup --
1654
1655 We first traverse the branches and scan each word to determine if it
1656 contains widechars, and how many unique chars there are, this is
1657 important as we have to build a table with at least as many columns as we
1658 have unique chars.
1659
1660 We use an array of integers to represent the character codes 0..255
38a44b82 1661 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1662 native representation of the character value as the key and IV's for the
1663 coded index.
1664
1665 *TODO* If we keep track of how many times each character is used we can
1666 remap the columns so that the table compression later on is more
3b753521 1667 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
1668 middle and the least common are on the outside. IMO this would be better
1669 than a most to least common mapping as theres a decent chance the most
1670 common letter will share a node with the least common, meaning the node
486ec47a 1671 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
1672 case is when we have the least common nodes twice.
1673
1674 */
1675
a3621e74 1676 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 1677 regnode *noper = NEXTOPER( cur );
e1ec3a88 1678 const U8 *uc = (U8*)STRING( noper );
df826430 1679 const U8 *e = uc + STR_LEN( noper );
a3621e74 1680 STRLEN foldlen = 0;
07be1b83 1681 U32 wordlen = 0; /* required init */
afa96d92
KW
1682 STRLEN minbytes = 0;
1683 STRLEN maxbytes = 0;
02daf0ab 1684 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1685
3dab1dad 1686 if (OP(noper) == NOTHING) {
df826430
YO
1687 regnode *noper_next= regnext(noper);
1688 if (noper_next != tail && OP(noper_next) == flags) {
1689 noper = noper_next;
1690 uc= (U8*)STRING(noper);
1691 e= uc + STR_LEN(noper);
1692 trie->minlen= STR_LEN(noper);
1693 } else {
1694 trie->minlen= 0;
1695 continue;
1696 }
3dab1dad 1697 }
df826430 1698
fab2782b 1699 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
1700 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1701 regardless of encoding */
fab2782b
YO
1702 if (OP( noper ) == EXACTFU_SS) {
1703 /* false positives are ok, so just set this */
0dc4a61d 1704 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
1705 }
1706 }
a3621e74 1707 for ( ; uc < e ; uc += len ) {
3dab1dad 1708 TRIE_CHARCOUNT(trie)++;
a3621e74 1709 TRIE_READ_CHAR;
645de4ce
KW
1710
1711 /* Acummulate to the current values, the range in the number of
1712 * bytes that this character could match. The max is presumed to
1713 * be the same as the folded input (which TRIE_READ_CHAR returns),
1714 * except that when this is not in UTF-8, it could be matched
1715 * against a string which is UTF-8, and the variant characters
1716 * could be 2 bytes instead of the 1 here. Likewise, for the
1717 * minimum number of bytes when not folded. When folding, the min
1718 * is assumed to be 1 byte could fold to match the single character
1719 * here, or in the case of a multi-char fold, 1 byte can fold to
1720 * the whole sequence. 'foldlen' is used to denote whether we are
1721 * in such a sequence, skipping the min setting if so. XXX TODO
1722 * Use the exact list of what folds to each character, from
1723 * PL_utf8_foldclosures */
1724 if (UTF) {
1725 maxbytes += UTF8SKIP(uc);
1726 if (! folder) {
1727 /* A non-UTF-8 string could be 1 byte to match our 2 */
1728 minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
1729 ? 1
1730 : UTF8SKIP(uc);
1731 }
1732 else {
1733 if (foldlen) {
1734 foldlen -= UTF8SKIP(uc);
1735 }
1736 else {
1737 foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
1738 minbytes++;
1739 }
1740 }
1741 }
1742 else {
1743 maxbytes += (UNI_IS_INVARIANT(*uc))
1744 ? 1
1745 : 2;
1746 if (! folder) {
1747 minbytes++;
1748 }
1749 else {
1750 if (foldlen) {
1751 foldlen--;
1752 }
1753 else {
1754 foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
1755 minbytes++;
1756 }
1757 }
1758 }
a3621e74 1759 if ( uvc < 256 ) {
fab2782b
YO
1760 if ( folder ) {
1761 U8 folded= folder[ (U8) uvc ];
1762 if ( !trie->charmap[ folded ] ) {
1763 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1764 TRIE_STORE_REVCHAR( folded );
1765 }
1766 }
a3621e74
YO
1767 if ( !trie->charmap[ uvc ] ) {
1768 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 1769 TRIE_STORE_REVCHAR( uvc );
a3621e74 1770 }
02daf0ab 1771 if ( set_bit ) {
62012aee
KW
1772 /* store the codepoint in the bitmap, and its folded
1773 * equivalent. */
fab2782b 1774 TRIE_BITMAP_SET(trie, uvc);
0921ee73
T
1775
1776 /* store the folded codepoint */
fab2782b 1777 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
0921ee73
T
1778
1779 if ( !UTF ) {
1780 /* store first byte of utf8 representation of
acdf4139 1781 variant codepoints */
6f2d5cbc 1782 if (! UVCHR_IS_INVARIANT(uvc)) {
acdf4139 1783 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
1784 }
1785 }
02daf0ab
YO
1786 set_bit = 0; /* We've done our bit :-) */
1787 }
a3621e74
YO
1788 } else {
1789 SV** svpp;
55eed653
NC
1790 if ( !widecharmap )
1791 widecharmap = newHV();
a3621e74 1792
55eed653 1793 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1794
1795 if ( !svpp )
e4584336 1796 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1797
1798 if ( !SvTRUE( *svpp ) ) {
1799 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 1800 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
1801 }
1802 }
1803 }
3dab1dad 1804 if( cur == first ) {
afa96d92
KW
1805 trie->minlen = minbytes;
1806 trie->maxlen = maxbytes;
1807 } else if (minbytes < trie->minlen) {
1808 trie->minlen = minbytes;
1809 } else if (maxbytes > trie->maxlen) {
1810 trie->maxlen = maxbytes;
fab2782b 1811 }
a3621e74
YO
1812 } /* end first pass */
1813 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1814 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1815 (int)depth * 2 + 2,"",
55eed653 1816 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1817 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1818 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1819 );
a3621e74
YO
1820
1821 /*
1822 We now know what we are dealing with in terms of unique chars and
1823 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1824 representation using a flat table will take. If it's over a reasonable
1825 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1826 conservative but potentially much slower representation using an array
1827 of lists.
1828
1829 At the end we convert both representations into the same compressed
1830 form that will be used in regexec.c for matching with. The latter
1831 is a form that cannot be used to construct with but has memory
1832 properties similar to the list form and access properties similar
1833 to the table form making it both suitable for fast searches and
1834 small enough that its feasable to store for the duration of a program.
1835
1836 See the comment in the code where the compressed table is produced
1837 inplace from the flat tabe representation for an explanation of how
1838 the compression works.
1839
1840 */
1841
1842
2e64971a
DM
1843 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1844 prev_states[1] = 0;
1845
3dab1dad 1846 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1847 /*
1848 Second Pass -- Array Of Lists Representation
1849
1850 Each state will be represented by a list of charid:state records
1851 (reg_trie_trans_le) the first such element holds the CUR and LEN
1852 points of the allocated array. (See defines above).
1853
1854 We build the initial structure using the lists, and then convert
1855 it into the compressed table form which allows faster lookups
1856 (but cant be modified once converted).
a3621e74
YO
1857 */
1858
a3621e74
YO
1859 STRLEN transcount = 1;
1860
1e2e3d02
YO
1861 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1862 "%*sCompiling trie using list compiler\n",
1863 (int)depth * 2 + 2, ""));
686b73d4 1864
c944940b
JH
1865 trie->states = (reg_trie_state *)
1866 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1867 sizeof(reg_trie_state) );
a3621e74
YO
1868 TRIE_LIST_NEW(1);
1869 next_alloc = 2;
1870
1871 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1872
df826430 1873 regnode *noper = NEXTOPER( cur );
c445ea15 1874 U8 *uc = (U8*)STRING( noper );
df826430 1875 const U8 *e = uc + STR_LEN( noper );
c445ea15
AL
1876 U32 state = 1; /* required init */
1877 U16 charid = 0; /* sanity init */
07be1b83 1878 U32 wordlen = 0; /* required init */
c445ea15 1879
df826430
YO
1880 if (OP(noper) == NOTHING) {
1881 regnode *noper_next= regnext(noper);
1882 if (noper_next != tail && OP(noper_next) == flags) {
1883 noper = noper_next;
1884 uc= (U8*)STRING(noper);
1885 e= uc + STR_LEN(noper);
1886 }
1887 }
1888
3dab1dad 1889 if (OP(noper) != NOTHING) {
786e8c11 1890 for ( ; uc < e ; uc += len ) {
c445ea15 1891
786e8c11 1892 TRIE_READ_CHAR;
c445ea15 1893
786e8c11
YO
1894 if ( uvc < 256 ) {
1895 charid = trie->charmap[ uvc ];
c445ea15 1896 } else {
55eed653 1897 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1898 if ( !svpp ) {
1899 charid = 0;
1900 } else {
1901 charid=(U16)SvIV( *svpp );
1902 }
c445ea15 1903 }
786e8c11
YO
1904 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1905 if ( charid ) {
a3621e74 1906
786e8c11
YO
1907 U16 check;
1908 U32 newstate = 0;
a3621e74 1909
786e8c11
YO
1910 charid--;
1911 if ( !trie->states[ state ].trans.list ) {
1912 TRIE_LIST_NEW( state );
c445ea15 1913 }
786e8c11
YO
1914 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1915 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1916 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1917 break;
1918 }
1919 }
1920 if ( ! newstate ) {
1921 newstate = next_alloc++;
2e64971a 1922 prev_states[newstate] = state;
786e8c11
YO
1923 TRIE_LIST_PUSH( state, charid, newstate );
1924 transcount++;
1925 }
1926 state = newstate;
1927 } else {
1928 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1929 }
a28509cc 1930 }
c445ea15 1931 }
3dab1dad 1932 TRIE_HANDLE_WORD(state);
a3621e74
YO
1933
1934 } /* end second pass */
1935
1e2e3d02
YO
1936 /* next alloc is the NEXT state to be allocated */
1937 trie->statecount = next_alloc;
c944940b
JH
1938 trie->states = (reg_trie_state *)
1939 PerlMemShared_realloc( trie->states,
1940 next_alloc
1941 * sizeof(reg_trie_state) );
a3621e74 1942
3dab1dad 1943 /* and now dump it out before we compress it */
2b8b4781
NC
1944 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1945 revcharmap, next_alloc,
1946 depth+1)
1e2e3d02 1947 );
a3621e74 1948
c944940b
JH
1949 trie->trans = (reg_trie_trans *)
1950 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1951 {
1952 U32 state;
a3621e74
YO
1953 U32 tp = 0;
1954 U32 zp = 0;
1955
1956
1957 for( state=1 ; state < next_alloc ; state ++ ) {
1958 U32 base=0;
1959
1960 /*
1961 DEBUG_TRIE_COMPILE_MORE_r(
1962 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1963 );
1964 */
1965
1966 if (trie->states[state].trans.list) {
1967 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1968 U16 maxid=minid;
a28509cc 1969 U16 idx;
a3621e74
YO
1970
1971 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1972 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1973 if ( forid < minid ) {
1974 minid=forid;
1975 } else if ( forid > maxid ) {
1976 maxid=forid;
1977 }
a3621e74
YO
1978 }
1979 if ( transcount < tp + maxid - minid + 1) {
1980 transcount *= 2;
c944940b
JH
1981 trie->trans = (reg_trie_trans *)
1982 PerlMemShared_realloc( trie->trans,
446bd890
NC
1983 transcount
1984 * sizeof(reg_trie_trans) );
a3621e74
YO
1985 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1986 }
1987 base = trie->uniquecharcount + tp - minid;
1988 if ( maxid == minid ) {
1989 U32 set = 0;
1990 for ( ; zp < tp ; zp++ ) {
1991 if ( ! trie->trans[ zp ].next ) {
1992 base = trie->uniquecharcount + zp - minid;
1993 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1994 trie->trans[ zp ].check = state;
1995 set = 1;
1996 break;
1997 }
1998 }
1999 if ( !set ) {
2000 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2001 trie->trans[ tp ].check = state;
2002 tp++;
2003 zp = tp;
2004 }
2005 } else {
2006 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 2007 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
2008 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
2009 trie->trans[ tid ].check = state;
2010 }
2011 tp += ( maxid - minid + 1 );
2012 }
2013 Safefree(trie->states[ state ].trans.list);
2014 }
2015 /*
2016 DEBUG_TRIE_COMPILE_MORE_r(
2017 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2018 );
2019 */
2020 trie->states[ state ].trans.base=base;
2021 }
cc601c31 2022 trie->lasttrans = tp + 1;
a3621e74
YO
2023 }
2024 } else {
2025 /*
2026 Second Pass -- Flat Table Representation.
2027
2028 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
2029 We know that we will need Charcount+1 trans at most to store the data
2030 (one row per char at worst case) So we preallocate both structures
2031 assuming worst case.
2032
2033 We then construct the trie using only the .next slots of the entry
2034 structs.
2035
3b753521 2036 We use the .check field of the first entry of the node temporarily to
a3621e74
YO
2037 make compression both faster and easier by keeping track of how many non
2038 zero fields are in the node.
2039
2040 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2041 transition.
2042
2043 There are two terms at use here: state as a TRIE_NODEIDX() which is a
2044 number representing the first entry of the node, and state as a
2045 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
2046 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
2047 are 2 entrys per node. eg:
2048
2049 A B A B
2050 1. 2 4 1. 3 7
2051 2. 0 3 3. 0 5
2052 3. 0 0 5. 0 0
2053 4. 0 0 7. 0 0
2054
2055 The table is internally in the right hand, idx form. However as we also
2056 have to deal with the states array which is indexed by nodenum we have to
2057 use TRIE_NODENUM() to convert.
2058
2059 */
1e2e3d02
YO
2060 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2061 "%*sCompiling trie using table compiler\n",
2062 (int)depth * 2 + 2, ""));
3dab1dad 2063
c944940b
JH
2064 trie->trans = (reg_trie_trans *)
2065 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2066 * trie->uniquecharcount + 1,
2067 sizeof(reg_trie_trans) );
2068 trie->states = (reg_trie_state *)
2069 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2070 sizeof(reg_trie_state) );
a3621e74
YO
2071 next_alloc = trie->uniquecharcount + 1;
2072
3dab1dad 2073
a3621e74
YO
2074 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2075
df826430 2076 regnode *noper = NEXTOPER( cur );
a28509cc 2077 const U8 *uc = (U8*)STRING( noper );
df826430 2078 const U8 *e = uc + STR_LEN( noper );
a3621e74
YO
2079
2080 U32 state = 1; /* required init */
2081
2082 U16 charid = 0; /* sanity init */
2083 U32 accept_state = 0; /* sanity init */
a3621e74 2084
07be1b83 2085 U32 wordlen = 0; /* required init */
a3621e74 2086
df826430
YO
2087 if (OP(noper) == NOTHING) {
2088 regnode *noper_next= regnext(noper);
2089 if (noper_next != tail && OP(noper_next) == flags) {
2090 noper = noper_next;
2091 uc= (U8*)STRING(noper);
2092 e= uc + STR_LEN(noper);
2093 }
2094 }
fab2782b 2095
3dab1dad 2096 if ( OP(noper) != NOTHING ) {
786e8c11 2097 for ( ; uc < e ; uc += len ) {
a3621e74 2098
786e8c11 2099 TRIE_READ_CHAR;
a3621e74 2100
786e8c11
YO
2101 if ( uvc < 256 ) {
2102 charid = trie->charmap[ uvc ];
2103 } else {
55eed653 2104 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 2105 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 2106 }
786e8c11
YO
2107 if ( charid ) {
2108 charid--;
2109 if ( !trie->trans[ state + charid ].next ) {
2110 trie->trans[ state + charid ].next = next_alloc;
2111 trie->trans[ state ].check++;
2e64971a
DM
2112 prev_states[TRIE_NODENUM(next_alloc)]
2113 = TRIE_NODENUM(state);
786e8c11
YO
2114 next_alloc += trie->uniquecharcount;
2115 }
2116 state = trie->trans[ state + charid ].next;
2117 } else {
2118 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2119 }
2120 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 2121 }
a3621e74 2122 }
3dab1dad
YO
2123 accept_state = TRIE_NODENUM( state );
2124 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
2125
2126 } /* end second pass */
2127
3dab1dad 2128 /* and now dump it out before we compress it */
2b8b4781
NC
2129 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2130 revcharmap,
2131 next_alloc, depth+1));
a3621e74 2132
a3621e74
YO
2133 {
2134 /*
2135 * Inplace compress the table.*
2136
2137 For sparse data sets the table constructed by the trie algorithm will
2138 be mostly 0/FAIL transitions or to put it another way mostly empty.
2139 (Note that leaf nodes will not contain any transitions.)
2140
2141 This algorithm compresses the tables by eliminating most such
2142 transitions, at the cost of a modest bit of extra work during lookup:
2143
2144 - Each states[] entry contains a .base field which indicates the
2145 index in the state[] array wheres its transition data is stored.
2146
3b753521 2147 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
2148
2149 - If .base is nonzero then charid is added to it to find an entry in
2150 the trans array.
2151
2152 -If trans[states[state].base+charid].check!=state then the
2153 transition is taken to be a 0/Fail transition. Thus if there are fail
2154 transitions at the front of the node then the .base offset will point
2155 somewhere inside the previous nodes data (or maybe even into a node
2156 even earlier), but the .check field determines if the transition is
2157 valid.
2158
786e8c11 2159 XXX - wrong maybe?
a3621e74 2160 The following process inplace converts the table to the compressed
3b753521 2161 table: We first do not compress the root node 1,and mark all its
a3621e74 2162 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
2163 allows us to do a DFA construction from the compressed table later,
2164 and ensures that any .base pointers we calculate later are greater
2165 than 0.
a3621e74
YO
2166
2167 - We set 'pos' to indicate the first entry of the second node.
2168
2169 - We then iterate over the columns of the node, finding the first and
2170 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2171 and set the .check pointers accordingly, and advance pos
2172 appropriately and repreat for the next node. Note that when we copy
2173 the next pointers we have to convert them from the original
2174 NODEIDX form to NODENUM form as the former is not valid post
2175 compression.
2176
2177 - If a node has no transitions used we mark its base as 0 and do not
2178 advance the pos pointer.
2179
2180 - If a node only has one transition we use a second pointer into the
2181 structure to fill in allocated fail transitions from other states.
2182 This pointer is independent of the main pointer and scans forward
2183 looking for null transitions that are allocated to a state. When it
2184 finds one it writes the single transition into the "hole". If the
786e8c11 2185 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
2186
2187 - Once compressed we can Renew/realloc the structures to release the
2188 excess space.
2189
2190 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2191 specifically Fig 3.47 and the associated pseudocode.
2192
2193 demq
2194 */
a3b680e6 2195 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 2196 U32 state, charid;
a3621e74 2197 U32 pos = 0, zp=0;
1e2e3d02 2198 trie->statecount = laststate;
a3621e74
YO
2199
2200 for ( state = 1 ; state < laststate ; state++ ) {
2201 U8 flag = 0;
a28509cc
AL
2202 const U32 stateidx = TRIE_NODEIDX( state );
2203 const U32 o_used = trie->trans[ stateidx ].check;
2204 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
2205 trie->trans[ stateidx ].check = 0;
2206
2207 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2208 if ( flag || trie->trans[ stateidx + charid ].next ) {
2209 if ( trie->trans[ stateidx + charid ].next ) {
2210 if (o_used == 1) {
2211 for ( ; zp < pos ; zp++ ) {
2212 if ( ! trie->trans[ zp ].next ) {
2213 break;
2214 }
2215 }
2216 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2217 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2218 trie->trans[ zp ].check = state;
2219 if ( ++zp > pos ) pos = zp;
2220 break;
2221 }
2222 used--;
2223 }
2224 if ( !flag ) {
2225 flag = 1;
2226 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2227 }
2228 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2229 trie->trans[ pos ].check = state;
2230 pos++;
2231 }
2232 }
2233 }
cc601c31 2234 trie->lasttrans = pos + 1;
c944940b
JH
2235 trie->states = (reg_trie_state *)
2236 PerlMemShared_realloc( trie->states, laststate
2237 * sizeof(reg_trie_state) );
a3621e74 2238 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 2239 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2240 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2241 (int)depth * 2 + 2,"",
2242 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
2243 (IV)next_alloc,
2244 (IV)pos,
a3621e74
YO
2245 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2246 );
2247
2248 } /* end table compress */
2249 }
1e2e3d02
YO
2250 DEBUG_TRIE_COMPILE_MORE_r(
2251 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2252 (int)depth * 2 + 2, "",
2253 (UV)trie->statecount,
2254 (UV)trie->lasttrans)
2255 );
cc601c31 2256 /* resize the trans array to remove unused space */
c944940b
JH
2257 trie->trans = (reg_trie_trans *)
2258 PerlMemShared_realloc( trie->trans, trie->lasttrans
2259 * sizeof(reg_trie_trans) );
a3621e74 2260
3b753521 2261 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
2262 U8 nodetype =(U8)(flags & 0xFF);
2263 char *str=NULL;
786e8c11 2264
07be1b83 2265#ifdef DEBUGGING
e62cc96a 2266 regnode *optimize = NULL;
7122b237
YO
2267#ifdef RE_TRACK_PATTERN_OFFSETS
2268
b57a0404
JH
2269 U32 mjd_offset = 0;
2270 U32 mjd_nodelen = 0;
7122b237
YO
2271#endif /* RE_TRACK_PATTERN_OFFSETS */
2272#endif /* DEBUGGING */
a3621e74 2273 /*
3dab1dad
YO
2274 This means we convert either the first branch or the first Exact,
2275 depending on whether the thing following (in 'last') is a branch
2276 or not and whther first is the startbranch (ie is it a sub part of
2277 the alternation or is it the whole thing.)
3b753521 2278 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 2279 the whole branch sequence, including the first.
a3621e74 2280 */
3dab1dad 2281 /* Find the node we are going to overwrite */
7f69552c 2282 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 2283 /* branch sub-chain */
3dab1dad 2284 NEXT_OFF( first ) = (U16)(last - first);
7122b237 2285#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
2286 DEBUG_r({
2287 mjd_offset= Node_Offset((convert));
2288 mjd_nodelen= Node_Length((convert));
2289 });
7122b237 2290#endif
7f69552c 2291 /* whole branch chain */
7122b237
YO
2292 }
2293#ifdef RE_TRACK_PATTERN_OFFSETS
2294 else {
7f69552c
YO
2295 DEBUG_r({
2296 const regnode *nop = NEXTOPER( convert );
2297 mjd_offset= Node_Offset((nop));
2298 mjd_nodelen= Node_Length((nop));
2299 });
07be1b83
YO
2300 }
2301 DEBUG_OPTIMISE_r(
2302 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2303 (int)depth * 2 + 2, "",
786e8c11 2304 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2305 );
7122b237 2306#endif
3dab1dad
YO
2307 /* But first we check to see if there is a common prefix we can
2308 split out as an EXACT and put in front of the TRIE node. */
2309 trie->startstate= 1;
55eed653 2310 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2311 U32 state;
1e2e3d02 2312 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2313 U32 ofs = 0;
8e11feef
RGS
2314 I32 idx = -1;
2315 U32 count = 0;
2316 const U32 base = trie->states[ state ].trans.base;
a3621e74 2317
3dab1dad 2318 if ( trie->states[state].wordnum )
8e11feef 2319 count = 1;
a3621e74 2320
8e11feef 2321 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2322 if ( ( base + ofs >= trie->uniquecharcount ) &&
2323 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2324 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2325 {
3dab1dad 2326 if ( ++count > 1 ) {
2b8b4781 2327 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2328 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2329 if ( state == 1 ) break;
3dab1dad
YO
2330 if ( count == 2 ) {
2331 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2332 DEBUG_OPTIMISE_r(
8e11feef
RGS
2333 PerlIO_printf(Perl_debug_log,
2334 "%*sNew Start State=%"UVuf" Class: [",
2335 (int)depth * 2 + 2, "",
786e8c11 2336 (UV)state));
be8e71aa 2337 if (idx >= 0) {
2b8b4781 2338 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2339 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2340
3dab1dad 2341 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2342 if ( folder )
2343 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2344 DEBUG_OPTIMISE_r(
f1f66076 2345 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2346 );
8e11feef
RGS
2347 }
2348 }
2349 TRIE_BITMAP_SET(trie,*ch);
2350 if ( folder )
2351 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2352 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2353 }
2354 idx = ofs;
2355 }
3dab1dad
YO
2356 }
2357 if ( count == 1 ) {
2b8b4781 2358 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2359 STRLEN len;
2360 char *ch = SvPV( *tmp, len );
de734bd5
A
2361 DEBUG_OPTIMISE_r({
2362 SV *sv=sv_newmortal();
8e11feef
RGS
2363 PerlIO_printf( Perl_debug_log,
2364 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2365 (int)depth * 2 + 2, "",
de734bd5
A
2366 (UV)state, (UV)idx,
2367 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2368 PL_colors[0], PL_colors[1],
2369 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2370 PERL_PV_ESCAPE_FIRSTCHAR
2371 )
2372 );
2373 });
3dab1dad
YO
2374 if ( state==1 ) {
2375 OP( convert ) = nodetype;
2376 str=STRING(convert);
2377 STR_LEN(convert)=0;
2378 }
c490c714
YO
2379 STR_LEN(convert) += len;
2380 while (len--)
de734bd5 2381 *str++ = *ch++;
8e11feef 2382 } else {
f9049ba1 2383#ifdef DEBUGGING
8e11feef
RGS
2384 if (state>1)
2385 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2386#endif
8e11feef
RGS
2387 break;
2388 }
2389 }
2e64971a 2390 trie->prefixlen = (state-1);
3dab1dad 2391 if (str) {
8e11feef 2392 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2393 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2394 trie->startstate = state;
07be1b83
YO
2395 trie->minlen -= (state - 1);
2396 trie->maxlen -= (state - 1);
33809eae
JH
2397#ifdef DEBUGGING
2398 /* At least the UNICOS C compiler choked on this
2399 * being argument to DEBUG_r(), so let's just have
2400 * it right here. */
2401 if (
2402#ifdef PERL_EXT_RE_BUILD
2403 1
2404#else
2405 DEBUG_r_TEST
2406#endif
2407 ) {
2408 regnode *fix = convert;
2409 U32 word = trie->wordcount;
2410 mjd_nodelen++;
2411 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2412 while( ++fix < n ) {
2413 Set_Node_Offset_Length(fix, 0, 0);
2414 }
2415 while (word--) {
2416 SV ** const tmp = av_fetch( trie_words, word, 0 );
2417 if (tmp) {
2418 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2419 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2420 else
2421 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2422 }
2423 }
2424 }
2425#endif
8e11feef
RGS
2426 if (trie->maxlen) {
2427 convert = n;
2428 } else {
3dab1dad 2429 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2430 DEBUG_r(optimize= n);
3dab1dad
YO
2431 }
2432 }
2433 }
a5ca303d
YO
2434 if (!jumper)
2435 jumper = last;
3dab1dad 2436 if ( trie->maxlen ) {
8e11feef
RGS
2437 NEXT_OFF( convert ) = (U16)(tail - convert);
2438 ARG_SET( convert, data_slot );
786e8c11
YO
2439 /* Store the offset to the first unabsorbed branch in
2440 jump[0], which is otherwise unused by the jump logic.
2441 We use this when dumping a trie and during optimisation. */
2442 if (trie->jump)
7f69552c 2443 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2444
6c48061a
YO
2445 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2446 * and there is a bitmap
2447 * and the first "jump target" node we found leaves enough room
2448 * then convert the TRIE node into a TRIEC node, with the bitmap
2449 * embedded inline in the opcode - this is hypothetically faster.
2450 */
2451 if ( !trie->states[trie->startstate].wordnum
2452 && trie->bitmap
2453 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2454 {
2455 OP( convert ) = TRIEC;
2456 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2457 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2458 trie->bitmap= NULL;
2459 } else
2460 OP( convert ) = TRIE;
a3621e74 2461
3dab1dad
YO
2462 /* store the type in the flags */
2463 convert->flags = nodetype;
a5ca303d
YO
2464 DEBUG_r({
2465 optimize = convert
2466 + NODE_STEP_REGNODE
2467 + regarglen[ OP( convert ) ];
2468 });
2469 /* XXX We really should free up the resource in trie now,
2470 as we won't use them - (which resources?) dmq */
3dab1dad 2471 }
a3621e74 2472 /* needed for dumping*/
e62cc96a 2473 DEBUG_r(if (optimize) {
07be1b83 2474 regnode *opt = convert;
bcdf7404 2475
e62cc96a 2476 while ( ++opt < optimize) {
07be1b83
YO
2477 Set_Node_Offset_Length(opt,0,0);
2478 }
786e8c11
YO
2479 /*
2480 Try to clean up some of the debris left after the
2481 optimisation.
a3621e74 2482 */
786e8c11 2483 while( optimize < jumper ) {
07be1b83 2484 mjd_nodelen += Node_Length((optimize));
a3621e74 2485 OP( optimize ) = OPTIMIZED;
07be1b83 2486 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2487 optimize++;
2488 }
07be1b83 2489 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2490 });
2491 } /* end node insert */
2e64971a
DM
2492
2493 /* Finish populating the prev field of the wordinfo array. Walk back
2494 * from each accept state until we find another accept state, and if
2495 * so, point the first word's .prev field at the second word. If the
2496 * second already has a .prev field set, stop now. This will be the
2497 * case either if we've already processed that word's accept state,
3b753521
FN
2498 * or that state had multiple words, and the overspill words were
2499 * already linked up earlier.
2e64971a
DM
2500 */
2501 {
2502 U16 word;
2503 U32 state;
2504 U16 prev;
2505
2506 for (word=1; word <= trie->wordcount; word++) {
2507 prev = 0;
2508 if (trie->wordinfo[word].prev)
2509 continue;
2510 state = trie->wordinfo[word].accept;
2511 while (state) {
2512 state = prev_states[state];
2513 if (!state)
2514 break;
2515 prev = trie->states[state].wordnum;
2516 if (prev)
2517 break;
2518 }
2519 trie->wordinfo[word].prev = prev;
2520 }
2521 Safefree(prev_states);
2522 }
2523
2524
2525 /* and now dump out the compressed format */
2526 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2527
55eed653 2528 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2529#ifdef DEBUGGING
2530 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2531 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2532#else
03e70be4 2533 SvREFCNT_dec_NN(revcharmap);
07be1b83 2534#endif
786e8c11
YO
2535 return trie->jump
2536 ? MADE_JUMP_TRIE
2537 : trie->startstate>1
2538 ? MADE_EXACT_TRIE
2539 : MADE_TRIE;
2540}
2541
2542STATIC void
2543S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2544{
3b753521 2545/* The Trie is constructed and compressed now so we can build a fail array if it's needed
786e8c11
YO
2546
2547 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2548 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2549 ISBN 0-201-10088-6
2550
2551 We find the fail state for each state in the trie, this state is the longest proper
3b753521
FN
2552 suffix of the current state's 'word' that is also a proper prefix of another word in our
2553 trie. State 1 represents the word '' and is thus the default fail state. This allows
786e8c11
YO
2554 the DFA not to have to restart after its tried and failed a word at a given point, it
2555 simply continues as though it had been matching the other word in the first place.
2556 Consider
2557 'abcdgu'=~/abcdefg|cdgu/
2558 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
3b753521
FN
2559 fail, which would bring us to the state representing 'd' in the second word where we would
2560 try 'g' and succeed, proceeding to match 'cdgu'.
786e8c11
YO
2561 */
2562 /* add a fail transition */
3251b653
NC
2563 const U32 trie_offset = ARG(source);
2564 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2565 U32 *q;
2566 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2567 const U32 numstates = trie->statecount;
786e8c11
YO
2568 const U32 ubound = trie->lasttrans + ucharcount;
2569 U32 q_read = 0;
2570 U32 q_write = 0;
2571 U32 charid;
2572 U32 base = trie->states[ 1 ].trans.base;
2573 U32 *fail;
2574 reg_ac_data *aho;
cf78de0b 2575 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
786e8c11 2576 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2577
2578 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2579#ifndef DEBUGGING
2580 PERL_UNUSED_ARG(depth);
2581#endif
2582
2583
2584 ARG_SET( stclass, data_slot );
c944940b 2585 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2586 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2587 aho->trie=trie_offset;
446bd890
NC
2588 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2589 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2590 Newxz( q, numstates, U32);
c944940b 2591 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2592 aho->refcount = 1;
2593 fail = aho->fail;
2594 /* initialize fail[0..1] to be 1 so that we always have
2595 a valid final fail state */
2596 fail[ 0 ] = fail[ 1 ] = 1;
2597
2598 for ( charid = 0; charid < ucharcount ; charid++ ) {
2599 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2600 if ( newstate ) {
2601 q[ q_write ] = newstate;
2602 /* set to point at the root */
2603 fail[ q[ q_write++ ] ]=1;
2604 }
2605 }
2606 while ( q_read < q_write) {
2607 const U32 cur = q[ q_read++ % numstates ];
2608 base = trie->states[ cur ].trans.base;
2609
2610 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2611 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2612 if (ch_state) {
2613 U32 fail_state = cur;
2614 U32 fail_base;
2615 do {
2616 fail_state = fail[ fail_state ];
2617 fail_base = aho->states[ fail_state ].trans.base;
2618 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2619
2620 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2621 fail[ ch_state ] = fail_state;
2622 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2623 {
2624 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2625 }
2626 q[ q_write++ % numstates] = ch_state;
2627 }
2628 }
2629 }
2630 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2631 when we fail in state 1, this allows us to use the
2632 charclass scan to find a valid start char. This is based on the principle
2633 that theres a good chance the string being searched contains lots of stuff
2634 that cant be a start char.
2635 */
2636 fail[ 0 ] = fail[ 1 ] = 0;
2637 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2638 PerlIO_printf(Perl_debug_log,
2639 "%*sStclass Failtable (%"UVuf" states): 0",
2640 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2641 );
786e8c11
YO
2642 for( q_read=1; q_read<numstates; q_read++ ) {
2643 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2644 }
2645 PerlIO_printf(Perl_debug_log, "\n");
2646 });
2647 Safefree(q);
2648 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2649}
2650
786e8c11 2651
07be1b83 2652#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2653 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2654 SV * const mysv=sv_newmortal(); \
2655 regnode *Next = regnext(scan); \
2656 regprop(RExC_rx, mysv, scan); \
7f69552c 2657 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2658 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2659 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2660 }});
07be1b83 2661
1de06328 2662
bb914485 2663/* The below joins as many adjacent EXACTish nodes as possible into a single
0a982f06
KW
2664 * one. The regop may be changed if the node(s) contain certain sequences that
2665 * require special handling. The joining is only done if:
bb914485
KW
2666 * 1) there is room in the current conglomerated node to entirely contain the
2667 * next one.
2668 * 2) they are the exact same node type
2669 *
87b8b349 2670 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
bb914485
KW
2671 * these get optimized out
2672 *
0a982f06
KW
2673 * If a node is to match under /i (folded), the number of characters it matches
2674 * can be different than its character length if it contains a multi-character
2675 * fold. *min_subtract is set to the total delta of the input nodes.
bb914485 2676 *
a0c4c608
KW
2677 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2678 * and contains LATIN SMALL LETTER SHARP S
f758bddf 2679 *
bb914485 2680 * This is as good a place as any to discuss the design of handling these
0a982f06
KW
2681 * multi-character fold sequences. It's been wrong in Perl for a very long
2682 * time. There are three code points in Unicode whose multi-character folds
2683 * were long ago discovered to mess things up. The previous designs for
2684 * dealing with these involved assigning a special node for them. This
2685 * approach doesn't work, as evidenced by this example:
a0c4c608 2686 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
0a982f06
KW
2687 * Both these fold to "sss", but if the pattern is parsed to create a node that
2688 * would match just the \xDF, it won't be able to handle the case where a
bb914485
KW
2689 * successful match would have to cross the node's boundary. The new approach
2690 * that hopefully generally solves the problem generates an EXACTFU_SS node
2691 * that is "sss".
2692 *
0a982f06 2693 * It turns out that there are problems with all multi-character folds, and not
cb117658
KW
2694 * just these three. Now the code is general, for all such cases. The
2695 * approach taken is:
0a982f06
KW
2696 * 1) This routine examines each EXACTFish node that could contain multi-
2697 * character fold sequences. It returns in *min_subtract how much to
9d071ca8 2698 * subtract from the the actual length of the string to get a real minimum
0a982f06
KW
2699 * match length; it is 0 if there are no multi-char folds. This delta is
2700 * used by the caller to adjust the min length of the match, and the delta
2701 * between min and max, so that the optimizer doesn't reject these
2702 * possibilities based on size constraints.
cb117658 2703 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
0a982f06
KW
2704 * is used for an EXACTFU node that contains at least one "ss" sequence in
2705 * it. For non-UTF-8 patterns and strings, this is the only case where
2706 * there is a possible fold length change. That means that a regular
2707 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
2708 * with length changes, and so can be processed faster. regexec.c takes
2709 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
2710 * pre-folded by regcomp.c. This saves effort in regex matching.
87b8b349 2711 * However, the pre-folding isn't done for non-UTF8 patterns because the
0a982f06
KW
2712 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2713 * down by forcing the pattern into UTF8 unless necessary. Also what
2714 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
2715 * possibilities for the non-UTF8 patterns are quite simple, except for
2716 * the sharp s. All the ones that don't involve a UTF-8 target string are
2717 * members of a fold-pair, and arrays are set up for all of them so that
2718 * the other member of the pair can be found quickly. Code elsewhere in
2719 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2720 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
2721 * described in the next item.
098b07d5 2722 * 3) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
1ca267a5
KW
2723 * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
2724 * UTF-8 pattern.) An assumption that the optimizer part of regexec.c
2725 * (probably unwittingly, in Perl_regexec_flags()) makes is that a
2726 * character in the pattern corresponds to at most a single character in
2727 * the target string. (And I do mean character, and not byte here, unlike
2728 * other parts of the documentation that have never been updated to
2729 * account for multibyte Unicode.) sharp s in EXACTF nodes can match the
2730 * two character string 'ss'; in EXACTFA nodes it can match
2731 * "\x{17F}\x{17F}". These violate the assumption, and they are the only
2732 * instances where it is violated. I'm reluctant to try to change the
2733 * assumption, as the code involved is impenetrable to me (khw), so
2734 * instead the code here punts. This routine examines (when the pattern
2735 * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
2736 * boolean indicating whether or not the node contains a sharp s. When it
2737 * is true, the caller sets a flag that later causes the optimizer in this
2738 * file to not set values for the floating and fixed string lengths, and
2739 * thus avoids the optimizer code in regexec.c that makes the invalid
2740 * assumption. Thus, there is no optimization based on string lengths for
2741 * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
2742 * (The reason the assumption is wrong only in these two cases is that all
2743 * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
2744 * other folds to their expanded versions. We can't prefold sharp s to
2745 * 'ss' in EXACTF nodes because we don't know at compile time if it
2746 * actually matches 'ss' or not. It will match iff the target string is
2747 * in UTF-8, unlike the EXACTFU nodes, where it always matches; and
2748 * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8
2749 * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
2750 * but in a non-UTF8 pattern, folding it to that above-Latin1 string would
2751 * require the pattern to be forced into UTF-8, the overhead of which we
2752 * want to avoid.)
098b07d5
KW
2753 *
2754 * Similarly, the code that generates tries doesn't currently handle
2755 * not-already-folded multi-char folds, and it looks like a pain to change
2756 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
2757 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
2758 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
2759 * using /iaa matching will be doing so almost entirely with ASCII
2760 * strings, so this should rarely be encountered in practice */
1de06328 2761
9d071ca8 2762#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
07be1b83 2763 if (PL_regkind[OP(scan)] == EXACT) \
9d071ca8 2764 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
07be1b83 2765
be8e71aa 2766STATIC U32
9d071ca8 2767S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
07be1b83
YO
2768 /* Merge several consecutive EXACTish nodes into one. */
2769 regnode *n = regnext(scan);
2770 U32 stringok = 1;
2771 regnode *next = scan + NODE_SZ_STR(scan);
2772 U32 merged = 0;
2773 U32 stopnow = 0;
2774#ifdef DEBUGGING
2775 regnode *stop = scan;
72f13be8 2776 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2777#else
d47053eb
RGS
2778 PERL_UNUSED_ARG(depth);
2779#endif
7918f24d
NC
2780
2781 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2782#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2783 PERL_UNUSED_ARG(flags);
2784 PERL_UNUSED_ARG(val);
07be1b83 2785#endif
07be1b83 2786 DEBUG_PEEP("join",scan,depth);
bb914485 2787
3f410cf6
KW
2788 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2789 * EXACT ones that are mergeable to the current one. */
2790 while (n
2791 && (PL_regkind[OP(n)] == NOTHING
2792 || (stringok && OP(n) == OP(scan)))
07be1b83 2793 && NEXT_OFF(n)
3f410cf6
KW
2794 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2795 {
07be1b83
YO
2796
2797 if (OP(n) == TAIL || n > next)
2798 stringok = 0;
2799 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2800 DEBUG_PEEP("skip:",n,depth);
2801 NEXT_OFF(scan) += NEXT_OFF(n);
2802 next = n + NODE_STEP_REGNODE;
2803#ifdef DEBUGGING
2804 if (stringok)
2805 stop = n;
2806#endif
2807 n = regnext(n);
2808 }
2809 else if (stringok) {
786e8c11 2810 const unsigned int oldl = STR_LEN(scan);
07be1b83 2811 regnode * const nnext = regnext(n);
b2230d39 2812
87b8b349
KW
2813 /* XXX I (khw) kind of doubt that this works on platforms where
2814 * U8_MAX is above 255 because of lots of other assumptions */
79a81a6e 2815 /* Don't join if the sum can't fit into a single node */
b2230d39
KW
2816 if (oldl + STR_LEN(n) > U8_MAX)
2817 break;
07be1b83
YO
2818
2819 DEBUG_PEEP("merg",n,depth);
07be1b83 2820 merged++;
b2230d39 2821
07be1b83
YO
2822 NEXT_OFF(scan) += NEXT_OFF(n);
2823 STR_LEN(scan) += STR_LEN(n);
2824 next = n + NODE_SZ_STR(n);
2825 /* Now we can overwrite *n : */
2826 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2827#ifdef DEBUGGING
2828 stop = next - 1;
2829#endif
2830 n = nnext;
2831 if (stopnow) break;
2832 }
2833
d47053eb
RGS
2834#ifdef EXPERIMENTAL_INPLACESCAN
2835 if (flags && !NEXT_OFF(n)) {
2836 DEBUG_PEEP("atch", val, depth);
2837 if (reg_off_by_arg[OP(n)]) {
2838 ARG_SET(n, val - n);
2839 }
2840 else {
2841 NEXT_OFF(n) = val - n;
2842 }
2843 stopnow = 1;
2844 }
07be1b83
YO
2845#endif
2846 }
2c2b7f86 2847
9d071ca8 2848 *min_subtract = 0;
f758bddf 2849 *has_exactf_sharp_s = FALSE;
f646642f 2850
3f410cf6
KW
2851 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2852 * can now analyze for sequences of problematic code points. (Prior to
2853 * this final joining, sequences could have been split over boundaries, and
a0c4c608
KW
2854 * hence missed). The sequences only happen in folding, hence for any
2855 * non-EXACT EXACTish node */
86d6fcad 2856 if (OP(scan) != EXACT) {
0a982f06
KW
2857 const U8 * const s0 = (U8*) STRING(scan);
2858 const U8 * s = s0;
2859 const U8 * const s_end = s0 + STR_LEN(scan);
f758bddf
KW
2860
2861 /* One pass is made over the node's string looking for all the
2862 * possibilities. to avoid some tests in the loop, there are two main
2863 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2864 * non-UTF-8 */
2865 if (UTF) {
86d6fcad 2866
0a982f06
KW
2867 /* Examine the string for a multi-character fold sequence. UTF-8
2868 * patterns have all characters pre-folded by the time this code is
2869 * executed */
2870 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2871 length sequence we are looking for is 2 */
86d6fcad 2872 {
0a982f06
KW
2873 int count = 0;
2874 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2875 if (! len) { /* Not a multi-char fold: get next char */
2876 s += UTF8SKIP(s);
2877 continue;
2878 }
bb914485 2879
0a982f06 2880 /* Nodes with 'ss' require special handling, except for EXACTFL
098b07d5
KW
2881 * and EXACTFA-ish for which there is no multi-char fold to
2882 * this */
0a982f06 2883 if (len == 2 && *s == 's' && *(s+1) == 's'
098b07d5
KW
2884 && OP(scan) != EXACTFL
2885 && OP(scan) != EXACTFA
2886 && OP(scan) != EXACTFA_NO_TRIE)
0a982f06
KW
2887 {
2888 count = 2;
2889 OP(scan) = EXACTFU_SS;
2890 s += 2;
2891 }
0a982f06
KW
2892 else { /* Here is a generic multi-char fold. */
2893 const U8* multi_end = s + len;
2894
2895 /* Count how many characters in it. In the case of /l and
2896 * /aa, no folds which contain ASCII code points are
2897 * allowed, so check for those, and skip if found. (In
2898 * EXACTFL, no folds are allowed to any Latin1 code point,
2899 * not just ASCII. But there aren't any of these
2900 * currently, nor ever likely, so don't take the time to
2901 * test for them. The code that generates the
2902 * is_MULTI_foo() macros croaks should one actually get put
2903 * into Unicode .) */
098b07d5
KW
2904 if (OP(scan) != EXACTFL
2905 && OP(scan) != EXACTFA
2906 && OP(scan) != EXACTFA_NO_TRIE)
2907 {
0a982f06
KW
2908 count = utf8_length(s, multi_end);
2909 s = multi_end;
2910 }
2911 else {
2912 while (s < multi_end) {
2913 if (isASCII(*s)) {
2914 s++;
2915 goto next_iteration;
2916 }
2917 else {
2918 s += UTF8SKIP(s);
2919 }
2920 count++;
2921 }
2922 }
2923 }
f758bddf 2924
0a982f06
KW
2925 /* The delta is how long the sequence is minus 1 (1 is how long
2926 * the character that folds to the sequence is) */
2927 *min_subtract += count - 1;
2928 next_iteration: ;
bb914485
KW
2929 }
2930 }
1ca267a5
KW
2931 else if (OP(scan) == EXACTFA) {
2932
2933 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
2934 * fold to the ASCII range (and there are no existing ones in the
2935 * upper latin1 range). But, as outlined in the comments preceding
098b07d5
KW
2936 * this function, we need to flag any occurrences of the sharp s.
2937 * This character forbids trie formation (because of added
2938 * complexity) */
1ca267a5
KW
2939 while (s < s_end) {
2940 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
098b07d5 2941 OP(scan) = EXACTFA_NO_TRIE;
1ca267a5
KW
2942 *has_exactf_sharp_s = TRUE;
2943 break;
2944 }
2945 s++;
2946 continue;
2947 }
2948 }
2949 else if (OP(scan) != EXACTFL) {
2950
2951 /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the
2952 * multi-char folds that are all Latin1. (This code knows that
2953 * there are no current multi-char folds possible with EXACTFL,
2954 * relying on fold_grind.t to catch any errors if the very unlikely
2955 * event happens that some get added in future Unicode versions.)
2956 * As explained in the comments preceding this function, we look
2957 * also for the sharp s in EXACTF nodes; it can be in the final
0a982f06
KW
2958 * position. Otherwise we can stop looking 1 byte earlier because
2959 * have to find at least two characters for a multi-fold */
f758bddf
KW
2960 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2961
0a982f06 2962 while (s < upper) {
40b1ba4f 2963 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
0a982f06
KW
2964 if (! len) { /* Not a multi-char fold. */
2965 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2966 {
2967 *has_exactf_sharp_s = TRUE;
2968 }
2969 s++;
2970 continue;
2971 }
2972
2973 if (len == 2
c02c3054
KW
2974 && isARG2_lower_or_UPPER_ARG1('s', *s)
2975 && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
0a982f06
KW
2976 {
2977
2978 /* EXACTF nodes need to know that the minimum length
2979 * changed so that a sharp s in the string can match this
2980 * ss in the pattern, but they remain EXACTF nodes, as they
2981 * won't match this unless the target string is is UTF-8,
2982 * which we don't know until runtime */
2983 if (OP(scan) != EXACTF) {
2984 OP(scan) = EXACTFU_SS;
2985 }
86d6fcad 2986 }
0a982f06
KW
2987
2988 *min_subtract += len - 1;
2989 s += len;
86d6fcad
KW
2990 }
2991 }
07be1b83 2992 }
3f410cf6 2993
07be1b83 2994#ifdef DEBUGGING
bb789b09
DM
2995 /* Allow dumping but overwriting the collection of skipped
2996 * ops and/or strings with fake optimized ops */
07be1b83
YO
2997 n = scan + NODE_SZ_STR(scan);
2998 while (n <= stop) {
bb789b09
DM
2999 OP(n) = OPTIMIZED;
3000 FLAGS(n) = 0;
3001 NEXT_OFF(n) = 0;
07be1b83
YO
3002 n++;
3003 }
3004#endif
3005 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3006 return stopnow;
3007}
3008
486ec47a 3009/* REx optimizer. Converts nodes into quicker variants "in place".
653099ff
GS
3010 Finds fixed substrings. */
3011
a0288114 3012/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
3013 to the position after last scanned or to NULL. */
3014
40d049e4
YO
3015#define INIT_AND_WITHP \
3016 assert(!and_withp); \
b8f7bb16 3017 Newx(and_withp,1, regnode_ssc); \
40d049e4 3018 SAVEFREEPV(and_withp)
07be1b83 3019
b515a41d 3020/* this is a chain of data about sub patterns we are processing that
486ec47a 3021 need to be handled separately/specially in study_chunk. Its so
b515a41d
YO
3022 we can simulate recursion without losing state. */
3023struct scan_frame;
3024typedef struct scan_frame {
3025 regnode *last; /* last node to process in this frame */
3026 regnode *next; /* next node to process when last is reached */
3027 struct scan_frame *prev; /*previous frame*/
3028 I32 stop; /* what stopparen do we use */
3029} scan_frame;
3030
304ee84b
YO
3031
3032#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3033
49f55535 3034STATIC SSize_t
40d049e4 3035S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
ea3daa5d 3036 SSize_t *minlenp, SSize_t *deltap,
40d049e4
YO
3037 regnode *last,
3038 scan_data_t *data,
3039 I32 stopparen,
3040 U8* recursed,
b8f7bb16 3041 regnode_ssc *and_withp,
40d049e4 3042 U32 flags, U32 depth)
c277df42
IZ
3043 /* scanp: Start here (read-write). */
3044 /* deltap: Write maxlen-minlen here. */
3045 /* last: Stop before this one. */
40d049e4
YO
3046 /* data: string data about the pattern */
3047 /* stopparen: treat close N as END */
3048 /* recursed: which subroutines have we recursed into */
3049 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 3050{
97aff369 3051 dVAR;
ea3daa5d
FC
3052 /* There must be at least this number of characters to match */
3053 SSize_t min = 0;
2d608413 3054 I32 pars = 0, code;
c277df42 3055 regnode *scan = *scanp, *next;
ea3daa5d 3056 SSize_t delta = 0;
c277df42 3057 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 3058 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
3059 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3060 scan_data_t data_fake;
a3621e74 3061 SV *re_trie_maxbuff = NULL;
786e8c11 3062 regnode *first_non_open = scan;
ea3daa5d 3063 SSize_t stopmin = SSize_t_MAX;
8aa23a47 3064 scan_frame *frame = NULL;
a3621e74 3065 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 3066
7918f24d
NC
3067 PERL_ARGS_ASSERT_STUDY_CHUNK;
3068
13a24bad 3069#ifdef DEBUGGING
40d049e4 3070 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 3071#endif
40d049e4 3072
786e8c11 3073 if ( depth == 0 ) {
40d049e4 3074 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
3075 first_non_open=regnext(first_non_open);
3076 }
3077
b81d288d 3078
8aa23a47
YO
3079 fake_study_recurse:
3080 while ( scan && OP(scan) != END && scan < last ){
2d608413
KW
3081 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3082 node length to get a real minimum (because
3083 the folded version may be shorter) */
f758bddf 3084 bool has_exactf_sharp_s = FALSE;
8aa23a47 3085 /* Peephole optimizer: */
304ee84b 3086 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47 3087 DEBUG_PEEP("Peep",scan,depth);
a0c4c608
KW
3088
3089 /* Its not clear to khw or hv why this is done here, and not in the
3090 * clauses that deal with EXACT nodes. khw's guess is that it's
3091 * because of a previous design */
9d071ca8 3092 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
8aa23a47
YO
3093
3094 /* Follow the next-chain of the current node and optimize
3095 away all the NOTHINGs from it. */
3096 if (OP(scan) != CURLYX) {
3097 const int max = (reg_off_by_arg[OP(scan)]
3098 ? I32_MAX
3099 /* I32 may be smaller than U16 on CRAYs! */
3100 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3101 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3102 int noff;
3103 regnode *n = scan;
686b73d4 3104
8aa23a47
YO
3105 /* Skip NOTHING and LONGJMP. */
3106 while ((n = regnext(n))
3107 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3108 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3109 && off + noff < max)
3110 off += noff;
3111 if (reg_off_by_arg[OP(scan)])
3112 ARG(scan) = off;
3113 else
3114 NEXT_OFF(scan) = off;
3115 }
a3621e74 3116
c277df42 3117
8aa23a47
YO
3118
3119 /* The principal pseudo-switch. Cannot be a switch, since we
3120 look into several different things. */
3121 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3122 || OP(scan) == IFTHEN) {
3123 next = regnext(scan);
3124 code = OP(scan);
3125 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
686b73d4 3126
8aa23a47
YO
3127 if (OP(next) == code || code == IFTHEN) {
3128 /* NOTE - There is similar code to this block below for handling
3129 TRIE nodes on a re-study. If you change stuff here check there
3130 too. */
49f55535 3131 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
b8f7bb16 3132 regnode_ssc accum;
8aa23a47 3133 regnode * const startbranch=scan;
686b73d4 3134
8aa23a47 3135 if (flags & SCF_DO_SUBSTR)
304ee84b 3136 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47 3137 if (flags & SCF_DO_STCLASS)
557bd3fb 3138 ssc_init_zero(pRExC_state, &accum);
8aa23a47
YO
3139
3140 while (OP(scan) == code) {
49f55535
FC
3141 SSize_t deltanext, minnext, fake;
3142 I32 f = 0;
b8f7bb16 3143 regnode_ssc this_class;
8aa23a47
YO
3144
3145 num++;
3146 data_fake.flags = 0;
3147 if (data) {
3148 data_fake.whilem_c = data->whilem_c;
3149 data_fake.last_closep = data->last_closep;
3150 }
3151 else
3152 data_fake.last_closep = &fake;
58e23c8d
YO
3153
3154 data_fake.pos_delta = delta;
8aa23a47
YO
3155 next = regnext(scan);
3156 scan = NEXTOPER(scan);
3157 if (code != BRANCH)
c277df42 3158 scan = NEXTOPER(scan);
8aa23a47 3159 if (flags & SCF_DO_STCLASS) {
557bd3fb 3160 ssc_init(pRExC_state, &this_class);
8aa23a47
YO
3161 data_fake.start_class = &this_class;
3162 f = SCF_DO_STCLASS_AND;
58e23c8d 3163 }
8aa23a47
YO
3164 if (flags & SCF_WHILEM_VISITED_POS)
3165 f |= SCF_WHILEM_VISITED_POS;
3166
3167 /* we suppose the run is continuous, last=next...*/
3168 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3169 next, &data_fake,
3170 stopparen, recursed, NULL, f,depth+1);
3171 if (min1 > minnext)
3172 min1 = minnext;
ea3daa5d 3173 if (deltanext == SSize_t_MAX) {
8aa23a47 3174 is_inf = is_inf_internal = 1;
ea3daa5d 3175 max1 = SSize_t_MAX;
9b139d09
GG
3176 } else if (max1 < minnext + deltanext)
3177 max1 = minnext + deltanext;
8aa23a47
YO
3178 scan = next;
3179 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3180 pars++;
3181 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3182 if ( stopmin > minnext)
3183 stopmin = min + min1;
3184 flags &= ~SCF_DO_SUBSTR;
3185 if (data)
3186 data->flags |= SCF_SEEN_ACCEPT;
3187 }
3188 if (data) {
3189 if (data_fake.flags & SF_HAS_EVAL)
3190 data->flags |= SF_HAS_EVAL;
3191 data->whilem_c = data_fake.whilem_c;
3dab1dad 3192 }
8aa23a47 3193 if (flags & SCF_DO_STCLASS)
557bd3fb 3194 ssc_or(pRExC_state, &accum, &this_class);
8aa23a47
YO
3195 }
3196 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3197 min1 = 0;
3198 if (flags & SCF_DO_SUBSTR) {
3199 data->pos_min += min1;
ea3daa5d
FC
3200 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3201 data->pos_delta = SSize_t_MAX;
9b139d09
GG
3202 else
3203 data->pos_delta += max1 - min1;
8aa23a47
YO
3204 if (max1 != min1 || is_inf)
3205 data->longest = &(data->longest_float);
3206 }
3207 min += min1;
ea3daa5d
FC
3208 if (delta == SSize_t_MAX
3209 || SSize_t_MAX - delta - (max1 - min1) < 0)
3210 delta = SSize_t_MAX;
9b139d09
GG
3211 else
3212 delta += max1 - min1;
8aa23a47 3213 if (flags & SCF_DO_STCLASS_OR) {
557bd3fb 3214 ssc_or(pRExC_state, data->start_class, &accum);
8aa23a47 3215 if (min1) {
1ca93ef1 3216 ssc_and(pRExC_state, data->start_class, and_withp);
8aa23a47 3217 flags &= ~SCF_DO_STCLASS;
653099ff 3218 }
8aa23a47
YO
3219 }
3220 else if (flags & SCF_DO_STCLASS_AND) {
3221 if (min1) {
1ca93ef1 3222 ssc_and(pRExC_state, data->start_class, &accum);
8aa23a47 3223 flags &= ~SCF_DO_STCLASS;
de0c8cb8 3224 }
8aa23a47
YO
3225 else {
3226 /* Switch to OR mode: cache the old value of
3227 * data->start_class */
3228 INIT_AND_WITHP;
b8f7bb16 3229 StructCopy(data->start_class, and_withp, regnode_ssc);
8aa23a47 3230 flags &= ~SCF_DO_STCLASS_AND;
b8f7bb16 3231 StructCopy(&accum, data->start_class, regnode_ssc);
8aa23a47 3232 flags |= SCF_DO_STCLASS_OR;
899d20b9 3233 SET_SSC_EOS(data->start_class);
de0c8cb8 3234 }
8aa23a47 3235 }
a3621e74 3236
8aa23a47
YO
3237 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3238 /* demq.
a3621e74 3239
8aa23a47
YO
3240 Assuming this was/is a branch we are dealing with: 'scan' now
3241 points at the item that follows the branch sequence, whatever
3242 it is. We now start at the beginning of the sequence and look
3243 for subsequences of
a3621e74 3244
8aa23a47
YO
3245 BRANCH->EXACT=>x1
3246 BRANCH->EXACT=>x2
3247 tail
a3621e74 3248
8aa23a47 3249 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 3250
486ec47a 3251 If we can find such a subsequence we need to turn the first
8aa23a47
YO
3252 element into a trie and then add the subsequent branch exact
3253 strings to the trie.
a3621e74 3254
8aa23a47 3255 We have two cases
a3621e74 3256
3b753521 3257 1. patterns where the whole set of branches can be converted.
a3621e74 3258
8aa23a47 3259 2. patterns where only a subset can be converted.
a3621e74 3260
8aa23a47
YO
3261 In case 1 we can replace the whole set with a single regop
3262 for the trie. In case 2 we need to keep the start and end
3b753521 3263 branches so
a3621e74 3264
8aa23a47
YO
3265 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3266 becomes BRANCH TRIE; BRANCH X;
786e8c11 3267
8aa23a47
YO
3268 There is an additional case, that being where there is a
3269 common prefix, which gets split out into an EXACT like node
3270 preceding the TRIE node.
a3621e74 3271
8aa23a47
YO
3272 If x(1..n)==tail then we can do a simple trie, if not we make
3273 a "jump" trie, such that when we match the appropriate word
486ec47a 3274 we "jump" to the appropriate tail node. Essentially we turn
8aa23a47 3275 a nested if into a case structure of sorts.
b515a41d 3276
8aa23a47 3277 */
686b73d4 3278
8aa23a47
YO
3279 int made=0;
3280 if (!re_trie_maxbuff) {
3281 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3282 if (!SvIOK(re_trie_maxbuff))
3283 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3284 }
3285 if ( SvIV(re_trie_maxbuff)>=0 ) {
3286 regnode *cur;
3287 regnode *first = (regnode *)NULL;
3288 regnode *last = (regnode *)NULL;
3289 regnode *tail = scan;
fab2782b 3290 U8 trietype = 0;
8aa23a47 3291 U32 count=0;
a3621e74
YO
3292
3293#ifdef DEBUGGING
8aa23a47 3294 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 3295#endif
8aa23a47
YO
3296 /* var tail is used because there may be a TAIL
3297 regop in the way. Ie, the exacts will point to the
3298 thing following the TAIL, but the last branch will
3299 point at the TAIL. So we advance tail. If we
3300 have nested (?:) we may have to move through several
3301 tails.
3302 */
3303
3304 while ( OP( tail ) == TAIL ) {
3305 /* this is the TAIL generated by (?:) */
3306 tail = regnext( tail );
3307 }
a3621e74 3308
8aa23a47 3309
df826430 3310 DEBUG_TRIE_COMPILE_r({
8aa23a47
YO
3311 regprop(RExC_rx, mysv, tail );
3312 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3313 (int)depth * 2 + 2, "",
3314 "Looking for TRIE'able sequences. Tail node is: ",
3315 SvPV_nolen_const( mysv )
3316 );
3317 });
3318
3319 /*
3320
fab2782b
YO
3321 Step through the branches
3322 cur represents each branch,
3323 noper is the first thing to be matched as part of that branch
3324 noper_next is the regnext() of that node.
3325
3326 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3327 via a "jump trie" but we also support building with NOJUMPTRIE,
3328 which restricts the trie logic to structures like /FOO|BAR/.
3329
3330 If noper is a trieable nodetype then the branch is a possible optimization
3331 target. If we are building under NOJUMPTRIE then we require that noper_next
3332 is the same as scan (our current position in the regex program).
3333
3334 Once we have two or more consecutive such branches we can create a
3335 trie of the EXACT's contents and stitch it in place into the program.
3336
3337 If the sequence represents all of the branches in the alternation we
3338 replace the entire thing with a single TRIE node.
3339
3340 Otherwise when it is a subsequence we need to stitch it in place and
3341 replace only the relevant branches. This means the first branch has
3342 to remain as it is used by the alternation logic, and its next pointer,
3343 and needs to be repointed at the item on the branch chain following
3344 the last branch we have optimized away.
3345
3346 This could be either a BRANCH, in which case the subsequence is internal,
3347 or it could be the item following the branch sequence in which case the
3348 subsequence is at the end (which does not necessarily mean the first node
3349 is the start of the alternation).
3350
3351 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3352
3353 optype | trietype
3354 ----------------+-----------
3355 NOTHING | NOTHING
3356 EXACT | EXACT
3357 EXACTFU | EXACTFU
3358 EXACTFU_SS | EXACTFU
098b07d5 3359 EXACTFA | EXACTFA
fab2782b 3360
8aa23a47
YO
3361
3362 */
fab2782b
YO
3363#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3364 ( EXACT == (X) ) ? EXACT : \
cb117658 3365 ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
098b07d5 3366 ( EXACTFA == (X) ) ? EXACTFA : \
fab2782b 3367 0 )
8aa23a47
YO
3368
3369 /* dont use tail as the end marker for this traverse */
3370 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3371 regnode * const noper = NEXTOPER( cur );
fab2782b
YO
3372 U8 noper_type = OP( noper );
3373 U8 noper_trietype = TRIE_TYPE( noper_type );
b515a41d 3374#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 3375 regnode * const noper_next = regnext( noper );
df826430
YO
3376 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3377 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
b515a41d
YO
3378#endif
3379
df826430 3380 DEBUG_TRIE_COMPILE_r({
8aa23a47
YO
3381 regprop(RExC_rx, mysv, cur);
3382 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3383 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3384
3385 regprop(RExC_rx, mysv, noper);
3386 PerlIO_printf( Perl_debug_log, " -> %s",
3387 SvPV_nolen_const(mysv));
3388
3389 if ( noper_next ) {
3390 regprop(RExC_rx, mysv, noper_next );
3391 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3392 SvPV_nolen_const(mysv));
3393 }
df826430
YO
3394 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3395 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3396 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3397 );
8aa23a47 3398 });
fab2782b
YO
3399
3400 /* Is noper a trieable nodetype that can be merged with the
3401 * current trie (if there is one)? */
3402 if ( noper_trietype
3403 &&
3404 (
df826430
YO
3405 ( noper_trietype == NOTHING)
3406 || ( trietype == NOTHING )
a40630bf 3407 || ( trietype == noper_trietype )
fab2782b 3408 )
786e8c11 3409#ifdef NOJUMPTRIE
8aa23a47 3410 && noper_next == tail
786e8c11 3411#endif
8aa23a47
YO
3412 && count < U16_MAX)
3413 {
fab2782b
YO
3414 /* Handle mergable triable node
3415 * Either we are the first node in a new trieable sequence,
3416 * in which case we do some bookkeeping, otherwise we update
3417 * the end pointer. */
fab2782b 3418 if ( !first ) {
3b6759a6 3419 first = cur;
df826430
YO
3420 if ( noper_trietype == NOTHING ) {
3421#if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3422 regnode * const noper_next = regnext( noper );
3b6759a6 3423 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
df826430
YO
3424 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3425#endif
3426
190c1910 3427 if ( noper_next_trietype ) {
df826430 3428 trietype = noper_next_trietype;
190c1910
YO
3429 } else if (noper_next_type) {
3430 /* a NOTHING regop is 1 regop wide. We need at least two
3431 * for a trie so we can't merge this in */
3432 first = NULL;
3433 }
3434 } else {
3435 trietype = noper_trietype;
3b6759a6 3436 }
8aa23a47 3437 } else {
fab2782b
YO
3438 if ( trietype == NOTHING )
3439 trietype = noper_trietype;
8aa23a47
YO
3440 last = cur;
3441 }
df826430
YO
3442 if (first)
3443 count++;
fab2782b
YO
3444 } /* end handle mergable triable node */
3445 else {
3446 /* handle unmergable node -
3447 * noper may either be a triable node which can not be tried
3448 * together with the current trie, or a non triable node */
729aaeb5
YO
3449 if ( last ) {
3450 /* If last is set and trietype is not NOTHING then we have found
3451 * at least two triable branch sequences in a row of a similar
3452 * trietype so we can turn them into a trie. If/when we
3453 * allow NOTHING to start a trie sequence this condition will be
3454 * required, and it isn't expensive so we leave it in for now. */
e6351b37 3455 if ( trietype && trietype != NOTHING )
729aaeb5
YO
3456 make_trie( pRExC_state,
3457 startbranch, first, cur, tail, count,
3458 trietype, depth+1 );
fab2782b 3459 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
8aa23a47 3460 }
fab2782b 3461 if ( noper_trietype
786e8c11 3462#ifdef NOJUMPTRIE
8aa23a47 3463 && noper_next == tail
786e8c11 3464#endif
8aa23a47 3465 ){
fab2782b 3466 /* noper is triable, so we can start a new trie sequence */
8aa23a47
YO
3467 count = 1;
3468 first = cur;
fab2782b
YO
3469 trietype = noper_trietype;
3470 } else if (first) {
3471 /* if we already saw a first but the current node is not triable then we have
3472 * to reset the first information. */
8aa23a47
YO
3473 count = 0;
3474 first = NULL;
fab2782b 3475 trietype = 0;
8aa23a47 3476 }
fab2782b
YO
3477 } /* end handle unmergable node */
3478 } /* loop over branches */
df826430 3479 DEBUG_TRIE_COMPILE_r({
8aa23a47
YO
3480 regprop(RExC_rx, mysv, cur);
3481 PerlIO_printf( Perl_debug_log,
3482 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3483 "&q