This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Use STR_WITH_LEN to avoid bookkeeping
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
40b6423c 84extern const struct regexp_engine my_reg_engine;
54df2634
NC
85#else
86# include "regcomp.h"
87#endif
a687059c 88
04e98a4d 89#include "dquote_static.c"
26faadbd 90#include "charclass_invlists.h"
81e983c1 91#include "inline_invlist.c"
1b0f46bf 92#include "unicode_constants.h"
04e98a4d 93
94dc5c2d 94#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 95#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 96#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
b1603ef8 102
09b2b2e6 103struct RExC_state_t {
514a91f1
DM
104 U32 flags; /* RXf_* are we folding, multilining? */
105 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 106 char *precomp; /* uncompiled string. */
288b8c02 107 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
108 regexp *rx; /* perl core regexp structure */
109 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 110 char *start; /* Start of input for compile */
830247a4
IZ
111 char *end; /* End of input for compile */
112 char *parse; /* Input-scan pointer. */
ea3daa5d 113 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 114 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 115 regnode *emit_bound; /* First regnode outside of the allocated space */
f7c7e32a
DM
116 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
117 implies compiling, so don't emit */
118 regnode emit_dummy; /* placeholder for emit to point to */
830247a4
IZ
119 I32 naughty; /* How bad is this pattern? */
120 I32 sawback; /* Did we see \1, ...? */
121 U32 seen;
ea3daa5d 122 SSize_t size; /* Code size. */
c74340f9
YO
123 I32 npar; /* Capture buffer count, (OPEN). */
124 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 125 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
126 I32 extralen;
127 I32 seen_zerolen;
40d049e4
YO
128 regnode **open_parens; /* pointers to open parens */
129 regnode **close_parens; /* pointers to close parens */
130 regnode *opend; /* END node in program */
02daf0ab
YO
131 I32 utf8; /* whether the pattern is utf8 or not */
132 I32 orig_utf8; /* whether the pattern was originally in utf8 */
133 /* XXX use this for future optimisation of case
134 * where pattern must be upgraded to utf8. */
e40e74fe
KW
135 I32 uni_semantics; /* If a d charset modifier should use unicode
136 rules, even if the pattern is not in
137 utf8 */
81714fb9 138 HV *paren_names; /* Paren names */
1f1031fe 139
40d049e4
YO
140 regnode **recurse; /* Recurse regops */
141 I32 recurse_count; /* Number of recurse regops */
b57e4118 142 I32 in_lookbehind;
4624b182 143 I32 contains_locale;
bb3f3ed2 144 I32 override_recoding;
9d53c457 145 I32 in_multi_char_class;
3d2bd50a 146 struct reg_code_block *code_blocks; /* positions of literal (?{})
68e2671b 147 within pattern */
b1603ef8
DM
148 int num_code_blocks; /* size of code_blocks[] */
149 int code_index; /* next code_blocks[] slot */
830247a4
IZ
150#if ADD_TO_REGEXEC
151 char *starttry; /* -Dr: where regtry was called. */
152#define RExC_starttry (pRExC_state->starttry)
153#endif
d24ca0c5 154 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 155#ifdef DEBUGGING
be8e71aa 156 const char *lastparse;
3dab1dad 157 I32 lastnum;
1f1031fe 158 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
159#define RExC_lastparse (pRExC_state->lastparse)
160#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 161#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 162#endif
09b2b2e6 163};
830247a4 164
e2509266 165#define RExC_flags (pRExC_state->flags)
514a91f1 166#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 167#define RExC_precomp (pRExC_state->precomp)
288b8c02 168#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 169#define RExC_rx (pRExC_state->rx)
f8fc2ecf 170#define RExC_rxi (pRExC_state->rxi)
fac92740 171#define RExC_start (pRExC_state->start)
830247a4
IZ
172#define RExC_end (pRExC_state->end)
173#define RExC_parse (pRExC_state->parse)
174#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
175#ifdef RE_TRACK_PATTERN_OFFSETS
176#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
177#endif
830247a4 178#define RExC_emit (pRExC_state->emit)
f7c7e32a 179#define RExC_emit_dummy (pRExC_state->emit_dummy)
fac92740 180#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 181#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
182#define RExC_naughty (pRExC_state->naughty)
183#define RExC_sawback (pRExC_state->sawback)
184#define RExC_seen (pRExC_state->seen)
185#define RExC_size (pRExC_state->size)
186#define RExC_npar (pRExC_state->npar)
e2e6a0f1 187#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
188#define RExC_extralen (pRExC_state->extralen)
189#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 190#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 191#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 192#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
193#define RExC_open_parens (pRExC_state->open_parens)
194#define RExC_close_parens (pRExC_state->close_parens)
195#define RExC_opend (pRExC_state->opend)
81714fb9 196#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
197#define RExC_recurse (pRExC_state->recurse)
198#define RExC_recurse_count (pRExC_state->recurse_count)
b57e4118 199#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 200#define RExC_contains_locale (pRExC_state->contains_locale)
9d53c457
KW
201#define RExC_override_recoding (pRExC_state->override_recoding)
202#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
830247a4 203
cde0cee5 204
a687059c
LW
205#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
206#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
4d68ffa0 207 ((*s) == '{' && regcurly(s, FALSE)))
a687059c
LW
208
209/*
210 * Flags to be passed up and down.
211 */
a687059c 212#define WORST 0 /* Worst case. */
a3b492c3 213#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee 214
e64f369d 215/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
216 * character. (There needs to be a case: in the switch statement in regexec.c
217 * for any node marked SIMPLE.) Note that this is not the same thing as
218 * REGNODE_SIMPLE */
fda99bee 219#define SIMPLE 0x02
e64f369d 220#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
221#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
222#define TRYAGAIN 0x10 /* Weeded out a declaration. */
223#define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
a687059c 224
3dab1dad
YO
225#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
226
07be1b83
YO
227/* whether trie related optimizations are enabled */
228#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
229#define TRIE_STUDY_OPT
786e8c11 230#define FULL_TRIE_STUDY
07be1b83
YO
231#define TRIE_STCLASS
232#endif
1de06328
YO
233
234
40d049e4
YO
235
236#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
237#define PBITVAL(paren) (1 << ((paren) & 7))
238#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
239#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
240#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
241
bbd61b5f 242#define REQUIRE_UTF8 STMT_START { \
8d9c2815
NC
243 if (!UTF) { \
244 *flagp = RESTART_UTF8; \
245 return NULL; \
246 } \
bbd61b5f 247 } STMT_END
40d049e4 248
f19b1a63
KW
249/* This converts the named class defined in regcomp.h to its equivalent class
250 * number defined in handy.h. */
251#define namedclass_to_classnum(class) ((int) ((class) / 2))
252#define classnum_to_namedclass(classnum) ((classnum) * 2)
253
de92f5e6
KW
254#define _invlist_union_complement_2nd(a, b, output) \
255 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
256#define _invlist_intersection_complement_2nd(a, b, output) \
257 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
258
1de06328
YO
259/* About scan_data_t.
260
261 During optimisation we recurse through the regexp program performing
262 various inplace (keyhole style) optimisations. In addition study_chunk
263 and scan_commit populate this data structure with information about
264 what strings MUST appear in the pattern. We look for the longest
3b753521 265 string that must appear at a fixed location, and we look for the
1de06328
YO
266 longest string that may appear at a floating location. So for instance
267 in the pattern:
268
269 /FOO[xX]A.*B[xX]BAR/
270
271 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
272 strings (because they follow a .* construct). study_chunk will identify
273 both FOO and BAR as being the longest fixed and floating strings respectively.
274
275 The strings can be composites, for instance
276
277 /(f)(o)(o)/
278
279 will result in a composite fixed substring 'foo'.
280
281 For each string some basic information is maintained:
282
283 - offset or min_offset
284 This is the position the string must appear at, or not before.
285 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
286 characters must match before the string we are searching for.
287 Likewise when combined with minlenp and the length of the string it
1de06328
YO
288 tells us how many characters must appear after the string we have
289 found.
290
291 - max_offset
292 Only used for floating strings. This is the rightmost point that
ea3daa5d 293 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328
YO
294 string can occur infinitely far to the right.
295
296 - minlenp
2d608413
KW
297 A pointer to the minimum number of characters of the pattern that the
298 string was found inside. This is important as in the case of positive
1de06328
YO
299 lookahead or positive lookbehind we can have multiple patterns
300 involved. Consider
301
302 /(?=FOO).*F/
303
304 The minimum length of the pattern overall is 3, the minimum length
305 of the lookahead part is 3, but the minimum length of the part that
306 will actually match is 1. So 'FOO's minimum length is 3, but the
307 minimum length for the F is 1. This is important as the minimum length
308 is used to determine offsets in front of and behind the string being
309 looked for. Since strings can be composites this is the length of the
486ec47a 310 pattern at the time it was committed with a scan_commit. Note that
1de06328
YO
311 the length is calculated by study_chunk, so that the minimum lengths
312 are not known until the full pattern has been compiled, thus the
313 pointer to the value.
314
315 - lookbehind
316
317 In the case of lookbehind the string being searched for can be
318 offset past the start point of the final matching string.
319 If this value was just blithely removed from the min_offset it would
320 invalidate some of the calculations for how many chars must match
321 before or after (as they are derived from min_offset and minlen and
322 the length of the string being searched for).
323 When the final pattern is compiled and the data is moved from the
324 scan_data_t structure into the regexp structure the information
325 about lookbehind is factored in, with the information that would
326 have been lost precalculated in the end_shift field for the
327 associated string.
328
329 The fields pos_min and pos_delta are used to store the minimum offset
330 and the delta to the maximum offset at the current point in the pattern.
331
332*/
2c2d71f5
JH
333
334typedef struct scan_data_t {
1de06328
YO
335 /*I32 len_min; unused */
336 /*I32 len_delta; unused */
49f55535 337 SSize_t pos_min;
ea3daa5d 338 SSize_t pos_delta;
2c2d71f5 339 SV *last_found;
ea3daa5d 340 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 341 SSize_t last_start_min;
ea3daa5d 342 SSize_t last_start_max;
1de06328
YO
343 SV **longest; /* Either &l_fixed, or &l_float. */
344 SV *longest_fixed; /* longest fixed string found in pattern */
49f55535 345 SSize_t offset_fixed; /* offset where it starts */
ea3daa5d 346 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
347 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
348 SV *longest_float; /* longest floating string found in pattern */
49f55535 349 SSize_t offset_float_min; /* earliest point in string it can appear */
ea3daa5d
FC
350 SSize_t offset_float_max; /* latest point in string it can appear */
351 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
49f55535 352 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
2c2d71f5
JH
353 I32 flags;
354 I32 whilem_c;
49f55535 355 SSize_t *last_closep;
b8f7bb16 356 regnode_ssc *start_class;
2c2d71f5
JH
357} scan_data_t;
358
c02c3054
KW
359/* The below is perhaps overboard, but this allows us to save a test at the
360 * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
361 * and 'a' differ by a single bit; the same with the upper and lower case of
362 * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
363 * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
364 * then inverts it to form a mask, with just a single 0, in the bit position
365 * where the upper- and lowercase differ. XXX There are about 40 other
366 * instances in the Perl core where this micro-optimization could be used.
367 * Should decide if maintenance cost is worse, before changing those
368 *
369 * Returns a boolean as to whether or not 'v' is either a lowercase or
370 * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
371 * compile-time constant, the generated code is better than some optimizing
372 * compilers figure out, amounting to a mask and test. The results are
373 * meaningless if 'c' is not one of [A-Za-z] */
374#define isARG2_lower_or_UPPER_ARG1(c, v) \
375 (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
376
a687059c 377/*
e50aee73 378 * Forward declarations for pregcomp()'s friends.
a687059c 379 */
a0d0e21e 380
27da23d5 381static const scan_data_t zero_scan_data =
1de06328 382 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
383
384#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
385#define SF_BEFORE_SEOL 0x0001
386#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
387#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
388#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
389
44e3dfd2
BF
390#define SF_FIX_SHIFT_EOL (+2)
391#define SF_FL_SHIFT_EOL (+4)
c277df42
IZ
392
393#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
394#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
395
396#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
397#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
398#define SF_IS_INF 0x0040
399#define SF_HAS_PAR 0x0080
400#define SF_IN_PAR 0x0100
401#define SF_HAS_EVAL 0x0200
402#define SCF_DO_SUBSTR 0x0400
653099ff
GS
403#define SCF_DO_STCLASS_AND 0x0800
404#define SCF_DO_STCLASS_OR 0x1000
405#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 406#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 407
786e8c11 408#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 409#define SCF_SEEN_ACCEPT 0x8000
688e0391 410#define SCF_TRIE_DOING_RESTUDY 0x10000
07be1b83 411
43fead97 412#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
413
414/* The enums for all these are ordered so things work out correctly */
a62b1201 415#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
cfaf538b 416#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
00b27cfc 417#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
cfaf538b
KW
418#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
419#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
2f7f8cb1 420#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
a725e29c 421#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 422
43fead97 423#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 424
93733859 425#define OOB_NAMEDCLASS -1
b8c5462f 426
8e661ac5
KW
427/* There is no code point that is out-of-bounds, so this is problematic. But
428 * its only current use is to initialize a variable that is always set before
429 * looked at. */
430#define OOB_UNICODE 0xDEADBEEF
431
a0ed51b3
LW
432#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
433#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
434
8615cb43 435
b45f050a
JF
436/* length of regex to show in messages that don't mark a position within */
437#define RegexLengthToShowInErrorMessages 127
438
439/*
440 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
441 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
442 * op/pragma/warn/regcomp.
443 */
7253e4e3
RK
444#define MARKER1 "<-- HERE" /* marker as it appears in the description */
445#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 446
c1d900c3 447#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
b45f050a 448
c1d900c3
BF
449#define REPORT_LOCATION_ARGS(offset) \
450 UTF8fARG(UTF, offset, RExC_precomp), \
451 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
452
b45f050a
JF
453/*
454 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
455 * arg. Show regex, up to a maximum length. If it's too long, chop and add
456 * "...".
457 */
58e23c8d 458#define _FAIL(code) STMT_START { \
bfed75c6 459 const char *ellipses = ""; \
ccb2c380
MP
460 IV len = RExC_end - RExC_precomp; \
461 \
462 if (!SIZE_ONLY) \
a5e7bc51 463 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
464 if (len > RegexLengthToShowInErrorMessages) { \
465 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
466 len = RegexLengthToShowInErrorMessages - 10; \
467 ellipses = "..."; \
468 } \
58e23c8d 469 code; \
ccb2c380 470} STMT_END
8615cb43 471
58e23c8d 472#define FAIL(msg) _FAIL( \
c1d900c3
BF
473 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
474 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
475
476#define FAIL2(msg,arg) _FAIL( \
c1d900c3
BF
477 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
478 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 479
b45f050a 480/*
b45f050a
JF
481 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
482 */
ccb2c380 483#define Simple_vFAIL(m) STMT_START { \
a28509cc 484 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380 485 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
c1d900c3 486 m, REPORT_LOCATION_ARGS(offset)); \
ccb2c380 487} STMT_END
b45f050a
JF
488
489/*
490 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
491 */
ccb2c380
MP
492#define vFAIL(m) STMT_START { \
493 if (!SIZE_ONLY) \
a5e7bc51 494 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
495 Simple_vFAIL(m); \
496} STMT_END
b45f050a
JF
497
498/*
499 * Like Simple_vFAIL(), but accepts two arguments.
500 */
ccb2c380 501#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 502 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
503 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
504 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 505} STMT_END
b45f050a
JF
506
507/*
508 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
509 */
ccb2c380
MP
510#define vFAIL2(m,a1) STMT_START { \
511 if (!SIZE_ONLY) \
a5e7bc51 512 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
513 Simple_vFAIL2(m, a1); \
514} STMT_END
b45f050a
JF
515
516
517/*
518 * Like Simple_vFAIL(), but accepts three arguments.
519 */
ccb2c380 520#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 521 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
522 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
523 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 524} STMT_END
b45f050a
JF
525
526/*
527 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
528 */
ccb2c380
MP
529#define vFAIL3(m,a1,a2) STMT_START { \
530 if (!SIZE_ONLY) \
a5e7bc51 531 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
532 Simple_vFAIL3(m, a1, a2); \
533} STMT_END
b45f050a
JF
534
535/*
536 * Like Simple_vFAIL(), but accepts four arguments.
537 */
ccb2c380 538#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 539 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
540 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
541 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 542} STMT_END
b45f050a 543
95db3ffa
KW
544#define vFAIL4(m,a1,a2,a3) STMT_START { \
545 if (!SIZE_ONLY) \
546 SAVEFREESV(RExC_rx_sv); \
547 Simple_vFAIL4(m, a1, a2, a3); \
548} STMT_END
549
946095af
BF
550/* A specialized version of vFAIL2 that works with UTF8f */
551#define vFAIL2utf8f(m, a1) STMT_START { \
ef3f731d
BF
552 const IV offset = RExC_parse - RExC_precomp; \
553 if (!SIZE_ONLY) \
554 SAVEFREESV(RExC_rx_sv); \
555 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
556 REPORT_LOCATION_ARGS(offset)); \
946095af
BF
557} STMT_END
558
559
5e0a247b
KW
560/* m is not necessarily a "literal string", in this macro */
561#define reg_warn_non_literal_string(loc, m) STMT_START { \
562 const IV offset = loc - RExC_precomp; \
563 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
c1d900c3 564 m, REPORT_LOCATION_ARGS(offset)); \
5e0a247b
KW
565} STMT_END
566
668c081a 567#define ckWARNreg(loc,m) STMT_START { \
a28509cc 568 const IV offset = loc - RExC_precomp; \
f10f4c18 569 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 570 REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
571} STMT_END
572
0d6106aa
KW
573#define vWARN_dep(loc, m) STMT_START { \
574 const IV offset = loc - RExC_precomp; \
575 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
c1d900c3 576 REPORT_LOCATION_ARGS(offset)); \
0d6106aa
KW
577} STMT_END
578
147508a2
KW
579#define ckWARNdep(loc,m) STMT_START { \
580 const IV offset = loc - RExC_precomp; \
581 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
582 m REPORT_LOCATION, \
c1d900c3 583 REPORT_LOCATION_ARGS(offset)); \
147508a2
KW
584} STMT_END
585
668c081a 586#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 587 const IV offset = loc - RExC_precomp; \
d1d15184 588 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18 589 m REPORT_LOCATION, \
c1d900c3 590 REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
591} STMT_END
592
b23eb183 593#define ckWARN2reg_d(loc,m, a1) STMT_START { \
2335b3d3 594 const IV offset = loc - RExC_precomp; \
b23eb183 595 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
2335b3d3 596 m REPORT_LOCATION, \
c1d900c3 597 a1, REPORT_LOCATION_ARGS(offset)); \
2335b3d3
KW
598} STMT_END
599
668c081a 600#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 601 const IV offset = loc - RExC_precomp; \
668c081a 602 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 603 a1, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
604} STMT_END
605
606#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 607 const IV offset = loc - RExC_precomp; \
ccb2c380 608 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 609 a1, a2, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
610} STMT_END
611
668c081a
NC
612#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
613 const IV offset = loc - RExC_precomp; \
614 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 615 a1, a2, REPORT_LOCATION_ARGS(offset)); \
668c081a
NC
616} STMT_END
617
ccb2c380 618#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 619 const IV offset = loc - RExC_precomp; \
ccb2c380 620 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 621 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
622} STMT_END
623
668c081a
NC
624#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
625 const IV offset = loc - RExC_precomp; \
626 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 627 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
668c081a
NC
628} STMT_END
629
ccb2c380 630#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 631 const IV offset = loc - RExC_precomp; \
ccb2c380 632 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 633 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
ccb2c380 634} STMT_END
9d1d55b5 635
8615cb43 636
cd439c50 637/* Allow for side effects in s */
ccb2c380
MP
638#define REGC(c,s) STMT_START { \
639 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
640} STMT_END
cd439c50 641
fac92740
MJD
642/* Macros for recording node offsets. 20001227 mjd@plover.com
643 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
644 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
645 * Element 0 holds the number n.
07be1b83 646 * Position is 1 indexed.
fac92740 647 */
7122b237
YO
648#ifndef RE_TRACK_PATTERN_OFFSETS
649#define Set_Node_Offset_To_R(node,byte)
650#define Set_Node_Offset(node,byte)
651#define Set_Cur_Node_Offset
652#define Set_Node_Length_To_R(node,len)
653#define Set_Node_Length(node,len)
6a86c6ad 654#define Set_Node_Cur_Length(node,start)
7122b237
YO
655#define Node_Offset(n)
656#define Node_Length(n)
657#define Set_Node_Offset_Length(node,offset,len)
658#define ProgLen(ri) ri->u.proglen
659#define SetProgLen(ri,x) ri->u.proglen = x
660#else
661#define ProgLen(ri) ri->u.offsets[0]
662#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
663#define Set_Node_Offset_To_R(node,byte) STMT_START { \
664 if (! SIZE_ONLY) { \
665 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 666 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 667 if((node) < 0) { \
551405c4 668 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
669 } else { \
670 RExC_offsets[2*(node)-1] = (byte); \
671 } \
672 } \
673} STMT_END
674
675#define Set_Node_Offset(node,byte) \
676 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
677#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
678
679#define Set_Node_Length_To_R(node,len) STMT_START { \
680 if (! SIZE_ONLY) { \
681 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 682 __LINE__, (int)(node), (int)(len))); \
ccb2c380 683 if((node) < 0) { \
551405c4 684 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
685 } else { \
686 RExC_offsets[2*(node)] = (len); \
687 } \
688 } \
689} STMT_END
690
691#define Set_Node_Length(node,len) \
692 Set_Node_Length_To_R((node)-RExC_emit_start, len)
6a86c6ad
NC
693#define Set_Node_Cur_Length(node, start) \
694 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
695
696/* Get offsets and lengths */
697#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
698#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
699
07be1b83
YO
700#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
701 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
702 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
703} STMT_END
7122b237 704#endif
07be1b83
YO
705
706#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
707#define EXPERIMENTAL_INPLACESCAN
f427392e 708#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 709
304ee84b
YO
710#define DEBUG_STUDYDATA(str,data,depth) \
711DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 712 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
713 "%*s" str "Pos:%"IVdf"/%"IVdf \
714 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
715 (int)(depth)*2, "", \
716 (IV)((data)->pos_min), \
717 (IV)((data)->pos_delta), \
304ee84b 718 (UV)((data)->flags), \
1de06328 719 (IV)((data)->whilem_c), \
304ee84b
YO
720 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
721 is_inf ? "INF " : "" \
1de06328
YO
722 ); \
723 if ((data)->last_found) \
724 PerlIO_printf(Perl_debug_log, \
725 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
726 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
727 SvPVX_const((data)->last_found), \
728 (IV)((data)->last_end), \
729 (IV)((data)->last_start_min), \
730 (IV)((data)->last_start_max), \
731 ((data)->longest && \
732 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
733 SvPVX_const((data)->longest_fixed), \
734 (IV)((data)->offset_fixed), \
735 ((data)->longest && \
736 (data)->longest==&((data)->longest_float)) ? "*" : "", \
737 SvPVX_const((data)->longest_float), \
738 (IV)((data)->offset_float_min), \
739 (IV)((data)->offset_float_max) \
740 ); \
741 PerlIO_printf(Perl_debug_log,"\n"); \
742});
743
653099ff 744/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 745 Update the longest found anchored substring and the longest found
653099ff
GS
746 floating substrings if needed. */
747
4327152a 748STATIC void
ea3daa5d
FC
749S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
750 SSize_t *minlenp, int is_inf)
c277df42 751{
e1ec3a88
AL
752 const STRLEN l = CHR_SVLEN(data->last_found);
753 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 754 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 755
7918f24d
NC
756 PERL_ARGS_ASSERT_SCAN_COMMIT;
757
c277df42 758 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 759 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
760 if (*data->longest == data->longest_fixed) {
761 data->offset_fixed = l ? data->last_start_min : data->pos_min;
762 if (data->flags & SF_BEFORE_EOL)
b81d288d 763 data->flags
c277df42
IZ
764 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
765 else
766 data->flags &= ~SF_FIX_BEFORE_EOL;
686b73d4 767 data->minlen_fixed=minlenp;
1de06328 768 data->lookbehind_fixed=0;
a0ed51b3 769 }
304ee84b 770 else { /* *data->longest == data->longest_float */
c277df42 771 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
772 data->offset_float_max = (l
773 ? data->last_start_max
ea3daa5d
FC
774 : (data->pos_delta == SSize_t_MAX
775 ? SSize_t_MAX
776 : data->pos_min + data->pos_delta));
777 if (is_inf
778 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
779 data->offset_float_max = SSize_t_MAX;
c277df42 780 if (data->flags & SF_BEFORE_EOL)
b81d288d 781 data->flags
c277df42
IZ
782 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
783 else
784 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
785 data->minlen_float=minlenp;
786 data->lookbehind_float=0;
c277df42
IZ
787 }
788 }
789 SvCUR_set(data->last_found, 0);
0eda9292 790 {
a28509cc 791 SV * const sv = data->last_found;
097eb12c
AL
792 if (SvUTF8(sv) && SvMAGICAL(sv)) {
793 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
794 if (mg)
795 mg->mg_len = 0;
796 }
0eda9292 797 }
c277df42
IZ
798 data->last_end = -1;
799 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 800 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
801}
802
899d20b9
KW
803/* These macros set, clear and test whether the synthetic start class ('ssc',
804 * given by the parameter) matches an empty string (EOS). This uses the
805 * 'next_off' field in the node, to save a bit in the flags field. The ssc
806 * stands alone, so there is never a next_off, so this field is otherwise
807 * unused. The EOS information is used only for compilation, but theoretically
808 * it could be passed on to the execution code. This could be used to store
809 * more than one bit of information, but only this one is currently used. */
810#define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END
811#define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
812#define TEST_SSC_EOS(node) cBOOL((node)->next_off)
813
653099ff
GS
814/* Can match anything (initialization) */
815STATIC void
557bd3fb 816S_ssc_anything(const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 817{
557bd3fb 818 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 819
557bd3fb 820 ANYOF_BITMAP_SETALL(ssc);
9b1eb067 821 ssc->flags = ANYOF_ABOVE_LATIN1_ALL;
557bd3fb 822 SET_SSC_EOS(ssc);
3fffb88a
KW
823
824 /* If any portion of the regex is to operate under locale rules,
825 * initialization includes it. The reason this isn't done for all regexes
826 * is that the optimizer was written under the assumption that locale was
827 * all-or-nothing. Given the complexity and lack of documentation in the
828 * optimizer, and that there are inadequate test cases for locale, so many
829 * parts of it may not work properly, it is safest to avoid locale unless
830 * necessary. */
831 if (RExC_contains_locale) {
8efd3f97
KW
832 ANYOF_POSIXL_SETALL(ssc);
833 ssc->flags |= ANYOF_LOCALE|ANYOF_POSIXL|ANYOF_LOC_FOLD;
3fffb88a 834 }
9d7a1e63 835 else {
8efd3f97 836 ANYOF_POSIXL_ZERO(ssc);
9d7a1e63 837 }
653099ff
GS
838}
839
840/* Can match anything (initialization) */
841STATIC int
557bd3fb 842S_ssc_is_anything(const regnode_ssc *ssc)
653099ff
GS
843{
844 int value;
845
557bd3fb 846 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 847
8efd3f97
KW
848 for (value = 0; value < ANYOF_POSIXL_MAX; value += 2)
849 if (ANYOF_POSIXL_TEST(ssc, value) && ANYOF_POSIXL_TEST(ssc, value + 1))
653099ff 850 return 1;
9b1eb067 851 if (!(ssc->flags & ANYOF_ABOVE_LATIN1_ALL))
1aa99e6b 852 return 0;
557bd3fb 853 if (!ANYOF_BITMAP_TESTALLSET((const void*)ssc))
f8bef550 854 return 0;
653099ff
GS
855 return 1;
856}
857
858/* Can match anything (initialization) */
859STATIC void
557bd3fb 860S_ssc_init(const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 861{
557bd3fb 862 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 863
557bd3fb
KW
864 Zero(ssc, 1, regnode_ssc);
865 ssc->type = ANYOF;
866 ssc_anything(pRExC_state, ssc);
867 ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
868 OP(ssc) = ANYOF_SYNTHETIC;
653099ff
GS
869}
870
1051e1c4 871/* These two functions currently do the exact same thing */
557bd3fb 872#define ssc_init_zero ssc_init
653099ff 873
557bd3fb 874/* 'AND' a given class with another one. Can create false positives. 'ssc'
8efd3f97 875 * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if
b8f7bb16 876 * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
653099ff 877STATIC void
1ca93ef1 878S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, const regnode_ssc *and_with)
653099ff 879{
557bd3fb 880 PERL_ARGS_ASSERT_SSC_AND;
40d049e4 881
954a2af6 882 assert(PL_regkind[and_with->type] == ANYOF);
1e6ade67 883
c6b76537 884 /* I (khw) am not sure all these restrictions are necessary XXX */
8efd3f97
KW
885 if (!(ANYOF_POSIXL_TEST_ANY_SET(and_with))
886 && !(ANYOF_POSIXL_TEST_ANY_SET(ssc))
557bd3fb 887 && (and_with->flags & ANYOF_LOCALE) == (ssc->flags & ANYOF_LOCALE)
538b546e 888 && !(and_with->flags & ANYOF_LOC_FOLD)
557bd3fb 889 && !(ssc->flags & ANYOF_LOC_FOLD)) {
653099ff
GS
890 int i;
891
892 if (and_with->flags & ANYOF_INVERT)
893 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557bd3fb 894 ssc->bitmap[i] &= ~and_with->bitmap[i];
653099ff
GS
895 else
896 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557bd3fb 897 ssc->bitmap[i] &= and_with->bitmap[i];
653099ff 898 } /* XXXX: logic is complicated otherwise, leave it along for a moment. */
1aa99e6b 899
c6b76537 900 if (and_with->flags & ANYOF_INVERT) {
8951c461 901
c6b76537
KW
902 /* Here, the and'ed node is inverted. Get the AND of the flags that
903 * aren't affected by the inversion. Those that are affected are
904 * handled individually below */
557bd3fb
KW
905 U8 affected_flags = ssc->flags & ~INVERSION_UNAFFECTED_FLAGS;
906 ssc->flags &= (and_with->flags & INVERSION_UNAFFECTED_FLAGS);
907 ssc->flags |= affected_flags;
c6b76537
KW
908
909 /* We currently don't know how to deal with things that aren't in the
910 * bitmap, but we know that the intersection is no greater than what
557bd3fb 911 * is already in ssc, so let there be false positives that get sorted
c6b76537
KW
912 * out after the synthetic start class succeeds, and the node is
913 * matched for real. */
914
915 /* The inversion of these two flags indicate that the resulting
916 * intersection doesn't have them */
9b1eb067
KW
917 if (and_with->flags & ANYOF_ABOVE_LATIN1_ALL) {
918 ssc->flags &= ~ANYOF_ABOVE_LATIN1_ALL;
4713bfe1 919 }
c6b76537 920 if (and_with->flags & ANYOF_NON_UTF8_LATIN1_ALL) {
557bd3fb 921 ssc->flags &= ~ANYOF_NON_UTF8_LATIN1_ALL;
137165a6 922 }
1aa99e6b 923 }
c6b76537 924 else { /* and'd node is not inverted */
3ad98780
KW
925 U8 outside_bitmap_but_not_utf8; /* Temp variable */
926
137165a6 927 if (! ANYOF_NONBITMAP(and_with)) {
c6b76537
KW
928
929 /* Here 'and_with' doesn't match anything outside the bitmap
9b1eb067
KW
930 * (except possibly ANYOF_ABOVE_LATIN1_ALL), which means the
931 * intersection can't either, except for ANYOF_ABOVE_LATIN1_ALL, in
c6b76537 932 * which case we don't know what the intersection is, but it's no
557bd3fb 933 * greater than what ssc already has, so can just leave it alone,
c6b76537 934 * with possible false positives */
9b1eb067 935 if (! (and_with->flags & ANYOF_ABOVE_LATIN1_ALL)) {
557bd3fb
KW
936 ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
937 ssc->flags &= ~ANYOF_NONBITMAP_NON_UTF8;
c6b76537 938 }
137165a6 939 }
557bd3fb 940 else if (! ANYOF_NONBITMAP(ssc)) {
c6b76537 941
557bd3fb 942 /* Here, 'and_with' does match something outside the bitmap, and ssc
c6b76537 943 * doesn't have a list of things to match outside the bitmap. If
557bd3fb
KW
944 * ssc can match all code points above 255, the intersection will
945 * be those above-255 code points that 'and_with' matches. If ssc
3ad98780
KW
946 * can't match all Unicode code points, it means that it can't
947 * match anything outside the bitmap (since the 'if' that got us
948 * into this block tested for that), so we leave the bitmap empty.
949 */
9b1eb067 950 if (ssc->flags & ANYOF_ABOVE_LATIN1_ALL) {
557bd3fb 951 ARG_SET(ssc, ARG(and_with));
3ad98780
KW
952
953 /* and_with's ARG may match things that don't require UTF8.
557bd3fb 954 * And now ssc's will too, in spite of this being an 'and'. See
3ad98780 955 * the comments below about the kludge */
557bd3fb 956 ssc->flags |= and_with->flags & ANYOF_NONBITMAP_NON_UTF8;
c6b76537
KW
957 }
958 }
959 else {
557bd3fb 960 /* Here, both 'and_with' and ssc match something outside the
c6b76537 961 * bitmap. Currently we do not do the intersection, so just match
557bd3fb 962 * whatever ssc had at the beginning. */
c6b76537
KW
963 }
964
965
3ad98780
KW
966 /* Take the intersection of the two sets of flags. However, the
967 * ANYOF_NONBITMAP_NON_UTF8 flag is treated as an 'or'. This is a
968 * kludge around the fact that this flag is not treated like the others
557bd3fb 969 * which are initialized in ssc_anything(). The way the optimizer works
3ad98780
KW
970 * is that the synthetic start class (SSC) is initialized to match
971 * anything, and then the first time a real node is encountered, its
972 * values are AND'd with the SSC's with the result being the values of
973 * the real node. However, there are paths through the optimizer where
974 * the AND never gets called, so those initialized bits are set
975 * inappropriately, which is not usually a big deal, as they just cause
976 * false positives in the SSC, which will just mean a probably
977 * imperceptible slow down in execution. However this bit has a
978 * higher false positive consequence in that it can cause utf8.pm,
979 * utf8_heavy.pl ... to be loaded when not necessary, which is a much
980 * bigger slowdown and also causes significant extra memory to be used.
981 * In order to prevent this, the code now takes a different tack. The
982 * bit isn't set unless some part of the regular expression needs it,
983 * but once set it won't get cleared. This means that these extra
984 * modules won't get loaded unless there was some path through the
985 * pattern that would have required them anyway, and so any false
986 * positives that occur by not ANDing them out when they could be
987 * aren't as severe as they would be if we treated this bit like all
988 * the others */
557bd3fb 989 outside_bitmap_but_not_utf8 = (ssc->flags | and_with->flags)
3ad98780 990 & ANYOF_NONBITMAP_NON_UTF8;
557bd3fb
KW
991 ssc->flags &= and_with->flags;
992 ssc->flags |= outside_bitmap_but_not_utf8;
137165a6 993 }
653099ff
GS
994}
995
557bd3fb 996/* 'OR' a given class with another one. Can create false positives. 'ssc'
8efd3f97 997 * should not be inverted. 'or_with->flags & ANYOF_POSIXL' should be 0 if
b8f7bb16 998 * 'or_with' is a regnode_charclass instead of a regnode_ssc. */
653099ff 999STATIC void
557bd3fb 1000S_ssc_or(const RExC_state_t *pRExC_state, regnode_ssc *ssc, const regnode_ssc *or_with)
653099ff 1001{
557bd3fb 1002 PERL_ARGS_ASSERT_SSC_OR;
7918f24d 1003
653099ff 1004 if (or_with->flags & ANYOF_INVERT) {
c6b76537
KW
1005
1006 /* Here, the or'd node is to be inverted. This means we take the
1007 * complement of everything not in the bitmap, but currently we don't
1008 * know what that is, so give up and match anything */
1009 if (ANYOF_NONBITMAP(or_with)) {
557bd3fb 1010 ssc_anything(pRExC_state, ssc);
c6b76537 1011 }
653099ff
GS
1012 /* We do not use
1013 * (B1 | CL1) | (!B2 & !CL2) = (B1 | !B2 & !CL2) | (CL1 | (!B2 & !CL2))
1014 * <= (B1 | !B2) | (CL1 | !CL2)
1015 * which is wasteful if CL2 is small, but we ignore CL2:
1016 * (B1 | CL1) | (!B2 & !CL2) <= (B1 | CL1) | !B2 = (B1 | !B2) | CL1
1017 * XXXX Can we handle case-fold? Unclear:
1018 * (OK1(i) | OK1(i')) | !(OK1(i) | OK1(i')) =
1019 * (OK1(i) | OK1(i')) | (!OK1(i) & !OK1(i'))
1020 */
557bd3fb 1021 else if ( (or_with->flags & ANYOF_LOCALE) == (ssc->flags & ANYOF_LOCALE)
538b546e 1022 && !(or_with->flags & ANYOF_LOC_FOLD)
557bd3fb 1023 && !(ssc->flags & ANYOF_LOC_FOLD) ) {
653099ff
GS
1024 int i;
1025
1026 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557bd3fb 1027 ssc->bitmap[i] |= ~or_with->bitmap[i];
653099ff
GS
1028 } /* XXXX: logic is complicated otherwise */
1029 else {
557bd3fb 1030 ssc_anything(pRExC_state, ssc);
653099ff 1031 }
c6b76537
KW
1032
1033 /* And, we can just take the union of the flags that aren't affected
1034 * by the inversion */
557bd3fb 1035 ssc->flags |= or_with->flags & INVERSION_UNAFFECTED_FLAGS;
c6b76537
KW
1036
1037 /* For the remaining flags:
9b1eb067 1038 ANYOF_ABOVE_LATIN1_ALL and inverted means to not match anything above
557bd3fb
KW
1039 255, which means that the union with ssc should just be
1040 what ssc has in it, so can ignore this flag
c6b76537 1041 ANYOF_NON_UTF8_LATIN1_ALL and inverted means if not utf8 and ord
557bd3fb
KW
1042 is 127-255 to match them, but then invert that, so the
1043 union with ssc should just be what ssc has in it, so can
c6b76537
KW
1044 ignore this flag
1045 */
1046 } else { /* 'or_with' is not inverted */
653099ff 1047 /* (B1 | CL1) | (B2 | CL2) = (B1 | B2) | (CL1 | CL2)) */
557bd3fb 1048 if ( (or_with->flags & ANYOF_LOCALE) == (ssc->flags & ANYOF_LOCALE)
538b546e 1049 && (!(or_with->flags & ANYOF_LOC_FOLD)
557bd3fb 1050 || (ssc->flags & ANYOF_LOC_FOLD)) ) {
653099ff
GS
1051 int i;
1052
1053 /* OR char bitmap and class bitmap separately */
1054 for (i = 0; i < ANYOF_BITMAP_SIZE; i++)
557bd3fb 1055 ssc->bitmap[i] |= or_with->bitmap[i];
8efd3f97
KW
1056 if (or_with->flags & ANYOF_POSIXL) {
1057 ANYOF_POSIXL_OR(or_with, ssc);
c96939e4 1058 }
653099ff
GS
1059 }
1060 else { /* XXXX: logic is complicated, leave it along for a moment. */
557bd3fb 1061 ssc_anything(pRExC_state, ssc);
653099ff 1062 }
9826f543 1063
c6b76537
KW
1064 if (ANYOF_NONBITMAP(or_with)) {
1065
1066 /* Use the added node's outside-the-bit-map match if there isn't a
1067 * conflict. If there is a conflict (both nodes match something
1068 * outside the bitmap, but what they match outside is not the same
1069 * pointer, and hence not easily compared until XXX we extend
1070 * inversion lists this far), give up and allow the start class to
d94b1d13
KW
1071 * match everything outside the bitmap. If that stuff is all above
1072 * 255, can just set UNICODE_ALL, otherwise caould be anything. */
557bd3fb
KW
1073 if (! ANYOF_NONBITMAP(ssc)) {
1074 ARG_SET(ssc, ARG(or_with));
c6b76537 1075 }
557bd3fb 1076 else if (ARG(ssc) != ARG(or_with)) {
d94b1d13
KW
1077
1078 if ((or_with->flags & ANYOF_NONBITMAP_NON_UTF8)) {
557bd3fb 1079 ssc_anything(pRExC_state, ssc);
d94b1d13
KW
1080 }
1081 else {
9b1eb067 1082 ssc->flags |= ANYOF_ABOVE_LATIN1_ALL;
d94b1d13 1083 }
c6b76537 1084 }
4c34a693 1085 }
0b9668ee
KW
1086
1087 /* Take the union */
557bd3fb 1088 ssc->flags |= or_with->flags;
1aa99e6b 1089 }
653099ff
GS
1090}
1091
a3621e74
YO
1092#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1093#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1094#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1095#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1096
3dab1dad
YO
1097
1098#ifdef DEBUGGING
07be1b83 1099/*
2b8b4781
NC
1100 dump_trie(trie,widecharmap,revcharmap)
1101 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1102 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
1103
1104 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
1105 The _interim_ variants are used for debugging the interim
1106 tables that are used to generate the final compressed
1107 representation which is what dump_trie expects.
1108
486ec47a 1109 Part of the reason for their existence is to provide a form
3dab1dad 1110 of documentation as to how the different representations function.
07be1b83
YO
1111
1112*/
3dab1dad
YO
1113
1114/*
3dab1dad
YO
1115 Dumps the final compressed table form of the trie to Perl_debug_log.
1116 Used for debugging make_trie().
1117*/
b9a59e08 1118
3dab1dad 1119STATIC void
2b8b4781
NC
1120S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1121 AV *revcharmap, U32 depth)
3dab1dad
YO
1122{
1123 U32 state;
ab3bbdeb 1124 SV *sv=sv_newmortal();
55eed653 1125 int colwidth= widecharmap ? 6 : 4;
2e64971a 1126 U16 word;
3dab1dad
YO
1127 GET_RE_DEBUG_FLAGS_DECL;
1128
7918f24d 1129 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 1130
3dab1dad
YO
1131 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1132 (int)depth * 2 + 2,"",
1133 "Match","Base","Ofs" );
1134
1135 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 1136 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 1137 if ( tmp ) {
ab3bbdeb
YO
1138 PerlIO_printf( Perl_debug_log, "%*s",
1139 colwidth,
ddc5bc0f 1140 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1141 PL_colors[0], PL_colors[1],
1142 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1143 PERL_PV_ESCAPE_FIRSTCHAR
1144 )
1145 );
3dab1dad
YO
1146 }
1147 }
1148 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1149 (int)depth * 2 + 2,"");
1150
1151 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 1152 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
1153 PerlIO_printf( Perl_debug_log, "\n");
1154
1e2e3d02 1155 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 1156 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
1157
1158 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1159
1160 if ( trie->states[ state ].wordnum ) {
1161 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1162 } else {
1163 PerlIO_printf( Perl_debug_log, "%6s", "" );
1164 }
1165
1166 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1167
1168 if ( base ) {
1169 U32 ofs = 0;
1170
1171 while( ( base + ofs < trie->uniquecharcount ) ||
1172 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1173 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1174 ofs++;
1175
1176 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1177
1178 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1179 if ( ( base + ofs >= trie->uniquecharcount ) &&
1180 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1181 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1182 {
ab3bbdeb
YO
1183 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1184 colwidth,
3dab1dad
YO
1185 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1186 } else {
ab3bbdeb 1187 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
1188 }
1189 }
1190
1191 PerlIO_printf( Perl_debug_log, "]");
1192
1193 }
1194 PerlIO_printf( Perl_debug_log, "\n" );
1195 }
2e64971a
DM
1196 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1197 for (word=1; word <= trie->wordcount; word++) {
1198 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1199 (int)word, (int)(trie->wordinfo[word].prev),
1200 (int)(trie->wordinfo[word].len));
1201 }
1202 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
1203}
1204/*
3dab1dad
YO
1205 Dumps a fully constructed but uncompressed trie in list form.
1206 List tries normally only are used for construction when the number of
1207 possible chars (trie->uniquecharcount) is very high.
1208 Used for debugging make_trie().
1209*/
1210STATIC void
55eed653 1211S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1212 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1213 U32 depth)
3dab1dad
YO
1214{
1215 U32 state;
ab3bbdeb 1216 SV *sv=sv_newmortal();
55eed653 1217 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1218 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1219
1220 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1221
3dab1dad 1222 /* print out the table precompression. */
ab3bbdeb
YO
1223 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1224 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1225 "------:-----+-----------------\n" );
3dab1dad
YO
1226
1227 for( state=1 ; state < next_alloc ; state ++ ) {
1228 U16 charid;
1229
ab3bbdeb 1230 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1231 (int)depth * 2 + 2,"", (UV)state );
1232 if ( ! trie->states[ state ].wordnum ) {
1233 PerlIO_printf( Perl_debug_log, "%5s| ","");
1234 } else {
1235 PerlIO_printf( Perl_debug_log, "W%4x| ",
1236 trie->states[ state ].wordnum
1237 );
1238 }
1239 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1240 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1241 if ( tmp ) {
1242 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1243 colwidth,
ddc5bc0f 1244 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1245 PL_colors[0], PL_colors[1],
1246 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1247 PERL_PV_ESCAPE_FIRSTCHAR
1248 ) ,
1e2e3d02
YO
1249 TRIE_LIST_ITEM(state,charid).forid,
1250 (UV)TRIE_LIST_ITEM(state,charid).newstate
1251 );
1252 if (!(charid % 10))
664e119d
RGS
1253 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1254 (int)((depth * 2) + 14), "");
1e2e3d02 1255 }
ab3bbdeb
YO
1256 }
1257 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1258 }
1259}
1260
1261/*
3dab1dad
YO
1262 Dumps a fully constructed but uncompressed trie in table form.
1263 This is the normal DFA style state transition table, with a few
1264 twists to facilitate compression later.
1265 Used for debugging make_trie().
1266*/
1267STATIC void
55eed653 1268S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1269 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1270 U32 depth)
3dab1dad
YO
1271{
1272 U32 state;
1273 U16 charid;
ab3bbdeb 1274 SV *sv=sv_newmortal();
55eed653 1275 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1276 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1277
1278 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1279
1280 /*
1281 print out the table precompression so that we can do a visual check
1282 that they are identical.
1283 */
1284
1285 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1286
1287 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1288 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1289 if ( tmp ) {
ab3bbdeb
YO
1290 PerlIO_printf( Perl_debug_log, "%*s",
1291 colwidth,
ddc5bc0f 1292 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1293 PL_colors[0], PL_colors[1],
1294 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1295 PERL_PV_ESCAPE_FIRSTCHAR
1296 )
1297 );
3dab1dad
YO
1298 }
1299 }
1300
1301 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1302
1303 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1304 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1305 }
1306
1307 PerlIO_printf( Perl_debug_log, "\n" );
1308
1309 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1310
1311 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1312 (int)depth * 2 + 2,"",
1313 (UV)TRIE_NODENUM( state ) );
1314
1315 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1316 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1317 if (v)
1318 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1319 else
1320 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1321 }
1322 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1323 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1324 } else {
1325 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1326 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1327 }
1328 }
07be1b83 1329}
3dab1dad
YO
1330
1331#endif
1332
2e64971a 1333
786e8c11
YO
1334/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1335 startbranch: the first branch in the whole branch sequence
1336 first : start branch of sequence of branch-exact nodes.
1337 May be the same as startbranch
1338 last : Thing following the last branch.
1339 May be the same as tail.
1340 tail : item following the branch sequence
1341 count : words in the sequence
1342 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1343 depth : indent depth
3dab1dad 1344
786e8c11 1345Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1346
786e8c11
YO
1347A trie is an N'ary tree where the branches are determined by digital
1348decomposition of the key. IE, at the root node you look up the 1st character and
1349follow that branch repeat until you find the end of the branches. Nodes can be
1350marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1351
786e8c11 1352 /he|she|his|hers/
72f13be8 1353
786e8c11
YO
1354would convert into the following structure. Numbers represent states, letters
1355following numbers represent valid transitions on the letter from that state, if
1356the number is in square brackets it represents an accepting state, otherwise it
1357will be in parenthesis.
07be1b83 1358
786e8c11
YO
1359 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1360 | |
1361 | (2)
1362 | |
1363 (1) +-i->(6)-+-s->[7]
1364 |
1365 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1366
786e8c11
YO
1367 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1368
1369This shows that when matching against the string 'hers' we will begin at state 1
1370read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1371then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1372is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1373single traverse. We store a mapping from accepting to state to which word was
1374matched, and then when we have multiple possibilities we try to complete the
1375rest of the regex in the order in which they occured in the alternation.
1376
1377The only prior NFA like behaviour that would be changed by the TRIE support is
1378the silent ignoring of duplicate alternations which are of the form:
1379
1380 / (DUPE|DUPE) X? (?{ ... }) Y /x
1381
4b714af6 1382Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 1383and without the optimisation. With the optimisations dupes will be silently
486ec47a 1384ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
1385the following demonstrates:
1386
1387 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1388
1389which prints out 'word' three times, but
1390
1391 'words'=~/(word|word|word)(?{ print $1 })S/
1392
1393which doesnt print it out at all. This is due to other optimisations kicking in.
1394
1395Example of what happens on a structural level:
1396
486ec47a 1397The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
1398
1399 1: CURLYM[1] {1,32767}(18)
1400 5: BRANCH(8)
1401 6: EXACT <ac>(16)
1402 8: BRANCH(11)
1403 9: EXACT <ad>(16)
1404 11: BRANCH(14)
1405 12: EXACT <ab>(16)
1406 16: SUCCEED(0)
1407 17: NOTHING(18)
1408 18: END(0)
1409
1410This would be optimizable with startbranch=5, first=5, last=16, tail=16
1411and should turn into:
1412
1413 1: CURLYM[1] {1,32767}(18)
1414 5: TRIE(16)
1415 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1416 <ac>
1417 <ad>
1418 <ab>
1419 16: SUCCEED(0)
1420 17: NOTHING(18)
1421 18: END(0)
1422
1423Cases where tail != last would be like /(?foo|bar)baz/:
1424
1425 1: BRANCH(4)
1426 2: EXACT <foo>(8)
1427 4: BRANCH(7)
1428 5: EXACT <bar>(8)
1429 7: TAIL(8)
1430 8: EXACT <baz>(10)
1431 10: END(0)
1432
1433which would be optimizable with startbranch=1, first=1, last=7, tail=8
1434and would end up looking like:
1435
1436 1: TRIE(8)
1437 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1438 <foo>
1439 <bar>
1440 7: TAIL(8)
1441 8: EXACT <baz>(10)
1442 10: END(0)
1443
c80e42f3 1444 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
1445
1446is the recommended Unicode-aware way of saying
1447
1448 *(d++) = uv;
1449*/
1450
fab2782b 1451#define TRIE_STORE_REVCHAR(val) \
786e8c11 1452 STMT_START { \
73031816 1453 if (UTF) { \
fab2782b 1454 SV *zlopp = newSV(7); /* XXX: optimize me */ \
88c9ea1e 1455 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
c80e42f3 1456 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
73031816
NC
1457 SvCUR_set(zlopp, kapow - flrbbbbb); \
1458 SvPOK_on(zlopp); \
1459 SvUTF8_on(zlopp); \
1460 av_push(revcharmap, zlopp); \
1461 } else { \
fab2782b 1462 char ooooff = (char)val; \
73031816
NC
1463 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1464 } \
1465 } STMT_END
786e8c11 1466
914a25d5
KW
1467/* This gets the next character from the input, folding it if not already
1468 * folded. */
1469#define TRIE_READ_CHAR STMT_START { \
1470 wordlen++; \
1471 if ( UTF ) { \
1472 /* if it is UTF then it is either already folded, or does not need \
1473 * folding */ \
1c1d615a 1474 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
1475 } \
1476 else if (folder == PL_fold_latin1) { \
7d006b13
KW
1477 /* This folder implies Unicode rules, which in the range expressible \
1478 * by not UTF is the lower case, with the two exceptions, one of \
1479 * which should have been taken care of before calling this */ \
1480 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1481 uvc = toLOWER_L1(*uc); \
1482 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1483 len = 1; \
914a25d5
KW
1484 } else { \
1485 /* raw data, will be folded later if needed */ \
1486 uvc = (U32)*uc; \
1487 len = 1; \
1488 } \
786e8c11
YO
1489} STMT_END
1490
1491
1492
1493#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1494 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1495 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1496 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1497 } \
1498 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1499 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1500 TRIE_LIST_CUR( state )++; \
1501} STMT_END
07be1b83 1502
786e8c11
YO
1503#define TRIE_LIST_NEW(state) STMT_START { \
1504 Newxz( trie->states[ state ].trans.list, \
1505 4, reg_trie_trans_le ); \
1506 TRIE_LIST_CUR( state ) = 1; \
1507 TRIE_LIST_LEN( state ) = 4; \
1508} STMT_END
07be1b83 1509
786e8c11
YO
1510#define TRIE_HANDLE_WORD(state) STMT_START { \
1511 U16 dupe= trie->states[ state ].wordnum; \
1512 regnode * const noper_next = regnext( noper ); \
1513 \
786e8c11
YO
1514 DEBUG_r({ \
1515 /* store the word for dumping */ \
1516 SV* tmp; \
1517 if (OP(noper) != NOTHING) \
740cce10 1518 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1519 else \
740cce10 1520 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1521 av_push( trie_words, tmp ); \
786e8c11
YO
1522 }); \
1523 \
1524 curword++; \
2e64971a
DM
1525 trie->wordinfo[curword].prev = 0; \
1526 trie->wordinfo[curword].len = wordlen; \
1527 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1528 \
1529 if ( noper_next < tail ) { \
1530 if (!trie->jump) \
c944940b 1531 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1532 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1533 if (!jumper) \
1534 jumper = noper_next; \
1535 if (!nextbranch) \
1536 nextbranch= regnext(cur); \
1537 } \
1538 \
1539 if ( dupe ) { \
2e64971a
DM
1540 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1541 /* chain, so that when the bits of chain are later */\
1542 /* linked together, the dups appear in the chain */\
1543 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1544 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1545 } else { \
1546 /* we haven't inserted this word yet. */ \
1547 trie->states[ state ].wordnum = curword; \
1548 } \
1549} STMT_END
07be1b83 1550
3dab1dad 1551
786e8c11
YO
1552#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1553 ( ( base + charid >= ucharcount \
1554 && base + charid < ubound \
1555 && state == trie->trans[ base - ucharcount + charid ].check \
1556 && trie->trans[ base - ucharcount + charid ].next ) \
1557 ? trie->trans[ base - ucharcount + charid ].next \
1558 : ( state==1 ? special : 0 ) \
1559 )
3dab1dad 1560
786e8c11
YO
1561#define MADE_TRIE 1
1562#define MADE_JUMP_TRIE 2
1563#define MADE_EXACT_TRIE 4
3dab1dad 1564
a3621e74 1565STATIC I32
786e8c11 1566S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1567{
27da23d5 1568 dVAR;
a3621e74
YO
1569 /* first pass, loop through and scan words */
1570 reg_trie_data *trie;
55eed653 1571 HV *widecharmap = NULL;
2b8b4781 1572 AV *revcharmap = newAV();
a3621e74 1573 regnode *cur;
a3621e74
YO
1574 STRLEN len = 0;
1575 UV uvc = 0;
1576 U16 curword = 0;
1577 U32 next_alloc = 0;
786e8c11
YO
1578 regnode *jumper = NULL;
1579 regnode *nextbranch = NULL;
7f69552c 1580 regnode *convert = NULL;
2e64971a 1581 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1582 /* we just use folder as a flag in utf8 */
1e696034 1583 const U8 * folder = NULL;
a3621e74 1584
2b8b4781 1585#ifdef DEBUGGING
cf78de0b 1586 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2b8b4781
NC
1587 AV *trie_words = NULL;
1588 /* along with revcharmap, this only used during construction but both are
1589 * useful during debugging so we store them in the struct when debugging.
8e11feef 1590 */
2b8b4781 1591#else
cf78de0b 1592 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 1593 STRLEN trie_charcount=0;
3dab1dad 1594#endif
2b8b4781 1595 SV *re_trie_maxbuff;
a3621e74 1596 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1597
1598 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1599#ifndef DEBUGGING
1600 PERL_UNUSED_ARG(depth);
1601#endif
a3621e74 1602
1e696034 1603 switch (flags) {
79a81a6e 1604 case EXACT: break;
2f7f8cb1 1605 case EXACTFA:
fab2782b 1606 case EXACTFU_SS:
1e696034
KW
1607 case EXACTFU: folder = PL_fold_latin1; break;
1608 case EXACTF: folder = PL_fold; break;
1609 case EXACTFL: folder = PL_fold_locale; break;
fab2782b 1610 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
1611 }
1612
c944940b 1613 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1614 trie->refcount = 1;
3dab1dad 1615 trie->startstate = 1;
786e8c11 1616 trie->wordcount = word_count;
f8fc2ecf 1617 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1618 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
fab2782b 1619 if (flags == EXACT)
c944940b 1620 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1621 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1622 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1623
a3621e74 1624 DEBUG_r({
2b8b4781 1625 trie_words = newAV();
a3621e74 1626 });
a3621e74 1627
0111c4fd 1628 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1629 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1630 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1631 }
df826430 1632 DEBUG_TRIE_COMPILE_r({
3dab1dad 1633 PerlIO_printf( Perl_debug_log,
786e8c11 1634 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1635 (int)depth * 2 + 2, "",
1636 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1637 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1638 (int)depth);
3dab1dad 1639 });
7f69552c
YO
1640
1641 /* Find the node we are going to overwrite */
1642 if ( first == startbranch && OP( last ) != BRANCH ) {
1643 /* whole branch chain */
1644 convert = first;
1645 } else {
1646 /* branch sub-chain */
1647 convert = NEXTOPER( first );
1648 }
1649
a3621e74
YO
1650 /* -- First loop and Setup --
1651
1652 We first traverse the branches and scan each word to determine if it
1653 contains widechars, and how many unique chars there are, this is
1654 important as we have to build a table with at least as many columns as we
1655 have unique chars.
1656
1657 We use an array of integers to represent the character codes 0..255
38a44b82 1658 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1659 native representation of the character value as the key and IV's for the
1660 coded index.
1661
1662 *TODO* If we keep track of how many times each character is used we can
1663 remap the columns so that the table compression later on is more
3b753521 1664 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
1665 middle and the least common are on the outside. IMO this would be better
1666 than a most to least common mapping as theres a decent chance the most
1667 common letter will share a node with the least common, meaning the node
486ec47a 1668 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
1669 case is when we have the least common nodes twice.
1670
1671 */
1672
a3621e74 1673 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 1674 regnode *noper = NEXTOPER( cur );
e1ec3a88 1675 const U8 *uc = (U8*)STRING( noper );
df826430 1676 const U8 *e = uc + STR_LEN( noper );
a3621e74 1677 STRLEN foldlen = 0;
07be1b83 1678 U32 wordlen = 0; /* required init */
afa96d92
KW
1679 STRLEN minbytes = 0;
1680 STRLEN maxbytes = 0;
02daf0ab 1681 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1682
3dab1dad 1683 if (OP(noper) == NOTHING) {
df826430
YO
1684 regnode *noper_next= regnext(noper);
1685 if (noper_next != tail && OP(noper_next) == flags) {
1686 noper = noper_next;
1687 uc= (U8*)STRING(noper);
1688 e= uc + STR_LEN(noper);
1689 trie->minlen= STR_LEN(noper);
1690 } else {
1691 trie->minlen= 0;
1692 continue;
1693 }
3dab1dad 1694 }
df826430 1695
fab2782b 1696 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
1697 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1698 regardless of encoding */
fab2782b
YO
1699 if (OP( noper ) == EXACTFU_SS) {
1700 /* false positives are ok, so just set this */
0dc4a61d 1701 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
1702 }
1703 }
a3621e74 1704 for ( ; uc < e ; uc += len ) {
3dab1dad 1705 TRIE_CHARCOUNT(trie)++;
a3621e74 1706 TRIE_READ_CHAR;
645de4ce
KW
1707
1708 /* Acummulate to the current values, the range in the number of
1709 * bytes that this character could match. The max is presumed to
1710 * be the same as the folded input (which TRIE_READ_CHAR returns),
1711 * except that when this is not in UTF-8, it could be matched
1712 * against a string which is UTF-8, and the variant characters
1713 * could be 2 bytes instead of the 1 here. Likewise, for the
1714 * minimum number of bytes when not folded. When folding, the min
1715 * is assumed to be 1 byte could fold to match the single character
1716 * here, or in the case of a multi-char fold, 1 byte can fold to
1717 * the whole sequence. 'foldlen' is used to denote whether we are
1718 * in such a sequence, skipping the min setting if so. XXX TODO
1719 * Use the exact list of what folds to each character, from
1720 * PL_utf8_foldclosures */
1721 if (UTF) {
1722 maxbytes += UTF8SKIP(uc);
1723 if (! folder) {
1724 /* A non-UTF-8 string could be 1 byte to match our 2 */
1725 minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
1726 ? 1
1727 : UTF8SKIP(uc);
1728 }
1729 else {
1730 if (foldlen) {
1731 foldlen -= UTF8SKIP(uc);
1732 }
1733 else {
1734 foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
1735 minbytes++;
1736 }
1737 }
1738 }
1739 else {
1740 maxbytes += (UNI_IS_INVARIANT(*uc))
1741 ? 1
1742 : 2;
1743 if (! folder) {
1744 minbytes++;
1745 }
1746 else {
1747 if (foldlen) {
1748 foldlen--;
1749 }
1750 else {
1751 foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
1752 minbytes++;
1753 }
1754 }
1755 }
a3621e74 1756 if ( uvc < 256 ) {
fab2782b
YO
1757 if ( folder ) {
1758 U8 folded= folder[ (U8) uvc ];
1759 if ( !trie->charmap[ folded ] ) {
1760 trie->charmap[ folded ]=( ++trie->uniquecharcount );
1761 TRIE_STORE_REVCHAR( folded );
1762 }
1763 }
a3621e74
YO
1764 if ( !trie->charmap[ uvc ] ) {
1765 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 1766 TRIE_STORE_REVCHAR( uvc );
a3621e74 1767 }
02daf0ab 1768 if ( set_bit ) {
62012aee
KW
1769 /* store the codepoint in the bitmap, and its folded
1770 * equivalent. */
fab2782b 1771 TRIE_BITMAP_SET(trie, uvc);
0921ee73
T
1772
1773 /* store the folded codepoint */
fab2782b 1774 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
0921ee73
T
1775
1776 if ( !UTF ) {
1777 /* store first byte of utf8 representation of
acdf4139 1778 variant codepoints */
6f2d5cbc 1779 if (! UVCHR_IS_INVARIANT(uvc)) {
acdf4139 1780 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
1781 }
1782 }
02daf0ab
YO
1783 set_bit = 0; /* We've done our bit :-) */
1784 }
a3621e74
YO
1785 } else {
1786 SV** svpp;
55eed653
NC
1787 if ( !widecharmap )
1788 widecharmap = newHV();
a3621e74 1789
55eed653 1790 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
1791
1792 if ( !svpp )
e4584336 1793 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
1794
1795 if ( !SvTRUE( *svpp ) ) {
1796 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 1797 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
1798 }
1799 }
1800 }
3dab1dad 1801 if( cur == first ) {
afa96d92
KW
1802 trie->minlen = minbytes;
1803 trie->maxlen = maxbytes;
1804 } else if (minbytes < trie->minlen) {
1805 trie->minlen = minbytes;
1806 } else if (maxbytes > trie->maxlen) {
1807 trie->maxlen = maxbytes;
fab2782b 1808 }
a3621e74
YO
1809 } /* end first pass */
1810 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
1811 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
1812 (int)depth * 2 + 2,"",
55eed653 1813 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
1814 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
1815 (int)trie->minlen, (int)trie->maxlen )
a3621e74 1816 );
a3621e74
YO
1817
1818 /*
1819 We now know what we are dealing with in terms of unique chars and
1820 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
1821 representation using a flat table will take. If it's over a reasonable
1822 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
1823 conservative but potentially much slower representation using an array
1824 of lists.
1825
1826 At the end we convert both representations into the same compressed
1827 form that will be used in regexec.c for matching with. The latter
1828 is a form that cannot be used to construct with but has memory
1829 properties similar to the list form and access properties similar
1830 to the table form making it both suitable for fast searches and
1831 small enough that its feasable to store for the duration of a program.
1832
1833 See the comment in the code where the compressed table is produced
1834 inplace from the flat tabe representation for an explanation of how
1835 the compression works.
1836
1837 */
1838
1839
2e64971a
DM
1840 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
1841 prev_states[1] = 0;
1842
3dab1dad 1843 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
1844 /*
1845 Second Pass -- Array Of Lists Representation
1846
1847 Each state will be represented by a list of charid:state records
1848 (reg_trie_trans_le) the first such element holds the CUR and LEN
1849 points of the allocated array. (See defines above).
1850
1851 We build the initial structure using the lists, and then convert
1852 it into the compressed table form which allows faster lookups
1853 (but cant be modified once converted).
a3621e74
YO
1854 */
1855
a3621e74
YO
1856 STRLEN transcount = 1;
1857
1e2e3d02
YO
1858 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1859 "%*sCompiling trie using list compiler\n",
1860 (int)depth * 2 + 2, ""));
686b73d4 1861
c944940b
JH
1862 trie->states = (reg_trie_state *)
1863 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
1864 sizeof(reg_trie_state) );
a3621e74
YO
1865 TRIE_LIST_NEW(1);
1866 next_alloc = 2;
1867
1868 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
1869
df826430 1870 regnode *noper = NEXTOPER( cur );
c445ea15 1871 U8 *uc = (U8*)STRING( noper );
df826430 1872 const U8 *e = uc + STR_LEN( noper );
c445ea15
AL
1873 U32 state = 1; /* required init */
1874 U16 charid = 0; /* sanity init */
07be1b83 1875 U32 wordlen = 0; /* required init */
c445ea15 1876
df826430
YO
1877 if (OP(noper) == NOTHING) {
1878 regnode *noper_next= regnext(noper);
1879 if (noper_next != tail && OP(noper_next) == flags) {
1880 noper = noper_next;
1881 uc= (U8*)STRING(noper);
1882 e= uc + STR_LEN(noper);
1883 }
1884 }
1885
3dab1dad 1886 if (OP(noper) != NOTHING) {
786e8c11 1887 for ( ; uc < e ; uc += len ) {
c445ea15 1888
786e8c11 1889 TRIE_READ_CHAR;
c445ea15 1890
786e8c11
YO
1891 if ( uvc < 256 ) {
1892 charid = trie->charmap[ uvc ];
c445ea15 1893 } else {
55eed653 1894 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
1895 if ( !svpp ) {
1896 charid = 0;
1897 } else {
1898 charid=(U16)SvIV( *svpp );
1899 }
c445ea15 1900 }
786e8c11
YO
1901 /* charid is now 0 if we dont know the char read, or nonzero if we do */
1902 if ( charid ) {
a3621e74 1903
786e8c11
YO
1904 U16 check;
1905 U32 newstate = 0;
a3621e74 1906
786e8c11
YO
1907 charid--;
1908 if ( !trie->states[ state ].trans.list ) {
1909 TRIE_LIST_NEW( state );
c445ea15 1910 }
786e8c11
YO
1911 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
1912 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
1913 newstate = TRIE_LIST_ITEM( state, check ).newstate;
1914 break;
1915 }
1916 }
1917 if ( ! newstate ) {
1918 newstate = next_alloc++;
2e64971a 1919 prev_states[newstate] = state;
786e8c11
YO
1920 TRIE_LIST_PUSH( state, charid, newstate );
1921 transcount++;
1922 }
1923 state = newstate;
1924 } else {
1925 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 1926 }
a28509cc 1927 }
c445ea15 1928 }
3dab1dad 1929 TRIE_HANDLE_WORD(state);
a3621e74
YO
1930
1931 } /* end second pass */
1932
1e2e3d02
YO
1933 /* next alloc is the NEXT state to be allocated */
1934 trie->statecount = next_alloc;
c944940b
JH
1935 trie->states = (reg_trie_state *)
1936 PerlMemShared_realloc( trie->states,
1937 next_alloc
1938 * sizeof(reg_trie_state) );
a3621e74 1939
3dab1dad 1940 /* and now dump it out before we compress it */
2b8b4781
NC
1941 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
1942 revcharmap, next_alloc,
1943 depth+1)
1e2e3d02 1944 );
a3621e74 1945
c944940b
JH
1946 trie->trans = (reg_trie_trans *)
1947 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
1948 {
1949 U32 state;
a3621e74
YO
1950 U32 tp = 0;
1951 U32 zp = 0;
1952
1953
1954 for( state=1 ; state < next_alloc ; state ++ ) {
1955 U32 base=0;
1956
1957 /*
1958 DEBUG_TRIE_COMPILE_MORE_r(
1959 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
1960 );
1961 */
1962
1963 if (trie->states[state].trans.list) {
1964 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
1965 U16 maxid=minid;
a28509cc 1966 U16 idx;
a3621e74
YO
1967
1968 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
1969 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
1970 if ( forid < minid ) {
1971 minid=forid;
1972 } else if ( forid > maxid ) {
1973 maxid=forid;
1974 }
a3621e74
YO
1975 }
1976 if ( transcount < tp + maxid - minid + 1) {
1977 transcount *= 2;
c944940b
JH
1978 trie->trans = (reg_trie_trans *)
1979 PerlMemShared_realloc( trie->trans,
446bd890
NC
1980 transcount
1981 * sizeof(reg_trie_trans) );
a3621e74
YO
1982 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
1983 }
1984 base = trie->uniquecharcount + tp - minid;
1985 if ( maxid == minid ) {
1986 U32 set = 0;
1987 for ( ; zp < tp ; zp++ ) {
1988 if ( ! trie->trans[ zp ].next ) {
1989 base = trie->uniquecharcount + zp - minid;
1990 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1991 trie->trans[ zp ].check = state;
1992 set = 1;
1993 break;
1994 }
1995 }
1996 if ( !set ) {
1997 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
1998 trie->trans[ tp ].check = state;
1999 tp++;
2000 zp = tp;
2001 }
2002 } else {
2003 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 2004 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
2005 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
2006 trie->trans[ tid ].check = state;
2007 }
2008 tp += ( maxid - minid + 1 );
2009 }
2010 Safefree(trie->states[ state ].trans.list);
2011 }
2012 /*
2013 DEBUG_TRIE_COMPILE_MORE_r(
2014 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2015 );
2016 */
2017 trie->states[ state ].trans.base=base;
2018 }
cc601c31 2019 trie->lasttrans = tp + 1;
a3621e74
YO
2020 }
2021 } else {
2022 /*
2023 Second Pass -- Flat Table Representation.
2024
2025 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
2026 We know that we will need Charcount+1 trans at most to store the data
2027 (one row per char at worst case) So we preallocate both structures
2028 assuming worst case.
2029
2030 We then construct the trie using only the .next slots of the entry
2031 structs.
2032
3b753521 2033 We use the .check field of the first entry of the node temporarily to
a3621e74
YO
2034 make compression both faster and easier by keeping track of how many non
2035 zero fields are in the node.
2036
2037 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2038 transition.
2039
2040 There are two terms at use here: state as a TRIE_NODEIDX() which is a
2041 number representing the first entry of the node, and state as a
2042 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
2043 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
2044 are 2 entrys per node. eg:
2045
2046 A B A B
2047 1. 2 4 1. 3 7
2048 2. 0 3 3. 0 5
2049 3. 0 0 5. 0 0
2050 4. 0 0 7. 0 0
2051
2052 The table is internally in the right hand, idx form. However as we also
2053 have to deal with the states array which is indexed by nodenum we have to
2054 use TRIE_NODENUM() to convert.
2055
2056 */
1e2e3d02
YO
2057 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2058 "%*sCompiling trie using table compiler\n",
2059 (int)depth * 2 + 2, ""));
3dab1dad 2060
c944940b
JH
2061 trie->trans = (reg_trie_trans *)
2062 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2063 * trie->uniquecharcount + 1,
2064 sizeof(reg_trie_trans) );
2065 trie->states = (reg_trie_state *)
2066 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2067 sizeof(reg_trie_state) );
a3621e74
YO
2068 next_alloc = trie->uniquecharcount + 1;
2069
3dab1dad 2070
a3621e74
YO
2071 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2072
df826430 2073 regnode *noper = NEXTOPER( cur );
a28509cc 2074 const U8 *uc = (U8*)STRING( noper );
df826430 2075 const U8 *e = uc + STR_LEN( noper );
a3621e74
YO
2076
2077 U32 state = 1; /* required init */
2078
2079 U16 charid = 0; /* sanity init */
2080 U32 accept_state = 0; /* sanity init */
a3621e74 2081
07be1b83 2082 U32 wordlen = 0; /* required init */
a3621e74 2083
df826430
YO
2084 if (OP(noper) == NOTHING) {
2085 regnode *noper_next= regnext(noper);
2086 if (noper_next != tail && OP(noper_next) == flags) {
2087 noper = noper_next;
2088 uc= (U8*)STRING(noper);
2089 e= uc + STR_LEN(noper);
2090 }
2091 }
fab2782b 2092
3dab1dad 2093 if ( OP(noper) != NOTHING ) {
786e8c11 2094 for ( ; uc < e ; uc += len ) {
a3621e74 2095
786e8c11 2096 TRIE_READ_CHAR;
a3621e74 2097
786e8c11
YO
2098 if ( uvc < 256 ) {
2099 charid = trie->charmap[ uvc ];
2100 } else {
55eed653 2101 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 2102 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 2103 }
786e8c11
YO
2104 if ( charid ) {
2105 charid--;
2106 if ( !trie->trans[ state + charid ].next ) {
2107 trie->trans[ state + charid ].next = next_alloc;
2108 trie->trans[ state ].check++;
2e64971a
DM
2109 prev_states[TRIE_NODENUM(next_alloc)]
2110 = TRIE_NODENUM(state);
786e8c11
YO
2111 next_alloc += trie->uniquecharcount;
2112 }
2113 state = trie->trans[ state + charid ].next;
2114 } else {
2115 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2116 }
2117 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 2118 }
a3621e74 2119 }
3dab1dad
YO
2120 accept_state = TRIE_NODENUM( state );
2121 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
2122
2123 } /* end second pass */
2124
3dab1dad 2125 /* and now dump it out before we compress it */
2b8b4781
NC
2126 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2127 revcharmap,
2128 next_alloc, depth+1));
a3621e74 2129
a3621e74
YO
2130 {
2131 /*
2132 * Inplace compress the table.*
2133
2134 For sparse data sets the table constructed by the trie algorithm will
2135 be mostly 0/FAIL transitions or to put it another way mostly empty.
2136 (Note that leaf nodes will not contain any transitions.)
2137
2138 This algorithm compresses the tables by eliminating most such
2139 transitions, at the cost of a modest bit of extra work during lookup:
2140
2141 - Each states[] entry contains a .base field which indicates the
2142 index in the state[] array wheres its transition data is stored.
2143
3b753521 2144 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
2145
2146 - If .base is nonzero then charid is added to it to find an entry in
2147 the trans array.
2148
2149 -If trans[states[state].base+charid].check!=state then the
2150 transition is taken to be a 0/Fail transition. Thus if there are fail
2151 transitions at the front of the node then the .base offset will point
2152 somewhere inside the previous nodes data (or maybe even into a node
2153 even earlier), but the .check field determines if the transition is
2154 valid.
2155
786e8c11 2156 XXX - wrong maybe?
a3621e74 2157 The following process inplace converts the table to the compressed
3b753521 2158 table: We first do not compress the root node 1,and mark all its
a3621e74 2159 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
2160 allows us to do a DFA construction from the compressed table later,
2161 and ensures that any .base pointers we calculate later are greater
2162 than 0.
a3621e74
YO
2163
2164 - We set 'pos' to indicate the first entry of the second node.
2165
2166 - We then iterate over the columns of the node, finding the first and
2167 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2168 and set the .check pointers accordingly, and advance pos
2169 appropriately and repreat for the next node. Note that when we copy
2170 the next pointers we have to convert them from the original
2171 NODEIDX form to NODENUM form as the former is not valid post
2172 compression.
2173
2174 - If a node has no transitions used we mark its base as 0 and do not
2175 advance the pos pointer.
2176
2177 - If a node only has one transition we use a second pointer into the
2178 structure to fill in allocated fail transitions from other states.
2179 This pointer is independent of the main pointer and scans forward
2180 looking for null transitions that are allocated to a state. When it
2181 finds one it writes the single transition into the "hole". If the
786e8c11 2182 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
2183
2184 - Once compressed we can Renew/realloc the structures to release the
2185 excess space.
2186
2187 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2188 specifically Fig 3.47 and the associated pseudocode.
2189
2190 demq
2191 */
a3b680e6 2192 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 2193 U32 state, charid;
a3621e74 2194 U32 pos = 0, zp=0;
1e2e3d02 2195 trie->statecount = laststate;
a3621e74
YO
2196
2197 for ( state = 1 ; state < laststate ; state++ ) {
2198 U8 flag = 0;
a28509cc
AL
2199 const U32 stateidx = TRIE_NODEIDX( state );
2200 const U32 o_used = trie->trans[ stateidx ].check;
2201 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
2202 trie->trans[ stateidx ].check = 0;
2203
2204 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2205 if ( flag || trie->trans[ stateidx + charid ].next ) {
2206 if ( trie->trans[ stateidx + charid ].next ) {
2207 if (o_used == 1) {
2208 for ( ; zp < pos ; zp++ ) {
2209 if ( ! trie->trans[ zp ].next ) {
2210 break;
2211 }
2212 }
2213 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2214 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2215 trie->trans[ zp ].check = state;
2216 if ( ++zp > pos ) pos = zp;
2217 break;
2218 }
2219 used--;
2220 }
2221 if ( !flag ) {
2222 flag = 1;
2223 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2224 }
2225 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2226 trie->trans[ pos ].check = state;
2227 pos++;
2228 }
2229 }
2230 }
cc601c31 2231 trie->lasttrans = pos + 1;
c944940b
JH
2232 trie->states = (reg_trie_state *)
2233 PerlMemShared_realloc( trie->states, laststate
2234 * sizeof(reg_trie_state) );
a3621e74 2235 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 2236 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2237 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2238 (int)depth * 2 + 2,"",
2239 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
2240 (IV)next_alloc,
2241 (IV)pos,
a3621e74
YO
2242 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2243 );
2244
2245 } /* end table compress */
2246 }
1e2e3d02
YO
2247 DEBUG_TRIE_COMPILE_MORE_r(
2248 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2249 (int)depth * 2 + 2, "",
2250 (UV)trie->statecount,
2251 (UV)trie->lasttrans)
2252 );
cc601c31 2253 /* resize the trans array to remove unused space */
c944940b
JH
2254 trie->trans = (reg_trie_trans *)
2255 PerlMemShared_realloc( trie->trans, trie->lasttrans
2256 * sizeof(reg_trie_trans) );
a3621e74 2257
3b753521 2258 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
2259 U8 nodetype =(U8)(flags & 0xFF);
2260 char *str=NULL;
786e8c11 2261
07be1b83 2262#ifdef DEBUGGING
e62cc96a 2263 regnode *optimize = NULL;
7122b237
YO
2264#ifdef RE_TRACK_PATTERN_OFFSETS
2265
b57a0404
JH
2266 U32 mjd_offset = 0;
2267 U32 mjd_nodelen = 0;
7122b237
YO
2268#endif /* RE_TRACK_PATTERN_OFFSETS */
2269#endif /* DEBUGGING */
a3621e74 2270 /*
3dab1dad
YO
2271 This means we convert either the first branch or the first Exact,
2272 depending on whether the thing following (in 'last') is a branch
2273 or not and whther first is the startbranch (ie is it a sub part of
2274 the alternation or is it the whole thing.)
3b753521 2275 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 2276 the whole branch sequence, including the first.
a3621e74 2277 */
3dab1dad 2278 /* Find the node we are going to overwrite */
7f69552c 2279 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 2280 /* branch sub-chain */
3dab1dad 2281 NEXT_OFF( first ) = (U16)(last - first);
7122b237 2282#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
2283 DEBUG_r({
2284 mjd_offset= Node_Offset((convert));
2285 mjd_nodelen= Node_Length((convert));
2286 });
7122b237 2287#endif
7f69552c 2288 /* whole branch chain */
7122b237
YO
2289 }
2290#ifdef RE_TRACK_PATTERN_OFFSETS
2291 else {
7f69552c
YO
2292 DEBUG_r({
2293 const regnode *nop = NEXTOPER( convert );
2294 mjd_offset= Node_Offset((nop));
2295 mjd_nodelen= Node_Length((nop));
2296 });
07be1b83
YO
2297 }
2298 DEBUG_OPTIMISE_r(
2299 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2300 (int)depth * 2 + 2, "",
786e8c11 2301 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2302 );
7122b237 2303#endif
3dab1dad
YO
2304 /* But first we check to see if there is a common prefix we can
2305 split out as an EXACT and put in front of the TRIE node. */
2306 trie->startstate= 1;
55eed653 2307 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2308 U32 state;
1e2e3d02 2309 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2310 U32 ofs = 0;
8e11feef
RGS
2311 I32 idx = -1;
2312 U32 count = 0;
2313 const U32 base = trie->states[ state ].trans.base;
a3621e74 2314
3dab1dad 2315 if ( trie->states[state].wordnum )
8e11feef 2316 count = 1;
a3621e74 2317
8e11feef 2318 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2319 if ( ( base + ofs >= trie->uniquecharcount ) &&
2320 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2321 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2322 {
3dab1dad 2323 if ( ++count > 1 ) {
2b8b4781 2324 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2325 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2326 if ( state == 1 ) break;
3dab1dad
YO
2327 if ( count == 2 ) {
2328 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2329 DEBUG_OPTIMISE_r(
8e11feef
RGS
2330 PerlIO_printf(Perl_debug_log,
2331 "%*sNew Start State=%"UVuf" Class: [",
2332 (int)depth * 2 + 2, "",
786e8c11 2333 (UV)state));
be8e71aa 2334 if (idx >= 0) {
2b8b4781 2335 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2336 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2337
3dab1dad 2338 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2339 if ( folder )
2340 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2341 DEBUG_OPTIMISE_r(
f1f66076 2342 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2343 );
8e11feef
RGS
2344 }
2345 }
2346 TRIE_BITMAP_SET(trie,*ch);
2347 if ( folder )
2348 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2349 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2350 }
2351 idx = ofs;
2352 }
3dab1dad
YO
2353 }
2354 if ( count == 1 ) {
2b8b4781 2355 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2356 STRLEN len;
2357 char *ch = SvPV( *tmp, len );
de734bd5
A
2358 DEBUG_OPTIMISE_r({
2359 SV *sv=sv_newmortal();
8e11feef
RGS
2360 PerlIO_printf( Perl_debug_log,
2361 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2362 (int)depth * 2 + 2, "",
de734bd5
A
2363 (UV)state, (UV)idx,
2364 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2365 PL_colors[0], PL_colors[1],
2366 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2367 PERL_PV_ESCAPE_FIRSTCHAR
2368 )
2369 );
2370 });
3dab1dad
YO
2371 if ( state==1 ) {
2372 OP( convert ) = nodetype;
2373 str=STRING(convert);
2374 STR_LEN(convert)=0;
2375 }
c490c714
YO
2376 STR_LEN(convert) += len;
2377 while (len--)
de734bd5 2378 *str++ = *ch++;
8e11feef 2379 } else {
f9049ba1 2380#ifdef DEBUGGING
8e11feef
RGS
2381 if (state>1)
2382 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2383#endif
8e11feef
RGS
2384 break;
2385 }
2386 }
2e64971a 2387 trie->prefixlen = (state-1);
3dab1dad 2388 if (str) {
8e11feef 2389 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2390 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2391 trie->startstate = state;
07be1b83
YO
2392 trie->minlen -= (state - 1);
2393 trie->maxlen -= (state - 1);
33809eae
JH
2394#ifdef DEBUGGING
2395 /* At least the UNICOS C compiler choked on this
2396 * being argument to DEBUG_r(), so let's just have
2397 * it right here. */
2398 if (
2399#ifdef PERL_EXT_RE_BUILD
2400 1
2401#else
2402 DEBUG_r_TEST
2403#endif
2404 ) {
2405 regnode *fix = convert;
2406 U32 word = trie->wordcount;
2407 mjd_nodelen++;
2408 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2409 while( ++fix < n ) {
2410 Set_Node_Offset_Length(fix, 0, 0);
2411 }
2412 while (word--) {
2413 SV ** const tmp = av_fetch( trie_words, word, 0 );
2414 if (tmp) {
2415 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2416 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2417 else
2418 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2419 }
2420 }
2421 }
2422#endif
8e11feef
RGS
2423 if (trie->maxlen) {
2424 convert = n;
2425 } else {
3dab1dad 2426 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2427 DEBUG_r(optimize= n);
3dab1dad
YO
2428 }
2429 }
2430 }
a5ca303d
YO
2431 if (!jumper)
2432 jumper = last;
3dab1dad 2433 if ( trie->maxlen ) {
8e11feef
RGS
2434 NEXT_OFF( convert ) = (U16)(tail - convert);
2435 ARG_SET( convert, data_slot );
786e8c11
YO
2436 /* Store the offset to the first unabsorbed branch in
2437 jump[0], which is otherwise unused by the jump logic.
2438 We use this when dumping a trie and during optimisation. */
2439 if (trie->jump)
7f69552c 2440 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2441
6c48061a
YO
2442 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2443 * and there is a bitmap
2444 * and the first "jump target" node we found leaves enough room
2445 * then convert the TRIE node into a TRIEC node, with the bitmap
2446 * embedded inline in the opcode - this is hypothetically faster.
2447 */
2448 if ( !trie->states[trie->startstate].wordnum
2449 && trie->bitmap
2450 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2451 {
2452 OP( convert ) = TRIEC;
2453 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2454 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2455 trie->bitmap= NULL;
2456 } else
2457 OP( convert ) = TRIE;
a3621e74 2458
3dab1dad
YO
2459 /* store the type in the flags */
2460 convert->flags = nodetype;
a5ca303d
YO
2461 DEBUG_r({
2462 optimize = convert
2463 + NODE_STEP_REGNODE
2464 + regarglen[ OP( convert ) ];
2465 });
2466 /* XXX We really should free up the resource in trie now,
2467 as we won't use them - (which resources?) dmq */
3dab1dad 2468 }
a3621e74 2469 /* needed for dumping*/
e62cc96a 2470 DEBUG_r(if (optimize) {
07be1b83 2471 regnode *opt = convert;
bcdf7404 2472
e62cc96a 2473 while ( ++opt < optimize) {
07be1b83
YO
2474 Set_Node_Offset_Length(opt,0,0);
2475 }
786e8c11
YO
2476 /*
2477 Try to clean up some of the debris left after the
2478 optimisation.
a3621e74 2479 */
786e8c11 2480 while( optimize < jumper ) {
07be1b83 2481 mjd_nodelen += Node_Length((optimize));
a3621e74 2482 OP( optimize ) = OPTIMIZED;
07be1b83 2483 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2484 optimize++;
2485 }
07be1b83 2486 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2487 });
2488 } /* end node insert */
2e64971a
DM
2489
2490 /* Finish populating the prev field of the wordinfo array. Walk back
2491 * from each accept state until we find another accept state, and if
2492 * so, point the first word's .prev field at the second word. If the
2493 * second already has a .prev field set, stop now. This will be the
2494 * case either if we've already processed that word's accept state,
3b753521
FN
2495 * or that state had multiple words, and the overspill words were
2496 * already linked up earlier.
2e64971a
DM
2497 */
2498 {
2499 U16 word;
2500 U32 state;
2501 U16 prev;
2502
2503 for (word=1; word <= trie->wordcount; word++) {
2504 prev = 0;
2505 if (trie->wordinfo[word].prev)
2506 continue;
2507 state = trie->wordinfo[word].accept;
2508 while (state) {
2509 state = prev_states[state];
2510 if (!state)
2511 break;
2512 prev = trie->states[state].wordnum;
2513 if (prev)
2514 break;
2515 }
2516 trie->wordinfo[word].prev = prev;
2517 }
2518 Safefree(prev_states);
2519 }
2520
2521
2522 /* and now dump out the compressed format */
2523 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2524
55eed653 2525 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2526#ifdef DEBUGGING
2527 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2528 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2529#else
03e70be4 2530 SvREFCNT_dec_NN(revcharmap);
07be1b83 2531#endif
786e8c11
YO
2532 return trie->jump
2533 ? MADE_JUMP_TRIE
2534 : trie->startstate>1
2535 ? MADE_EXACT_TRIE
2536 : MADE_TRIE;
2537}
2538
2539STATIC void
2540S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2541{
3b753521 2542/* The Trie is constructed and compressed now so we can build a fail array if it's needed
786e8c11
YO
2543
2544 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2545 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2546 ISBN 0-201-10088-6
2547
2548 We find the fail state for each state in the trie, this state is the longest proper
3b753521
FN
2549 suffix of the current state's 'word' that is also a proper prefix of another word in our
2550 trie. State 1 represents the word '' and is thus the default fail state. This allows
786e8c11
YO
2551 the DFA not to have to restart after its tried and failed a word at a given point, it
2552 simply continues as though it had been matching the other word in the first place.
2553 Consider
2554 'abcdgu'=~/abcdefg|cdgu/
2555 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
3b753521
FN
2556 fail, which would bring us to the state representing 'd' in the second word where we would
2557 try 'g' and succeed, proceeding to match 'cdgu'.
786e8c11
YO
2558 */
2559 /* add a fail transition */
3251b653
NC
2560 const U32 trie_offset = ARG(source);
2561 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2562 U32 *q;
2563 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2564 const U32 numstates = trie->statecount;
786e8c11
YO
2565 const U32 ubound = trie->lasttrans + ucharcount;
2566 U32 q_read = 0;
2567 U32 q_write = 0;
2568 U32 charid;
2569 U32 base = trie->states[ 1 ].trans.base;
2570 U32 *fail;
2571 reg_ac_data *aho;
cf78de0b 2572 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
786e8c11 2573 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2574
2575 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2576#ifndef DEBUGGING
2577 PERL_UNUSED_ARG(depth);
2578#endif
2579
2580
2581 ARG_SET( stclass, data_slot );
c944940b 2582 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2583 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2584 aho->trie=trie_offset;
446bd890
NC
2585 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2586 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2587 Newxz( q, numstates, U32);
c944940b 2588 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2589 aho->refcount = 1;
2590 fail = aho->fail;
2591 /* initialize fail[0..1] to be 1 so that we always have
2592 a valid final fail state */
2593 fail[ 0 ] = fail[ 1 ] = 1;
2594
2595 for ( charid = 0; charid < ucharcount ; charid++ ) {
2596 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2597 if ( newstate ) {
2598 q[ q_write ] = newstate;
2599 /* set to point at the root */
2600 fail[ q[ q_write++ ] ]=1;
2601 }
2602 }
2603 while ( q_read < q_write) {
2604 const U32 cur = q[ q_read++ % numstates ];
2605 base = trie->states[ cur ].trans.base;
2606
2607 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2608 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2609 if (ch_state) {
2610 U32 fail_state = cur;
2611 U32 fail_base;
2612 do {
2613 fail_state = fail[ fail_state ];
2614 fail_base = aho->states[ fail_state ].trans.base;
2615 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2616
2617 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2618 fail[ ch_state ] = fail_state;
2619 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2620 {
2621 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2622 }
2623 q[ q_write++ % numstates] = ch_state;
2624 }
2625 }
2626 }
2627 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2628 when we fail in state 1, this allows us to use the
2629 charclass scan to find a valid start char. This is based on the principle
2630 that theres a good chance the string being searched contains lots of stuff
2631 that cant be a start char.
2632 */
2633 fail[ 0 ] = fail[ 1 ] = 0;
2634 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2635 PerlIO_printf(Perl_debug_log,
2636 "%*sStclass Failtable (%"UVuf" states): 0",
2637 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2638 );
786e8c11
YO
2639 for( q_read=1; q_read<numstates; q_read++ ) {
2640 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2641 }
2642 PerlIO_printf(Perl_debug_log, "\n");
2643 });
2644 Safefree(q);
2645 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2646}
2647
786e8c11 2648
07be1b83 2649#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2650 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2651 SV * const mysv=sv_newmortal(); \
2652 regnode *Next = regnext(scan); \
2653 regprop(RExC_rx, mysv, scan); \
7f69552c 2654 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2655 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2656 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2657 }});
07be1b83 2658
1de06328 2659
bb914485 2660/* The below joins as many adjacent EXACTish nodes as possible into a single
0a982f06
KW
2661 * one. The regop may be changed if the node(s) contain certain sequences that
2662 * require special handling. The joining is only done if:
bb914485
KW
2663 * 1) there is room in the current conglomerated node to entirely contain the
2664 * next one.
2665 * 2) they are the exact same node type
2666 *
87b8b349 2667 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
bb914485
KW
2668 * these get optimized out
2669 *
0a982f06
KW
2670 * If a node is to match under /i (folded), the number of characters it matches
2671 * can be different than its character length if it contains a multi-character
2672 * fold. *min_subtract is set to the total delta of the input nodes.
bb914485 2673 *
a0c4c608
KW
2674 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2675 * and contains LATIN SMALL LETTER SHARP S
f758bddf 2676 *
bb914485 2677 * This is as good a place as any to discuss the design of handling these
0a982f06
KW
2678 * multi-character fold sequences. It's been wrong in Perl for a very long
2679 * time. There are three code points in Unicode whose multi-character folds
2680 * were long ago discovered to mess things up. The previous designs for
2681 * dealing with these involved assigning a special node for them. This
2682 * approach doesn't work, as evidenced by this example:
a0c4c608 2683 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
0a982f06
KW
2684 * Both these fold to "sss", but if the pattern is parsed to create a node that
2685 * would match just the \xDF, it won't be able to handle the case where a
bb914485
KW
2686 * successful match would have to cross the node's boundary. The new approach
2687 * that hopefully generally solves the problem generates an EXACTFU_SS node
2688 * that is "sss".
2689 *
0a982f06 2690 * It turns out that there are problems with all multi-character folds, and not
cb117658
KW
2691 * just these three. Now the code is general, for all such cases. The
2692 * approach taken is:
0a982f06
KW
2693 * 1) This routine examines each EXACTFish node that could contain multi-
2694 * character fold sequences. It returns in *min_subtract how much to
9d071ca8 2695 * subtract from the the actual length of the string to get a real minimum
0a982f06
KW
2696 * match length; it is 0 if there are no multi-char folds. This delta is
2697 * used by the caller to adjust the min length of the match, and the delta
2698 * between min and max, so that the optimizer doesn't reject these
2699 * possibilities based on size constraints.
cb117658 2700 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
0a982f06
KW
2701 * is used for an EXACTFU node that contains at least one "ss" sequence in
2702 * it. For non-UTF-8 patterns and strings, this is the only case where
2703 * there is a possible fold length change. That means that a regular
2704 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
2705 * with length changes, and so can be processed faster. regexec.c takes
2706 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
2707 * pre-folded by regcomp.c. This saves effort in regex matching.
87b8b349 2708 * However, the pre-folding isn't done for non-UTF8 patterns because the
0a982f06
KW
2709 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2710 * down by forcing the pattern into UTF8 unless necessary. Also what
2711 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
2712 * possibilities for the non-UTF8 patterns are quite simple, except for
2713 * the sharp s. All the ones that don't involve a UTF-8 target string are
2714 * members of a fold-pair, and arrays are set up for all of them so that
2715 * the other member of the pair can be found quickly. Code elsewhere in
2716 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2717 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
2718 * described in the next item.
098b07d5 2719 * 3) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
1ca267a5
KW
2720 * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
2721 * UTF-8 pattern.) An assumption that the optimizer part of regexec.c
2722 * (probably unwittingly, in Perl_regexec_flags()) makes is that a
2723 * character in the pattern corresponds to at most a single character in
2724 * the target string. (And I do mean character, and not byte here, unlike
2725 * other parts of the documentation that have never been updated to
2726 * account for multibyte Unicode.) sharp s in EXACTF nodes can match the
2727 * two character string 'ss'; in EXACTFA nodes it can match
2728 * "\x{17F}\x{17F}". These violate the assumption, and they are the only
2729 * instances where it is violated. I'm reluctant to try to change the
2730 * assumption, as the code involved is impenetrable to me (khw), so
2731 * instead the code here punts. This routine examines (when the pattern
2732 * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
2733 * boolean indicating whether or not the node contains a sharp s. When it
2734 * is true, the caller sets a flag that later causes the optimizer in this
2735 * file to not set values for the floating and fixed string lengths, and
2736 * thus avoids the optimizer code in regexec.c that makes the invalid
2737 * assumption. Thus, there is no optimization based on string lengths for
2738 * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
2739 * (The reason the assumption is wrong only in these two cases is that all
2740 * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
2741 * other folds to their expanded versions. We can't prefold sharp s to
2742 * 'ss' in EXACTF nodes because we don't know at compile time if it
2743 * actually matches 'ss' or not. It will match iff the target string is
2744 * in UTF-8, unlike the EXACTFU nodes, where it always matches; and
2745 * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8
2746 * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
2747 * but in a non-UTF8 pattern, folding it to that above-Latin1 string would
2748 * require the pattern to be forced into UTF-8, the overhead of which we
2749 * want to avoid.)
098b07d5
KW
2750 *
2751 * Similarly, the code that generates tries doesn't currently handle
2752 * not-already-folded multi-char folds, and it looks like a pain to change
2753 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
2754 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
2755 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
2756 * using /iaa matching will be doing so almost entirely with ASCII
2757 * strings, so this should rarely be encountered in practice */
1de06328 2758
9d071ca8 2759#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
07be1b83 2760 if (PL_regkind[OP(scan)] == EXACT) \
9d071ca8 2761 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
07be1b83 2762
be8e71aa 2763STATIC U32
9d071ca8 2764S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
07be1b83
YO
2765 /* Merge several consecutive EXACTish nodes into one. */
2766 regnode *n = regnext(scan);
2767 U32 stringok = 1;
2768 regnode *next = scan + NODE_SZ_STR(scan);
2769 U32 merged = 0;
2770 U32 stopnow = 0;
2771#ifdef DEBUGGING
2772 regnode *stop = scan;
72f13be8 2773 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 2774#else
d47053eb
RGS
2775 PERL_UNUSED_ARG(depth);
2776#endif
7918f24d
NC
2777
2778 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 2779#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
2780 PERL_UNUSED_ARG(flags);
2781 PERL_UNUSED_ARG(val);
07be1b83 2782#endif
07be1b83 2783 DEBUG_PEEP("join",scan,depth);
bb914485 2784
3f410cf6
KW
2785 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
2786 * EXACT ones that are mergeable to the current one. */
2787 while (n
2788 && (PL_regkind[OP(n)] == NOTHING
2789 || (stringok && OP(n) == OP(scan)))
07be1b83 2790 && NEXT_OFF(n)
3f410cf6
KW
2791 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
2792 {
07be1b83
YO
2793
2794 if (OP(n) == TAIL || n > next)
2795 stringok = 0;
2796 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
2797 DEBUG_PEEP("skip:",n,depth);
2798 NEXT_OFF(scan) += NEXT_OFF(n);
2799 next = n + NODE_STEP_REGNODE;
2800#ifdef DEBUGGING
2801 if (stringok)
2802 stop = n;
2803#endif
2804 n = regnext(n);
2805 }
2806 else if (stringok) {
786e8c11 2807 const unsigned int oldl = STR_LEN(scan);
07be1b83 2808 regnode * const nnext = regnext(n);
b2230d39 2809
87b8b349
KW
2810 /* XXX I (khw) kind of doubt that this works on platforms where
2811 * U8_MAX is above 255 because of lots of other assumptions */
79a81a6e 2812 /* Don't join if the sum can't fit into a single node */
b2230d39
KW
2813 if (oldl + STR_LEN(n) > U8_MAX)
2814 break;
07be1b83
YO
2815
2816 DEBUG_PEEP("merg",n,depth);
07be1b83 2817 merged++;
b2230d39 2818
07be1b83
YO
2819 NEXT_OFF(scan) += NEXT_OFF(n);
2820 STR_LEN(scan) += STR_LEN(n);
2821 next = n + NODE_SZ_STR(n);
2822 /* Now we can overwrite *n : */
2823 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
2824#ifdef DEBUGGING
2825 stop = next - 1;
2826#endif
2827 n = nnext;
2828 if (stopnow) break;
2829 }
2830
d47053eb
RGS
2831#ifdef EXPERIMENTAL_INPLACESCAN
2832 if (flags && !NEXT_OFF(n)) {
2833 DEBUG_PEEP("atch", val, depth);
2834 if (reg_off_by_arg[OP(n)]) {
2835 ARG_SET(n, val - n);
2836 }
2837 else {
2838 NEXT_OFF(n) = val - n;
2839 }
2840 stopnow = 1;
2841 }
07be1b83
YO
2842#endif
2843 }
2c2b7f86 2844
9d071ca8 2845 *min_subtract = 0;
f758bddf 2846 *has_exactf_sharp_s = FALSE;
f646642f 2847
3f410cf6
KW
2848 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
2849 * can now analyze for sequences of problematic code points. (Prior to
2850 * this final joining, sequences could have been split over boundaries, and
a0c4c608
KW
2851 * hence missed). The sequences only happen in folding, hence for any
2852 * non-EXACT EXACTish node */
86d6fcad 2853 if (OP(scan) != EXACT) {
0a982f06
KW
2854 const U8 * const s0 = (U8*) STRING(scan);
2855 const U8 * s = s0;
2856 const U8 * const s_end = s0 + STR_LEN(scan);
f758bddf
KW
2857
2858 /* One pass is made over the node's string looking for all the
2859 * possibilities. to avoid some tests in the loop, there are two main
2860 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
2861 * non-UTF-8 */
2862 if (UTF) {
86d6fcad 2863
0a982f06
KW
2864 /* Examine the string for a multi-character fold sequence. UTF-8
2865 * patterns have all characters pre-folded by the time this code is
2866 * executed */
2867 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
2868 length sequence we are looking for is 2 */
86d6fcad 2869 {
0a982f06
KW
2870 int count = 0;
2871 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
2872 if (! len) { /* Not a multi-char fold: get next char */
2873 s += UTF8SKIP(s);
2874 continue;
2875 }
bb914485 2876
0a982f06 2877 /* Nodes with 'ss' require special handling, except for EXACTFL
098b07d5
KW
2878 * and EXACTFA-ish for which there is no multi-char fold to
2879 * this */
0a982f06 2880 if (len == 2 && *s == 's' && *(s+1) == 's'
098b07d5
KW
2881 && OP(scan) != EXACTFL
2882 && OP(scan) != EXACTFA
2883 && OP(scan) != EXACTFA_NO_TRIE)
0a982f06
KW
2884 {
2885 count = 2;
2886 OP(scan) = EXACTFU_SS;
2887 s += 2;
2888 }
0a982f06
KW
2889 else { /* Here is a generic multi-char fold. */
2890 const U8* multi_end = s + len;
2891
2892 /* Count how many characters in it. In the case of /l and
2893 * /aa, no folds which contain ASCII code points are
2894 * allowed, so check for those, and skip if found. (In
2895 * EXACTFL, no folds are allowed to any Latin1 code point,
2896 * not just ASCII. But there aren't any of these
2897 * currently, nor ever likely, so don't take the time to
2898 * test for them. The code that generates the
2899 * is_MULTI_foo() macros croaks should one actually get put
2900 * into Unicode .) */
098b07d5
KW
2901 if (OP(scan) != EXACTFL
2902 && OP(scan) != EXACTFA
2903 && OP(scan) != EXACTFA_NO_TRIE)
2904 {
0a982f06
KW
2905 count = utf8_length(s, multi_end);
2906 s = multi_end;
2907 }
2908 else {
2909 while (s < multi_end) {
2910 if (isASCII(*s)) {
2911 s++;
2912 goto next_iteration;
2913 }
2914 else {
2915 s += UTF8SKIP(s);
2916 }
2917 count++;
2918 }
2919 }
2920 }
f758bddf 2921
0a982f06
KW
2922 /* The delta is how long the sequence is minus 1 (1 is how long
2923 * the character that folds to the sequence is) */
2924 *min_subtract += count - 1;
2925 next_iteration: ;
bb914485
KW
2926 }
2927 }
1ca267a5
KW
2928 else if (OP(scan) == EXACTFA) {
2929
2930 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
2931 * fold to the ASCII range (and there are no existing ones in the
2932 * upper latin1 range). But, as outlined in the comments preceding
098b07d5
KW
2933 * this function, we need to flag any occurrences of the sharp s.
2934 * This character forbids trie formation (because of added
2935 * complexity) */
1ca267a5
KW
2936 while (s < s_end) {
2937 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
098b07d5 2938 OP(scan) = EXACTFA_NO_TRIE;
1ca267a5
KW
2939 *has_exactf_sharp_s = TRUE;
2940 break;
2941 }
2942 s++;
2943 continue;
2944 }
2945 }
2946 else if (OP(scan) != EXACTFL) {
2947
2948 /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the
2949 * multi-char folds that are all Latin1. (This code knows that
2950 * there are no current multi-char folds possible with EXACTFL,
2951 * relying on fold_grind.t to catch any errors if the very unlikely
2952 * event happens that some get added in future Unicode versions.)
2953 * As explained in the comments preceding this function, we look
2954 * also for the sharp s in EXACTF nodes; it can be in the final
0a982f06
KW
2955 * position. Otherwise we can stop looking 1 byte earlier because
2956 * have to find at least two characters for a multi-fold */
f758bddf
KW
2957 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
2958
0a982f06 2959 while (s < upper) {
40b1ba4f 2960 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
0a982f06
KW
2961 if (! len) { /* Not a multi-char fold. */
2962 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
2963 {
2964 *has_exactf_sharp_s = TRUE;
2965 }
2966 s++;
2967 continue;
2968 }
2969
2970 if (len == 2
c02c3054
KW
2971 && isARG2_lower_or_UPPER_ARG1('s', *s)
2972 && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
0a982f06
KW
2973 {
2974
2975 /* EXACTF nodes need to know that the minimum length
2976 * changed so that a sharp s in the string can match this
2977 * ss in the pattern, but they remain EXACTF nodes, as they
2978 * won't match this unless the target string is is UTF-8,
2979 * which we don't know until runtime */
2980 if (OP(scan) != EXACTF) {
2981 OP(scan) = EXACTFU_SS;
2982 }
86d6fcad 2983 }
0a982f06
KW
2984
2985 *min_subtract += len - 1;
2986 s += len;
86d6fcad
KW
2987 }
2988 }
07be1b83 2989 }
3f410cf6 2990
07be1b83 2991#ifdef DEBUGGING
bb789b09
DM
2992 /* Allow dumping but overwriting the collection of skipped
2993 * ops and/or strings with fake optimized ops */
07be1b83
YO
2994 n = scan + NODE_SZ_STR(scan);
2995 while (n <= stop) {
bb789b09
DM
2996 OP(n) = OPTIMIZED;
2997 FLAGS(n) = 0;
2998 NEXT_OFF(n) = 0;
07be1b83
YO
2999 n++;
3000 }
3001#endif
3002 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3003 return stopnow;
3004}
3005
486ec47a 3006/* REx optimizer. Converts nodes into quicker variants "in place".
653099ff
GS
3007 Finds fixed substrings. */
3008
a0288114 3009/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
3010 to the position after last scanned or to NULL. */
3011
40d049e4
YO
3012#define INIT_AND_WITHP \
3013 assert(!and_withp); \
b8f7bb16 3014 Newx(and_withp,1, regnode_ssc); \
40d049e4 3015 SAVEFREEPV(and_withp)
07be1b83 3016
b515a41d 3017/* this is a chain of data about sub patterns we are processing that
486ec47a 3018 need to be handled separately/specially in study_chunk. Its so
b515a41d
YO
3019 we can simulate recursion without losing state. */
3020struct scan_frame;
3021typedef struct scan_frame {
3022 regnode *last; /* last node to process in this frame */
3023 regnode *next; /* next node to process when last is reached */
3024 struct scan_frame *prev; /*previous frame*/
3025 I32 stop; /* what stopparen do we use */
3026} scan_frame;
3027
304ee84b
YO
3028
3029#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3030
49f55535 3031STATIC SSize_t
40d049e4 3032S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
ea3daa5d 3033 SSize_t *minlenp, SSize_t *deltap,
40d049e4
YO
3034 regnode *last,
3035 scan_data_t *data,
3036 I32 stopparen,
3037 U8* recursed,
b8f7bb16 3038 regnode_ssc *and_withp,
40d049e4 3039 U32 flags, U32 depth)
c277df42
IZ
3040 /* scanp: Start here (read-write). */
3041 /* deltap: Write maxlen-minlen here. */
3042 /* last: Stop before this one. */
40d049e4
YO
3043 /* data: string data about the pattern */
3044 /* stopparen: treat close N as END */
3045 /* recursed: which subroutines have we recursed into */
3046 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 3047{
97aff369 3048 dVAR;
ea3daa5d
FC
3049 /* There must be at least this number of characters to match */
3050 SSize_t min = 0;
2d608413 3051 I32 pars = 0, code;
c277df42 3052 regnode *scan = *scanp, *next;
ea3daa5d 3053 SSize_t delta = 0;
c277df42 3054 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 3055 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
3056 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3057 scan_data_t data_fake;
a3621e74 3058 SV *re_trie_maxbuff = NULL;
786e8c11 3059 regnode *first_non_open = scan;
ea3daa5d 3060 SSize_t stopmin = SSize_t_MAX;
8aa23a47 3061 scan_frame *frame = NULL;
a3621e74 3062 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 3063
7918f24d
NC
3064 PERL_ARGS_ASSERT_STUDY_CHUNK;
3065
13a24bad 3066#ifdef DEBUGGING
40d049e4 3067 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 3068#endif
40d049e4 3069
786e8c11 3070 if ( depth == 0 ) {
40d049e4 3071 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
3072 first_non_open=regnext(first_non_open);
3073 }
3074
b81d288d 3075
8aa23a47
YO
3076 fake_study_recurse:
3077 while ( scan && OP(scan) != END && scan < last ){
2d608413
KW
3078 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3079 node length to get a real minimum (because
3080 the folded version may be shorter) */
f758bddf 3081 bool has_exactf_sharp_s = FALSE;
8aa23a47 3082 /* Peephole optimizer: */
304ee84b 3083 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47 3084 DEBUG_PEEP("Peep",scan,depth);
a0c4c608
KW
3085
3086 /* Its not clear to khw or hv why this is done here, and not in the
3087 * clauses that deal with EXACT nodes. khw's guess is that it's
3088 * because of a previous design */
9d071ca8 3089 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
8aa23a47
YO
3090
3091 /* Follow the next-chain of the current node and optimize
3092 away all the NOTHINGs from it. */
3093 if (OP(scan) != CURLYX) {
3094 const int max = (reg_off_by_arg[OP(scan)]
3095 ? I32_MAX
3096 /* I32 may be smaller than U16 on CRAYs! */
3097 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3098 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3099 int noff;
3100 regnode *n = scan;
686b73d4 3101
8aa23a47
YO
3102 /* Skip NOTHING and LONGJMP. */
3103 while ((n = regnext(n))
3104 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3105 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3106 && off + noff < max)
3107 off += noff;
3108 if (reg_off_by_arg[OP(scan)])
3109 ARG(scan) = off;
3110 else
3111 NEXT_OFF(scan) = off;
3112 }
a3621e74 3113
c277df42 3114
8aa23a47
YO
3115
3116 /* The principal pseudo-switch. Cannot be a switch, since we
3117 look into several different things. */
3118 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3119 || OP(scan) == IFTHEN) {
3120 next = regnext(scan);
3121 code = OP(scan);
3122 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
686b73d4 3123
8aa23a47
YO
3124 if (OP(next) == code || code == IFTHEN) {
3125 /* NOTE - There is similar code to this block below for handling
3126 TRIE nodes on a re-study. If you change stuff here check there
3127 too. */
49f55535 3128 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
b8f7bb16 3129 regnode_ssc accum;
8aa23a47 3130 regnode * const startbranch=scan;
686b73d4 3131
8aa23a47 3132 if (flags & SCF_DO_SUBSTR)
304ee84b 3133 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47 3134 if (flags & SCF_DO_STCLASS)
557bd3fb 3135 ssc_init_zero(pRExC_state, &accum);
8aa23a47
YO
3136
3137 while (OP(scan) == code) {
49f55535
FC
3138 SSize_t deltanext, minnext, fake;
3139 I32 f = 0;
b8f7bb16 3140 regnode_ssc this_class;
8aa23a47
YO
3141
3142 num++;
3143 data_fake.flags = 0;
3144 if (data) {
3145 data_fake.whilem_c = data->whilem_c;
3146 data_fake.last_closep = data->last_closep;
3147 }
3148 else
3149 data_fake.last_closep = &fake;
58e23c8d
YO
3150
3151 data_fake.pos_delta = delta;
8aa23a47
YO
3152 next = regnext(scan);
3153 scan = NEXTOPER(scan);
3154 if (code != BRANCH)
c277df42 3155 scan = NEXTOPER(scan);
8aa23a47 3156 if (flags & SCF_DO_STCLASS) {
557bd3fb 3157 ssc_init(pRExC_state, &this_class);
8aa23a47
YO
3158 data_fake.start_class = &this_class;
3159 f = SCF_DO_STCLASS_AND;
58e23c8d 3160 }
8aa23a47
YO
3161 if (flags & SCF_WHILEM_VISITED_POS)
3162 f |= SCF_WHILEM_VISITED_POS;
3163
3164 /* we suppose the run is continuous, last=next...*/
3165 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3166 next, &data_fake,
3167 stopparen, recursed, NULL, f,depth+1);
3168 if (min1 > minnext)
3169 min1 = minnext;
ea3daa5d 3170 if (deltanext == SSize_t_MAX) {
8aa23a47 3171 is_inf = is_inf_internal = 1;
ea3daa5d 3172 max1 = SSize_t_MAX;
9b139d09
GG
3173 } else if (max1 < minnext + deltanext)
3174 max1 = minnext + deltanext;
8aa23a47
YO
3175 scan = next;
3176 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3177 pars++;
3178 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3179 if ( stopmin > minnext)
3180 stopmin = min + min1;
3181 flags &= ~SCF_DO_SUBSTR;
3182 if (data)
3183 data->flags |= SCF_SEEN_ACCEPT;
3184 }
3185 if (data) {
3186 if (data_fake.flags & SF_HAS_EVAL)
3187 data->flags |= SF_HAS_EVAL;
3188 data->whilem_c = data_fake.whilem_c;
3dab1dad 3189 }
8aa23a47 3190 if (flags & SCF_DO_STCLASS)
557bd3fb 3191 ssc_or(pRExC_state, &accum, &this_class);
8aa23a47
YO
3192 }
3193 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3194 min1 = 0;
3195 if (flags & SCF_DO_SUBSTR) {
3196 data->pos_min += min1;
ea3daa5d
FC
3197 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3198 data->pos_delta = SSize_t_MAX;
9b139d09
GG
3199 else
3200 data->pos_delta += max1 - min1;
8aa23a47
YO
3201 if (max1 != min1 || is_inf)
3202 data->longest = &(data->longest_float);
3203 }
3204 min += min1;
ea3daa5d
FC
3205 if (delta == SSize_t_MAX
3206 || SSize_t_MAX - delta - (max1 - min1) < 0)
3207 delta = SSize_t_MAX;
9b139d09
GG
3208 else
3209 delta += max1 - min1;
8aa23a47 3210 if (flags & SCF_DO_STCLASS_OR) {
557bd3fb 3211 ssc_or(pRExC_state, data->start_class, &accum);
8aa23a47 3212 if (min1) {
1ca93ef1 3213 ssc_and(pRExC_state, data->start_class, and_withp);
8aa23a47 3214 flags &= ~SCF_DO_STCLASS;
653099ff 3215 }
8aa23a47
YO
3216 }
3217 else if (flags & SCF_DO_STCLASS_AND) {
3218 if (min1) {
1ca93ef1 3219 ssc_and(pRExC_state, data->start_class, &accum);
8aa23a47 3220 flags &= ~SCF_DO_STCLASS;
de0c8cb8 3221 }
8aa23a47
YO
3222 else {
3223 /* Switch to OR mode: cache the old value of
3224 * data->start_class */
3225 INIT_AND_WITHP;
b8f7bb16 3226 StructCopy(data->start_class, and_withp, regnode_ssc);
8aa23a47 3227 flags &= ~SCF_DO_STCLASS_AND;
b8f7bb16 3228 StructCopy(&accum, data->start_class, regnode_ssc);
8aa23a47 3229 flags |= SCF_DO_STCLASS_OR;
899d20b9 3230 SET_SSC_EOS(data->start_class);
de0c8cb8 3231 }
8aa23a47 3232 }
a3621e74 3233
8aa23a47
YO
3234 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3235 /* demq.
a3621e74 3236
8aa23a47
YO
3237 Assuming this was/is a branch we are dealing with: 'scan' now
3238 points at the item that follows the branch sequence, whatever
3239 it is. We now start at the beginning of the sequence and look
3240 for subsequences of
a3621e74 3241
8aa23a47
YO
3242 BRANCH->EXACT=>x1
3243 BRANCH->EXACT=>x2
3244 tail
a3621e74 3245
8aa23a47 3246 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 3247
486ec47a 3248 If we can find such a subsequence we need to turn the first
8aa23a47
YO
3249 element into a trie and then add the subsequent branch exact
3250 strings to the trie.
a3621e74 3251
8aa23a47 3252 We have two cases
a3621e74 3253
3b753521 3254 1. patterns where the whole set of branches can be converted.
a3621e74 3255
8aa23a47 3256 2. patterns where only a subset can be converted.
a3621e74 3257
8aa23a47
YO
3258 In case 1 we can replace the whole set with a single regop
3259 for the trie. In case 2 we need to keep the start and end
3b753521 3260 branches so
a3621e74 3261
8aa23a47
YO
3262 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3263 becomes BRANCH TRIE; BRANCH X;
786e8c11 3264
8aa23a47
YO
3265 There is an additional case, that being where there is a
3266 common prefix, which gets split out into an EXACT like node
3267 preceding the TRIE node.
a3621e74 3268
8aa23a47
YO
3269 If x(1..n)==tail then we can do a simple trie, if not we make
3270 a "jump" trie, such that when we match the appropriate word
486ec47a 3271 we "jump" to the appropriate tail node. Essentially we turn
8aa23a47 3272 a nested if into a case structure of sorts.
b515a41d 3273
8aa23a47 3274 */
686b73d4 3275
8aa23a47
YO
3276 int made=0;
3277 if (!re_trie_maxbuff) {
3278 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3279 if (!SvIOK(re_trie_maxbuff))
3280 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3281 }
3282 if ( SvIV(re_trie_maxbuff)>=0 ) {
3283 regnode *cur;
3284 regnode *first = (regnode *)NULL;
3285 regnode *last = (regnode *)NULL;
3286 regnode *tail = scan;
fab2782b 3287 U8 trietype = 0;
8aa23a47 3288 U32 count=0;
a3621e74
YO
3289
3290#ifdef DEBUGGING
8aa23a47 3291 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 3292#endif
8aa23a47
YO
3293 /* var tail is used because there may be a TAIL
3294 regop in the way. Ie, the exacts will point to the
3295 thing following the TAIL, but the last branch will
3296 point at the TAIL. So we advance tail. If we
3297 have nested (?:) we may have to move through several
3298 tails.
3299 */
3300
3301 while ( OP( tail ) == TAIL ) {
3302 /* this is the TAIL generated by (?:) */
3303 tail = regnext( tail );
3304 }
a3621e74 3305
8aa23a47 3306
df826430 3307 DEBUG_TRIE_COMPILE_r({
8aa23a47
YO
3308 regprop(RExC_rx, mysv, tail );
3309 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3310 (int)depth * 2 + 2, "",
3311 "Looking for TRIE'able sequences. Tail node is: ",
3312 SvPV_nolen_const( mysv )
3313 );
3314 });
3315
3316 /*
3317
fab2782b
YO
3318 Step through the branches
3319 cur represents each branch,
3320 noper is the first thing to be matched as part of that branch
3321 noper_next is the regnext() of that node.
3322
3323 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3324 via a "jump trie" but we also support building with NOJUMPTRIE,
3325 which restricts the trie logic to structures like /FOO|BAR/.
3326
3327 If noper is a trieable nodetype then the branch is a possible optimization
3328 target. If we are building under NOJUMPTRIE then we require that noper_next
3329 is the same as scan (our current position in the regex program).
3330
3331 Once we have two or more consecutive such branches we can create a
3332 trie of the EXACT's contents and stitch it in place into the program.
3333
3334 If the sequence represents all of the branches in the alternation we
3335 replace the entire thing with a single TRIE node.
3336
3337 Otherwise when it is a subsequence we need to stitch it in place and
3338 replace only the relevant branches. This means the first branch has
3339 to remain as it is used by the alternation logic, and its next pointer,
3340 and needs to be repointed at the item on the branch chain following
3341 the last branch we have optimized away.
3342
3343 This could be either a BRANCH, in which case the subsequence is internal,
3344 or it could be the item following the branch sequence in which case the
3345 subsequence is at the end (which does not necessarily mean the first node
3346 is the start of the alternation).
3347
3348 TRIE_TYPE(X) is a define which maps the optype to a trietype.
3349
3350 optype | trietype
3351 ----------------+-----------
3352 NOTHING | NOTHING
3353 EXACT | EXACT
3354 EXACTFU | EXACTFU
3355 EXACTFU_SS | EXACTFU
098b07d5 3356 EXACTFA | EXACTFA
fab2782b 3357
8aa23a47
YO
3358
3359 */
fab2782b
YO
3360#define TRIE_TYPE(X) ( ( NOTHING == (X) ) ? NOTHING : \
3361 ( EXACT == (X) ) ? EXACT : \
cb117658 3362 ( EXACTFU == (X) || EXACTFU_SS == (X) ) ? EXACTFU : \
098b07d5 3363 ( EXACTFA == (X) ) ? EXACTFA : \
fab2782b 3364 0 )
8aa23a47
YO
3365
3366 /* dont use tail as the end marker for this traverse */
3367 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
3368 regnode * const noper = NEXTOPER( cur );
fab2782b
YO
3369 U8 noper_type = OP( noper );
3370 U8 noper_trietype = TRIE_TYPE( noper_type );
b515a41d 3371#if defined(DEBUGGING) || defined(NOJUMPTRIE)
8aa23a47 3372 regnode * const noper_next = regnext( noper );
df826430
YO
3373 U8 noper_next_type = (noper_next && noper_next != tail) ? OP(noper_next) : 0;
3374 U8 noper_next_trietype = (noper_next && noper_next != tail) ? TRIE_TYPE( noper_next_type ) :0;
b515a41d
YO
3375#endif
3376
df826430 3377 DEBUG_TRIE_COMPILE_r({
8aa23a47
YO
3378 regprop(RExC_rx, mysv, cur);
3379 PerlIO_printf( Perl_debug_log, "%*s- %s (%d)",
3380 (int)depth * 2 + 2,"", SvPV_nolen_const( mysv ), REG_NODE_NUM(cur) );
3381
3382 regprop(RExC_rx, mysv, noper);
3383 PerlIO_printf( Perl_debug_log, " -> %s",
3384 SvPV_nolen_const(mysv));
3385
3386 if ( noper_next ) {
3387 regprop(RExC_rx, mysv, noper_next );
3388 PerlIO_printf( Perl_debug_log,"\t=> %s\t",
3389 SvPV_nolen_const(mysv));
3390 }
df826430
YO
3391 PerlIO_printf( Perl_debug_log, "(First==%d,Last==%d,Cur==%d,tt==%s,nt==%s,nnt==%s)\n",
3392 REG_NODE_NUM(first), REG_NODE_NUM(last), REG_NODE_NUM(cur),
3393 PL_reg_name[trietype], PL_reg_name[noper_trietype], PL_reg_name[noper_next_trietype]
3394 );
8aa23a47 3395 });
fab2782b
YO
3396
3397 /* Is noper a trieable nodetype that can be merged with the
3398 * current trie (if there is one)? */
3399 if ( noper_trietype
3400 &&
3401 (
df826430
YO
3402 ( noper_trietype == NOTHING)
3403 || ( trietype == NOTHING )
a40630bf 3404 || ( trietype == noper_trietype )
fab2782b 3405 )
786e8c11 3406#ifdef NOJUMPTRIE
8aa23a47 3407 && noper_next == tail
786e8c11 3408#endif
8aa23a47
YO
3409 && count < U16_MAX)
3410 {
fab2782b
YO
3411 /* Handle mergable triable node
3412 * Either we are the first node in a new trieable sequence,
3413 * in which case we do some bookkeeping, otherwise we update
3414 * the end pointer. */
fab2782b 3415 if ( !first ) {
3b6759a6 3416 first = cur;
df826430
YO
3417 if ( noper_trietype == NOTHING ) {
3418#if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
3419 regnode * const noper_next = regnext( noper );
3b6759a6 3420 U8 noper_next_type = (noper_next && noper_next!=tail) ? OP(noper_next) : 0;
df826430
YO
3421 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
3422#endif
3423
190c1910 3424 if ( noper_next_trietype ) {
df826430 3425 trietype = noper_next_trietype;
190c1910
YO
3426 } else if (noper_next_type) {
3427 /* a NOTHING regop is 1 regop wide. We need at least two
3428 * for a trie so we can't merge this in */
3429 first = NULL;
3430 }
3431 } else {
3432 trietype = noper_trietype;
3b6759a6 3433 }
8aa23a47 3434 } else {
fab2782b
YO
3435 if ( trietype == NOTHING )
3436 trietype = noper_trietype;
8aa23a47
YO
3437 last = cur;
3438 }
df826430
YO
3439 if (first)
3440 count++;
fab2782b
YO
3441 } /* end handle mergable triable node */
3442 else {
3443 /* handle unmergable node -
3444 * noper may either be a triable node which can not be tried
3445 * together with the current trie, or a non triable node */
729aaeb5
YO
3446 if ( last ) {
3447 /* If last is set and trietype is not NOTHING then we have found
3448 * at least two triable branch sequences in a row of a similar
3449 * trietype so we can turn them into a trie. If/when we
3450 * allow NOTHING to start a trie sequence this condition will be
3451 * required, and it isn't expensive so we leave it in for now. */
e6351b37 3452 if ( trietype && trietype != NOTHING )
729aaeb5
YO
3453 make_trie( pRExC_state,
3454 startbranch, first, cur, tail, count,
3455 trietype, depth+1 );
fab2782b 3456 last = NULL; /* note: we clear/update first, trietype etc below, so we dont do it here */
8aa23a47 3457 }
fab2782b 3458 if ( noper_trietype
786e8c11 3459#ifdef NOJUMPTRIE
8aa23a47 3460 && noper_next == tail
786e8c11 3461#endif
8aa23a47 3462 ){
fab2782b 3463 /* noper is triable, so we can start a new trie sequence */
8aa23a47
YO
3464 count = 1;
3465 first = cur;
fab2782b
YO
3466 trietype = noper_trietype;
3467 } else if (first) {
3468 /* if we already saw a first but the current node is not triable then we have
3469 * to reset the first information. */
8aa23a47
YO
3470 count = 0;
3471 first = NULL;
fab2782b 3472 trietype = 0;
8aa23a47 3473 }
fab2782b
YO
3474 } /* end handle unmergable node */
3475 } /* loop over branches */
df826430 3476 DEBUG_TRIE_COMPILE_r({
8aa23a47
YO
3477 regprop(RExC_rx, mysv, cur);
3478 PerlIO_printf( Perl_debug_log,
3479 "%*s- %s (%d) <SCAN FINISHED>\n", (int)depth * 2 + 2,
3480 "", SvPV_nolen_const( mysv ),REG_NODE_NUM(cur));
3481
3482 });
e6351b37 3483