This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
better diagnostics of RExC_seen in regcomp.c
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
afda64e8 84EXTERN_C const struct regexp_engine my_reg_engine;
54df2634
NC
85#else
86# include "regcomp.h"
87#endif
a687059c 88
04e98a4d 89#include "dquote_static.c"
26faadbd 90#include "charclass_invlists.h"
81e983c1 91#include "inline_invlist.c"
1b0f46bf 92#include "unicode_constants.h"
04e98a4d 93
94dc5c2d 94#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 95#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 96#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
b1603ef8 102
09b2b2e6 103struct RExC_state_t {
514a91f1
DM
104 U32 flags; /* RXf_* are we folding, multilining? */
105 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 106 char *precomp; /* uncompiled string. */
288b8c02 107 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
108 regexp *rx; /* perl core regexp structure */
109 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 110 char *start; /* Start of input for compile */
830247a4
IZ
111 char *end; /* End of input for compile */
112 char *parse; /* Input-scan pointer. */
ea3daa5d 113 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 114 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 115 regnode *emit_bound; /* First regnode outside of the allocated space */
f7c7e32a
DM
116 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
117 implies compiling, so don't emit */
9a81a976
KW
118 regnode_ssc emit_dummy; /* placeholder for emit to point to;
119 large enough for the largest
120 non-EXACTish node, so can use it as
121 scratch in pass1 */
830247a4
IZ
122 I32 naughty; /* How bad is this pattern? */
123 I32 sawback; /* Did we see \1, ...? */
124 U32 seen;
ea3daa5d 125 SSize_t size; /* Code size. */
09a65838 126 I32 npar; /* Capture buffer count, (OPEN) plus one. ("par" 0 is the whole pattern)*/
e2e6a0f1 127 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
128 I32 extralen;
129 I32 seen_zerolen;
40d049e4
YO
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
02daf0ab
YO
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
e40e74fe
KW
137 I32 uni_semantics; /* If a d charset modifier should use unicode
138 rules, even if the pattern is not in
139 utf8 */
81714fb9 140 HV *paren_names; /* Paren names */
1f1031fe 141
40d049e4
YO
142 regnode **recurse; /* Recurse regops */
143 I32 recurse_count; /* Number of recurse regops */
09a65838
YO
144 U8 *study_chunk_recursed; /* bitmap of which parens we have moved through */
145 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 146 I32 in_lookbehind;
4624b182 147 I32 contains_locale;
cfafade5 148 I32 contains_i;
bb3f3ed2 149 I32 override_recoding;
9d53c457 150 I32 in_multi_char_class;
3d2bd50a 151 struct reg_code_block *code_blocks; /* positions of literal (?{})
68e2671b 152 within pattern */
b1603ef8
DM
153 int num_code_blocks; /* size of code_blocks[] */
154 int code_index; /* next code_blocks[] slot */
830247a4
IZ
155#if ADD_TO_REGEXEC
156 char *starttry; /* -Dr: where regtry was called. */
157#define RExC_starttry (pRExC_state->starttry)
158#endif
d24ca0c5 159 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 160#ifdef DEBUGGING
be8e71aa 161 const char *lastparse;
3dab1dad 162 I32 lastnum;
1f1031fe 163 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
164#define RExC_lastparse (pRExC_state->lastparse)
165#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 166#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 167#endif
09b2b2e6 168};
830247a4 169
e2509266 170#define RExC_flags (pRExC_state->flags)
514a91f1 171#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 172#define RExC_precomp (pRExC_state->precomp)
288b8c02 173#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 174#define RExC_rx (pRExC_state->rx)
f8fc2ecf 175#define RExC_rxi (pRExC_state->rxi)
fac92740 176#define RExC_start (pRExC_state->start)
830247a4
IZ
177#define RExC_end (pRExC_state->end)
178#define RExC_parse (pRExC_state->parse)
179#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
180#ifdef RE_TRACK_PATTERN_OFFSETS
181#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
182#endif
830247a4 183#define RExC_emit (pRExC_state->emit)
f7c7e32a 184#define RExC_emit_dummy (pRExC_state->emit_dummy)
fac92740 185#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 186#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
187#define RExC_naughty (pRExC_state->naughty)
188#define RExC_sawback (pRExC_state->sawback)
189#define RExC_seen (pRExC_state->seen)
190#define RExC_size (pRExC_state->size)
191#define RExC_npar (pRExC_state->npar)
e2e6a0f1 192#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
193#define RExC_extralen (pRExC_state->extralen)
194#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 195#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 196#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 197#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
198#define RExC_open_parens (pRExC_state->open_parens)
199#define RExC_close_parens (pRExC_state->close_parens)
200#define RExC_opend (pRExC_state->opend)
81714fb9 201#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
202#define RExC_recurse (pRExC_state->recurse)
203#define RExC_recurse_count (pRExC_state->recurse_count)
09a65838
YO
204#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
205#define RExC_study_chunk_recursed_bytes (pRExC_state->study_chunk_recursed_bytes)
b57e4118 206#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 207#define RExC_contains_locale (pRExC_state->contains_locale)
cfafade5 208#define RExC_contains_i (pRExC_state->contains_i)
9d53c457
KW
209#define RExC_override_recoding (pRExC_state->override_recoding)
210#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
830247a4 211
cde0cee5 212
a687059c
LW
213#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
214#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
4d68ffa0 215 ((*s) == '{' && regcurly(s, FALSE)))
a687059c
LW
216
217/*
218 * Flags to be passed up and down.
219 */
a687059c 220#define WORST 0 /* Worst case. */
a3b492c3 221#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee 222
e64f369d 223/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
224 * character. (There needs to be a case: in the switch statement in regexec.c
225 * for any node marked SIMPLE.) Note that this is not the same thing as
226 * REGNODE_SIMPLE */
fda99bee 227#define SIMPLE 0x02
e64f369d 228#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
229#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
230#define TRYAGAIN 0x10 /* Weeded out a declaration. */
231#define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
a687059c 232
3dab1dad
YO
233#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
234
07be1b83
YO
235/* whether trie related optimizations are enabled */
236#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
237#define TRIE_STUDY_OPT
786e8c11 238#define FULL_TRIE_STUDY
07be1b83
YO
239#define TRIE_STCLASS
240#endif
1de06328
YO
241
242
40d049e4
YO
243
244#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
245#define PBITVAL(paren) (1 << ((paren) & 7))
246#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
247#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
248#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
249
bbd61b5f 250#define REQUIRE_UTF8 STMT_START { \
8d9c2815
NC
251 if (!UTF) { \
252 *flagp = RESTART_UTF8; \
253 return NULL; \
254 } \
bbd61b5f 255 } STMT_END
40d049e4 256
f19b1a63
KW
257/* This converts the named class defined in regcomp.h to its equivalent class
258 * number defined in handy.h. */
259#define namedclass_to_classnum(class) ((int) ((class) / 2))
260#define classnum_to_namedclass(classnum) ((classnum) * 2)
261
de92f5e6
KW
262#define _invlist_union_complement_2nd(a, b, output) \
263 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
264#define _invlist_intersection_complement_2nd(a, b, output) \
265 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
266
1de06328
YO
267/* About scan_data_t.
268
269 During optimisation we recurse through the regexp program performing
270 various inplace (keyhole style) optimisations. In addition study_chunk
271 and scan_commit populate this data structure with information about
272 what strings MUST appear in the pattern. We look for the longest
3b753521 273 string that must appear at a fixed location, and we look for the
1de06328
YO
274 longest string that may appear at a floating location. So for instance
275 in the pattern:
276
277 /FOO[xX]A.*B[xX]BAR/
278
279 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
280 strings (because they follow a .* construct). study_chunk will identify
281 both FOO and BAR as being the longest fixed and floating strings respectively.
282
283 The strings can be composites, for instance
284
285 /(f)(o)(o)/
286
287 will result in a composite fixed substring 'foo'.
288
289 For each string some basic information is maintained:
290
291 - offset or min_offset
292 This is the position the string must appear at, or not before.
293 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
294 characters must match before the string we are searching for.
295 Likewise when combined with minlenp and the length of the string it
1de06328
YO
296 tells us how many characters must appear after the string we have
297 found.
298
299 - max_offset
300 Only used for floating strings. This is the rightmost point that
ea3daa5d 301 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328
YO
302 string can occur infinitely far to the right.
303
304 - minlenp
2d608413
KW
305 A pointer to the minimum number of characters of the pattern that the
306 string was found inside. This is important as in the case of positive
1de06328
YO
307 lookahead or positive lookbehind we can have multiple patterns
308 involved. Consider
309
310 /(?=FOO).*F/
311
312 The minimum length of the pattern overall is 3, the minimum length
313 of the lookahead part is 3, but the minimum length of the part that
314 will actually match is 1. So 'FOO's minimum length is 3, but the
315 minimum length for the F is 1. This is important as the minimum length
316 is used to determine offsets in front of and behind the string being
317 looked for. Since strings can be composites this is the length of the
486ec47a 318 pattern at the time it was committed with a scan_commit. Note that
1de06328
YO
319 the length is calculated by study_chunk, so that the minimum lengths
320 are not known until the full pattern has been compiled, thus the
321 pointer to the value.
322
323 - lookbehind
324
325 In the case of lookbehind the string being searched for can be
326 offset past the start point of the final matching string.
327 If this value was just blithely removed from the min_offset it would
328 invalidate some of the calculations for how many chars must match
329 before or after (as they are derived from min_offset and minlen and
330 the length of the string being searched for).
331 When the final pattern is compiled and the data is moved from the
332 scan_data_t structure into the regexp structure the information
333 about lookbehind is factored in, with the information that would
334 have been lost precalculated in the end_shift field for the
335 associated string.
336
337 The fields pos_min and pos_delta are used to store the minimum offset
338 and the delta to the maximum offset at the current point in the pattern.
339
340*/
2c2d71f5
JH
341
342typedef struct scan_data_t {
1de06328
YO
343 /*I32 len_min; unused */
344 /*I32 len_delta; unused */
49f55535 345 SSize_t pos_min;
ea3daa5d 346 SSize_t pos_delta;
2c2d71f5 347 SV *last_found;
ea3daa5d 348 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 349 SSize_t last_start_min;
ea3daa5d 350 SSize_t last_start_max;
1de06328
YO
351 SV **longest; /* Either &l_fixed, or &l_float. */
352 SV *longest_fixed; /* longest fixed string found in pattern */
49f55535 353 SSize_t offset_fixed; /* offset where it starts */
ea3daa5d 354 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
355 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
356 SV *longest_float; /* longest floating string found in pattern */
49f55535 357 SSize_t offset_float_min; /* earliest point in string it can appear */
ea3daa5d
FC
358 SSize_t offset_float_max; /* latest point in string it can appear */
359 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
49f55535 360 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
2c2d71f5
JH
361 I32 flags;
362 I32 whilem_c;
49f55535 363 SSize_t *last_closep;
b8f7bb16 364 regnode_ssc *start_class;
2c2d71f5
JH
365} scan_data_t;
366
c02c3054
KW
367/* The below is perhaps overboard, but this allows us to save a test at the
368 * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
369 * and 'a' differ by a single bit; the same with the upper and lower case of
370 * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
371 * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
372 * then inverts it to form a mask, with just a single 0, in the bit position
373 * where the upper- and lowercase differ. XXX There are about 40 other
374 * instances in the Perl core where this micro-optimization could be used.
375 * Should decide if maintenance cost is worse, before changing those
376 *
377 * Returns a boolean as to whether or not 'v' is either a lowercase or
378 * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
379 * compile-time constant, the generated code is better than some optimizing
380 * compilers figure out, amounting to a mask and test. The results are
381 * meaningless if 'c' is not one of [A-Za-z] */
382#define isARG2_lower_or_UPPER_ARG1(c, v) \
383 (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
384
a687059c 385/*
e50aee73 386 * Forward declarations for pregcomp()'s friends.
a687059c 387 */
a0d0e21e 388
27da23d5 389static const scan_data_t zero_scan_data =
1de06328 390 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
391
392#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
393#define SF_BEFORE_SEOL 0x0001
394#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
395#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
396#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
397
44e3dfd2
BF
398#define SF_FIX_SHIFT_EOL (+2)
399#define SF_FL_SHIFT_EOL (+4)
c277df42
IZ
400
401#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
402#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
403
404#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
405#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
406#define SF_IS_INF 0x0040
407#define SF_HAS_PAR 0x0080
408#define SF_IN_PAR 0x0100
409#define SF_HAS_EVAL 0x0200
410#define SCF_DO_SUBSTR 0x0400
653099ff
GS
411#define SCF_DO_STCLASS_AND 0x0800
412#define SCF_DO_STCLASS_OR 0x1000
413#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 414#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 415
786e8c11 416#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 417#define SCF_SEEN_ACCEPT 0x8000
688e0391 418#define SCF_TRIE_DOING_RESTUDY 0x10000
07be1b83 419
43fead97 420#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
421
422/* The enums for all these are ordered so things work out correctly */
a62b1201 423#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
cfaf538b 424#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
00b27cfc 425#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
cfaf538b
KW
426#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
427#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
2f7f8cb1 428#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
a725e29c 429#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 430
43fead97 431#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 432
93733859 433#define OOB_NAMEDCLASS -1
b8c5462f 434
8e661ac5
KW
435/* There is no code point that is out-of-bounds, so this is problematic. But
436 * its only current use is to initialize a variable that is always set before
437 * looked at. */
438#define OOB_UNICODE 0xDEADBEEF
439
a0ed51b3
LW
440#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
441#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
442
8615cb43 443
b45f050a
JF
444/* length of regex to show in messages that don't mark a position within */
445#define RegexLengthToShowInErrorMessages 127
446
447/*
448 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
449 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
450 * op/pragma/warn/regcomp.
451 */
7253e4e3
RK
452#define MARKER1 "<-- HERE" /* marker as it appears in the description */
453#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 454
c1d900c3 455#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
b45f050a 456
c1d900c3
BF
457#define REPORT_LOCATION_ARGS(offset) \
458 UTF8fARG(UTF, offset, RExC_precomp), \
459 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
460
b45f050a
JF
461/*
462 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
463 * arg. Show regex, up to a maximum length. If it's too long, chop and add
464 * "...".
465 */
58e23c8d 466#define _FAIL(code) STMT_START { \
bfed75c6 467 const char *ellipses = ""; \
ccb2c380
MP
468 IV len = RExC_end - RExC_precomp; \
469 \
470 if (!SIZE_ONLY) \
a5e7bc51 471 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
472 if (len > RegexLengthToShowInErrorMessages) { \
473 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
474 len = RegexLengthToShowInErrorMessages - 10; \
475 ellipses = "..."; \
476 } \
58e23c8d 477 code; \
ccb2c380 478} STMT_END
8615cb43 479
58e23c8d 480#define FAIL(msg) _FAIL( \
c1d900c3
BF
481 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
482 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
483
484#define FAIL2(msg,arg) _FAIL( \
c1d900c3
BF
485 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
486 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 487
b45f050a 488/*
b45f050a
JF
489 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
490 */
ccb2c380 491#define Simple_vFAIL(m) STMT_START { \
a28509cc 492 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380 493 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
c1d900c3 494 m, REPORT_LOCATION_ARGS(offset)); \
ccb2c380 495} STMT_END
b45f050a
JF
496
497/*
498 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
499 */
ccb2c380
MP
500#define vFAIL(m) STMT_START { \
501 if (!SIZE_ONLY) \
a5e7bc51 502 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
503 Simple_vFAIL(m); \
504} STMT_END
b45f050a
JF
505
506/*
507 * Like Simple_vFAIL(), but accepts two arguments.
508 */
ccb2c380 509#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 510 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
511 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
512 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 513} STMT_END
b45f050a
JF
514
515/*
516 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
517 */
ccb2c380
MP
518#define vFAIL2(m,a1) STMT_START { \
519 if (!SIZE_ONLY) \
a5e7bc51 520 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
521 Simple_vFAIL2(m, a1); \
522} STMT_END
b45f050a
JF
523
524
525/*
526 * Like Simple_vFAIL(), but accepts three arguments.
527 */
ccb2c380 528#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 529 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
530 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
531 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 532} STMT_END
b45f050a
JF
533
534/*
535 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
536 */
ccb2c380
MP
537#define vFAIL3(m,a1,a2) STMT_START { \
538 if (!SIZE_ONLY) \
a5e7bc51 539 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
540 Simple_vFAIL3(m, a1, a2); \
541} STMT_END
b45f050a
JF
542
543/*
544 * Like Simple_vFAIL(), but accepts four arguments.
545 */
ccb2c380 546#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 547 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
548 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
549 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 550} STMT_END
b45f050a 551
95db3ffa
KW
552#define vFAIL4(m,a1,a2,a3) STMT_START { \
553 if (!SIZE_ONLY) \
554 SAVEFREESV(RExC_rx_sv); \
555 Simple_vFAIL4(m, a1, a2, a3); \
556} STMT_END
557
946095af
BF
558/* A specialized version of vFAIL2 that works with UTF8f */
559#define vFAIL2utf8f(m, a1) STMT_START { \
ef3f731d
BF
560 const IV offset = RExC_parse - RExC_precomp; \
561 if (!SIZE_ONLY) \
562 SAVEFREESV(RExC_rx_sv); \
563 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
564 REPORT_LOCATION_ARGS(offset)); \
946095af
BF
565} STMT_END
566
567
5e0a247b
KW
568/* m is not necessarily a "literal string", in this macro */
569#define reg_warn_non_literal_string(loc, m) STMT_START { \
570 const IV offset = loc - RExC_precomp; \
571 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
c1d900c3 572 m, REPORT_LOCATION_ARGS(offset)); \
5e0a247b
KW
573} STMT_END
574
668c081a 575#define ckWARNreg(loc,m) STMT_START { \
a28509cc 576 const IV offset = loc - RExC_precomp; \
f10f4c18 577 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 578 REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
579} STMT_END
580
0d6106aa
KW
581#define vWARN_dep(loc, m) STMT_START { \
582 const IV offset = loc - RExC_precomp; \
583 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
c1d900c3 584 REPORT_LOCATION_ARGS(offset)); \
0d6106aa
KW
585} STMT_END
586
147508a2
KW
587#define ckWARNdep(loc,m) STMT_START { \
588 const IV offset = loc - RExC_precomp; \
589 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
590 m REPORT_LOCATION, \
c1d900c3 591 REPORT_LOCATION_ARGS(offset)); \
147508a2
KW
592} STMT_END
593
668c081a 594#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 595 const IV offset = loc - RExC_precomp; \
d1d15184 596 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18 597 m REPORT_LOCATION, \
c1d900c3 598 REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
599} STMT_END
600
b23eb183 601#define ckWARN2reg_d(loc,m, a1) STMT_START { \
2335b3d3 602 const IV offset = loc - RExC_precomp; \
b23eb183 603 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
2335b3d3 604 m REPORT_LOCATION, \
c1d900c3 605 a1, REPORT_LOCATION_ARGS(offset)); \
2335b3d3
KW
606} STMT_END
607
668c081a 608#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 609 const IV offset = loc - RExC_precomp; \
668c081a 610 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 611 a1, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
612} STMT_END
613
614#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 615 const IV offset = loc - RExC_precomp; \
ccb2c380 616 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 617 a1, a2, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
618} STMT_END
619
668c081a
NC
620#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
621 const IV offset = loc - RExC_precomp; \
622 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 623 a1, a2, REPORT_LOCATION_ARGS(offset)); \
668c081a
NC
624} STMT_END
625
ccb2c380 626#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 627 const IV offset = loc - RExC_precomp; \
ccb2c380 628 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 629 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
630} STMT_END
631
668c081a
NC
632#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
633 const IV offset = loc - RExC_precomp; \
634 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 635 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
668c081a
NC
636} STMT_END
637
ccb2c380 638#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 639 const IV offset = loc - RExC_precomp; \
ccb2c380 640 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 641 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
ccb2c380 642} STMT_END
9d1d55b5 643
8615cb43 644
cd439c50 645/* Allow for side effects in s */
ccb2c380
MP
646#define REGC(c,s) STMT_START { \
647 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
648} STMT_END
cd439c50 649
fac92740
MJD
650/* Macros for recording node offsets. 20001227 mjd@plover.com
651 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
652 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
653 * Element 0 holds the number n.
07be1b83 654 * Position is 1 indexed.
fac92740 655 */
7122b237
YO
656#ifndef RE_TRACK_PATTERN_OFFSETS
657#define Set_Node_Offset_To_R(node,byte)
658#define Set_Node_Offset(node,byte)
659#define Set_Cur_Node_Offset
660#define Set_Node_Length_To_R(node,len)
661#define Set_Node_Length(node,len)
6a86c6ad 662#define Set_Node_Cur_Length(node,start)
7122b237
YO
663#define Node_Offset(n)
664#define Node_Length(n)
665#define Set_Node_Offset_Length(node,offset,len)
666#define ProgLen(ri) ri->u.proglen
667#define SetProgLen(ri,x) ri->u.proglen = x
668#else
669#define ProgLen(ri) ri->u.offsets[0]
670#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
671#define Set_Node_Offset_To_R(node,byte) STMT_START { \
672 if (! SIZE_ONLY) { \
673 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 674 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 675 if((node) < 0) { \
551405c4 676 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
677 } else { \
678 RExC_offsets[2*(node)-1] = (byte); \
679 } \
680 } \
681} STMT_END
682
683#define Set_Node_Offset(node,byte) \
684 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
685#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
686
687#define Set_Node_Length_To_R(node,len) STMT_START { \
688 if (! SIZE_ONLY) { \
689 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 690 __LINE__, (int)(node), (int)(len))); \
ccb2c380 691 if((node) < 0) { \
551405c4 692 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
693 } else { \
694 RExC_offsets[2*(node)] = (len); \
695 } \
696 } \
697} STMT_END
698
699#define Set_Node_Length(node,len) \
700 Set_Node_Length_To_R((node)-RExC_emit_start, len)
6a86c6ad
NC
701#define Set_Node_Cur_Length(node, start) \
702 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
703
704/* Get offsets and lengths */
705#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
706#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
707
07be1b83
YO
708#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
709 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
710 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
711} STMT_END
7122b237 712#endif
07be1b83
YO
713
714#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
715#define EXPERIMENTAL_INPLACESCAN
f427392e 716#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 717
9e9ecfdf
YO
718#define DEBUG_RExC_seen() \
719 DEBUG_OPTIMISE_MORE_r({ \
720 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
721 \
722 if (RExC_seen & REG_SEEN_ZERO_LEN) \
723 PerlIO_printf(Perl_debug_log,"REG_SEEN_ZERO_LEN "); \
724 \
725 if (RExC_seen & REG_SEEN_LOOKBEHIND) \
726 PerlIO_printf(Perl_debug_log,"REG_SEEN_LOOKBEHIND "); \
727 \
728 if (RExC_seen & REG_SEEN_GPOS) \
729 PerlIO_printf(Perl_debug_log,"REG_SEEN_GPOS "); \
730 \
731 if (RExC_seen & REG_SEEN_CANY) \
732 PerlIO_printf(Perl_debug_log,"REG_SEEN_CANY "); \
733 \
734 if (RExC_seen & REG_SEEN_RECURSE) \
735 PerlIO_printf(Perl_debug_log,"REG_SEEN_RECURSE "); \
736 \
737 if (RExC_seen & REG_TOP_LEVEL_BRANCHES) \
738 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES "); \
739 \
740 if (RExC_seen & REG_SEEN_VERBARG) \
741 PerlIO_printf(Perl_debug_log,"REG_SEEN_VERBARG "); \
742 \
743 if (RExC_seen & REG_SEEN_CUTGROUP) \
744 PerlIO_printf(Perl_debug_log,"REG_SEEN_CUTGROUP "); \
745 \
746 if (RExC_seen & REG_SEEN_RUN_ON_COMMENT) \
747 PerlIO_printf(Perl_debug_log,"REG_SEEN_RUN_ON_COMMENT "); \
748 \
749 if (RExC_seen & REG_SEEN_EXACTF_SHARP_S) \
750 PerlIO_printf(Perl_debug_log,"REG_SEEN_EXACTF_SHARP_S "); \
751 \
752 if (RExC_seen & REG_SEEN_GOSTART) \
753 PerlIO_printf(Perl_debug_log,"REG_SEEN_GOSTART "); \
754 \
755 PerlIO_printf(Perl_debug_log,"\n"); \
756 });
757
304ee84b
YO
758#define DEBUG_STUDYDATA(str,data,depth) \
759DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 760 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
761 "%*s" str "Pos:%"IVdf"/%"IVdf \
762 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
763 (int)(depth)*2, "", \
764 (IV)((data)->pos_min), \
765 (IV)((data)->pos_delta), \
304ee84b 766 (UV)((data)->flags), \
1de06328 767 (IV)((data)->whilem_c), \
304ee84b
YO
768 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
769 is_inf ? "INF " : "" \
1de06328
YO
770 ); \
771 if ((data)->last_found) \
772 PerlIO_printf(Perl_debug_log, \
773 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
774 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
775 SvPVX_const((data)->last_found), \
776 (IV)((data)->last_end), \
777 (IV)((data)->last_start_min), \
778 (IV)((data)->last_start_max), \
779 ((data)->longest && \
780 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
781 SvPVX_const((data)->longest_fixed), \
782 (IV)((data)->offset_fixed), \
783 ((data)->longest && \
784 (data)->longest==&((data)->longest_float)) ? "*" : "", \
785 SvPVX_const((data)->longest_float), \
786 (IV)((data)->offset_float_min), \
787 (IV)((data)->offset_float_max) \
788 ); \
789 PerlIO_printf(Perl_debug_log,"\n"); \
790});
791
653099ff 792/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 793 Update the longest found anchored substring and the longest found
653099ff
GS
794 floating substrings if needed. */
795
4327152a 796STATIC void
ea3daa5d
FC
797S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
798 SSize_t *minlenp, int is_inf)
c277df42 799{
e1ec3a88
AL
800 const STRLEN l = CHR_SVLEN(data->last_found);
801 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 802 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 803
7918f24d
NC
804 PERL_ARGS_ASSERT_SCAN_COMMIT;
805
c277df42 806 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 807 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
808 if (*data->longest == data->longest_fixed) {
809 data->offset_fixed = l ? data->last_start_min : data->pos_min;
810 if (data->flags & SF_BEFORE_EOL)
b81d288d 811 data->flags
c277df42
IZ
812 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
813 else
814 data->flags &= ~SF_FIX_BEFORE_EOL;
686b73d4 815 data->minlen_fixed=minlenp;
1de06328 816 data->lookbehind_fixed=0;
a0ed51b3 817 }
304ee84b 818 else { /* *data->longest == data->longest_float */
c277df42 819 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
820 data->offset_float_max = (l
821 ? data->last_start_max
ea3daa5d
FC
822 : (data->pos_delta == SSize_t_MAX
823 ? SSize_t_MAX
824 : data->pos_min + data->pos_delta));
825 if (is_inf
826 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
827 data->offset_float_max = SSize_t_MAX;
c277df42 828 if (data->flags & SF_BEFORE_EOL)
b81d288d 829 data->flags
c277df42
IZ
830 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
831 else
832 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
833 data->minlen_float=minlenp;
834 data->lookbehind_float=0;
c277df42
IZ
835 }
836 }
837 SvCUR_set(data->last_found, 0);
0eda9292 838 {
a28509cc 839 SV * const sv = data->last_found;
097eb12c
AL
840 if (SvUTF8(sv) && SvMAGICAL(sv)) {
841 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
842 if (mg)
843 mg->mg_len = 0;
844 }
0eda9292 845 }
c277df42
IZ
846 data->last_end = -1;
847 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 848 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
849}
850
cdd87c1d
KW
851/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
852 * list that describes which code points it matches */
853
653099ff 854STATIC void
3420edd7 855S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 856{
cdd87c1d
KW
857 /* Set the SSC 'ssc' to match an empty string or any code point */
858
557bd3fb 859 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 860
cdd87c1d 861 assert(OP(ssc) == ANYOF_SYNTHETIC);
3fffb88a 862
cdd87c1d
KW
863 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
864 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
a0dd4231 865 ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */
653099ff
GS
866}
867
653099ff 868STATIC int
cdd87c1d 869S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
653099ff 870{
c144baaa
KW
871 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
872 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
873 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
874 * in any way, so there's no point in using it */
cdd87c1d
KW
875
876 UV start, end;
877 bool ret;
653099ff 878
557bd3fb 879 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 880
cdd87c1d
KW
881 assert(OP(ssc) == ANYOF_SYNTHETIC);
882
77ebeeba 883 if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
cdd87c1d
KW
884 return FALSE;
885 }
886
887 /* See if the list consists solely of the range 0 - Infinity */
888 invlist_iterinit(ssc->invlist);
889 ret = invlist_iternext(ssc->invlist, &start, &end)
890 && start == 0
891 && end == UV_MAX;
892
893 invlist_iterfinish(ssc->invlist);
894
895 if (ret) {
896 return TRUE;
897 }
898
899 /* If e.g., both \w and \W are set, matches everything */
900 if (ANYOF_FLAGS(ssc) & ANYOF_POSIXL) {
901 int i;
902 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
903 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
904 return TRUE;
905 }
906 }
907 }
908
909 return FALSE;
653099ff
GS
910}
911
653099ff 912STATIC void
cdd87c1d 913S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 914{
cdd87c1d
KW
915 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
916 * string, any code point, or any posix class under locale */
917
557bd3fb 918 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 919
557bd3fb 920 Zero(ssc, 1, regnode_ssc);
557bd3fb 921 OP(ssc) = ANYOF_SYNTHETIC;
cdd87c1d 922 ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
3420edd7 923 ssc_anything(ssc);
cdd87c1d
KW
924
925 /* If any portion of the regex is to operate under locale rules,
926 * initialization includes it. The reason this isn't done for all regexes
927 * is that the optimizer was written under the assumption that locale was
928 * all-or-nothing. Given the complexity and lack of documentation in the
929 * optimizer, and that there are inadequate test cases for locale, many
930 * parts of it may not work properly, it is safest to avoid locale unless
931 * necessary. */
932 if (RExC_contains_locale) {
933 ANYOF_POSIXL_SETALL(ssc);
cfafade5
KW
934 ANYOF_FLAGS(ssc) |= ANYOF_LOCALE|ANYOF_POSIXL;
935 if (RExC_contains_i) {
936 ANYOF_FLAGS(ssc) |= ANYOF_LOC_FOLD;
937 }
cdd87c1d
KW
938 }
939 else {
940 ANYOF_POSIXL_ZERO(ssc);
941 }
653099ff
GS
942}
943
b423522f
KW
944STATIC int
945S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
946 const regnode_ssc *ssc)
947{
948 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
949 * to the list of code points matched, and locale posix classes; hence does
950 * not check its flags) */
951
952 UV start, end;
953 bool ret;
954
955 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
956
957 assert(OP(ssc) == ANYOF_SYNTHETIC);
958
959 invlist_iterinit(ssc->invlist);
960 ret = invlist_iternext(ssc->invlist, &start, &end)
961 && start == 0
962 && end == UV_MAX;
963
964 invlist_iterfinish(ssc->invlist);
965
966 if (! ret) {
967 return FALSE;
968 }
969
970 if (RExC_contains_locale) {
971 if (! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE)
972 || ! (ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
973 || ! ANYOF_POSIXL_TEST_ALL_SET(ssc))
974 {
975 return FALSE;
976 }
977 if (RExC_contains_i && ! (ANYOF_FLAGS(ssc) & ANYOF_LOC_FOLD)) {
978 return FALSE;
979 }
980 }
981
982 return TRUE;
983}
984
985STATIC SV*
986S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
987 const regnode_charclass_posixl* const node)
988{
989 /* Returns a mortal inversion list defining which code points are matched
990 * by 'node', which is of type ANYOF. Handles complementing the result if
991 * appropriate. If some code points aren't knowable at this time, the
992 * returned list must, and will, contain every possible code point. */
993
994 SV* invlist = sv_2mortal(_new_invlist(0));
995 unsigned int i;
996 const U32 n = ARG(node);
997
998 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
999
1000 /* Look at the data structure created by S_set_ANYOF_arg() */
1001 if (n != ANYOF_NONBITMAP_EMPTY) {
1002 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1003 AV * const av = MUTABLE_AV(SvRV(rv));
1004 SV **const ary = AvARRAY(av);
1005 assert(RExC_rxi->data->what[n] == 's');
1006
1007 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1008 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1009 }
1010 else if (ary[0] && ary[0] != &PL_sv_undef) {
1011
1012 /* Here, no compile-time swash, and there are things that won't be
1013 * known until runtime -- we have to assume it could be anything */
1014 return _add_range_to_invlist(invlist, 0, UV_MAX);
1015 }
1016 else {
1017
1018 /* Here no compile-time swash, and no run-time only data. Use the
1019 * node's inversion list */
1020 invlist = sv_2mortal(invlist_clone(ary[2]));
1021 }
1022 }
1023
1024 /* An ANYOF node contains a bitmap for the first 256 code points, and an
1025 * inversion list for the others, but if there are code points that should
1026 * match only conditionally on the target string being UTF-8, those are
1027 * placed in the inversion list, and not the bitmap. Since there are
1028 * circumstances under which they could match, they are included in the
1029 * SSC. But if the ANYOF node is to be inverted, we have to exclude them
1030 * here, so that when we invert below, the end result actually does include
1031 * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here
1032 * before we add the unconditionally matched code points */
1033 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1034 _invlist_intersection_complement_2nd(invlist,
1035 PL_UpperLatin1,
1036 &invlist);
1037 }
1038
1039 /* Add in the points from the bit map */
1040 for (i = 0; i < 256; i++) {
1041 if (ANYOF_BITMAP_TEST(node, i)) {
1042 invlist = add_cp_to_invlist(invlist, i);
1043 }
1044 }
1045
1046 /* If this can match all upper Latin1 code points, have to add them
1047 * as well */
1048 if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_LATIN1_ALL) {
1049 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1050 }
1051
1052 /* Similarly for these */
1053 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1054 invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1055 }
1056
1057 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1058 _invlist_invert(invlist);
1059 }
1060
1061 return invlist;
1062}
1063
1051e1c4 1064/* These two functions currently do the exact same thing */
557bd3fb 1065#define ssc_init_zero ssc_init
653099ff 1066
cdd87c1d
KW
1067#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1068#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1069
b423522f
KW
1070STATIC void
1071S_ssc_flags_and(regnode_ssc *ssc, const U8 and_with)
1072{
1073 /* Take the flags 'and_with' and accumulate them anded into the flags for
a0dd4231
KW
1074 * the SSC 'ssc'. The non-SSC related flags in 'and_with' are ignored.
1075 * The flags 'and_with' should not come from another SSC (otherwise the
1076 * EMPTY_STRING flag won't work) */
b423522f
KW
1077
1078 const U8 ssc_only_flags = ANYOF_FLAGS(ssc) & ~ANYOF_LOCALE_FLAGS;
1079
1080 PERL_ARGS_ASSERT_SSC_FLAGS_AND;
1081
1082 /* Use just the SSC-related flags from 'and_with' */
1083 ANYOF_FLAGS(ssc) &= (and_with & ANYOF_LOCALE_FLAGS);
1084 ANYOF_FLAGS(ssc) |= ssc_only_flags;
1085}
1086
557bd3fb 1087/* 'AND' a given class with another one. Can create false positives. 'ssc'
8efd3f97 1088 * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if
b8f7bb16 1089 * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1090
653099ff 1091STATIC void
b423522f
KW
1092S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1093 const regnode_ssc *and_with)
653099ff 1094{
cdd87c1d
KW
1095 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1096 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1097
a0dd4231
KW
1098 SV* anded_cp_list;
1099 U8 anded_flags;
1e6ade67 1100
cdd87c1d 1101 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1102
cdd87c1d 1103 assert(OP(ssc) == ANYOF_SYNTHETIC);
a0dd4231
KW
1104
1105 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1106 * the code point inversion list and just the relevant flags */
1107 if (OP(and_with) == ANYOF_SYNTHETIC) {
1108 anded_cp_list = and_with->invlist;
1109 anded_flags = ANYOF_FLAGS(and_with);
1110 }
1111 else {
1112 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1113 (regnode_charclass_posixl*) and_with);
1114 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_LOCALE_FLAGS;
1115 }
1116
1117 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1118
1119 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1120 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1121 * 'and_with' may be inverted. When not inverted, we have the situation of
1122 * computing:
1123 * (C1 | P1) & (C2 | P2)
1124 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1125 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1126 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1127 * <= ((C1 & C2) | P1 | P2)
1128 * Alternatively, the last few steps could be:
1129 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1130 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1131 * <= (C1 | C2 | (P1 & P2))
1132 * We favor the second approach if either P1 or P2 is non-empty. This is
1133 * because these components are a barrier to doing optimizations, as what
1134 * they match cannot be known until the moment of matching as they are
1135 * dependent on the current locale, 'AND"ing them likely will reduce or
1136 * eliminate them.
1137 * But we can do better if we know that C1,P1 are in their initial state (a
1138 * frequent occurrence), each matching everything:
1139 * (<everything>) & (C2 | P2) = C2 | P2
1140 * Similarly, if C2,P2 are in their initial state (again a frequent
1141 * occurrence), the result is a no-op
1142 * (C1 | P1) & (<everything>) = C1 | P1
1143 *
1144 * Inverted, we have
1145 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1146 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1147 * <= (C1 & ~C2) | (P1 & ~P2)
1148 * */
1aa99e6b 1149
a0dd4231
KW
1150 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
1151 && OP(and_with) != ANYOF_SYNTHETIC)
1152 {
cdd87c1d 1153 unsigned int i;
8951c461 1154
cdd87c1d
KW
1155 ssc_intersection(ssc,
1156 anded_cp_list,
1157 FALSE /* Has already been inverted */
1158 );
c6b76537 1159
cdd87c1d
KW
1160 /* If either P1 or P2 is empty, the intersection will be also; can skip
1161 * the loop */
1162 if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1163 ANYOF_POSIXL_ZERO(ssc);
1164 }
1165 else if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1166
1167 /* Note that the Posix class component P from 'and_with' actually
1168 * looks like:
1169 * P = Pa | Pb | ... | Pn
1170 * where each component is one posix class, such as in [\w\s].
1171 * Thus
1172 * ~P = ~(Pa | Pb | ... | Pn)
1173 * = ~Pa & ~Pb & ... & ~Pn
1174 * <= ~Pa | ~Pb | ... | ~Pn
1175 * The last is something we can easily calculate, but unfortunately
1176 * is likely to have many false positives. We could do better
1177 * in some (but certainly not all) instances if two classes in
1178 * P have known relationships. For example
1179 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1180 * So
1181 * :lower: & :print: = :lower:
1182 * And similarly for classes that must be disjoint. For example,
1183 * since \s and \w can have no elements in common based on rules in
1184 * the POSIX standard,
1185 * \w & ^\S = nothing
1186 * Unfortunately, some vendor locales do not meet the Posix
1187 * standard, in particular almost everything by Microsoft.
1188 * The loop below just changes e.g., \w into \W and vice versa */
1189
1190 regnode_charclass_posixl temp;
1191 int add = 1; /* To calculate the index of the complement */
1192
1193 ANYOF_POSIXL_ZERO(&temp);
1194 for (i = 0; i < ANYOF_MAX; i++) {
1195 assert(i % 2 != 0
1196 || ! ANYOF_POSIXL_TEST(and_with, i)
1197 || ! ANYOF_POSIXL_TEST(and_with, i + 1));
1198
1199 if (ANYOF_POSIXL_TEST(and_with, i)) {
1200 ANYOF_POSIXL_SET(&temp, i + add);
1201 }
1202 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1203 }
1204 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1205
cdd87c1d
KW
1206 } /* else ssc already has no posixes */
1207 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1208 in its initial state */
1209 else if (OP(and_with) != ANYOF_SYNTHETIC
1210 || ! ssc_is_cp_posixl_init(pRExC_state, and_with))
1211 {
1212 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1213 * copy it over 'ssc' */
1214 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1215 if (OP(and_with) == ANYOF_SYNTHETIC) {
1216 StructCopy(and_with, ssc, regnode_ssc);
1217 }
1218 else {
1219 ssc->invlist = anded_cp_list;
1220 ANYOF_POSIXL_ZERO(ssc);
1221 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1222 ANYOF_POSIXL_OR(and_with, ssc);
1223 }
1224 }
1225 }
1226 else if ((ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
1227 || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1228 {
1229 /* One or the other of P1, P2 is non-empty. */
1230 ANYOF_POSIXL_AND(and_with, ssc);
1231 ssc_union(ssc, anded_cp_list, FALSE);
1232 }
1233 else { /* P1 = P2 = empty */
1234 ssc_intersection(ssc, anded_cp_list, FALSE);
1235 }
137165a6 1236 }
653099ff
GS
1237}
1238
653099ff 1239STATIC void
cdd87c1d
KW
1240S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1241 const regnode_ssc *or_with)
653099ff 1242{
cdd87c1d
KW
1243 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1244 * another SSC or a regular ANYOF class. Can create false positives if
1245 * 'or_with' is to be inverted. */
7918f24d 1246
a0dd4231
KW
1247 SV* ored_cp_list;
1248 U8 ored_flags;
c6b76537 1249
cdd87c1d 1250 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 1251
cdd87c1d 1252 assert(OP(ssc) == ANYOF_SYNTHETIC);
a0dd4231
KW
1253
1254 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1255 * the code point inversion list and just the relevant flags */
1256 if (OP(or_with) == ANYOF_SYNTHETIC) {
1257 ored_cp_list = or_with->invlist;
1258 ored_flags = ANYOF_FLAGS(or_with);
1259 }
1260 else {
1261 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1262 (regnode_charclass_posixl*) or_with);
1263 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_LOCALE_FLAGS;
1264 }
1265
1266 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
1267
1268 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1269 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1270 * 'or_with' may be inverted. When not inverted, we have the simple
1271 * situation of computing:
1272 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1273 * If P1|P2 yields a situation with both a class and its complement are
1274 * set, like having both \w and \W, this matches all code points, and we
1275 * can delete these from the P component of the ssc going forward. XXX We
1276 * might be able to delete all the P components, but I (khw) am not certain
1277 * about this, and it is better to be safe.
1278 *
1279 * Inverted, we have
1280 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1281 * <= (C1 | P1) | ~C2
1282 * <= (C1 | ~C2) | P1
1283 * (which results in actually simpler code than the non-inverted case)
1284 * */
9826f543 1285
a0dd4231
KW
1286 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
1287 && OP(or_with) != ANYOF_SYNTHETIC)
1288 {
cdd87c1d
KW
1289 /* We ignore P2, leaving P1 going forward */
1290 }
1291 else { /* Not inverted */
1292 ANYOF_POSIXL_OR(or_with, ssc);
1293 if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1294 unsigned int i;
1295 for (i = 0; i < ANYOF_MAX; i += 2) {
1296 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1297 {
1298 ssc_match_all_cp(ssc);
1299 ANYOF_POSIXL_CLEAR(ssc, i);
1300 ANYOF_POSIXL_CLEAR(ssc, i+1);
1301 if (! ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1302 ANYOF_FLAGS(ssc) &= ~ANYOF_POSIXL;
1303 }
1304 }
1305 }
1306 }
1aa99e6b 1307 }
cdd87c1d
KW
1308
1309 ssc_union(ssc,
1310 ored_cp_list,
1311 FALSE /* Already has been inverted */
1312 );
653099ff
GS
1313}
1314
b423522f
KW
1315PERL_STATIC_INLINE void
1316S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1317{
1318 PERL_ARGS_ASSERT_SSC_UNION;
1319
1320 assert(OP(ssc) == ANYOF_SYNTHETIC);
1321
1322 _invlist_union_maybe_complement_2nd(ssc->invlist,
1323 invlist,
1324 invert2nd,
1325 &ssc->invlist);
1326}
1327
1328PERL_STATIC_INLINE void
1329S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1330 SV* const invlist,
1331 const bool invert2nd)
1332{
1333 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1334
1335 assert(OP(ssc) == ANYOF_SYNTHETIC);
1336
1337 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1338 invlist,
1339 invert2nd,
1340 &ssc->invlist);
1341}
1342
1343PERL_STATIC_INLINE void
1344S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1345{
1346 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1347
1348 assert(OP(ssc) == ANYOF_SYNTHETIC);
1349
1350 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1351}
1352
1353PERL_STATIC_INLINE void
1354S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1355{
1356 /* AND just the single code point 'cp' into the SSC 'ssc' */
1357
1358 SV* cp_list = _new_invlist(2);
1359
1360 PERL_ARGS_ASSERT_SSC_CP_AND;
1361
1362 assert(OP(ssc) == ANYOF_SYNTHETIC);
1363
1364 cp_list = add_cp_to_invlist(cp_list, cp);
1365 ssc_intersection(ssc, cp_list,
1366 FALSE /* Not inverted */
1367 );
1368 SvREFCNT_dec_NN(cp_list);
1369}
1370
1371PERL_STATIC_INLINE void
1372S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1373{
1374 /* Set the SSC 'ssc' to not match any locale things */
1375
1376 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1377
1378 assert(OP(ssc) == ANYOF_SYNTHETIC);
1379
1380 ANYOF_POSIXL_ZERO(ssc);
1381 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1382}
1383
1384STATIC void
1385S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1386{
1387 /* The inversion list in the SSC is marked mortal; now we need a more
1388 * permanent copy, which is stored the same way that is done in a regular
1389 * ANYOF node, with the first 256 code points in a bit map */
1390
1391 SV* invlist = invlist_clone(ssc->invlist);
1392
1393 PERL_ARGS_ASSERT_SSC_FINALIZE;
1394
1395 assert(OP(ssc) == ANYOF_SYNTHETIC);
1396
a0dd4231
KW
1397 /* The code in this file assumes that all but these flags aren't relevant
1398 * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1399 * time we reach here */
1400 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_LOCALE_FLAGS));
1401
b423522f
KW
1402 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1403
1404 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL, FALSE);
1405
1406 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE) || RExC_contains_locale);
1407}
1408
a3621e74
YO
1409#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1410#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1411#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1412#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1413
3dab1dad
YO
1414
1415#ifdef DEBUGGING
07be1b83 1416/*
2b8b4781
NC
1417 dump_trie(trie,widecharmap,revcharmap)
1418 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1419 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
1420
1421 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
1422 The _interim_ variants are used for debugging the interim
1423 tables that are used to generate the final compressed
1424 representation which is what dump_trie expects.
1425
486ec47a 1426 Part of the reason for their existence is to provide a form
3dab1dad 1427 of documentation as to how the different representations function.
07be1b83
YO
1428
1429*/
3dab1dad
YO
1430
1431/*
3dab1dad
YO
1432 Dumps the final compressed table form of the trie to Perl_debug_log.
1433 Used for debugging make_trie().
1434*/
b9a59e08 1435
3dab1dad 1436STATIC void
2b8b4781
NC
1437S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1438 AV *revcharmap, U32 depth)
3dab1dad
YO
1439{
1440 U32 state;
ab3bbdeb 1441 SV *sv=sv_newmortal();
55eed653 1442 int colwidth= widecharmap ? 6 : 4;
2e64971a 1443 U16 word;
3dab1dad
YO
1444 GET_RE_DEBUG_FLAGS_DECL;
1445
7918f24d 1446 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 1447
3dab1dad
YO
1448 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1449 (int)depth * 2 + 2,"",
1450 "Match","Base","Ofs" );
1451
1452 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 1453 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 1454 if ( tmp ) {
ab3bbdeb
YO
1455 PerlIO_printf( Perl_debug_log, "%*s",
1456 colwidth,
ddc5bc0f 1457 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1458 PL_colors[0], PL_colors[1],
1459 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1460 PERL_PV_ESCAPE_FIRSTCHAR
1461 )
1462 );
3dab1dad
YO
1463 }
1464 }
1465 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1466 (int)depth * 2 + 2,"");
1467
1468 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 1469 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
1470 PerlIO_printf( Perl_debug_log, "\n");
1471
1e2e3d02 1472 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 1473 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
1474
1475 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1476
1477 if ( trie->states[ state ].wordnum ) {
1478 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1479 } else {
1480 PerlIO_printf( Perl_debug_log, "%6s", "" );
1481 }
1482
1483 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1484
1485 if ( base ) {
1486 U32 ofs = 0;
1487
1488 while( ( base + ofs < trie->uniquecharcount ) ||
1489 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1490 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1491 ofs++;
1492
1493 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1494
1495 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1496 if ( ( base + ofs >= trie->uniquecharcount ) &&
1497 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1498 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1499 {
ab3bbdeb
YO
1500 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1501 colwidth,
3dab1dad
YO
1502 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1503 } else {
ab3bbdeb 1504 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
1505 }
1506 }
1507
1508 PerlIO_printf( Perl_debug_log, "]");
1509
1510 }
1511 PerlIO_printf( Perl_debug_log, "\n" );
1512 }
2e64971a
DM
1513 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1514 for (word=1; word <= trie->wordcount; word++) {
1515 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1516 (int)word, (int)(trie->wordinfo[word].prev),
1517 (int)(trie->wordinfo[word].len));
1518 }
1519 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
1520}
1521/*
3dab1dad
YO
1522 Dumps a fully constructed but uncompressed trie in list form.
1523 List tries normally only are used for construction when the number of
1524 possible chars (trie->uniquecharcount) is very high.
1525 Used for debugging make_trie().
1526*/
1527STATIC void
55eed653 1528S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1529 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1530 U32 depth)
3dab1dad
YO
1531{
1532 U32 state;
ab3bbdeb 1533 SV *sv=sv_newmortal();
55eed653 1534 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1535 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1536
1537 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1538
3dab1dad 1539 /* print out the table precompression. */
ab3bbdeb
YO
1540 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1541 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1542 "------:-----+-----------------\n" );
3dab1dad
YO
1543
1544 for( state=1 ; state < next_alloc ; state ++ ) {
1545 U16 charid;
1546
ab3bbdeb 1547 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1548 (int)depth * 2 + 2,"", (UV)state );
1549 if ( ! trie->states[ state ].wordnum ) {
1550 PerlIO_printf( Perl_debug_log, "%5s| ","");
1551 } else {
1552 PerlIO_printf( Perl_debug_log, "W%4x| ",
1553 trie->states[ state ].wordnum
1554 );
1555 }
1556 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1557 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1558 if ( tmp ) {
1559 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1560 colwidth,
ddc5bc0f 1561 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1562 PL_colors[0], PL_colors[1],
1563 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1564 PERL_PV_ESCAPE_FIRSTCHAR
1565 ) ,
1e2e3d02
YO
1566 TRIE_LIST_ITEM(state,charid).forid,
1567 (UV)TRIE_LIST_ITEM(state,charid).newstate
1568 );
1569 if (!(charid % 10))
664e119d
RGS
1570 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1571 (int)((depth * 2) + 14), "");
1e2e3d02 1572 }
ab3bbdeb
YO
1573 }
1574 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1575 }
1576}
1577
1578/*
3dab1dad
YO
1579 Dumps a fully constructed but uncompressed trie in table form.
1580 This is the normal DFA style state transition table, with a few
1581 twists to facilitate compression later.
1582 Used for debugging make_trie().
1583*/
1584STATIC void
55eed653 1585S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1586 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1587 U32 depth)
3dab1dad
YO
1588{
1589 U32 state;
1590 U16 charid;
ab3bbdeb 1591 SV *sv=sv_newmortal();
55eed653 1592 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1593 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1594
1595 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1596
1597 /*
1598 print out the table precompression so that we can do a visual check
1599 that they are identical.
1600 */
1601
1602 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1603
1604 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1605 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1606 if ( tmp ) {
ab3bbdeb
YO
1607 PerlIO_printf( Perl_debug_log, "%*s",
1608 colwidth,
ddc5bc0f 1609 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1610 PL_colors[0], PL_colors[1],
1611 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1612 PERL_PV_ESCAPE_FIRSTCHAR
1613 )
1614 );
3dab1dad
YO
1615 }
1616 }
1617
1618 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1619
1620 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1621 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1622 }
1623
1624 PerlIO_printf( Perl_debug_log, "\n" );
1625
1626 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1627
1628 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1629 (int)depth * 2 + 2,"",
1630 (UV)TRIE_NODENUM( state ) );
1631
1632 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1633 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1634 if (v)
1635 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1636 else
1637 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1638 }
1639 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1640 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1641 } else {
1642 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1643 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1644 }
1645 }
07be1b83 1646}
3dab1dad
YO
1647
1648#endif
1649
2e64971a 1650
786e8c11
YO
1651/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1652 startbranch: the first branch in the whole branch sequence
1653 first : start branch of sequence of branch-exact nodes.
1654 May be the same as startbranch
1655 last : Thing following the last branch.
1656 May be the same as tail.
1657 tail : item following the branch sequence
1658 count : words in the sequence
1659 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1660 depth : indent depth
3dab1dad 1661
786e8c11 1662Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1663
786e8c11
YO
1664A trie is an N'ary tree where the branches are determined by digital
1665decomposition of the key. IE, at the root node you look up the 1st character and
1666follow that branch repeat until you find the end of the branches. Nodes can be
1667marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1668
786e8c11 1669 /he|she|his|hers/
72f13be8 1670
786e8c11
YO
1671would convert into the following structure. Numbers represent states, letters
1672following numbers represent valid transitions on the letter from that state, if
1673the number is in square brackets it represents an accepting state, otherwise it
1674will be in parenthesis.
07be1b83 1675
786e8c11
YO
1676 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1677 | |
1678 | (2)
1679 | |
1680 (1) +-i->(6)-+-s->[7]
1681 |
1682 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1683
786e8c11
YO
1684 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1685
1686This shows that when matching against the string 'hers' we will begin at state 1
1687read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1688then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1689is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1690single traverse. We store a mapping from accepting to state to which word was
1691matched, and then when we have multiple possibilities we try to complete the
1692rest of the regex in the order in which they occured in the alternation.
1693
1694The only prior NFA like behaviour that would be changed by the TRIE support is
1695the silent ignoring of duplicate alternations which are of the form:
1696
1697 / (DUPE|DUPE) X? (?{ ... }) Y /x
1698
4b714af6 1699Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 1700and without the optimisation. With the optimisations dupes will be silently
486ec47a 1701ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
1702the following demonstrates:
1703
1704 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1705
1706which prints out 'word' three times, but
1707
1708 'words'=~/(word|word|word)(?{ print $1 })S/
1709
1710which doesnt print it out at all. This is due to other optimisations kicking in.
1711
1712Example of what happens on a structural level:
1713
486ec47a 1714The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
1715
1716 1: CURLYM[1] {1,32767}(18)
1717 5: BRANCH(8)
1718 6: EXACT <ac>(16)
1719 8: BRANCH(11)
1720 9: EXACT <ad>(16)
1721 11: BRANCH(14)
1722 12: EXACT <ab>(16)
1723 16: SUCCEED(0)
1724 17: NOTHING(18)
1725 18: END(0)
1726
1727This would be optimizable with startbranch=5, first=5, last=16, tail=16
1728and should turn into:
1729
1730 1: CURLYM[1] {1,32767}(18)
1731 5: TRIE(16)
1732 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1733 <ac>
1734 <ad>
1735 <ab>
1736 16: SUCCEED(0)
1737 17: NOTHING(18)
1738 18: END(0)
1739
1740Cases where tail != last would be like /(?foo|bar)baz/:
1741
1742 1: BRANCH(4)
1743 2: EXACT <foo>(8)
1744 4: BRANCH(7)
1745 5: EXACT <bar>(8)
1746 7: TAIL(8)
1747 8: EXACT <baz>(10)
1748 10: END(0)
1749
1750which would be optimizable with startbranch=1, first=1, last=7, tail=8
1751and would end up looking like:
1752
1753 1: TRIE(8)
1754 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1755 <foo>
1756 <bar>
1757 7: TAIL(8)
1758 8: EXACT <baz>(10)
1759 10: END(0)
1760
c80e42f3 1761 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
1762
1763is the recommended Unicode-aware way of saying
1764
1765 *(d++) = uv;
1766*/
1767
fab2782b 1768#define TRIE_STORE_REVCHAR(val) \
786e8c11 1769 STMT_START { \
73031816 1770 if (UTF) { \
fab2782b 1771 SV *zlopp = newSV(7); /* XXX: optimize me */ \
88c9ea1e 1772 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
c80e42f3 1773 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
73031816
NC
1774 SvCUR_set(zlopp, kapow - flrbbbbb); \
1775 SvPOK_on(zlopp); \
1776 SvUTF8_on(zlopp); \
1777 av_push(revcharmap, zlopp); \
1778 } else { \
fab2782b 1779 char ooooff = (char)val; \
73031816
NC
1780 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1781 } \
1782 } STMT_END
786e8c11 1783
914a25d5
KW
1784/* This gets the next character from the input, folding it if not already
1785 * folded. */
1786#define TRIE_READ_CHAR STMT_START { \
1787 wordlen++; \
1788 if ( UTF ) { \
1789 /* if it is UTF then it is either already folded, or does not need \
1790 * folding */ \
1c1d615a 1791 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
1792 } \
1793 else if (folder == PL_fold_latin1) { \
7d006b13
KW
1794 /* This folder implies Unicode rules, which in the range expressible \
1795 * by not UTF is the lower case, with the two exceptions, one of \
1796 * which should have been taken care of before calling this */ \
1797 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1798 uvc = toLOWER_L1(*uc); \
1799 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1800 len = 1; \
914a25d5
KW
1801 } else { \
1802 /* raw data, will be folded later if needed */ \
1803 uvc = (U32)*uc; \
1804 len = 1; \
1805 } \
786e8c11
YO
1806} STMT_END
1807
1808
1809
1810#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1811 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1812 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1813 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1814 } \
1815 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1816 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1817 TRIE_LIST_CUR( state )++; \
1818} STMT_END
07be1b83 1819
786e8c11
YO
1820#define TRIE_LIST_NEW(state) STMT_START { \
1821 Newxz( trie->states[ state ].trans.list, \
1822 4, reg_trie_trans_le ); \
1823 TRIE_LIST_CUR( state ) = 1; \
1824 TRIE_LIST_LEN( state ) = 4; \
1825} STMT_END
07be1b83 1826
786e8c11
YO
1827#define TRIE_HANDLE_WORD(state) STMT_START { \
1828 U16 dupe= trie->states[ state ].wordnum; \
1829 regnode * const noper_next = regnext( noper ); \
1830 \
786e8c11
YO
1831 DEBUG_r({ \
1832 /* store the word for dumping */ \
1833 SV* tmp; \
1834 if (OP(noper) != NOTHING) \
740cce10 1835 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1836 else \
740cce10 1837 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1838 av_push( trie_words, tmp ); \
786e8c11
YO
1839 }); \
1840 \
1841 curword++; \
2e64971a
DM
1842 trie->wordinfo[curword].prev = 0; \
1843 trie->wordinfo[curword].len = wordlen; \
1844 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1845 \
1846 if ( noper_next < tail ) { \
1847 if (!trie->jump) \
c944940b 1848 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1849 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1850 if (!jumper) \
1851 jumper = noper_next; \
1852 if (!nextbranch) \
1853 nextbranch= regnext(cur); \
1854 } \
1855 \
1856 if ( dupe ) { \
2e64971a
DM
1857 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1858 /* chain, so that when the bits of chain are later */\
1859 /* linked together, the dups appear in the chain */\
1860 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1861 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1862 } else { \
1863 /* we haven't inserted this word yet. */ \
1864 trie->states[ state ].wordnum = curword; \
1865 } \
1866} STMT_END
07be1b83 1867
3dab1dad 1868
786e8c11
YO
1869#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1870 ( ( base + charid >= ucharcount \
1871 && base + charid < ubound \
1872 && state == trie->trans[ base - ucharcount + charid ].check \
1873 && trie->trans[ base - ucharcount + charid ].next ) \
1874 ? trie->trans[ base - ucharcount + charid ].next \
1875 : ( state==1 ? special : 0 ) \
1876 )
3dab1dad 1877
786e8c11
YO
1878#define MADE_TRIE 1
1879#define MADE_JUMP_TRIE 2
1880#define MADE_EXACT_TRIE 4
3dab1dad 1881
a3621e74 1882STATIC I32
786e8c11 1883S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1884{
27da23d5 1885 dVAR;
a3621e74
YO
1886 /* first pass, loop through and scan words */
1887 reg_trie_data *trie;
55eed653 1888 HV *widecharmap = NULL;
2b8b4781 1889 AV *revcharmap = newAV();
a3621e74 1890 regnode *cur;
a3621e74
YO
1891 STRLEN len = 0;
1892 UV uvc = 0;
1893 U16 curword = 0;
1894 U32 next_alloc = 0;
786e8c11
YO
1895 regnode *jumper = NULL;
1896 regnode *nextbranch = NULL;
7f69552c 1897 regnode *convert = NULL;
2e64971a 1898 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1899 /* we just use folder as a flag in utf8 */
1e696034 1900 const U8 * folder = NULL;
a3621e74 1901
2b8b4781 1902#ifdef DEBUGGING
cf78de0b 1903 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2b8b4781
NC
1904 AV *trie_words = NULL;
1905 /* along with revcharmap, this only used during construction but both are
1906 * useful during debugging so we store them in the struct when debugging.
8e11feef 1907 */
2b8b4781 1908#else
cf78de0b 1909 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 1910 STRLEN trie_charcount=0;
3dab1dad 1911#endif
2b8b4781 1912 SV *re_trie_maxbuff;
a3621e74 1913 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1914
1915 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1916#ifndef DEBUGGING
1917 PERL_UNUSED_ARG(depth);
1918#endif
a3621e74 1919
1e696034 1920 switch (flags) {
79a81a6e 1921 case EXACT: break;
2f7f8cb1 1922 case EXACTFA:
fab2782b 1923 case EXACTFU_SS:
1e696034
KW
1924 case EXACTFU: folder = PL_fold_latin1; break;
1925 case EXACTF: folder = PL_fold; break;
1926 case EXACTFL: folder = PL_fold_locale; break;
fab2782b 1927 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
1928 }
1929
c944940b 1930 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1931 trie->refcount = 1;
3dab1dad 1932 trie->startstate = 1;
786e8c11 1933 trie->wordcount = word_count;
f8fc2ecf 1934 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1935 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
fab2782b 1936 if (flags == EXACT)
c944940b 1937 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1938 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1939 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1940
a3621e74 1941 DEBUG_r({
2b8b4781 1942 trie_words = newAV();
a3621e74 1943 });
a3621e74 1944
0111c4fd 1945 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1946 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1947 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1948 }
df826430 1949 DEBUG_TRIE_COMPILE_r({
3dab1dad 1950 PerlIO_printf( Perl_debug_log,
786e8c11 1951 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1952 (int)depth * 2 + 2, "",
1953 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1954 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1955 (int)depth);
3dab1dad 1956 });
7f69552c
YO
1957
1958 /* Find the node we are going to overwrite */
1959 if ( first == startbranch && OP( last ) != BRANCH ) {
1960 /* whole branch chain */
1961 convert = first;
1962 } else {
1963 /* branch sub-chain */
1964 convert = NEXTOPER( first );
1965 }
1966
a3621e74
YO
1967 /* -- First loop and Setup --
1968
1969 We first traverse the branches and scan each word to determine if it
1970 contains widechars, and how many unique chars there are, this is
1971 important as we have to build a table with at least as many columns as we
1972 have unique chars.
1973
1974 We use an array of integers to represent the character codes 0..255
38a44b82 1975 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1976 native representation of the character value as the key and IV's for the
1977 coded index.
1978
1979 *TODO* If we keep track of how many times each character is used we can
1980 remap the columns so that the table compression later on is more
3b753521 1981 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
1982 middle and the least common are on the outside. IMO this would be better
1983 than a most to least common mapping as theres a decent chance the most
1984 common letter will share a node with the least common, meaning the node
486ec47a 1985 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
1986 case is when we have the least common nodes twice.
1987
1988 */
1989
a3621e74 1990 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 1991 regnode *noper = NEXTOPER( cur );
e1ec3a88 1992 const U8 *uc = (U8*)STRING( noper );
df826430 1993 const U8 *e = uc + STR_LEN( noper );
a3621e74 1994 STRLEN foldlen = 0;
07be1b83 1995 U32 wordlen = 0; /* required init */
afa96d92
KW
1996 STRLEN minbytes = 0;
1997 STRLEN maxbytes = 0;
02daf0ab 1998 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1999
3dab1dad 2000 if (OP(noper) == NOTHING) {
df826430
YO
2001 regnode *noper_next= regnext(noper);
2002 if (noper_next != tail && OP(noper_next) == flags) {
2003 noper = noper_next;
2004 uc= (U8*)STRING(noper);
2005 e= uc + STR_LEN(noper);
2006 trie->minlen= STR_LEN(noper);
2007 } else {
2008 trie->minlen= 0;
2009 continue;
2010 }
3dab1dad 2011 }
df826430 2012
fab2782b 2013 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2014 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2015 regardless of encoding */
fab2782b
YO
2016 if (OP( noper ) == EXACTFU_SS) {
2017 /* false positives are ok, so just set this */
0dc4a61d 2018 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2019 }
2020 }
a3621e74 2021 for ( ; uc < e ; uc += len ) {
3dab1dad 2022 TRIE_CHARCOUNT(trie)++;
a3621e74 2023 TRIE_READ_CHAR;
645de4ce
KW
2024
2025 /* Acummulate to the current values, the range in the number of
2026 * bytes that this character could match. The max is presumed to
2027 * be the same as the folded input (which TRIE_READ_CHAR returns),
2028 * except that when this is not in UTF-8, it could be matched
2029 * against a string which is UTF-8, and the variant characters
2030 * could be 2 bytes instead of the 1 here. Likewise, for the
2031 * minimum number of bytes when not folded. When folding, the min
2032 * is assumed to be 1 byte could fold to match the single character
2033 * here, or in the case of a multi-char fold, 1 byte can fold to
2034 * the whole sequence. 'foldlen' is used to denote whether we are
2035 * in such a sequence, skipping the min setting if so. XXX TODO
2036 * Use the exact list of what folds to each character, from
2037 * PL_utf8_foldclosures */
2038 if (UTF) {
2039 maxbytes += UTF8SKIP(uc);
2040 if (! folder) {
2041 /* A non-UTF-8 string could be 1 byte to match our 2 */
2042 minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
2043 ? 1
2044 : UTF8SKIP(uc);
2045 }
2046 else {
2047 if (foldlen) {
2048 foldlen -= UTF8SKIP(uc);
2049 }
2050 else {
2051 foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
2052 minbytes++;
2053 }
2054 }
2055 }
2056 else {
2057 maxbytes += (UNI_IS_INVARIANT(*uc))
2058 ? 1
2059 : 2;
2060 if (! folder) {
2061 minbytes++;
2062 }
2063 else {
2064 if (foldlen) {
2065 foldlen--;
2066 }
2067 else {
2068 foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
2069 minbytes++;
2070 }
2071 }
2072 }
a3621e74 2073 if ( uvc < 256 ) {
fab2782b
YO
2074 if ( folder ) {
2075 U8 folded= folder[ (U8) uvc ];
2076 if ( !trie->charmap[ folded ] ) {
2077 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2078 TRIE_STORE_REVCHAR( folded );
2079 }
2080 }
a3621e74
YO
2081 if ( !trie->charmap[ uvc ] ) {
2082 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2083 TRIE_STORE_REVCHAR( uvc );
a3621e74 2084 }
02daf0ab 2085 if ( set_bit ) {
62012aee
KW
2086 /* store the codepoint in the bitmap, and its folded
2087 * equivalent. */
fab2782b 2088 TRIE_BITMAP_SET(trie, uvc);
0921ee73
T
2089
2090 /* store the folded codepoint */
fab2782b 2091 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
0921ee73
T
2092
2093 if ( !UTF ) {
2094 /* store first byte of utf8 representation of
acdf4139 2095 variant codepoints */
6f2d5cbc 2096 if (! UVCHR_IS_INVARIANT(uvc)) {
acdf4139 2097 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
2098 }
2099 }
02daf0ab
YO
2100 set_bit = 0; /* We've done our bit :-) */
2101 }
a3621e74
YO
2102 } else {
2103 SV** svpp;
55eed653
NC
2104 if ( !widecharmap )
2105 widecharmap = newHV();
a3621e74 2106
55eed653 2107 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
2108
2109 if ( !svpp )
e4584336 2110 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
2111
2112 if ( !SvTRUE( *svpp ) ) {
2113 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 2114 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
2115 }
2116 }
2117 }
3dab1dad 2118 if( cur == first ) {
afa96d92
KW
2119 trie->minlen = minbytes;
2120 trie->maxlen = maxbytes;
2121 } else if (minbytes < trie->minlen) {
2122 trie->minlen = minbytes;
2123 } else if (maxbytes > trie->maxlen) {
2124 trie->maxlen = maxbytes;
fab2782b 2125 }
a3621e74
YO
2126 } /* end first pass */
2127 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
2128 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2129 (int)depth * 2 + 2,"",
55eed653 2130 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
2131 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2132 (int)trie->minlen, (int)trie->maxlen )
a3621e74 2133 );
a3621e74
YO
2134
2135 /*
2136 We now know what we are dealing with in terms of unique chars and
2137 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
2138 representation using a flat table will take. If it's over a reasonable
2139 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
2140 conservative but potentially much slower representation using an array
2141 of lists.
2142
2143 At the end we convert both representations into the same compressed
2144 form that will be used in regexec.c for matching with. The latter
2145 is a form that cannot be used to construct with but has memory
2146 properties similar to the list form and access properties similar
2147 to the table form making it both suitable for fast searches and
2148 small enough that its feasable to store for the duration of a program.
2149
2150 See the comment in the code where the compressed table is produced
2151 inplace from the flat tabe representation for an explanation of how
2152 the compression works.
2153
2154 */
2155
2156
2e64971a
DM
2157 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2158 prev_states[1] = 0;
2159
3dab1dad 2160 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
2161 /*
2162 Second Pass -- Array Of Lists Representation
2163
2164 Each state will be represented by a list of charid:state records
2165 (reg_trie_trans_le) the first such element holds the CUR and LEN
2166 points of the allocated array. (See defines above).
2167
2168 We build the initial structure using the lists, and then convert
2169 it into the compressed table form which allows faster lookups
2170 (but cant be modified once converted).
a3621e74
YO
2171 */
2172
a3621e74
YO
2173 STRLEN transcount = 1;
2174
1e2e3d02
YO
2175 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2176 "%*sCompiling trie using list compiler\n",
2177 (int)depth * 2 + 2, ""));
686b73d4 2178
c944940b
JH
2179 trie->states = (reg_trie_state *)
2180 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2181 sizeof(reg_trie_state) );
a3621e74
YO
2182 TRIE_LIST_NEW(1);
2183 next_alloc = 2;
2184
2185 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2186
df826430 2187 regnode *noper = NEXTOPER( cur );
c445ea15 2188 U8 *uc = (U8*)STRING( noper );
df826430 2189 const U8 *e = uc + STR_LEN( noper );
c445ea15
AL
2190 U32 state = 1; /* required init */
2191 U16 charid = 0; /* sanity init */
07be1b83 2192 U32 wordlen = 0; /* required init */
c445ea15 2193
df826430
YO
2194 if (OP(noper) == NOTHING) {
2195 regnode *noper_next= regnext(noper);
2196 if (noper_next != tail && OP(noper_next) == flags) {
2197 noper = noper_next;
2198 uc= (U8*)STRING(noper);
2199 e= uc + STR_LEN(noper);
2200 }
2201 }
2202
3dab1dad 2203 if (OP(noper) != NOTHING) {
786e8c11 2204 for ( ; uc < e ; uc += len ) {
c445ea15 2205
786e8c11 2206 TRIE_READ_CHAR;
c445ea15 2207
786e8c11
YO
2208 if ( uvc < 256 ) {
2209 charid = trie->charmap[ uvc ];
c445ea15 2210 } else {
55eed653 2211 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
2212 if ( !svpp ) {
2213 charid = 0;
2214 } else {
2215 charid=(U16)SvIV( *svpp );
2216 }
c445ea15 2217 }
786e8c11
YO
2218 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2219 if ( charid ) {
a3621e74 2220
786e8c11
YO
2221 U16 check;
2222 U32 newstate = 0;
a3621e74 2223
786e8c11
YO
2224 charid--;
2225 if ( !trie->states[ state ].trans.list ) {
2226 TRIE_LIST_NEW( state );
c445ea15 2227 }
786e8c11
YO
2228 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
2229 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
2230 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2231 break;
2232 }
2233 }
2234 if ( ! newstate ) {
2235 newstate = next_alloc++;
2e64971a 2236 prev_states[newstate] = state;
786e8c11
YO
2237 TRIE_LIST_PUSH( state, charid, newstate );
2238 transcount++;
2239 }
2240 state = newstate;
2241 } else {
2242 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 2243 }
a28509cc 2244 }
c445ea15 2245 }
3dab1dad 2246 TRIE_HANDLE_WORD(state);
a3621e74
YO
2247
2248 } /* end second pass */
2249
1e2e3d02
YO
2250 /* next alloc is the NEXT state to be allocated */
2251 trie->statecount = next_alloc;
c944940b
JH
2252 trie->states = (reg_trie_state *)
2253 PerlMemShared_realloc( trie->states,
2254 next_alloc
2255 * sizeof(reg_trie_state) );
a3621e74 2256
3dab1dad 2257 /* and now dump it out before we compress it */
2b8b4781
NC
2258 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2259 revcharmap, next_alloc,
2260 depth+1)
1e2e3d02 2261 );
a3621e74 2262
c944940b
JH
2263 trie->trans = (reg_trie_trans *)
2264 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
2265 {
2266 U32 state;
a3621e74
YO
2267 U32 tp = 0;
2268 U32 zp = 0;
2269
2270
2271 for( state=1 ; state < next_alloc ; state ++ ) {
2272 U32 base=0;
2273
2274 /*
2275 DEBUG_TRIE_COMPILE_MORE_r(
2276 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2277 );
2278 */
2279
2280 if (trie->states[state].trans.list) {
2281 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2282 U16 maxid=minid;
a28509cc 2283 U16 idx;
a3621e74
YO
2284
2285 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
2286 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2287 if ( forid < minid ) {
2288 minid=forid;
2289 } else if ( forid > maxid ) {
2290 maxid=forid;
2291 }
a3621e74
YO
2292 }
2293 if ( transcount < tp + maxid - minid + 1) {
2294 transcount *= 2;
c944940b
JH
2295 trie->trans = (reg_trie_trans *)
2296 PerlMemShared_realloc( trie->trans,
446bd890
NC
2297 transcount
2298 * sizeof(reg_trie_trans) );
a3621e74
YO
2299 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
2300 }
2301 base = trie->uniquecharcount + tp - minid;
2302 if ( maxid == minid ) {
2303 U32 set = 0;
2304 for ( ; zp < tp ; zp++ ) {
2305 if ( ! trie->trans[ zp ].next ) {
2306 base = trie->uniquecharcount + zp - minid;
2307 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2308 trie->trans[ zp ].check = state;
2309 set = 1;
2310 break;
2311 }
2312 }
2313 if ( !set ) {
2314 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2315 trie->trans[ tp ].check = state;
2316 tp++;
2317 zp = tp;
2318 }
2319 } else {
2320 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 2321 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
2322 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
2323 trie->trans[ tid ].check = state;
2324 }
2325 tp += ( maxid - minid + 1 );
2326 }
2327 Safefree(trie->states[ state ].trans.list);
2328 }
2329 /*
2330 DEBUG_TRIE_COMPILE_MORE_r(
2331 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2332 );
2333 */
2334 trie->states[ state ].trans.base=base;
2335 }
cc601c31 2336 trie->lasttrans = tp + 1;
a3621e74
YO
2337 }
2338 } else {
2339 /*
2340 Second Pass -- Flat Table Representation.
2341
b423522f
KW
2342 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2343 each. We know that we will need Charcount+1 trans at most to store
2344 the data (one row per char at worst case) So we preallocate both
2345 structures assuming worst case.
a3621e74
YO
2346
2347 We then construct the trie using only the .next slots of the entry
2348 structs.
2349
b423522f
KW
2350 We use the .check field of the first entry of the node temporarily
2351 to make compression both faster and easier by keeping track of how
2352 many non zero fields are in the node.
a3621e74
YO
2353
2354 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2355 transition.
2356
b423522f
KW
2357 There are two terms at use here: state as a TRIE_NODEIDX() which is
2358 a number representing the first entry of the node, and state as a
2359 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2360 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2361 if there are 2 entrys per node. eg:
a3621e74
YO
2362
2363 A B A B
2364 1. 2 4 1. 3 7
2365 2. 0 3 3. 0 5
2366 3. 0 0 5. 0 0
2367 4. 0 0 7. 0 0
2368
b423522f
KW
2369 The table is internally in the right hand, idx form. However as we
2370 also have to deal with the states array which is indexed by nodenum
2371 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
2372
2373 */
1e2e3d02
YO
2374 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2375 "%*sCompiling trie using table compiler\n",
2376 (int)depth * 2 + 2, ""));
3dab1dad 2377
c944940b
JH
2378 trie->trans = (reg_trie_trans *)
2379 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2380 * trie->uniquecharcount + 1,
2381 sizeof(reg_trie_trans) );
2382 trie->states = (reg_trie_state *)
2383 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2384 sizeof(reg_trie_state) );
a3621e74
YO
2385 next_alloc = trie->uniquecharcount + 1;
2386
3dab1dad 2387
a3621e74
YO
2388 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2389
df826430 2390 regnode *noper = NEXTOPER( cur );
a28509cc 2391 const U8 *uc = (U8*)STRING( noper );
df826430 2392 const U8 *e = uc + STR_LEN( noper );
a3621e74
YO
2393
2394 U32 state = 1; /* required init */
2395
2396 U16 charid = 0; /* sanity init */
2397 U32 accept_state = 0; /* sanity init */
a3621e74 2398
07be1b83 2399 U32 wordlen = 0; /* required init */
a3621e74 2400
df826430
YO
2401 if (OP(noper) == NOTHING) {
2402 regnode *noper_next= regnext(noper);
2403 if (noper_next != tail && OP(noper_next) == flags) {
2404 noper = noper_next;
2405 uc= (U8*)STRING(noper);
2406 e= uc + STR_LEN(noper);
2407 }
2408 }
fab2782b 2409
3dab1dad 2410 if ( OP(noper) != NOTHING ) {
786e8c11 2411 for ( ; uc < e ; uc += len ) {
a3621e74 2412
786e8c11 2413 TRIE_READ_CHAR;
a3621e74 2414
786e8c11
YO
2415 if ( uvc < 256 ) {
2416 charid = trie->charmap[ uvc ];
2417 } else {
55eed653 2418 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 2419 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 2420 }
786e8c11
YO
2421 if ( charid ) {
2422 charid--;
2423 if ( !trie->trans[ state + charid ].next ) {
2424 trie->trans[ state + charid ].next = next_alloc;
2425 trie->trans[ state ].check++;
2e64971a
DM
2426 prev_states[TRIE_NODENUM(next_alloc)]
2427 = TRIE_NODENUM(state);
786e8c11
YO
2428 next_alloc += trie->uniquecharcount;
2429 }
2430 state = trie->trans[ state + charid ].next;
2431 } else {
2432 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2433 }
2434 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 2435 }
a3621e74 2436 }
3dab1dad
YO
2437 accept_state = TRIE_NODENUM( state );
2438 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
2439
2440 } /* end second pass */
2441
3dab1dad 2442 /* and now dump it out before we compress it */
2b8b4781
NC
2443 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2444 revcharmap,
2445 next_alloc, depth+1));
a3621e74 2446
a3621e74
YO
2447 {
2448 /*
2449 * Inplace compress the table.*
2450
2451 For sparse data sets the table constructed by the trie algorithm will
2452 be mostly 0/FAIL transitions or to put it another way mostly empty.
2453 (Note that leaf nodes will not contain any transitions.)
2454
2455 This algorithm compresses the tables by eliminating most such
2456 transitions, at the cost of a modest bit of extra work during lookup:
2457
2458 - Each states[] entry contains a .base field which indicates the
2459 index in the state[] array wheres its transition data is stored.
2460
3b753521 2461 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
2462
2463 - If .base is nonzero then charid is added to it to find an entry in
2464 the trans array.
2465
2466 -If trans[states[state].base+charid].check!=state then the
2467 transition is taken to be a 0/Fail transition. Thus if there are fail
2468 transitions at the front of the node then the .base offset will point
2469 somewhere inside the previous nodes data (or maybe even into a node
2470 even earlier), but the .check field determines if the transition is
2471 valid.
2472
786e8c11 2473 XXX - wrong maybe?
a3621e74 2474 The following process inplace converts the table to the compressed
3b753521 2475 table: We first do not compress the root node 1,and mark all its
a3621e74 2476 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
2477 allows us to do a DFA construction from the compressed table later,
2478 and ensures that any .base pointers we calculate later are greater
2479 than 0.
a3621e74
YO
2480
2481 - We set 'pos' to indicate the first entry of the second node.
2482
2483 - We then iterate over the columns of the node, finding the first and
2484 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2485 and set the .check pointers accordingly, and advance pos
2486 appropriately and repreat for the next node. Note that when we copy
2487 the next pointers we have to convert them from the original
2488 NODEIDX form to NODENUM form as the former is not valid post
2489 compression.
2490
2491 - If a node has no transitions used we mark its base as 0 and do not
2492 advance the pos pointer.
2493
2494 - If a node only has one transition we use a second pointer into the
2495 structure to fill in allocated fail transitions from other states.
2496 This pointer is independent of the main pointer and scans forward
2497 looking for null transitions that are allocated to a state. When it
2498 finds one it writes the single transition into the "hole". If the
786e8c11 2499 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
2500
2501 - Once compressed we can Renew/realloc the structures to release the
2502 excess space.
2503
2504 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2505 specifically Fig 3.47 and the associated pseudocode.
2506
2507 demq
2508 */
a3b680e6 2509 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 2510 U32 state, charid;
a3621e74 2511 U32 pos = 0, zp=0;
1e2e3d02 2512 trie->statecount = laststate;
a3621e74
YO
2513
2514 for ( state = 1 ; state < laststate ; state++ ) {
2515 U8 flag = 0;
a28509cc
AL
2516 const U32 stateidx = TRIE_NODEIDX( state );
2517 const U32 o_used = trie->trans[ stateidx ].check;
2518 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
2519 trie->trans[ stateidx ].check = 0;
2520
2521 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2522 if ( flag || trie->trans[ stateidx + charid ].next ) {
2523 if ( trie->trans[ stateidx + charid ].next ) {
2524 if (o_used == 1) {
2525 for ( ; zp < pos ; zp++ ) {
2526 if ( ! trie->trans[ zp ].next ) {
2527 break;
2528 }
2529 }
2530 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2531 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2532 trie->trans[ zp ].check = state;
2533 if ( ++zp > pos ) pos = zp;
2534 break;
2535 }
2536 used--;
2537 }
2538 if ( !flag ) {
2539 flag = 1;
2540 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2541 }
2542 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2543 trie->trans[ pos ].check = state;
2544 pos++;
2545 }
2546 }
2547 }
cc601c31 2548 trie->lasttrans = pos + 1;
c944940b
JH
2549 trie->states = (reg_trie_state *)
2550 PerlMemShared_realloc( trie->states, laststate
2551 * sizeof(reg_trie_state) );
a3621e74 2552 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 2553 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2554 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2555 (int)depth * 2 + 2,"",
2556 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
2557 (IV)next_alloc,
2558 (IV)pos,
a3621e74
YO
2559 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2560 );
2561
2562 } /* end table compress */
2563 }
1e2e3d02
YO
2564 DEBUG_TRIE_COMPILE_MORE_r(
2565 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2566 (int)depth * 2 + 2, "",
2567 (UV)trie->statecount,
2568 (UV)trie->lasttrans)
2569 );
cc601c31 2570 /* resize the trans array to remove unused space */
c944940b
JH
2571 trie->trans = (reg_trie_trans *)
2572 PerlMemShared_realloc( trie->trans, trie->lasttrans
2573 * sizeof(reg_trie_trans) );
a3621e74 2574
3b753521 2575 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
2576 U8 nodetype =(U8)(flags & 0xFF);
2577 char *str=NULL;
786e8c11 2578
07be1b83 2579#ifdef DEBUGGING
e62cc96a 2580 regnode *optimize = NULL;
7122b237
YO
2581#ifdef RE_TRACK_PATTERN_OFFSETS
2582
b57a0404
JH
2583 U32 mjd_offset = 0;
2584 U32 mjd_nodelen = 0;
7122b237
YO
2585#endif /* RE_TRACK_PATTERN_OFFSETS */
2586#endif /* DEBUGGING */
a3621e74 2587 /*
3dab1dad
YO
2588 This means we convert either the first branch or the first Exact,
2589 depending on whether the thing following (in 'last') is a branch
2590 or not and whther first is the startbranch (ie is it a sub part of
2591 the alternation or is it the whole thing.)
3b753521 2592 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 2593 the whole branch sequence, including the first.
a3621e74 2594 */
3dab1dad 2595 /* Find the node we are going to overwrite */
7f69552c 2596 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 2597 /* branch sub-chain */
3dab1dad 2598 NEXT_OFF( first ) = (U16)(last - first);
7122b237 2599#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
2600 DEBUG_r({
2601 mjd_offset= Node_Offset((convert));
2602 mjd_nodelen= Node_Length((convert));
2603 });
7122b237 2604#endif
7f69552c 2605 /* whole branch chain */
7122b237
YO
2606 }
2607#ifdef RE_TRACK_PATTERN_OFFSETS
2608 else {
7f69552c
YO
2609 DEBUG_r({
2610 const regnode *nop = NEXTOPER( convert );
2611 mjd_offset= Node_Offset((nop));
2612 mjd_nodelen= Node_Length((nop));
2613 });
07be1b83
YO
2614 }
2615 DEBUG_OPTIMISE_r(
2616 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2617 (int)depth * 2 + 2, "",
786e8c11 2618 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2619 );
7122b237 2620#endif
3dab1dad
YO
2621 /* But first we check to see if there is a common prefix we can
2622 split out as an EXACT and put in front of the TRIE node. */
2623 trie->startstate= 1;
55eed653 2624 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2625 U32 state;
1e2e3d02 2626 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2627 U32 ofs = 0;
8e11feef
RGS
2628 I32 idx = -1;
2629 U32 count = 0;
2630 const U32 base = trie->states[ state ].trans.base;
a3621e74 2631
3dab1dad 2632 if ( trie->states[state].wordnum )
8e11feef 2633 count = 1;
a3621e74 2634
8e11feef 2635 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2636 if ( ( base + ofs >= trie->uniquecharcount ) &&
2637 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2638 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2639 {
3dab1dad 2640 if ( ++count > 1 ) {
2b8b4781 2641 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2642 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2643 if ( state == 1 ) break;
3dab1dad
YO
2644 if ( count == 2 ) {
2645 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2646 DEBUG_OPTIMISE_r(
8e11feef
RGS
2647 PerlIO_printf(Perl_debug_log,
2648 "%*sNew Start State=%"UVuf" Class: [",
2649 (int)depth * 2 + 2, "",
786e8c11 2650 (UV)state));
be8e71aa 2651 if (idx >= 0) {
2b8b4781 2652 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2653 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2654
3dab1dad 2655 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2656 if ( folder )
2657 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2658 DEBUG_OPTIMISE_r(
f1f66076 2659 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2660 );
8e11feef
RGS
2661 }
2662 }
2663 TRIE_BITMAP_SET(trie,*ch);
2664 if ( folder )
2665 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2666 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2667 }
2668 idx = ofs;
2669 }
3dab1dad
YO
2670 }
2671 if ( count == 1 ) {
2b8b4781 2672 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2673 STRLEN len;
2674 char *ch = SvPV( *tmp, len );
de734bd5
A
2675 DEBUG_OPTIMISE_r({
2676 SV *sv=sv_newmortal();
8e11feef
RGS
2677 PerlIO_printf( Perl_debug_log,
2678 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2679 (int)depth * 2 + 2, "",
de734bd5
A
2680 (UV)state, (UV)idx,
2681 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2682 PL_colors[0], PL_colors[1],
2683 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2684 PERL_PV_ESCAPE_FIRSTCHAR
2685 )
2686 );
2687 });
3dab1dad
YO
2688 if ( state==1 ) {
2689 OP( convert ) = nodetype;
2690 str=STRING(convert);
2691 STR_LEN(convert)=0;
2692 }
c490c714
YO
2693 STR_LEN(convert) += len;
2694 while (len--)
de734bd5 2695 *str++ = *ch++;
8e11feef 2696 } else {
f9049ba1 2697#ifdef DEBUGGING
8e11feef
RGS
2698 if (state>1)
2699 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2700#endif
8e11feef
RGS
2701 break;
2702 }
2703 }
2e64971a 2704 trie->prefixlen = (state-1);
3dab1dad 2705 if (str) {
8e11feef 2706 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2707 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2708 trie->startstate = state;
07be1b83
YO
2709 trie->minlen -= (state - 1);
2710 trie->maxlen -= (state - 1);
33809eae
JH
2711#ifdef DEBUGGING
2712 /* At least the UNICOS C compiler choked on this
2713 * being argument to DEBUG_r(), so let's just have
2714 * it right here. */
2715 if (
2716#ifdef PERL_EXT_RE_BUILD
2717 1
2718#else
2719 DEBUG_r_TEST
2720#endif
2721 ) {
2722 regnode *fix = convert;
2723 U32 word = trie->wordcount;
2724 mjd_nodelen++;
2725 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2726 while( ++fix < n ) {
2727 Set_Node_Offset_Length(fix, 0, 0);
2728 }
2729 while (word--) {
2730 SV ** const tmp = av_fetch( trie_words, word, 0 );
2731 if (tmp) {
2732 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2733 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2734 else
2735 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2736 }
2737 }
2738 }
2739#endif
8e11feef
RGS
2740 if (trie->maxlen) {
2741 convert = n;
2742 } else {
3dab1dad 2743 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2744 DEBUG_r(optimize= n);
3dab1dad
YO
2745 }
2746 }
2747 }
a5ca303d
YO
2748 if (!jumper)
2749 jumper = last;
3dab1dad 2750 if ( trie->maxlen ) {
8e11feef
RGS
2751 NEXT_OFF( convert ) = (U16)(tail - convert);
2752 ARG_SET( convert, data_slot );
786e8c11
YO
2753 /* Store the offset to the first unabsorbed branch in
2754 jump[0], which is otherwise unused by the jump logic.
2755 We use this when dumping a trie and during optimisation. */
2756 if (trie->jump)
7f69552c 2757 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2758
6c48061a
YO
2759 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2760 * and there is a bitmap
2761 * and the first "jump target" node we found leaves enough room
2762 * then convert the TRIE node into a TRIEC node, with the bitmap
2763 * embedded inline in the opcode - this is hypothetically faster.
2764 */
2765 if ( !trie->states[trie->startstate].wordnum
2766 && trie->bitmap
2767 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2768 {
2769 OP( convert ) = TRIEC;
2770 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2771 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2772 trie->bitmap= NULL;
2773 } else
2774 OP( convert ) = TRIE;
a3621e74 2775
3dab1dad
YO
2776 /* store the type in the flags */
2777 convert->flags = nodetype;
a5ca303d
YO
2778 DEBUG_r({
2779 optimize = convert
2780 + NODE_STEP_REGNODE
2781 + regarglen[ OP( convert ) ];
2782 });
2783 /* XXX We really should free up the resource in trie now,
2784 as we won't use them - (which resources?) dmq */
3dab1dad 2785 }
a3621e74 2786 /* needed for dumping*/
e62cc96a 2787 DEBUG_r(if (optimize) {
07be1b83 2788 regnode *opt = convert;
bcdf7404 2789
e62cc96a 2790 while ( ++opt < optimize) {
07be1b83
YO
2791 Set_Node_Offset_Length(opt,0,0);
2792 }
786e8c11
YO
2793 /*
2794 Try to clean up some of the debris left after the
2795 optimisation.
a3621e74 2796 */
786e8c11 2797 while( optimize < jumper ) {
07be1b83 2798 mjd_nodelen += Node_Length((optimize));
a3621e74 2799 OP( optimize ) = OPTIMIZED;
07be1b83 2800 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2801 optimize++;
2802 }
07be1b83 2803 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2804 });
2805 } /* end node insert */
2e64971a
DM
2806
2807 /* Finish populating the prev field of the wordinfo array. Walk back
2808 * from each accept state until we find another accept state, and if
2809 * so, point the first word's .prev field at the second word. If the
2810 * second already has a .prev field set, stop now. This will be the
2811 * case either if we've already processed that word's accept state,
3b753521
FN
2812 * or that state had multiple words, and the overspill words were
2813 * already linked up earlier.
2e64971a
DM
2814 */
2815 {
2816 U16 word;
2817 U32 state;
2818 U16 prev;
2819
2820 for (word=1; word <= trie->wordcount; word++) {
2821 prev = 0;
2822 if (trie->wordinfo[word].prev)
2823 continue;
2824 state = trie->wordinfo[word].accept;
2825 while (state) {
2826 state = prev_states[state];
2827 if (!state)
2828 break;
2829 prev = trie->states[state].wordnum;
2830 if (prev)
2831 break;
2832 }
2833 trie->wordinfo[word].prev = prev;
2834 }
2835 Safefree(prev_states);
2836 }
2837
2838
2839 /* and now dump out the compressed format */
2840 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2841
55eed653 2842 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2843#ifdef DEBUGGING
2844 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2845 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2846#else
03e70be4 2847 SvREFCNT_dec_NN(revcharmap);
07be1b83 2848#endif
786e8c11
YO
2849 return trie->jump
2850 ? MADE_JUMP_TRIE
2851 : trie->startstate>1
2852 ? MADE_EXACT_TRIE
2853 : MADE_TRIE;
2854}
2855
2856STATIC void
2857S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2858{
b423522f
KW
2859/* The Trie is constructed and compressed now so we can build a fail array if
2860 * it's needed
786e8c11 2861
b423522f
KW
2862 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2863 3.32 in the
2864 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2865 Ullman 1985/88
786e8c11
YO
2866 ISBN 0-201-10088-6
2867
b423522f
KW
2868 We find the fail state for each state in the trie, this state is the longest
2869 proper suffix of the current state's 'word' that is also a proper prefix of
2870 another word in our trie. State 1 represents the word '' and is thus the
2871 default fail state. This allows the DFA not to have to restart after its
2872 tried and failed a word at a given point, it simply continues as though it
2873 had been matching the other word in the first place.
786e8c11
YO
2874 Consider
2875 'abcdgu'=~/abcdefg|cdgu/
b423522f
KW
2876 When we get to 'd' we are still matching the first word, we would encounter
2877 'g' which would fail, which would bring us to the state representing 'd' in
2878 the second word where we would try 'g' and succeed, proceeding to match
2879 'cdgu'.
786e8c11
YO
2880 */
2881 /* add a fail transition */
3251b653
NC
2882 const U32 trie_offset = ARG(source);
2883 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2884 U32 *q;
2885 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2886 const U32 numstates = trie->statecount;
786e8c11
YO
2887 const U32 ubound = trie->lasttrans + ucharcount;
2888 U32 q_read = 0;
2889 U32 q_write = 0;
2890 U32 charid;
2891 U32 base = trie->states[ 1 ].trans.base;
2892 U32 *fail;
2893 reg_ac_data *aho;
cf78de0b 2894 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
786e8c11 2895 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2896
2897 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2898#ifndef DEBUGGING
2899 PERL_UNUSED_ARG(depth);
2900#endif
2901
2902
2903 ARG_SET( stclass, data_slot );
c944940b 2904 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2905 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2906 aho->trie=trie_offset;
446bd890
NC
2907 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2908 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2909 Newxz( q, numstates, U32);
c944940b 2910 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2911 aho->refcount = 1;
2912 fail = aho->fail;
2913 /* initialize fail[0..1] to be 1 so that we always have
2914 a valid final fail state */
2915 fail[ 0 ] = fail[ 1 ] = 1;
2916
2917 for ( charid = 0; charid < ucharcount ; charid++ ) {
2918 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2919 if ( newstate ) {
2920 q[ q_write ] = newstate;
2921 /* set to point at the root */
2922 fail[ q[ q_write++ ] ]=1;
2923 }
2924 }
2925 while ( q_read < q_write) {
2926 const U32 cur = q[ q_read++ % numstates ];
2927 base = trie->states[ cur ].trans.base;
2928
2929 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2930 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2931 if (ch_state) {
2932 U32 fail_state = cur;
2933 U32 fail_base;
2934 do {
2935 fail_state = fail[ fail_state ];
2936 fail_base = aho->states[ fail_state ].trans.base;
2937 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2938
2939 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2940 fail[ ch_state ] = fail_state;
2941 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2942 {
2943 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2944 }
2945 q[ q_write++ % numstates] = ch_state;
2946 }
2947 }
2948 }
2949 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2950 when we fail in state 1, this allows us to use the
2951 charclass scan to find a valid start char. This is based on the principle
2952 that theres a good chance the string being searched contains lots of stuff
2953 that cant be a start char.
2954 */
2955 fail[ 0 ] = fail[ 1 ] = 0;
2956 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2957 PerlIO_printf(Perl_debug_log,
2958 "%*sStclass Failtable (%"UVuf" states): 0",
2959 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2960 );
786e8c11
YO
2961 for( q_read=1; q_read<numstates; q_read++ ) {
2962 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2963 }
2964 PerlIO_printf(Perl_debug_log, "\n");
2965 });
2966 Safefree(q);
2967 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2968}
2969
786e8c11 2970
07be1b83 2971#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2972 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2973 SV * const mysv=sv_newmortal(); \
2974 regnode *Next = regnext(scan); \
2975 regprop(RExC_rx, mysv, scan); \
7f69552c 2976 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2977 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2978 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2979 }});
07be1b83 2980
1de06328 2981
bb914485 2982/* The below joins as many adjacent EXACTish nodes as possible into a single
0a982f06
KW
2983 * one. The regop may be changed if the node(s) contain certain sequences that
2984 * require special handling. The joining is only done if:
bb914485
KW
2985 * 1) there is room in the current conglomerated node to entirely contain the
2986 * next one.
2987 * 2) they are the exact same node type
2988 *
87b8b349 2989 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
bb914485
KW
2990 * these get optimized out
2991 *
0a982f06
KW
2992 * If a node is to match under /i (folded), the number of characters it matches
2993 * can be different than its character length if it contains a multi-character
2994 * fold. *min_subtract is set to the total delta of the input nodes.
bb914485 2995 *
a0c4c608
KW
2996 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2997 * and contains LATIN SMALL LETTER SHARP S
f758bddf 2998 *
bb914485 2999 * This is as good a place as any to discuss the design of handling these
0a982f06
KW
3000 * multi-character fold sequences. It's been wrong in Perl for a very long
3001 * time. There are three code points in Unicode whose multi-character folds
3002 * were long ago discovered to mess things up. The previous designs for
3003 * dealing with these involved assigning a special node for them. This
3004 * approach doesn't work, as evidenced by this example:
a0c4c608 3005 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
0a982f06
KW
3006 * Both these fold to "sss", but if the pattern is parsed to create a node that
3007 * would match just the \xDF, it won't be able to handle the case where a
bb914485
KW
3008 * successful match would have to cross the node's boundary. The new approach
3009 * that hopefully generally solves the problem generates an EXACTFU_SS node
3010 * that is "sss".
3011 *
0a982f06 3012 * It turns out that there are problems with all multi-character folds, and not
cb117658
KW
3013 * just these three. Now the code is general, for all such cases. The
3014 * approach taken is:
0a982f06
KW
3015 * 1) This routine examines each EXACTFish node that could contain multi-
3016 * character fold sequences. It returns in *min_subtract how much to
9d071ca8 3017 * subtract from the the actual length of the string to get a real minimum
0a982f06
KW
3018 * match length; it is 0 if there are no multi-char folds. This delta is
3019 * used by the caller to adjust the min length of the match, and the delta
3020 * between min and max, so that the optimizer doesn't reject these
3021 * possibilities based on size constraints.
cb117658 3022 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
0a982f06
KW
3023 * is used for an EXACTFU node that contains at least one "ss" sequence in
3024 * it. For non-UTF-8 patterns and strings, this is the only case where
3025 * there is a possible fold length change. That means that a regular
3026 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3027 * with length changes, and so can be processed faster. regexec.c takes
3028 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3029 * pre-folded by regcomp.c. This saves effort in regex matching.
87b8b349 3030 * However, the pre-folding isn't done for non-UTF8 patterns because the
0a982f06
KW
3031 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
3032 * down by forcing the pattern into UTF8 unless necessary. Also what
3033 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
3034 * possibilities for the non-UTF8 patterns are quite simple, except for
3035 * the sharp s. All the ones that don't involve a UTF-8 target string are
3036 * members of a fold-pair, and arrays are set up for all of them so that
3037 * the other member of the pair can be found quickly. Code elsewhere in
3038 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3039 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3040 * described in the next item.
098b07d5 3041 * 3) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
1ca267a5
KW
3042 * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
3043 * UTF-8 pattern.) An assumption that the optimizer part of regexec.c
3044 * (probably unwittingly, in Perl_regexec_flags()) makes is that a
3045 * character in the pattern corresponds to at most a single character in
3046 * the target string. (And I do mean character, and not byte here, unlike
3047 * other parts of the documentation that have never been updated to
3048 * account for multibyte Unicode.) sharp s in EXACTF nodes can match the
3049 * two character string 'ss'; in EXACTFA nodes it can match
3050 * "\x{17F}\x{17F}". These violate the assumption, and they are the only
3051 * instances where it is violated. I'm reluctant to try to change the
3052 * assumption, as the code involved is impenetrable to me (khw), so
3053 * instead the code here punts. This routine examines (when the pattern
3054 * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
3055 * boolean indicating whether or not the node contains a sharp s. When it
3056 * is true, the caller sets a flag that later causes the optimizer in this
3057 * file to not set values for the floating and fixed string lengths, and
3058 * thus avoids the optimizer code in regexec.c that makes the invalid
3059 * assumption. Thus, there is no optimization based on string lengths for
3060 * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
3061 * (The reason the assumption is wrong only in these two cases is that all
3062 * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
3063 * other folds to their expanded versions. We can't prefold sharp s to
3064 * 'ss' in EXACTF nodes because we don't know at compile time if it
3065 * actually matches 'ss' or not. It will match iff the target string is
3066 * in UTF-8, unlike the EXACTFU nodes, where it always matches; and
3067 * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8
3068 * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
3069 * but in a non-UTF8 pattern, folding it to that above-Latin1 string would
3070 * require the pattern to be forced into UTF-8, the overhead of which we
3071 * want to avoid.)
098b07d5
KW
3072 *
3073 * Similarly, the code that generates tries doesn't currently handle
3074 * not-already-folded multi-char folds, and it looks like a pain to change
3075 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3076 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3077 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3078 * using /iaa matching will be doing so almost entirely with ASCII
3079 * strings, so this should rarely be encountered in practice */
1de06328 3080
9d071ca8 3081#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
07be1b83 3082 if (PL_regkind[OP(scan)] == EXACT) \
9d071ca8 3083 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
07be1b83 3084
be8e71aa 3085STATIC U32
9d071ca8 3086S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
07be1b83
YO
3087 /* Merge several consecutive EXACTish nodes into one. */
3088 regnode *n = regnext(scan);
3089 U32 stringok = 1;
3090 regnode *next = scan + NODE_SZ_STR(scan);
3091 U32 merged = 0;
3092 U32 stopnow = 0;
3093#ifdef DEBUGGING
3094 regnode *stop = scan;
72f13be8 3095 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 3096#else
d47053eb
RGS
3097 PERL_UNUSED_ARG(depth);
3098#endif
7918f24d
NC
3099
3100 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 3101#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
3102 PERL_UNUSED_ARG(flags);
3103 PERL_UNUSED_ARG(val);
07be1b83 3104#endif
07be1b83 3105 DEBUG_PEEP("join",scan,depth);
bb914485 3106
3f410cf6
KW
3107 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3108 * EXACT ones that are mergeable to the current one. */
3109 while (n
3110 && (PL_regkind[OP(n)] == NOTHING
3111 || (stringok && OP(n) == OP(scan)))
07be1b83 3112 && NEXT_OFF(n)
3f410cf6
KW
3113 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3114 {
07be1b83
YO
3115
3116 if (OP(n) == TAIL || n > next)
3117 stringok = 0;
3118 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
3119 DEBUG_PEEP("skip:",n,depth);
3120 NEXT_OFF(scan) += NEXT_OFF(n);
3121 next = n + NODE_STEP_REGNODE;
3122#ifdef DEBUGGING
3123 if (stringok)
3124 stop = n;
3125#endif
3126 n = regnext(n);
3127 }
3128 else if (stringok) {
786e8c11 3129 const unsigned int oldl = STR_LEN(scan);
07be1b83 3130 regnode * const nnext = regnext(n);
b2230d39 3131
87b8b349
KW
3132 /* XXX I (khw) kind of doubt that this works on platforms where
3133 * U8_MAX is above 255 because of lots of other assumptions */
79a81a6e 3134 /* Don't join if the sum can't fit into a single node */
b2230d39
KW
3135 if (oldl + STR_LEN(n) > U8_MAX)
3136 break;
07be1b83
YO
3137
3138 DEBUG_PEEP("merg",n,depth);
07be1b83 3139 merged++;
b2230d39 3140
07be1b83
YO
3141 NEXT_OFF(scan) += NEXT_OFF(n);
3142 STR_LEN(scan) += STR_LEN(n);
3143 next = n + NODE_SZ_STR(n);
3144 /* Now we can overwrite *n : */
3145 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3146#ifdef DEBUGGING
3147 stop = next - 1;
3148#endif
3149 n = nnext;
3150 if (stopnow) break;
3151 }
3152
d47053eb
RGS
3153#ifdef EXPERIMENTAL_INPLACESCAN
3154 if (flags && !NEXT_OFF(n)) {
3155 DEBUG_PEEP("atch", val, depth);
3156 if (reg_off_by_arg[OP(n)]) {
3157 ARG_SET(n, val - n);
3158 }
3159 else {
3160 NEXT_OFF(n) = val - n;
3161 }
3162 stopnow = 1;
3163 }
07be1b83
YO
3164#endif
3165 }
2c2b7f86 3166
9d071ca8 3167 *min_subtract = 0;
f758bddf 3168 *has_exactf_sharp_s = FALSE;
f646642f 3169
3f410cf6
KW
3170 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3171 * can now analyze for sequences of problematic code points. (Prior to
3172 * this final joining, sequences could have been split over boundaries, and
a0c4c608
KW
3173 * hence missed). The sequences only happen in folding, hence for any
3174 * non-EXACT EXACTish node */
86d6fcad 3175 if (OP(scan) != EXACT) {
0a982f06
KW
3176 const U8 * const s0 = (U8*) STRING(scan);
3177 const U8 * s = s0;
3178 const U8 * const s_end = s0 + STR_LEN(scan);
f758bddf
KW
3179
3180 /* One pass is made over the node's string looking for all the
3181 * possibilities. to avoid some tests in the loop, there are two main
3182 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3183 * non-UTF-8 */
3184 if (UTF) {
86d6fcad 3185
0a982f06
KW
3186 /* Examine the string for a multi-character fold sequence. UTF-8
3187 * patterns have all characters pre-folded by the time this code is
3188 * executed */
3189 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3190 length sequence we are looking for is 2 */
86d6fcad 3191 {
0a982f06
KW
3192 int count = 0;
3193 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3194 if (! len) { /* Not a multi-char fold: get next char */
3195 s += UTF8SKIP(s);
3196 continue;
3197 }
bb914485 3198
0a982f06 3199 /* Nodes with 'ss' require special handling, except for EXACTFL
098b07d5
KW
3200 * and EXACTFA-ish for which there is no multi-char fold to
3201 * this */
0a982f06 3202 if (len == 2 && *s == 's' && *(s+1) == 's'
098b07d5
KW
3203 && OP(scan) != EXACTFL
3204 && OP(scan) != EXACTFA
3205 && OP(scan) != EXACTFA_NO_TRIE)
0a982f06
KW
3206 {
3207 count = 2;
3208 OP(scan) = EXACTFU_SS;
3209 s += 2;
3210 }
0a982f06
KW
3211 else { /* Here is a generic multi-char fold. */
3212 const U8* multi_end = s + len;
3213
3214 /* Count how many characters in it. In the case of /l and
3215 * /aa, no folds which contain ASCII code points are
3216 * allowed, so check for those, and skip if found. (In
3217 * EXACTFL, no folds are allowed to any Latin1 code point,
3218 * not just ASCII. But there aren't any of these
3219 * currently, nor ever likely, so don't take the time to
3220 * test for them. The code that generates the
3221 * is_MULTI_foo() macros croaks should one actually get put
3222 * into Unicode .) */
098b07d5
KW
3223 if (OP(scan) != EXACTFL
3224 && OP(scan) != EXACTFA
3225 && OP(scan) != EXACTFA_NO_TRIE)
3226 {
0a982f06
KW
3227 count = utf8_length(s, multi_end);
3228 s = multi_end;
3229 }
3230 else {
3231 while (s < multi_end) {
3232 if (isASCII(*s)) {
3233 s++;
3234 goto next_iteration;
3235 }
3236 else {
3237 s += UTF8SKIP(s);
3238 }
3239 count++;
3240 }
3241 }
3242 }
f758bddf 3243
0a982f06
KW
3244 /* The delta is how long the sequence is minus 1 (1 is how long
3245 * the character that folds to the sequence is) */
3246 *min_subtract += count - 1;
3247 next_iteration: ;
bb914485
KW
3248 }
3249 }
1ca267a5
KW
3250 else if (OP(scan) == EXACTFA) {
3251
3252 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3253 * fold to the ASCII range (and there are no existing ones in the
3254 * upper latin1 range). But, as outlined in the comments preceding
098b07d5
KW
3255 * this function, we need to flag any occurrences of the sharp s.
3256 * This character forbids trie formation (because of added
3257 * complexity) */
1ca267a5
KW
3258 while (s < s_end) {
3259 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
098b07d5 3260 OP(scan) = EXACTFA_NO_TRIE;
1ca267a5
KW
3261 *has_exactf_sharp_s = TRUE;
3262 break;
3263 }
3264 s++;
3265 continue;
3266 }
3267 }
3268 else if (OP(scan) != EXACTFL) {
3269
3270 /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the
3271 * multi-char folds that are all Latin1. (This code knows that
3272 * there are no current multi-char folds possible with EXACTFL,
3273 * relying on fold_grind.t to catch any errors if the very unlikely
3274 * event happens that some get added in future Unicode versions.)
3275 * As explained in the comments preceding this function, we look
3276 * also for the sharp s in EXACTF nodes; it can be in the final
0a982f06
KW
3277 * position. Otherwise we can stop looking 1 byte earlier because
3278 * have to find at least two characters for a multi-fold */
f758bddf
KW
3279 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
3280
0a982f06 3281 while (s < upper) {
40b1ba4f 3282 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
0a982f06
KW
3283 if (! len) { /* Not a multi-char fold. */
3284 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
3285 {
3286 *has_exactf_sharp_s = TRUE;
3287 }
3288 s++;
3289 continue;
3290 }
3291
3292 if (len == 2
c02c3054
KW
3293 && isARG2_lower_or_UPPER_ARG1('s', *s)
3294 && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
0a982f06
KW
3295 {
3296
3297 /* EXACTF nodes need to know that the minimum length
3298 * changed so that a sharp s in the string can match this
3299 * ss in the pattern, but they remain EXACTF nodes, as they
3300 * won't match this unless the target string is is UTF-8,
3301 * which we don't know until runtime */
3302 if (OP(scan) != EXACTF) {
3303 OP(scan) = EXACTFU_SS;
3304 }
86d6fcad 3305 }
0a982f06
KW
3306
3307 *min_subtract += len - 1;
3308 s += len;
86d6fcad
KW
3309 }
3310 }
07be1b83 3311 }
3f410cf6 3312
07be1b83 3313#ifdef DEBUGGING
bb789b09
DM
3314 /* Allow dumping but overwriting the collection of skipped
3315 * ops and/or strings with fake optimized ops */
07be1b83
YO
3316 n = scan + NODE_SZ_STR(scan);
3317 while (n <= stop) {
bb789b09
DM
3318 OP(n) = OPTIMIZED;
3319 FLAGS(n) = 0;
3320 NEXT_OFF(n) = 0;
07be1b83
YO
3321 n++;
3322 }
3323#endif
3324 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3325 return stopnow;
3326}
3327
486ec47a 3328/* REx optimizer. Converts nodes into quicker variants "in place".
653099ff
GS
3329 Finds fixed substrings. */
3330
a0288114 3331/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
3332 to the position after last scanned or to NULL. */
3333
40d049e4
YO
3334#define INIT_AND_WITHP \
3335 assert(!and_withp); \
b8f7bb16 3336 Newx(and_withp,1, regnode_ssc); \
40d049e4 3337 SAVEFREEPV(and_withp)
07be1b83 3338
b515a41d 3339/* this is a chain of data about sub patterns we are processing that
486ec47a 3340 need to be handled separately/specially in study_chunk. Its so
b515a41d
YO
3341 we can simulate recursion without losing state. */
3342struct scan_frame;
3343typedef struct scan_frame {
3344 regnode *last; /* last node to process in this frame */
3345 regnode *next; /* next node to process when last is reached */
3346 struct scan_frame *prev; /*previous frame*/
09a65838 3347 U32 prev_recursed_depth;
b515a41d
YO
3348 I32 stop; /* what stopparen do we use */
3349} scan_frame;
3350
304ee84b
YO
3351
3352#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3353
49f55535 3354STATIC SSize_t
40d049e4 3355S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
ea3daa5d 3356 SSize_t *minlenp, SSize_t *deltap,
40d049e4
YO
3357 regnode *last,
3358 scan_data_t *data,
3359 I32 stopparen,
09a65838 3360 U32 recursed_depth,
b8f7bb16 3361 regnode_ssc *and_withp,
40d049e4 3362 U32 flags, U32 depth)
c277df42
IZ
3363 /* scanp: Start here (read-write). */
3364 /* deltap: Write maxlen-minlen here. */
3365 /* last: Stop before this one. */
40d049e4
YO
3366 /* data: string data about the pattern */
3367 /* stopparen: treat close N as END */
3368 /* recursed: which subroutines have we recursed into */
3369 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 3370{
97aff369 3371 dVAR;
ea3daa5d
FC
3372 /* There must be at least this number of characters to match */
3373 SSize_t min = 0;
2d608413 3374 I32 pars = 0, code;
c277df42 3375 regnode *scan = *scanp, *next;
ea3daa5d 3376 SSize_t delta = 0;
c277df42 3377 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 3378 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
3379 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3380 scan_data_t data_fake;
a3621e74 3381 SV *re_trie_maxbuff = NULL;
786e8c11 3382 regnode *first_non_open = scan;
ea3daa5d 3383 SSize_t stopmin = SSize_t_MAX;
8aa23a47 3384 scan_frame *frame = NULL;
a3621e74 3385 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 3386
7918f24d
NC
3387 PERL_ARGS_ASSERT_STUDY_CHUNK;
3388
13a24bad 3389#ifdef DEBUGGING
40d049e4 3390 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 3391#endif
786e8c11 3392 if ( depth == 0 ) {
40d049e4 3393 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
3394 first_non_open=regnext(first_non_open);
3395 }
3396
b81d288d 3397
8aa23a47
YO
3398 fake_study_recurse:
3399 while ( scan && OP(scan) != END && scan < last ){
2d608413
KW
3400 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3401 node length to get a real minimum (because
3402 the folded version may be shorter) */
f758bddf 3403 bool has_exactf_sharp_s = FALSE;
8aa23a47 3404 /* Peephole optimizer: */
09a65838
YO
3405 DEBUG_OPTIMISE_MORE_r(
3406 {
3407 PerlIO_printf(Perl_debug_log,"%*sstudy_chunk stopparen=%d depth=%u recursed_depth=%u ",
3408 (depth*2), "", stopparen, depth, recursed_depth);
3409 if (recursed_depth) {
3410 U32 i;
3411 U32 j;
3412 for ( j = 0 ; j < recursed_depth ; j++ ) {
3413 PerlIO_printf(Perl_debug_log,"[");
9e9ecfdf 3414 for ( i = 0 ; i < (U32)RExC_npar ; i++ )
09a65838
YO
3415 PerlIO_printf(Perl_debug_log,"%d",
3416 PAREN_TEST(RExC_study_chunk_recursed +
3417 (j * RExC_study_chunk_recursed_bytes), i)
3418 ? 1 : 0
3419 );
3420 PerlIO_printf(Perl_debug_log,"]");
3421 }
0ac75573 3422 }
09a65838
YO
3423 PerlIO_printf(Perl_debug_log,"\n");
3424 }
3425 );
0ac75573
YO
3426 DEBUG_STUDYDATA("Peep:", data, depth);
3427 DEBUG_PEEP("Peep", scan, depth);
3428
a0c4c608
KW
3429
3430 /* Its not clear to khw or hv why this is done here, and not in the
3431 * clauses that deal with EXACT nodes. khw's guess is that it's
3432 * because of a previous design */
9d071ca8 3433 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
8aa23a47
YO
3434
3435 /* Follow the next-chain of the current node and optimize
3436 away all the NOTHINGs from it. */
3437 if (OP(scan) != CURLYX) {
3438 const int max = (reg_off_by_arg[OP(scan)]
3439 ? I32_MAX
3440 /* I32 may be smaller than U16 on CRAYs! */
3441 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3442 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3443 int noff;
3444 regnode *n = scan;
686b73d4 3445
8aa23a47
YO
3446 /* Skip NOTHING and LONGJMP. */
3447 while ((n = regnext(n))
3448 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3449 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3450 && off + noff < max)
3451 off += noff;
3452 if (reg_off_by_arg[OP(scan)])
3453 ARG(scan) = off;
3454 else
3455 NEXT_OFF(scan) = off;
3456 }
a3621e74 3457
c277df42 3458
8aa23a47
YO
3459
3460 /* The principal pseudo-switch. Cannot be a switch, since we
3461 look into several different things. */
3462 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3463 || OP(scan) == IFTHEN) {
3464 next = regnext(scan);
3465 code = OP(scan);
3466 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
686b73d4 3467
8aa23a47 3468 if (OP(next) == code || code == IFTHEN) {
b423522f
KW
3469 /* NOTE - There is similar code to this block below for
3470 * handling TRIE nodes on a re-study. If you change stuff here
3471 * check there too. */
49f55535 3472 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
b8f7bb16 3473 regnode_ssc accum;
8aa23a47 3474 regnode * const startbranch=scan;
686b73d4 3475
8aa23a47 3476 if (flags & SCF_DO_SUBSTR)
304ee84b 3477 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47 3478 if (flags & SCF_DO_STCLASS)
557bd3fb 3479 ssc_init_zero(pRExC_state, &accum);
8aa23a47
YO
3480
3481 while (OP(scan) == code) {
49f55535
FC
3482 SSize_t deltanext, minnext, fake;
3483 I32 f = 0;
b8f7bb16 3484 regnode_ssc this_class;
8aa23a47
YO
3485
3486 num++;
3487 data_fake.flags = 0;
3488 if (data) {
3489 data_fake.whilem_c = data->whilem_c;
3490 data_fake.last_closep = data->last_closep;
3491 }
3492 else
3493 data_fake.last_closep = &fake;
58e23c8d
YO
3494
3495 data_fake.pos_delta = delta;
8aa23a47
YO
3496 next = regnext(scan);
3497 scan = NEXTOPER(scan);
3498 if (code != BRANCH)
c277df42 3499 scan = NEXTOPER(scan);
8aa23a47 3500 if (flags & SCF_DO_STCLASS) {
557bd3fb 3501 ssc_init(pRExC_state, &this_class);
8aa23a47
YO
3502 data_fake.start_class = &this_class;
3503 f = SCF_DO_STCLASS_AND;
58e23c8d 3504 }
8aa23a47
YO
3505 if (flags & SCF_WHILEM_VISITED_POS)
3506 f |= SCF_WHILEM_VISITED_POS;
3507
3508 /* we suppose the run is continuous, last=next...*/
3509 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3510 next, &data_fake,
09a65838 3511 stopparen, recursed_depth, NULL, f,depth+1);
8aa23a47
YO
3512 if (min1 > minnext)
3513 min1 = minnext;
ea3daa5d 3514 if (deltanext == SSize_t_MAX) {
8aa23a47 3515 is_inf = is_inf_internal = 1;
ea3daa5d 3516 max1 = SSize_t_MAX;
9b139d09
GG
3517 } else if (max1 < minnext + deltanext)
3518 max1 = minnext + deltanext;
8aa23a47
YO
3519 scan = next;
3520 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3521 pars++;
3522 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3523 if ( stopmin > minnext)
3524 stopmin = min + min1;
3525 flags &= ~SCF_DO_SUBSTR;
3526 if (data)
3527 data->flags |= SCF_SEEN_ACCEPT;
3528 }
3529 if (data) {
3530 if (data_fake.flags & SF_HAS_EVAL)
3531 data->flags |= SF_HAS_EVAL;
3532 data->whilem_c = data_fake.whilem_c;
3dab1dad 3533 }
8aa23a47 3534 if (flags & SCF_DO_STCLASS)
557bd3fb 3535 ssc_or(pRExC_state, &accum, &this_class);
8aa23a47
YO
3536 }
3537 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3538 min1 = 0;
3539 if (flags & SCF_DO_SUBSTR) {
3540 data->pos_min += min1;
ea3daa5d
FC
3541 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3542 data->pos_delta = SSize_t_MAX;
9b139d09
GG
3543 else
3544 data->pos_delta += max1 - min1;
8aa23a47
YO
3545 if (max1 != min1 || is_inf)
3546 data->longest = &(data->longest_float);
3547 }
3548 min += min1;
ea3daa5d
FC
3549 if (delta == SSize_t_MAX
3550 || SSize_t_MAX - delta - (max1 - min1) < 0)
3551 delta = SSize_t_MAX;
9b139d09
GG
3552 else
3553 delta += max1 - min1;
8aa23a47 3554 if (flags & SCF_DO_STCLASS_OR) {
557bd3fb 3555 ssc_or(pRExC_state, data->start_class, &accum);
8aa23a47 3556 if (min1) {
1ca93ef1 3557 ssc_and(pRExC_state, data->start_class, and_withp);
8aa23a47 3558 flags &= ~SCF_DO_STCLASS;
653099ff 3559 }
8aa23a47
YO
3560 }
3561 else if (flags & SCF_DO_STCLASS_AND) {
3562 if (min1) {
1ca93ef1 3563 ssc_and(pRExC_state, data->start_class, &accum);
8aa23a47 3564 flags &= ~SCF_DO_STCLASS;
de0c8cb8 3565 }
8aa23a47
YO
3566 else {
3567 /* Switch to OR mode: cache the old value of
3568 * data->start_class */
3569 INIT_AND_WITHP;
b8f7bb16 3570 StructCopy(data->start_class, and_withp, regnode_ssc);
8aa23a47 3571 flags &= ~SCF_DO_STCLASS_AND;
b8f7bb16 3572 StructCopy(&accum, data->start_class, regnode_ssc);
8aa23a47 3573 flags |= SCF_DO_STCLASS_OR;
de0c8cb8 3574 }
8aa23a47 3575 }
a3621e74 3576
8aa23a47
YO
3577 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3578 /* demq.
a3621e74 3579
b423522f
KW
3580 Assuming this was/is a branch we are dealing with: 'scan'
3581 now points at the item that follows the branch sequence,
3582 whatever it is. We now start at the beginning of the
3583 sequence and look for subsequences of
a3621e74 3584
8aa23a47
YO
3585 BRANCH->EXACT=>x1
3586 BRANCH->EXACT=>x2
3587 tail
a3621e74 3588
b423522f
KW
3589 which would be constructed from a pattern like
3590 /A|LIST|OF|WORDS/
a3621e74 3591
486ec47a 3592 If we can find such a subsequence we need to turn the first
8aa23a47
YO
3593 element into a trie and then add the subsequent branch exact
3594 strings to the trie.
a3621e74 3595
8aa23a47 3596 We have two cases
a3621e74 3597
b423522f
KW
3598 1. patterns where the whole set of branches can be
3599 converted.
a3621e74