This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regex sets: fix Solaris optimiser bug (2nd attempt)
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
afda64e8 84EXTERN_C const struct regexp_engine my_reg_engine;
54df2634
NC
85#else
86# include "regcomp.h"
87#endif
a687059c 88
f7e03a10 89#include "dquote_inline.h"
b992490d 90#include "invlist_inline.h"
1b0f46bf 91#include "unicode_constants.h"
04e98a4d 92
538e84ed
KW
93#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
94 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
f12c0118
KW
95#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
96 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 97#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 98#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 99
a687059c
LW
100#ifndef STATIC
101#define STATIC static
102#endif
103
dea37815
KW
104#ifndef MIN
105#define MIN(a,b) ((a) < (b) ? (a) : (b))
106#endif
b1603ef8 107
3001ef3e
KW
108#ifndef MAX
109#define MAX(a,b) ((a) > (b) ? (a) : (b))
110#endif
111
3f910e62
YO
112/* this is a chain of data about sub patterns we are processing that
113 need to be handled separately/specially in study_chunk. Its so
114 we can simulate recursion without losing state. */
115struct scan_frame;
116typedef struct scan_frame {
117 regnode *last_regnode; /* last node to process in this frame */
118 regnode *next_regnode; /* next node to process when last is reached */
119 U32 prev_recursed_depth;
120 I32 stopparen; /* what stopparen do we use */
121 U32 is_top_frame; /* what flags do we use? */
122
123 struct scan_frame *this_prev_frame; /* this previous frame */
124 struct scan_frame *prev_frame; /* previous frame */
125 struct scan_frame *next_frame; /* next frame */
126} scan_frame;
127
f4ae5a27
KW
128/* Certain characters are output as a sequence with the first being a
129 * backslash. */
130#define isBACKSLASHED_PUNCT(c) \
131 ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
132
133
09b2b2e6 134struct RExC_state_t {
514a91f1
DM
135 U32 flags; /* RXf_* are we folding, multilining? */
136 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 137 char *precomp; /* uncompiled string. */
711b303b 138 char *precomp_end; /* pointer to end of uncompiled string. */
288b8c02 139 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf 140 regexp *rx; /* perl core regexp structure */
538e84ed
KW
141 regexp_internal *rxi; /* internal data for regexp object
142 pprivate field */
fac92740 143 char *start; /* Start of input for compile */
830247a4
IZ
144 char *end; /* End of input for compile */
145 char *parse; /* Input-scan pointer. */
285b5ca0
KW
146 char *adjusted_start; /* 'start', adjusted. See code use */
147 STRLEN precomp_adj; /* an offset beyond precomp. See code use */
ea3daa5d 148 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 149 regnode *emit_start; /* Start of emitted-code area */
538e84ed
KW
150 regnode *emit_bound; /* First regnode outside of the
151 allocated space */
f7c7e32a
DM
152 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
153 implies compiling, so don't emit */
9a81a976
KW
154 regnode_ssc emit_dummy; /* placeholder for emit to point to;
155 large enough for the largest
156 non-EXACTish node, so can use it as
157 scratch in pass1 */
830247a4
IZ
158 I32 naughty; /* How bad is this pattern? */
159 I32 sawback; /* Did we see \1, ...? */
160 U32 seen;
ea3daa5d 161 SSize_t size; /* Code size. */
538e84ed
KW
162 I32 npar; /* Capture buffer count, (OPEN) plus
163 one. ("par" 0 is the whole
164 pattern)*/
165 I32 nestroot; /* root parens we are in - used by
166 accept */
830247a4
IZ
167 I32 extralen;
168 I32 seen_zerolen;
40d049e4
YO
169 regnode **open_parens; /* pointers to open parens */
170 regnode **close_parens; /* pointers to close parens */
d5a00e4a 171 regnode *end_op; /* END node in program */
02daf0ab
YO
172 I32 utf8; /* whether the pattern is utf8 or not */
173 I32 orig_utf8; /* whether the pattern was originally in utf8 */
174 /* XXX use this for future optimisation of case
175 * where pattern must be upgraded to utf8. */
e40e74fe
KW
176 I32 uni_semantics; /* If a d charset modifier should use unicode
177 rules, even if the pattern is not in
178 utf8 */
81714fb9 179 HV *paren_names; /* Paren names */
538e84ed 180
40d049e4 181 regnode **recurse; /* Recurse regops */
d5a00e4a 182 I32 recurse_count; /* Number of recurse regops we have generated */
4286711a 183 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
538e84ed 184 through */
09a65838 185 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 186 I32 in_lookbehind;
4624b182 187 I32 contains_locale;
cfafade5 188 I32 contains_i;
bb3f3ed2 189 I32 override_recoding;
b6d67071
KW
190#ifdef EBCDIC
191 I32 recode_x_to_native;
192#endif
9d53c457 193 I32 in_multi_char_class;
3d2bd50a 194 struct reg_code_block *code_blocks; /* positions of literal (?{})
68e2671b 195 within pattern */
b1603ef8
DM
196 int num_code_blocks; /* size of code_blocks[] */
197 int code_index; /* next code_blocks[] slot */
ee273784 198 SSize_t maxlen; /* mininum possible number of chars in string to match */
3f910e62
YO
199 scan_frame *frame_head;
200 scan_frame *frame_last;
201 U32 frame_count;
dc6d7f5c 202#ifdef ADD_TO_REGEXEC
830247a4
IZ
203 char *starttry; /* -Dr: where regtry was called. */
204#define RExC_starttry (pRExC_state->starttry)
205#endif
d24ca0c5 206 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 207#ifdef DEBUGGING
be8e71aa 208 const char *lastparse;
3dab1dad 209 I32 lastnum;
1f1031fe 210 AV *paren_name_list; /* idx -> name */
d9a72fcc 211 U32 study_chunk_recursed_count;
c9f0d54c
YO
212 SV *mysv1;
213 SV *mysv2;
3dab1dad
YO
214#define RExC_lastparse (pRExC_state->lastparse)
215#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 216#define RExC_paren_name_list (pRExC_state->paren_name_list)
d9a72fcc 217#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
c9f0d54c
YO
218#define RExC_mysv (pRExC_state->mysv1)
219#define RExC_mysv1 (pRExC_state->mysv1)
220#define RExC_mysv2 (pRExC_state->mysv2)
221
3dab1dad 222#endif
512c0f5a 223 bool seen_unfolded_sharp_s;
911bd04e 224 bool strict;
09b2b2e6 225};
830247a4 226
e2509266 227#define RExC_flags (pRExC_state->flags)
514a91f1 228#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 229#define RExC_precomp (pRExC_state->precomp)
285b5ca0
KW
230#define RExC_precomp_adj (pRExC_state->precomp_adj)
231#define RExC_adjusted_start (pRExC_state->adjusted_start)
711b303b 232#define RExC_precomp_end (pRExC_state->precomp_end)
288b8c02 233#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 234#define RExC_rx (pRExC_state->rx)
f8fc2ecf 235#define RExC_rxi (pRExC_state->rxi)
fac92740 236#define RExC_start (pRExC_state->start)
830247a4
IZ
237#define RExC_end (pRExC_state->end)
238#define RExC_parse (pRExC_state->parse)
239#define RExC_whilem_seen (pRExC_state->whilem_seen)
512c0f5a
KW
240
241/* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
242 * EXACTF node, hence was parsed under /di rules. If later in the parse,
243 * something forces the pattern into using /ui rules, the sharp s should be
244 * folded into the sequence 'ss', which takes up more space than previously
245 * calculated. This means that the sizing pass needs to be restarted. (The
246 * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node
247 * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
248 * so there is no need to resize [perl #125990]. */
249#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
250
7122b237 251#ifdef RE_TRACK_PATTERN_OFFSETS
538e84ed
KW
252#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
253 others */
7122b237 254#endif
830247a4 255#define RExC_emit (pRExC_state->emit)
f7c7e32a 256#define RExC_emit_dummy (pRExC_state->emit_dummy)
fac92740 257#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 258#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
259#define RExC_sawback (pRExC_state->sawback)
260#define RExC_seen (pRExC_state->seen)
261#define RExC_size (pRExC_state->size)
ee273784 262#define RExC_maxlen (pRExC_state->maxlen)
830247a4 263#define RExC_npar (pRExC_state->npar)
e2e6a0f1 264#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
265#define RExC_extralen (pRExC_state->extralen)
266#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 267#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 268#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 269#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
270#define RExC_open_parens (pRExC_state->open_parens)
271#define RExC_close_parens (pRExC_state->close_parens)
5bd2d46e 272#define RExC_end_op (pRExC_state->end_op)
81714fb9 273#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
274#define RExC_recurse (pRExC_state->recurse)
275#define RExC_recurse_count (pRExC_state->recurse_count)
09a65838 276#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
538e84ed
KW
277#define RExC_study_chunk_recursed_bytes \
278 (pRExC_state->study_chunk_recursed_bytes)
b57e4118 279#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 280#define RExC_contains_locale (pRExC_state->contains_locale)
cfafade5 281#define RExC_contains_i (pRExC_state->contains_i)
9d53c457 282#define RExC_override_recoding (pRExC_state->override_recoding)
b6d67071
KW
283#ifdef EBCDIC
284# define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
285#endif
9d53c457 286#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
3f910e62
YO
287#define RExC_frame_head (pRExC_state->frame_head)
288#define RExC_frame_last (pRExC_state->frame_last)
289#define RExC_frame_count (pRExC_state->frame_count)
67cdf558 290#define RExC_strict (pRExC_state->strict)
830247a4 291
99807a43
HS
292/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
293 * a flag to disable back-off on the fixed/floating substrings - if it's
294 * a high complexity pattern we assume the benefit of avoiding a full match
295 * is worth the cost of checking for the substrings even if they rarely help.
296 */
297#define RExC_naughty (pRExC_state->naughty)
298#define TOO_NAUGHTY (10)
299#define MARK_NAUGHTY(add) \
300 if (RExC_naughty < TOO_NAUGHTY) \
301 RExC_naughty += (add)
302#define MARK_NAUGHTY_EXP(exp, add) \
303 if (RExC_naughty < TOO_NAUGHTY) \
304 RExC_naughty += RExC_naughty / (exp) + (add)
cde0cee5 305
a687059c
LW
306#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
307#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
412f55bb 308 ((*s) == '{' && regcurly(s)))
a687059c
LW
309
310/*
311 * Flags to be passed up and down.
312 */
a687059c 313#define WORST 0 /* Worst case. */
a3b492c3 314#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee 315
e64f369d 316/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
317 * character. (There needs to be a case: in the switch statement in regexec.c
318 * for any node marked SIMPLE.) Note that this is not the same thing as
319 * REGNODE_SIMPLE */
fda99bee 320#define SIMPLE 0x02
e64f369d 321#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
322#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
323#define TRYAGAIN 0x10 /* Weeded out a declaration. */
b97943f7
KW
324#define RESTART_PASS1 0x20 /* Need to restart sizing pass */
325#define NEED_UTF8 0x40 /* In conjunction with RESTART_PASS1, need to
326 calcuate sizes as UTF-8 */
a687059c 327
3dab1dad
YO
328#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
329
07be1b83
YO
330/* whether trie related optimizations are enabled */
331#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
332#define TRIE_STUDY_OPT
786e8c11 333#define FULL_TRIE_STUDY
07be1b83
YO
334#define TRIE_STCLASS
335#endif
1de06328
YO
336
337
40d049e4
YO
338
339#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
340#define PBITVAL(paren) (1 << ((paren) & 7))
341#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
342#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
343#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
344
82a6ada4 345#define REQUIRE_UTF8(flagp) STMT_START { \
8d9c2815 346 if (!UTF) { \
82a6ada4
KW
347 assert(PASS1); \
348 *flagp = RESTART_PASS1|NEED_UTF8; \
8d9c2815
NC
349 return NULL; \
350 } \
82a6ada4 351 } STMT_END
40d049e4 352
512c0f5a
KW
353/* Change from /d into /u rules, and restart the parse if we've already seen
354 * something whose size would increase as a result, by setting *flagp and
355 * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates
356 * we've change to /u during the parse. */
357#define REQUIRE_UNI_RULES(flagp, restart_retval) \
358 STMT_START { \
359 if (DEPENDS_SEMANTICS) { \
360 assert(PASS1); \
361 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
362 RExC_uni_semantics = 1; \
363 if (RExC_seen_unfolded_sharp_s) { \
364 *flagp |= RESTART_PASS1; \
365 return restart_retval; \
366 } \
367 } \
368 } STMT_END
369
f19b1a63
KW
370/* This converts the named class defined in regcomp.h to its equivalent class
371 * number defined in handy.h. */
372#define namedclass_to_classnum(class) ((int) ((class) / 2))
373#define classnum_to_namedclass(classnum) ((classnum) * 2)
374
de92f5e6
KW
375#define _invlist_union_complement_2nd(a, b, output) \
376 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
377#define _invlist_intersection_complement_2nd(a, b, output) \
378 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
379
1de06328
YO
380/* About scan_data_t.
381
382 During optimisation we recurse through the regexp program performing
383 various inplace (keyhole style) optimisations. In addition study_chunk
384 and scan_commit populate this data structure with information about
538e84ed 385 what strings MUST appear in the pattern. We look for the longest
3b753521 386 string that must appear at a fixed location, and we look for the
1de06328
YO
387 longest string that may appear at a floating location. So for instance
388 in the pattern:
538e84ed 389
1de06328 390 /FOO[xX]A.*B[xX]BAR/
538e84ed 391
1de06328
YO
392 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
393 strings (because they follow a .* construct). study_chunk will identify
394 both FOO and BAR as being the longest fixed and floating strings respectively.
538e84ed 395
1de06328 396 The strings can be composites, for instance
538e84ed 397
1de06328 398 /(f)(o)(o)/
538e84ed 399
1de06328 400 will result in a composite fixed substring 'foo'.
538e84ed 401
1de06328 402 For each string some basic information is maintained:
538e84ed 403
1de06328
YO
404 - offset or min_offset
405 This is the position the string must appear at, or not before.
406 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
407 characters must match before the string we are searching for.
408 Likewise when combined with minlenp and the length of the string it
538e84ed 409 tells us how many characters must appear after the string we have
1de06328 410 found.
538e84ed 411
1de06328
YO
412 - max_offset
413 Only used for floating strings. This is the rightmost point that
ea3daa5d 414 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328 415 string can occur infinitely far to the right.
538e84ed 416
1de06328 417 - minlenp
2d608413
KW
418 A pointer to the minimum number of characters of the pattern that the
419 string was found inside. This is important as in the case of positive
538e84ed 420 lookahead or positive lookbehind we can have multiple patterns
1de06328 421 involved. Consider
538e84ed 422
1de06328 423 /(?=FOO).*F/
538e84ed 424
1de06328
YO
425 The minimum length of the pattern overall is 3, the minimum length
426 of the lookahead part is 3, but the minimum length of the part that
538e84ed 427 will actually match is 1. So 'FOO's minimum length is 3, but the
1de06328 428 minimum length for the F is 1. This is important as the minimum length
538e84ed 429 is used to determine offsets in front of and behind the string being
1de06328 430 looked for. Since strings can be composites this is the length of the
486ec47a 431 pattern at the time it was committed with a scan_commit. Note that
1de06328 432 the length is calculated by study_chunk, so that the minimum lengths
538e84ed 433 are not known until the full pattern has been compiled, thus the
1de06328 434 pointer to the value.
538e84ed 435
1de06328 436 - lookbehind
538e84ed 437
1de06328 438 In the case of lookbehind the string being searched for can be
538e84ed 439 offset past the start point of the final matching string.
1de06328
YO
440 If this value was just blithely removed from the min_offset it would
441 invalidate some of the calculations for how many chars must match
442 before or after (as they are derived from min_offset and minlen and
538e84ed 443 the length of the string being searched for).
1de06328
YO
444 When the final pattern is compiled and the data is moved from the
445 scan_data_t structure into the regexp structure the information
538e84ed
KW
446 about lookbehind is factored in, with the information that would
447 have been lost precalculated in the end_shift field for the
1de06328
YO
448 associated string.
449
450 The fields pos_min and pos_delta are used to store the minimum offset
538e84ed 451 and the delta to the maximum offset at the current point in the pattern.
1de06328
YO
452
453*/
2c2d71f5
JH
454
455typedef struct scan_data_t {
1de06328
YO
456 /*I32 len_min; unused */
457 /*I32 len_delta; unused */
49f55535 458 SSize_t pos_min;
ea3daa5d 459 SSize_t pos_delta;
2c2d71f5 460 SV *last_found;
ea3daa5d 461 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 462 SSize_t last_start_min;
ea3daa5d 463 SSize_t last_start_max;
1de06328
YO
464 SV **longest; /* Either &l_fixed, or &l_float. */
465 SV *longest_fixed; /* longest fixed string found in pattern */
49f55535 466 SSize_t offset_fixed; /* offset where it starts */
ea3daa5d 467 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
468 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
469 SV *longest_float; /* longest floating string found in pattern */
49f55535 470 SSize_t offset_float_min; /* earliest point in string it can appear */
ea3daa5d
FC
471 SSize_t offset_float_max; /* latest point in string it can appear */
472 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
49f55535 473 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
2c2d71f5
JH
474 I32 flags;
475 I32 whilem_c;
49f55535 476 SSize_t *last_closep;
b8f7bb16 477 regnode_ssc *start_class;
2c2d71f5
JH
478} scan_data_t;
479
a687059c 480/*
e50aee73 481 * Forward declarations for pregcomp()'s friends.
a687059c 482 */
a0d0e21e 483
27da23d5 484static const scan_data_t zero_scan_data =
1de06328 485 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
486
487#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
488#define SF_BEFORE_SEOL 0x0001
489#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
490#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
491#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
492
44e3dfd2
BF
493#define SF_FIX_SHIFT_EOL (+2)
494#define SF_FL_SHIFT_EOL (+4)
c277df42
IZ
495
496#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
497#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
498
499#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
500#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
501#define SF_IS_INF 0x0040
502#define SF_HAS_PAR 0x0080
503#define SF_IN_PAR 0x0100
504#define SF_HAS_EVAL 0x0200
bc604ad8
KW
505
506
507/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
508 * longest substring in the pattern. When it is not set the optimiser keeps
509 * track of position, but does not keep track of the actual strings seen,
510 *
511 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
512 * /foo/i will not.
513 *
514 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
515 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
516 * turned off because of the alternation (BRANCH). */
07be1b83 517#define SCF_DO_SUBSTR 0x0400
bc604ad8 518
653099ff
GS
519#define SCF_DO_STCLASS_AND 0x0800
520#define SCF_DO_STCLASS_OR 0x1000
521#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 522#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 523
786e8c11 524#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
538e84ed 525#define SCF_SEEN_ACCEPT 0x8000
688e0391 526#define SCF_TRIE_DOING_RESTUDY 0x10000
4286711a
YO
527#define SCF_IN_DEFINE 0x20000
528
529
530
07be1b83 531
43fead97 532#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
533
534/* The enums for all these are ordered so things work out correctly */
a62b1201 535#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
538e84ed
KW
536#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
537 == REGEX_DEPENDS_CHARSET)
00b27cfc 538#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
538e84ed
KW
539#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
540 >= REGEX_UNICODE_CHARSET)
541#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
542 == REGEX_ASCII_RESTRICTED_CHARSET)
543#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
544 >= REGEX_ASCII_RESTRICTED_CHARSET)
545#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
546 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 547
43fead97 548#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 549
f2c2a6ab
KW
550/* For programs that want to be strictly Unicode compatible by dying if any
551 * attempt is made to match a non-Unicode code point against a Unicode
552 * property. */
553#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
554
93733859 555#define OOB_NAMEDCLASS -1
b8c5462f 556
8e661ac5
KW
557/* There is no code point that is out-of-bounds, so this is problematic. But
558 * its only current use is to initialize a variable that is always set before
559 * looked at. */
560#define OOB_UNICODE 0xDEADBEEF
561
a0ed51b3
LW
562#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
563#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
564
8615cb43 565
b45f050a
JF
566/* length of regex to show in messages that don't mark a position within */
567#define RegexLengthToShowInErrorMessages 127
568
569/*
570 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
571 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
572 * op/pragma/warn/regcomp.
573 */
7253e4e3
RK
574#define MARKER1 "<-- HERE" /* marker as it appears in the description */
575#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 576
538e84ed
KW
577#define REPORT_LOCATION " in regex; marked by " MARKER1 \
578 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
b45f050a 579
285b5ca0
KW
580/* The code in this file in places uses one level of recursion with parsing
581 * rebased to an alternate string constructed by us in memory. This can take
582 * the form of something that is completely different from the input, or
583 * something that uses the input as part of the alternate. In the first case,
584 * there should be no possibility of an error, as we are in complete control of
585 * the alternate string. But in the second case we don't control the input
586 * portion, so there may be errors in that. Here's an example:
587 * /[abc\x{DF}def]/ui
588 * is handled specially because \x{df} folds to a sequence of more than one
589 * character, 'ss'. What is done is to create and parse an alternate string,
590 * which looks like this:
591 * /(?:\x{DF}|[abc\x{DF}def])/ui
592 * where it uses the input unchanged in the middle of something it constructs,
593 * which is a branch for the DF outside the character class, and clustering
594 * parens around the whole thing. (It knows enough to skip the DF inside the
595 * class while in this substitute parse.) 'abc' and 'def' may have errors that
596 * need to be reported. The general situation looks like this:
597 *
598 * sI tI xI eI
599 * Input: ----------------------------------------------------
600 * Constructed: ---------------------------------------------------
601 * sC tC xC eC EC
602 *
603 * The input string sI..eI is the input pattern. The string sC..EC is the
604 * constructed substitute parse string. The portions sC..tC and eC..EC are
605 * constructed by us. The portion tC..eC is an exact duplicate of the input
606 * pattern tI..eI. In the diagram, these are vertically aligned. Suppose that
607 * while parsing, we find an error at xC. We want to display a message showing
608 * the real input string. Thus we need to find the point xI in it which
609 * corresponds to xC. xC >= tC, since the portion of the string sC..tC has
610 * been constructed by us, and so shouldn't have errors. We get:
611 *
612 * xI = sI + (tI - sI) + (xC - tC)
613 *
614 * and, the offset into sI is:
615 *
616 * (xI - sI) = (tI - sI) + (xC - tC)
617 *
618 * When the substitute is constructed, we save (tI -sI) as RExC_precomp_adj,
619 * and we save tC as RExC_adjusted_start.
903c858a
KW
620 *
621 * During normal processing of the input pattern, everything points to that,
622 * with RExC_precomp_adj set to 0, and RExC_adjusted_start set to sI.
285b5ca0
KW
623 */
624
625#define tI_sI RExC_precomp_adj
626#define tC RExC_adjusted_start
627#define sC RExC_precomp
628#define xI_offset(xC) ((IV) (tI_sI + (xC - tC)))
629#define xI(xC) (sC + xI_offset(xC))
630#define eC RExC_precomp_end
631
632#define REPORT_LOCATION_ARGS(xC) \
633 UTF8fARG(UTF, \
634 (xI(xC) > eC) /* Don't run off end */ \
635 ? eC - sC /* Length before the <--HERE */ \
636 : xI_offset(xC), \
637 sC), /* The input pattern printed up to the <--HERE */ \
638 UTF8fARG(UTF, \
639 (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */ \
640 (xI(xC) > eC) ? eC : xI(xC)) /* pattern after <--HERE */
c1d900c3 641
8a6d8ec6
HS
642/* Used to point after bad bytes for an error message, but avoid skipping
643 * past a nul byte. */
644#define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
645
b45f050a
JF
646/*
647 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
648 * arg. Show regex, up to a maximum length. If it's too long, chop and add
649 * "...".
650 */
58e23c8d 651#define _FAIL(code) STMT_START { \
bfed75c6 652 const char *ellipses = ""; \
711b303b 653 IV len = RExC_precomp_end - RExC_precomp; \
ccb2c380
MP
654 \
655 if (!SIZE_ONLY) \
a5e7bc51 656 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
657 if (len > RegexLengthToShowInErrorMessages) { \
658 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
659 len = RegexLengthToShowInErrorMessages - 10; \
660 ellipses = "..."; \
661 } \
58e23c8d 662 code; \
ccb2c380 663} STMT_END
8615cb43 664
58e23c8d 665#define FAIL(msg) _FAIL( \
c1d900c3
BF
666 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
667 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
668
669#define FAIL2(msg,arg) _FAIL( \
c1d900c3
BF
670 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
671 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 672
b45f050a 673/*
b45f050a
JF
674 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
675 */
ccb2c380 676#define Simple_vFAIL(m) STMT_START { \
ccb2c380 677 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
d528642a 678 m, REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 679} STMT_END
b45f050a
JF
680
681/*
682 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
683 */
ccb2c380
MP
684#define vFAIL(m) STMT_START { \
685 if (!SIZE_ONLY) \
a5e7bc51 686 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
687 Simple_vFAIL(m); \
688} STMT_END
b45f050a
JF
689
690/*
691 * Like Simple_vFAIL(), but accepts two arguments.
692 */
ccb2c380 693#define Simple_vFAIL2(m,a1) STMT_START { \
d528642a
KW
694 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
695 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 696} STMT_END
b45f050a
JF
697
698/*
699 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
700 */
ccb2c380
MP
701#define vFAIL2(m,a1) STMT_START { \
702 if (!SIZE_ONLY) \
a5e7bc51 703 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
704 Simple_vFAIL2(m, a1); \
705} STMT_END
b45f050a
JF
706
707
708/*
709 * Like Simple_vFAIL(), but accepts three arguments.
710 */
ccb2c380 711#define Simple_vFAIL3(m, a1, a2) STMT_START { \
c1d900c3 712 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 713 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 714} STMT_END
b45f050a
JF
715
716/*
717 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
718 */
ccb2c380
MP
719#define vFAIL3(m,a1,a2) STMT_START { \
720 if (!SIZE_ONLY) \
a5e7bc51 721 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
722 Simple_vFAIL3(m, a1, a2); \
723} STMT_END
b45f050a
JF
724
725/*
726 * Like Simple_vFAIL(), but accepts four arguments.
727 */
ccb2c380 728#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
d528642a
KW
729 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
730 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 731} STMT_END
b45f050a 732
95db3ffa
KW
733#define vFAIL4(m,a1,a2,a3) STMT_START { \
734 if (!SIZE_ONLY) \
735 SAVEFREESV(RExC_rx_sv); \
736 Simple_vFAIL4(m, a1, a2, a3); \
737} STMT_END
738
946095af 739/* A specialized version of vFAIL2 that works with UTF8f */
d528642a
KW
740#define vFAIL2utf8f(m, a1) STMT_START { \
741 if (!SIZE_ONLY) \
742 SAVEFREESV(RExC_rx_sv); \
743 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
744 REPORT_LOCATION_ARGS(RExC_parse)); \
946095af
BF
745} STMT_END
746
3ba22297 747#define vFAIL3utf8f(m, a1, a2) STMT_START { \
3ba22297
KW
748 if (!SIZE_ONLY) \
749 SAVEFREESV(RExC_rx_sv); \
750 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 751 REPORT_LOCATION_ARGS(RExC_parse)); \
3ba22297
KW
752} STMT_END
753
499333dc
KW
754/* These have asserts in them because of [perl #122671] Many warnings in
755 * regcomp.c can occur twice. If they get output in pass1 and later in that
756 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
757 * would get output again. So they should be output in pass2, and these
758 * asserts make sure new warnings follow that paradigm. */
946095af 759
5e0a247b
KW
760/* m is not necessarily a "literal string", in this macro */
761#define reg_warn_non_literal_string(loc, m) STMT_START { \
d528642a
KW
762 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
763 "%s" REPORT_LOCATION, \
764 m, REPORT_LOCATION_ARGS(loc)); \
5e0a247b
KW
765} STMT_END
766
668c081a 767#define ckWARNreg(loc,m) STMT_START { \
d528642a
KW
768 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
769 m REPORT_LOCATION, \
770 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
771} STMT_END
772
b927b7e9 773#define vWARN(loc, m) STMT_START { \
d528642a
KW
774 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
775 m REPORT_LOCATION, \
776 REPORT_LOCATION_ARGS(loc)); \
b927b7e9
KW
777} STMT_END
778
0d6106aa 779#define vWARN_dep(loc, m) STMT_START { \
d528642a
KW
780 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
781 m REPORT_LOCATION, \
782 REPORT_LOCATION_ARGS(loc)); \
0d6106aa
KW
783} STMT_END
784
147508a2 785#define ckWARNdep(loc,m) STMT_START { \
d528642a
KW
786 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
787 m REPORT_LOCATION, \
788 REPORT_LOCATION_ARGS(loc)); \
147508a2
KW
789} STMT_END
790
d528642a
KW
791#define ckWARNregdep(loc,m) STMT_START { \
792 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
793 WARN_REGEXP), \
794 m REPORT_LOCATION, \
795 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
796} STMT_END
797
d528642a
KW
798#define ckWARN2reg_d(loc,m, a1) STMT_START { \
799 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
800 m REPORT_LOCATION, \
801 a1, REPORT_LOCATION_ARGS(loc)); \
2335b3d3
KW
802} STMT_END
803
d528642a
KW
804#define ckWARN2reg(loc, m, a1) STMT_START { \
805 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
806 m REPORT_LOCATION, \
807 a1, REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
808} STMT_END
809
d528642a
KW
810#define vWARN3(loc, m, a1, a2) STMT_START { \
811 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
812 m REPORT_LOCATION, \
813 a1, a2, REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
814} STMT_END
815
d528642a
KW
816#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
817 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
818 m REPORT_LOCATION, \
819 a1, a2, \
820 REPORT_LOCATION_ARGS(loc)); \
668c081a
NC
821} STMT_END
822
ccb2c380 823#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
d528642a
KW
824 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
825 m REPORT_LOCATION, \
826 a1, a2, a3, \
827 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
828} STMT_END
829
668c081a 830#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
d528642a
KW
831 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
832 m REPORT_LOCATION, \
833 a1, a2, a3, \
834 REPORT_LOCATION_ARGS(loc)); \
668c081a
NC
835} STMT_END
836
ccb2c380 837#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
d528642a
KW
838 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
839 m REPORT_LOCATION, \
840 a1, a2, a3, a4, \
841 REPORT_LOCATION_ARGS(loc)); \
ccb2c380 842} STMT_END
9d1d55b5 843
538e84ed 844/* Macros for recording node offsets. 20001227 mjd@plover.com
fac92740
MJD
845 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
846 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
847 * Element 0 holds the number n.
07be1b83 848 * Position is 1 indexed.
fac92740 849 */
7122b237
YO
850#ifndef RE_TRACK_PATTERN_OFFSETS
851#define Set_Node_Offset_To_R(node,byte)
852#define Set_Node_Offset(node,byte)
853#define Set_Cur_Node_Offset
854#define Set_Node_Length_To_R(node,len)
855#define Set_Node_Length(node,len)
6a86c6ad 856#define Set_Node_Cur_Length(node,start)
538e84ed
KW
857#define Node_Offset(n)
858#define Node_Length(n)
7122b237
YO
859#define Set_Node_Offset_Length(node,offset,len)
860#define ProgLen(ri) ri->u.proglen
861#define SetProgLen(ri,x) ri->u.proglen = x
862#else
863#define ProgLen(ri) ri->u.offsets[0]
864#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
865#define Set_Node_Offset_To_R(node,byte) STMT_START { \
866 if (! SIZE_ONLY) { \
867 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 868 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 869 if((node) < 0) { \
538e84ed
KW
870 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
871 (int)(node)); \
ccb2c380
MP
872 } else { \
873 RExC_offsets[2*(node)-1] = (byte); \
874 } \
875 } \
876} STMT_END
877
878#define Set_Node_Offset(node,byte) \
879 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
880#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
881
882#define Set_Node_Length_To_R(node,len) STMT_START { \
883 if (! SIZE_ONLY) { \
884 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 885 __LINE__, (int)(node), (int)(len))); \
ccb2c380 886 if((node) < 0) { \
538e84ed
KW
887 Perl_croak(aTHX_ "value of node is %d in Length macro", \
888 (int)(node)); \
ccb2c380
MP
889 } else { \
890 RExC_offsets[2*(node)] = (len); \
891 } \
892 } \
893} STMT_END
894
895#define Set_Node_Length(node,len) \
896 Set_Node_Length_To_R((node)-RExC_emit_start, len)
6a86c6ad
NC
897#define Set_Node_Cur_Length(node, start) \
898 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
899
900/* Get offsets and lengths */
901#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
902#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
903
07be1b83
YO
904#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
905 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
906 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
907} STMT_END
7122b237 908#endif
07be1b83
YO
909
910#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
911#define EXPERIMENTAL_INPLACESCAN
f427392e 912#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 913
cb41e5d6
YO
914#ifdef DEBUGGING
915int
6ad9a8ab 916Perl_re_printf(pTHX_ const char *fmt, ...)
cb41e5d6
YO
917{
918 va_list ap;
919 int result;
920 PerlIO *f= Perl_debug_log;
921 PERL_ARGS_ASSERT_RE_PRINTF;
922 va_start(ap, fmt);
923 result = PerlIO_vprintf(f, fmt, ap);
924 va_end(ap);
925 return result;
926}
927
928int
7b031478 929Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
cb41e5d6
YO
930{
931 va_list ap;
932 int result;
933 PerlIO *f= Perl_debug_log;
934 PERL_ARGS_ASSERT_RE_INDENTF;
935 va_start(ap, depth);
936 PerlIO_printf(f, "%*s", ( depth % 20 ) * 2, "");
937 result = PerlIO_vprintf(f, fmt, ap);
938 va_end(ap);
939 return result;
940}
941#endif /* DEBUGGING */
942
7b031478 943#define DEBUG_RExC_seen() \
538e84ed 944 DEBUG_OPTIMISE_MORE_r({ \
6ad9a8ab 945 Perl_re_printf( aTHX_ "RExC_seen: "); \
538e84ed 946 \
e384d5c1 947 if (RExC_seen & REG_ZERO_LEN_SEEN) \
6ad9a8ab 948 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
538e84ed 949 \
e384d5c1 950 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
6ad9a8ab 951 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
538e84ed 952 \
e384d5c1 953 if (RExC_seen & REG_GPOS_SEEN) \
6ad9a8ab 954 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
538e84ed 955 \
e384d5c1 956 if (RExC_seen & REG_RECURSE_SEEN) \
6ad9a8ab 957 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
538e84ed 958 \
7b031478 959 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
6ad9a8ab 960 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
538e84ed 961 \
e384d5c1 962 if (RExC_seen & REG_VERBARG_SEEN) \
6ad9a8ab 963 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
538e84ed 964 \
e384d5c1 965 if (RExC_seen & REG_CUTGROUP_SEEN) \
6ad9a8ab 966 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
538e84ed 967 \
e384d5c1 968 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
6ad9a8ab 969 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
538e84ed 970 \
e384d5c1 971 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
6ad9a8ab 972 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
538e84ed 973 \
7b031478 974 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
6ad9a8ab 975 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
ee273784 976 \
6ad9a8ab 977 Perl_re_printf( aTHX_ "\n"); \
9e9ecfdf
YO
978 });
979
fdfb4f21 980#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
6ad9a8ab 981 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
fdfb4f21
YO
982
983#define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \
984 if ( ( flags ) ) { \
6ad9a8ab 985 Perl_re_printf( aTHX_ "%s", open_str); \
fdfb4f21
YO
986 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \
987 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \
988 DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \
989 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \
990 DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \
991 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \
992 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \
993 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \
994 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \
995 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \
996 DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \
997 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \
998 DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \
999 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \
1000 DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \
6ad9a8ab 1001 Perl_re_printf( aTHX_ "%s", close_str); \
fdfb4f21
YO
1002 }
1003
1004
304ee84b
YO
1005#define DEBUG_STUDYDATA(str,data,depth) \
1006DEBUG_OPTIMISE_MORE_r(if(data){ \
6ad9a8ab 1007 Perl_re_indentf( aTHX_ "" str "Pos:%"IVdf"/%"IVdf \
fdfb4f21 1008 " Flags: 0x%"UVXf, \
7b031478 1009 depth, \
1de06328
YO
1010 (IV)((data)->pos_min), \
1011 (IV)((data)->pos_delta), \
fdfb4f21
YO
1012 (UV)((data)->flags) \
1013 ); \
1014 DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \
6ad9a8ab 1015 Perl_re_printf( aTHX_ \
fdfb4f21 1016 " Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328 1017 (IV)((data)->whilem_c), \
304ee84b
YO
1018 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
1019 is_inf ? "INF " : "" \
1de06328
YO
1020 ); \
1021 if ((data)->last_found) \
6ad9a8ab 1022 Perl_re_printf( aTHX_ \
1de06328
YO
1023 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
1024 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
1025 SvPVX_const((data)->last_found), \
1026 (IV)((data)->last_end), \
1027 (IV)((data)->last_start_min), \
1028 (IV)((data)->last_start_max), \
1029 ((data)->longest && \
1030 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
1031 SvPVX_const((data)->longest_fixed), \
1032 (IV)((data)->offset_fixed), \
1033 ((data)->longest && \
1034 (data)->longest==&((data)->longest_float)) ? "*" : "", \
1035 SvPVX_const((data)->longest_float), \
1036 (IV)((data)->offset_float_min), \
1037 (IV)((data)->offset_float_max) \
1038 ); \
6ad9a8ab 1039 Perl_re_printf( aTHX_ "\n"); \
1de06328
YO
1040});
1041
cb41e5d6 1042
c6871b76
KW
1043/* =========================================================
1044 * BEGIN edit_distance stuff.
1045 *
1046 * This calculates how many single character changes of any type are needed to
1047 * transform a string into another one. It is taken from version 3.1 of
1048 *
1049 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1050 */
1051
1052/* Our unsorted dictionary linked list. */
1053/* Note we use UVs, not chars. */
1054
1055struct dictionary{
1056 UV key;
1057 UV value;
1058 struct dictionary* next;
1059};
1060typedef struct dictionary item;
1061
1062
1063PERL_STATIC_INLINE item*
1064push(UV key,item* curr)
1065{
1066 item* head;
1067 Newxz(head, 1, item);
1068 head->key = key;
1069 head->value = 0;
1070 head->next = curr;
1071 return head;
1072}
1073
1074
1075PERL_STATIC_INLINE item*
1076find(item* head, UV key)
1077{
1078 item* iterator = head;
1079 while (iterator){
1080 if (iterator->key == key){
1081 return iterator;
1082 }
1083 iterator = iterator->next;
1084 }
1085
1086 return NULL;
1087}
1088
1089PERL_STATIC_INLINE item*
1090uniquePush(item* head,UV key)
1091{
1092 item* iterator = head;
1093
1094 while (iterator){
1095 if (iterator->key == key) {
1096 return head;
1097 }
1098 iterator = iterator->next;
1099 }
1100
1101 return push(key,head);
1102}
1103
1104PERL_STATIC_INLINE void
1105dict_free(item* head)
1106{
1107 item* iterator = head;
1108
1109 while (iterator) {
1110 item* temp = iterator;
1111 iterator = iterator->next;
1112 Safefree(temp);
1113 }
1114
1115 head = NULL;
1116}
1117
1118/* End of Dictionary Stuff */
1119
1120/* All calculations/work are done here */
1121STATIC int
1122S_edit_distance(const UV* src,
1123 const UV* tgt,
1124 const STRLEN x, /* length of src[] */
1125 const STRLEN y, /* length of tgt[] */
1126 const SSize_t maxDistance
1127)
1128{
1129 item *head = NULL;
1130 UV swapCount,swapScore,targetCharCount,i,j;
1131 UV *scores;
1132 UV score_ceil = x + y;
1133
1134 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1135
1136 /* intialize matrix start values */
1137 Newxz(scores, ( (x + 2) * (y + 2)), UV);
1138 scores[0] = score_ceil;
1139 scores[1 * (y + 2) + 0] = score_ceil;
1140 scores[0 * (y + 2) + 1] = score_ceil;
1141 scores[1 * (y + 2) + 1] = 0;
1142 head = uniquePush(uniquePush(head,src[0]),tgt[0]);
1143
1144 /* work loops */
1145 /* i = src index */
1146 /* j = tgt index */
1147 for (i=1;i<=x;i++) {
1148 if (i < x)
1149 head = uniquePush(head,src[i]);
1150 scores[(i+1) * (y + 2) + 1] = i;
1151 scores[(i+1) * (y + 2) + 0] = score_ceil;
1152 swapCount = 0;
1153
1154 for (j=1;j<=y;j++) {
1155 if (i == 1) {
1156 if(j < y)
1157 head = uniquePush(head,tgt[j]);
1158 scores[1 * (y + 2) + (j + 1)] = j;
1159 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1160 }
1161
1162 targetCharCount = find(head,tgt[j-1])->value;
1163 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1164
1165 if (src[i-1] != tgt[j-1]){
1166 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1167 }
1168 else {
1169 swapCount = j;
1170 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1171 }
1172 }
1173
1174 find(head,src[i-1])->value = i;
1175 }
1176
1177 {
1178 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1179 dict_free(head);
1180 Safefree(scores);
1181 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1182 }
1183}
1184
1185/* END of edit_distance() stuff
1186 * ========================================================= */
1187
8e35b056
KW
1188/* is c a control character for which we have a mnemonic? */
1189#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1190
549b4e78
KW
1191STATIC const char *
1192S_cntrl_to_mnemonic(const U8 c)
1193{
1194 /* Returns the mnemonic string that represents character 'c', if one
1195 * exists; NULL otherwise. The only ones that exist for the purposes of
1196 * this routine are a few control characters */
1197
1198 switch (c) {
1199 case '\a': return "\\a";
1200 case '\b': return "\\b";
1201 case ESC_NATIVE: return "\\e";
1202 case '\f': return "\\f";
1203 case '\n': return "\\n";
1204 case '\r': return "\\r";
1205 case '\t': return "\\t";
1206 }
1207
1208 return NULL;
1209}
1210
653099ff 1211/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 1212 Update the longest found anchored substring and the longest found
653099ff
GS
1213 floating substrings if needed. */
1214
4327152a 1215STATIC void
ea3daa5d
FC
1216S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1217 SSize_t *minlenp, int is_inf)
c277df42 1218{
e1ec3a88
AL
1219 const STRLEN l = CHR_SVLEN(data->last_found);
1220 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 1221 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1222
7918f24d
NC
1223 PERL_ARGS_ASSERT_SCAN_COMMIT;
1224
c277df42 1225 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 1226 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
1227 if (*data->longest == data->longest_fixed) {
1228 data->offset_fixed = l ? data->last_start_min : data->pos_min;
1229 if (data->flags & SF_BEFORE_EOL)
b81d288d 1230 data->flags
c277df42
IZ
1231 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
1232 else
1233 data->flags &= ~SF_FIX_BEFORE_EOL;
686b73d4 1234 data->minlen_fixed=minlenp;
1de06328 1235 data->lookbehind_fixed=0;
a0ed51b3 1236 }
304ee84b 1237 else { /* *data->longest == data->longest_float */
c277df42 1238 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d 1239 data->offset_float_max = (l
646e8787
DM
1240 ? data->last_start_max
1241 : (data->pos_delta > SSize_t_MAX - data->pos_min
ea3daa5d
FC
1242 ? SSize_t_MAX
1243 : data->pos_min + data->pos_delta));
1244 if (is_inf
1245 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
1246 data->offset_float_max = SSize_t_MAX;
c277df42 1247 if (data->flags & SF_BEFORE_EOL)
b81d288d 1248 data->flags
c277df42
IZ
1249 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
1250 else
1251 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
1252 data->minlen_float=minlenp;
1253 data->lookbehind_float=0;
c277df42
IZ
1254 }
1255 }
1256 SvCUR_set(data->last_found, 0);
0eda9292 1257 {
a28509cc 1258 SV * const sv = data->last_found;
097eb12c
AL
1259 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1260 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1261 if (mg)
1262 mg->mg_len = 0;
1263 }
0eda9292 1264 }
c277df42
IZ
1265 data->last_end = -1;
1266 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 1267 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
1268}
1269
cdd87c1d
KW
1270/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1271 * list that describes which code points it matches */
1272
653099ff 1273STATIC void
3420edd7 1274S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 1275{
cdd87c1d
KW
1276 /* Set the SSC 'ssc' to match an empty string or any code point */
1277
557bd3fb 1278 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 1279
71068078 1280 assert(is_ANYOF_SYNTHETIC(ssc));
3fffb88a 1281
cdd87c1d
KW
1282 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
1283 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
93e92956 1284 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
653099ff
GS
1285}
1286
653099ff 1287STATIC int
dc3bf405 1288S_ssc_is_anything(const regnode_ssc *ssc)
653099ff 1289{
c144baaa
KW
1290 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1291 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1292 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1293 * in any way, so there's no point in using it */
cdd87c1d
KW
1294
1295 UV start, end;
1296 bool ret;
653099ff 1297
557bd3fb 1298 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 1299
71068078 1300 assert(is_ANYOF_SYNTHETIC(ssc));
cdd87c1d 1301
93e92956 1302 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
cdd87c1d
KW
1303 return FALSE;
1304 }
1305
1306 /* See if the list consists solely of the range 0 - Infinity */
1307 invlist_iterinit(ssc->invlist);
1308 ret = invlist_iternext(ssc->invlist, &start, &end)
1309 && start == 0
1310 && end == UV_MAX;
1311
1312 invlist_iterfinish(ssc->invlist);
1313
1314 if (ret) {
1315 return TRUE;
1316 }
1317
1318 /* If e.g., both \w and \W are set, matches everything */
e0e1be5f 1319 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1320 int i;
1321 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1322 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1323 return TRUE;
1324 }
1325 }
1326 }
1327
1328 return FALSE;
653099ff
GS
1329}
1330
653099ff 1331STATIC void
cdd87c1d 1332S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 1333{
cdd87c1d
KW
1334 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1335 * string, any code point, or any posix class under locale */
1336
557bd3fb 1337 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 1338
557bd3fb 1339 Zero(ssc, 1, regnode_ssc);
71068078 1340 set_ANYOF_SYNTHETIC(ssc);
93e92956 1341 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
3420edd7 1342 ssc_anything(ssc);
cdd87c1d 1343
2f306ab9
KW
1344 /* If any portion of the regex is to operate under locale rules that aren't
1345 * fully known at compile time, initialization includes it. The reason
1346 * this isn't done for all regexes is that the optimizer was written under
1347 * the assumption that locale was all-or-nothing. Given the complexity and
1348 * lack of documentation in the optimizer, and that there are inadequate
1349 * test cases for locale, many parts of it may not work properly, it is
1350 * safest to avoid locale unless necessary. */
cdd87c1d
KW
1351 if (RExC_contains_locale) {
1352 ANYOF_POSIXL_SETALL(ssc);
cdd87c1d
KW
1353 }
1354 else {
1355 ANYOF_POSIXL_ZERO(ssc);
1356 }
653099ff
GS
1357}
1358
b423522f 1359STATIC int
dc3bf405
BF
1360S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1361 const regnode_ssc *ssc)
b423522f
KW
1362{
1363 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1364 * to the list of code points matched, and locale posix classes; hence does
1365 * not check its flags) */
1366
1367 UV start, end;
1368 bool ret;
1369
1370 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1371
71068078 1372 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1373
1374 invlist_iterinit(ssc->invlist);
1375 ret = invlist_iternext(ssc->invlist, &start, &end)
1376 && start == 0
1377 && end == UV_MAX;
1378
1379 invlist_iterfinish(ssc->invlist);
1380
1381 if (! ret) {
1382 return FALSE;
1383 }
1384
e0e1be5f 1385 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
31f05a37 1386 return FALSE;
b423522f
KW
1387 }
1388
1389 return TRUE;
1390}
1391
1392STATIC SV*
1393S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
5c0f85ef 1394 const regnode_charclass* const node)
b423522f
KW
1395{
1396 /* Returns a mortal inversion list defining which code points are matched
1397 * by 'node', which is of type ANYOF. Handles complementing the result if
1398 * appropriate. If some code points aren't knowable at this time, the
31f05a37
KW
1399 * returned list must, and will, contain every code point that is a
1400 * possibility. */
b423522f 1401
e2506fa7 1402 SV* invlist = NULL;
1ee208c4 1403 SV* only_utf8_locale_invlist = NULL;
b423522f
KW
1404 unsigned int i;
1405 const U32 n = ARG(node);
31f05a37 1406 bool new_node_has_latin1 = FALSE;
b423522f
KW
1407
1408 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1409
1410 /* Look at the data structure created by S_set_ANYOF_arg() */
93e92956 1411 if (n != ANYOF_ONLY_HAS_BITMAP) {
b423522f
KW
1412 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1413 AV * const av = MUTABLE_AV(SvRV(rv));
1414 SV **const ary = AvARRAY(av);
1415 assert(RExC_rxi->data->what[n] == 's');
1416
1417 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1418 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1419 }
1420 else if (ary[0] && ary[0] != &PL_sv_undef) {
1421
1422 /* Here, no compile-time swash, and there are things that won't be
1423 * known until runtime -- we have to assume it could be anything */
e2506fa7 1424 invlist = sv_2mortal(_new_invlist(1));
b423522f
KW
1425 return _add_range_to_invlist(invlist, 0, UV_MAX);
1426 }
1ee208c4 1427 else if (ary[3] && ary[3] != &PL_sv_undef) {
b423522f
KW
1428
1429 /* Here no compile-time swash, and no run-time only data. Use the
1430 * node's inversion list */
1ee208c4
KW
1431 invlist = sv_2mortal(invlist_clone(ary[3]));
1432 }
1433
1434 /* Get the code points valid only under UTF-8 locales */
037715a6 1435 if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
1ee208c4
KW
1436 && ary[2] && ary[2] != &PL_sv_undef)
1437 {
1438 only_utf8_locale_invlist = ary[2];
b423522f
KW
1439 }
1440 }
1441
e2506fa7
KW
1442 if (! invlist) {
1443 invlist = sv_2mortal(_new_invlist(0));
1444 }
1445
dcb20b36
KW
1446 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1447 * code points, and an inversion list for the others, but if there are code
1448 * points that should match only conditionally on the target string being
1449 * UTF-8, those are placed in the inversion list, and not the bitmap.
1450 * Since there are circumstances under which they could match, they are
1451 * included in the SSC. But if the ANYOF node is to be inverted, we have
1452 * to exclude them here, so that when we invert below, the end result
1453 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1454 * have to do this here before we add the unconditionally matched code
1455 * points */
b423522f
KW
1456 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1457 _invlist_intersection_complement_2nd(invlist,
1458 PL_UpperLatin1,
1459 &invlist);
1460 }
1461
1462 /* Add in the points from the bit map */
dcb20b36 1463 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
b423522f 1464 if (ANYOF_BITMAP_TEST(node, i)) {
6f8848d5
TC
1465 unsigned int start = i++;
1466
1467 for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
1468 /* empty */
1469 }
1470 invlist = _add_range_to_invlist(invlist, start, i-1);
31f05a37 1471 new_node_has_latin1 = TRUE;
b423522f
KW
1472 }
1473 }
1474
1475 /* If this can match all upper Latin1 code points, have to add them
ac33c516
KW
1476 * as well. But don't add them if inverting, as when that gets done below,
1477 * it would exclude all these characters, including the ones it shouldn't
1478 * that were added just above */
1479 if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
f240c685
KW
1480 && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1481 {
b423522f
KW
1482 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1483 }
1484
1485 /* Similarly for these */
93e92956 1486 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
e0a1ff7a 1487 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
b423522f
KW
1488 }
1489
1490 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1491 _invlist_invert(invlist);
1492 }
037715a6 1493 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
31f05a37
KW
1494
1495 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1496 * locale. We can skip this if there are no 0-255 at all. */
1497 _invlist_union(invlist, PL_Latin1, &invlist);
1498 }
1499
1ee208c4
KW
1500 /* Similarly add the UTF-8 locale possible matches. These have to be
1501 * deferred until after the non-UTF-8 locale ones are taken care of just
1502 * above, or it leads to wrong results under ANYOF_INVERT */
1503 if (only_utf8_locale_invlist) {
31f05a37 1504 _invlist_union_maybe_complement_2nd(invlist,
1ee208c4 1505 only_utf8_locale_invlist,
31f05a37
KW
1506 ANYOF_FLAGS(node) & ANYOF_INVERT,
1507 &invlist);
1508 }
b423522f
KW
1509
1510 return invlist;
1511}
1512
1051e1c4 1513/* These two functions currently do the exact same thing */
557bd3fb 1514#define ssc_init_zero ssc_init
653099ff 1515
cdd87c1d
KW
1516#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1517#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1518
557bd3fb 1519/* 'AND' a given class with another one. Can create false positives. 'ssc'
93e92956
KW
1520 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1521 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1522
653099ff 1523STATIC void
b423522f 1524S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1525 const regnode_charclass *and_with)
653099ff 1526{
cdd87c1d
KW
1527 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1528 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1529
a0dd4231
KW
1530 SV* anded_cp_list;
1531 U8 anded_flags;
1e6ade67 1532
cdd87c1d 1533 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1534
71068078 1535 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1536
1537 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1538 * the code point inversion list and just the relevant flags */
71068078 1539 if (is_ANYOF_SYNTHETIC(and_with)) {
7dcac5f6 1540 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
a0dd4231 1541 anded_flags = ANYOF_FLAGS(and_with);
e9b08962
KW
1542
1543 /* XXX This is a kludge around what appears to be deficiencies in the
1544 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1545 * there are paths through the optimizer where it doesn't get weeded
1546 * out when it should. And if we don't make some extra provision for
1547 * it like the code just below, it doesn't get added when it should.
1548 * This solution is to add it only when AND'ing, which is here, and
1549 * only when what is being AND'ed is the pristine, original node
1550 * matching anything. Thus it is like adding it to ssc_anything() but
1551 * only when the result is to be AND'ed. Probably the same solution
1552 * could be adopted for the same problem we have with /l matching,
1553 * which is solved differently in S_ssc_init(), and that would lead to
1554 * fewer false positives than that solution has. But if this solution
1555 * creates bugs, the consequences are only that a warning isn't raised
1556 * that should be; while the consequences for having /l bugs is
1557 * incorrect matches */
7dcac5f6 1558 if (ssc_is_anything((regnode_ssc *)and_with)) {
f240c685 1559 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
e9b08962 1560 }
a0dd4231
KW
1561 }
1562 else {
5c0f85ef 1563 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
f240c685
KW
1564 if (OP(and_with) == ANYOFD) {
1565 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1566 }
1567 else {
1568 anded_flags = ANYOF_FLAGS(and_with)
1569 &( ANYOF_COMMON_FLAGS
108316fb
KW
1570 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1571 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
d1c40ef5
KW
1572 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
1573 anded_flags &=
1574 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1575 }
f240c685 1576 }
a0dd4231
KW
1577 }
1578
1579 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1580
1581 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1582 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1583 * 'and_with' may be inverted. When not inverted, we have the situation of
1584 * computing:
1585 * (C1 | P1) & (C2 | P2)
1586 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1587 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1588 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1589 * <= ((C1 & C2) | P1 | P2)
1590 * Alternatively, the last few steps could be:
1591 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1592 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1593 * <= (C1 | C2 | (P1 & P2))
1594 * We favor the second approach if either P1 or P2 is non-empty. This is
1595 * because these components are a barrier to doing optimizations, as what
1596 * they match cannot be known until the moment of matching as they are
1597 * dependent on the current locale, 'AND"ing them likely will reduce or
1598 * eliminate them.
1599 * But we can do better if we know that C1,P1 are in their initial state (a
1600 * frequent occurrence), each matching everything:
1601 * (<everything>) & (C2 | P2) = C2 | P2
1602 * Similarly, if C2,P2 are in their initial state (again a frequent
1603 * occurrence), the result is a no-op
1604 * (C1 | P1) & (<everything>) = C1 | P1
1605 *
1606 * Inverted, we have
1607 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1608 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1609 * <= (C1 & ~C2) | (P1 & ~P2)
1610 * */
1aa99e6b 1611
a0dd4231 1612 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
71068078 1613 && ! is_ANYOF_SYNTHETIC(and_with))
a0dd4231 1614 {
cdd87c1d 1615 unsigned int i;
8951c461 1616
cdd87c1d
KW
1617 ssc_intersection(ssc,
1618 anded_cp_list,
1619 FALSE /* Has already been inverted */
1620 );
c6b76537 1621
cdd87c1d
KW
1622 /* If either P1 or P2 is empty, the intersection will be also; can skip
1623 * the loop */
93e92956 1624 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
cdd87c1d
KW
1625 ANYOF_POSIXL_ZERO(ssc);
1626 }
e0e1be5f 1627 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1628
1629 /* Note that the Posix class component P from 'and_with' actually
1630 * looks like:
1631 * P = Pa | Pb | ... | Pn
1632 * where each component is one posix class, such as in [\w\s].
1633 * Thus
1634 * ~P = ~(Pa | Pb | ... | Pn)
1635 * = ~Pa & ~Pb & ... & ~Pn
1636 * <= ~Pa | ~Pb | ... | ~Pn
1637 * The last is something we can easily calculate, but unfortunately
1638 * is likely to have many false positives. We could do better
1639 * in some (but certainly not all) instances if two classes in
1640 * P have known relationships. For example
1641 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1642 * So
1643 * :lower: & :print: = :lower:
1644 * And similarly for classes that must be disjoint. For example,
1645 * since \s and \w can have no elements in common based on rules in
1646 * the POSIX standard,
1647 * \w & ^\S = nothing
1648 * Unfortunately, some vendor locales do not meet the Posix
1649 * standard, in particular almost everything by Microsoft.
1650 * The loop below just changes e.g., \w into \W and vice versa */
1651
1ee208c4 1652 regnode_charclass_posixl temp;
cdd87c1d
KW
1653 int add = 1; /* To calculate the index of the complement */
1654
1655 ANYOF_POSIXL_ZERO(&temp);
1656 for (i = 0; i < ANYOF_MAX; i++) {
1657 assert(i % 2 != 0
7dcac5f6
KW
1658 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1659 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
cdd87c1d 1660
7dcac5f6 1661 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
cdd87c1d
KW
1662 ANYOF_POSIXL_SET(&temp, i + add);
1663 }
1664 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1665 }
1666 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1667
cdd87c1d
KW
1668 } /* else ssc already has no posixes */
1669 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1670 in its initial state */
71068078 1671 else if (! is_ANYOF_SYNTHETIC(and_with)
7dcac5f6 1672 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
cdd87c1d
KW
1673 {
1674 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1675 * copy it over 'ssc' */
1676 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
71068078 1677 if (is_ANYOF_SYNTHETIC(and_with)) {
cdd87c1d
KW
1678 StructCopy(and_with, ssc, regnode_ssc);
1679 }
1680 else {
1681 ssc->invlist = anded_cp_list;
1682 ANYOF_POSIXL_ZERO(ssc);
93e92956 1683 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1684 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
cdd87c1d
KW
1685 }
1686 }
1687 }
e0e1be5f 1688 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
93e92956 1689 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
cdd87c1d
KW
1690 {
1691 /* One or the other of P1, P2 is non-empty. */
93e92956 1692 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1ea8b7fe
KW
1693 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1694 }
cdd87c1d
KW
1695 ssc_union(ssc, anded_cp_list, FALSE);
1696 }
1697 else { /* P1 = P2 = empty */
1698 ssc_intersection(ssc, anded_cp_list, FALSE);
1699 }
137165a6 1700 }
653099ff
GS
1701}
1702
653099ff 1703STATIC void
cdd87c1d 1704S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1705 const regnode_charclass *or_with)
653099ff 1706{
cdd87c1d
KW
1707 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1708 * another SSC or a regular ANYOF class. Can create false positives if
1709 * 'or_with' is to be inverted. */
7918f24d 1710
a0dd4231
KW
1711 SV* ored_cp_list;
1712 U8 ored_flags;
c6b76537 1713
cdd87c1d 1714 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 1715
71068078 1716 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1717
1718 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1719 * the code point inversion list and just the relevant flags */
71068078 1720 if (is_ANYOF_SYNTHETIC(or_with)) {
7dcac5f6 1721 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
a0dd4231
KW
1722 ored_flags = ANYOF_FLAGS(or_with);
1723 }
1724 else {
5c0f85ef 1725 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
eff8b7dc 1726 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
f240c685
KW
1727 if (OP(or_with) != ANYOFD) {
1728 ored_flags
1729 |= ANYOF_FLAGS(or_with)
108316fb
KW
1730 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1731 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
d1c40ef5
KW
1732 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
1733 ored_flags |=
1734 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1735 }
f240c685 1736 }
a0dd4231
KW
1737 }
1738
1739 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
1740
1741 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1742 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1743 * 'or_with' may be inverted. When not inverted, we have the simple
1744 * situation of computing:
1745 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1746 * If P1|P2 yields a situation with both a class and its complement are
1747 * set, like having both \w and \W, this matches all code points, and we
1748 * can delete these from the P component of the ssc going forward. XXX We
1749 * might be able to delete all the P components, but I (khw) am not certain
1750 * about this, and it is better to be safe.
1751 *
1752 * Inverted, we have
1753 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1754 * <= (C1 | P1) | ~C2
1755 * <= (C1 | ~C2) | P1
1756 * (which results in actually simpler code than the non-inverted case)
1757 * */
9826f543 1758
a0dd4231 1759 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
71068078 1760 && ! is_ANYOF_SYNTHETIC(or_with))
a0dd4231 1761 {
cdd87c1d 1762 /* We ignore P2, leaving P1 going forward */
1ea8b7fe 1763 } /* else Not inverted */
93e92956 1764 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1765 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
e0e1be5f 1766 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1767 unsigned int i;
1768 for (i = 0; i < ANYOF_MAX; i += 2) {
1769 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1770 {
1771 ssc_match_all_cp(ssc);
1772 ANYOF_POSIXL_CLEAR(ssc, i);
1773 ANYOF_POSIXL_CLEAR(ssc, i+1);
cdd87c1d
KW
1774 }
1775 }
1776 }
1aa99e6b 1777 }
cdd87c1d
KW
1778
1779 ssc_union(ssc,
1780 ored_cp_list,
1781 FALSE /* Already has been inverted */
1782 );
653099ff
GS
1783}
1784
b423522f
KW
1785PERL_STATIC_INLINE void
1786S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1787{
1788 PERL_ARGS_ASSERT_SSC_UNION;
1789
71068078 1790 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1791
1792 _invlist_union_maybe_complement_2nd(ssc->invlist,
1793 invlist,
1794 invert2nd,
1795 &ssc->invlist);
1796}
1797
1798PERL_STATIC_INLINE void
1799S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1800 SV* const invlist,
1801 const bool invert2nd)
1802{
1803 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1804
71068078 1805 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1806
1807 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1808 invlist,
1809 invert2nd,
1810 &ssc->invlist);
1811}
1812
1813PERL_STATIC_INLINE void
1814S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1815{
1816 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1817
71068078 1818 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1819
1820 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1821}
1822
1823PERL_STATIC_INLINE void
1824S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1825{
1826 /* AND just the single code point 'cp' into the SSC 'ssc' */
1827
1828 SV* cp_list = _new_invlist(2);
1829
1830 PERL_ARGS_ASSERT_SSC_CP_AND;
1831
71068078 1832 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1833
1834 cp_list = add_cp_to_invlist(cp_list, cp);
1835 ssc_intersection(ssc, cp_list,
1836 FALSE /* Not inverted */
1837 );
1838 SvREFCNT_dec_NN(cp_list);
1839}
1840
1841PERL_STATIC_INLINE void
dc3bf405 1842S_ssc_clear_locale(regnode_ssc *ssc)
b423522f
KW
1843{
1844 /* Set the SSC 'ssc' to not match any locale things */
b423522f
KW
1845 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1846
71068078 1847 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1848
1849 ANYOF_POSIXL_ZERO(ssc);
1850 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1851}
1852
b35552de
KW
1853#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1854
1855STATIC bool
1856S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1857{
1858 /* The synthetic start class is used to hopefully quickly winnow down
1859 * places where a pattern could start a match in the target string. If it
1860 * doesn't really narrow things down that much, there isn't much point to
1861 * having the overhead of using it. This function uses some very crude
1862 * heuristics to decide if to use the ssc or not.
1863 *
1864 * It returns TRUE if 'ssc' rules out more than half what it considers to
1865 * be the "likely" possible matches, but of course it doesn't know what the
1866 * actual things being matched are going to be; these are only guesses
1867 *
1868 * For /l matches, it assumes that the only likely matches are going to be
1869 * in the 0-255 range, uniformly distributed, so half of that is 127
1870 * For /a and /d matches, it assumes that the likely matches will be just
1871 * the ASCII range, so half of that is 63
1872 * For /u and there isn't anything matching above the Latin1 range, it
1873 * assumes that that is the only range likely to be matched, and uses
1874 * half that as the cut-off: 127. If anything matches above Latin1,
1875 * it assumes that all of Unicode could match (uniformly), except for
1876 * non-Unicode code points and things in the General Category "Other"
1877 * (unassigned, private use, surrogates, controls and formats). This
1878 * is a much large number. */
1879
b35552de
KW
1880 U32 count = 0; /* Running total of number of code points matched by
1881 'ssc' */
1882 UV start, end; /* Start and end points of current range in inversion
1883 list */
72400949
KW
1884 const U32 max_code_points = (LOC)
1885 ? 256
1886 : (( ! UNI_SEMANTICS
1887 || invlist_highest(ssc->invlist) < 256)
1888 ? 128
1889 : NON_OTHER_COUNT);
1890 const U32 max_match = max_code_points / 2;
b35552de
KW
1891
1892 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1893
1894 invlist_iterinit(ssc->invlist);
1895 while (invlist_iternext(ssc->invlist, &start, &end)) {
72400949
KW
1896 if (start >= max_code_points) {
1897 break;
b35552de 1898 }
72400949 1899 end = MIN(end, max_code_points - 1);
b35552de 1900 count += end - start + 1;
72400949 1901 if (count >= max_match) {
b35552de
KW
1902 invlist_iterfinish(ssc->invlist);
1903 return FALSE;
1904 }
1905 }
1906
1907 return TRUE;
1908}
1909
1910
b423522f
KW
1911STATIC void
1912S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1913{
1914 /* The inversion list in the SSC is marked mortal; now we need a more
1915 * permanent copy, which is stored the same way that is done in a regular
dcb20b36
KW
1916 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1917 * map */
b423522f
KW
1918
1919 SV* invlist = invlist_clone(ssc->invlist);
1920
1921 PERL_ARGS_ASSERT_SSC_FINALIZE;
1922
71068078 1923 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f 1924
a0dd4231 1925 /* The code in this file assumes that all but these flags aren't relevant
93e92956
KW
1926 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1927 * by the time we reach here */
f240c685
KW
1928 assert(! (ANYOF_FLAGS(ssc)
1929 & ~( ANYOF_COMMON_FLAGS
108316fb
KW
1930 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1931 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
a0dd4231 1932
b423522f
KW
1933 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1934
1ee208c4
KW
1935 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1936 NULL, NULL, NULL, FALSE);
b423522f 1937
85c8e306
KW
1938 /* Make sure is clone-safe */
1939 ssc->invlist = NULL;
1940
e0e1be5f 1941 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
93e92956 1942 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
e0e1be5f 1943 }
1462525b 1944
b2e90ddf
KW
1945 if (RExC_contains_locale) {
1946 OP(ssc) = ANYOFL;
1947 }
1948
1462525b 1949 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
b423522f
KW
1950}
1951
a3621e74
YO
1952#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1953#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1954#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
538e84ed
KW
1955#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1956 ? (TRIE_LIST_CUR( idx ) - 1) \
1957 : 0 )
a3621e74 1958
3dab1dad
YO
1959
1960#ifdef DEBUGGING
07be1b83 1961/*
2b8b4781
NC
1962 dump_trie(trie,widecharmap,revcharmap)
1963 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1964 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
1965
1966 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
1967 The _interim_ variants are used for debugging the interim
1968 tables that are used to generate the final compressed
1969 representation which is what dump_trie expects.
1970
486ec47a 1971 Part of the reason for their existence is to provide a form
3dab1dad 1972 of documentation as to how the different representations function.
07be1b83
YO
1973
1974*/
3dab1dad
YO
1975
1976/*
3dab1dad
YO
1977 Dumps the final compressed table form of the trie to Perl_debug_log.
1978 Used for debugging make_trie().
1979*/
b9a59e08 1980
3dab1dad 1981STATIC void
2b8b4781
NC
1982S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1983 AV *revcharmap, U32 depth)
3dab1dad
YO
1984{
1985 U32 state;
ab3bbdeb 1986 SV *sv=sv_newmortal();
55eed653 1987 int colwidth= widecharmap ? 6 : 4;
2e64971a 1988 U16 word;
3dab1dad
YO
1989 GET_RE_DEBUG_FLAGS_DECL;
1990
7918f24d 1991 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 1992
6ad9a8ab 1993 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
1e37780e 1994 depth+1, "Match","Base","Ofs" );
3dab1dad
YO
1995
1996 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 1997 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 1998 if ( tmp ) {
6ad9a8ab 1999 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2000 colwidth,
538e84ed 2001 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2002 PL_colors[0], PL_colors[1],
2003 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2004 PERL_PV_ESCAPE_FIRSTCHAR
2005 )
ab3bbdeb 2006 );
3dab1dad
YO
2007 }
2008 }
1e37780e
YO
2009 Perl_re_printf( aTHX_ "\n");
2010 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
3dab1dad
YO
2011
2012 for( state = 0 ; state < trie->uniquecharcount ; state++ )
6ad9a8ab
YO
2013 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2014 Perl_re_printf( aTHX_ "\n");
3dab1dad 2015
1e2e3d02 2016 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 2017 const U32 base = trie->states[ state ].trans.base;
3dab1dad 2018
1e37780e 2019 Perl_re_indentf( aTHX_ "#%4"UVXf"|", depth+1, (UV)state);
3dab1dad
YO
2020
2021 if ( trie->states[ state ].wordnum ) {
1e37780e 2022 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
3dab1dad 2023 } else {
6ad9a8ab 2024 Perl_re_printf( aTHX_ "%6s", "" );
3dab1dad
YO
2025 }
2026
6ad9a8ab 2027 Perl_re_printf( aTHX_ " @%4"UVXf" ", (UV)base );
3dab1dad
YO
2028
2029 if ( base ) {
2030 U32 ofs = 0;
2031
2032 while( ( base + ofs < trie->uniquecharcount ) ||
2033 ( base + ofs - trie->uniquecharcount < trie->lasttrans
538e84ed
KW
2034 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2035 != state))
3dab1dad
YO
2036 ofs++;
2037
6ad9a8ab 2038 Perl_re_printf( aTHX_ "+%2"UVXf"[ ", (UV)ofs);
3dab1dad
YO
2039
2040 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
538e84ed
KW
2041 if ( ( base + ofs >= trie->uniquecharcount )
2042 && ( base + ofs - trie->uniquecharcount
2043 < trie->lasttrans )
2044 && trie->trans[ base + ofs
2045 - trie->uniquecharcount ].check == state )
3dab1dad 2046 {
1e37780e
YO
2047 Perl_re_printf( aTHX_ "%*"UVXf, colwidth,
2048 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2049 );
3dab1dad 2050 } else {
6ad9a8ab 2051 Perl_re_printf( aTHX_ "%*s",colwidth," ." );
3dab1dad
YO
2052 }
2053 }
2054
6ad9a8ab 2055 Perl_re_printf( aTHX_ "]");
3dab1dad
YO
2056
2057 }
6ad9a8ab 2058 Perl_re_printf( aTHX_ "\n" );
3dab1dad 2059 }
6ad9a8ab 2060 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
cb41e5d6 2061 depth);
2e64971a 2062 for (word=1; word <= trie->wordcount; word++) {
6ad9a8ab 2063 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2e64971a
DM
2064 (int)word, (int)(trie->wordinfo[word].prev),
2065 (int)(trie->wordinfo[word].len));
2066 }
6ad9a8ab 2067 Perl_re_printf( aTHX_ "\n" );
538e84ed 2068}
3dab1dad 2069/*
3dab1dad 2070 Dumps a fully constructed but uncompressed trie in list form.
538e84ed 2071 List tries normally only are used for construction when the number of
3dab1dad
YO
2072 possible chars (trie->uniquecharcount) is very high.
2073 Used for debugging make_trie().
2074*/
2075STATIC void
55eed653 2076S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2077 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2078 U32 depth)
3dab1dad
YO
2079{
2080 U32 state;
ab3bbdeb 2081 SV *sv=sv_newmortal();
55eed653 2082 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2083 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2084
2085 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2086
3dab1dad 2087 /* print out the table precompression. */
6ad9a8ab 2088 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
cb41e5d6 2089 depth+1 );
6ad9a8ab 2090 Perl_re_indentf( aTHX_ "%s",
cb41e5d6 2091 depth+1, "------:-----+-----------------\n" );
538e84ed 2092
3dab1dad
YO
2093 for( state=1 ; state < next_alloc ; state ++ ) {
2094 U16 charid;
538e84ed 2095
6ad9a8ab 2096 Perl_re_indentf( aTHX_ " %4"UVXf" :",
cb41e5d6 2097 depth+1, (UV)state );
3dab1dad 2098 if ( ! trie->states[ state ].wordnum ) {
6ad9a8ab 2099 Perl_re_printf( aTHX_ "%5s| ","");
3dab1dad 2100 } else {
6ad9a8ab 2101 Perl_re_printf( aTHX_ "W%4x| ",
3dab1dad
YO
2102 trie->states[ state ].wordnum
2103 );
2104 }
2105 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
538e84ed
KW
2106 SV ** const tmp = av_fetch( revcharmap,
2107 TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb 2108 if ( tmp ) {
6ad9a8ab 2109 Perl_re_printf( aTHX_ "%*s:%3X=%4"UVXf" | ",
ab3bbdeb 2110 colwidth,
538e84ed
KW
2111 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2112 colwidth,
2113 PL_colors[0], PL_colors[1],
2114 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2115 | PERL_PV_ESCAPE_FIRSTCHAR
ab3bbdeb 2116 ) ,
1e2e3d02
YO
2117 TRIE_LIST_ITEM(state,charid).forid,
2118 (UV)TRIE_LIST_ITEM(state,charid).newstate
2119 );
538e84ed 2120 if (!(charid % 10))
6ad9a8ab 2121 Perl_re_printf( aTHX_ "\n%*s| ",
664e119d 2122 (int)((depth * 2) + 14), "");
1e2e3d02 2123 }
ab3bbdeb 2124 }
6ad9a8ab 2125 Perl_re_printf( aTHX_ "\n");
3dab1dad 2126 }
538e84ed 2127}
3dab1dad
YO
2128
2129/*
3dab1dad 2130 Dumps a fully constructed but uncompressed trie in table form.
538e84ed
KW
2131 This is the normal DFA style state transition table, with a few
2132 twists to facilitate compression later.
3dab1dad
YO
2133 Used for debugging make_trie().
2134*/
2135STATIC void
55eed653 2136S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2137 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2138 U32 depth)
3dab1dad
YO
2139{
2140 U32 state;
2141 U16 charid;
ab3bbdeb 2142 SV *sv=sv_newmortal();
55eed653 2143 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2144 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2145
2146 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
538e84ed 2147
3dab1dad
YO
2148 /*
2149 print out the table precompression so that we can do a visual check
2150 that they are identical.
2151 */
538e84ed 2152
6ad9a8ab 2153 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
3dab1dad
YO
2154
2155 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 2156 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 2157 if ( tmp ) {
6ad9a8ab 2158 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2159 colwidth,
538e84ed 2160 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2161 PL_colors[0], PL_colors[1],
2162 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2163 PERL_PV_ESCAPE_FIRSTCHAR
2164 )
ab3bbdeb 2165 );
3dab1dad
YO
2166 }
2167 }
2168
6ad9a8ab 2169 Perl_re_printf( aTHX_ "\n%*sState+-",depth+1 );
3dab1dad
YO
2170
2171 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
6ad9a8ab 2172 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
3dab1dad
YO
2173 }
2174
6ad9a8ab 2175 Perl_re_printf( aTHX_ "\n" );
3dab1dad
YO
2176
2177 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2178
6ad9a8ab 2179 Perl_re_indentf( aTHX_ "%4"UVXf" : ",
cb41e5d6 2180 depth+1,
3dab1dad
YO
2181 (UV)TRIE_NODENUM( state ) );
2182
2183 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
2184 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2185 if (v)
6ad9a8ab 2186 Perl_re_printf( aTHX_ "%*"UVXf, colwidth, v );
ab3bbdeb 2187 else
6ad9a8ab 2188 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
3dab1dad
YO
2189 }
2190 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
6ad9a8ab 2191 Perl_re_printf( aTHX_ " (%4"UVXf")\n",
538e84ed 2192 (UV)trie->trans[ state ].check );
3dab1dad 2193 } else {
6ad9a8ab 2194 Perl_re_printf( aTHX_ " (%4"UVXf") W%4X\n",
538e84ed 2195 (UV)trie->trans[ state ].check,
3dab1dad
YO
2196 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2197 }
2198 }
07be1b83 2199}
3dab1dad
YO
2200
2201#endif
2202
2e64971a 2203
786e8c11
YO
2204/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2205 startbranch: the first branch in the whole branch sequence
2206 first : start branch of sequence of branch-exact nodes.
2207 May be the same as startbranch
2208 last : Thing following the last branch.
2209 May be the same as tail.
2210 tail : item following the branch sequence
2211 count : words in the sequence
a4525e78 2212 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
786e8c11 2213 depth : indent depth
3dab1dad 2214
786e8c11 2215Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 2216
786e8c11
YO
2217A trie is an N'ary tree where the branches are determined by digital
2218decomposition of the key. IE, at the root node you look up the 1st character and
2219follow that branch repeat until you find the end of the branches. Nodes can be
2220marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 2221
786e8c11 2222 /he|she|his|hers/
72f13be8 2223
786e8c11
YO
2224would convert into the following structure. Numbers represent states, letters
2225following numbers represent valid transitions on the letter from that state, if
2226the number is in square brackets it represents an accepting state, otherwise it
2227will be in parenthesis.
07be1b83 2228
786e8c11
YO
2229 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2230 | |
2231 | (2)
2232 | |
2233 (1) +-i->(6)-+-s->[7]
2234 |
2235 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 2236
786e8c11
YO
2237 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2238
2239This shows that when matching against the string 'hers' we will begin at state 1
2240read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2241then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2242is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2243single traverse. We store a mapping from accepting to state to which word was
2244matched, and then when we have multiple possibilities we try to complete the
b8fda935 2245rest of the regex in the order in which they occurred in the alternation.
786e8c11
YO
2246
2247The only prior NFA like behaviour that would be changed by the TRIE support is
2248the silent ignoring of duplicate alternations which are of the form:
2249
2250 / (DUPE|DUPE) X? (?{ ... }) Y /x
2251
4b714af6 2252Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 2253and without the optimisation. With the optimisations dupes will be silently
486ec47a 2254ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
2255the following demonstrates:
2256
2257 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2258
2259which prints out 'word' three times, but
2260
2261 'words'=~/(word|word|word)(?{ print $1 })S/
2262
2263which doesnt print it out at all. This is due to other optimisations kicking in.
2264
2265Example of what happens on a structural level:
2266
486ec47a 2267The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
2268
2269 1: CURLYM[1] {1,32767}(18)
2270 5: BRANCH(8)
2271 6: EXACT <ac>(16)
2272 8: BRANCH(11)
2273 9: EXACT <ad>(16)
2274 11: BRANCH(14)
2275 12: EXACT <ab>(16)
2276 16: SUCCEED(0)
2277 17: NOTHING(18)
2278 18: END(0)
2279
2280This would be optimizable with startbranch=5, first=5, last=16, tail=16
2281and should turn into:
2282
2283 1: CURLYM[1] {1,32767}(18)
2284 5: TRIE(16)
2285 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2286 <ac>
2287 <ad>
2288 <ab>
2289 16: SUCCEED(0)
2290 17: NOTHING(18)
2291 18: END(0)
2292
2293Cases where tail != last would be like /(?foo|bar)baz/:
2294
2295 1: BRANCH(4)
2296 2: EXACT <foo>(8)
2297 4: BRANCH(7)
2298 5: EXACT <bar>(8)
2299 7: TAIL(8)
2300 8: EXACT <baz>(10)
2301 10: END(0)
2302
2303which would be optimizable with startbranch=1, first=1, last=7, tail=8
2304and would end up looking like:
2305
2306 1: TRIE(8)
2307 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2308 <foo>
2309 <bar>
2310 7: TAIL(8)
2311 8: EXACT <baz>(10)
2312 10: END(0)
2313
c80e42f3 2314 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
2315
2316is the recommended Unicode-aware way of saying
2317
2318 *(d++) = uv;
2319*/
2320
fab2782b 2321#define TRIE_STORE_REVCHAR(val) \
786e8c11 2322 STMT_START { \
73031816 2323 if (UTF) { \
668fcfea 2324 SV *zlopp = newSV(UTF8_MAXBYTES); \
88c9ea1e 2325 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
c80e42f3 2326 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
73031816
NC
2327 SvCUR_set(zlopp, kapow - flrbbbbb); \
2328 SvPOK_on(zlopp); \
2329 SvUTF8_on(zlopp); \
2330 av_push(revcharmap, zlopp); \
2331 } else { \
fab2782b 2332 char ooooff = (char)val; \
73031816
NC
2333 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2334 } \
2335 } STMT_END
786e8c11 2336
914a25d5
KW
2337/* This gets the next character from the input, folding it if not already
2338 * folded. */
2339#define TRIE_READ_CHAR STMT_START { \
2340 wordlen++; \
2341 if ( UTF ) { \
2342 /* if it is UTF then it is either already folded, or does not need \
2343 * folding */ \
1c1d615a 2344 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
2345 } \
2346 else if (folder == PL_fold_latin1) { \
7d006b13
KW
2347 /* This folder implies Unicode rules, which in the range expressible \
2348 * by not UTF is the lower case, with the two exceptions, one of \
2349 * which should have been taken care of before calling this */ \
2350 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2351 uvc = toLOWER_L1(*uc); \
2352 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2353 len = 1; \
914a25d5
KW
2354 } else { \
2355 /* raw data, will be folded later if needed */ \
2356 uvc = (U32)*uc; \
2357 len = 1; \
2358 } \
786e8c11
YO
2359} STMT_END
2360
2361
2362
2363#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2364 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
2365 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
2366 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
2367 } \
2368 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2369 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2370 TRIE_LIST_CUR( state )++; \
2371} STMT_END
07be1b83 2372
786e8c11
YO
2373#define TRIE_LIST_NEW(state) STMT_START { \
2374 Newxz( trie->states[ state ].trans.list, \
2375 4, reg_trie_trans_le ); \
2376 TRIE_LIST_CUR( state ) = 1; \
2377 TRIE_LIST_LEN( state ) = 4; \
2378} STMT_END
07be1b83 2379
786e8c11
YO
2380#define TRIE_HANDLE_WORD(state) STMT_START { \
2381 U16 dupe= trie->states[ state ].wordnum; \
2382 regnode * const noper_next = regnext( noper ); \
2383 \
786e8c11
YO
2384 DEBUG_r({ \
2385 /* store the word for dumping */ \
2386 SV* tmp; \
2387 if (OP(noper) != NOTHING) \
740cce10 2388 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 2389 else \
740cce10 2390 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 2391 av_push( trie_words, tmp ); \
786e8c11
YO
2392 }); \
2393 \
2394 curword++; \
2e64971a
DM
2395 trie->wordinfo[curword].prev = 0; \
2396 trie->wordinfo[curword].len = wordlen; \
2397 trie->wordinfo[curword].accept = state; \
786e8c11
YO
2398 \
2399 if ( noper_next < tail ) { \
2400 if (!trie->jump) \
538e84ed
KW
2401 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2402 sizeof(U16) ); \
7f69552c 2403 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
2404 if (!jumper) \
2405 jumper = noper_next; \
2406 if (!nextbranch) \
2407 nextbranch= regnext(cur); \
2408 } \
2409 \
2410 if ( dupe ) { \
2e64971a
DM
2411 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2412 /* chain, so that when the bits of chain are later */\
2413 /* linked together, the dups appear in the chain */\
2414 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2415 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
2416 } else { \
2417 /* we haven't inserted this word yet. */ \
2418 trie->states[ state ].wordnum = curword; \
2419 } \
2420} STMT_END
07be1b83 2421
3dab1dad 2422
786e8c11
YO
2423#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2424 ( ( base + charid >= ucharcount \
2425 && base + charid < ubound \
2426 && state == trie->trans[ base - ucharcount + charid ].check \
2427 && trie->trans[ base - ucharcount + charid ].next ) \
2428 ? trie->trans[ base - ucharcount + charid ].next \
2429 : ( state==1 ? special : 0 ) \
2430 )
3dab1dad 2431
786e8c11
YO
2432#define MADE_TRIE 1
2433#define MADE_JUMP_TRIE 2
2434#define MADE_EXACT_TRIE 4
3dab1dad 2435
a3621e74 2436STATIC I32
538e84ed
KW
2437S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2438 regnode *first, regnode *last, regnode *tail,
2439 U32 word_count, U32 flags, U32 depth)
a3621e74
YO
2440{
2441 /* first pass, loop through and scan words */
2442 reg_trie_data *trie;
55eed653 2443 HV *widecharmap = NULL;
2b8b4781 2444 AV *revcharmap = newAV();
a3621e74 2445 regnode *cur;
a3621e74
YO
2446 STRLEN len = 0;
2447 UV uvc = 0;
2448 U16 curword = 0;
2449 U32 next_alloc = 0;
786e8c11
YO
2450 regnode *jumper = NULL;
2451 regnode *nextbranch = NULL;
7f69552c 2452 regnode *convert = NULL;
2e64971a 2453 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 2454 /* we just use folder as a flag in utf8 */
1e696034 2455 const U8 * folder = NULL;
a3621e74 2456
2b8b4781 2457#ifdef DEBUGGING
cf78de0b 2458 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2b8b4781
NC
2459 AV *trie_words = NULL;
2460 /* along with revcharmap, this only used during construction but both are
2461 * useful during debugging so we store them in the struct when debugging.
8e11feef 2462 */
2b8b4781 2463#else
cf78de0b 2464 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 2465 STRLEN trie_charcount=0;
3dab1dad 2466#endif
2b8b4781 2467 SV *re_trie_maxbuff;
a3621e74 2468 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2469
2470 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
2471#ifndef DEBUGGING
2472 PERL_UNUSED_ARG(depth);
2473#endif
a3621e74 2474
1e696034 2475 switch (flags) {
a4525e78 2476 case EXACT: case EXACTL: break;
2f7f8cb1 2477 case EXACTFA:
fab2782b 2478 case EXACTFU_SS:
a4525e78
KW
2479 case EXACTFU:
2480 case EXACTFLU8: folder = PL_fold_latin1; break;
1e696034 2481 case EXACTF: folder = PL_fold; break;
fab2782b 2482 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
2483 }
2484
c944940b 2485 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 2486 trie->refcount = 1;
3dab1dad 2487 trie->startstate = 1;
786e8c11 2488 trie->wordcount = word_count;
f8fc2ecf 2489 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 2490 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
a4525e78 2491 if (flags == EXACT || flags == EXACTL)
c944940b 2492 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
2493 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2494 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2495
a3621e74 2496 DEBUG_r({
2b8b4781 2497 trie_words = newAV();
a3621e74 2498 });
a3621e74 2499
0111c4fd 2500 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
316ebaf2 2501 assert(re_trie_maxbuff);
a3621e74 2502 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 2503 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 2504 }
df826430 2505 DEBUG_TRIE_COMPILE_r({
6ad9a8ab 2506 Perl_re_indentf( aTHX_
cb41e5d6
YO
2507 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2508 depth+1,
538e84ed
KW
2509 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2510 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
3dab1dad 2511 });
538e84ed 2512
7f69552c
YO
2513 /* Find the node we are going to overwrite */
2514 if ( first == startbranch && OP( last ) != BRANCH ) {
2515 /* whole branch chain */
2516 convert = first;
2517 } else {
2518 /* branch sub-chain */
2519 convert = NEXTOPER( first );
2520 }
538e84ed 2521
a3621e74
YO
2522 /* -- First loop and Setup --
2523
2524 We first traverse the branches and scan each word to determine if it
2525 contains widechars, and how many unique chars there are, this is
2526 important as we have to build a table with at least as many columns as we
2527 have unique chars.
2528
2529 We use an array of integers to represent the character codes 0..255
538e84ed
KW
2530 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2531 the native representation of the character value as the key and IV's for
2532 the coded index.
a3621e74
YO
2533
2534 *TODO* If we keep track of how many times each character is used we can
2535 remap the columns so that the table compression later on is more
3b753521 2536 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
2537 middle and the least common are on the outside. IMO this would be better
2538 than a most to least common mapping as theres a decent chance the most
2539 common letter will share a node with the least common, meaning the node
486ec47a 2540 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
2541 case is when we have the least common nodes twice.
2542
2543 */
2544
a3621e74 2545 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 2546 regnode *noper = NEXTOPER( cur );
944e05e3
YO
2547 const U8 *uc;
2548 const U8 *e;
bc031a7d 2549 int foldlen = 0;
07be1b83 2550 U32 wordlen = 0; /* required init */
bc031a7d
KW
2551 STRLEN minchars = 0;
2552 STRLEN maxchars = 0;
538e84ed
KW
2553 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2554 bitmap?*/
a3621e74 2555
3dab1dad 2556 if (OP(noper) == NOTHING) {
df826430 2557 regnode *noper_next= regnext(noper);
944e05e3
YO
2558 if (noper_next < tail)
2559 noper= noper_next;
2560 }
2561
2562 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2563 uc= (U8*)STRING(noper);
2564 e= uc + STR_LEN(noper);
2565 } else {
2566 trie->minlen= 0;
2567 continue;
3dab1dad 2568 }
df826430 2569
944e05e3 2570
fab2782b 2571 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2572 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2573 regardless of encoding */
fab2782b
YO
2574 if (OP( noper ) == EXACTFU_SS) {
2575 /* false positives are ok, so just set this */
0dc4a61d 2576 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2577 }
2578 }
bc031a7d
KW
2579 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2580 branch */
3dab1dad 2581 TRIE_CHARCOUNT(trie)++;
a3621e74 2582 TRIE_READ_CHAR;
645de4ce 2583
bc031a7d
KW
2584 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2585 * is in effect. Under /i, this character can match itself, or
2586 * anything that folds to it. If not under /i, it can match just
2587 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2588 * all fold to k, and all are single characters. But some folds
2589 * expand to more than one character, so for example LATIN SMALL
2590 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2591 * the string beginning at 'uc' is 'ffi', it could be matched by
2592 * three characters, or just by the one ligature character. (It
2593 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2594 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2595 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2596 * match.) The trie needs to know the minimum and maximum number
2597 * of characters that could match so that it can use size alone to
2598 * quickly reject many match attempts. The max is simple: it is
2599 * the number of folded characters in this branch (since a fold is
2600 * never shorter than what folds to it. */
2601
2602 maxchars++;
2603
2604 /* And the min is equal to the max if not under /i (indicated by
2605 * 'folder' being NULL), or there are no multi-character folds. If
2606 * there is a multi-character fold, the min is incremented just
2607 * once, for the character that folds to the sequence. Each
2608 * character in the sequence needs to be added to the list below of
2609 * characters in the trie, but we count only the first towards the
2610 * min number of characters needed. This is done through the
2611 * variable 'foldlen', which is returned by the macros that look
2612 * for these sequences as the number of bytes the sequence
2613 * occupies. Each time through the loop, we decrement 'foldlen' by
2614 * how many bytes the current char occupies. Only when it reaches
2615 * 0 do we increment 'minchars' or look for another multi-character
2616 * sequence. */
2617 if (folder == NULL) {
2618 minchars++;
2619 }
2620 else if (foldlen > 0) {
2621 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
645de4ce
KW
2622 }
2623 else {
bc031a7d
KW
2624 minchars++;
2625
2626 /* See if *uc is the beginning of a multi-character fold. If
2627 * so, we decrement the length remaining to look at, to account
2628 * for the current character this iteration. (We can use 'uc'
2629 * instead of the fold returned by TRIE_READ_CHAR because for
2630 * non-UTF, the latin1_safe macro is smart enough to account
2631 * for all the unfolded characters, and because for UTF, the
2632 * string will already have been folded earlier in the
2633 * compilation process */
2634 if (UTF) {
2635 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2636 foldlen -= UTF8SKIP(uc);
645de4ce
KW
2637 }
2638 }
bc031a7d
KW
2639 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2640 foldlen--;
2641 }
645de4ce 2642 }
bc031a7d
KW
2643
2644 /* The current character (and any potential folds) should be added
2645 * to the possible matching characters for this position in this
2646 * branch */
a3621e74 2647 if ( uvc < 256 ) {
fab2782b
YO
2648 if ( folder ) {
2649 U8 folded= folder[ (U8) uvc ];
2650 if ( !trie->charmap[ folded ] ) {
2651 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2652 TRIE_STORE_REVCHAR( folded );
2653 }
2654 }
a3621e74
YO
2655 if ( !trie->charmap[ uvc ] ) {
2656 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2657 TRIE_STORE_REVCHAR( uvc );
a3621e74 2658 }
02daf0ab 2659 if ( set_bit ) {
62012aee
KW
2660 /* store the codepoint in the bitmap, and its folded
2661 * equivalent. */
fab2782b 2662 TRIE_BITMAP_SET(trie, uvc);
0921ee73
T
2663
2664 /* store the folded codepoint */
fab2782b 2665 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
0921ee73
T
2666
2667 if ( !UTF ) {
2668 /* store first byte of utf8 representation of
acdf4139 2669 variant codepoints */
6f2d5cbc 2670 if (! UVCHR_IS_INVARIANT(uvc)) {
acdf4139 2671 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
2672 }
2673 }
02daf0ab
YO
2674 set_bit = 0; /* We've done our bit :-) */
2675 }
a3621e74 2676 } else {
bc031a7d
KW
2677
2678 /* XXX We could come up with the list of code points that fold
2679 * to this using PL_utf8_foldclosures, except not for
2680 * multi-char folds, as there may be multiple combinations
2681 * there that could work, which needs to wait until runtime to
2682 * resolve (The comment about LIGATURE FFI above is such an
2683 * example */
2684
a3621e74 2685 SV** svpp;
55eed653
NC
2686 if ( !widecharmap )
2687 widecharmap = newHV();
a3621e74 2688
55eed653 2689 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
2690
2691 if ( !svpp )
e4584336 2692 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
2693
2694 if ( !SvTRUE( *svpp ) ) {
2695 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 2696 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
2697 }
2698 }
bc031a7d
KW
2699 } /* end loop through characters in this branch of the trie */
2700
2701 /* We take the min and max for this branch and combine to find the min
2702 * and max for all branches processed so far */
3dab1dad 2703 if( cur == first ) {
bc031a7d
KW
2704 trie->minlen = minchars;
2705 trie->maxlen = maxchars;
2706 } else if (minchars < trie->minlen) {
2707 trie->minlen = minchars;
2708 } else if (maxchars > trie->maxlen) {
2709 trie->maxlen = maxchars;
fab2782b 2710 }
a3621e74
YO
2711 } /* end first pass */
2712 DEBUG_TRIE_COMPILE_r(
6ad9a8ab 2713 Perl_re_indentf( aTHX_
cb41e5d6
YO
2714 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2715 depth+1,
55eed653 2716 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
2717 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2718 (int)trie->minlen, (int)trie->maxlen )
a3621e74 2719 );
a3621e74
YO
2720
2721 /*
2722 We now know what we are dealing with in terms of unique chars and
2723 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
2724 representation using a flat table will take. If it's over a reasonable
2725 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
2726 conservative but potentially much slower representation using an array
2727 of lists.
2728
2729 At the end we convert both representations into the same compressed
2730 form that will be used in regexec.c for matching with. The latter
2731 is a form that cannot be used to construct with but has memory
2732 properties similar to the list form and access properties similar
2733 to the table form making it both suitable for fast searches and
2734 small enough that its feasable to store for the duration of a program.
2735
2736 See the comment in the code where the compressed table is produced
2737 inplace from the flat tabe representation for an explanation of how
2738 the compression works.
2739
2740 */
2741
2742
2e64971a
DM
2743 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2744 prev_states[1] = 0;
2745
538e84ed
KW
2746 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2747 > SvIV(re_trie_maxbuff) )
2748 {
a3621e74
YO
2749 /*
2750 Second Pass -- Array Of Lists Representation
2751
2752 Each state will be represented by a list of charid:state records
2753 (reg_trie_trans_le) the first such element holds the CUR and LEN
2754 points of the allocated array. (See defines above).
2755
2756 We build the initial structure using the lists, and then convert
2757 it into the compressed table form which allows faster lookups
2758 (but cant be modified once converted).
a3621e74
YO
2759 */
2760
a3621e74
YO
2761 STRLEN transcount = 1;
2762
6ad9a8ab 2763 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
cb41e5d6 2764 depth+1));
686b73d4 2765
c944940b
JH
2766 trie->states = (reg_trie_state *)
2767 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2768 sizeof(reg_trie_state) );
a3621e74
YO
2769 TRIE_LIST_NEW(1);
2770 next_alloc = 2;
2771
2772 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2773
df826430 2774 regnode *noper = NEXTOPER( cur );
c445ea15
AL
2775 U32 state = 1; /* required init */
2776 U16 charid = 0; /* sanity init */
07be1b83 2777 U32 wordlen = 0; /* required init */
c445ea15 2778
df826430
YO
2779 if (OP(noper) == NOTHING) {
2780 regnode *noper_next= regnext(noper);
944e05e3
YO
2781 if (noper_next < tail)
2782 noper= noper_next;
df826430
YO
2783 }
2784
944e05e3
YO
2785 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2786 const U8 *uc= (U8*)STRING(noper);
2787 const U8 *e= uc + STR_LEN(noper);
2788
786e8c11 2789 for ( ; uc < e ; uc += len ) {
c445ea15 2790
786e8c11 2791 TRIE_READ_CHAR;
c445ea15 2792
786e8c11
YO
2793 if ( uvc < 256 ) {
2794 charid = trie->charmap[ uvc ];
c445ea15 2795 } else {
538e84ed
KW
2796 SV** const svpp = hv_fetch( widecharmap,
2797 (char*)&uvc,
2798 sizeof( UV ),
2799 0);
786e8c11
YO
2800 if ( !svpp ) {
2801 charid = 0;
2802 } else {
2803 charid=(U16)SvIV( *svpp );
2804 }
c445ea15 2805 }
538e84ed
KW
2806 /* charid is now 0 if we dont know the char read, or
2807 * nonzero if we do */
786e8c11 2808 if ( charid ) {
a3621e74 2809
786e8c11
YO
2810 U16 check;
2811 U32 newstate = 0;
a3621e74 2812
786e8c11
YO
2813 charid--;
2814 if ( !trie->states[ state ].trans.list ) {
2815 TRIE_LIST_NEW( state );
c445ea15 2816 }
538e84ed
KW
2817 for ( check = 1;
2818 check <= TRIE_LIST_USED( state );
2819 check++ )
2820 {
2821 if ( TRIE_LIST_ITEM( state, check ).forid
2822 == charid )
2823 {
786e8c11
YO
2824 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2825 break;
2826 }
2827 }
2828 if ( ! newstate ) {
2829 newstate = next_alloc++;
2e64971a 2830 prev_states[newstate] = state;
786e8c11
YO
2831 TRIE_LIST_PUSH( state, charid, newstate );
2832 transcount++;
2833 }
2834 state = newstate;
2835 } else {
2836 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 2837 }
a28509cc 2838 }
c445ea15 2839 }
3dab1dad 2840 TRIE_HANDLE_WORD(state);
a3621e74
YO
2841
2842 } /* end second pass */
2843
1e2e3d02 2844 /* next alloc is the NEXT state to be allocated */
538e84ed 2845 trie->statecount = next_alloc;
c944940b
JH
2846 trie->states = (reg_trie_state *)
2847 PerlMemShared_realloc( trie->states,
2848 next_alloc
2849 * sizeof(reg_trie_state) );
a3621e74 2850
3dab1dad 2851 /* and now dump it out before we compress it */
2b8b4781
NC
2852 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2853 revcharmap, next_alloc,
2854 depth+1)
1e2e3d02 2855 );
a3621e74 2856
c944940b
JH
2857 trie->trans = (reg_trie_trans *)
2858 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
2859 {
2860 U32 state;
a3621e74
YO
2861 U32 tp = 0;
2862 U32 zp = 0;
2863
2864
2865 for( state=1 ; state < next_alloc ; state ++ ) {
2866 U32 base=0;
2867
2868 /*
2869 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 2870 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
a3621e74
YO
2871 );
2872 */
2873
2874 if (trie->states[state].trans.list) {
2875 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2876 U16 maxid=minid;
a28509cc 2877 U16 idx;
a3621e74
YO
2878
2879 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
2880 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2881 if ( forid < minid ) {
2882 minid=forid;
2883 } else if ( forid > maxid ) {
2884 maxid=forid;
2885 }
a3621e74
YO
2886 }
2887 if ( transcount < tp + maxid - minid + 1) {
2888 transcount *= 2;
c944940b
JH
2889 trie->trans = (reg_trie_trans *)
2890 PerlMemShared_realloc( trie->trans,
446bd890
NC
2891 transcount
2892 * sizeof(reg_trie_trans) );
538e84ed
KW
2893 Zero( trie->trans + (transcount / 2),
2894 transcount / 2,
2895 reg_trie_trans );
a3621e74
YO
2896 }
2897 base = trie->uniquecharcount + tp - minid;
2898 if ( maxid == minid ) {
2899 U32 set = 0;
2900 for ( ; zp < tp ; zp++ ) {
2901 if ( ! trie->trans[ zp ].next ) {
2902 base = trie->uniquecharcount + zp - minid;
538e84ed
KW
2903 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2904 1).newstate;
a3621e74
YO
2905 trie->trans[ zp ].check = state;
2906 set = 1;
2907 break;
2908 }
2909 }
2910 if ( !set ) {
538e84ed
KW
2911 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2912 1).newstate;
a3621e74
YO
2913 trie->trans[ tp ].check = state;
2914 tp++;
2915 zp = tp;
2916 }
2917 } else {
2918 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
538e84ed
KW
2919 const U32 tid = base
2920 - trie->uniquecharcount
2921 + TRIE_LIST_ITEM( state, idx ).forid;
2922 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2923 idx ).newstate;
a3621e74
YO
2924 trie->trans[ tid ].check = state;
2925 }
2926 tp += ( maxid - minid + 1 );
2927 }
2928 Safefree(trie->states[ state ].trans.list);
2929 }
2930 /*
2931 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 2932 Perl_re_printf( aTHX_ " base: %d\n",base);
a3621e74
YO
2933 );
2934 */
2935 trie->states[ state ].trans.base=base;
2936 }
cc601c31 2937 trie->lasttrans = tp + 1;
a3621e74
YO
2938 }
2939 } else {
2940 /*
2941 Second Pass -- Flat Table Representation.
2942
b423522f
KW
2943 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2944 each. We know that we will need Charcount+1 trans at most to store
2945 the data (one row per char at worst case) So we preallocate both
2946 structures assuming worst case.
a3621e74
YO
2947
2948 We then construct the trie using only the .next slots of the entry
2949 structs.
2950
b423522f
KW
2951 We use the .check field of the first entry of the node temporarily
2952 to make compression both faster and easier by keeping track of how
2953 many non zero fields are in the node.
a3621e74
YO
2954
2955 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2956 transition.
2957
b423522f
KW
2958 There are two terms at use here: state as a TRIE_NODEIDX() which is
2959 a number representing the first entry of the node, and state as a
2960 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2961 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2962 if there are 2 entrys per node. eg:
a3621e74
YO
2963
2964 A B A B
2965 1. 2 4 1. 3 7
2966 2. 0 3 3. 0 5
2967 3. 0 0 5. 0 0
2968 4. 0 0 7. 0 0
2969
b423522f
KW
2970 The table is internally in the right hand, idx form. However as we
2971 also have to deal with the states array which is indexed by nodenum
2972 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
2973
2974 */
6ad9a8ab 2975 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
cb41e5d6 2976 depth+1));
3dab1dad 2977
c944940b
JH
2978 trie->trans = (reg_trie_trans *)
2979 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2980 * trie->uniquecharcount + 1,
2981 sizeof(reg_trie_trans) );
2982 trie->states = (reg_trie_state *)
2983 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2984 sizeof(reg_trie_state) );
a3621e74
YO
2985 next_alloc = trie->uniquecharcount + 1;
2986
3dab1dad 2987
a3621e74
YO
2988 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2989
df826430 2990 regnode *noper = NEXTOPER( cur );
a3621e74
YO
2991
2992 U32 state = 1; /* required init */
2993
2994 U16 charid = 0; /* sanity init */
2995 U32 accept_state = 0; /* sanity init */
a3621e74 2996
07be1b83 2997 U32 wordlen = 0; /* required init */
a3621e74 2998
df826430
YO
2999 if (OP(noper) == NOTHING) {
3000 regnode *noper_next= regnext(noper);
944e05e3
YO
3001 if (noper_next < tail)
3002 noper= noper_next;
df826430 3003 }
fab2782b 3004
944e05e3
YO
3005 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
3006 const U8 *uc= (U8*)STRING(noper);
3007 const U8 *e= uc + STR_LEN(noper);
3008
786e8c11 3009 for ( ; uc < e ; uc += len ) {
a3621e74 3010
786e8c11 3011 TRIE_READ_CHAR;
a3621e74 3012
786e8c11
YO
3013 if ( uvc < 256 ) {
3014 charid = trie->charmap[ uvc ];
3015 } else {
538e84ed
KW
3016 SV* const * const svpp = hv_fetch( widecharmap,
3017 (char*)&uvc,
3018 sizeof( UV ),
3019 0);
786e8c11 3020 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 3021 }
786e8c11
YO
3022 if ( charid ) {
3023 charid--;
3024 if ( !trie->trans[ state + charid ].next ) {
3025 trie->trans[ state + charid ].next = next_alloc;
3026 trie->trans[ state ].check++;
2e64971a
DM
3027 prev_states[TRIE_NODENUM(next_alloc)]
3028 = TRIE_NODENUM(state);
786e8c11
YO
3029 next_alloc += trie->uniquecharcount;
3030 }
3031 state = trie->trans[ state + charid ].next;
3032 } else {
3033 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
3034 }
538e84ed
KW
3035 /* charid is now 0 if we dont know the char read, or
3036 * nonzero if we do */
a3621e74 3037 }
a3621e74 3038 }
3dab1dad
YO
3039 accept_state = TRIE_NODENUM( state );
3040 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
3041
3042 } /* end second pass */
3043
3dab1dad 3044 /* and now dump it out before we compress it */
2b8b4781
NC
3045 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3046 revcharmap,
3047 next_alloc, depth+1));
a3621e74 3048
a3621e74
YO
3049 {
3050 /*
3051 * Inplace compress the table.*
3052
3053 For sparse data sets the table constructed by the trie algorithm will
3054 be mostly 0/FAIL transitions or to put it another way mostly empty.
3055 (Note that leaf nodes will not contain any transitions.)
3056
3057 This algorithm compresses the tables by eliminating most such
3058 transitions, at the cost of a modest bit of extra work during lookup:
3059
3060 - Each states[] entry contains a .base field which indicates the
3061 index in the state[] array wheres its transition data is stored.
3062
3b753521 3063 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
3064
3065 - If .base is nonzero then charid is added to it to find an entry in
3066 the trans array.
3067
3068 -If trans[states[state].base+charid].check!=state then the
3069 transition is taken to be a 0/Fail transition. Thus if there are fail
3070 transitions at the front of the node then the .base offset will point
3071 somewhere inside the previous nodes data (or maybe even into a node
3072 even earlier), but the .check field determines if the transition is
3073 valid.
3074
786e8c11 3075 XXX - wrong maybe?
a3621e74 3076 The following process inplace converts the table to the compressed
3b753521 3077 table: We first do not compress the root node 1,and mark all its
a3621e74 3078 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
3079 allows us to do a DFA construction from the compressed table later,
3080 and ensures that any .base pointers we calculate later are greater
3081 than 0.
a3621e74
YO
3082
3083 - We set 'pos' to indicate the first entry of the second node.
3084
3085 - We then iterate over the columns of the node, finding the first and
3086 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3087 and set the .check pointers accordingly, and advance pos
3088 appropriately and repreat for the next node. Note that when we copy
3089 the next pointers we have to convert them from the original
3090 NODEIDX form to NODENUM form as the former is not valid post
3091 compression.
3092
3093 - If a node has no transitions used we mark its base as 0 and do not
3094 advance the pos pointer.
3095
3096 - If a node only has one transition we use a second pointer into the
3097 structure to fill in allocated fail transitions from other states.
3098 This pointer is independent of the main pointer and scans forward
3099 looking for null transitions that are allocated to a state. When it
3100 finds one it writes the single transition into the "hole". If the
786e8c11 3101 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
3102
3103 - Once compressed we can Renew/realloc the structures to release the
3104 excess space.
3105
3106 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3107 specifically Fig 3.47 and the associated pseudocode.
3108
3109 demq
3110 */
a3b680e6 3111 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 3112 U32 state, charid;
a3621e74 3113 U32 pos = 0, zp=0;
1e2e3d02 3114 trie->statecount = laststate;
a3621e74
YO
3115
3116 for ( state = 1 ; state < laststate ; state++ ) {
3117 U8 flag = 0;
a28509cc
AL
3118 const U32 stateidx = TRIE_NODEIDX( state );
3119 const U32 o_used = trie->trans[ stateidx ].check;
3120 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
3121 trie->trans[ stateidx ].check = 0;
3122
538e84ed
KW
3123 for ( charid = 0;
3124 used && charid < trie->uniquecharcount;
3125 charid++ )
3126 {
a3621e74
YO
3127 if ( flag || trie->trans[ stateidx + charid ].next ) {
3128 if ( trie->trans[ stateidx + charid ].next ) {
3129 if (o_used == 1) {
3130 for ( ; zp < pos ; zp++ ) {
3131 if ( ! trie->trans[ zp ].next ) {
3132 break;
3133 }
3134 }
538e84ed
KW
3135 trie->states[ state ].trans.base
3136 = zp
3137 + trie->uniquecharcount
3138 - charid ;
3139 trie->trans[ zp ].next
3140 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3141 + charid ].next );
a3621e74
YO
3142 trie->trans[ zp ].check = state;
3143 if ( ++zp > pos ) pos = zp;
3144 break;
3145 }
3146 used--;
3147 }
3148 if ( !flag ) {
3149 flag = 1;
538e84ed
KW
3150 trie->states[ state ].trans.base
3151 = pos + trie->uniquecharcount - charid ;
a3621e74 3152 }
538e84ed
KW
3153 trie->trans[ pos ].next
3154 = SAFE_TRIE_NODENUM(
3155 trie->trans[ stateidx + charid ].next );
a3621e74
YO
3156 trie->trans[ pos ].check = state;
3157 pos++;
3158 }
3159 }
3160 }
cc601c31 3161 trie->lasttrans = pos + 1;
c944940b
JH
3162 trie->states = (reg_trie_state *)
3163 PerlMemShared_realloc( trie->states, laststate
3164 * sizeof(reg_trie_state) );
a3621e74 3165 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3166 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
cb41e5d6 3167 depth+1,
538e84ed
KW
3168 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3169 + 1 ),
3170 (IV)next_alloc,
3171 (IV)pos,
3172 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
a3621e74
YO
3173 );
3174
3175 } /* end table compress */
3176 }
1e2e3d02 3177 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3178 Perl_re_indentf( aTHX_ "Statecount:%"UVxf" Lasttrans:%"UVxf"\n",
cb41e5d6 3179 depth+1,
1e2e3d02
YO
3180 (UV)trie->statecount,
3181 (UV)trie->lasttrans)
3182 );
cc601c31 3183 /* resize the trans array to remove unused space */
c944940b
JH
3184 trie->trans = (reg_trie_trans *)
3185 PerlMemShared_realloc( trie->trans, trie->lasttrans
3186 * sizeof(reg_trie_trans) );
a3621e74 3187
538e84ed 3188 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
3189 U8 nodetype =(U8)(flags & 0xFF);
3190 char *str=NULL;
538e84ed 3191
07be1b83 3192#ifdef DEBUGGING
e62cc96a 3193 regnode *optimize = NULL;
7122b237
YO
3194#ifdef RE_TRACK_PATTERN_OFFSETS
3195
b57a0404
JH
3196 U32 mjd_offset = 0;
3197 U32 mjd_nodelen = 0;
7122b237
YO
3198#endif /* RE_TRACK_PATTERN_OFFSETS */
3199#endif /* DEBUGGING */
a3621e74 3200 /*
3dab1dad
YO
3201 This means we convert either the first branch or the first Exact,
3202 depending on whether the thing following (in 'last') is a branch
3203 or not and whther first is the startbranch (ie is it a sub part of
3204 the alternation or is it the whole thing.)
3b753521 3205 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 3206 the whole branch sequence, including the first.
a3621e74 3207 */
3dab1dad 3208 /* Find the node we are going to overwrite */
7f69552c 3209 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 3210 /* branch sub-chain */
3dab1dad 3211 NEXT_OFF( first ) = (U16)(last - first);
7122b237 3212#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
3213 DEBUG_r({
3214 mjd_offset= Node_Offset((convert));
3215 mjd_nodelen= Node_Length((convert));
3216 });
7122b237 3217#endif
7f69552c 3218 /* whole branch chain */
7122b237
YO
3219 }
3220#ifdef RE_TRACK_PATTERN_OFFSETS
3221 else {
7f69552c
YO
3222 DEBUG_r({
3223 const regnode *nop = NEXTOPER( convert );
3224 mjd_offset= Node_Offset((nop));
3225 mjd_nodelen= Node_Length((nop));
3226 });
07be1b83
YO
3227 }
3228 DEBUG_OPTIMISE_r(
6ad9a8ab 3229 Perl_re_indentf( aTHX_ "MJD offset:%"UVuf" MJD length:%"UVuf"\n",
cb41e5d6 3230 depth+1,
786e8c11 3231 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 3232 );
7122b237 3233#endif
538e84ed 3234 /* But first we check to see if there is a common prefix we can
3dab1dad
YO
3235 split out as an EXACT and put in front of the TRIE node. */
3236 trie->startstate= 1;
55eed653 3237 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 3238 U32 state;
1e2e3d02 3239 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 3240 U32 ofs = 0;
8e11feef
RGS
3241 I32 idx = -1;
3242 U32 count = 0;
3243 const U32 base = trie->states[ state ].trans.base;
a3621e74 3244
3dab1dad 3245 if ( trie->states[state].wordnum )
8e11feef 3246 count = 1;
a3621e74 3247
8e11feef 3248 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
3249 if ( ( base + ofs >= trie->uniquecharcount ) &&
3250 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
3251 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3252 {
3dab1dad 3253 if ( ++count > 1 ) {
2b8b4781 3254 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 3255 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 3256 if ( state == 1 ) break;
3dab1dad
YO
3257 if ( count == 2 ) {
3258 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3259 DEBUG_OPTIMISE_r(
6ad9a8ab 3260 Perl_re_indentf( aTHX_ "New Start State=%"UVuf" Class: [",
cb41e5d6 3261 depth+1,
786e8c11 3262 (UV)state));
be8e71aa 3263 if (idx >= 0) {
2b8b4781 3264 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 3265 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 3266
3dab1dad 3267 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
3268 if ( folder )
3269 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 3270 DEBUG_OPTIMISE_r(
6ad9a8ab 3271 Perl_re_printf( aTHX_ "%s", (char*)ch)
3dab1dad 3272 );
8e11feef
RGS
3273 }
3274 }
3275 TRIE_BITMAP_SET(trie,*ch);
3276 if ( folder )
3277 TRIE_BITMAP_SET(trie,folder[ *ch ]);
6ad9a8ab 3278 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
8e11feef
RGS
3279 }
3280 idx = ofs;
3281 }
3dab1dad
YO
3282 }
3283 if ( count == 1 ) {
2b8b4781 3284 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
3285 STRLEN len;
3286 char *ch = SvPV( *tmp, len );
de734bd5
A
3287 DEBUG_OPTIMISE_r({
3288 SV *sv=sv_newmortal();
6ad9a8ab 3289 Perl_re_indentf( aTHX_ "Prefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
cb41e5d6 3290 depth+1,
538e84ed
KW
3291 (UV)state, (UV)idx,
3292 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
de734bd5
A
3293 PL_colors[0], PL_colors[1],
3294 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed 3295 PERL_PV_ESCAPE_FIRSTCHAR
de734bd5
A
3296 )
3297 );
3298 });
3dab1dad
YO
3299 if ( state==1 ) {
3300 OP( convert ) = nodetype;
3301 str=STRING(convert);
3302 STR_LEN(convert)=0;
3303 }
c490c714
YO
3304 STR_LEN(convert) += len;
3305 while (len--)
de734bd5 3306 *str++ = *ch++;
8e11feef 3307 } else {
538e84ed 3308#ifdef DEBUGGING
8e11feef 3309 if (state>1)
6ad9a8ab 3310 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
f9049ba1 3311#endif
8e11feef
RGS
3312 break;
3313 }
3314 }
2e64971a 3315 trie->prefixlen = (state-1);
3dab1dad 3316 if (str) {
8e11feef 3317 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 3318 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 3319 trie->startstate = state;
07be1b83
YO
3320 trie->minlen -= (state - 1);
3321 trie->maxlen -= (state - 1);
33809eae
JH
3322#ifdef DEBUGGING
3323 /* At least the UNICOS C compiler choked on this
3324 * being argument to DEBUG_r(), so let's just have
3325 * it right here. */
3326 if (
3327#ifdef PERL_EXT_RE_BUILD
3328 1
3329#else
3330 DEBUG_r_TEST
3331#endif
3332 ) {
3333 regnode *fix = convert;
3334 U32 word = trie->wordcount;
3335 mjd_nodelen++;
3336 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3337 while( ++fix < n ) {
3338 Set_Node_Offset_Length(fix, 0, 0);
3339 }
3340 while (word--) {
3341 SV ** const tmp = av_fetch( trie_words, word, 0 );
3342 if (tmp) {
3343 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3344 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3345 else
3346 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3347 }
3348 }
3349 }
3350#endif
8e11feef
RGS
3351 if (trie->maxlen) {
3352 convert = n;
3353 } else {
3dab1dad 3354 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 3355 DEBUG_r(optimize= n);
3dab1dad
YO
3356 }
3357 }
3358 }
538e84ed
KW
3359 if (!jumper)
3360 jumper = last;
3dab1dad 3361 if ( trie->maxlen ) {
8e11feef
RGS
3362 NEXT_OFF( convert ) = (U16)(tail - convert);
3363 ARG_SET( convert, data_slot );
538e84ed
KW
3364 /* Store the offset to the first unabsorbed branch in
3365 jump[0], which is otherwise unused by the jump logic.
786e8c11 3366 We use this when dumping a trie and during optimisation. */
538e84ed 3367 if (trie->jump)
7f69552c 3368 trie->jump[0] = (U16)(nextbranch - convert);
538e84ed 3369
6c48061a
YO
3370 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3371 * and there is a bitmap
3372 * and the first "jump target" node we found leaves enough room
3373 * then convert the TRIE node into a TRIEC node, with the bitmap
3374 * embedded inline in the opcode - this is hypothetically faster.
3375 */
3376 if ( !trie->states[trie->startstate].wordnum
3377 && trie->bitmap
3378 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
3379 {
3380 OP( convert ) = TRIEC;
3381 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 3382 PerlMemShared_free(trie->bitmap);
786e8c11 3383 trie->bitmap= NULL;
538e84ed 3384 } else
786e8c11 3385 OP( convert ) = TRIE;
a3621e74 3386
3dab1dad
YO
3387 /* store the type in the flags */
3388 convert->flags = nodetype;
a5ca303d 3389 DEBUG_r({
538e84ed
KW
3390 optimize = convert
3391 + NODE_STEP_REGNODE
a5ca303d
YO
3392 + regarglen[ OP( convert ) ];
3393 });
538e84ed 3394 /* XXX We really should free up the resource in trie now,
a5ca303d 3395 as we won't use them - (which resources?) dmq */
3dab1dad 3396 }
a3621e74 3397 /* needed for dumping*/
e62cc96a 3398 DEBUG_r(if (optimize) {
07be1b83 3399 regnode *opt = convert;
bcdf7404 3400
e62cc96a 3401 while ( ++opt < optimize) {
07be1b83
YO
3402 Set_Node_Offset_Length(opt,0,0);
3403 }
538e84ed
KW
3404 /*
3405 Try to clean up some of the debris left after the
786e8c11 3406 optimisation.
a3621e74 3407 */
786e8c11 3408 while( optimize < jumper ) {
07be1b83 3409 mjd_nodelen += Node_Length((optimize));
a3621e74 3410 OP( optimize ) = OPTIMIZED;
07be1b83 3411 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
3412 optimize++;
3413 }
07be1b83 3414 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
3415 });
3416 } /* end node insert */
2e64971a
DM
3417
3418 /* Finish populating the prev field of the wordinfo array. Walk back
3419 * from each accept state until we find another accept state, and if
3420 * so, point the first word's .prev field at the second word. If the
3421 * second already has a .prev field set, stop now. This will be the
3422 * case either if we've already processed that word's accept state,
3b753521
FN
3423 * or that state had multiple words, and the overspill words were
3424 * already linked up earlier.
2e64971a
DM
3425 */
3426 {
3427 U16 word;
3428 U32 state;
3429 U16 prev;
3430
3431 for (word=1; word <= trie->wordcount; word++) {
3432 prev = 0;
3433 if (trie->wordinfo[word].prev)
3434 continue;
3435 state = trie->wordinfo[word].accept;
3436 while (state) {
3437 state = prev_states[state];
3438 if (!state)
3439 break;
3440 prev = trie->states[state].wordnum;
3441 if (prev)
3442 break;
3443 }
3444 trie->wordinfo[word].prev = prev;
3445 }
3446 Safefree(prev_states);
3447 }
3448
3449
3450 /* and now dump out the compressed format */
3451 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3452
55eed653 3453 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
3454#ifdef DEBUGGING
3455 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3456 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3457#else
03e70be4 3458 SvREFCNT_dec_NN(revcharmap);
07be1b83 3459#endif
538e84ed
KW
3460 return trie->jump
3461 ? MADE_JUMP_TRIE
3462 : trie->startstate>1
3463 ? MADE_EXACT_TRIE
786e8c11
YO
3464 : MADE_TRIE;
3465}
3466
615a2e7f
YO
3467STATIC regnode *
3468S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
786e8c11 3469{
b423522f
KW
3470/* The Trie is constructed and compressed now so we can build a fail array if
3471 * it's needed
786e8c11 3472
b423522f
KW
3473 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3474 3.32 in the
3475 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3476 Ullman 1985/88
786e8c11
YO
3477 ISBN 0-201-10088-6
3478
b423522f
KW
3479 We find the fail state for each state in the trie, this state is the longest
3480 proper suffix of the current state's 'word' that is also a proper prefix of
3481 another word in our trie. State 1 represents the word '' and is thus the
3482 default fail state. This allows the DFA not to have to restart after its
3483 tried