This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Change some append_range_to_invlist() calls
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
afda64e8 84EXTERN_C const struct regexp_engine my_reg_engine;
54df2634
NC
85#else
86# include "regcomp.h"
87#endif
a687059c 88
f7e03a10 89#include "dquote_inline.h"
b992490d 90#include "invlist_inline.h"
1b0f46bf 91#include "unicode_constants.h"
04e98a4d 92
538e84ed
KW
93#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
94 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
f12c0118
KW
95#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
96 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 97#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 98#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 99
a687059c
LW
100#ifndef STATIC
101#define STATIC static
102#endif
103
dea37815
KW
104#ifndef MIN
105#define MIN(a,b) ((a) < (b) ? (a) : (b))
106#endif
b1603ef8 107
3001ef3e
KW
108#ifndef MAX
109#define MAX(a,b) ((a) > (b) ? (a) : (b))
110#endif
111
3f910e62
YO
112/* this is a chain of data about sub patterns we are processing that
113 need to be handled separately/specially in study_chunk. Its so
114 we can simulate recursion without losing state. */
115struct scan_frame;
116typedef struct scan_frame {
117 regnode *last_regnode; /* last node to process in this frame */
118 regnode *next_regnode; /* next node to process when last is reached */
119 U32 prev_recursed_depth;
120 I32 stopparen; /* what stopparen do we use */
121 U32 is_top_frame; /* what flags do we use? */
122
123 struct scan_frame *this_prev_frame; /* this previous frame */
124 struct scan_frame *prev_frame; /* previous frame */
125 struct scan_frame *next_frame; /* next frame */
126} scan_frame;
127
f4ae5a27
KW
128/* Certain characters are output as a sequence with the first being a
129 * backslash. */
130#define isBACKSLASHED_PUNCT(c) \
131 ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
132
133
09b2b2e6 134struct RExC_state_t {
514a91f1
DM
135 U32 flags; /* RXf_* are we folding, multilining? */
136 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 137 char *precomp; /* uncompiled string. */
711b303b 138 char *precomp_end; /* pointer to end of uncompiled string. */
288b8c02 139 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf 140 regexp *rx; /* perl core regexp structure */
538e84ed
KW
141 regexp_internal *rxi; /* internal data for regexp object
142 pprivate field */
fac92740 143 char *start; /* Start of input for compile */
830247a4
IZ
144 char *end; /* End of input for compile */
145 char *parse; /* Input-scan pointer. */
285b5ca0
KW
146 char *adjusted_start; /* 'start', adjusted. See code use */
147 STRLEN precomp_adj; /* an offset beyond precomp. See code use */
ea3daa5d 148 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 149 regnode *emit_start; /* Start of emitted-code area */
538e84ed
KW
150 regnode *emit_bound; /* First regnode outside of the
151 allocated space */
f7c7e32a
DM
152 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
153 implies compiling, so don't emit */
9a81a976
KW
154 regnode_ssc emit_dummy; /* placeholder for emit to point to;
155 large enough for the largest
156 non-EXACTish node, so can use it as
157 scratch in pass1 */
830247a4
IZ
158 I32 naughty; /* How bad is this pattern? */
159 I32 sawback; /* Did we see \1, ...? */
160 U32 seen;
ea3daa5d 161 SSize_t size; /* Code size. */
538e84ed
KW
162 I32 npar; /* Capture buffer count, (OPEN) plus
163 one. ("par" 0 is the whole
164 pattern)*/
165 I32 nestroot; /* root parens we are in - used by
166 accept */
830247a4
IZ
167 I32 extralen;
168 I32 seen_zerolen;
40d049e4
YO
169 regnode **open_parens; /* pointers to open parens */
170 regnode **close_parens; /* pointers to close parens */
d5a00e4a 171 regnode *end_op; /* END node in program */
02daf0ab
YO
172 I32 utf8; /* whether the pattern is utf8 or not */
173 I32 orig_utf8; /* whether the pattern was originally in utf8 */
174 /* XXX use this for future optimisation of case
175 * where pattern must be upgraded to utf8. */
e40e74fe
KW
176 I32 uni_semantics; /* If a d charset modifier should use unicode
177 rules, even if the pattern is not in
178 utf8 */
81714fb9 179 HV *paren_names; /* Paren names */
538e84ed 180
40d049e4 181 regnode **recurse; /* Recurse regops */
d5a00e4a 182 I32 recurse_count; /* Number of recurse regops we have generated */
4286711a 183 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
538e84ed 184 through */
09a65838 185 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 186 I32 in_lookbehind;
4624b182 187 I32 contains_locale;
cfafade5 188 I32 contains_i;
bb3f3ed2 189 I32 override_recoding;
b6d67071
KW
190#ifdef EBCDIC
191 I32 recode_x_to_native;
192#endif
9d53c457 193 I32 in_multi_char_class;
3d2bd50a 194 struct reg_code_block *code_blocks; /* positions of literal (?{})
68e2671b 195 within pattern */
b1603ef8
DM
196 int num_code_blocks; /* size of code_blocks[] */
197 int code_index; /* next code_blocks[] slot */
ee273784 198 SSize_t maxlen; /* mininum possible number of chars in string to match */
3f910e62
YO
199 scan_frame *frame_head;
200 scan_frame *frame_last;
201 U32 frame_count;
7eec73eb 202 AV *warn_text;
dc6d7f5c 203#ifdef ADD_TO_REGEXEC
830247a4
IZ
204 char *starttry; /* -Dr: where regtry was called. */
205#define RExC_starttry (pRExC_state->starttry)
206#endif
d24ca0c5 207 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 208#ifdef DEBUGGING
be8e71aa 209 const char *lastparse;
3dab1dad 210 I32 lastnum;
1f1031fe 211 AV *paren_name_list; /* idx -> name */
d9a72fcc 212 U32 study_chunk_recursed_count;
c9f0d54c
YO
213 SV *mysv1;
214 SV *mysv2;
3dab1dad
YO
215#define RExC_lastparse (pRExC_state->lastparse)
216#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 217#define RExC_paren_name_list (pRExC_state->paren_name_list)
d9a72fcc 218#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
c9f0d54c
YO
219#define RExC_mysv (pRExC_state->mysv1)
220#define RExC_mysv1 (pRExC_state->mysv1)
221#define RExC_mysv2 (pRExC_state->mysv2)
222
3dab1dad 223#endif
512c0f5a 224 bool seen_unfolded_sharp_s;
911bd04e 225 bool strict;
da7cf1cc 226 bool study_started;
09b2b2e6 227};
830247a4 228
e2509266 229#define RExC_flags (pRExC_state->flags)
514a91f1 230#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 231#define RExC_precomp (pRExC_state->precomp)
285b5ca0
KW
232#define RExC_precomp_adj (pRExC_state->precomp_adj)
233#define RExC_adjusted_start (pRExC_state->adjusted_start)
711b303b 234#define RExC_precomp_end (pRExC_state->precomp_end)
288b8c02 235#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 236#define RExC_rx (pRExC_state->rx)
f8fc2ecf 237#define RExC_rxi (pRExC_state->rxi)
fac92740 238#define RExC_start (pRExC_state->start)
830247a4
IZ
239#define RExC_end (pRExC_state->end)
240#define RExC_parse (pRExC_state->parse)
241#define RExC_whilem_seen (pRExC_state->whilem_seen)
512c0f5a
KW
242
243/* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
244 * EXACTF node, hence was parsed under /di rules. If later in the parse,
245 * something forces the pattern into using /ui rules, the sharp s should be
246 * folded into the sequence 'ss', which takes up more space than previously
247 * calculated. This means that the sizing pass needs to be restarted. (The
248 * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node
249 * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
250 * so there is no need to resize [perl #125990]. */
251#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
252
7122b237 253#ifdef RE_TRACK_PATTERN_OFFSETS
538e84ed
KW
254#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
255 others */
7122b237 256#endif
830247a4 257#define RExC_emit (pRExC_state->emit)
f7c7e32a 258#define RExC_emit_dummy (pRExC_state->emit_dummy)
fac92740 259#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 260#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
261#define RExC_sawback (pRExC_state->sawback)
262#define RExC_seen (pRExC_state->seen)
263#define RExC_size (pRExC_state->size)
ee273784 264#define RExC_maxlen (pRExC_state->maxlen)
830247a4 265#define RExC_npar (pRExC_state->npar)
e2e6a0f1 266#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
267#define RExC_extralen (pRExC_state->extralen)
268#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 269#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 270#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 271#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
272#define RExC_open_parens (pRExC_state->open_parens)
273#define RExC_close_parens (pRExC_state->close_parens)
5bd2d46e 274#define RExC_end_op (pRExC_state->end_op)
81714fb9 275#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
276#define RExC_recurse (pRExC_state->recurse)
277#define RExC_recurse_count (pRExC_state->recurse_count)
09a65838 278#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
538e84ed
KW
279#define RExC_study_chunk_recursed_bytes \
280 (pRExC_state->study_chunk_recursed_bytes)
b57e4118 281#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 282#define RExC_contains_locale (pRExC_state->contains_locale)
cfafade5 283#define RExC_contains_i (pRExC_state->contains_i)
9d53c457 284#define RExC_override_recoding (pRExC_state->override_recoding)
b6d67071
KW
285#ifdef EBCDIC
286# define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
287#endif
9d53c457 288#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
3f910e62
YO
289#define RExC_frame_head (pRExC_state->frame_head)
290#define RExC_frame_last (pRExC_state->frame_last)
291#define RExC_frame_count (pRExC_state->frame_count)
67cdf558 292#define RExC_strict (pRExC_state->strict)
da7cf1cc 293#define RExC_study_started (pRExC_state->study_started)
7eec73eb 294#define RExC_warn_text (pRExC_state->warn_text)
830247a4 295
99807a43
HS
296/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
297 * a flag to disable back-off on the fixed/floating substrings - if it's
298 * a high complexity pattern we assume the benefit of avoiding a full match
299 * is worth the cost of checking for the substrings even if they rarely help.
300 */
301#define RExC_naughty (pRExC_state->naughty)
302#define TOO_NAUGHTY (10)
303#define MARK_NAUGHTY(add) \
304 if (RExC_naughty < TOO_NAUGHTY) \
305 RExC_naughty += (add)
306#define MARK_NAUGHTY_EXP(exp, add) \
307 if (RExC_naughty < TOO_NAUGHTY) \
308 RExC_naughty += RExC_naughty / (exp) + (add)
cde0cee5 309
a687059c
LW
310#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
311#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
412f55bb 312 ((*s) == '{' && regcurly(s)))
a687059c
LW
313
314/*
315 * Flags to be passed up and down.
316 */
a687059c 317#define WORST 0 /* Worst case. */
a3b492c3 318#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee 319
e64f369d 320/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
321 * character. (There needs to be a case: in the switch statement in regexec.c
322 * for any node marked SIMPLE.) Note that this is not the same thing as
323 * REGNODE_SIMPLE */
fda99bee 324#define SIMPLE 0x02
e64f369d 325#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
326#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
327#define TRYAGAIN 0x10 /* Weeded out a declaration. */
b97943f7
KW
328#define RESTART_PASS1 0x20 /* Need to restart sizing pass */
329#define NEED_UTF8 0x40 /* In conjunction with RESTART_PASS1, need to
330 calcuate sizes as UTF-8 */
a687059c 331
3dab1dad
YO
332#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
333
07be1b83
YO
334/* whether trie related optimizations are enabled */
335#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
336#define TRIE_STUDY_OPT
786e8c11 337#define FULL_TRIE_STUDY
07be1b83
YO
338#define TRIE_STCLASS
339#endif
1de06328
YO
340
341
40d049e4
YO
342
343#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
344#define PBITVAL(paren) (1 << ((paren) & 7))
345#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
346#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
347#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
348
82a6ada4 349#define REQUIRE_UTF8(flagp) STMT_START { \
8d9c2815 350 if (!UTF) { \
82a6ada4
KW
351 assert(PASS1); \
352 *flagp = RESTART_PASS1|NEED_UTF8; \
8d9c2815
NC
353 return NULL; \
354 } \
82a6ada4 355 } STMT_END
40d049e4 356
512c0f5a
KW
357/* Change from /d into /u rules, and restart the parse if we've already seen
358 * something whose size would increase as a result, by setting *flagp and
359 * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates
360 * we've change to /u during the parse. */
361#define REQUIRE_UNI_RULES(flagp, restart_retval) \
362 STMT_START { \
363 if (DEPENDS_SEMANTICS) { \
364 assert(PASS1); \
365 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
366 RExC_uni_semantics = 1; \
367 if (RExC_seen_unfolded_sharp_s) { \
368 *flagp |= RESTART_PASS1; \
369 return restart_retval; \
370 } \
371 } \
372 } STMT_END
373
f19b1a63
KW
374/* This converts the named class defined in regcomp.h to its equivalent class
375 * number defined in handy.h. */
376#define namedclass_to_classnum(class) ((int) ((class) / 2))
377#define classnum_to_namedclass(classnum) ((classnum) * 2)
378
de92f5e6
KW
379#define _invlist_union_complement_2nd(a, b, output) \
380 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
381#define _invlist_intersection_complement_2nd(a, b, output) \
382 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
383
1de06328
YO
384/* About scan_data_t.
385
386 During optimisation we recurse through the regexp program performing
387 various inplace (keyhole style) optimisations. In addition study_chunk
388 and scan_commit populate this data structure with information about
538e84ed 389 what strings MUST appear in the pattern. We look for the longest
3b753521 390 string that must appear at a fixed location, and we look for the
1de06328
YO
391 longest string that may appear at a floating location. So for instance
392 in the pattern:
538e84ed 393
1de06328 394 /FOO[xX]A.*B[xX]BAR/
538e84ed 395
1de06328
YO
396 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
397 strings (because they follow a .* construct). study_chunk will identify
398 both FOO and BAR as being the longest fixed and floating strings respectively.
538e84ed 399
1de06328 400 The strings can be composites, for instance
538e84ed 401
1de06328 402 /(f)(o)(o)/
538e84ed 403
1de06328 404 will result in a composite fixed substring 'foo'.
538e84ed 405
1de06328 406 For each string some basic information is maintained:
538e84ed 407
1de06328
YO
408 - offset or min_offset
409 This is the position the string must appear at, or not before.
410 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
411 characters must match before the string we are searching for.
412 Likewise when combined with minlenp and the length of the string it
538e84ed 413 tells us how many characters must appear after the string we have
1de06328 414 found.
538e84ed 415
1de06328
YO
416 - max_offset
417 Only used for floating strings. This is the rightmost point that
ea3daa5d 418 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328 419 string can occur infinitely far to the right.
538e84ed 420
1de06328 421 - minlenp
2d608413
KW
422 A pointer to the minimum number of characters of the pattern that the
423 string was found inside. This is important as in the case of positive
538e84ed 424 lookahead or positive lookbehind we can have multiple patterns
1de06328 425 involved. Consider
538e84ed 426
1de06328 427 /(?=FOO).*F/
538e84ed 428
1de06328
YO
429 The minimum length of the pattern overall is 3, the minimum length
430 of the lookahead part is 3, but the minimum length of the part that
538e84ed 431 will actually match is 1. So 'FOO's minimum length is 3, but the
1de06328 432 minimum length for the F is 1. This is important as the minimum length
538e84ed 433 is used to determine offsets in front of and behind the string being
1de06328 434 looked for. Since strings can be composites this is the length of the
486ec47a 435 pattern at the time it was committed with a scan_commit. Note that
1de06328 436 the length is calculated by study_chunk, so that the minimum lengths
538e84ed 437 are not known until the full pattern has been compiled, thus the
1de06328 438 pointer to the value.
538e84ed 439
1de06328 440 - lookbehind
538e84ed 441
1de06328 442 In the case of lookbehind the string being searched for can be
538e84ed 443 offset past the start point of the final matching string.
1de06328
YO
444 If this value was just blithely removed from the min_offset it would
445 invalidate some of the calculations for how many chars must match
446 before or after (as they are derived from min_offset and minlen and
538e84ed 447 the length of the string being searched for).
1de06328
YO
448 When the final pattern is compiled and the data is moved from the
449 scan_data_t structure into the regexp structure the information
538e84ed
KW
450 about lookbehind is factored in, with the information that would
451 have been lost precalculated in the end_shift field for the
1de06328
YO
452 associated string.
453
454 The fields pos_min and pos_delta are used to store the minimum offset
538e84ed 455 and the delta to the maximum offset at the current point in the pattern.
1de06328
YO
456
457*/
2c2d71f5
JH
458
459typedef struct scan_data_t {
1de06328
YO
460 /*I32 len_min; unused */
461 /*I32 len_delta; unused */
49f55535 462 SSize_t pos_min;
ea3daa5d 463 SSize_t pos_delta;
2c2d71f5 464 SV *last_found;
ea3daa5d 465 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 466 SSize_t last_start_min;
ea3daa5d 467 SSize_t last_start_max;
1de06328
YO
468 SV **longest; /* Either &l_fixed, or &l_float. */
469 SV *longest_fixed; /* longest fixed string found in pattern */
49f55535 470 SSize_t offset_fixed; /* offset where it starts */
ea3daa5d 471 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
472 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
473 SV *longest_float; /* longest floating string found in pattern */
49f55535 474 SSize_t offset_float_min; /* earliest point in string it can appear */
ea3daa5d
FC
475 SSize_t offset_float_max; /* latest point in string it can appear */
476 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
49f55535 477 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
2c2d71f5
JH
478 I32 flags;
479 I32 whilem_c;
49f55535 480 SSize_t *last_closep;
b8f7bb16 481 regnode_ssc *start_class;
2c2d71f5
JH
482} scan_data_t;
483
a687059c 484/*
e50aee73 485 * Forward declarations for pregcomp()'s friends.
a687059c 486 */
a0d0e21e 487
27da23d5 488static const scan_data_t zero_scan_data =
1de06328 489 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
490
491#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
492#define SF_BEFORE_SEOL 0x0001
493#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
494#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
495#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
496
44e3dfd2
BF
497#define SF_FIX_SHIFT_EOL (+2)
498#define SF_FL_SHIFT_EOL (+4)
c277df42
IZ
499
500#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
501#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
502
503#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
504#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
505#define SF_IS_INF 0x0040
506#define SF_HAS_PAR 0x0080
507#define SF_IN_PAR 0x0100
508#define SF_HAS_EVAL 0x0200
bc604ad8
KW
509
510
511/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
512 * longest substring in the pattern. When it is not set the optimiser keeps
513 * track of position, but does not keep track of the actual strings seen,
514 *
515 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
516 * /foo/i will not.
517 *
518 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
519 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
520 * turned off because of the alternation (BRANCH). */
07be1b83 521#define SCF_DO_SUBSTR 0x0400
bc604ad8 522
653099ff
GS
523#define SCF_DO_STCLASS_AND 0x0800
524#define SCF_DO_STCLASS_OR 0x1000
525#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 526#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 527
786e8c11 528#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
538e84ed 529#define SCF_SEEN_ACCEPT 0x8000
688e0391 530#define SCF_TRIE_DOING_RESTUDY 0x10000
4286711a
YO
531#define SCF_IN_DEFINE 0x20000
532
533
534
07be1b83 535
43fead97 536#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
537
538/* The enums for all these are ordered so things work out correctly */
a62b1201 539#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
538e84ed
KW
540#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
541 == REGEX_DEPENDS_CHARSET)
00b27cfc 542#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
538e84ed
KW
543#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
544 >= REGEX_UNICODE_CHARSET)
545#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
546 == REGEX_ASCII_RESTRICTED_CHARSET)
547#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
548 >= REGEX_ASCII_RESTRICTED_CHARSET)
549#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
550 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 551
43fead97 552#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 553
f2c2a6ab
KW
554/* For programs that want to be strictly Unicode compatible by dying if any
555 * attempt is made to match a non-Unicode code point against a Unicode
556 * property. */
557#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
558
93733859 559#define OOB_NAMEDCLASS -1
b8c5462f 560
8e661ac5
KW
561/* There is no code point that is out-of-bounds, so this is problematic. But
562 * its only current use is to initialize a variable that is always set before
563 * looked at. */
564#define OOB_UNICODE 0xDEADBEEF
565
a0ed51b3
LW
566#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
567#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
568
8615cb43 569
b45f050a
JF
570/* length of regex to show in messages that don't mark a position within */
571#define RegexLengthToShowInErrorMessages 127
572
573/*
574 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
575 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
576 * op/pragma/warn/regcomp.
577 */
7253e4e3
RK
578#define MARKER1 "<-- HERE" /* marker as it appears in the description */
579#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 580
538e84ed
KW
581#define REPORT_LOCATION " in regex; marked by " MARKER1 \
582 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
b45f050a 583
285b5ca0
KW
584/* The code in this file in places uses one level of recursion with parsing
585 * rebased to an alternate string constructed by us in memory. This can take
586 * the form of something that is completely different from the input, or
587 * something that uses the input as part of the alternate. In the first case,
588 * there should be no possibility of an error, as we are in complete control of
589 * the alternate string. But in the second case we don't control the input
590 * portion, so there may be errors in that. Here's an example:
591 * /[abc\x{DF}def]/ui
592 * is handled specially because \x{df} folds to a sequence of more than one
593 * character, 'ss'. What is done is to create and parse an alternate string,
594 * which looks like this:
595 * /(?:\x{DF}|[abc\x{DF}def])/ui
596 * where it uses the input unchanged in the middle of something it constructs,
597 * which is a branch for the DF outside the character class, and clustering
598 * parens around the whole thing. (It knows enough to skip the DF inside the
599 * class while in this substitute parse.) 'abc' and 'def' may have errors that
600 * need to be reported. The general situation looks like this:
601 *
602 * sI tI xI eI
603 * Input: ----------------------------------------------------
604 * Constructed: ---------------------------------------------------
605 * sC tC xC eC EC
606 *
607 * The input string sI..eI is the input pattern. The string sC..EC is the
608 * constructed substitute parse string. The portions sC..tC and eC..EC are
609 * constructed by us. The portion tC..eC is an exact duplicate of the input
610 * pattern tI..eI. In the diagram, these are vertically aligned. Suppose that
611 * while parsing, we find an error at xC. We want to display a message showing
612 * the real input string. Thus we need to find the point xI in it which
613 * corresponds to xC. xC >= tC, since the portion of the string sC..tC has
614 * been constructed by us, and so shouldn't have errors. We get:
615 *
616 * xI = sI + (tI - sI) + (xC - tC)
617 *
618 * and, the offset into sI is:
619 *
620 * (xI - sI) = (tI - sI) + (xC - tC)
621 *
622 * When the substitute is constructed, we save (tI -sI) as RExC_precomp_adj,
623 * and we save tC as RExC_adjusted_start.
903c858a
KW
624 *
625 * During normal processing of the input pattern, everything points to that,
626 * with RExC_precomp_adj set to 0, and RExC_adjusted_start set to sI.
285b5ca0
KW
627 */
628
629#define tI_sI RExC_precomp_adj
630#define tC RExC_adjusted_start
631#define sC RExC_precomp
632#define xI_offset(xC) ((IV) (tI_sI + (xC - tC)))
633#define xI(xC) (sC + xI_offset(xC))
634#define eC RExC_precomp_end
635
636#define REPORT_LOCATION_ARGS(xC) \
637 UTF8fARG(UTF, \
638 (xI(xC) > eC) /* Don't run off end */ \
639 ? eC - sC /* Length before the <--HERE */ \
640 : xI_offset(xC), \
641 sC), /* The input pattern printed up to the <--HERE */ \
642 UTF8fARG(UTF, \
643 (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */ \
644 (xI(xC) > eC) ? eC : xI(xC)) /* pattern after <--HERE */
c1d900c3 645
8a6d8ec6
HS
646/* Used to point after bad bytes for an error message, but avoid skipping
647 * past a nul byte. */
648#define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
649
b45f050a
JF
650/*
651 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
652 * arg. Show regex, up to a maximum length. If it's too long, chop and add
653 * "...".
654 */
58e23c8d 655#define _FAIL(code) STMT_START { \
bfed75c6 656 const char *ellipses = ""; \
711b303b 657 IV len = RExC_precomp_end - RExC_precomp; \
ccb2c380
MP
658 \
659 if (!SIZE_ONLY) \
a5e7bc51 660 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
661 if (len > RegexLengthToShowInErrorMessages) { \
662 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
663 len = RegexLengthToShowInErrorMessages - 10; \
664 ellipses = "..."; \
665 } \
58e23c8d 666 code; \
ccb2c380 667} STMT_END
8615cb43 668
58e23c8d 669#define FAIL(msg) _FAIL( \
c1d900c3
BF
670 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
671 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
672
673#define FAIL2(msg,arg) _FAIL( \
c1d900c3
BF
674 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
675 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 676
b45f050a 677/*
b45f050a
JF
678 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
679 */
ccb2c380 680#define Simple_vFAIL(m) STMT_START { \
ccb2c380 681 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
d528642a 682 m, REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 683} STMT_END
b45f050a
JF
684
685/*
686 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
687 */
ccb2c380
MP
688#define vFAIL(m) STMT_START { \
689 if (!SIZE_ONLY) \
a5e7bc51 690 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
691 Simple_vFAIL(m); \
692} STMT_END
b45f050a
JF
693
694/*
695 * Like Simple_vFAIL(), but accepts two arguments.
696 */
ccb2c380 697#define Simple_vFAIL2(m,a1) STMT_START { \
d528642a
KW
698 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
699 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 700} STMT_END
b45f050a
JF
701
702/*
703 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
704 */
ccb2c380
MP
705#define vFAIL2(m,a1) STMT_START { \
706 if (!SIZE_ONLY) \
a5e7bc51 707 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
708 Simple_vFAIL2(m, a1); \
709} STMT_END
b45f050a
JF
710
711
712/*
713 * Like Simple_vFAIL(), but accepts three arguments.
714 */
ccb2c380 715#define Simple_vFAIL3(m, a1, a2) STMT_START { \
c1d900c3 716 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 717 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 718} STMT_END
b45f050a
JF
719
720/*
721 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
722 */
ccb2c380
MP
723#define vFAIL3(m,a1,a2) STMT_START { \
724 if (!SIZE_ONLY) \
a5e7bc51 725 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
726 Simple_vFAIL3(m, a1, a2); \
727} STMT_END
b45f050a
JF
728
729/*
730 * Like Simple_vFAIL(), but accepts four arguments.
731 */
ccb2c380 732#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
d528642a
KW
733 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
734 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 735} STMT_END
b45f050a 736
95db3ffa
KW
737#define vFAIL4(m,a1,a2,a3) STMT_START { \
738 if (!SIZE_ONLY) \
739 SAVEFREESV(RExC_rx_sv); \
740 Simple_vFAIL4(m, a1, a2, a3); \
741} STMT_END
742
946095af 743/* A specialized version of vFAIL2 that works with UTF8f */
d528642a
KW
744#define vFAIL2utf8f(m, a1) STMT_START { \
745 if (!SIZE_ONLY) \
746 SAVEFREESV(RExC_rx_sv); \
747 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
748 REPORT_LOCATION_ARGS(RExC_parse)); \
946095af
BF
749} STMT_END
750
3ba22297 751#define vFAIL3utf8f(m, a1, a2) STMT_START { \
3ba22297
KW
752 if (!SIZE_ONLY) \
753 SAVEFREESV(RExC_rx_sv); \
754 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 755 REPORT_LOCATION_ARGS(RExC_parse)); \
3ba22297
KW
756} STMT_END
757
499333dc
KW
758/* These have asserts in them because of [perl #122671] Many warnings in
759 * regcomp.c can occur twice. If they get output in pass1 and later in that
760 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
761 * would get output again. So they should be output in pass2, and these
762 * asserts make sure new warnings follow that paradigm. */
946095af 763
5e0a247b
KW
764/* m is not necessarily a "literal string", in this macro */
765#define reg_warn_non_literal_string(loc, m) STMT_START { \
d528642a
KW
766 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
767 "%s" REPORT_LOCATION, \
768 m, REPORT_LOCATION_ARGS(loc)); \
5e0a247b
KW
769} STMT_END
770
668c081a 771#define ckWARNreg(loc,m) STMT_START { \
d528642a
KW
772 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
773 m REPORT_LOCATION, \
774 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
775} STMT_END
776
b927b7e9 777#define vWARN(loc, m) STMT_START { \
d528642a
KW
778 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
779 m REPORT_LOCATION, \
780 REPORT_LOCATION_ARGS(loc)); \
b927b7e9
KW
781} STMT_END
782
0d6106aa 783#define vWARN_dep(loc, m) STMT_START { \
d528642a
KW
784 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
785 m REPORT_LOCATION, \
786 REPORT_LOCATION_ARGS(loc)); \
0d6106aa
KW
787} STMT_END
788
147508a2 789#define ckWARNdep(loc,m) STMT_START { \
d528642a
KW
790 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
791 m REPORT_LOCATION, \
792 REPORT_LOCATION_ARGS(loc)); \
147508a2
KW
793} STMT_END
794
d528642a
KW
795#define ckWARNregdep(loc,m) STMT_START { \
796 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
797 WARN_REGEXP), \
798 m REPORT_LOCATION, \
799 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
800} STMT_END
801
d528642a
KW
802#define ckWARN2reg_d(loc,m, a1) STMT_START { \
803 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
804 m REPORT_LOCATION, \
805 a1, REPORT_LOCATION_ARGS(loc)); \
2335b3d3
KW
806} STMT_END
807
d528642a
KW
808#define ckWARN2reg(loc, m, a1) STMT_START { \
809 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
810 m REPORT_LOCATION, \
811 a1, REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
812} STMT_END
813
d528642a
KW
814#define vWARN3(loc, m, a1, a2) STMT_START { \
815 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
816 m REPORT_LOCATION, \
817 a1, a2, REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
818} STMT_END
819
d528642a
KW
820#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
821 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
822 m REPORT_LOCATION, \
823 a1, a2, \
824 REPORT_LOCATION_ARGS(loc)); \
668c081a
NC
825} STMT_END
826
ccb2c380 827#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
d528642a
KW
828 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
829 m REPORT_LOCATION, \
830 a1, a2, a3, \
831 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
832} STMT_END
833
668c081a 834#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
d528642a
KW
835 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
836 m REPORT_LOCATION, \
837 a1, a2, a3, \
838 REPORT_LOCATION_ARGS(loc)); \
668c081a
NC
839} STMT_END
840
ccb2c380 841#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
d528642a
KW
842 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
843 m REPORT_LOCATION, \
844 a1, a2, a3, a4, \
845 REPORT_LOCATION_ARGS(loc)); \
ccb2c380 846} STMT_END
9d1d55b5 847
538e84ed 848/* Macros for recording node offsets. 20001227 mjd@plover.com
fac92740
MJD
849 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
850 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
851 * Element 0 holds the number n.
07be1b83 852 * Position is 1 indexed.
fac92740 853 */
7122b237
YO
854#ifndef RE_TRACK_PATTERN_OFFSETS
855#define Set_Node_Offset_To_R(node,byte)
856#define Set_Node_Offset(node,byte)
857#define Set_Cur_Node_Offset
858#define Set_Node_Length_To_R(node,len)
859#define Set_Node_Length(node,len)
6a86c6ad 860#define Set_Node_Cur_Length(node,start)
538e84ed
KW
861#define Node_Offset(n)
862#define Node_Length(n)
7122b237
YO
863#define Set_Node_Offset_Length(node,offset,len)
864#define ProgLen(ri) ri->u.proglen
865#define SetProgLen(ri,x) ri->u.proglen = x
866#else
867#define ProgLen(ri) ri->u.offsets[0]
868#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
869#define Set_Node_Offset_To_R(node,byte) STMT_START { \
870 if (! SIZE_ONLY) { \
871 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 872 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 873 if((node) < 0) { \
538e84ed
KW
874 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
875 (int)(node)); \
ccb2c380
MP
876 } else { \
877 RExC_offsets[2*(node)-1] = (byte); \
878 } \
879 } \
880} STMT_END
881
882#define Set_Node_Offset(node,byte) \
883 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
884#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
885
886#define Set_Node_Length_To_R(node,len) STMT_START { \
887 if (! SIZE_ONLY) { \
888 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 889 __LINE__, (int)(node), (int)(len))); \
ccb2c380 890 if((node) < 0) { \
538e84ed
KW
891 Perl_croak(aTHX_ "value of node is %d in Length macro", \
892 (int)(node)); \
ccb2c380
MP
893 } else { \
894 RExC_offsets[2*(node)] = (len); \
895 } \
896 } \
897} STMT_END
898
899#define Set_Node_Length(node,len) \
900 Set_Node_Length_To_R((node)-RExC_emit_start, len)
6a86c6ad
NC
901#define Set_Node_Cur_Length(node, start) \
902 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
903
904/* Get offsets and lengths */
905#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
906#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
907
07be1b83
YO
908#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
909 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
910 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
911} STMT_END
7122b237 912#endif
07be1b83
YO
913
914#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
915#define EXPERIMENTAL_INPLACESCAN
f427392e 916#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 917
cb41e5d6
YO
918#ifdef DEBUGGING
919int
6ad9a8ab 920Perl_re_printf(pTHX_ const char *fmt, ...)
cb41e5d6
YO
921{
922 va_list ap;
923 int result;
924 PerlIO *f= Perl_debug_log;
925 PERL_ARGS_ASSERT_RE_PRINTF;
926 va_start(ap, fmt);
927 result = PerlIO_vprintf(f, fmt, ap);
928 va_end(ap);
929 return result;
930}
931
932int
7b031478 933Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
cb41e5d6
YO
934{
935 va_list ap;
936 int result;
937 PerlIO *f= Perl_debug_log;
938 PERL_ARGS_ASSERT_RE_INDENTF;
939 va_start(ap, depth);
daeb874b 940 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
cb41e5d6
YO
941 result = PerlIO_vprintf(f, fmt, ap);
942 va_end(ap);
943 return result;
944}
945#endif /* DEBUGGING */
946
7b031478 947#define DEBUG_RExC_seen() \
538e84ed 948 DEBUG_OPTIMISE_MORE_r({ \
6ad9a8ab 949 Perl_re_printf( aTHX_ "RExC_seen: "); \
538e84ed 950 \
e384d5c1 951 if (RExC_seen & REG_ZERO_LEN_SEEN) \
6ad9a8ab 952 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
538e84ed 953 \
e384d5c1 954 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
6ad9a8ab 955 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
538e84ed 956 \
e384d5c1 957 if (RExC_seen & REG_GPOS_SEEN) \
6ad9a8ab 958 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
538e84ed 959 \
e384d5c1 960 if (RExC_seen & REG_RECURSE_SEEN) \
6ad9a8ab 961 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
538e84ed 962 \
7b031478 963 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
6ad9a8ab 964 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
538e84ed 965 \
e384d5c1 966 if (RExC_seen & REG_VERBARG_SEEN) \
6ad9a8ab 967 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
538e84ed 968 \
e384d5c1 969 if (RExC_seen & REG_CUTGROUP_SEEN) \
6ad9a8ab 970 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
538e84ed 971 \
e384d5c1 972 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
6ad9a8ab 973 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
538e84ed 974 \
e384d5c1 975 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
6ad9a8ab 976 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
538e84ed 977 \
7b031478 978 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
6ad9a8ab 979 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
ee273784 980 \
6ad9a8ab 981 Perl_re_printf( aTHX_ "\n"); \
9e9ecfdf
YO
982 });
983
fdfb4f21 984#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
6ad9a8ab 985 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
fdfb4f21
YO
986
987#define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \
988 if ( ( flags ) ) { \
6ad9a8ab 989 Perl_re_printf( aTHX_ "%s", open_str); \
fdfb4f21
YO
990 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \
991 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \
992 DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \
993 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \
994 DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \
995 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \
996 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \
997 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \
998 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \
999 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \
1000 DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \
1001 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \
1002 DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \
1003 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \
1004 DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \
6ad9a8ab 1005 Perl_re_printf( aTHX_ "%s", close_str); \
fdfb4f21
YO
1006 }
1007
1008
304ee84b
YO
1009#define DEBUG_STUDYDATA(str,data,depth) \
1010DEBUG_OPTIMISE_MORE_r(if(data){ \
6ad9a8ab 1011 Perl_re_indentf( aTHX_ "" str "Pos:%"IVdf"/%"IVdf \
fdfb4f21 1012 " Flags: 0x%"UVXf, \
7b031478 1013 depth, \
1de06328
YO
1014 (IV)((data)->pos_min), \
1015 (IV)((data)->pos_delta), \
fdfb4f21
YO
1016 (UV)((data)->flags) \
1017 ); \
1018 DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \
6ad9a8ab 1019 Perl_re_printf( aTHX_ \
fdfb4f21 1020 " Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328 1021 (IV)((data)->whilem_c), \
304ee84b
YO
1022 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
1023 is_inf ? "INF " : "" \
1de06328
YO
1024 ); \
1025 if ((data)->last_found) \
6ad9a8ab 1026 Perl_re_printf( aTHX_ \
1de06328
YO
1027 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
1028 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
1029 SvPVX_const((data)->last_found), \
1030 (IV)((data)->last_end), \
1031 (IV)((data)->last_start_min), \
1032 (IV)((data)->last_start_max), \
1033 ((data)->longest && \
1034 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
1035 SvPVX_const((data)->longest_fixed), \
1036 (IV)((data)->offset_fixed), \
1037 ((data)->longest && \
1038 (data)->longest==&((data)->longest_float)) ? "*" : "", \
1039 SvPVX_const((data)->longest_float), \
1040 (IV)((data)->offset_float_min), \
1041 (IV)((data)->offset_float_max) \
1042 ); \
6ad9a8ab 1043 Perl_re_printf( aTHX_ "\n"); \
1de06328
YO
1044});
1045
cb41e5d6 1046
c6871b76
KW
1047/* =========================================================
1048 * BEGIN edit_distance stuff.
1049 *
1050 * This calculates how many single character changes of any type are needed to
1051 * transform a string into another one. It is taken from version 3.1 of
1052 *
1053 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1054 */
1055
1056/* Our unsorted dictionary linked list. */
1057/* Note we use UVs, not chars. */
1058
1059struct dictionary{
1060 UV key;
1061 UV value;
1062 struct dictionary* next;
1063};
1064typedef struct dictionary item;
1065
1066
1067PERL_STATIC_INLINE item*
1068push(UV key,item* curr)
1069{
1070 item* head;
1071 Newxz(head, 1, item);
1072 head->key = key;
1073 head->value = 0;
1074 head->next = curr;
1075 return head;
1076}
1077
1078
1079PERL_STATIC_INLINE item*
1080find(item* head, UV key)
1081{
1082 item* iterator = head;
1083 while (iterator){
1084 if (iterator->key == key){
1085 return iterator;
1086 }
1087 iterator = iterator->next;
1088 }
1089
1090 return NULL;
1091}
1092
1093PERL_STATIC_INLINE item*
1094uniquePush(item* head,UV key)
1095{
1096 item* iterator = head;
1097
1098 while (iterator){
1099 if (iterator->key == key) {
1100 return head;
1101 }
1102 iterator = iterator->next;
1103 }
1104
1105 return push(key,head);
1106}
1107
1108PERL_STATIC_INLINE void
1109dict_free(item* head)
1110{
1111 item* iterator = head;
1112
1113 while (iterator) {
1114 item* temp = iterator;
1115 iterator = iterator->next;
1116 Safefree(temp);
1117 }
1118
1119 head = NULL;
1120}
1121
1122/* End of Dictionary Stuff */
1123
1124/* All calculations/work are done here */
1125STATIC int
1126S_edit_distance(const UV* src,
1127 const UV* tgt,
1128 const STRLEN x, /* length of src[] */
1129 const STRLEN y, /* length of tgt[] */
1130 const SSize_t maxDistance
1131)
1132{
1133 item *head = NULL;
1134 UV swapCount,swapScore,targetCharCount,i,j;
1135 UV *scores;
1136 UV score_ceil = x + y;
1137
1138 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1139
1140 /* intialize matrix start values */
1141 Newxz(scores, ( (x + 2) * (y + 2)), UV);
1142 scores[0] = score_ceil;
1143 scores[1 * (y + 2) + 0] = score_ceil;
1144 scores[0 * (y + 2) + 1] = score_ceil;
1145 scores[1 * (y + 2) + 1] = 0;
1146 head = uniquePush(uniquePush(head,src[0]),tgt[0]);
1147
1148 /* work loops */
1149 /* i = src index */
1150 /* j = tgt index */
1151 for (i=1;i<=x;i++) {
1152 if (i < x)
1153 head = uniquePush(head,src[i]);
1154 scores[(i+1) * (y + 2) + 1] = i;
1155 scores[(i+1) * (y + 2) + 0] = score_ceil;
1156 swapCount = 0;
1157
1158 for (j=1;j<=y;j++) {
1159 if (i == 1) {
1160 if(j < y)
1161 head = uniquePush(head,tgt[j]);
1162 scores[1 * (y + 2) + (j + 1)] = j;
1163 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1164 }
1165
1166 targetCharCount = find(head,tgt[j-1])->value;
1167 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1168
1169 if (src[i-1] != tgt[j-1]){
1170 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1171 }
1172 else {
1173 swapCount = j;
1174 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1175 }
1176 }
1177
1178 find(head,src[i-1])->value = i;
1179 }
1180
1181 {
1182 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1183 dict_free(head);
1184 Safefree(scores);
1185 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1186 }
1187}
1188
1189/* END of edit_distance() stuff
1190 * ========================================================= */
1191
8e35b056
KW
1192/* is c a control character for which we have a mnemonic? */
1193#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1194
549b4e78
KW
1195STATIC const char *
1196S_cntrl_to_mnemonic(const U8 c)
1197{
1198 /* Returns the mnemonic string that represents character 'c', if one
1199 * exists; NULL otherwise. The only ones that exist for the purposes of
1200 * this routine are a few control characters */
1201
1202 switch (c) {
1203 case '\a': return "\\a";
1204 case '\b': return "\\b";
1205 case ESC_NATIVE: return "\\e";
1206 case '\f': return "\\f";
1207 case '\n': return "\\n";
1208 case '\r': return "\\r";
1209 case '\t': return "\\t";
1210 }
1211
1212 return NULL;
1213}
1214
653099ff 1215/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 1216 Update the longest found anchored substring and the longest found
653099ff
GS
1217 floating substrings if needed. */
1218
4327152a 1219STATIC void
ea3daa5d
FC
1220S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1221 SSize_t *minlenp, int is_inf)
c277df42 1222{
e1ec3a88
AL
1223 const STRLEN l = CHR_SVLEN(data->last_found);
1224 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 1225 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1226
7918f24d
NC
1227 PERL_ARGS_ASSERT_SCAN_COMMIT;
1228
c277df42 1229 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 1230 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
1231 if (*data->longest == data->longest_fixed) {
1232 data->offset_fixed = l ? data->last_start_min : data->pos_min;
1233 if (data->flags & SF_BEFORE_EOL)
b81d288d 1234 data->flags
c277df42
IZ
1235 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
1236 else
1237 data->flags &= ~SF_FIX_BEFORE_EOL;
686b73d4 1238 data->minlen_fixed=minlenp;
1de06328 1239 data->lookbehind_fixed=0;
a0ed51b3 1240 }
304ee84b 1241 else { /* *data->longest == data->longest_float */
c277df42 1242 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d 1243 data->offset_float_max = (l
646e8787
DM
1244 ? data->last_start_max
1245 : (data->pos_delta > SSize_t_MAX - data->pos_min
ea3daa5d
FC
1246 ? SSize_t_MAX
1247 : data->pos_min + data->pos_delta));
1248 if (is_inf
1249 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
1250 data->offset_float_max = SSize_t_MAX;
c277df42 1251 if (data->flags & SF_BEFORE_EOL)
b81d288d 1252 data->flags
c277df42
IZ
1253 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
1254 else
1255 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
1256 data->minlen_float=minlenp;
1257 data->lookbehind_float=0;
c277df42
IZ
1258 }
1259 }
1260 SvCUR_set(data->last_found, 0);
0eda9292 1261 {
a28509cc 1262 SV * const sv = data->last_found;
097eb12c
AL
1263 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1264 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1265 if (mg)
1266 mg->mg_len = 0;
1267 }
0eda9292 1268 }
c277df42
IZ
1269 data->last_end = -1;
1270 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 1271 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
1272}
1273
cdd87c1d
KW
1274/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1275 * list that describes which code points it matches */
1276
653099ff 1277STATIC void
3420edd7 1278S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 1279{
cdd87c1d
KW
1280 /* Set the SSC 'ssc' to match an empty string or any code point */
1281
557bd3fb 1282 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 1283
71068078 1284 assert(is_ANYOF_SYNTHETIC(ssc));
3fffb88a 1285
0854ea0b
KW
1286 /* mortalize so won't leak */
1287 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
93e92956 1288 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
653099ff
GS
1289}
1290
653099ff 1291STATIC int
dc3bf405 1292S_ssc_is_anything(const regnode_ssc *ssc)
653099ff 1293{
c144baaa
KW
1294 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1295 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1296 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1297 * in any way, so there's no point in using it */
cdd87c1d
KW
1298
1299 UV start, end;
1300 bool ret;
653099ff 1301
557bd3fb 1302 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 1303
71068078 1304 assert(is_ANYOF_SYNTHETIC(ssc));
cdd87c1d 1305
93e92956 1306 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
cdd87c1d
KW
1307 return FALSE;
1308 }
1309
1310 /* See if the list consists solely of the range 0 - Infinity */
1311 invlist_iterinit(ssc->invlist);
1312 ret = invlist_iternext(ssc->invlist, &start, &end)
1313 && start == 0
1314 && end == UV_MAX;
1315
1316 invlist_iterfinish(ssc->invlist);
1317
1318 if (ret) {
1319 return TRUE;
1320 }
1321
1322 /* If e.g., both \w and \W are set, matches everything */
e0e1be5f 1323 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1324 int i;
1325 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1326 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1327 return TRUE;
1328 }
1329 }
1330 }
1331
1332 return FALSE;
653099ff
GS
1333}
1334
653099ff 1335STATIC void
cdd87c1d 1336S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 1337{
cdd87c1d
KW
1338 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1339 * string, any code point, or any posix class under locale */
1340
557bd3fb 1341 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 1342
557bd3fb 1343 Zero(ssc, 1, regnode_ssc);
71068078 1344 set_ANYOF_SYNTHETIC(ssc);
93e92956 1345 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
3420edd7 1346 ssc_anything(ssc);
cdd87c1d 1347
2f306ab9
KW
1348 /* If any portion of the regex is to operate under locale rules that aren't
1349 * fully known at compile time, initialization includes it. The reason
1350 * this isn't done for all regexes is that the optimizer was written under
1351 * the assumption that locale was all-or-nothing. Given the complexity and
1352 * lack of documentation in the optimizer, and that there are inadequate
1353 * test cases for locale, many parts of it may not work properly, it is
1354 * safest to avoid locale unless necessary. */
cdd87c1d
KW
1355 if (RExC_contains_locale) {
1356 ANYOF_POSIXL_SETALL(ssc);
cdd87c1d
KW
1357 }
1358 else {
1359 ANYOF_POSIXL_ZERO(ssc);
1360 }
653099ff
GS
1361}
1362
b423522f 1363STATIC int
dc3bf405
BF
1364S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1365 const regnode_ssc *ssc)
b423522f
KW
1366{
1367 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1368 * to the list of code points matched, and locale posix classes; hence does
1369 * not check its flags) */
1370
1371 UV start, end;
1372 bool ret;
1373
1374 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1375
71068078 1376 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1377
1378 invlist_iterinit(ssc->invlist);
1379 ret = invlist_iternext(ssc->invlist, &start, &end)
1380 && start == 0
1381 && end == UV_MAX;
1382
1383 invlist_iterfinish(ssc->invlist);
1384
1385 if (! ret) {
1386 return FALSE;
1387 }
1388
e0e1be5f 1389 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
31f05a37 1390 return FALSE;
b423522f
KW
1391 }
1392
1393 return TRUE;
1394}
1395
1396STATIC SV*
1397S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
5c0f85ef 1398 const regnode_charclass* const node)
b423522f
KW
1399{
1400 /* Returns a mortal inversion list defining which code points are matched
1401 * by 'node', which is of type ANYOF. Handles complementing the result if
1402 * appropriate. If some code points aren't knowable at this time, the
31f05a37
KW
1403 * returned list must, and will, contain every code point that is a
1404 * possibility. */
b423522f 1405
e2506fa7 1406 SV* invlist = NULL;
1ee208c4 1407 SV* only_utf8_locale_invlist = NULL;
b423522f
KW
1408 unsigned int i;
1409 const U32 n = ARG(node);
31f05a37 1410 bool new_node_has_latin1 = FALSE;
b423522f
KW
1411
1412 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1413
1414 /* Look at the data structure created by S_set_ANYOF_arg() */
93e92956 1415 if (n != ANYOF_ONLY_HAS_BITMAP) {
b423522f
KW
1416 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1417 AV * const av = MUTABLE_AV(SvRV(rv));
1418 SV **const ary = AvARRAY(av);
1419 assert(RExC_rxi->data->what[n] == 's');
1420
1421 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1422 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1423 }
1424 else if (ary[0] && ary[0] != &PL_sv_undef) {
1425
1426 /* Here, no compile-time swash, and there are things that won't be
1427 * known until runtime -- we have to assume it could be anything */
e2506fa7 1428 invlist = sv_2mortal(_new_invlist(1));
b423522f
KW
1429 return _add_range_to_invlist(invlist, 0, UV_MAX);
1430 }
1ee208c4 1431 else if (ary[3] && ary[3] != &PL_sv_undef) {
b423522f
KW
1432
1433 /* Here no compile-time swash, and no run-time only data. Use the
1434 * node's inversion list */
1ee208c4
KW
1435 invlist = sv_2mortal(invlist_clone(ary[3]));
1436 }
1437
1438 /* Get the code points valid only under UTF-8 locales */
037715a6 1439 if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
1ee208c4
KW
1440 && ary[2] && ary[2] != &PL_sv_undef)
1441 {
1442 only_utf8_locale_invlist = ary[2];
b423522f
KW
1443 }
1444 }
1445
e2506fa7
KW
1446 if (! invlist) {
1447 invlist = sv_2mortal(_new_invlist(0));
1448 }
1449
dcb20b36
KW
1450 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1451 * code points, and an inversion list for the others, but if there are code
1452 * points that should match only conditionally on the target string being
1453 * UTF-8, those are placed in the inversion list, and not the bitmap.
1454 * Since there are circumstances under which they could match, they are
1455 * included in the SSC. But if the ANYOF node is to be inverted, we have
1456 * to exclude them here, so that when we invert below, the end result
1457 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1458 * have to do this here before we add the unconditionally matched code
1459 * points */
b423522f
KW
1460 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1461 _invlist_intersection_complement_2nd(invlist,
1462 PL_UpperLatin1,
1463 &invlist);
1464 }
1465
1466 /* Add in the points from the bit map */
dcb20b36 1467 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
b423522f 1468 if (ANYOF_BITMAP_TEST(node, i)) {
6f8848d5
TC
1469 unsigned int start = i++;
1470
1471 for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
1472 /* empty */
1473 }
1474 invlist = _add_range_to_invlist(invlist, start, i-1);
31f05a37 1475 new_node_has_latin1 = TRUE;
b423522f
KW
1476 }
1477 }
1478
1479 /* If this can match all upper Latin1 code points, have to add them
ac33c516
KW
1480 * as well. But don't add them if inverting, as when that gets done below,
1481 * it would exclude all these characters, including the ones it shouldn't
1482 * that were added just above */
1483 if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
f240c685
KW
1484 && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1485 {
b423522f
KW
1486 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1487 }
1488
1489 /* Similarly for these */
93e92956 1490 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
e0a1ff7a 1491 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
b423522f
KW
1492 }
1493
1494 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1495 _invlist_invert(invlist);
1496 }
037715a6 1497 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
31f05a37
KW
1498
1499 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1500 * locale. We can skip this if there are no 0-255 at all. */
1501 _invlist_union(invlist, PL_Latin1, &invlist);
1502 }
1503
1ee208c4
KW
1504 /* Similarly add the UTF-8 locale possible matches. These have to be
1505 * deferred until after the non-UTF-8 locale ones are taken care of just
1506 * above, or it leads to wrong results under ANYOF_INVERT */
1507 if (only_utf8_locale_invlist) {
31f05a37 1508 _invlist_union_maybe_complement_2nd(invlist,
1ee208c4 1509 only_utf8_locale_invlist,
31f05a37
KW
1510 ANYOF_FLAGS(node) & ANYOF_INVERT,
1511 &invlist);
1512 }
b423522f
KW
1513
1514 return invlist;
1515}
1516
1051e1c4 1517/* These two functions currently do the exact same thing */
557bd3fb 1518#define ssc_init_zero ssc_init
653099ff 1519
cdd87c1d
KW
1520#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1521#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1522
557bd3fb 1523/* 'AND' a given class with another one. Can create false positives. 'ssc'
93e92956
KW
1524 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1525 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1526
653099ff 1527STATIC void
b423522f 1528S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1529 const regnode_charclass *and_with)
653099ff 1530{
cdd87c1d
KW
1531 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1532 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1533
a0dd4231
KW
1534 SV* anded_cp_list;
1535 U8 anded_flags;
1e6ade67 1536
cdd87c1d 1537 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1538
71068078 1539 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1540
1541 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1542 * the code point inversion list and just the relevant flags */
71068078 1543 if (is_ANYOF_SYNTHETIC(and_with)) {
7dcac5f6 1544 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
a0dd4231 1545 anded_flags = ANYOF_FLAGS(and_with);
e9b08962
KW
1546
1547 /* XXX This is a kludge around what appears to be deficiencies in the
1548 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1549 * there are paths through the optimizer where it doesn't get weeded
1550 * out when it should. And if we don't make some extra provision for
1551 * it like the code just below, it doesn't get added when it should.
1552 * This solution is to add it only when AND'ing, which is here, and
1553 * only when what is being AND'ed is the pristine, original node
1554 * matching anything. Thus it is like adding it to ssc_anything() but
1555 * only when the result is to be AND'ed. Probably the same solution
1556 * could be adopted for the same problem we have with /l matching,
1557 * which is solved differently in S_ssc_init(), and that would lead to
1558 * fewer false positives than that solution has. But if this solution
1559 * creates bugs, the consequences are only that a warning isn't raised
1560 * that should be; while the consequences for having /l bugs is
1561 * incorrect matches */
7dcac5f6 1562 if (ssc_is_anything((regnode_ssc *)and_with)) {
f240c685 1563 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
e9b08962 1564 }
a0dd4231
KW
1565 }
1566 else {
5c0f85ef 1567 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
f240c685
KW
1568 if (OP(and_with) == ANYOFD) {
1569 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1570 }
1571 else {
1572 anded_flags = ANYOF_FLAGS(and_with)
1573 &( ANYOF_COMMON_FLAGS
108316fb
KW
1574 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1575 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
d1c40ef5
KW
1576 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
1577 anded_flags &=
1578 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1579 }
f240c685 1580 }
a0dd4231
KW
1581 }
1582
1583 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1584
1585 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1586 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1587 * 'and_with' may be inverted. When not inverted, we have the situation of
1588 * computing:
1589 * (C1 | P1) & (C2 | P2)
1590 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1591 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1592 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1593 * <= ((C1 & C2) | P1 | P2)
1594 * Alternatively, the last few steps could be:
1595 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1596 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1597 * <= (C1 | C2 | (P1 & P2))
1598 * We favor the second approach if either P1 or P2 is non-empty. This is
1599 * because these components are a barrier to doing optimizations, as what
1600 * they match cannot be known until the moment of matching as they are
1601 * dependent on the current locale, 'AND"ing them likely will reduce or
1602 * eliminate them.
1603 * But we can do better if we know that C1,P1 are in their initial state (a
1604 * frequent occurrence), each matching everything:
1605 * (<everything>) & (C2 | P2) = C2 | P2
1606 * Similarly, if C2,P2 are in their initial state (again a frequent
1607 * occurrence), the result is a no-op
1608 * (C1 | P1) & (<everything>) = C1 | P1
1609 *
1610 * Inverted, we have
1611 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1612 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1613 * <= (C1 & ~C2) | (P1 & ~P2)
1614 * */
1aa99e6b 1615
a0dd4231 1616 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
71068078 1617 && ! is_ANYOF_SYNTHETIC(and_with))
a0dd4231 1618 {
cdd87c1d 1619 unsigned int i;
8951c461 1620
cdd87c1d
KW
1621 ssc_intersection(ssc,
1622 anded_cp_list,
1623 FALSE /* Has already been inverted */
1624 );
c6b76537 1625
cdd87c1d
KW
1626 /* If either P1 or P2 is empty, the intersection will be also; can skip
1627 * the loop */
93e92956 1628 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
cdd87c1d
KW
1629 ANYOF_POSIXL_ZERO(ssc);
1630 }
e0e1be5f 1631 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1632
1633 /* Note that the Posix class component P from 'and_with' actually
1634 * looks like:
1635 * P = Pa | Pb | ... | Pn
1636 * where each component is one posix class, such as in [\w\s].
1637 * Thus
1638 * ~P = ~(Pa | Pb | ... | Pn)
1639 * = ~Pa & ~Pb & ... & ~Pn
1640 * <= ~Pa | ~Pb | ... | ~Pn
1641 * The last is something we can easily calculate, but unfortunately
1642 * is likely to have many false positives. We could do better
1643 * in some (but certainly not all) instances if two classes in
1644 * P have known relationships. For example
1645 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1646 * So
1647 * :lower: & :print: = :lower:
1648 * And similarly for classes that must be disjoint. For example,
1649 * since \s and \w can have no elements in common based on rules in
1650 * the POSIX standard,
1651 * \w & ^\S = nothing
1652 * Unfortunately, some vendor locales do not meet the Posix
1653 * standard, in particular almost everything by Microsoft.
1654 * The loop below just changes e.g., \w into \W and vice versa */
1655
1ee208c4 1656 regnode_charclass_posixl temp;
cdd87c1d
KW
1657 int add = 1; /* To calculate the index of the complement */
1658
1659 ANYOF_POSIXL_ZERO(&temp);
1660 for (i = 0; i < ANYOF_MAX; i++) {
1661 assert(i % 2 != 0
7dcac5f6
KW
1662 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1663 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
cdd87c1d 1664
7dcac5f6 1665 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
cdd87c1d
KW
1666 ANYOF_POSIXL_SET(&temp, i + add);
1667 }
1668 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1669 }
1670 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1671
cdd87c1d
KW
1672 } /* else ssc already has no posixes */
1673 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1674 in its initial state */
71068078 1675 else if (! is_ANYOF_SYNTHETIC(and_with)
7dcac5f6 1676 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
cdd87c1d
KW
1677 {
1678 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1679 * copy it over 'ssc' */
1680 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
71068078 1681 if (is_ANYOF_SYNTHETIC(and_with)) {
cdd87c1d
KW
1682 StructCopy(and_with, ssc, regnode_ssc);
1683 }
1684 else {
1685 ssc->invlist = anded_cp_list;
1686 ANYOF_POSIXL_ZERO(ssc);
93e92956 1687 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1688 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
cdd87c1d
KW
1689 }
1690 }
1691 }
e0e1be5f 1692 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
93e92956 1693 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
cdd87c1d
KW
1694 {
1695 /* One or the other of P1, P2 is non-empty. */
93e92956 1696 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1ea8b7fe
KW
1697 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1698 }
cdd87c1d
KW
1699 ssc_union(ssc, anded_cp_list, FALSE);
1700 }
1701 else { /* P1 = P2 = empty */
1702 ssc_intersection(ssc, anded_cp_list, FALSE);
1703 }
137165a6 1704 }
653099ff
GS
1705}
1706
653099ff 1707STATIC void
cdd87c1d 1708S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1709 const regnode_charclass *or_with)
653099ff 1710{
cdd87c1d
KW
1711 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1712 * another SSC or a regular ANYOF class. Can create false positives if
1713 * 'or_with' is to be inverted. */
7918f24d 1714
a0dd4231
KW
1715 SV* ored_cp_list;
1716 U8 ored_flags;
c6b76537 1717
cdd87c1d 1718 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 1719
71068078 1720 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1721
1722 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1723 * the code point inversion list and just the relevant flags */
71068078 1724 if (is_ANYOF_SYNTHETIC(or_with)) {
7dcac5f6 1725 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
a0dd4231
KW
1726 ored_flags = ANYOF_FLAGS(or_with);
1727 }
1728 else {
5c0f85ef 1729 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
eff8b7dc 1730 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
f240c685
KW
1731 if (OP(or_with) != ANYOFD) {
1732 ored_flags
1733 |= ANYOF_FLAGS(or_with)
108316fb
KW
1734 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1735 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
d1c40ef5
KW
1736 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
1737 ored_flags |=
1738 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1739 }
f240c685 1740 }
a0dd4231
KW
1741 }
1742
1743 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
1744
1745 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1746 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1747 * 'or_with' may be inverted. When not inverted, we have the simple
1748 * situation of computing:
1749 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1750 * If P1|P2 yields a situation with both a class and its complement are
1751 * set, like having both \w and \W, this matches all code points, and we
1752 * can delete these from the P component of the ssc going forward. XXX We
1753 * might be able to delete all the P components, but I (khw) am not certain
1754 * about this, and it is better to be safe.
1755 *
1756 * Inverted, we have
1757 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1758 * <= (C1 | P1) | ~C2
1759 * <= (C1 | ~C2) | P1
1760 * (which results in actually simpler code than the non-inverted case)
1761 * */
9826f543 1762
a0dd4231 1763 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
71068078 1764 && ! is_ANYOF_SYNTHETIC(or_with))
a0dd4231 1765 {
cdd87c1d 1766 /* We ignore P2, leaving P1 going forward */
1ea8b7fe 1767 } /* else Not inverted */
93e92956 1768 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1769 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
e0e1be5f 1770 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1771 unsigned int i;
1772 for (i = 0; i < ANYOF_MAX; i += 2) {
1773 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1774 {
1775 ssc_match_all_cp(ssc);
1776 ANYOF_POSIXL_CLEAR(ssc, i);
1777 ANYOF_POSIXL_CLEAR(ssc, i+1);
cdd87c1d
KW
1778 }
1779 }
1780 }
1aa99e6b 1781 }
cdd87c1d
KW
1782
1783 ssc_union(ssc,
1784 ored_cp_list,
1785 FALSE /* Already has been inverted */
1786 );
653099ff
GS
1787}
1788
b423522f
KW
1789PERL_STATIC_INLINE void
1790S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1791{
1792 PERL_ARGS_ASSERT_SSC_UNION;
1793
71068078 1794 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1795
1796 _invlist_union_maybe_complement_2nd(ssc->invlist,
1797 invlist,
1798 invert2nd,
1799 &ssc->invlist);
1800}
1801
1802PERL_STATIC_INLINE void
1803S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1804 SV* const invlist,
1805 const bool invert2nd)
1806{
1807 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1808
71068078 1809 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1810
1811 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1812 invlist,
1813 invert2nd,
1814 &ssc->invlist);
1815}
1816
1817PERL_STATIC_INLINE void
1818S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1819{
1820 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1821
71068078 1822 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1823
1824 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1825}
1826
1827PERL_STATIC_INLINE void
1828S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1829{
1830 /* AND just the single code point 'cp' into the SSC 'ssc' */
1831
1832 SV* cp_list = _new_invlist(2);
1833
1834 PERL_ARGS_ASSERT_SSC_CP_AND;
1835
71068078 1836 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1837
1838 cp_list = add_cp_to_invlist(cp_list, cp);
1839 ssc_intersection(ssc, cp_list,
1840 FALSE /* Not inverted */
1841 );
1842 SvREFCNT_dec_NN(cp_list);
1843}
1844
1845PERL_STATIC_INLINE void
dc3bf405 1846S_ssc_clear_locale(regnode_ssc *ssc)
b423522f
KW
1847{
1848 /* Set the SSC 'ssc' to not match any locale things */
b423522f
KW
1849 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1850
71068078 1851 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1852
1853 ANYOF_POSIXL_ZERO(ssc);
1854 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1855}
1856
b35552de
KW
1857#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1858
1859STATIC bool
1860S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1861{
1862 /* The synthetic start class is used to hopefully quickly winnow down
1863 * places where a pattern could start a match in the target string. If it
1864 * doesn't really narrow things down that much, there isn't much point to
1865 * having the overhead of using it. This function uses some very crude
1866 * heuristics to decide if to use the ssc or not.
1867 *
1868 * It returns TRUE if 'ssc' rules out more than half what it considers to
1869 * be the "likely" possible matches, but of course it doesn't know what the
1870 * actual things being matched are going to be; these are only guesses
1871 *
1872 * For /l matches, it assumes that the only likely matches are going to be
1873 * in the 0-255 range, uniformly distributed, so half of that is 127
1874 * For /a and /d matches, it assumes that the likely matches will be just
1875 * the ASCII range, so half of that is 63
1876 * For /u and there isn't anything matching above the Latin1 range, it
1877 * assumes that that is the only range likely to be matched, and uses
1878 * half that as the cut-off: 127. If anything matches above Latin1,
1879 * it assumes that all of Unicode could match (uniformly), except for
1880 * non-Unicode code points and things in the General Category "Other"
1881 * (unassigned, private use, surrogates, controls and formats). This
1882 * is a much large number. */
1883
b35552de
KW
1884 U32 count = 0; /* Running total of number of code points matched by
1885 'ssc' */
1886 UV start, end; /* Start and end points of current range in inversion
1887 list */
72400949
KW
1888 const U32 max_code_points = (LOC)
1889 ? 256
1890 : (( ! UNI_SEMANTICS
1891 || invlist_highest(ssc->invlist) < 256)
1892 ? 128
1893 : NON_OTHER_COUNT);
1894 const U32 max_match = max_code_points / 2;
b35552de
KW
1895
1896 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1897
1898 invlist_iterinit(ssc->invlist);
1899 while (invlist_iternext(ssc->invlist, &start, &end)) {
72400949
KW
1900 if (start >= max_code_points) {
1901 break;
b35552de 1902 }
72400949 1903 end = MIN(end, max_code_points - 1);
b35552de 1904 count += end - start + 1;
72400949 1905 if (count >= max_match) {
b35552de
KW
1906 invlist_iterfinish(ssc->invlist);
1907 return FALSE;
1908 }
1909 }
1910
1911 return TRUE;
1912}
1913
1914
b423522f
KW
1915STATIC void
1916S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1917{
1918 /* The inversion list in the SSC is marked mortal; now we need a more
1919 * permanent copy, which is stored the same way that is done in a regular
dcb20b36
KW
1920 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1921 * map */
b423522f
KW
1922
1923 SV* invlist = invlist_clone(ssc->invlist);
1924
1925 PERL_ARGS_ASSERT_SSC_FINALIZE;
1926
71068078 1927 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f 1928
a0dd4231 1929 /* The code in this file assumes that all but these flags aren't relevant
93e92956
KW
1930 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1931 * by the time we reach here */
f240c685
KW
1932 assert(! (ANYOF_FLAGS(ssc)
1933 & ~( ANYOF_COMMON_FLAGS
108316fb
KW
1934 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1935 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
a0dd4231 1936
b423522f
KW
1937 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1938
1ee208c4
KW
1939 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1940 NULL, NULL, NULL, FALSE);
b423522f 1941
85c8e306
KW
1942 /* Make sure is clone-safe */
1943 ssc->invlist = NULL;
1944
e0e1be5f 1945 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
93e92956 1946 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
e0e1be5f 1947 }
1462525b 1948
b2e90ddf
KW
1949 if (RExC_contains_locale) {
1950 OP(ssc) = ANYOFL;
1951 }
1952
1462525b 1953 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
b423522f
KW
1954}
1955
a3621e74
YO
1956#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1957#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1958#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
538e84ed
KW
1959#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1960 ? (TRIE_LIST_CUR( idx ) - 1) \
1961 : 0 )
a3621e74 1962
3dab1dad
YO
1963
1964#ifdef DEBUGGING
07be1b83 1965/*
2b8b4781
NC
1966 dump_trie(trie,widecharmap,revcharmap)
1967 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1968 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
1969
1970 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
1971 The _interim_ variants are used for debugging the interim
1972 tables that are used to generate the final compressed
1973 representation which is what dump_trie expects.
1974
486ec47a 1975 Part of the reason for their existence is to provide a form
3dab1dad 1976 of documentation as to how the different representations function.
07be1b83
YO
1977
1978*/
3dab1dad
YO
1979
1980/*
3dab1dad
YO
1981 Dumps the final compressed table form of the trie to Perl_debug_log.
1982 Used for debugging make_trie().
1983*/
b9a59e08 1984
3dab1dad 1985STATIC void
2b8b4781
NC
1986S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1987 AV *revcharmap, U32 depth)
3dab1dad
YO
1988{
1989 U32 state;
ab3bbdeb 1990 SV *sv=sv_newmortal();
55eed653 1991 int colwidth= widecharmap ? 6 : 4;
2e64971a 1992 U16 word;
3dab1dad
YO
1993 GET_RE_DEBUG_FLAGS_DECL;
1994
7918f24d 1995 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 1996
6ad9a8ab 1997 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
1e37780e 1998 depth+1, "Match","Base","Ofs" );
3dab1dad
YO
1999
2000 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 2001 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 2002 if ( tmp ) {
6ad9a8ab 2003 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2004 colwidth,
538e84ed 2005 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2006 PL_colors[0], PL_colors[1],
2007 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2008 PERL_PV_ESCAPE_FIRSTCHAR
2009 )
ab3bbdeb 2010 );
3dab1dad
YO
2011 }
2012 }
1e37780e
YO
2013 Perl_re_printf( aTHX_ "\n");
2014 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
3dab1dad
YO
2015
2016 for( state = 0 ; state < trie->uniquecharcount ; state++ )
6ad9a8ab
YO
2017 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2018 Perl_re_printf( aTHX_ "\n");
3dab1dad 2019
1e2e3d02 2020 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 2021 const U32 base = trie->states[ state ].trans.base;
3dab1dad 2022
1e37780e 2023 Perl_re_indentf( aTHX_ "#%4"UVXf"|", depth+1, (UV)state);
3dab1dad
YO
2024
2025 if ( trie->states[ state ].wordnum ) {
1e37780e 2026 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
3dab1dad 2027 } else {
6ad9a8ab 2028 Perl_re_printf( aTHX_ "%6s", "" );
3dab1dad
YO
2029 }
2030
6ad9a8ab 2031 Perl_re_printf( aTHX_ " @%4"UVXf" ", (UV)base );
3dab1dad
YO
2032
2033 if ( base ) {
2034 U32 ofs = 0;
2035
2036 while( ( base + ofs < trie->uniquecharcount ) ||
2037 ( base + ofs - trie->uniquecharcount < trie->lasttrans
538e84ed
KW
2038 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2039 != state))
3dab1dad
YO
2040 ofs++;
2041
6ad9a8ab 2042 Perl_re_printf( aTHX_ "+%2"UVXf"[ ", (UV)ofs);
3dab1dad
YO
2043
2044 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
538e84ed
KW
2045 if ( ( base + ofs >= trie->uniquecharcount )
2046 && ( base + ofs - trie->uniquecharcount
2047 < trie->lasttrans )
2048 && trie->trans[ base + ofs
2049 - trie->uniquecharcount ].check == state )
3dab1dad 2050 {
1e37780e
YO
2051 Perl_re_printf( aTHX_ "%*"UVXf, colwidth,
2052 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2053 );
3dab1dad 2054 } else {
6ad9a8ab 2055 Perl_re_printf( aTHX_ "%*s",colwidth," ." );
3dab1dad
YO
2056 }
2057 }
2058
6ad9a8ab 2059 Perl_re_printf( aTHX_ "]");
3dab1dad
YO
2060
2061 }
6ad9a8ab 2062 Perl_re_printf( aTHX_ "\n" );
3dab1dad 2063 }
6ad9a8ab 2064 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
cb41e5d6 2065 depth);
2e64971a 2066 for (word=1; word <= trie->wordcount; word++) {
6ad9a8ab 2067 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2e64971a
DM
2068 (int)word, (int)(trie->wordinfo[word].prev),
2069 (int)(trie->wordinfo[word].len));
2070 }
6ad9a8ab 2071 Perl_re_printf( aTHX_ "\n" );
538e84ed 2072}
3dab1dad 2073/*
3dab1dad 2074 Dumps a fully constructed but uncompressed trie in list form.
538e84ed 2075 List tries normally only are used for construction when the number of
3dab1dad
YO
2076 possible chars (trie->uniquecharcount) is very high.
2077 Used for debugging make_trie().
2078*/
2079STATIC void
55eed653 2080S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2081 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2082 U32 depth)
3dab1dad
YO
2083{
2084 U32 state;
ab3bbdeb 2085 SV *sv=sv_newmortal();
55eed653 2086 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2087 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2088
2089 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2090
3dab1dad 2091 /* print out the table precompression. */
6ad9a8ab 2092 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
cb41e5d6 2093 depth+1 );
6ad9a8ab 2094 Perl_re_indentf( aTHX_ "%s",
cb41e5d6 2095 depth+1, "------:-----+-----------------\n" );
538e84ed 2096
3dab1dad
YO
2097 for( state=1 ; state < next_alloc ; state ++ ) {
2098 U16 charid;
538e84ed 2099
6ad9a8ab 2100 Perl_re_indentf( aTHX_ " %4"UVXf" :",
cb41e5d6 2101 depth+1, (UV)state );
3dab1dad 2102 if ( ! trie->states[ state ].wordnum ) {
6ad9a8ab 2103 Perl_re_printf( aTHX_ "%5s| ","");
3dab1dad 2104 } else {
6ad9a8ab 2105 Perl_re_printf( aTHX_ "W%4x| ",
3dab1dad
YO
2106 trie->states[ state ].wordnum
2107 );
2108 }
2109 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
538e84ed
KW
2110 SV ** const tmp = av_fetch( revcharmap,
2111 TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb 2112 if ( tmp ) {
6ad9a8ab 2113 Perl_re_printf( aTHX_ "%*s:%3X=%4"UVXf" | ",
ab3bbdeb 2114 colwidth,
538e84ed
KW
2115 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2116 colwidth,
2117 PL_colors[0], PL_colors[1],
2118 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2119 | PERL_PV_ESCAPE_FIRSTCHAR
ab3bbdeb 2120 ) ,
1e2e3d02
YO
2121 TRIE_LIST_ITEM(state,charid).forid,
2122 (UV)TRIE_LIST_ITEM(state,charid).newstate
2123 );
538e84ed 2124 if (!(charid % 10))
6ad9a8ab 2125 Perl_re_printf( aTHX_ "\n%*s| ",
664e119d 2126 (int)((depth * 2) + 14), "");
1e2e3d02 2127 }
ab3bbdeb 2128 }
6ad9a8ab 2129 Perl_re_printf( aTHX_ "\n");
3dab1dad 2130 }
538e84ed 2131}
3dab1dad
YO
2132
2133/*
3dab1dad 2134 Dumps a fully constructed but uncompressed trie in table form.
538e84ed
KW
2135 This is the normal DFA style state transition table, with a few
2136 twists to facilitate compression later.
3dab1dad
YO
2137 Used for debugging make_trie().
2138*/
2139STATIC void
55eed653 2140S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2141 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2142 U32 depth)
3dab1dad
YO
2143{
2144 U32 state;
2145 U16 charid;
ab3bbdeb 2146 SV *sv=sv_newmortal();
55eed653 2147 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2148 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2149
2150 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
538e84ed 2151
3dab1dad
YO
2152 /*
2153 print out the table precompression so that we can do a visual check
2154 that they are identical.
2155 */
538e84ed 2156
6ad9a8ab 2157 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
3dab1dad
YO
2158
2159 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 2160 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 2161 if ( tmp ) {
6ad9a8ab 2162 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2163 colwidth,
538e84ed 2164 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2165 PL_colors[0], PL_colors[1],
2166 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2167 PERL_PV_ESCAPE_FIRSTCHAR
2168 )
ab3bbdeb 2169 );
3dab1dad
YO
2170 }
2171 }
2172
4aaafc03
YO
2173 Perl_re_printf( aTHX_ "\n");
2174 Perl_re_indentf( aTHX_ "State+-", depth+1 );
3dab1dad
YO
2175
2176 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
6ad9a8ab 2177 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
3dab1dad
YO
2178 }
2179
6ad9a8ab 2180 Perl_re_printf( aTHX_ "\n" );
3dab1dad
YO
2181
2182 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2183
6ad9a8ab 2184 Perl_re_indentf( aTHX_ "%4"UVXf" : ",
cb41e5d6 2185 depth+1,
3dab1dad
YO
2186 (UV)TRIE_NODENUM( state ) );
2187
2188 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
2189 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2190 if (v)
6ad9a8ab 2191 Perl_re_printf( aTHX_ "%*"UVXf, colwidth, v );
ab3bbdeb 2192 else
6ad9a8ab 2193 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
3dab1dad
YO
2194 }
2195 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
6ad9a8ab 2196 Perl_re_printf( aTHX_ " (%4"UVXf")\n",
538e84ed 2197 (UV)trie->trans[ state ].check );
3dab1dad 2198 } else {
6ad9a8ab 2199 Perl_re_printf( aTHX_ " (%4"UVXf") W%4X\n",
538e84ed 2200 (UV)trie->trans[ state ].check,
3dab1dad
YO
2201 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2202 }
2203 }
07be1b83 2204}
3dab1dad
YO
2205
2206#endif
2207
2e64971a 2208
786e8c11
YO
2209/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2210 startbranch: the first branch in the whole branch sequence
2211 first : start branch of sequence of branch-exact nodes.
2212 May be the same as startbranch
2213 last : Thing following the last branch.
2214 May be the same as tail.
2215 tail : item following the branch sequence
2216 count : words in the sequence
a4525e78 2217 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
786e8c11 2218 depth : indent depth
3dab1dad 2219
786e8c11 2220Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 2221
786e8c11
YO
2222A trie is an N'ary tree where the branches are determined by digital
2223decomposition of the key. IE, at the root node you look up the 1st character and
2224follow that branch repeat until you find the end of the branches. Nodes can be
2225marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 2226
786e8c11 2227 /he|she|his|hers/
72f13be8 2228
786e8c11
YO
2229would convert into the following structure. Numbers represent states, letters
2230following numbers represent valid transitions on the letter from that state, if
2231the number is in square brackets it represents an accepting state, otherwise it
2232will be in parenthesis.
07be1b83 2233
786e8c11
YO
2234 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2235 | |
2236 | (2)
2237 | |
2238 (1) +-i->(6)-+-s->[7]
2239 |
2240 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 2241
786e8c11
YO
2242 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2243
2244This shows that when matching against the string 'hers' we will begin at state 1
2245read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2246then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2247is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2248single traverse. We store a mapping from accepting to state to which word was
2249matched, and then when we have multiple possibilities we try to complete the
b8fda935 2250rest of the regex in the order in which they occurred in the alternation.
786e8c11
YO
2251
2252The only prior NFA like behaviour that would be changed by the TRIE support is
2253the silent ignoring of duplicate alternations which are of the form:
2254
2255 / (DUPE|DUPE) X? (?{ ... }) Y /x
2256
4b714af6 2257Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 2258and without the optimisation. With the optimisations dupes will be silently
486ec47a 2259ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
2260the following demonstrates:
2261
2262 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2263
2264which prints out 'word' three times, but
2265
2266 'words'=~/(word|word|word)(?{ print $1 })S/
2267
2268which doesnt print it out at all. This is due to other optimisations kicking in.
2269
2270Example of what happens on a structural level:
2271
486ec47a 2272The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
2273
2274 1: CURLYM[1] {1,32767}(18)
2275 5: BRANCH(8)
2276 6: EXACT <ac>(16)
2277 8: BRANCH(11)
2278 9: EXACT <ad>(16)
2279 11: BRANCH(14)
2280 12: EXACT <ab>(16)
2281 16: SUCCEED(0)
2282 17: NOTHING(18)
2283 18: END(0)
2284
2285This would be optimizable with startbranch=5, first=5, last=16, tail=16
2286and should turn into:
2287
2288 1: CURLYM[1] {1,32767}(18)
2289 5: TRIE(16)
2290 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2291 <ac>
2292 <ad>
2293 <ab>
2294 16: SUCCEED(0)
2295 17: NOTHING(18)
2296 18: END(0)
2297
2298Cases where tail != last would be like /(?foo|bar)baz/:
2299
2300 1: BRANCH(4)
2301 2: EXACT <foo>(8)
2302 4: BRANCH(7)
2303 5: EXACT <bar>(8)
2304 7: TAIL(8)
2305 8: EXACT <baz>(10)
2306 10: END(0)
2307
2308which would be optimizable with startbranch=1, first=1, last=7, tail=8
2309and would end up looking like:
2310
2311 1: TRIE(8)
2312 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2313 <foo>
2314 <bar>
2315 7: TAIL(8)
2316 8: EXACT <baz>(10)
2317 10: END(0)
2318
c80e42f3 2319 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
2320
2321is the recommended Unicode-aware way of saying
2322
2323 *(d++) = uv;
2324*/
2325
fab2782b 2326#define TRIE_STORE_REVCHAR(val) \
786e8c11 2327 STMT_START { \
73031816 2328 if (UTF) { \
668fcfea 2329 SV *zlopp = newSV(UTF8_MAXBYTES); \
88c9ea1e 2330 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
c80e42f3 2331 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
73031816
NC
2332 SvCUR_set(zlopp, kapow - flrbbbbb); \
2333 SvPOK_on(zlopp); \
2334 SvUTF8_on(zlopp); \
2335 av_push(revcharmap, zlopp); \
2336 } else { \
fab2782b 2337 char ooooff = (char)val; \
73031816
NC
2338 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2339 } \
2340 } STMT_END
786e8c11 2341
914a25d5
KW
2342/* This gets the next character from the input, folding it if not already
2343 * folded. */
2344#define TRIE_READ_CHAR STMT_START { \
2345 wordlen++; \
2346 if ( UTF ) { \
2347 /* if it is UTF then it is either already folded, or does not need \
2348 * folding */ \
1c1d615a 2349 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
2350 } \
2351 else if (folder == PL_fold_latin1) { \
7d006b13
KW
2352 /* This folder implies Unicode rules, which in the range expressible \
2353 * by not UTF is the lower case, with the two exceptions, one of \
2354 * which should have been taken care of before calling this */ \
2355 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2356 uvc = toLOWER_L1(*uc); \
2357 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2358 len = 1; \
914a25d5
KW
2359 } else { \
2360 /* raw data, will be folded later if needed */ \
2361 uvc = (U32)*uc; \
2362 len = 1; \
2363 } \
786e8c11
YO
2364} STMT_END
2365
2366
2367
2368#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2369 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
2370 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
2371 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
2372 } \
2373 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2374 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2375 TRIE_LIST_CUR( state )++; \
2376} STMT_END
07be1b83 2377
786e8c11
YO
2378#define TRIE_LIST_NEW(state) STMT_START { \
2379 Newxz( trie->states[ state ].trans.list, \
2380 4, reg_trie_trans_le ); \
2381 TRIE_LIST_CUR( state ) = 1; \
2382 TRIE_LIST_LEN( state ) = 4; \
2383} STMT_END
07be1b83 2384
786e8c11
YO
2385#define TRIE_HANDLE_WORD(state) STMT_START { \
2386 U16 dupe= trie->states[ state ].wordnum; \
2387 regnode * const noper_next = regnext( noper ); \
2388 \
786e8c11
YO
2389 DEBUG_r({ \
2390 /* store the word for dumping */ \
2391 SV* tmp; \
2392 if (OP(noper) != NOTHING) \
740cce10 2393 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 2394 else \
740cce10 2395 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 2396 av_push( trie_words, tmp ); \
786e8c11
YO
2397 }); \
2398 \
2399 curword++; \
2e64971a
DM
2400 trie->wordinfo[curword].prev = 0; \
2401 trie->wordinfo[curword].len = wordlen; \
2402 trie->wordinfo[curword].accept = state; \
786e8c11
YO
2403 \
2404 if ( noper_next < tail ) { \
2405 if (!trie->jump) \
538e84ed
KW
2406 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2407 sizeof(U16) ); \
7f69552c 2408 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
2409 if (!jumper) \
2410 jumper = noper_next; \
2411 if (!nextbranch) \
2412 nextbranch= regnext(cur); \
2413 } \
2414 \
2415 if ( dupe ) { \
2e64971a
DM
2416 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2417 /* chain, so that when the bits of chain are later */\
2418 /* linked together, the dups appear in the chain */\
2419 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2420 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
2421 } else { \
2422 /* we haven't inserted this word yet. */ \
2423 trie->states[ state ].wordnum = curword; \
2424 } \
2425} STMT_END
07be1b83 2426
3dab1dad 2427
786e8c11
YO
2428#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2429 ( ( base + charid >= ucharcount \
2430 && base + charid < ubound \
2431 && state == trie->trans[ base - ucharcount + charid ].check \
2432 && trie->trans[ base - ucharcount + charid ].next ) \
2433 ? trie->trans[ base - ucharcount + charid ].next \
2434 : ( state==1 ? special : 0 ) \
2435 )
3dab1dad 2436
786e8c11
YO
2437#define MADE_TRIE 1
2438#define MADE_JUMP_TRIE 2
2439#define MADE_EXACT_TRIE 4
3dab1dad 2440
a3621e74 2441STATIC I32
538e84ed
KW
2442S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2443 regnode *first, regnode *last, regnode *tail,
2444 U32 word_count, U32 flags, U32 depth)
a3621e74
YO
2445{
2446 /* first pass, loop through and scan words */
2447 reg_trie_data *trie;
55eed653 2448 HV *widecharmap = NULL;
2b8b4781 2449 AV *revcharmap = newAV();
a3621e74 2450 regnode *cur;
a3621e74
YO
2451 STRLEN len = 0;
2452 UV uvc = 0;
2453 U16 curword = 0;
2454 U32 next_alloc = 0;
786e8c11
YO
2455 regnode *jumper = NULL;
2456 regnode *nextbranch = NULL;
7f69552c 2457 regnode *convert = NULL;
2e64971a 2458 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 2459 /* we just use folder as a flag in utf8 */
1e696034 2460 const U8 * folder = NULL;
a3621e74 2461
2b8b4781 2462#ifdef DEBUGGING
cf78de0b 2463 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2b8b4781
NC
2464 AV *trie_words = NULL;
2465 /* along with revcharmap, this only used during construction but both are
2466 * useful during debugging so we store them in the struct when debugging.
8e11feef 2467 */
2b8b4781 2468#else
cf78de0b 2469 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 2470 STRLEN trie_charcount=0;
3dab1dad 2471#endif
2b8b4781 2472 SV *re_trie_maxbuff;
a3621e74 2473 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2474
2475 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
2476#ifndef DEBUGGING
2477 PERL_UNUSED_ARG(depth);
2478#endif
a3621e74 2479
1e696034 2480 switch (flags) {
a4525e78 2481 case EXACT: case EXACTL: break;
2f7f8cb1 2482 case EXACTFA:
fab2782b 2483 case EXACTFU_SS:
a4525e78
KW
2484 case EXACTFU:
2485 case EXACTFLU8: folder = PL_fold_latin1; break;
1e696034 2486 case EXACTF: folder = PL_fold; break;
fab2782b 2487 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
2488 }
2489
c944940b 2490 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 2491 trie->refcount = 1;
3dab1dad 2492 trie->startstate = 1;
786e8c11 2493 trie->wordcount = word_count;
f8fc2ecf 2494 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 2495 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
a4525e78 2496 if (flags == EXACT || flags == EXACTL)
c944940b 2497 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
2498 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2499 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2500
a3621e74 2501 DEBUG_r({
2b8b4781 2502 trie_words = newAV();
a3621e74 2503 });
a3621e74 2504
0111c4fd 2505 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
316ebaf2 2506 assert(re_trie_maxbuff);
a3621e74 2507 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 2508 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 2509 }
df826430 2510 DEBUG_TRIE_COMPILE_r({
6ad9a8ab 2511 Perl_re_indentf( aTHX_
cb41e5d6
YO
2512 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2513 depth+1,
538e84ed
KW
2514 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2515 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
3dab1dad 2516 });
538e84ed 2517
7f69552c
YO
2518 /* Find the node we are going to overwrite */
2519 if ( first == startbranch && OP( last ) != BRANCH ) {
2520 /* whole branch chain */
2521 convert = first;
2522 } else {
2523 /* branch sub-chain */
2524 convert = NEXTOPER( first );
2525 }
538e84ed 2526
a3621e74
YO
2527 /* -- First loop and Setup --
2528
2529 We first traverse the branches and scan each word to determine if it
2530 contains widechars, and how many unique chars there are, this is
2531 important as we have to build a table with at least as many columns as we
2532 have unique chars.
2533
2534 We use an array of integers to represent the character codes 0..255
538e84ed
KW
2535 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2536 the native representation of the character value as the key and IV's for
2537 the coded index.
a3621e74
YO
2538
2539 *TODO* If we keep track of how many times each character is used we can
2540 remap the columns so that the table compression later on is more
3b753521 2541 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
2542 middle and the least common are on the outside. IMO this would be better
2543 than a most to least common mapping as theres a decent chance the most
2544 common letter will share a node with the least common, meaning the node
486ec47a 2545 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
2546 case is when we have the least common nodes twice.
2547
2548 */
2549
a3621e74 2550 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 2551 regnode *noper = NEXTOPER( cur );
944e05e3
YO
2552 const U8 *uc;
2553 const U8 *e;
bc031a7d 2554 int foldlen = 0;
07be1b83 2555 U32 wordlen = 0; /* required init */
bc031a7d
KW
2556 STRLEN minchars = 0;
2557 STRLEN maxchars = 0;
538e84ed
KW
2558 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2559 bitmap?*/
a3621e74 2560
3dab1dad 2561 if (OP(noper) == NOTHING) {
df826430 2562 regnode *noper_next= regnext(noper);
944e05e3
YO
2563 if (noper_next < tail)
2564 noper= noper_next;
2565 }
2566
2567 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2568 uc= (U8*)STRING(noper);
2569 e= uc + STR_LEN(noper);
2570 } else {
2571 trie->minlen= 0;
2572 continue;
3dab1dad 2573 }
df826430 2574
944e05e3 2575
fab2782b 2576 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2577 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2578 regardless of encoding */
fab2782b
YO
2579 if (OP( noper ) == EXACTFU_SS) {
2580 /* false positives are ok, so just set this */
0dc4a61d 2581 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2582 }
2583 }
bc031a7d
KW
2584 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2585 branch */
3dab1dad 2586 TRIE_CHARCOUNT(trie)++;
a3621e74 2587 TRIE_READ_CHAR;
645de4ce 2588
bc031a7d
KW
2589 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2590 * is in effect. Under /i, this character can match itself, or
2591 * anything that folds to it. If not under /i, it can match just
2592 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2593 * all fold to k, and all are single characters. But some folds
2594 * expand to more than one character, so for example LATIN SMALL
2595 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2596 * the string beginning at 'uc' is 'ffi', it could be matched by
2597 * three characters, or just by the one ligature character. (It
2598 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2599 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2600 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2601 * match.) The trie needs to know the minimum and maximum number
2602 * of characters that could match so that it can use size alone to
2603 * quickly reject many match attempts. The max is simple: it is
2604 * the number of folded characters in this branch (since a fold is
2605 * never shorter than what folds to it. */
2606
2607 maxchars++;
2608
2609 /* And the min is equal to the max if not under /i (indicated by
2610 * 'folder' being NULL), or there are no multi-character folds. If
2611 * there is a multi-character fold, the min is incremented just
2612 * once, for the character that folds to the sequence. Each
2613 * character in the sequence needs to be added to the list below of
2614 * characters in the trie, but we count only the first towards the
2615 * min number of characters needed. This is done through the
2616 * variable 'foldlen', which is returned by the macros that look
2617 * for these sequences as the number of bytes the sequence
2618 * occupies. Each time through the loop, we decrement 'foldlen' by
2619 * how many bytes the current char occupies. Only when it reaches
2620 * 0 do we increment 'minchars' or look for another multi-character
2621 * sequence. */
2622 if (folder == NULL) {
2623 minchars++;
2624 }
2625 else if (foldlen > 0) {
2626 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
645de4ce
KW
2627 }
2628 else {
bc031a7d
KW
2629 minchars++;
2630
2631 /* See if *uc is the beginning of a multi-character fold. If
2632 * so, we decrement the length remaining to look at, to account
2633 * for the current character this iteration. (We can use 'uc'
2634 * instead of the fold returned by TRIE_READ_CHAR because for
2635 * non-UTF, the latin1_safe macro is smart enough to account
2636 * for all the unfolded characters, and because for UTF, the
2637 * string will already have been folded earlier in the
2638 * compilation process */
2639 if (UTF) {
2640 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2641 foldlen -= UTF8SKIP(uc);
645de4ce
KW
2642 }
2643 }
bc031a7d
KW
2644 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2645 foldlen--;
2646 }
645de4ce 2647 }
bc031a7d
KW
2648
2649 /* The current character (and any potential folds) should be added
2650 * to the possible matching characters for this position in this
2651 * branch */
a3621e74 2652 if ( uvc < 256 ) {
fab2782b
YO
2653 if ( folder ) {
2654 U8 folded= folder[ (U8) uvc ];
2655 if ( !trie->charmap[ folded ] ) {
2656 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2657 TRIE_STORE_REVCHAR( folded );
2658 }
2659 }
a3621e74
YO
2660 if ( !trie->charmap[ uvc ] ) {
2661 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2662 TRIE_STORE_REVCHAR( uvc );
a3621e74 2663 }
02daf0ab 2664 if ( set_bit ) {
62012aee
KW
2665 /* store the codepoint in the bitmap, and its folded
2666 * equivalent. */
fab2782b 2667 TRIE_BITMAP_SET(trie, uvc);
0921ee73
T
2668
2669 /* store the folded codepoint */
fab2782b 2670 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
0921ee73
T
2671
2672 if ( !UTF ) {
2673 /* store first byte of utf8 representation of
acdf4139 2674 variant codepoints */
6f2d5cbc 2675 if (! UVCHR_IS_INVARIANT(uvc)) {
acdf4139 2676 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
2677 }
2678 }
02daf0ab
YO
2679 set_bit = 0; /* We've done our bit :-) */
2680 }
a3621e74 2681 } else {
bc031a7d
KW
2682
2683 /* XXX We could come up with the list of code points that fold
2684 * to this using PL_utf8_foldclosures, except not for
2685 * multi-char folds, as there may be multiple combinations
2686 * there that could work, which needs to wait until runtime to
2687 * resolve (The comment about LIGATURE FFI above is such an
2688 * example */
2689
a3621e74 2690 SV** svpp;
55eed653
NC
2691 if ( !widecharmap )
2692 widecharmap = newHV();
a3621e74 2693
55eed653 2694 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
2695
2696 if ( !svpp )
e4584336 2697 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
2698
2699 if ( !SvTRUE( *svpp ) ) {
2700 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 2701 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
2702 }
2703 }
bc031a7d
KW
2704 } /* end loop through characters in this branch of the trie */
2705
2706 /* We take the min and max for this branch and combine to find the min
2707 * and max for all branches processed so far */
3dab1dad 2708 if( cur == first ) {
bc031a7d
KW
2709 trie->minlen = minchars;
2710 trie->maxlen = maxchars;
2711 } else if (minchars < trie->minlen) {
2712 trie->minlen = minchars;
2713 } else if (maxchars > trie->maxlen) {
2714 trie->maxlen = maxchars;
fab2782b 2715 }
a3621e74
YO
2716 } /* end first pass */
2717 DEBUG_TRIE_COMPILE_r(
6ad9a8ab 2718 Perl_re_indentf( aTHX_
cb41e5d6
YO
2719 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2720 depth+1,
55eed653 2721 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
2722 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2723 (int)trie->minlen, (int)trie->maxlen )
a3621e74 2724 );
a3621e74
YO
2725
2726 /*
2727 We now know what we are dealing with in terms of unique chars and
2728 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
2729 representation using a flat table will take. If it's over a reasonable
2730 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
2731 conservative but potentially much slower representation using an array
2732 of lists.
2733
2734 At the end we convert both representations into the same compressed
2735 form that will be used in regexec.c for matching with. The latter
2736 is a form that cannot be used to construct with but has memory
2737 properties similar to the list form and access properties similar
2738 to the table form making it both suitable for fast searches and
2739 small enough that its feasable to store for the duration of a program.
2740
2741 See the comment in the code where the compressed table is produced
2742 inplace from the flat tabe representation for an explanation of how
2743 the compression works.
2744
2745 */
2746
2747
2e64971a
DM
2748 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2749 prev_states[1] = 0;
2750
538e84ed
KW
2751 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2752 > SvIV(re_trie_maxbuff) )
2753 {
a3621e74
YO
2754 /*
2755 Second Pass -- Array Of Lists Representation
2756
2757 Each state will be represented by a list of charid:state records
2758 (reg_trie_trans_le) the first such element holds the CUR and LEN
2759 points of the allocated array. (See defines above).
2760
2761 We build the initial structure using the lists, and then convert
2762 it into the compressed table form which allows faster lookups
2763 (but cant be modified once converted).
a3621e74
YO
2764 */
2765
a3621e74
YO
2766 STRLEN transcount = 1;
2767
6ad9a8ab 2768 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
cb41e5d6 2769 depth+1));
686b73d4 2770
c944940b
JH
2771 trie->states = (reg_trie_state *)
2772 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2773 sizeof(reg_trie_state) );
a3621e74
YO
2774 TRIE_LIST_NEW(1);
2775 next_alloc = 2;
2776
2777 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2778
df826430 2779 regnode *noper = NEXTOPER( cur );
c445ea15
AL
2780 U32 state = 1; /* required init */
2781 U16 charid = 0; /* sanity init */
07be1b83 2782 U32 wordlen = 0; /* required init */
c445ea15 2783
df826430
YO
2784 if (OP(noper) == NOTHING) {
2785 regnode *noper_next= regnext(noper);
944e05e3
YO
2786 if (noper_next < tail)
2787 noper= noper_next;
df826430
YO
2788 }
2789
944e05e3
YO
2790 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2791 const U8 *uc= (U8*)STRING(noper);
2792 const U8 *e= uc + STR_LEN(noper);
2793
786e8c11 2794 for ( ; uc < e ; uc += len ) {
c445ea15 2795
786e8c11 2796 TRIE_READ_CHAR;
c445ea15 2797
786e8c11
YO
2798 if ( uvc < 256 ) {
2799 charid = trie->charmap[ uvc ];
c445ea15 2800 } else {
538e84ed
KW
2801 SV** const svpp = hv_fetch( widecharmap,
2802 (char*)&uvc,
2803 sizeof( UV ),
2804 0);
786e8c11
YO
2805 if ( !svpp ) {
2806 charid = 0;
2807 } else {
2808 charid=(U16)SvIV( *svpp );
2809 }
c445ea15 2810 }
538e84ed
KW
2811 /* charid is now 0 if we dont know the char read, or
2812 * nonzero if we do */
786e8c11 2813 if ( charid ) {
a3621e74 2814
786e8c11
YO
2815 U16 check;
2816 U32 newstate = 0;
a3621e74 2817
786e8c11
YO
2818 charid--;
2819 if ( !trie->states[ state ].trans.list ) {
2820 TRIE_LIST_NEW( state );
c445ea15 2821 }
538e84ed
KW
2822 for ( check = 1;
2823 check <= TRIE_LIST_USED( state );
2824 check++ )
2825 {
2826 if ( TRIE_LIST_ITEM( state, check ).forid
2827 == charid )
2828 {
786e8c11
YO
2829 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2830 break;
2831 }
2832 }
2833 if ( ! newstate ) {
2834 newstate = next_alloc++;
2e64971a 2835 prev_states[newstate] = state;
786e8c11
YO
2836 TRIE_LIST_PUSH( state, charid, newstate );
2837 transcount++;
2838 }
2839 state = newstate;
2840 } else {
2841 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 2842 }
a28509cc 2843 }
c445ea15 2844 }
3dab1dad 2845 TRIE_HANDLE_WORD(state);
a3621e74
YO
2846
2847 } /* end second pass */
2848
1e2e3d02 2849 /* next alloc is the NEXT state to be allocated */
538e84ed 2850 trie->statecount = next_alloc;
c944940b
JH
2851 trie->states = (reg_trie_state *)
2852 PerlMemShared_realloc( trie->states,
2853 next_alloc
2854 * sizeof(reg_trie_state) );
a3621e74 2855
3dab1dad 2856 /* and now dump it out before we compress it */
2b8b4781
NC
2857 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2858 revcharmap, next_alloc,
2859 depth+1)
1e2e3d02 2860 );
a3621e74 2861
c944940b
JH
2862 trie->trans = (reg_trie_trans *)
2863 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
2864 {
2865 U32 state;
a3621e74
YO
2866 U32 tp = 0;
2867 U32 zp = 0;
2868
2869
2870 for( state=1 ; state < next_alloc ; state ++ ) {
2871 U32 base=0;
2872
2873 /*
2874 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 2875 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
a3621e74
YO
2876 );
2877 */
2878
2879 if (trie->states[state].trans.list) {
2880 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2881 U16 maxid=minid;
a28509cc 2882 U16 idx;
a3621e74
YO
2883
2884 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
2885 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2886 if ( forid < minid ) {
2887 minid=forid;
2888 } else if ( forid > maxid ) {
2889 maxid=forid;
2890 }
a3621e74
YO
2891 }
2892 if ( transcount < tp + maxid - minid + 1) {
2893 transcount *= 2;
c944940b
JH
2894 trie->trans = (reg_trie_trans *)
2895 PerlMemShared_realloc( trie->trans,
446bd890
NC
2896 transcount
2897 * sizeof(reg_trie_trans) );
538e84ed
KW
2898 Zero( trie->trans + (transcount / 2),
2899 transcount / 2,
2900 reg_trie_trans );
a3621e74
YO
2901 }
2902 base = trie->uniquecharcount + tp - minid;
2903 if ( maxid == minid ) {
2904 U32 set = 0;
2905 for ( ; zp < tp ; zp++ ) {
2906 if ( ! trie->trans[ zp ].next ) {
2907 base = trie->uniquecharcount + zp - minid;
538e84ed
KW
2908 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2909 1).newstate;
a3621e74
YO
2910 trie->trans[ zp ].check = state;
2911 set = 1;
2912 break;
2913 }
2914 }
2915 if ( !set ) {
538e84ed
KW
2916 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2917 1).newstate;
a3621e74
YO
2918 trie->trans[ tp ].check = state;
2919 tp++;
2920 zp = tp;
2921 }
2922 } else {
2923 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
538e84ed
KW
2924 const U32 tid = base
2925 - trie->uniquecharcount
2926 + TRIE_LIST_ITEM( state, idx ).forid;
2927 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2928 idx ).newstate;
a3621e74
YO
2929 trie->trans[ tid ].check = state;
2930 }
2931 tp += ( maxid - minid + 1 );
2932 }
2933 Safefree(trie->states[ state ].trans.list);
2934 }
2935 /*
2936 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 2937 Perl_re_printf( aTHX_ " base: %d\n",base);
a3621e74
YO
2938 );
2939 */
2940 trie->states[ state ].trans.base=base;
2941 }
cc601c31 2942 trie->lasttrans = tp + 1;
a3621e74
YO
2943 }
2944 } else {
2945 /*
2946 Second Pass -- Flat Table Representation.
2947
b423522f
KW
2948 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2949 each. We know that we will need Charcount+1 trans at most to store
2950 the data (one row per char at worst case) So we preallocate both
2951 structures assuming worst case.
a3621e74
YO
2952
2953 We then construct the trie using only the .next slots of the entry
2954 structs.
2955
b423522f
KW
2956 We use the .check field of the first entry of the node temporarily
2957 to make compression both faster and easier by keeping track of how
2958 many non zero fields are in the node.
a3621e74
YO
2959
2960 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2961 transition.
2962
b423522f
KW
2963 There are two terms at use here: state as a TRIE_NODEIDX() which is
2964 a number representing the first entry of the node, and state as a
2965 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2966 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2967 if there are 2 entrys per node. eg:
a3621e74
YO
2968
2969 A B A B
2970 1. 2 4 1. 3 7
2971 2. 0 3 3. 0 5
2972 3. 0 0 5. 0 0
2973 4. 0 0 7. 0 0
2974
b423522f
KW
2975 The table is internally in the right hand, idx form. However as we
2976 also have to deal with the states array which is indexed by nodenum
2977 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
2978
2979 */
6ad9a8ab 2980 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
cb41e5d6 2981 depth+1));
3dab1dad 2982
c944940b
JH
2983 trie->trans = (reg_trie_trans *)
2984 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2985 * trie->uniquecharcount + 1,
2986 sizeof(reg_trie_trans) );
2987 trie->states = (reg_trie_state *)
2988 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2989 sizeof(reg_trie_state) );
a3621e74
YO
2990 next_alloc = trie->uniquecharcount + 1;
2991
3dab1dad 2992
a3621e74
YO
2993 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2994
df826430 2995 regnode *noper = NEXTOPER( cur );
a3621e74
YO
2996
2997 U32 state = 1; /* required init */
2998
2999 U16 charid = 0; /* sanity init */
3000 U32 accept_state = 0; /* sanity init */
a3621e74 3001
07be1b83 3002 U32 wordlen = 0; /* required init */
a3621e74 3003
df826430
YO
3004 if (OP(noper) == NOTHING) {
3005 regnode *noper_next= regnext(noper);
944e05e3
YO
3006 if (noper_next < tail)
3007 noper= noper_next;
df826430 3008 }
fab2782b 3009
944e05e3
YO
3010 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
3011 const U8 *uc= (U8*)STRING(noper);
3012 const U8 *e= uc + STR_LEN(noper);
3013
786e8c11 3014 for ( ; uc < e ; uc += len ) {
a3621e74 3015
786e8c11 3016 TRIE_READ_CHAR;
a3621e74 3017
786e8c11
YO
3018 if ( uvc < 256 ) {
3019 charid = trie->charmap[ uvc ];
3020 } else {
538e84ed
KW
3021 SV* const * const svpp = hv_fetch( widecharmap,
3022 (char*)&uvc,
3023 sizeof( UV ),
3024 0);
786e8c11 3025 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 3026 }
786e8c11
YO
3027 if ( charid ) {
3028 charid--;
3029 if ( !trie->trans[ state + charid ].next ) {
3030 trie->trans[ state + charid ].next = next_alloc;
3031 trie->trans[ state ].check++;
2e64971a
DM
3032 prev_states[TRIE_NODENUM(next_alloc)]
3033 = TRIE_NODENUM(state);
786e8c11
YO
3034 next_alloc += trie->uniquecharcount;
3035 }
3036 state = trie->trans[ state + charid ].next;
3037 } else {
3038 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
3039 }
538e84ed
KW
3040 /* charid is now 0 if we dont know the char read, or
3041 * nonzero if we do */
a3621e74 3042 }
a3621e74 3043 }
3dab1dad
YO
3044 accept_state = TRIE_NODENUM( state );
3045 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
3046
3047 } /* end second pass */
3048
3dab1dad 3049 /* and now dump it out before we compress it */
2b8b4781
NC
3050 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3051 revcharmap,
3052 next_alloc, depth+1));
a3621e74 3053
a3621e74
YO
3054 {
3055 /*
3056 * Inplace compress the table.*
3057
3058 For sparse data sets the table constructed by the trie algorithm will
3059 be mostly 0/FAIL transitions or to put it another way mostly empty.
3060 (Note that leaf nodes will not contain any transitions.)
3061
3062 This algorithm compresses the tables by eliminating most such
3063 transitions, at the cost of a modest bit of extra work during lookup:
3064
3065 - Each states[] entry contains a .base field which indicates the
3066 index in the state[] array wheres its transition data is stored.
3067
3b753521 3068 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
3069
3070 - If .base is nonzero then charid is added to it to find an entry in
3071 the trans array.
3072
3073 -If trans[states[state].base+charid].check!=state then the
3074 transition is taken to be a 0/Fail transition. Thus if there are fail
3075 transitions at the front of the node then the .base offset will point
3076 somewhere inside the previous nodes data (or maybe even into a node
3077 even earlier), but the .check field determines if the transition is
3078 valid.
3079
786e8c11 3080 XXX - wrong maybe?
a3621e74 3081 The following process inplace converts the table to the compressed
3b753521 3082 table: We first do not compress the root node 1,and mark all its
a3621e74 3083 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
3084 allows us to do a DFA construction from the compressed table later,
3085 and ensures that any .base pointers we calculate later are greater
3086 than 0.
a3621e74
YO
3087
3088 - We set 'pos' to indicate the first entry of the second node.
3089
3090 - We then iterate over the columns of the node, finding the first and
3091 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3092 and set the .check pointers accordingly, and advance pos
3093 appropriately and repreat for the next node. Note that when we copy
3094 the next pointers we have to convert them from the original
3095 NODEIDX form to NODENUM form as the former is not valid post
3096 compression.
3097
3098 - If a node has no transitions used we mark its base as 0 and do not
3099 advance the pos pointer.
3100
3101 - If a node only has one transition we use a second pointer into the
3102 structure to fill in allocated fail transitions from other states.
3103 This pointer is independent of the main pointer and scans forward
3104 looking for null transitions that are allocated to a state. When it
3105 finds one it writes the single transition into the "hole". If the
786e8c11 3106 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
3107
3108 - Once compressed we can Renew/realloc the structures to release the
3109 excess space.
3110
3111 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3112 specifically Fig 3.47 and the associated pseudocode.
3113
3114 demq
3115 */
a3b680e6 3116 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 3117 U32 state, charid;
a3621e74 3118 U32 pos = 0, zp=0;
1e2e3d02 3119 trie->statecount = laststate;
a3621e74
YO
3120
3121 for ( state = 1 ; state < laststate ; state++ ) {
3122 U8 flag = 0;
a28509cc
AL
3123 const U32 stateidx = TRIE_NODEIDX( state );
3124 const U32 o_used = trie->trans[ stateidx ].check;
3125 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
3126 trie->trans[ stateidx ].check = 0;
3127
538e84ed
KW
3128 for ( charid = 0;
3129 used && charid < trie->uniquecharcount;
3130 charid++ )
3131 {
a3621e74
YO
3132 if ( flag || trie->trans[ stateidx + charid ].next ) {
3133 if ( trie->trans[ stateidx + charid ].next ) {
3134 if (o_used == 1) {
3135 for ( ; zp < pos ; zp++ ) {
3136 if ( ! trie->trans[ zp ].next ) {
3137 break;
3138 }
3139 }
538e84ed
KW
3140 trie->states[ state ].trans.base
3141 = zp
3142 + trie->uniquecharcount
3143 - charid ;
3144 trie->trans[ zp ].next
3145 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3146 + charid ].next );
a3621e74
YO
3147 trie->trans[ zp ].check = state;
3148 if ( ++zp > pos ) pos = zp;
3149 break;
3150 }
3151 used--;
3152 }
3153 if ( !flag ) {
3154 flag = 1;
538e84ed
KW
3155 trie->states[ state ].trans.base
3156 = pos + trie->uniquecharcount - charid ;
a3621e74 3157 }
538e84ed
KW
3158 trie->trans[ pos ].next
3159 = SAFE_TRIE_NODENUM(
3160 trie->trans[ stateidx + charid ].next );
a3621e74
YO
3161 trie->trans[ pos ].check = state;
3162 pos++;
3163 }
3164 }
3165 }
cc601c31 3166 trie->lasttrans = pos + 1;
c944940b
JH
3167 trie->states = (reg_trie_state *)
3168 PerlMemShared_realloc( trie->states, laststate
3169 * sizeof(reg_trie_state) );
a3621e74 3170 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3171 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
cb41e5d6 3172 depth+1,
538e84ed
KW
3173 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3174 + 1 ),
3175 (IV)next_alloc,
3176 (IV)pos,
3177 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
a3621e74
YO
3178 );
3179
3180 } /* end table compress */
3181 }
1e2e3d02 3182 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3183 Perl_re_indentf( aTHX_ "Statecount:%"UVxf" Lasttrans:%"UVxf"\n",
cb41e5d6 3184 depth+1,
1e2e3d02
YO
3185 (UV)trie->statecount,
3186 (UV)trie->lasttrans)
3187 );
cc601c31 3188 /* resize the trans array to remove unused space */
c944940b
JH
3189 trie->trans = (reg_trie_trans *)
3190 PerlMemShared_realloc( trie->trans, trie->lasttrans
3191 * sizeof(reg_trie_trans) );
a3621e74 3192
538e84ed 3193 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
3194 U8 nodetype =(U8)(flags & 0xFF);
3195 char *str=NULL;
538e84ed 3196
07be1b83 3197#ifdef DEBUGGING
e62cc96a 3198 regnode *optimize = NULL;
7122b237
YO
3199#ifdef RE_TRACK_PATTERN_OFFSETS
3200
b57a0404
JH
3201 U32 mjd_offset = 0;
3202 U32 mjd_nodelen = 0;
7122b237
YO
3203#endif /* RE_TRACK_PATTERN_OFFSETS */
3204#endif /* DEBUGGING */
a3621e74 3205 /*
3dab1dad
YO
3206 This means we convert either the first branch or the first Exact,
3207 depending on whether the thing following (in 'last') is a branch
3208 or not and whther first is the startbranch (ie is it a sub part of
3209 the alternation or is it the whole thing.)
3b753521 3210 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 3211 the whole branch sequence, including the first.
a3621e74 3212 */
3dab1dad 3213 /* Find the node we are going to overwrite */
7f69552c 3214 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 3215 /* branch sub-chain */
3dab1dad 3216 NEXT_OFF( first ) = (U16)(last - first);
7122b237 3217#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
3218 DEBUG_r({
3219 mjd_offset= Node_Offset((convert));
3220 mjd_nodelen= Node_Length((convert));
3221 });
7122b237 3222#endif
7f69552c 3223 /* whole branch chain */
7122b237
YO
3224 }
3225#ifdef RE_TRACK_PATTERN_OFFSETS
3226 else {
7f69552c
YO
3227 DEBUG_r({
3228 const regnode *nop = NEXTOPER( convert );
3229 mjd_offset= Node_Offset((nop));
3230 mjd_nodelen= Node_Length((nop));
3231 });
07be1b83
YO
3232 }
3233 DEBUG_OPTIMISE_r(
6ad9a8ab 3234 Perl_re_indentf( aTHX_ "MJD offset:%"UVuf" MJD length:%"UVuf"\n",
cb41e5d6 3235 depth+1,
786e8c11 3236 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 3237 );
7122b237 3238#endif
538e84ed 3239 /* But first we check to see if there is a common prefix we can
3dab1dad
YO
3240 split out as an EXACT and put in front of the TRIE node. */
3241 trie->startstate= 1;
55eed653 3242 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 3243 U32 state;
1e2e3d02 3244 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 3245 U32 ofs = 0;
8e11feef
RGS
3246 I32 idx = -1;
3247 U32 count = 0;
3248 const U32 base = trie->states[ state ].trans.base;
a3621e74 3249
3dab1dad 3250 if ( trie->states[state].wordnum )
8e11feef 3251 count = 1;
a3621e74 3252
8e11feef 3253 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
3254 if ( ( base + ofs >= trie->uniquecharcount ) &&
3255 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
3256 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3257 {
3dab1dad 3258 if ( ++count > 1 ) {
2b8b4781 3259 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 3260 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 3261 if ( state == 1 ) break;
3dab1dad
YO
3262 if ( count == 2 ) {
3263 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3264 DEBUG_OPTIMISE_r(
6ad9a8ab 3265 Perl_re_indentf( aTHX_ "New Start State=%"UVuf" Class: [",
cb41e5d6 3266 depth+1,
786e8c11 3267 (UV)state));
be8e71aa 3268 if (idx >= 0) {
2b8b4781 3269 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 3270 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 3271
3dab1dad 3272 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
3273 if ( folder )
3274 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 3275 DEBUG_OPTIMISE_r(
6ad9a8ab 3276 Perl_re_printf( aTHX_ "%s", (char*)ch)
3dab1dad 3277 );
8e11feef
RGS
3278 }
3279 }
3280 TRIE_BITMAP_SET(trie,*ch);
3281 if ( folder )
3282 TRIE_BITMAP_SET(trie,folder[ *ch ]);
6ad9a8ab 3283 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
8e11feef
RGS
3284 }
3285 idx = ofs;
3286 }
3dab1dad
YO
3287 }
3288 if ( count == 1 ) {
2b8b4781 3289 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
3290 STRLEN len;
3291 char *ch = SvPV( *tmp, len );
de734bd5
A
3292 DEBUG_OPTIMISE_r({
3293 SV *sv=sv_newmortal();
6ad9a8ab 3294 Perl_re_indentf( aTHX_ "Prefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
cb41e5d6 3295 depth+1,
538e84ed
KW
3296 (UV)state, (UV)idx,
3297 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
de734bd5
A
3298 PL_colors[0], PL_colors[1],
3299 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed 3300 PERL_PV_ESCAPE_FIRSTCHAR
de734bd5
A
3301 )
3302 );
3303 });
3dab1dad
YO
3304 if ( state==1 ) {
3305 OP( convert ) = nodetype;
3306 str=STRING(convert);
3307 STR_LEN(convert)=0;
3308 }
c490c714
YO
3309 STR_LEN(convert) += len;
3310 while (len--)
de734bd5 3311 *str++ = *ch++;
8e11feef 3312 } else {
538e84ed 3313#ifdef DEBUGGING
8e11feef 3314 if (state>1)
6ad9a8ab 3315 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
f9049ba1 3316#endif
8e11feef
RGS
3317 break;
3318 }
3319 }
2e64971a 3320 trie->prefixlen = (state-1);
3dab1dad 3321 if (str) {
8e11feef 3322 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 3323 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 3324 trie->startstate = state;
07be1b83
YO
3325 trie->minlen -= (state - 1);
3326 trie->maxlen -= (state - 1);
33809eae
JH
3327#ifdef DEBUGGING
3328 /* At least the UNICOS C compiler choked on this
3329 * being argument to DEBUG_r(), so let's just have
3330 * it right here. */
3331 if (
3332#ifdef PERL_EXT_RE_BUILD
3333 1
3334#else
3335 DEBUG_r_TEST
3336#endif
3337 ) {
3338 regnode *fix = convert;
3339 U32 word = trie->wordcount;
3340 mjd_nodelen++;
3341 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3342 while( ++fix < n ) {
3343 Set_Node_Offset_Length(fix, 0, 0);
3344 }
3345 while (word--) {
3346 SV ** const tmp = av_fetch( trie_words, word, 0 );
3347 if (tmp) {
3348 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3349 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3350 else
3351 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3352 }
3353 }
3354 }
3355#endif
8e11feef
RGS
3356 if (trie->maxlen) {
3357 convert = n;
3358 } else {
3dab1dad 3359 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 3360 DEBUG_r(optimize= n);
3dab1dad
YO
3361 }
3362 }
3363 }
538e84ed
KW
3364 if (!jumper)
3365 jumper = last;
3dab1dad 3366 if ( trie->maxlen ) {
8e11feef
RGS
3367 NEXT_OFF( convert ) = (U16)(tail - convert);
3368 ARG_SET( convert, data_slot );
538e84ed
KW
3369 /* Store the offset to the first unabsorbed branch in
3370 jump[0], which is otherwise unused by the jump logic.
786e8c11 3371 We use this when dumping a trie and during optimisation. */
538e84ed 3372 if (trie->jump)
7f69552c 3373 trie->jump[0] = (U16)(nextbranch - convert);
538e84ed 3374
6c48061a
YO
3375 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3376 * and there is a bitmap
3377 * and the first "jump target" node we found leaves enough room
3378 * then convert the TRIE node into a TRIEC node, with the bitmap
3379 * embedded inline in the opcode - this is hypothetically faster.
3380 */
3381 if ( !trie->states[trie->startstate].wordnum
3382 && trie->bitmap
3383 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
3384 {
3385 OP( convert ) = TRIEC;
3386 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 3387 PerlMemShared_free(trie->bitmap);
786e8c11 3388 trie->bitmap= NULL;
538e84ed 3389 } else
786e8c11 3390 OP( convert ) = TRIE;
a3621e74 3391
3dab1dad
YO
3392 /* store the type in the flags */
3393 convert->flags = nodetype;
a5ca303d 3394 DEBUG_r({
538e84ed
KW
3395 optimize = convert
3396 + NODE_STEP_REGNODE
a5ca303d
YO
3397 + regarglen[ OP( convert ) ];
3398 });
538e84ed 3399 /* XXX We really should free up the resource in trie now,
a5ca303d 3400 as we won't use them - (which resources?) dmq */
3dab1dad 3401 }
a3621e74 3402 /* needed for dumping*/
e62cc96a 3403 DEBUG_r(if (optimize) {
07be1b83 3404 regnode *opt = convert;
bcdf7404 3405
e62cc96a 3406 while ( ++opt < optimize) {
07be1b83
YO
3407 Set_Node_Offset_Length(opt,0,0);
3408 }
538e84ed
KW
3409 /*
3410 Try to clean up some of the debris left after the
786e8c11 3411 optimisation.
a3621e74 3412 */
786e8c11 3413 while( optimize < jumper ) {
07be1b83 3414 mjd_nodelen += Node_Length((optimize));
a3621e74 3415 OP( optimize ) = OPTIMIZED;
07be1b83 3416 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
3417 optimize++;
3418 }
07be1b83 3419 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
3420 });
3421 } /* end node insert */
2e64971a
DM
3422
3423 /* Finish populating the prev field of the wordinfo array. Walk back
3424 * from each accept state until we find another accept state, and if
3425 * so, point the first word's .prev field at the second word. If the
3426 * second already has a .prev field set, stop now. This will be the
3427 * case either if we've already processed that word's accept state,
3b753521
FN
3428 * or that state had multiple words, and the overspill words were
3429 * already linked up earlier.
2e64971a
DM
3430 */
3431 {
3432 U16 word;
3433 U32 state;
3434 U16 prev;
3435
3436 for (word=1; word <= trie->wordcount; word++) {
3437 prev = 0;
3438 if (trie->wordinfo[word].prev)
3439 continue;
3440 state = trie->wordinfo[word].accept;
3441 while (state) {
3442 state = prev_states[state];
3443 if (!state)
3444 break;
3445 prev = trie->states[state].wordnum;
3446 if (prev)
3447 break;
3448 }
3449 trie->wordinfo[word].prev = prev;
3450 }
3451 Safefree(prev_states);
3452 }
3453
3454
3455 /* and now dump out the compressed format */
3456 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3457
55eed653 3458 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
3459#ifdef DEBUGGING
3460 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3461 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3462#else
03e70be4 3463 SvREFCNT_dec_NN(revcharmap);
07be1b83 3464#endif
538e84ed
KW
3465 return trie->jump
3466 ? MADE_JUMP_TRIE
3467 : trie->startstate>1
3468 ? MADE_EXACT_TRIE
786e8c11
YO
3469 : MADE_TRIE;
3470}
3471
615a2e7f
YO
3472STATIC regnode *
3473S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
786e8c11 3474{
b423522f
KW
3475/* The Trie is constructed and compressed now so we can build a fail array if
3476 * it's needed
786e8c11 3477
b423522f
KW
3478 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3479 3.32 in the
3480 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,