This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
make struct scan_data_t->longest an index val
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
afda64e8 84EXTERN_C const struct regexp_engine my_reg_engine;
54df2634
NC
85#else
86# include "regcomp.h"
87#endif
a687059c 88
f7e03a10 89#include "dquote_inline.h"
b992490d 90#include "invlist_inline.h"
1b0f46bf 91#include "unicode_constants.h"
04e98a4d 92
538e84ed
KW
93#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
94 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
f12c0118
KW
95#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
96 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 97#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 98#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 99
a687059c
LW
100#ifndef STATIC
101#define STATIC static
102#endif
103
3f910e62
YO
104/* this is a chain of data about sub patterns we are processing that
105 need to be handled separately/specially in study_chunk. Its so
106 we can simulate recursion without losing state. */
107struct scan_frame;
108typedef struct scan_frame {
109 regnode *last_regnode; /* last node to process in this frame */
110 regnode *next_regnode; /* next node to process when last is reached */
111 U32 prev_recursed_depth;
112 I32 stopparen; /* what stopparen do we use */
113 U32 is_top_frame; /* what flags do we use? */
114
115 struct scan_frame *this_prev_frame; /* this previous frame */
116 struct scan_frame *prev_frame; /* previous frame */
117 struct scan_frame *next_frame; /* next frame */
118} scan_frame;
119
f4ae5a27
KW
120/* Certain characters are output as a sequence with the first being a
121 * backslash. */
eb7486de 122#define isBACKSLASHED_PUNCT(c) strchr("-[]\\^", c)
f4ae5a27
KW
123
124
09b2b2e6 125struct RExC_state_t {
514a91f1
DM
126 U32 flags; /* RXf_* are we folding, multilining? */
127 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 128 char *precomp; /* uncompiled string. */
711b303b 129 char *precomp_end; /* pointer to end of uncompiled string. */
288b8c02 130 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf 131 regexp *rx; /* perl core regexp structure */
538e84ed
KW
132 regexp_internal *rxi; /* internal data for regexp object
133 pprivate field */
fac92740 134 char *start; /* Start of input for compile */
830247a4
IZ
135 char *end; /* End of input for compile */
136 char *parse; /* Input-scan pointer. */
285b5ca0
KW
137 char *adjusted_start; /* 'start', adjusted. See code use */
138 STRLEN precomp_adj; /* an offset beyond precomp. See code use */
ea3daa5d 139 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 140 regnode *emit_start; /* Start of emitted-code area */
538e84ed
KW
141 regnode *emit_bound; /* First regnode outside of the
142 allocated space */
f7c7e32a
DM
143 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
144 implies compiling, so don't emit */
9a81a976
KW
145 regnode_ssc emit_dummy; /* placeholder for emit to point to;
146 large enough for the largest
147 non-EXACTish node, so can use it as
148 scratch in pass1 */
830247a4
IZ
149 I32 naughty; /* How bad is this pattern? */
150 I32 sawback; /* Did we see \1, ...? */
151 U32 seen;
ea3daa5d 152 SSize_t size; /* Code size. */
538e84ed
KW
153 I32 npar; /* Capture buffer count, (OPEN) plus
154 one. ("par" 0 is the whole
155 pattern)*/
156 I32 nestroot; /* root parens we are in - used by
157 accept */
830247a4
IZ
158 I32 extralen;
159 I32 seen_zerolen;
40d049e4
YO
160 regnode **open_parens; /* pointers to open parens */
161 regnode **close_parens; /* pointers to close parens */
d5a00e4a 162 regnode *end_op; /* END node in program */
02daf0ab
YO
163 I32 utf8; /* whether the pattern is utf8 or not */
164 I32 orig_utf8; /* whether the pattern was originally in utf8 */
165 /* XXX use this for future optimisation of case
166 * where pattern must be upgraded to utf8. */
e40e74fe
KW
167 I32 uni_semantics; /* If a d charset modifier should use unicode
168 rules, even if the pattern is not in
169 utf8 */
81714fb9 170 HV *paren_names; /* Paren names */
538e84ed 171
40d049e4 172 regnode **recurse; /* Recurse regops */
d5a00e4a 173 I32 recurse_count; /* Number of recurse regops we have generated */
4286711a 174 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
538e84ed 175 through */
09a65838 176 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 177 I32 in_lookbehind;
4624b182 178 I32 contains_locale;
bb3f3ed2 179 I32 override_recoding;
b6d67071
KW
180#ifdef EBCDIC
181 I32 recode_x_to_native;
182#endif
9d53c457 183 I32 in_multi_char_class;
1acab4c5 184 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
68e2671b 185 within pattern */
b1603ef8 186 int code_index; /* next code_blocks[] slot */
ee273784 187 SSize_t maxlen; /* mininum possible number of chars in string to match */
3f910e62
YO
188 scan_frame *frame_head;
189 scan_frame *frame_last;
190 U32 frame_count;
7eec73eb 191 AV *warn_text;
dc6d7f5c 192#ifdef ADD_TO_REGEXEC
830247a4
IZ
193 char *starttry; /* -Dr: where regtry was called. */
194#define RExC_starttry (pRExC_state->starttry)
195#endif
d24ca0c5 196 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 197#ifdef DEBUGGING
be8e71aa 198 const char *lastparse;
3dab1dad 199 I32 lastnum;
1f1031fe 200 AV *paren_name_list; /* idx -> name */
d9a72fcc 201 U32 study_chunk_recursed_count;
c9f0d54c
YO
202 SV *mysv1;
203 SV *mysv2;
3dab1dad
YO
204#define RExC_lastparse (pRExC_state->lastparse)
205#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 206#define RExC_paren_name_list (pRExC_state->paren_name_list)
d9a72fcc 207#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
c9f0d54c
YO
208#define RExC_mysv (pRExC_state->mysv1)
209#define RExC_mysv1 (pRExC_state->mysv1)
210#define RExC_mysv2 (pRExC_state->mysv2)
211
3dab1dad 212#endif
512c0f5a 213 bool seen_unfolded_sharp_s;
911bd04e 214 bool strict;
da7cf1cc 215 bool study_started;
09b2b2e6 216};
830247a4 217
e2509266 218#define RExC_flags (pRExC_state->flags)
514a91f1 219#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 220#define RExC_precomp (pRExC_state->precomp)
285b5ca0
KW
221#define RExC_precomp_adj (pRExC_state->precomp_adj)
222#define RExC_adjusted_start (pRExC_state->adjusted_start)
711b303b 223#define RExC_precomp_end (pRExC_state->precomp_end)
288b8c02 224#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 225#define RExC_rx (pRExC_state->rx)
f8fc2ecf 226#define RExC_rxi (pRExC_state->rxi)
fac92740 227#define RExC_start (pRExC_state->start)
830247a4
IZ
228#define RExC_end (pRExC_state->end)
229#define RExC_parse (pRExC_state->parse)
230#define RExC_whilem_seen (pRExC_state->whilem_seen)
512c0f5a
KW
231
232/* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
233 * EXACTF node, hence was parsed under /di rules. If later in the parse,
234 * something forces the pattern into using /ui rules, the sharp s should be
235 * folded into the sequence 'ss', which takes up more space than previously
236 * calculated. This means that the sizing pass needs to be restarted. (The
237 * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node
238 * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
239 * so there is no need to resize [perl #125990]. */
240#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
241
7122b237 242#ifdef RE_TRACK_PATTERN_OFFSETS
538e84ed
KW
243#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
244 others */
7122b237 245#endif
830247a4 246#define RExC_emit (pRExC_state->emit)
f7c7e32a 247#define RExC_emit_dummy (pRExC_state->emit_dummy)
fac92740 248#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 249#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
250#define RExC_sawback (pRExC_state->sawback)
251#define RExC_seen (pRExC_state->seen)
252#define RExC_size (pRExC_state->size)
ee273784 253#define RExC_maxlen (pRExC_state->maxlen)
830247a4 254#define RExC_npar (pRExC_state->npar)
e2e6a0f1 255#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
256#define RExC_extralen (pRExC_state->extralen)
257#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 258#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 259#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 260#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
261#define RExC_open_parens (pRExC_state->open_parens)
262#define RExC_close_parens (pRExC_state->close_parens)
5bd2d46e 263#define RExC_end_op (pRExC_state->end_op)
81714fb9 264#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
265#define RExC_recurse (pRExC_state->recurse)
266#define RExC_recurse_count (pRExC_state->recurse_count)
09a65838 267#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
538e84ed
KW
268#define RExC_study_chunk_recursed_bytes \
269 (pRExC_state->study_chunk_recursed_bytes)
b57e4118 270#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 271#define RExC_contains_locale (pRExC_state->contains_locale)
b6d67071
KW
272#ifdef EBCDIC
273# define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
274#endif
9d53c457 275#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
3f910e62
YO
276#define RExC_frame_head (pRExC_state->frame_head)
277#define RExC_frame_last (pRExC_state->frame_last)
278#define RExC_frame_count (pRExC_state->frame_count)
67cdf558 279#define RExC_strict (pRExC_state->strict)
da7cf1cc 280#define RExC_study_started (pRExC_state->study_started)
7eec73eb 281#define RExC_warn_text (pRExC_state->warn_text)
830247a4 282
99807a43
HS
283/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
284 * a flag to disable back-off on the fixed/floating substrings - if it's
285 * a high complexity pattern we assume the benefit of avoiding a full match
286 * is worth the cost of checking for the substrings even if they rarely help.
287 */
288#define RExC_naughty (pRExC_state->naughty)
289#define TOO_NAUGHTY (10)
290#define MARK_NAUGHTY(add) \
291 if (RExC_naughty < TOO_NAUGHTY) \
292 RExC_naughty += (add)
293#define MARK_NAUGHTY_EXP(exp, add) \
294 if (RExC_naughty < TOO_NAUGHTY) \
295 RExC_naughty += RExC_naughty / (exp) + (add)
cde0cee5 296
a687059c
LW
297#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
298#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
412f55bb 299 ((*s) == '{' && regcurly(s)))
a687059c
LW
300
301/*
302 * Flags to be passed up and down.
303 */
a687059c 304#define WORST 0 /* Worst case. */
a3b492c3 305#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee 306
e64f369d 307/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
308 * character. (There needs to be a case: in the switch statement in regexec.c
309 * for any node marked SIMPLE.) Note that this is not the same thing as
310 * REGNODE_SIMPLE */
fda99bee 311#define SIMPLE 0x02
e64f369d 312#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
313#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
314#define TRYAGAIN 0x10 /* Weeded out a declaration. */
b97943f7
KW
315#define RESTART_PASS1 0x20 /* Need to restart sizing pass */
316#define NEED_UTF8 0x40 /* In conjunction with RESTART_PASS1, need to
317 calcuate sizes as UTF-8 */
a687059c 318
3dab1dad
YO
319#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
320
07be1b83
YO
321/* whether trie related optimizations are enabled */
322#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
323#define TRIE_STUDY_OPT
786e8c11 324#define FULL_TRIE_STUDY
07be1b83
YO
325#define TRIE_STCLASS
326#endif
1de06328
YO
327
328
40d049e4
YO
329
330#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
331#define PBITVAL(paren) (1 << ((paren) & 7))
332#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
333#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
334#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
335
82a6ada4 336#define REQUIRE_UTF8(flagp) STMT_START { \
8d9c2815 337 if (!UTF) { \
82a6ada4
KW
338 assert(PASS1); \
339 *flagp = RESTART_PASS1|NEED_UTF8; \
8d9c2815
NC
340 return NULL; \
341 } \
82a6ada4 342 } STMT_END
40d049e4 343
512c0f5a
KW
344/* Change from /d into /u rules, and restart the parse if we've already seen
345 * something whose size would increase as a result, by setting *flagp and
346 * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates
347 * we've change to /u during the parse. */
348#define REQUIRE_UNI_RULES(flagp, restart_retval) \
349 STMT_START { \
350 if (DEPENDS_SEMANTICS) { \
351 assert(PASS1); \
352 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
353 RExC_uni_semantics = 1; \
354 if (RExC_seen_unfolded_sharp_s) { \
355 *flagp |= RESTART_PASS1; \
356 return restart_retval; \
357 } \
358 } \
359 } STMT_END
360
f19b1a63
KW
361/* This converts the named class defined in regcomp.h to its equivalent class
362 * number defined in handy.h. */
363#define namedclass_to_classnum(class) ((int) ((class) / 2))
364#define classnum_to_namedclass(classnum) ((classnum) * 2)
365
de92f5e6
KW
366#define _invlist_union_complement_2nd(a, b, output) \
367 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
368#define _invlist_intersection_complement_2nd(a, b, output) \
369 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
370
1de06328
YO
371/* About scan_data_t.
372
373 During optimisation we recurse through the regexp program performing
374 various inplace (keyhole style) optimisations. In addition study_chunk
375 and scan_commit populate this data structure with information about
538e84ed 376 what strings MUST appear in the pattern. We look for the longest
3b753521 377 string that must appear at a fixed location, and we look for the
1de06328
YO
378 longest string that may appear at a floating location. So for instance
379 in the pattern:
538e84ed 380
1de06328 381 /FOO[xX]A.*B[xX]BAR/
538e84ed 382
1de06328
YO
383 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
384 strings (because they follow a .* construct). study_chunk will identify
385 both FOO and BAR as being the longest fixed and floating strings respectively.
538e84ed 386
1de06328 387 The strings can be composites, for instance
538e84ed 388
1de06328 389 /(f)(o)(o)/
538e84ed 390
1de06328 391 will result in a composite fixed substring 'foo'.
538e84ed 392
1de06328 393 For each string some basic information is maintained:
538e84ed 394
6ea77169 395 - min_offset
1de06328
YO
396 This is the position the string must appear at, or not before.
397 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
398 characters must match before the string we are searching for.
399 Likewise when combined with minlenp and the length of the string it
538e84ed 400 tells us how many characters must appear after the string we have
1de06328 401 found.
538e84ed 402
1de06328
YO
403 - max_offset
404 Only used for floating strings. This is the rightmost point that
ea3daa5d 405 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328 406 string can occur infinitely far to the right.
538e84ed 407
1de06328 408 - minlenp
2d608413
KW
409 A pointer to the minimum number of characters of the pattern that the
410 string was found inside. This is important as in the case of positive
538e84ed 411 lookahead or positive lookbehind we can have multiple patterns
1de06328 412 involved. Consider
538e84ed 413
1de06328 414 /(?=FOO).*F/
538e84ed 415
1de06328
YO
416 The minimum length of the pattern overall is 3, the minimum length
417 of the lookahead part is 3, but the minimum length of the part that
538e84ed 418 will actually match is 1. So 'FOO's minimum length is 3, but the
1de06328 419 minimum length for the F is 1. This is important as the minimum length
538e84ed 420 is used to determine offsets in front of and behind the string being
1de06328 421 looked for. Since strings can be composites this is the length of the
486ec47a 422 pattern at the time it was committed with a scan_commit. Note that
1de06328 423 the length is calculated by study_chunk, so that the minimum lengths
538e84ed 424 are not known until the full pattern has been compiled, thus the
1de06328 425 pointer to the value.
538e84ed 426
1de06328 427 - lookbehind
538e84ed 428
1de06328 429 In the case of lookbehind the string being searched for can be
538e84ed 430 offset past the start point of the final matching string.
1de06328
YO
431 If this value was just blithely removed from the min_offset it would
432 invalidate some of the calculations for how many chars must match
433 before or after (as they are derived from min_offset and minlen and
538e84ed 434 the length of the string being searched for).
1de06328
YO
435 When the final pattern is compiled and the data is moved from the
436 scan_data_t structure into the regexp structure the information
538e84ed
KW
437 about lookbehind is factored in, with the information that would
438 have been lost precalculated in the end_shift field for the
1de06328
YO
439 associated string.
440
441 The fields pos_min and pos_delta are used to store the minimum offset
538e84ed 442 and the delta to the maximum offset at the current point in the pattern.
1de06328
YO
443
444*/
2c2d71f5 445
cb6aef8b
DM
446struct scan_data_substrs {
447 SV *str; /* longest substring found in pattern */
448 SSize_t min_offset; /* earliest point in string it can appear */
449 SSize_t max_offset; /* latest point in string it can appear */
450 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
451 SSize_t lookbehind; /* is the pos of the string modified by LB */
452};
453
2c2d71f5 454typedef struct scan_data_t {
1de06328
YO
455 /*I32 len_min; unused */
456 /*I32 len_delta; unused */
49f55535 457 SSize_t pos_min;
ea3daa5d 458 SSize_t pos_delta;
2c2d71f5 459 SV *last_found;
ea3daa5d 460 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 461 SSize_t last_start_min;
ea3daa5d 462 SSize_t last_start_max;
a2ca2017 463 U8 longest; /* 0 = fixed is longest, 1 = float is longest */
6ea77169 464
a2ca2017 465 /* [0] is longest fixed substring so far, [1] is longest float so far */
cb6aef8b 466 struct scan_data_substrs substrs[2];
6ea77169 467
2c2d71f5
JH
468 I32 flags;
469 I32 whilem_c;
49f55535 470 SSize_t *last_closep;
b8f7bb16 471 regnode_ssc *start_class;
2c2d71f5
JH
472} scan_data_t;
473
a687059c 474/*
e50aee73 475 * Forward declarations for pregcomp()'s friends.
a687059c 476 */
a0d0e21e 477
6ea77169 478static const scan_data_t zero_scan_data = {
a2ca2017 479 0, 0, NULL, 0, 0, 0, 0,
6ea77169
DM
480 {
481 { NULL, 0, 0, 0, 0 },
482 { NULL, 0, 0, 0, 0 },
483 },
484 0, 0, NULL, NULL
485};
c277df42
IZ
486
487#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
488#define SF_BEFORE_SEOL 0x0001
489#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
490#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
491#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
492
44e3dfd2
BF
493#define SF_FIX_SHIFT_EOL (+2)
494#define SF_FL_SHIFT_EOL (+4)
c277df42
IZ
495
496#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
497#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
498
499#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
500#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
501#define SF_IS_INF 0x0040
502#define SF_HAS_PAR 0x0080
503#define SF_IN_PAR 0x0100
504#define SF_HAS_EVAL 0x0200
bc604ad8
KW
505
506
507/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
508 * longest substring in the pattern. When it is not set the optimiser keeps
509 * track of position, but does not keep track of the actual strings seen,
510 *
511 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
512 * /foo/i will not.
513 *
514 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
515 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
516 * turned off because of the alternation (BRANCH). */
07be1b83 517#define SCF_DO_SUBSTR 0x0400
bc604ad8 518
653099ff
GS
519#define SCF_DO_STCLASS_AND 0x0800
520#define SCF_DO_STCLASS_OR 0x1000
521#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 522#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 523
786e8c11 524#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
538e84ed 525#define SCF_SEEN_ACCEPT 0x8000
688e0391 526#define SCF_TRIE_DOING_RESTUDY 0x10000
4286711a
YO
527#define SCF_IN_DEFINE 0x20000
528
529
530
07be1b83 531
43fead97 532#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
533
534/* The enums for all these are ordered so things work out correctly */
a62b1201 535#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
538e84ed
KW
536#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
537 == REGEX_DEPENDS_CHARSET)
00b27cfc 538#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
538e84ed
KW
539#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
540 >= REGEX_UNICODE_CHARSET)
541#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
542 == REGEX_ASCII_RESTRICTED_CHARSET)
543#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
544 >= REGEX_ASCII_RESTRICTED_CHARSET)
545#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
546 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 547
43fead97 548#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 549
f2c2a6ab
KW
550/* For programs that want to be strictly Unicode compatible by dying if any
551 * attempt is made to match a non-Unicode code point against a Unicode
552 * property. */
553#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
554
93733859 555#define OOB_NAMEDCLASS -1
b8c5462f 556
8e661ac5
KW
557/* There is no code point that is out-of-bounds, so this is problematic. But
558 * its only current use is to initialize a variable that is always set before
559 * looked at. */
560#define OOB_UNICODE 0xDEADBEEF
561
a0ed51b3 562#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
a0ed51b3 563
8615cb43 564
b45f050a
JF
565/* length of regex to show in messages that don't mark a position within */
566#define RegexLengthToShowInErrorMessages 127
567
568/*
569 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
570 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
571 * op/pragma/warn/regcomp.
572 */
7253e4e3
RK
573#define MARKER1 "<-- HERE" /* marker as it appears in the description */
574#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 575
538e84ed 576#define REPORT_LOCATION " in regex; marked by " MARKER1 \
147e3846 577 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
b45f050a 578
285b5ca0
KW
579/* The code in this file in places uses one level of recursion with parsing
580 * rebased to an alternate string constructed by us in memory. This can take
581 * the form of something that is completely different from the input, or
582 * something that uses the input as part of the alternate. In the first case,
583 * there should be no possibility of an error, as we are in complete control of
584 * the alternate string. But in the second case we don't control the input
585 * portion, so there may be errors in that. Here's an example:
586 * /[abc\x{DF}def]/ui
587 * is handled specially because \x{df} folds to a sequence of more than one
588 * character, 'ss'. What is done is to create and parse an alternate string,
589 * which looks like this:
590 * /(?:\x{DF}|[abc\x{DF}def])/ui
591 * where it uses the input unchanged in the middle of something it constructs,
592 * which is a branch for the DF outside the character class, and clustering
593 * parens around the whole thing. (It knows enough to skip the DF inside the
594 * class while in this substitute parse.) 'abc' and 'def' may have errors that
595 * need to be reported. The general situation looks like this:
596 *
597 * sI tI xI eI
598 * Input: ----------------------------------------------------
599 * Constructed: ---------------------------------------------------
600 * sC tC xC eC EC
601 *
602 * The input string sI..eI is the input pattern. The string sC..EC is the
603 * constructed substitute parse string. The portions sC..tC and eC..EC are
604 * constructed by us. The portion tC..eC is an exact duplicate of the input
605 * pattern tI..eI. In the diagram, these are vertically aligned. Suppose that
606 * while parsing, we find an error at xC. We want to display a message showing
607 * the real input string. Thus we need to find the point xI in it which
608 * corresponds to xC. xC >= tC, since the portion of the string sC..tC has
609 * been constructed by us, and so shouldn't have errors. We get:
610 *
611 * xI = sI + (tI - sI) + (xC - tC)
612 *
613 * and, the offset into sI is:
614 *
615 * (xI - sI) = (tI - sI) + (xC - tC)
616 *
617 * When the substitute is constructed, we save (tI -sI) as RExC_precomp_adj,
618 * and we save tC as RExC_adjusted_start.
903c858a
KW
619 *
620 * During normal processing of the input pattern, everything points to that,
621 * with RExC_precomp_adj set to 0, and RExC_adjusted_start set to sI.
285b5ca0
KW
622 */
623
624#define tI_sI RExC_precomp_adj
625#define tC RExC_adjusted_start
626#define sC RExC_precomp
627#define xI_offset(xC) ((IV) (tI_sI + (xC - tC)))
628#define xI(xC) (sC + xI_offset(xC))
629#define eC RExC_precomp_end
630
631#define REPORT_LOCATION_ARGS(xC) \
632 UTF8fARG(UTF, \
633 (xI(xC) > eC) /* Don't run off end */ \
634 ? eC - sC /* Length before the <--HERE */ \
635 : xI_offset(xC), \
636 sC), /* The input pattern printed up to the <--HERE */ \
637 UTF8fARG(UTF, \
638 (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */ \
639 (xI(xC) > eC) ? eC : xI(xC)) /* pattern after <--HERE */
c1d900c3 640
8a6d8ec6
HS
641/* Used to point after bad bytes for an error message, but avoid skipping
642 * past a nul byte. */
643#define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
644
b45f050a
JF
645/*
646 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
647 * arg. Show regex, up to a maximum length. If it's too long, chop and add
648 * "...".
649 */
58e23c8d 650#define _FAIL(code) STMT_START { \
bfed75c6 651 const char *ellipses = ""; \
711b303b 652 IV len = RExC_precomp_end - RExC_precomp; \
ccb2c380
MP
653 \
654 if (!SIZE_ONLY) \
a5e7bc51 655 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
656 if (len > RegexLengthToShowInErrorMessages) { \
657 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
658 len = RegexLengthToShowInErrorMessages - 10; \
659 ellipses = "..."; \
660 } \
58e23c8d 661 code; \
ccb2c380 662} STMT_END
8615cb43 663
58e23c8d 664#define FAIL(msg) _FAIL( \
147e3846 665 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
c1d900c3 666 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
667
668#define FAIL2(msg,arg) _FAIL( \
147e3846 669 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
c1d900c3 670 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 671
b45f050a 672/*
b45f050a
JF
673 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
674 */
ccb2c380 675#define Simple_vFAIL(m) STMT_START { \
ccb2c380 676 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
d528642a 677 m, REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 678} STMT_END
b45f050a
JF
679
680/*
681 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
682 */
ccb2c380
MP
683#define vFAIL(m) STMT_START { \
684 if (!SIZE_ONLY) \
a5e7bc51 685 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
686 Simple_vFAIL(m); \
687} STMT_END
b45f050a
JF
688
689/*
690 * Like Simple_vFAIL(), but accepts two arguments.
691 */
ccb2c380 692#define Simple_vFAIL2(m,a1) STMT_START { \
d528642a
KW
693 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
694 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 695} STMT_END
b45f050a
JF
696
697/*
698 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
699 */
ccb2c380
MP
700#define vFAIL2(m,a1) STMT_START { \
701 if (!SIZE_ONLY) \
a5e7bc51 702 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
703 Simple_vFAIL2(m, a1); \
704} STMT_END
b45f050a
JF
705
706
707/*
708 * Like Simple_vFAIL(), but accepts three arguments.
709 */
ccb2c380 710#define Simple_vFAIL3(m, a1, a2) STMT_START { \
c1d900c3 711 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 712 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 713} STMT_END
b45f050a
JF
714
715/*
716 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
717 */
ccb2c380
MP
718#define vFAIL3(m,a1,a2) STMT_START { \
719 if (!SIZE_ONLY) \
a5e7bc51 720 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
721 Simple_vFAIL3(m, a1, a2); \
722} STMT_END
b45f050a
JF
723
724/*
725 * Like Simple_vFAIL(), but accepts four arguments.
726 */
ccb2c380 727#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
d528642a
KW
728 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
729 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 730} STMT_END
b45f050a 731
95db3ffa
KW
732#define vFAIL4(m,a1,a2,a3) STMT_START { \
733 if (!SIZE_ONLY) \
734 SAVEFREESV(RExC_rx_sv); \
735 Simple_vFAIL4(m, a1, a2, a3); \
736} STMT_END
737
946095af 738/* A specialized version of vFAIL2 that works with UTF8f */
d528642a
KW
739#define vFAIL2utf8f(m, a1) STMT_START { \
740 if (!SIZE_ONLY) \
741 SAVEFREESV(RExC_rx_sv); \
742 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
743 REPORT_LOCATION_ARGS(RExC_parse)); \
946095af
BF
744} STMT_END
745
3ba22297 746#define vFAIL3utf8f(m, a1, a2) STMT_START { \
3ba22297
KW
747 if (!SIZE_ONLY) \
748 SAVEFREESV(RExC_rx_sv); \
749 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 750 REPORT_LOCATION_ARGS(RExC_parse)); \
3ba22297
KW
751} STMT_END
752
499333dc
KW
753/* These have asserts in them because of [perl #122671] Many warnings in
754 * regcomp.c can occur twice. If they get output in pass1 and later in that
755 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
756 * would get output again. So they should be output in pass2, and these
757 * asserts make sure new warnings follow that paradigm. */
946095af 758
5e0a247b
KW
759/* m is not necessarily a "literal string", in this macro */
760#define reg_warn_non_literal_string(loc, m) STMT_START { \
d528642a
KW
761 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
762 "%s" REPORT_LOCATION, \
763 m, REPORT_LOCATION_ARGS(loc)); \
5e0a247b
KW
764} STMT_END
765
668c081a 766#define ckWARNreg(loc,m) STMT_START { \
d528642a
KW
767 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
768 m REPORT_LOCATION, \
769 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
770} STMT_END
771
b927b7e9 772#define vWARN(loc, m) STMT_START { \
d528642a
KW
773 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
774 m REPORT_LOCATION, \
775 REPORT_LOCATION_ARGS(loc)); \
b927b7e9
KW
776} STMT_END
777
0d6106aa 778#define vWARN_dep(loc, m) STMT_START { \
d528642a
KW
779 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
780 m REPORT_LOCATION, \
781 REPORT_LOCATION_ARGS(loc)); \
0d6106aa
KW
782} STMT_END
783
147508a2 784#define ckWARNdep(loc,m) STMT_START { \
d528642a
KW
785 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
786 m REPORT_LOCATION, \
787 REPORT_LOCATION_ARGS(loc)); \
147508a2
KW
788} STMT_END
789
d528642a
KW
790#define ckWARNregdep(loc,m) STMT_START { \
791 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
792 WARN_REGEXP), \
793 m REPORT_LOCATION, \
794 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
795} STMT_END
796
d528642a
KW
797#define ckWARN2reg_d(loc,m, a1) STMT_START { \
798 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
799 m REPORT_LOCATION, \
800 a1, REPORT_LOCATION_ARGS(loc)); \
2335b3d3
KW
801} STMT_END
802
d528642a
KW
803#define ckWARN2reg(loc, m, a1) STMT_START { \
804 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
805 m REPORT_LOCATION, \
806 a1, REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
807} STMT_END
808
d528642a
KW
809#define vWARN3(loc, m, a1, a2) STMT_START { \
810 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
811 m REPORT_LOCATION, \
812 a1, a2, REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
813} STMT_END
814
d528642a
KW
815#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
816 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
817 m REPORT_LOCATION, \
818 a1, a2, \
819 REPORT_LOCATION_ARGS(loc)); \
668c081a
NC
820} STMT_END
821
ccb2c380 822#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
d528642a
KW
823 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
824 m REPORT_LOCATION, \
825 a1, a2, a3, \
826 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
827} STMT_END
828
668c081a 829#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
d528642a
KW
830 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
831 m REPORT_LOCATION, \
832 a1, a2, a3, \
833 REPORT_LOCATION_ARGS(loc)); \
668c081a
NC
834} STMT_END
835
ccb2c380 836#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
d528642a
KW
837 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
838 m REPORT_LOCATION, \
839 a1, a2, a3, a4, \
840 REPORT_LOCATION_ARGS(loc)); \
ccb2c380 841} STMT_END
9d1d55b5 842
538e84ed 843/* Macros for recording node offsets. 20001227 mjd@plover.com
fac92740
MJD
844 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
845 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
846 * Element 0 holds the number n.
07be1b83 847 * Position is 1 indexed.
fac92740 848 */
7122b237
YO
849#ifndef RE_TRACK_PATTERN_OFFSETS
850#define Set_Node_Offset_To_R(node,byte)
851#define Set_Node_Offset(node,byte)
852#define Set_Cur_Node_Offset
853#define Set_Node_Length_To_R(node,len)
854#define Set_Node_Length(node,len)
6a86c6ad 855#define Set_Node_Cur_Length(node,start)
538e84ed
KW
856#define Node_Offset(n)
857#define Node_Length(n)
7122b237
YO
858#define Set_Node_Offset_Length(node,offset,len)
859#define ProgLen(ri) ri->u.proglen
860#define SetProgLen(ri,x) ri->u.proglen = x
861#else
862#define ProgLen(ri) ri->u.offsets[0]
863#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
864#define Set_Node_Offset_To_R(node,byte) STMT_START { \
865 if (! SIZE_ONLY) { \
866 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 867 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 868 if((node) < 0) { \
538e84ed
KW
869 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
870 (int)(node)); \
ccb2c380
MP
871 } else { \
872 RExC_offsets[2*(node)-1] = (byte); \
873 } \
874 } \
875} STMT_END
876
877#define Set_Node_Offset(node,byte) \
878 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
879#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
880
881#define Set_Node_Length_To_R(node,len) STMT_START { \
882 if (! SIZE_ONLY) { \
883 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 884 __LINE__, (int)(node), (int)(len))); \
ccb2c380 885 if((node) < 0) { \
538e84ed
KW
886 Perl_croak(aTHX_ "value of node is %d in Length macro", \
887 (int)(node)); \
ccb2c380
MP
888 } else { \
889 RExC_offsets[2*(node)] = (len); \
890 } \
891 } \
892} STMT_END
893
894#define Set_Node_Length(node,len) \
895 Set_Node_Length_To_R((node)-RExC_emit_start, len)
6a86c6ad
NC
896#define Set_Node_Cur_Length(node, start) \
897 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
898
899/* Get offsets and lengths */
900#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
901#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
902
07be1b83
YO
903#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
904 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
905 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
906} STMT_END
7122b237 907#endif
07be1b83
YO
908
909#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
910#define EXPERIMENTAL_INPLACESCAN
f427392e 911#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 912
cb41e5d6
YO
913#ifdef DEBUGGING
914int
6ad9a8ab 915Perl_re_printf(pTHX_ const char *fmt, ...)
cb41e5d6
YO
916{
917 va_list ap;
918 int result;
919 PerlIO *f= Perl_debug_log;
920 PERL_ARGS_ASSERT_RE_PRINTF;
921 va_start(ap, fmt);
922 result = PerlIO_vprintf(f, fmt, ap);
923 va_end(ap);
924 return result;
925}
926
927int
7b031478 928Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
cb41e5d6
YO
929{
930 va_list ap;
931 int result;
932 PerlIO *f= Perl_debug_log;
933 PERL_ARGS_ASSERT_RE_INDENTF;
934 va_start(ap, depth);
daeb874b 935 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
cb41e5d6
YO
936 result = PerlIO_vprintf(f, fmt, ap);
937 va_end(ap);
938 return result;
939}
940#endif /* DEBUGGING */
941
7b031478 942#define DEBUG_RExC_seen() \
538e84ed 943 DEBUG_OPTIMISE_MORE_r({ \
6ad9a8ab 944 Perl_re_printf( aTHX_ "RExC_seen: "); \
538e84ed 945 \
e384d5c1 946 if (RExC_seen & REG_ZERO_LEN_SEEN) \
6ad9a8ab 947 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
538e84ed 948 \
e384d5c1 949 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
6ad9a8ab 950 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
538e84ed 951 \
e384d5c1 952 if (RExC_seen & REG_GPOS_SEEN) \
6ad9a8ab 953 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
538e84ed 954 \
e384d5c1 955 if (RExC_seen & REG_RECURSE_SEEN) \
6ad9a8ab 956 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
538e84ed 957 \
7b031478 958 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
6ad9a8ab 959 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
538e84ed 960 \
e384d5c1 961 if (RExC_seen & REG_VERBARG_SEEN) \
6ad9a8ab 962 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
538e84ed 963 \
e384d5c1 964 if (RExC_seen & REG_CUTGROUP_SEEN) \
6ad9a8ab 965 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
538e84ed 966 \
e384d5c1 967 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
6ad9a8ab 968 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
538e84ed 969 \
e384d5c1 970 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
6ad9a8ab 971 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
538e84ed 972 \
7b031478 973 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
6ad9a8ab 974 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
ee273784 975 \
6ad9a8ab 976 Perl_re_printf( aTHX_ "\n"); \
9e9ecfdf
YO
977 });
978
fdfb4f21 979#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
6ad9a8ab 980 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
fdfb4f21
YO
981
982#define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \
983 if ( ( flags ) ) { \
6ad9a8ab 984 Perl_re_printf( aTHX_ "%s", open_str); \
fdfb4f21
YO
985 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \
986 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \
987 DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \
988 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \
989 DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \
990 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \
991 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \
992 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \
993 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \
994 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \
995 DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \
996 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \
997 DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \
998 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \
999 DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \
6ad9a8ab 1000 Perl_re_printf( aTHX_ "%s", close_str); \
fdfb4f21
YO
1001 }
1002
1003
6ea77169 1004#define DEBUG_STUDYDATA(where,data,depth) \
304ee84b 1005DEBUG_OPTIMISE_MORE_r(if(data){ \
6ea77169 1006 Perl_re_indentf( aTHX_ "" where "Pos:%" IVdf "/%" IVdf \
147e3846 1007 " Flags: 0x%" UVXf, \
7b031478 1008 depth, \
1de06328
YO
1009 (IV)((data)->pos_min), \
1010 (IV)((data)->pos_delta), \
fdfb4f21
YO
1011 (UV)((data)->flags) \
1012 ); \
1013 DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \
147e3846
KW
1014 Perl_re_printf( aTHX_ \
1015 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s", \
1de06328 1016 (IV)((data)->whilem_c), \
304ee84b
YO
1017 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
1018 is_inf ? "INF " : "" \
1de06328
YO
1019 ); \
1020 if ((data)->last_found) \
147e3846
KW
1021 Perl_re_printf( aTHX_ \
1022 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf \
1023 " %sFixed:'%s' @ %" IVdf \
1024 " %sFloat: '%s' @ %" IVdf "/%" IVdf, \
1de06328
YO
1025 SvPVX_const((data)->last_found), \
1026 (IV)((data)->last_end), \
1027 (IV)((data)->last_start_min), \
1028 (IV)((data)->last_start_max), \
a2ca2017 1029 (data)->longest == 0 ? "*" : "", \
6ea77169
DM
1030 SvPVX_const((data)->substrs[0].str), \
1031 (IV)((data)->substrs[0].min_offset), \
a2ca2017 1032 (data)->longest == 1 ? "*" : "", \
6ea77169
DM
1033 SvPVX_const((data)->substrs[1].str), \
1034 (IV)((data)->substrs[1].min_offset), \
1035 (IV)((data)->substrs[1].max_offset) \
1de06328 1036 ); \
6ad9a8ab 1037 Perl_re_printf( aTHX_ "\n"); \
1de06328
YO
1038});
1039
cb41e5d6 1040
c6871b76
KW
1041/* =========================================================
1042 * BEGIN edit_distance stuff.
1043 *
1044 * This calculates how many single character changes of any type are needed to
1045 * transform a string into another one. It is taken from version 3.1 of
1046 *
1047 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1048 */
1049
1050/* Our unsorted dictionary linked list. */
1051/* Note we use UVs, not chars. */
1052
1053struct dictionary{
1054 UV key;
1055 UV value;
1056 struct dictionary* next;
1057};
1058typedef struct dictionary item;
1059
1060
1061PERL_STATIC_INLINE item*
1062push(UV key,item* curr)
1063{
1064 item* head;
1065 Newxz(head, 1, item);
1066 head->key = key;
1067 head->value = 0;
1068 head->next = curr;
1069 return head;
1070}
1071
1072
1073PERL_STATIC_INLINE item*
1074find(item* head, UV key)
1075{
1076 item* iterator = head;
1077 while (iterator){
1078 if (iterator->key == key){
1079 return iterator;
1080 }
1081 iterator = iterator->next;
1082 }
1083
1084 return NULL;
1085}
1086
1087PERL_STATIC_INLINE item*
1088uniquePush(item* head,UV key)
1089{
1090 item* iterator = head;
1091
1092 while (iterator){
1093 if (iterator->key == key) {
1094 return head;
1095 }
1096 iterator = iterator->next;
1097 }
1098
1099 return push(key,head);
1100}
1101
1102PERL_STATIC_INLINE void
1103dict_free(item* head)
1104{
1105 item* iterator = head;
1106
1107 while (iterator) {
1108 item* temp = iterator;
1109 iterator = iterator->next;
1110 Safefree(temp);
1111 }
1112
1113 head = NULL;
1114}
1115
1116/* End of Dictionary Stuff */
1117
1118/* All calculations/work are done here */
1119STATIC int
1120S_edit_distance(const UV* src,
1121 const UV* tgt,
1122 const STRLEN x, /* length of src[] */
1123 const STRLEN y, /* length of tgt[] */
1124 const SSize_t maxDistance
1125)
1126{
1127 item *head = NULL;
1128 UV swapCount,swapScore,targetCharCount,i,j;
1129 UV *scores;
1130 UV score_ceil = x + y;
1131
1132 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1133
1134 /* intialize matrix start values */
1135 Newxz(scores, ( (x + 2) * (y + 2)), UV);
1136 scores[0] = score_ceil;
1137 scores[1 * (y + 2) + 0] = score_ceil;
1138 scores[0 * (y + 2) + 1] = score_ceil;
1139 scores[1 * (y + 2) + 1] = 0;
1140 head = uniquePush(uniquePush(head,src[0]),tgt[0]);
1141
1142 /* work loops */
1143 /* i = src index */
1144 /* j = tgt index */
1145 for (i=1;i<=x;i++) {
1146 if (i < x)
1147 head = uniquePush(head,src[i]);
1148 scores[(i+1) * (y + 2) + 1] = i;
1149 scores[(i+1) * (y + 2) + 0] = score_ceil;
1150 swapCount = 0;
1151
1152 for (j=1;j<=y;j++) {
1153 if (i == 1) {
1154 if(j < y)
1155 head = uniquePush(head,tgt[j]);
1156 scores[1 * (y + 2) + (j + 1)] = j;
1157 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1158 }
1159
1160 targetCharCount = find(head,tgt[j-1])->value;
1161 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1162
1163 if (src[i-1] != tgt[j-1]){
1164 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1165 }
1166 else {
1167 swapCount = j;
1168 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1169 }
1170 }
1171
1172 find(head,src[i-1])->value = i;
1173 }
1174
1175 {
1176 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1177 dict_free(head);
1178 Safefree(scores);
1179 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1180 }
1181}
1182
1183/* END of edit_distance() stuff
1184 * ========================================================= */
1185
8e35b056
KW
1186/* is c a control character for which we have a mnemonic? */
1187#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1188
549b4e78
KW
1189STATIC const char *
1190S_cntrl_to_mnemonic(const U8 c)
1191{
1192 /* Returns the mnemonic string that represents character 'c', if one
1193 * exists; NULL otherwise. The only ones that exist for the purposes of
1194 * this routine are a few control characters */
1195
1196 switch (c) {
1197 case '\a': return "\\a";
1198 case '\b': return "\\b";
1199 case ESC_NATIVE: return "\\e";
1200 case '\f': return "\\f";
1201 case '\n': return "\\n";
1202 case '\r': return "\\r";
1203 case '\t': return "\\t";
1204 }
1205
1206 return NULL;
1207}
1208
653099ff 1209/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 1210 Update the longest found anchored substring and the longest found
653099ff
GS
1211 floating substrings if needed. */
1212
4327152a 1213STATIC void
ea3daa5d
FC
1214S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1215 SSize_t *minlenp, int is_inf)
c277df42 1216{
e1ec3a88 1217 const STRLEN l = CHR_SVLEN(data->last_found);
a2ca2017
DM
1218 SV * const longest_sv = data->substrs[data->longest].str;
1219 const STRLEN old_l = CHR_SVLEN(longest_sv);
1de06328 1220 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1221
7918f24d
NC
1222 PERL_ARGS_ASSERT_SCAN_COMMIT;
1223
c277df42 1224 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
a2ca2017
DM
1225 SvSetMagicSV(longest_sv, data->last_found);
1226 if (data->longest == 0) { /* fixed */
6ea77169 1227 data->substrs[0].min_offset = l ? data->last_start_min : data->pos_min;
c277df42 1228 if (data->flags & SF_BEFORE_EOL)
b81d288d 1229 data->flags
c277df42
IZ
1230 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
1231 else
1232 data->flags &= ~SF_FIX_BEFORE_EOL;
6ea77169
DM
1233 data->substrs[0].minlenp = minlenp;
1234 data->substrs[0].lookbehind = 0;
a0ed51b3 1235 }
a2ca2017 1236 else { /* float */
6ea77169
DM
1237 data->substrs[1].min_offset = l ? data->last_start_min : data->pos_min;
1238 data->substrs[1].max_offset = (l
646e8787
DM
1239 ? data->last_start_max
1240 : (data->pos_delta > SSize_t_MAX - data->pos_min
ea3daa5d
FC
1241 ? SSize_t_MAX
1242 : data->pos_min + data->pos_delta));
1243 if (is_inf
6ea77169
DM
1244 || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1245 data->substrs[1].max_offset = SSize_t_MAX;
c277df42 1246 if (data->flags & SF_BEFORE_EOL)
b81d288d 1247 data->flags
c277df42
IZ
1248 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
1249 else
1250 data->flags &= ~SF_FL_BEFORE_EOL;
6ea77169
DM
1251 data->substrs[1].minlenp = minlenp;
1252 data->substrs[1].lookbehind = 0;
c277df42
IZ
1253 }
1254 }
1255 SvCUR_set(data->last_found, 0);
0eda9292 1256 {
a28509cc 1257 SV * const sv = data->last_found;
097eb12c
AL
1258 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1259 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1260 if (mg)
1261 mg->mg_len = 0;
1262 }
0eda9292 1263 }
c277df42
IZ
1264 data->last_end = -1;
1265 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 1266 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
1267}
1268
cdd87c1d
KW
1269/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1270 * list that describes which code points it matches */
1271
653099ff 1272STATIC void
3420edd7 1273S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 1274{
cdd87c1d
KW
1275 /* Set the SSC 'ssc' to match an empty string or any code point */
1276
557bd3fb 1277 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 1278
71068078 1279 assert(is_ANYOF_SYNTHETIC(ssc));
3fffb88a 1280
0854ea0b
KW
1281 /* mortalize so won't leak */
1282 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
93e92956 1283 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
653099ff
GS
1284}
1285
653099ff 1286STATIC int
dc3bf405 1287S_ssc_is_anything(const regnode_ssc *ssc)
653099ff 1288{
c144baaa
KW
1289 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1290 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1291 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1292 * in any way, so there's no point in using it */
cdd87c1d
KW
1293
1294 UV start, end;
1295 bool ret;
653099ff 1296
557bd3fb 1297 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 1298
71068078 1299 assert(is_ANYOF_SYNTHETIC(ssc));
cdd87c1d 1300
93e92956 1301 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
cdd87c1d
KW
1302 return FALSE;
1303 }
1304
1305 /* See if the list consists solely of the range 0 - Infinity */
1306 invlist_iterinit(ssc->invlist);
1307 ret = invlist_iternext(ssc->invlist, &start, &end)
1308 && start == 0
1309 && end == UV_MAX;
1310
1311 invlist_iterfinish(ssc->invlist);
1312
1313 if (ret) {
1314 return TRUE;
1315 }
1316
1317 /* If e.g., both \w and \W are set, matches everything */
e0e1be5f 1318 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1319 int i;
1320 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1321 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1322 return TRUE;
1323 }
1324 }
1325 }
1326
1327 return FALSE;
653099ff
GS
1328}
1329
653099ff 1330STATIC void
cdd87c1d 1331S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 1332{
cdd87c1d
KW
1333 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1334 * string, any code point, or any posix class under locale */
1335
557bd3fb 1336 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 1337
557bd3fb 1338 Zero(ssc, 1, regnode_ssc);
71068078 1339 set_ANYOF_SYNTHETIC(ssc);
93e92956 1340 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
3420edd7 1341 ssc_anything(ssc);
cdd87c1d 1342
2f306ab9
KW
1343 /* If any portion of the regex is to operate under locale rules that aren't
1344 * fully known at compile time, initialization includes it. The reason
1345 * this isn't done for all regexes is that the optimizer was written under
1346 * the assumption that locale was all-or-nothing. Given the complexity and
1347 * lack of documentation in the optimizer, and that there are inadequate
1348 * test cases for locale, many parts of it may not work properly, it is
1349 * safest to avoid locale unless necessary. */
cdd87c1d
KW
1350 if (RExC_contains_locale) {
1351 ANYOF_POSIXL_SETALL(ssc);
cdd87c1d
KW
1352 }
1353 else {
1354 ANYOF_POSIXL_ZERO(ssc);
1355 }
653099ff
GS
1356}
1357
b423522f 1358STATIC int
dc3bf405
BF
1359S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1360 const regnode_ssc *ssc)
b423522f
KW
1361{
1362 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1363 * to the list of code points matched, and locale posix classes; hence does
1364 * not check its flags) */
1365
1366 UV start, end;
1367 bool ret;
1368
1369 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1370
71068078 1371 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1372
1373 invlist_iterinit(ssc->invlist);
1374 ret = invlist_iternext(ssc->invlist, &start, &end)
1375 && start == 0
1376 && end == UV_MAX;
1377
1378 invlist_iterfinish(ssc->invlist);
1379
1380 if (! ret) {
1381 return FALSE;
1382 }
1383
e0e1be5f 1384 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
31f05a37 1385 return FALSE;
b423522f
KW
1386 }
1387
1388 return TRUE;
1389}
1390
1391STATIC SV*
1392S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
5c0f85ef 1393 const regnode_charclass* const node)
b423522f
KW
1394{
1395 /* Returns a mortal inversion list defining which code points are matched
1396 * by 'node', which is of type ANYOF. Handles complementing the result if
1397 * appropriate. If some code points aren't knowable at this time, the
31f05a37
KW
1398 * returned list must, and will, contain every code point that is a
1399 * possibility. */
b423522f 1400
e2506fa7 1401 SV* invlist = NULL;
1ee208c4 1402 SV* only_utf8_locale_invlist = NULL;
b423522f
KW
1403 unsigned int i;
1404 const U32 n = ARG(node);
31f05a37 1405 bool new_node_has_latin1 = FALSE;
b423522f
KW
1406
1407 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1408
1409 /* Look at the data structure created by S_set_ANYOF_arg() */
93e92956 1410 if (n != ANYOF_ONLY_HAS_BITMAP) {
b423522f
KW
1411 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1412 AV * const av = MUTABLE_AV(SvRV(rv));
1413 SV **const ary = AvARRAY(av);
1414 assert(RExC_rxi->data->what[n] == 's');
1415
1416 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1417 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1418 }
1419 else if (ary[0] && ary[0] != &PL_sv_undef) {
1420
1421 /* Here, no compile-time swash, and there are things that won't be
1422 * known until runtime -- we have to assume it could be anything */
e2506fa7 1423 invlist = sv_2mortal(_new_invlist(1));
b423522f
KW
1424 return _add_range_to_invlist(invlist, 0, UV_MAX);
1425 }
1ee208c4 1426 else if (ary[3] && ary[3] != &PL_sv_undef) {
b423522f
KW
1427
1428 /* Here no compile-time swash, and no run-time only data. Use the
1429 * node's inversion list */
1ee208c4
KW
1430 invlist = sv_2mortal(invlist_clone(ary[3]));
1431 }
1432
1433 /* Get the code points valid only under UTF-8 locales */
037715a6 1434 if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
1ee208c4
KW
1435 && ary[2] && ary[2] != &PL_sv_undef)
1436 {
1437 only_utf8_locale_invlist = ary[2];
b423522f
KW
1438 }
1439 }
1440
e2506fa7
KW
1441 if (! invlist) {
1442 invlist = sv_2mortal(_new_invlist(0));
1443 }
1444
dcb20b36
KW
1445 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1446 * code points, and an inversion list for the others, but if there are code
1447 * points that should match only conditionally on the target string being
1448 * UTF-8, those are placed in the inversion list, and not the bitmap.
1449 * Since there are circumstances under which they could match, they are
1450 * included in the SSC. But if the ANYOF node is to be inverted, we have
1451 * to exclude them here, so that when we invert below, the end result
1452 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1453 * have to do this here before we add the unconditionally matched code
1454 * points */
b423522f
KW
1455 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1456 _invlist_intersection_complement_2nd(invlist,
1457 PL_UpperLatin1,
1458 &invlist);
1459 }
1460
1461 /* Add in the points from the bit map */
dcb20b36 1462 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
b423522f 1463 if (ANYOF_BITMAP_TEST(node, i)) {
6f8848d5
TC
1464 unsigned int start = i++;
1465
1466 for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
1467 /* empty */
1468 }
1469 invlist = _add_range_to_invlist(invlist, start, i-1);
31f05a37 1470 new_node_has_latin1 = TRUE;
b423522f
KW
1471 }
1472 }
1473
1474 /* If this can match all upper Latin1 code points, have to add them
ac33c516
KW
1475 * as well. But don't add them if inverting, as when that gets done below,
1476 * it would exclude all these characters, including the ones it shouldn't
1477 * that were added just above */
1478 if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
f240c685
KW
1479 && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1480 {
b423522f
KW
1481 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1482 }
1483
1484 /* Similarly for these */
93e92956 1485 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
e0a1ff7a 1486 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
b423522f
KW
1487 }
1488
1489 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1490 _invlist_invert(invlist);
1491 }
037715a6 1492 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
31f05a37
KW
1493
1494 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1495 * locale. We can skip this if there are no 0-255 at all. */
1496 _invlist_union(invlist, PL_Latin1, &invlist);
1497 }
1498
1ee208c4
KW
1499 /* Similarly add the UTF-8 locale possible matches. These have to be
1500 * deferred until after the non-UTF-8 locale ones are taken care of just
1501 * above, or it leads to wrong results under ANYOF_INVERT */
1502 if (only_utf8_locale_invlist) {
31f05a37 1503 _invlist_union_maybe_complement_2nd(invlist,
1ee208c4 1504 only_utf8_locale_invlist,
31f05a37
KW
1505 ANYOF_FLAGS(node) & ANYOF_INVERT,
1506 &invlist);
1507 }
b423522f
KW
1508
1509 return invlist;
1510}
1511
1051e1c4 1512/* These two functions currently do the exact same thing */
557bd3fb 1513#define ssc_init_zero ssc_init
653099ff 1514
cdd87c1d
KW
1515#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1516#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1517
557bd3fb 1518/* 'AND' a given class with another one. Can create false positives. 'ssc'
93e92956
KW
1519 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1520 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1521
653099ff 1522STATIC void
b423522f 1523S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1524 const regnode_charclass *and_with)
653099ff 1525{
cdd87c1d
KW
1526 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1527 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1528
a0dd4231
KW
1529 SV* anded_cp_list;
1530 U8 anded_flags;
1e6ade67 1531
cdd87c1d 1532 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1533
71068078 1534 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1535
1536 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1537 * the code point inversion list and just the relevant flags */
71068078 1538 if (is_ANYOF_SYNTHETIC(and_with)) {
7dcac5f6 1539 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
a0dd4231 1540 anded_flags = ANYOF_FLAGS(and_with);
e9b08962
KW
1541
1542 /* XXX This is a kludge around what appears to be deficiencies in the
1543 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1544 * there are paths through the optimizer where it doesn't get weeded
1545 * out when it should. And if we don't make some extra provision for
1546 * it like the code just below, it doesn't get added when it should.
1547 * This solution is to add it only when AND'ing, which is here, and
1548 * only when what is being AND'ed is the pristine, original node
1549 * matching anything. Thus it is like adding it to ssc_anything() but
1550 * only when the result is to be AND'ed. Probably the same solution
1551 * could be adopted for the same problem we have with /l matching,
1552 * which is solved differently in S_ssc_init(), and that would lead to
1553 * fewer false positives than that solution has. But if this solution
1554 * creates bugs, the consequences are only that a warning isn't raised
1555 * that should be; while the consequences for having /l bugs is
1556 * incorrect matches */
7dcac5f6 1557 if (ssc_is_anything((regnode_ssc *)and_with)) {
f240c685 1558 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
e9b08962 1559 }
a0dd4231
KW
1560 }
1561 else {
5c0f85ef 1562 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
f240c685
KW
1563 if (OP(and_with) == ANYOFD) {
1564 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1565 }
1566 else {
1567 anded_flags = ANYOF_FLAGS(and_with)
1568 &( ANYOF_COMMON_FLAGS
108316fb
KW
1569 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1570 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
d1c40ef5
KW
1571 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
1572 anded_flags &=
1573 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1574 }
f240c685 1575 }
a0dd4231
KW
1576 }
1577
1578 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1579
1580 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1581 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1582 * 'and_with' may be inverted. When not inverted, we have the situation of
1583 * computing:
1584 * (C1 | P1) & (C2 | P2)
1585 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1586 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1587 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1588 * <= ((C1 & C2) | P1 | P2)
1589 * Alternatively, the last few steps could be:
1590 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1591 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1592 * <= (C1 | C2 | (P1 & P2))
1593 * We favor the second approach if either P1 or P2 is non-empty. This is
1594 * because these components are a barrier to doing optimizations, as what
1595 * they match cannot be known until the moment of matching as they are
1596 * dependent on the current locale, 'AND"ing them likely will reduce or
1597 * eliminate them.
1598 * But we can do better if we know that C1,P1 are in their initial state (a
1599 * frequent occurrence), each matching everything:
1600 * (<everything>) & (C2 | P2) = C2 | P2
1601 * Similarly, if C2,P2 are in their initial state (again a frequent
1602 * occurrence), the result is a no-op
1603 * (C1 | P1) & (<everything>) = C1 | P1
1604 *
1605 * Inverted, we have
1606 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1607 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1608 * <= (C1 & ~C2) | (P1 & ~P2)
1609 * */
1aa99e6b 1610
a0dd4231 1611 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
71068078 1612 && ! is_ANYOF_SYNTHETIC(and_with))
a0dd4231 1613 {
cdd87c1d 1614 unsigned int i;
8951c461 1615
cdd87c1d
KW
1616 ssc_intersection(ssc,
1617 anded_cp_list,
1618 FALSE /* Has already been inverted */
1619 );
c6b76537 1620
cdd87c1d
KW
1621 /* If either P1 or P2 is empty, the intersection will be also; can skip
1622 * the loop */
93e92956 1623 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
cdd87c1d
KW
1624 ANYOF_POSIXL_ZERO(ssc);
1625 }
e0e1be5f 1626 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1627
1628 /* Note that the Posix class component P from 'and_with' actually
1629 * looks like:
1630 * P = Pa | Pb | ... | Pn
1631 * where each component is one posix class, such as in [\w\s].
1632 * Thus
1633 * ~P = ~(Pa | Pb | ... | Pn)
1634 * = ~Pa & ~Pb & ... & ~Pn
1635 * <= ~Pa | ~Pb | ... | ~Pn
1636 * The last is something we can easily calculate, but unfortunately
1637 * is likely to have many false positives. We could do better
1638 * in some (but certainly not all) instances if two classes in
1639 * P have known relationships. For example
1640 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1641 * So
1642 * :lower: & :print: = :lower:
1643 * And similarly for classes that must be disjoint. For example,
1644 * since \s and \w can have no elements in common based on rules in
1645 * the POSIX standard,
1646 * \w & ^\S = nothing
1647 * Unfortunately, some vendor locales do not meet the Posix
1648 * standard, in particular almost everything by Microsoft.
1649 * The loop below just changes e.g., \w into \W and vice versa */
1650
1ee208c4 1651 regnode_charclass_posixl temp;
cdd87c1d
KW
1652 int add = 1; /* To calculate the index of the complement */
1653
1654 ANYOF_POSIXL_ZERO(&temp);
1655 for (i = 0; i < ANYOF_MAX; i++) {
1656 assert(i % 2 != 0
7dcac5f6
KW
1657 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1658 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
cdd87c1d 1659
7dcac5f6 1660 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
cdd87c1d
KW
1661 ANYOF_POSIXL_SET(&temp, i + add);
1662 }
1663 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1664 }
1665 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1666
cdd87c1d
KW
1667 } /* else ssc already has no posixes */
1668 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1669 in its initial state */
71068078 1670 else if (! is_ANYOF_SYNTHETIC(and_with)
7dcac5f6 1671 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
cdd87c1d
KW
1672 {
1673 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1674 * copy it over 'ssc' */
1675 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
71068078 1676 if (is_ANYOF_SYNTHETIC(and_with)) {
cdd87c1d
KW
1677 StructCopy(and_with, ssc, regnode_ssc);
1678 }
1679 else {
1680 ssc->invlist = anded_cp_list;
1681 ANYOF_POSIXL_ZERO(ssc);
93e92956 1682 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1683 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
cdd87c1d
KW
1684 }
1685 }
1686 }
e0e1be5f 1687 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
93e92956 1688 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
cdd87c1d
KW
1689 {
1690 /* One or the other of P1, P2 is non-empty. */
93e92956 1691 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1ea8b7fe
KW
1692 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1693 }
cdd87c1d
KW
1694 ssc_union(ssc, anded_cp_list, FALSE);
1695 }
1696 else { /* P1 = P2 = empty */
1697 ssc_intersection(ssc, anded_cp_list, FALSE);
1698 }
137165a6 1699 }
653099ff
GS
1700}
1701
653099ff 1702STATIC void
cdd87c1d 1703S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1704 const regnode_charclass *or_with)
653099ff 1705{
cdd87c1d
KW
1706 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1707 * another SSC or a regular ANYOF class. Can create false positives if
1708 * 'or_with' is to be inverted. */
7918f24d 1709
a0dd4231
KW
1710 SV* ored_cp_list;
1711 U8 ored_flags;
c6b76537 1712
cdd87c1d 1713 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 1714
71068078 1715 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1716
1717 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1718 * the code point inversion list and just the relevant flags */
71068078 1719 if (is_ANYOF_SYNTHETIC(or_with)) {
7dcac5f6 1720 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
a0dd4231
KW
1721 ored_flags = ANYOF_FLAGS(or_with);
1722 }
1723 else {
5c0f85ef 1724 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
eff8b7dc 1725 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
f240c685
KW
1726 if (OP(or_with) != ANYOFD) {
1727 ored_flags
1728 |= ANYOF_FLAGS(or_with)
108316fb
KW
1729 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1730 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
d1c40ef5
KW
1731 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
1732 ored_flags |=
1733 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1734 }
f240c685 1735 }
a0dd4231
KW
1736 }
1737
1738 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
1739
1740 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1741 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1742 * 'or_with' may be inverted. When not inverted, we have the simple
1743 * situation of computing:
1744 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1745 * If P1|P2 yields a situation with both a class and its complement are
1746 * set, like having both \w and \W, this matches all code points, and we
1747 * can delete these from the P component of the ssc going forward. XXX We
1748 * might be able to delete all the P components, but I (khw) am not certain
1749 * about this, and it is better to be safe.
1750 *
1751 * Inverted, we have
1752 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1753 * <= (C1 | P1) | ~C2
1754 * <= (C1 | ~C2) | P1
1755 * (which results in actually simpler code than the non-inverted case)
1756 * */
9826f543 1757
a0dd4231 1758 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
71068078 1759 && ! is_ANYOF_SYNTHETIC(or_with))
a0dd4231 1760 {
cdd87c1d 1761 /* We ignore P2, leaving P1 going forward */
1ea8b7fe 1762 } /* else Not inverted */
93e92956 1763 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1764 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
e0e1be5f 1765 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1766 unsigned int i;
1767 for (i = 0; i < ANYOF_MAX; i += 2) {
1768 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1769 {
1770 ssc_match_all_cp(ssc);
1771 ANYOF_POSIXL_CLEAR(ssc, i);
1772 ANYOF_POSIXL_CLEAR(ssc, i+1);
cdd87c1d
KW
1773 }
1774 }
1775 }
1aa99e6b 1776 }
cdd87c1d
KW
1777
1778 ssc_union(ssc,
1779 ored_cp_list,
1780 FALSE /* Already has been inverted */
1781 );
653099ff
GS
1782}
1783
b423522f
KW
1784PERL_STATIC_INLINE void
1785S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1786{
1787 PERL_ARGS_ASSERT_SSC_UNION;
1788
71068078 1789 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1790
1791 _invlist_union_maybe_complement_2nd(ssc->invlist,
1792 invlist,
1793 invert2nd,
1794 &ssc->invlist);
1795}
1796
1797PERL_STATIC_INLINE void
1798S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1799 SV* const invlist,
1800 const bool invert2nd)
1801{
1802 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1803
71068078 1804 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1805
1806 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1807 invlist,
1808 invert2nd,
1809 &ssc->invlist);
1810}
1811
1812PERL_STATIC_INLINE void
1813S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1814{
1815 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1816
71068078 1817 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1818
1819 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1820}
1821
1822PERL_STATIC_INLINE void
1823S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1824{
1825 /* AND just the single code point 'cp' into the SSC 'ssc' */
1826
1827 SV* cp_list = _new_invlist(2);
1828
1829 PERL_ARGS_ASSERT_SSC_CP_AND;
1830
71068078 1831 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1832
1833 cp_list = add_cp_to_invlist(cp_list, cp);
1834 ssc_intersection(ssc, cp_list,
1835 FALSE /* Not inverted */
1836 );
1837 SvREFCNT_dec_NN(cp_list);
1838}
1839
1840PERL_STATIC_INLINE void
dc3bf405 1841S_ssc_clear_locale(regnode_ssc *ssc)
b423522f
KW
1842{
1843 /* Set the SSC 'ssc' to not match any locale things */
b423522f
KW
1844 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1845
71068078 1846 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1847
1848 ANYOF_POSIXL_ZERO(ssc);
1849 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1850}
1851
b35552de
KW
1852#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1853
1854STATIC bool
1855S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1856{
1857 /* The synthetic start class is used to hopefully quickly winnow down
1858 * places where a pattern could start a match in the target string. If it
1859 * doesn't really narrow things down that much, there isn't much point to
1860 * having the overhead of using it. This function uses some very crude
1861 * heuristics to decide if to use the ssc or not.
1862 *
1863 * It returns TRUE if 'ssc' rules out more than half what it considers to
1864 * be the "likely" possible matches, but of course it doesn't know what the
1865 * actual things being matched are going to be; these are only guesses
1866 *
1867 * For /l matches, it assumes that the only likely matches are going to be
1868 * in the 0-255 range, uniformly distributed, so half of that is 127
1869 * For /a and /d matches, it assumes that the likely matches will be just
1870 * the ASCII range, so half of that is 63
1871 * For /u and there isn't anything matching above the Latin1 range, it
1872 * assumes that that is the only range likely to be matched, and uses
1873 * half that as the cut-off: 127. If anything matches above Latin1,
1874 * it assumes that all of Unicode could match (uniformly), except for
1875 * non-Unicode code points and things in the General Category "Other"
1876 * (unassigned, private use, surrogates, controls and formats). This
1877 * is a much large number. */
1878
b35552de
KW
1879 U32 count = 0; /* Running total of number of code points matched by
1880 'ssc' */
1881 UV start, end; /* Start and end points of current range in inversion
1882 list */
72400949
KW
1883 const U32 max_code_points = (LOC)
1884 ? 256
1885 : (( ! UNI_SEMANTICS
1886 || invlist_highest(ssc->invlist) < 256)
1887 ? 128
1888 : NON_OTHER_COUNT);
1889 const U32 max_match = max_code_points / 2;
b35552de
KW
1890
1891 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1892
1893 invlist_iterinit(ssc->invlist);
1894 while (invlist_iternext(ssc->invlist, &start, &end)) {
72400949
KW
1895 if (start >= max_code_points) {
1896 break;
b35552de 1897 }
72400949 1898 end = MIN(end, max_code_points - 1);
b35552de 1899 count += end - start + 1;
72400949 1900 if (count >= max_match) {
b35552de
KW
1901 invlist_iterfinish(ssc->invlist);
1902 return FALSE;
1903 }
1904 }
1905
1906 return TRUE;
1907}
1908
1909
b423522f
KW
1910STATIC void
1911S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1912{
1913 /* The inversion list in the SSC is marked mortal; now we need a more
1914 * permanent copy, which is stored the same way that is done in a regular
dcb20b36
KW
1915 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1916 * map */
b423522f
KW
1917
1918 SV* invlist = invlist_clone(ssc->invlist);
1919
1920 PERL_ARGS_ASSERT_SSC_FINALIZE;
1921
71068078 1922 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f 1923
a0dd4231 1924 /* The code in this file assumes that all but these flags aren't relevant
93e92956
KW
1925 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1926 * by the time we reach here */
f240c685
KW
1927 assert(! (ANYOF_FLAGS(ssc)
1928 & ~( ANYOF_COMMON_FLAGS
108316fb
KW
1929 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1930 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
a0dd4231 1931
b423522f
KW
1932 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1933
1ee208c4
KW
1934 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1935 NULL, NULL, NULL, FALSE);
b423522f 1936
85c8e306
KW
1937 /* Make sure is clone-safe */
1938 ssc->invlist = NULL;
1939
e0e1be5f 1940 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
93e92956 1941 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
e0e1be5f 1942 }
1462525b 1943
b2e90ddf
KW
1944 if (RExC_contains_locale) {
1945 OP(ssc) = ANYOFL;
1946 }
1947
1462525b 1948 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
b423522f
KW
1949}
1950
a3621e74
YO
1951#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1952#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1953#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
538e84ed
KW
1954#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1955 ? (TRIE_LIST_CUR( idx ) - 1) \
1956 : 0 )
a3621e74 1957
3dab1dad
YO
1958
1959#ifdef DEBUGGING
07be1b83 1960/*
2b8b4781
NC
1961 dump_trie(trie,widecharmap,revcharmap)
1962 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1963 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
1964
1965 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
1966 The _interim_ variants are used for debugging the interim
1967 tables that are used to generate the final compressed
1968 representation which is what dump_trie expects.
1969
486ec47a 1970 Part of the reason for their existence is to provide a form
3dab1dad 1971 of documentation as to how the different representations function.
07be1b83
YO
1972
1973*/
3dab1dad
YO
1974
1975/*
3dab1dad
YO
1976 Dumps the final compressed table form of the trie to Perl_debug_log.
1977 Used for debugging make_trie().
1978*/
b9a59e08 1979
3dab1dad 1980STATIC void
2b8b4781
NC
1981S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1982 AV *revcharmap, U32 depth)
3dab1dad
YO
1983{
1984 U32 state;
ab3bbdeb 1985 SV *sv=sv_newmortal();
55eed653 1986 int colwidth= widecharmap ? 6 : 4;
2e64971a 1987 U16 word;
3dab1dad
YO
1988 GET_RE_DEBUG_FLAGS_DECL;
1989
7918f24d 1990 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 1991
6ad9a8ab 1992 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
1e37780e 1993 depth+1, "Match","Base","Ofs" );
3dab1dad
YO
1994
1995 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 1996 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 1997 if ( tmp ) {
6ad9a8ab 1998 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 1999 colwidth,
538e84ed 2000 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2001 PL_colors[0], PL_colors[1],
2002 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2003 PERL_PV_ESCAPE_FIRSTCHAR
2004 )
ab3bbdeb 2005 );
3dab1dad
YO
2006 }
2007 }
1e37780e
YO
2008 Perl_re_printf( aTHX_ "\n");
2009 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
3dab1dad
YO
2010
2011 for( state = 0 ; state < trie->uniquecharcount ; state++ )
6ad9a8ab
YO
2012 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2013 Perl_re_printf( aTHX_ "\n");
3dab1dad 2014
1e2e3d02 2015 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 2016 const U32 base = trie->states[ state ].trans.base;
3dab1dad 2017
147e3846 2018 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
3dab1dad
YO
2019
2020 if ( trie->states[ state ].wordnum ) {
1e37780e 2021 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
3dab1dad 2022 } else {
6ad9a8ab 2023 Perl_re_printf( aTHX_ "%6s", "" );
3dab1dad
YO
2024 }
2025
147e3846 2026 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
3dab1dad
YO
2027
2028 if ( base ) {
2029 U32 ofs = 0;
2030
2031 while( ( base + ofs < trie->uniquecharcount ) ||
2032 ( base + ofs - trie->uniquecharcount < trie->lasttrans
538e84ed
KW
2033 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2034 != state))
3dab1dad
YO
2035 ofs++;
2036
147e3846 2037 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
3dab1dad
YO
2038
2039 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
538e84ed
KW
2040 if ( ( base + ofs >= trie->uniquecharcount )
2041 && ( base + ofs - trie->uniquecharcount
2042 < trie->lasttrans )
2043 && trie->trans[ base + ofs
2044 - trie->uniquecharcount ].check == state )
3dab1dad 2045 {
147e3846 2046 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
1e37780e
YO
2047 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2048 );
3dab1dad 2049 } else {
6ad9a8ab 2050 Perl_re_printf( aTHX_ "%*s",colwidth," ." );
3dab1dad
YO
2051 }
2052 }
2053
6ad9a8ab 2054 Perl_re_printf( aTHX_ "]");
3dab1dad
YO
2055
2056 }
6ad9a8ab 2057 Perl_re_printf( aTHX_ "\n" );
3dab1dad 2058 }
6ad9a8ab 2059 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
cb41e5d6 2060 depth);
2e64971a 2061 for (word=1; word <= trie->wordcount; word++) {
6ad9a8ab 2062 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2e64971a
DM
2063 (int)word, (int)(trie->wordinfo[word].prev),
2064 (int)(trie->wordinfo[word].len));
2065 }
6ad9a8ab 2066 Perl_re_printf( aTHX_ "\n" );
538e84ed 2067}
3dab1dad 2068/*
3dab1dad 2069 Dumps a fully constructed but uncompressed trie in list form.
538e84ed 2070 List tries normally only are used for construction when the number of
3dab1dad
YO
2071 possible chars (trie->uniquecharcount) is very high.
2072 Used for debugging make_trie().
2073*/
2074STATIC void
55eed653 2075S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2076 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2077 U32 depth)
3dab1dad
YO
2078{
2079 U32 state;
ab3bbdeb 2080 SV *sv=sv_newmortal();
55eed653 2081 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2082 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2083
2084 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2085
3dab1dad 2086 /* print out the table precompression. */
6ad9a8ab 2087 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
cb41e5d6 2088 depth+1 );
6ad9a8ab 2089 Perl_re_indentf( aTHX_ "%s",
cb41e5d6 2090 depth+1, "------:-----+-----------------\n" );
538e84ed 2091
3dab1dad
YO
2092 for( state=1 ; state < next_alloc ; state ++ ) {
2093 U16 charid;
538e84ed 2094
147e3846 2095 Perl_re_indentf( aTHX_ " %4" UVXf " :",
cb41e5d6 2096 depth+1, (UV)state );
3dab1dad 2097 if ( ! trie->states[ state ].wordnum ) {
6ad9a8ab 2098 Perl_re_printf( aTHX_ "%5s| ","");
3dab1dad 2099 } else {
6ad9a8ab 2100 Perl_re_printf( aTHX_ "W%4x| ",
3dab1dad
YO
2101 trie->states[ state ].wordnum
2102 );
2103 }
2104 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
538e84ed
KW
2105 SV ** const tmp = av_fetch( revcharmap,
2106 TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb 2107 if ( tmp ) {
147e3846 2108 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
ab3bbdeb 2109 colwidth,
538e84ed
KW
2110 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2111 colwidth,
2112 PL_colors[0], PL_colors[1],
2113 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2114 | PERL_PV_ESCAPE_FIRSTCHAR
ab3bbdeb 2115 ) ,
1e2e3d02
YO
2116 TRIE_LIST_ITEM(state,charid).forid,
2117 (UV)TRIE_LIST_ITEM(state,charid).newstate
2118 );
538e84ed 2119 if (!(charid % 10))
6ad9a8ab 2120 Perl_re_printf( aTHX_ "\n%*s| ",
664e119d 2121 (int)((depth * 2) + 14), "");
1e2e3d02 2122 }
ab3bbdeb 2123 }
6ad9a8ab 2124 Perl_re_printf( aTHX_ "\n");
3dab1dad 2125 }
538e84ed 2126}
3dab1dad
YO
2127
2128/*
3dab1dad 2129 Dumps a fully constructed but uncompressed trie in table form.
538e84ed
KW
2130 This is the normal DFA style state transition table, with a few
2131 twists to facilitate compression later.
3dab1dad
YO
2132 Used for debugging make_trie().
2133*/
2134STATIC void
55eed653 2135S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2136 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2137 U32 depth)
3dab1dad
YO
2138{
2139 U32 state;
2140 U16 charid;
ab3bbdeb 2141 SV *sv=sv_newmortal();
55eed653 2142 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2143 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2144
2145 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
538e84ed 2146
3dab1dad
YO
2147 /*
2148 print out the table precompression so that we can do a visual check
2149 that they are identical.
2150 */
538e84ed 2151
6ad9a8ab 2152 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
3dab1dad
YO
2153
2154 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 2155 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 2156 if ( tmp ) {
6ad9a8ab 2157 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2158 colwidth,
538e84ed 2159 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2160 PL_colors[0], PL_colors[1],
2161 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2162 PERL_PV_ESCAPE_FIRSTCHAR
2163 )
ab3bbdeb 2164 );
3dab1dad
YO
2165 }
2166 }
2167
4aaafc03
YO
2168 Perl_re_printf( aTHX_ "\n");
2169 Perl_re_indentf( aTHX_ "State+-", depth+1 );
3dab1dad
YO
2170
2171 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
6ad9a8ab 2172 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
3dab1dad
YO
2173 }
2174
6ad9a8ab 2175 Perl_re_printf( aTHX_ "\n" );
3dab1dad
YO
2176
2177 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2178
147e3846 2179 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
cb41e5d6 2180 depth+1,
3dab1dad
YO
2181 (UV)TRIE_NODENUM( state ) );
2182
2183 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
2184 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2185 if (v)
147e3846 2186 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
ab3bbdeb 2187 else
6ad9a8ab 2188 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
3dab1dad
YO
2189 }
2190 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
147e3846 2191 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
538e84ed 2192 (UV)trie->trans[ state ].check );
3dab1dad 2193 } else {
147e3846 2194 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
538e84ed 2195 (UV)trie->trans[ state ].check,
3dab1dad
YO
2196 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2197 }
2198 }
07be1b83 2199}
3dab1dad
YO
2200
2201#endif
2202
2e64971a 2203
786e8c11
YO
2204/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2205 startbranch: the first branch in the whole branch sequence
2206 first : start branch of sequence of branch-exact nodes.
2207 May be the same as startbranch
2208 last : Thing following the last branch.
2209 May be the same as tail.
2210 tail : item following the branch sequence
2211 count : words in the sequence
a4525e78 2212 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
786e8c11 2213 depth : indent depth
3dab1dad 2214
786e8c11 2215Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 2216
786e8c11
YO
2217A trie is an N'ary tree where the branches are determined by digital
2218decomposition of the key. IE, at the root node you look up the 1st character and
2219follow that branch repeat until you find the end of the branches. Nodes can be
2220marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 2221
786e8c11 2222 /he|she|his|hers/
72f13be8 2223
786e8c11
YO
2224would convert into the following structure. Numbers represent states, letters
2225following numbers represent valid transitions on the letter from that state, if
2226the number is in square brackets it represents an accepting state, otherwise it
2227will be in parenthesis.
07be1b83 2228
786e8c11
YO
2229 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2230 | |
2231 | (2)
2232 | |
2233 (1) +-i->(6)-+-s->[7]
2234 |
2235 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 2236
786e8c11
YO
2237 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2238
2239This shows that when matching against the string 'hers' we will begin at state 1
2240read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2241then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2242is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2243single traverse. We store a mapping from accepting to state to which word was
2244matched, and then when we have multiple possibilities we try to complete the
b8fda935 2245rest of the regex in the order in which they occurred in the alternation.
786e8c11
YO
2246
2247The only prior NFA like behaviour that would be changed by the TRIE support is
2248the silent ignoring of duplicate alternations which are of the form:
2249
2250 / (DUPE|DUPE) X? (?{ ... }) Y /x
2251
4b714af6 2252Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 2253and without the optimisation. With the optimisations dupes will be silently
486ec47a 2254ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
2255the following demonstrates:
2256
2257 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2258
2259which prints out 'word' three times, but
2260
2261 'words'=~/(word|word|word)(?{ print $1 })S/
2262
2263which doesnt print it out at all. This is due to other optimisations kicking in.
2264
2265Example of what happens on a structural level:
2266
486ec47a 2267The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
2268
2269 1: CURLYM[1] {1,32767}(18)
2270 5: BRANCH(8)
2271 6: EXACT <ac>(16)
2272 8: BRANCH(11)
2273 9: EXACT <ad>(16)
2274 11: BRANCH(14)
2275 12: EXACT <ab>(16)
2276 16: SUCCEED(0)
2277 17: NOTHING(18)
2278 18: END(0)
2279
2280This would be optimizable with startbranch=5, first=5, last=16, tail=16
2281and should turn into:
2282
2283 1: CURLYM[1] {1,32767}(18)
2284 5: TRIE(16)
2285 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2286 <ac>
2287 <ad>
2288 <ab>
2289 16: SUCCEED(0)
2290 17: NOTHING(18)
2291 18: END(0)
2292
2293Cases where tail != last would be like /(?foo|bar)baz/:
2294
2295 1: BRANCH(4)
2296 2: EXACT <foo>(8)
2297 4: BRANCH(7)
2298 5: EXACT <bar>(8)
2299 7: TAIL(8)
2300 8: EXACT <baz>(10)
2301 10: END(0)
2302
2303which would be optimizable with startbranch=1, first=1, last=7, tail=8
2304and would end up looking like:
2305
2306 1: TRIE(8)
2307 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2308 <foo>
2309 <bar>
2310 7: TAIL(8)
2311 8: EXACT <baz>(10)
2312 10: END(0)
2313
c80e42f3 2314 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
2315
2316is the recommended Unicode-aware way of saying
2317
2318 *(d++) = uv;
2319*/
2320
fab2782b 2321#define TRIE_STORE_REVCHAR(val) \
786e8c11 2322 STMT_START { \
73031816 2323 if (UTF) { \
668fcfea 2324 SV *zlopp = newSV(UTF8_MAXBYTES); \
88c9ea1e 2325 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
c80e42f3 2326 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
73031816
NC
2327 SvCUR_set(zlopp, kapow - flrbbbbb); \
2328 SvPOK_on(zlopp); \
2329 SvUTF8_on(zlopp); \
2330 av_push(revcharmap, zlopp); \
2331 } else { \
fab2782b 2332 char ooooff = (char)val; \
73031816
NC
2333 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2334 } \
2335 } STMT_END
786e8c11 2336
914a25d5
KW
2337/* This gets the next character from the input, folding it if not already
2338 * folded. */
2339#define TRIE_READ_CHAR STMT_START { \
2340 wordlen++; \
2341 if ( UTF ) { \
2342 /* if it is UTF then it is either already folded, or does not need \
2343 * folding */ \
1c1d615a 2344 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
2345 } \
2346 else if (folder == PL_fold_latin1) { \
7d006b13
KW
2347 /* This folder implies Unicode rules, which in the range expressible \
2348 * by not UTF is the lower case, with the two exceptions, one of \
2349 * which should have been taken care of before calling this */ \
2350 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2351 uvc = toLOWER_L1(*uc); \
2352 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2353 len = 1; \
914a25d5
KW
2354 } else { \
2355 /* raw data, will be folded later if needed */ \
2356 uvc = (U32)*uc; \
2357 len = 1; \
2358 } \
786e8c11
YO
2359} STMT_END
2360
2361
2362
2363#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2364 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
00195859 2365 U32 ging = TRIE_LIST_LEN( state ) * 2; \
f9003953 2366 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
00195859 2367 TRIE_LIST_LEN( state ) = ging; \
786e8c11
YO
2368 } \
2369 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2370 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2371 TRIE_LIST_CUR( state )++; \
2372} STMT_END
07be1b83 2373
786e8c11
YO
2374#define TRIE_LIST_NEW(state) STMT_START { \
2375 Newxz( trie->states[ state ].trans.list, \
2376 4, reg_trie_trans_le ); \
2377 TRIE_LIST_CUR( state ) = 1; \
2378 TRIE_LIST_LEN( state ) = 4; \
2379} STMT_END
07be1b83 2380
786e8c11
YO
2381#define TRIE_HANDLE_WORD(state) STMT_START { \
2382 U16 dupe= trie->states[ state ].wordnum; \
2383 regnode * const noper_next = regnext( noper ); \
2384 \
786e8c11
YO
2385 DEBUG_r({ \
2386 /* store the word for dumping */ \
2387 SV* tmp; \
2388 if (OP(noper) != NOTHING) \
740cce10 2389 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 2390 else \
740cce10 2391 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 2392 av_push( trie_words, tmp ); \
786e8c11
YO
2393 }); \
2394 \
2395 curword++; \
2e64971a
DM
2396 trie->wordinfo[curword].prev = 0; \
2397 trie->wordinfo[curword].len = wordlen; \
2398 trie->wordinfo[curword].accept = state; \
786e8c11
YO
2399 \
2400 if ( noper_next < tail ) { \
2401 if (!trie->jump) \
538e84ed
KW
2402 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2403 sizeof(U16) ); \
7f69552c 2404 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
2405 if (!jumper) \
2406 jumper = noper_next; \
2407 if (!nextbranch) \
2408 nextbranch= regnext(cur); \
2409 } \
2410 \
2411 if ( dupe ) { \
2e64971a
DM
2412 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2413 /* chain, so that when the bits of chain are later */\
2414 /* linked together, the dups appear in the chain */\
2415 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2416 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
2417 } else { \
2418 /* we haven't inserted this word yet. */ \
2419 trie->states[ state ].wordnum = curword; \
2420 } \
2421} STMT_END
07be1b83 2422
3dab1dad 2423
786e8c11
YO
2424#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2425 ( ( base + charid >= ucharcount \
2426 && base + charid < ubound \
2427 && state == trie->trans[ base - ucharcount + charid ].check \
2428 && trie->trans[ base - ucharcount + charid ].next ) \
2429 ? trie->trans[ base - ucharcount + charid ].next \
2430 : ( state==1 ? special : 0 ) \
2431 )
3dab1dad 2432
8bcafbf4
YO
2433#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2434STMT_START { \
2435 TRIE_BITMAP_SET(trie, uvc); \
2436 /* store the folded codepoint */ \
2437 if ( folder ) \
2438 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2439 \
2440 if ( !UTF ) { \
2441 /* store first byte of utf8 representation of */ \
2442 /* variant codepoints */ \
2443 if (! UVCHR_IS_INVARIANT(uvc)) { \
2444 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2445 } \
2446 } \
2447} STMT_END
786e8c11
YO
2448#define MADE_TRIE 1
2449#define MADE_JUMP_TRIE 2
2450#define MADE_EXACT_TRIE 4
3dab1dad 2451
a3621e74 2452STATIC I32
538e84ed
KW
2453S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2454 regnode *first, regnode *last, regnode *tail,
2455 U32 word_count, U32 flags, U32 depth)
a3621e74
YO
2456{
2457 /* first pass, loop through and scan words */
2458 reg_trie_data *trie;
55eed653 2459 HV *widecharmap = NULL;
2b8b4781 2460 AV *revcharmap = newAV();
a3621e74 2461 regnode *cur;
a3621e74
YO
2462 STRLEN len = 0;
2463 UV uvc = 0;
2464 U16 curword = 0;
2465 U32 next_alloc = 0;
786e8c11
YO
2466 regnode *jumper = NULL;
2467 regnode *nextbranch = NULL;
7f69552c 2468 regnode *convert = NULL;
2e64971a 2469 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 2470 /* we just use folder as a flag in utf8 */
1e696034 2471 const U8 * folder = NULL;
a3621e74 2472
3a611511
YO
2473 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2474 * which stands for one trie structure, one hash, optionally followed
2475 * by two arrays */
2b8b4781 2476#ifdef DEBUGGING
3a611511 2477 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2b8b4781
NC
2478 AV *trie_words = NULL;
2479 /* along with revcharmap, this only used during construction but both are
2480 * useful during debugging so we store them in the struct when debugging.
8e11feef 2481 */
2b8b4781 2482#else
cf78de0b 2483 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 2484 STRLEN trie_charcount=0;
3dab1dad 2485#endif
2b8b4781 2486 SV *re_trie_maxbuff;
a3621e74 2487 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2488
2489 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
2490#ifndef DEBUGGING
2491 PERL_UNUSED_ARG(depth);
2492#endif
a3621e74 2493
1e696034 2494 switch (flags) {
a4525e78 2495 case EXACT: case EXACTL: break;
2f7f8cb1 2496 case EXACTFA:
fab2782b 2497 case EXACTFU_SS:
a4525e78
KW
2498 case EXACTFU:
2499 case EXACTFLU8: folder = PL_fold_latin1; break;
1e696034 2500 case EXACTF: folder = PL_fold; break;
fab2782b 2501 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
2502 }
2503
c944940b 2504 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 2505 trie->refcount = 1;
3dab1dad 2506 trie->startstate = 1;
786e8c11 2507 trie->wordcount = word_count;
f8fc2ecf 2508 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 2509 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
a4525e78 2510 if (flags == EXACT || flags == EXACTL)
c944940b 2511 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
2512 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2513 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2514
a3621e74 2515 DEBUG_r({
2b8b4781 2516 trie_words = newAV();
a3621e74 2517 });
a3621e74 2518
0111c4fd 2519 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
316ebaf2 2520 assert(re_trie_maxbuff);
a3621e74 2521 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 2522 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 2523 }
df826430 2524 DEBUG_TRIE_COMPILE_r({
6ad9a8ab 2525 Perl_re_indentf( aTHX_
cb41e5d6
YO
2526 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2527 depth+1,
538e84ed
KW
2528 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2529 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
3dab1dad 2530 });
538e84ed 2531
7f69552c
YO
2532 /* Find the node we are going to overwrite */
2533 if ( first == startbranch && OP( last ) != BRANCH ) {
2534 /* whole branch chain */
2535 convert = first;
2536 } else {
2537 /* branch sub-chain */
2538 convert = NEXTOPER( first );
2539 }
538e84ed 2540
a3621e74
YO
2541 /* -- First loop and Setup --
2542
2543 We first traverse the branches and scan each word to determine if it
2544 contains widechars, and how many unique chars there are, this is
2545 important as we have to build a table with at least as many columns as we
2546 have unique chars.
2547
2548 We use an array of integers to represent the character codes 0..255
538e84ed
KW
2549 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2550 the native representation of the character value as the key and IV's for
2551 the coded index.
a3621e74
YO
2552
2553 *TODO* If we keep track of how many times each character is used we can
2554 remap the columns so that the table compression later on is more
3b753521 2555 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
2556 middle and the least common are on the outside. IMO this would be better
2557 than a most to least common mapping as theres a decent chance the most
2558 common letter will share a node with the least common, meaning the node
486ec47a 2559 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
2560 case is when we have the least common nodes twice.
2561
2562 */
2563
a3621e74 2564 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 2565 regnode *noper = NEXTOPER( cur );
944e05e3
YO
2566 const U8 *uc;
2567 const U8 *e;
bc031a7d 2568 int foldlen = 0;
07be1b83 2569 U32 wordlen = 0; /* required init */
bc031a7d
KW
2570 STRLEN minchars = 0;
2571 STRLEN maxchars = 0;
538e84ed
KW
2572 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2573 bitmap?*/
a3621e74 2574
3dab1dad 2575 if (OP(noper) == NOTHING) {
20ed8c88
YO
2576 /* skip past a NOTHING at the start of an alternation
2577 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2578 */
df826430 2579 regnode *noper_next= regnext(noper);
944e05e3
YO
2580 if (noper_next < tail)
2581 noper= noper_next;
2582 }
2583
dca5fc4c
YO
2584 if ( noper < tail &&
2585 (
2586 OP(noper) == flags ||
2587 (
2588 flags == EXACTFU &&
2589 OP(noper) == EXACTFU_SS
2590 )
2591 )
2592 ) {
944e05e3
YO
2593 uc= (U8*)STRING(noper);
2594 e= uc + STR_LEN(noper);
2595 } else {
2596 trie->minlen= 0;
2597 continue;
3dab1dad 2598 }
df826430 2599
944e05e3 2600
fab2782b 2601 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2602 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2603 regardless of encoding */
fab2782b
YO
2604 if (OP( noper ) == EXACTFU_SS) {
2605 /* false positives are ok, so just set this */
0dc4a61d 2606 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2607 }
2608 }
dca5fc4c 2609
bc031a7d
KW
2610 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2611 branch */
3dab1dad 2612 TRIE_CHARCOUNT(trie)++;
a3621e74 2613 TRIE_READ_CHAR;
645de4ce 2614
bc031a7d
KW
2615 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2616 * is in effect. Under /i, this character can match itself, or
2617 * anything that folds to it. If not under /i, it can match just
2618 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2619 * all fold to k, and all are single characters. But some folds
2620 * expand to more than one character, so for example LATIN SMALL
2621 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2622 * the string beginning at 'uc' is 'ffi', it could be matched by
2623 * three characters, or just by the one ligature character. (It
2624 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2625 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2626 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2627 * match.) The trie needs to know the minimum and maximum number
2628 * of characters that could match so that it can use size alone to
2629 * quickly reject many match attempts. The max is simple: it is
2630 * the number of folded characters in this branch (since a fold is
2631 * never shorter than what folds to it. */
2632
2633 maxchars++;
2634
2635 /* And the min is equal to the max if not under /i (indicated by
2636 * 'folder' being NULL), or there are no multi-character folds. If
2637 * there is a multi-character fold, the min is incremented just
2638 * once, for the character that folds to the sequence. Each
2639 * character in the sequence needs to be added to the list below of
2640 * characters in the trie, but we count only the first towards the
2641 * min number of characters needed. This is done through the
2642 * variable 'foldlen', which is returned by the macros that look
2643 * for these sequences as the number of bytes the sequence
2644 * occupies. Each time through the loop, we decrement 'foldlen' by
2645 * how many bytes the current char occupies. Only when it reaches
2646 * 0 do we increment 'minchars' or look for another multi-character
2647 * sequence. */
2648 if (folder == NULL) {
2649 minchars++;
2650 }
2651 else if (foldlen > 0) {
2652 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
645de4ce
KW
2653 }
2654 else {
bc031a7d
KW
2655 minchars++;
2656
2657 /* See if *uc is the beginning of a multi-character fold. If
2658 * so, we decrement the length remaining to look at, to account
2659 * for the current character this iteration. (We can use 'uc'
2660 * instead of the fold returned by TRIE_READ_CHAR because for
2661 * non-UTF, the latin1_safe macro is smart enough to account
2662 * for all the unfolded characters, and because for UTF, the
2663 * string will already have been folded earlier in the
2664 * compilation process */
2665 if (UTF) {
2666 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2667 foldlen -= UTF8SKIP(uc);
645de4ce
KW
2668 }
2669 }
bc031a7d
KW
2670 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2671 foldlen--;
2672 }
645de4ce 2673 }
bc031a7d
KW
2674
2675 /* The current character (and any potential folds) should be added
2676 * to the possible matching characters for this position in this
2677 * branch */
a3621e74 2678 if ( uvc < 256 ) {
fab2782b
YO
2679 if ( folder ) {
2680 U8 folded= folder[ (U8) uvc ];
2681 if ( !trie->charmap[ folded ] ) {
2682 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2683 TRIE_STORE_REVCHAR( folded );
2684 }
2685 }
a3621e74
YO
2686 if ( !trie->charmap[ uvc ] ) {
2687 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2688 TRIE_STORE_REVCHAR( uvc );
a3621e74 2689 }
02daf0ab 2690 if ( set_bit ) {
62012aee
KW
2691 /* store the codepoint in the bitmap, and its folded
2692 * equivalent. */
8bcafbf4 2693 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
02daf0ab
YO
2694 set_bit = 0; /* We've done our bit :-) */
2695 }
a3621e74 2696 } else {
bc031a7d
KW
2697
2698 /* XXX We could come up with the list of code points that fold
2699 * to this using PL_utf8_foldclosures, except not for
2700 * multi-char folds, as there may be multiple combinations
2701 * there that could work, which needs to wait until runtime to
2702 * resolve (The comment about LIGATURE FFI above is such an
2703 * example */
2704
a3621e74 2705 SV** svpp;
55eed653
NC
2706 if ( !widecharmap )
2707 widecharmap = newHV();
a3621e74 2708
55eed653 2709 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
2710
2711 if ( !svpp )
147e3846 2712 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
a3621e74
YO
2713
2714 if ( !SvTRUE( *svpp ) ) {
2715 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 2716 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
2717 }
2718 }
bc031a7d
KW
2719 } /* end loop through characters in this branch of the trie */
2720
2721 /* We take the min and max for this branch and combine to find the min
2722 * and max for all branches processed so far */
3dab1dad 2723 if( cur == first ) {
bc031a7d
KW
2724 trie->minlen = minchars;
2725 trie->maxlen = maxchars;
2726 } else if (minchars < trie->minlen) {
2727 trie->minlen = minchars;
2728 } else if (maxchars > trie->maxlen) {
2729 trie->maxlen = maxchars;
fab2782b 2730 }
a3621e74
YO
2731 } /* end first pass */
2732 DEBUG_TRIE_COMPILE_r(
6ad9a8ab 2733 Perl_re_indentf( aTHX_
cb41e5d6
YO
2734 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2735 depth+1,
55eed653 2736 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
2737 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2738 (int)trie->minlen, (int)trie->maxlen )
a3621e74 2739 );
a3621e74
YO
2740
2741 /*
2742 We now know what we are dealing with in terms of unique chars and
2743 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
2744 representation using a flat table will take. If it's over a reasonable
2745 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
2746 conservative but potentially much slower representation using an array
2747 of lists.
2748
2749 At the end we convert both representations into the same compressed
2750 form that will be used in regexec.c for matching with. The latter
2751 is a form that cannot be used to construct with but has memory
2752 properties similar to the list form and access properties similar
2753 to the table form making it both suitable for fast searches and
2754 small enough that its feasable to store for the duration of a program.
2755
2756 See the comment in the code where the compressed table is produced
2757 inplace from the flat tabe representation for an explanation of how
2758 the compression works.
2759
2760 */
2761
2762
2e64971a
DM
2763 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2764 prev_states[1] = 0;
2765
538e84ed
KW
2766 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2767 > SvIV(re_trie_maxbuff) )
2768 {
a3621e74
YO
2769 /*
2770 Second Pass -- Array Of Lists Representation
2771
2772 Each state will be represented by a list of charid:state records
2773 (reg_trie_trans_le) the first such element holds the CUR and LEN
2774 points of the allocated array. (See defines above).
2775
2776 We build the initial structure using the lists, and then convert
2777 it into the compressed table form which allows faster lookups
2778 (but cant be modified once converted).
a3621e74
YO
2779 */
2780
a3621e74
YO
2781 STRLEN transcount = 1;
2782
6ad9a8ab 2783 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
cb41e5d6 2784 depth+1));
686b73d4 2785
c944940b
JH
2786 trie->states = (reg_trie_state *)
2787 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2788 sizeof(reg_trie_state) );
a3621e74
YO
2789 TRIE_LIST_NEW(1);
2790 next_alloc = 2;
2791
2792 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2793
df826430 2794 regnode *noper = NEXTOPER( cur );
c445ea15
AL
2795 U32 state = 1; /* required init */
2796 U16 charid = 0; /* sanity init */
07be1b83 2797 U32 wordlen = 0; /* required init */
c445ea15 2798
df826430
YO
2799 if (OP(noper) == NOTHING) {
2800 regnode *noper_next= regnext(noper);
944e05e3
YO
2801 if (noper_next < tail)
2802 noper= noper_next;
df826430
YO
2803 }
2804
944e05e3
YO
2805 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2806 const U8 *uc= (U8*)STRING(noper);
2807 const U8 *e= uc + STR_LEN(noper);
2808
786e8c11 2809 for ( ; uc < e ; uc += len ) {
c445ea15 2810
786e8c11 2811 TRIE_READ_CHAR;
c445ea15 2812
786e8c11
YO
2813 if ( uvc < 256 ) {
2814 charid = trie->charmap[ uvc ];
c445ea15 2815 } else {
538e84ed
KW
2816 SV** const svpp = hv_fetch( widecharmap,
2817 (char*)&uvc,
2818 sizeof( UV ),
2819 0);
786e8c11
YO
2820 if ( !svpp ) {
2821 charid = 0;
2822 } else {
2823 charid=(U16)SvIV( *svpp );
2824 }
c445ea15 2825 }
538e84ed
KW
2826 /* charid is now 0 if we dont know the char read, or
2827 * nonzero if we do */
786e8c11 2828 if ( charid ) {
a3621e74 2829
786e8c11
YO
2830 U16 check;
2831 U32 newstate = 0;
a3621e74 2832
786e8c11
YO
2833 charid--;
2834 if ( !trie->states[ state ].trans.list ) {
2835 TRIE_LIST_NEW( state );
c445ea15 2836 }
538e84ed
KW
2837 for ( check = 1;
2838 check <= TRIE_LIST_USED( state );
2839 check++ )
2840 {
2841 if ( TRIE_LIST_ITEM( state, check ).forid
2842 == charid )
2843 {
786e8c11
YO
2844 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2845 break;
2846 }
2847 }
2848 if ( ! newstate ) {
2849 newstate = next_alloc++;
2e64971a 2850 prev_states[newstate] = state;
786e8c11
YO
2851 TRIE_LIST_PUSH( state, charid, newstate );
2852 transcount++;
2853 }
2854 state = newstate;
2855 } else {
147e3846 2856 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
c445ea15 2857 }
a28509cc 2858 }
c445ea15 2859 }
3dab1dad 2860 TRIE_HANDLE_WORD(state);
a3621e74
YO
2861
2862 } /* end second pass */
2863
1e2e3d02 2864 /* next alloc is the NEXT state to be allocated */
538e84ed 2865 trie->statecount = next_alloc;
c944940b
JH
2866 trie->states = (reg_trie_state *)
2867 PerlMemShared_realloc( trie->states,
2868 next_alloc
2869 * sizeof(reg_trie_state) );
a3621e74 2870
3dab1dad 2871 /* and now dump it out before we compress it */
2b8b4781
NC
2872 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2873 revcharmap, next_alloc,
2874 depth+1)
1e2e3d02 2875 );
a3621e74 2876
c944940b
JH
2877 trie->trans = (reg_trie_trans *)
2878 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
2879 {
2880 U32 state;
a3621e74
YO
2881 U32 tp = 0;
2882 U32 zp = 0;
2883
2884
2885 for( state=1 ; state < next_alloc ; state ++ ) {
2886 U32 base=0;
2887
2888 /*
2889 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 2890 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
a3621e74
YO
2891 );
2892 */
2893
2894 if (trie->states[state].trans.list) {
2895 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2896 U16 maxid=minid;
a28509cc 2897 U16 idx;
a3621e74
YO
2898
2899 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
2900 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2901 if ( forid < minid ) {
2902 minid=forid;
2903 } else if ( forid > maxid ) {
2904 maxid=forid;
2905 }
a3621e74
YO
2906 }
2907 if ( transcount < tp + maxid - minid + 1) {
2908 transcount *= 2;
c944940b
JH
2909 trie->trans = (reg_trie_trans *)
2910 PerlMemShared_realloc( trie->trans,
446bd890
NC
2911 transcount
2912 * sizeof(reg_trie_trans) );
538e84ed
KW
2913 Zero( trie->trans + (transcount / 2),
2914 transcount / 2,
2915 reg_trie_trans );
a3621e74
YO
2916 }
2917 base = trie->uniquecharcount + tp - minid;
2918 if ( maxid == minid ) {
2919 U32 set = 0;
2920 for ( ; zp < tp ; zp++ ) {
2921 if ( ! trie->trans[ zp ].next ) {
2922 base = trie->uniquecharcount + zp - minid;
538e84ed
KW
2923 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2924 1).newstate;
a3621e74
YO
2925 trie->trans[ zp ].check = state;
2926 set = 1;
2927 break;
2928 }
2929 }
2930 if ( !set ) {
538e84ed
KW
2931 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2932 1).newstate;
a3621e74
YO
2933 trie->trans[ tp ].check = state;
2934 tp++;
2935 zp = tp;
2936 }
2937 } else {
2938 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
538e84ed
KW
2939 const U32 tid = base
2940 - trie->uniquecharcount
2941 + TRIE_LIST_ITEM( state, idx ).forid;
2942 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2943 idx ).newstate;
a3621e74
YO
2944 trie->trans[ tid ].check = state;
2945 }
2946 tp += ( maxid - minid + 1 );
2947 }
2948 Safefree(trie->states[ state ].trans.list);
2949 }
2950 /*
2951 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 2952 Perl_re_printf( aTHX_ " base: %d\n",base);
a3621e74
YO
2953 );
2954 */
2955 trie->states[ state ].trans.base=base;
2956 }
cc601c31 2957 trie->lasttrans = tp + 1;
a3621e74
YO
2958 }
2959 } else {
2960 /*
2961 Second Pass -- Flat Table Representation.
2962
b423522f
KW
2963 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2964 each. We know that we will need Charcount+1 trans at most to store
2965 the data (one row per char at worst case) So we preallocate both
2966 structures assuming worst case.
a3621e74
YO
2967
2968 We then construct the trie using only the .next slots of the entry
2969 structs.
2970
b423522f
KW
2971 We use the .check field of the first entry of the node temporarily
2972 to make compression both faster and easier by keeping track of how
2973 many non zero fields are in the node.
a3621e74
YO
2974
2975 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2976 transition.
2977
b423522f
KW
2978 There are two terms at use here: state as a TRIE_NODEIDX() which is
2979 a number representing the first entry of the node, and state as a
2980 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2981 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2982 if there are 2 entrys per node. eg:
a3621e74
YO
2983
2984 A B A B
2985 1. 2 4 1. 3 7
2986 2. 0 3 3. 0 5
2987 3. 0 0 5. 0 0
2988 4. 0 0 7. 0 0
2989
b423522f
KW
2990 The table is internally in the right hand, idx form. However as we
2991 also have to deal with the states array which is indexed by nodenum
2992 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
2993
2994 */
6ad9a8ab 2995 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
cb41e5d6 2996 depth+1));
3dab1dad 2997
c944940b
JH
2998 trie->trans = (reg_trie_trans *)
2999 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3000 * trie->uniquecharcount + 1,
3001 sizeof(reg_trie_trans) );
3002 trie->states = (reg_trie_state *)
3003 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3004 sizeof(reg_trie_state) );
a3621e74
YO
3005 next_alloc = trie->uniquecharcount + 1;
3006
3dab1dad 3007
a3621e74
YO
3008 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3009
df826430 3010 regnode *noper = NEXTOPER( cur );
a3621e74
YO
3011
3012 U32 state = 1; /* required init */
3013
3014 U16 charid = 0; /* sanity init */
3015 U32 accept_state = 0; /* sanity init */
a3621e74 3016
07be1b83 3017 U32 wordlen = 0; /* required init */
a3621e74 3018
df826430
YO
3019 if (OP(noper) == NOTHING) {
3020 regnode *noper_next= regnext(noper);
944e05e3
YO
3021 if (noper_next < tail)
3022 noper= noper_next;
df826430 3023 }
fab2782b 3024
944e05e3
YO
3025 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
3026 const U8 *uc= (U8*)STRING(noper);
3027 const U8 *e= uc + STR_LEN(noper);
3028
786e8c11 3029 for ( ; uc < e ; uc += len ) {
a3621e74 3030
786e8c11 3031 TRIE_READ_CHAR;
a3621e74 3032
786e8c11
YO
3033 if ( uvc < 256 ) {
3034 charid = trie->charmap[ uvc ];
3035 } else {
538e84ed
KW
3036 SV* const * const svpp = hv_fetch( widecharmap,
3037 (char*)&uvc,
3038 sizeof( UV ),
3039 0);
786e8c11 3040 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 3041 }
786e8c11
YO
3042 if ( charid ) {
3043 charid--;
3044 if ( !trie->trans[ state + charid ].next ) {
3045 trie->trans[ state + charid ].next = next_alloc;
3046 trie->trans[ state ].check++;
2e64971a
DM
3047 prev_states[TRIE_NODENUM(next_alloc)]
3048 = TRIE_NODENUM(state);
786e8c11
YO
3049 next_alloc += trie->uniquecharcount;
3050 }
3051 state = trie->trans[ state + charid ].next;
3052 } else {
147e3846 3053 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
786e8c11 3054 }
538e84ed
KW
3055 /* charid is now 0 if we dont know the char read, or
3056 * nonzero if we do */
a3621e74 3057 }
a3621e74 3058 }
3dab1dad
YO
3059 accept_state = TRIE_NODENUM( state );
3060 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
3061
3062 } /* end second pass */
3063
3dab1dad 3064 /* and now dump it out before we compress it */
2b8b4781
NC
3065 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3066 revcharmap,
3067 next_alloc, depth+1));
a3621e74 3068
a3621e74
YO
3069 {
3070 /*
3071 * Inplace compress the table.*
3072
3073 For sparse data sets the table constructed by the trie algorithm will
3074 be mostly 0/FAIL transitions or to put it another way mostly empty.
3075 (Note that leaf nodes will not contain any transitions.)
3076
3077 This algorithm compresses the tables by eliminating most such
3078 transitions, at the cost of a modest bit of extra work during lookup:
3079
3080 - Each states[] entry contains a .base field which indicates the
3081 index in the state[] array wheres its transition data is stored.
3082
3b753521 3083 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
3084
3085 - If .base is nonzero then charid is added to it to find an entry in
3086 the trans array.
3087
3088 -If trans[states[state].base+charid].check!=state then the
3089 transition is taken to be a 0/Fail transition. Thus if there are fail
3090 transitions at the front of the node then the .base offset will point
3091 somewhere inside the previous nodes data (or maybe even into a node
3092 even earlier), but the .check field determines if the transition is
3093 valid.
3094
786e8c11 3095 XXX - wrong maybe?
a3621e74 3096 The following process inplace converts the table to the compressed
3b753521 3097 table: We first do not compress the root node 1,and mark all its
a3621e74 3098 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
3099 allows us to do a DFA construction from the compressed table later,
3100 and ensures that any .base pointers we calculate later are greater
3101 than 0.
a3621e74
YO
3102
3103 - We set 'pos' to indicate the first entry of the second node.
3104
3105 - We then iterate over the columns of the node, finding the first and
3106 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3107 and set the .check pointers accordingly, and advance pos
3108 appropriately and repreat for the next node. Note that when we copy
3109 the next pointers we have to convert them from the original
3110 NODEIDX form to NODENUM form as the former is not valid post
3111 compression.
3112
3113 - If a node has no transitions used we mark its base as 0 and do not
3114 advance the pos pointer.
3115
3116 - If a node only has one transition we use a second pointer into the
3117 structure to fill in allocated fail transitions from other states.
3118 This pointer is independent of the main pointer and scans forward
3119 looking for null transitions that are allocated to a state. When it
3120 finds one it writes the single transition into the "hole". If the
786e8c11 3121 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
3122
3123 - Once compressed we can Renew/realloc the structures to release the
3124 excess space.
3125
3126 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3127 specifically Fig 3.47 and the associated pseudocode.
3128
3129 demq
3130 */
a3b680e6 3131 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 3132 U32 state, charid;
a3621e74 3133 U32 pos = 0, zp=0;
1e2e3d02 3134 trie->statecount = laststate;
a3621e74
YO
3135
3136 for ( state = 1 ; state < laststate ; state++ ) {
3137 U8 flag = 0;
a28509cc
AL
3138 const U32 stateidx = TRIE_NODEIDX( state );
3139 const U32 o_used = trie->trans[ stateidx ].check;
3140 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
3141 trie->trans[ stateidx ].check = 0;
3142
538e84ed
KW
3143 for ( charid = 0;
3144 used && charid < trie->uniquecharcount;
3145 charid++ )
3146 {
a3621e74
YO
3147 if ( flag || trie->trans[ stateidx + charid ].next ) {
3148 if ( trie->trans[ stateidx + charid ].next ) {
3149 if (o_used == 1) {
3150 for ( ; zp < pos ; zp++ ) {
3151 if ( ! trie->trans[ zp ].next ) {
3152 break;
3153 }
3154 }
538e84ed
KW
3155 trie->states[ state ].trans.base
3156 = zp
3157 + trie->uniquecharcount
3158 - charid ;
3159 trie->trans[ zp ].next
3160 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3161 + charid ].next );
a3621e74
YO
3162 trie->trans[ zp ].check = state;
3163 if ( ++zp > pos ) pos = zp;
3164 break;
3165 }
3166 used--;
3167 }
3168 if ( !flag ) {
3169 flag = 1;
538e84ed
KW
3170 trie->states[ state ].trans.base
3171 = pos + trie->uniquecharcount - charid ;
a3621e74 3172 }
538e84ed
KW
3173 trie->trans[ pos ].next
3174 = SAFE_TRIE_NODENUM(
3175 trie->trans[ stateidx + charid ].next );
a3621e74
YO
3176 trie->trans[ pos ].check = state;
3177 pos++;
3178 }
3179 }
3180 }
cc601c31 3181 trie->lasttrans = pos + 1;
c944940b
JH
3182 trie->states = (reg_trie_state *)
3183 PerlMemShared_realloc( trie->states, laststate
3184 * sizeof(reg_trie_state) );
a3621e74 3185 DEBUG_TRIE_COMPILE_MORE_r(
147e3846 3186 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
cb41e5d6 3187 depth+1,
538e84ed
KW
3188 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3189 + 1 ),
3190 (IV)next_alloc,
3191 (IV)pos,
3192 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
a3621e74
YO
3193 );
3194
3195 } /* end table compress */
3196 }
1e2e3d02 3197 DEBUG_TRIE_COMPILE_MORE_r(
147e3846 3198 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
cb41e5d6 3199 depth+1,
1e2e3d02
YO
3200 (UV)trie->statecount,
3201 (UV)trie->lasttrans)
3202 );
cc601c31 3203 /* resize the trans array to remove unused space */
c944940b
JH
3204 trie->trans = (reg_trie_trans *)
3205 PerlMemShared_realloc( trie->trans, trie->lasttrans
3206 * sizeof(reg_trie_trans) );
a3621e74 3207
538e84ed 3208 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
3209 U8 nodetype =(U8)(flags & 0xFF);
3210 char *str=NULL;
538e84ed 3211
07be1b83 3212#ifdef DEBUGGING
e62cc96a 3213 regnode *optimize = NULL;
7122b237
YO
3214#ifdef RE_TRACK_PATTERN_OFFSETS
3215
b57a0404
JH
3216 U32 mjd_offset = 0;
3217 U32 mjd_nodelen = 0;
7122b237
YO
3218#endif /* RE_TRACK_PATTERN_OFFSETS */
3219#endif /* DEBUGGING */
a3621e74 3220 /*
3dab1dad
YO
3221 This means we convert either the first branch or the first Exact,
3222 depending on whether the thing following (in 'last') is a branch
3223 or not and whther first is the startbranch (ie is it a sub part of
3224 the alternation or is it the whole thing.)
3b753521 3225 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 3226 the whole branch sequence, including the first.
a3621e74 3227 */
3dab1dad 3228 /* Find the node we are going to overwrite */
7f69552c 3229 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 3230 /* branch sub-chain */
3dab1dad 3231 NEXT_OFF( first ) = (U16)(last - first);
7122b237 3232#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
3233 DEBUG_r({
3234 mjd_offset= Node_Offset((convert));
3235 mjd_nodelen= Node_Length((convert));
3236 });
7122b237 3237#endif
7f69552c 3238 /* whole branch chain */
7122b237
YO
3239 }
3240#ifdef RE_TRACK_PATTERN_OFFSETS
3241 else {
7f69552c
YO
3242 DEBUG_r({
3243 const regnode *nop = NEXTOPER( convert );
3244 mjd_offset= Node_Offset((nop));
3245 mjd_nodelen= Node_Length((nop));
3246 });
07be1b83
YO
3247 }
3248 DEBUG_OPTIMISE_r(
147e3846 3249 Perl_re_indentf( aTHX_ "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
cb41e5d6 3250 depth+1,
786e8c11 3251 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 3252 );
7122b237 3253#endif
538e84ed 3254 /* But first we check to see if there is a common prefix we can
3dab1dad
YO
3255 split out as an EXACT and put in front of the TRIE node. */
3256 trie->startstate= 1;
55eed653 3257 if ( trie->bitmap && !widecharmap && !trie->jump ) {
5ee57374
YO
3258 /* we want to find the first state that has more than
3259 * one transition, if that state is not the first state
3260 * then we have a common prefix which we can remove.
3261 */
3dab1dad 3262 U32 state;
1e2e3d02 3263 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 3264 U32 ofs = 0;
ee0dfd0b
YO
3265 I32 first_ofs = -1; /* keeps track of the ofs of the first
3266 transition, -1 means none */
8e11feef
RGS
3267 U32 count = 0;
3268 const U32 base = trie->states[ state ].trans.base;
a3621e74 3269
5ee57374 3270 /* does this state terminate an alternation? */
3dab1dad 3271 if ( trie->states[state].wordnum )
8e11feef 3272 count = 1;
a3621e74 3273
8e11feef 3274 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
3275 if ( ( base + ofs >= trie->uniquecharcount ) &&
3276 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
3277 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3278 {
3dab1dad 3279 if ( ++count > 1 ) {
5ee57374 3280 /* we have more than one transition */
d3d91c7c
YO
3281 SV **tmp;
3282 U8 *ch;
5ee57374
YO
3283 /* if this is the first state there is no common prefix
3284 * to extract, so we can exit */
8e11feef 3285 if ( state == 1 ) break;
d3d91c7c
YO
3286 tmp = av_fetch( revcharmap, ofs, 0);
3287 ch = (U8*)SvPV_nolen_const( *tmp );
3288
5ee57374
YO
3289 /* if we are on count 2 then we need to initialize the
3290 * bitmap, and store the previous char if there was one
3291 * in it*/
3dab1dad 3292 if ( count == 2 ) {
5ee57374 3293 /* clear the bitmap */
3dab1dad
YO
3294 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3295 DEBUG_OPTIMISE_r(
147e3846 3296 Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [",
cb41e5d6 3297 depth+1,
786e8c11 3298 (UV)state));
ee0dfd0b
YO
3299 if (first_ofs >= 0) {
3300 SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
be8e71aa 3301 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 3302
8bcafbf4 3303 TRIE_BITMAP_SET_FOLDED(trie,*ch,folder);
3dab1dad 3304 DEBUG_OPTIMISE_r(
6ad9a8ab 3305 Perl_re_printf( aTHX_ "%s", (char*)ch)
3dab1dad 3306 );
8e11feef
RGS
3307 }
3308 }
8bcafbf4
YO
3309 /* store the current firstchar in the bitmap */
3310 TRIE_BITMAP_SET_FOLDED(trie,*ch,folder);
6ad9a8ab 3311 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
8e11feef 3312 }
ee0dfd0b 3313 first_ofs = ofs;
8e11feef 3314 }
3dab1dad
YO
3315 }
3316 if ( count == 1 ) {
5ee57374
YO
3317 /* This state has only one transition, its transition is part
3318 * of a common prefix - we need to concatenate the char it
3319 * represents to what we have so far. */
ee0dfd0b 3320 SV **tmp = av_fetch( revcharmap, first_ofs, 0);
c490c714
YO
3321 STRLEN len;
3322 char *ch = SvPV( *tmp, len );
de734bd5
A
3323 DEBUG_OPTIMISE_r({
3324 SV *sv=sv_newmortal();
147e3846 3325 Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
cb41e5d6 3326 depth+1,
ee0dfd0b 3327 (UV)state, (UV)first_ofs,
538e84ed 3328 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
de734bd5
A
3329 PL_colors[0], PL_colors[1],
3330 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed 3331 PERL_PV_ESCAPE_FIRSTCHAR
de734bd5
A
3332 )
3333 );
3334 });
3dab1dad
YO
3335 if ( state==1 ) {
3336 OP( convert ) = nodetype;
3337 str=STRING(convert);
3338 STR_LEN(convert)=0;
3339 }
c490c714
YO
3340 STR_LEN(convert) += len;
3341 while (len--)
de734bd5 3342 *str++ = *ch++;
8e11feef 3343 } else {
538e84ed 3344#ifdef DEBUGGING
8e11feef 3345 if (state>1)
6ad9a8ab 3346 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
f9049ba1 3347#endif
8e11feef
RGS
3348 break;
3349 }
3350 }
2e64971a 3351 trie->prefixlen = (state-1);
3dab1dad 3352 if (str) {
8e11feef 3353 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 3354 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 3355 trie->startstate = state;
07be1b83
YO
3356 trie->minlen -= (state - 1);
3357 trie->maxlen -= (state - 1);
33809eae
JH
3358#ifdef DEBUGGING
3359 /* At least the UNICOS C compiler choked on this
3360 * being argument to DEBUG_r(), so let's just have
3361 * it right here. */
3362 if (
3363#ifdef PERL_EXT_RE_BUILD
3364 1
3365#else
3366 DEBUG_r_TEST
3367#endif
3368 ) {
3369 regnode *fix = convert;
3370 U32 word = trie->wordcount;
3371 mjd_nodelen++;
3372 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3373 while( ++fix < n ) {
3374 Set_Node_Offset_Length(fix, 0, 0);
3375 }
3376 while (word--) {
3377 SV ** const tmp = av_fetch( trie_words, word, 0 );
3378 if (tmp) {
3379 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3380 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3381 else
3382 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3383 }
3384 }
3385 }
3386#endif
8e11feef
RGS
3387 if (trie->maxlen) {
3388 convert = n;
3389 } else {
3dab1dad 3390 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 3391 DEBUG_r(optimize= n);
3dab1dad
YO
3392 }
3393 }
3394 }
538e84ed
KW
3395 if (!jumper)
3396 jumper = last;
3dab1dad 3397 if ( trie->maxlen ) {
8e11feef
RGS
3398 NEXT_OFF( convert ) = (U16)(tail - convert);
3399 ARG_SET( convert, data_slot );
538e84ed
KW
3400 /* Store the offset to the first unabsorbed branch in
3401 jump[0], which is otherwise unused by the jump logic.
786e8c11 3402 We use this when dumping a trie and during optimisation. */
538e84ed 3403 if (trie->jump)
7f69552c 3404 trie->jump[0] = (U16)(nextbranch - convert);
538e84ed 3405
6c48061a
YO
3406 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3407 * and there is a bitmap
3408 * and the first "jump target" node we found leaves enough room
3409 * then convert the TRIE node into a TRIEC node, with the bitmap
3410 * embedded inline in the opcode - this is hypothetically faster.
3411 */
3412 if ( !trie->states[trie->startstate].wordnum
3413 && trie->bitmap
3414 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
3415 {
3416 OP( convert ) = TRIEC;
3417 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 3418 PerlMemShared_free(trie->bitmap);
786e8c11 3419 trie->bitmap= NULL;
538e84ed 3420 } else
786e8c11 3421 OP( convert ) = TRIE;
a3621e74 3422
3dab1dad
YO
3423 /* store the type in the flags */
3424 convert->flags = nodetype;
a5ca303d 3425 DEBUG_r({
538e84ed
KW
3426 optimize = convert
3427 + NODE_STEP_REGNODE
a5ca303d
YO
3428 + regarglen[ OP( convert ) ];
3429 });
538e84ed 3430 /* XXX We really should free up the resource in trie now,
a5ca303d 3431 as we won't use them - (which resources?) dmq */
3dab1dad 3432 }
a3621e74 3433 /* needed for dumping*/
e62cc96a 3434 DEBUG_r(if (optimize) {
07be1b83 3435 regnode *opt = convert;
bcdf7404 3436
e62cc96a 3437 while ( ++opt < optimize) {
07be1b83
YO
3438 Set_Node_Offset_Length(opt,0,0);
3439 }
538e84ed
KW
3440 /*
3441 Try to clean up some of the debris left after the
786e8c11 3442 optimisation.
a3621e74 3443 */
786e8c11 3444 while( optimize < jumper ) {
07be1b83 3445 mjd_nodelen += Node_Length((optimize));
a3621e74 3446 OP( optimize ) = OPTIMIZED;
07be1b83 3447 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
3448 optimize++;
3449 }
07be1b83 3450 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
3451 });
3452 } /* end node insert */
2e64971a
DM
3453
3454 /* Finish populating the prev field of the wordinfo array. Walk back
3455 * from each accept state until we find another accept state, and if
3456 * so, point the first word's .prev field at the second word. If the
3457 * second already has a .prev field set, stop now. This will be the
3458 * case either if we've already processed that word's accept state,
3b753521
FN
3459 * or that state had multiple words, and the overspill words were
3460 * already linked up earlier.
2e64971a
DM
3461 */
3462 {
3463 U16 word;
3464 U32 state;
3465 U16 prev;
3466
3467