This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
avoid double-freeing regex code blocks
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
afda64e8 84EXTERN_C const struct regexp_engine my_reg_engine;
54df2634
NC
85#else
86# include "regcomp.h"
87#endif
a687059c 88
f7e03a10 89#include "dquote_inline.h"
b992490d 90#include "invlist_inline.h"
1b0f46bf 91#include "unicode_constants.h"
04e98a4d 92
538e84ed
KW
93#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
94 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
f12c0118
KW
95#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
96 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 97#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 98#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 99
a687059c
LW
100#ifndef STATIC
101#define STATIC static
102#endif
103
3f910e62
YO
104/* this is a chain of data about sub patterns we are processing that
105 need to be handled separately/specially in study_chunk. Its so
106 we can simulate recursion without losing state. */
107struct scan_frame;
108typedef struct scan_frame {
109 regnode *last_regnode; /* last node to process in this frame */
110 regnode *next_regnode; /* next node to process when last is reached */
111 U32 prev_recursed_depth;
112 I32 stopparen; /* what stopparen do we use */
113 U32 is_top_frame; /* what flags do we use? */
114
115 struct scan_frame *this_prev_frame; /* this previous frame */
116 struct scan_frame *prev_frame; /* previous frame */
117 struct scan_frame *next_frame; /* next frame */
118} scan_frame;
119
f4ae5a27
KW
120/* Certain characters are output as a sequence with the first being a
121 * backslash. */
122#define isBACKSLASHED_PUNCT(c) \
123 ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
124
125
09b2b2e6 126struct RExC_state_t {
514a91f1
DM
127 U32 flags; /* RXf_* are we folding, multilining? */
128 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 129 char *precomp; /* uncompiled string. */
711b303b 130 char *precomp_end; /* pointer to end of uncompiled string. */
288b8c02 131 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf 132 regexp *rx; /* perl core regexp structure */
538e84ed
KW
133 regexp_internal *rxi; /* internal data for regexp object
134 pprivate field */
fac92740 135 char *start; /* Start of input for compile */
830247a4
IZ
136 char *end; /* End of input for compile */
137 char *parse; /* Input-scan pointer. */
285b5ca0
KW
138 char *adjusted_start; /* 'start', adjusted. See code use */
139 STRLEN precomp_adj; /* an offset beyond precomp. See code use */
ea3daa5d 140 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 141 regnode *emit_start; /* Start of emitted-code area */
538e84ed
KW
142 regnode *emit_bound; /* First regnode outside of the
143 allocated space */
f7c7e32a
DM
144 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
145 implies compiling, so don't emit */
9a81a976
KW
146 regnode_ssc emit_dummy; /* placeholder for emit to point to;
147 large enough for the largest
148 non-EXACTish node, so can use it as
149 scratch in pass1 */
830247a4
IZ
150 I32 naughty; /* How bad is this pattern? */
151 I32 sawback; /* Did we see \1, ...? */
152 U32 seen;
ea3daa5d 153 SSize_t size; /* Code size. */
538e84ed
KW
154 I32 npar; /* Capture buffer count, (OPEN) plus
155 one. ("par" 0 is the whole
156 pattern)*/
157 I32 nestroot; /* root parens we are in - used by
158 accept */
830247a4
IZ
159 I32 extralen;
160 I32 seen_zerolen;
40d049e4
YO
161 regnode **open_parens; /* pointers to open parens */
162 regnode **close_parens; /* pointers to close parens */
d5a00e4a 163 regnode *end_op; /* END node in program */
02daf0ab
YO
164 I32 utf8; /* whether the pattern is utf8 or not */
165 I32 orig_utf8; /* whether the pattern was originally in utf8 */
166 /* XXX use this for future optimisation of case
167 * where pattern must be upgraded to utf8. */
e40e74fe
KW
168 I32 uni_semantics; /* If a d charset modifier should use unicode
169 rules, even if the pattern is not in
170 utf8 */
81714fb9 171 HV *paren_names; /* Paren names */
538e84ed 172
40d049e4 173 regnode **recurse; /* Recurse regops */
d5a00e4a 174 I32 recurse_count; /* Number of recurse regops we have generated */
4286711a 175 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
538e84ed 176 through */
09a65838 177 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 178 I32 in_lookbehind;
4624b182 179 I32 contains_locale;
bb3f3ed2 180 I32 override_recoding;
b6d67071
KW
181#ifdef EBCDIC
182 I32 recode_x_to_native;
183#endif
9d53c457 184 I32 in_multi_char_class;
1acab4c5 185 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
68e2671b 186 within pattern */
b1603ef8 187 int code_index; /* next code_blocks[] slot */
ee273784 188 SSize_t maxlen; /* mininum possible number of chars in string to match */
3f910e62
YO
189 scan_frame *frame_head;
190 scan_frame *frame_last;
191 U32 frame_count;
7eec73eb 192 AV *warn_text;
dc6d7f5c 193#ifdef ADD_TO_REGEXEC
830247a4
IZ
194 char *starttry; /* -Dr: where regtry was called. */
195#define RExC_starttry (pRExC_state->starttry)
196#endif
d24ca0c5 197 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 198#ifdef DEBUGGING
be8e71aa 199 const char *lastparse;
3dab1dad 200 I32 lastnum;
1f1031fe 201 AV *paren_name_list; /* idx -> name */
d9a72fcc 202 U32 study_chunk_recursed_count;
c9f0d54c
YO
203 SV *mysv1;
204 SV *mysv2;
3dab1dad
YO
205#define RExC_lastparse (pRExC_state->lastparse)
206#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 207#define RExC_paren_name_list (pRExC_state->paren_name_list)
d9a72fcc 208#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
c9f0d54c
YO
209#define RExC_mysv (pRExC_state->mysv1)
210#define RExC_mysv1 (pRExC_state->mysv1)
211#define RExC_mysv2 (pRExC_state->mysv2)
212
3dab1dad 213#endif
512c0f5a 214 bool seen_unfolded_sharp_s;
911bd04e 215 bool strict;
da7cf1cc 216 bool study_started;
09b2b2e6 217};
830247a4 218
e2509266 219#define RExC_flags (pRExC_state->flags)
514a91f1 220#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 221#define RExC_precomp (pRExC_state->precomp)
285b5ca0
KW
222#define RExC_precomp_adj (pRExC_state->precomp_adj)
223#define RExC_adjusted_start (pRExC_state->adjusted_start)
711b303b 224#define RExC_precomp_end (pRExC_state->precomp_end)
288b8c02 225#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 226#define RExC_rx (pRExC_state->rx)
f8fc2ecf 227#define RExC_rxi (pRExC_state->rxi)
fac92740 228#define RExC_start (pRExC_state->start)
830247a4
IZ
229#define RExC_end (pRExC_state->end)
230#define RExC_parse (pRExC_state->parse)
231#define RExC_whilem_seen (pRExC_state->whilem_seen)
512c0f5a
KW
232
233/* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
234 * EXACTF node, hence was parsed under /di rules. If later in the parse,
235 * something forces the pattern into using /ui rules, the sharp s should be
236 * folded into the sequence 'ss', which takes up more space than previously
237 * calculated. This means that the sizing pass needs to be restarted. (The
238 * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node
239 * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
240 * so there is no need to resize [perl #125990]. */
241#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
242
7122b237 243#ifdef RE_TRACK_PATTERN_OFFSETS
538e84ed
KW
244#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
245 others */
7122b237 246#endif
830247a4 247#define RExC_emit (pRExC_state->emit)
f7c7e32a 248#define RExC_emit_dummy (pRExC_state->emit_dummy)
fac92740 249#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 250#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
251#define RExC_sawback (pRExC_state->sawback)
252#define RExC_seen (pRExC_state->seen)
253#define RExC_size (pRExC_state->size)
ee273784 254#define RExC_maxlen (pRExC_state->maxlen)
830247a4 255#define RExC_npar (pRExC_state->npar)
e2e6a0f1 256#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
257#define RExC_extralen (pRExC_state->extralen)
258#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 259#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 260#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 261#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
262#define RExC_open_parens (pRExC_state->open_parens)
263#define RExC_close_parens (pRExC_state->close_parens)
5bd2d46e 264#define RExC_end_op (pRExC_state->end_op)
81714fb9 265#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
266#define RExC_recurse (pRExC_state->recurse)
267#define RExC_recurse_count (pRExC_state->recurse_count)
09a65838 268#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
538e84ed
KW
269#define RExC_study_chunk_recursed_bytes \
270 (pRExC_state->study_chunk_recursed_bytes)
b57e4118 271#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 272#define RExC_contains_locale (pRExC_state->contains_locale)
b6d67071
KW
273#ifdef EBCDIC
274# define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
275#endif
9d53c457 276#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
3f910e62
YO
277#define RExC_frame_head (pRExC_state->frame_head)
278#define RExC_frame_last (pRExC_state->frame_last)
279#define RExC_frame_count (pRExC_state->frame_count)
67cdf558 280#define RExC_strict (pRExC_state->strict)
da7cf1cc 281#define RExC_study_started (pRExC_state->study_started)
7eec73eb 282#define RExC_warn_text (pRExC_state->warn_text)
830247a4 283
99807a43
HS
284/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
285 * a flag to disable back-off on the fixed/floating substrings - if it's
286 * a high complexity pattern we assume the benefit of avoiding a full match
287 * is worth the cost of checking for the substrings even if they rarely help.
288 */
289#define RExC_naughty (pRExC_state->naughty)
290#define TOO_NAUGHTY (10)
291#define MARK_NAUGHTY(add) \
292 if (RExC_naughty < TOO_NAUGHTY) \
293 RExC_naughty += (add)
294#define MARK_NAUGHTY_EXP(exp, add) \
295 if (RExC_naughty < TOO_NAUGHTY) \
296 RExC_naughty += RExC_naughty / (exp) + (add)
cde0cee5 297
a687059c
LW
298#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
299#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
412f55bb 300 ((*s) == '{' && regcurly(s)))
a687059c
LW
301
302/*
303 * Flags to be passed up and down.
304 */
a687059c 305#define WORST 0 /* Worst case. */
a3b492c3 306#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee 307
e64f369d 308/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
309 * character. (There needs to be a case: in the switch statement in regexec.c
310 * for any node marked SIMPLE.) Note that this is not the same thing as
311 * REGNODE_SIMPLE */
fda99bee 312#define SIMPLE 0x02
e64f369d 313#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
314#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
315#define TRYAGAIN 0x10 /* Weeded out a declaration. */
b97943f7
KW
316#define RESTART_PASS1 0x20 /* Need to restart sizing pass */
317#define NEED_UTF8 0x40 /* In conjunction with RESTART_PASS1, need to
318 calcuate sizes as UTF-8 */
a687059c 319
3dab1dad
YO
320#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
321
07be1b83
YO
322/* whether trie related optimizations are enabled */
323#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
324#define TRIE_STUDY_OPT
786e8c11 325#define FULL_TRIE_STUDY
07be1b83
YO
326#define TRIE_STCLASS
327#endif
1de06328
YO
328
329
40d049e4
YO
330
331#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
332#define PBITVAL(paren) (1 << ((paren) & 7))
333#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
334#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
335#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
336
82a6ada4 337#define REQUIRE_UTF8(flagp) STMT_START { \
8d9c2815 338 if (!UTF) { \
82a6ada4
KW
339 assert(PASS1); \
340 *flagp = RESTART_PASS1|NEED_UTF8; \
8d9c2815
NC
341 return NULL; \
342 } \
82a6ada4 343 } STMT_END
40d049e4 344
512c0f5a
KW
345/* Change from /d into /u rules, and restart the parse if we've already seen
346 * something whose size would increase as a result, by setting *flagp and
347 * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates
348 * we've change to /u during the parse. */
349#define REQUIRE_UNI_RULES(flagp, restart_retval) \
350 STMT_START { \
351 if (DEPENDS_SEMANTICS) { \
352 assert(PASS1); \
353 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
354 RExC_uni_semantics = 1; \
355 if (RExC_seen_unfolded_sharp_s) { \
356 *flagp |= RESTART_PASS1; \
357 return restart_retval; \
358 } \
359 } \
360 } STMT_END
361
f19b1a63
KW
362/* This converts the named class defined in regcomp.h to its equivalent class
363 * number defined in handy.h. */
364#define namedclass_to_classnum(class) ((int) ((class) / 2))
365#define classnum_to_namedclass(classnum) ((classnum) * 2)
366
de92f5e6
KW
367#define _invlist_union_complement_2nd(a, b, output) \
368 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
369#define _invlist_intersection_complement_2nd(a, b, output) \
370 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
371
1de06328
YO
372/* About scan_data_t.
373
374 During optimisation we recurse through the regexp program performing
375 various inplace (keyhole style) optimisations. In addition study_chunk
376 and scan_commit populate this data structure with information about
538e84ed 377 what strings MUST appear in the pattern. We look for the longest
3b753521 378 string that must appear at a fixed location, and we look for the
1de06328
YO
379 longest string that may appear at a floating location. So for instance
380 in the pattern:
538e84ed 381
1de06328 382 /FOO[xX]A.*B[xX]BAR/
538e84ed 383
1de06328
YO
384 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
385 strings (because they follow a .* construct). study_chunk will identify
386 both FOO and BAR as being the longest fixed and floating strings respectively.
538e84ed 387
1de06328 388 The strings can be composites, for instance
538e84ed 389
1de06328 390 /(f)(o)(o)/
538e84ed 391
1de06328 392 will result in a composite fixed substring 'foo'.
538e84ed 393
1de06328 394 For each string some basic information is maintained:
538e84ed 395
1de06328
YO
396 - offset or min_offset
397 This is the position the string must appear at, or not before.
398 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
399 characters must match before the string we are searching for.
400 Likewise when combined with minlenp and the length of the string it
538e84ed 401 tells us how many characters must appear after the string we have
1de06328 402 found.
538e84ed 403
1de06328
YO
404 - max_offset
405 Only used for floating strings. This is the rightmost point that
ea3daa5d 406 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328 407 string can occur infinitely far to the right.
538e84ed 408
1de06328 409 - minlenp
2d608413
KW
410 A pointer to the minimum number of characters of the pattern that the
411 string was found inside. This is important as in the case of positive
538e84ed 412 lookahead or positive lookbehind we can have multiple patterns
1de06328 413 involved. Consider
538e84ed 414
1de06328 415 /(?=FOO).*F/
538e84ed 416
1de06328
YO
417 The minimum length of the pattern overall is 3, the minimum length
418 of the lookahead part is 3, but the minimum length of the part that
538e84ed 419 will actually match is 1. So 'FOO's minimum length is 3, but the
1de06328 420 minimum length for the F is 1. This is important as the minimum length
538e84ed 421 is used to determine offsets in front of and behind the string being
1de06328 422 looked for. Since strings can be composites this is the length of the
486ec47a 423 pattern at the time it was committed with a scan_commit. Note that
1de06328 424 the length is calculated by study_chunk, so that the minimum lengths
538e84ed 425 are not known until the full pattern has been compiled, thus the
1de06328 426 pointer to the value.
538e84ed 427
1de06328 428 - lookbehind
538e84ed 429
1de06328 430 In the case of lookbehind the string being searched for can be
538e84ed 431 offset past the start point of the final matching string.
1de06328
YO
432 If this value was just blithely removed from the min_offset it would
433 invalidate some of the calculations for how many chars must match
434 before or after (as they are derived from min_offset and minlen and
538e84ed 435 the length of the string being searched for).
1de06328
YO
436 When the final pattern is compiled and the data is moved from the
437 scan_data_t structure into the regexp structure the information
538e84ed
KW
438 about lookbehind is factored in, with the information that would
439 have been lost precalculated in the end_shift field for the
1de06328
YO
440 associated string.
441
442 The fields pos_min and pos_delta are used to store the minimum offset
538e84ed 443 and the delta to the maximum offset at the current point in the pattern.
1de06328
YO
444
445*/
2c2d71f5
JH
446
447typedef struct scan_data_t {
1de06328
YO
448 /*I32 len_min; unused */
449 /*I32 len_delta; unused */
49f55535 450 SSize_t pos_min;
ea3daa5d 451 SSize_t pos_delta;
2c2d71f5 452 SV *last_found;
ea3daa5d 453 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 454 SSize_t last_start_min;
ea3daa5d 455 SSize_t last_start_max;
1de06328
YO
456 SV **longest; /* Either &l_fixed, or &l_float. */
457 SV *longest_fixed; /* longest fixed string found in pattern */
49f55535 458 SSize_t offset_fixed; /* offset where it starts */
ea3daa5d 459 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
460 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
461 SV *longest_float; /* longest floating string found in pattern */
49f55535 462 SSize_t offset_float_min; /* earliest point in string it can appear */
ea3daa5d
FC
463 SSize_t offset_float_max; /* latest point in string it can appear */
464 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
49f55535 465 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
2c2d71f5
JH
466 I32 flags;
467 I32 whilem_c;
49f55535 468 SSize_t *last_closep;
b8f7bb16 469 regnode_ssc *start_class;
2c2d71f5
JH
470} scan_data_t;
471
a687059c 472/*
e50aee73 473 * Forward declarations for pregcomp()'s friends.
a687059c 474 */
a0d0e21e 475
27da23d5 476static const scan_data_t zero_scan_data =
1de06328 477 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
478
479#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
480#define SF_BEFORE_SEOL 0x0001
481#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
482#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
483#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
484
44e3dfd2
BF
485#define SF_FIX_SHIFT_EOL (+2)
486#define SF_FL_SHIFT_EOL (+4)
c277df42
IZ
487
488#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
489#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
490
491#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
492#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
493#define SF_IS_INF 0x0040
494#define SF_HAS_PAR 0x0080
495#define SF_IN_PAR 0x0100
496#define SF_HAS_EVAL 0x0200
bc604ad8
KW
497
498
499/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
500 * longest substring in the pattern. When it is not set the optimiser keeps
501 * track of position, but does not keep track of the actual strings seen,
502 *
503 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
504 * /foo/i will not.
505 *
506 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
507 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
508 * turned off because of the alternation (BRANCH). */
07be1b83 509#define SCF_DO_SUBSTR 0x0400
bc604ad8 510
653099ff
GS
511#define SCF_DO_STCLASS_AND 0x0800
512#define SCF_DO_STCLASS_OR 0x1000
513#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 514#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 515
786e8c11 516#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
538e84ed 517#define SCF_SEEN_ACCEPT 0x8000
688e0391 518#define SCF_TRIE_DOING_RESTUDY 0x10000
4286711a
YO
519#define SCF_IN_DEFINE 0x20000
520
521
522
07be1b83 523
43fead97 524#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
525
526/* The enums for all these are ordered so things work out correctly */
a62b1201 527#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
538e84ed
KW
528#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
529 == REGEX_DEPENDS_CHARSET)
00b27cfc 530#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
538e84ed
KW
531#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
532 >= REGEX_UNICODE_CHARSET)
533#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
534 == REGEX_ASCII_RESTRICTED_CHARSET)
535#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
536 >= REGEX_ASCII_RESTRICTED_CHARSET)
537#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
538 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 539
43fead97 540#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 541
f2c2a6ab
KW
542/* For programs that want to be strictly Unicode compatible by dying if any
543 * attempt is made to match a non-Unicode code point against a Unicode
544 * property. */
545#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
546
93733859 547#define OOB_NAMEDCLASS -1
b8c5462f 548
8e661ac5
KW
549/* There is no code point that is out-of-bounds, so this is problematic. But
550 * its only current use is to initialize a variable that is always set before
551 * looked at. */
552#define OOB_UNICODE 0xDEADBEEF
553
a0ed51b3 554#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
a0ed51b3 555
8615cb43 556
b45f050a
JF
557/* length of regex to show in messages that don't mark a position within */
558#define RegexLengthToShowInErrorMessages 127
559
560/*
561 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
562 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
563 * op/pragma/warn/regcomp.
564 */
7253e4e3
RK
565#define MARKER1 "<-- HERE" /* marker as it appears in the description */
566#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 567
538e84ed 568#define REPORT_LOCATION " in regex; marked by " MARKER1 \
147e3846 569 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
b45f050a 570
285b5ca0
KW
571/* The code in this file in places uses one level of recursion with parsing
572 * rebased to an alternate string constructed by us in memory. This can take
573 * the form of something that is completely different from the input, or
574 * something that uses the input as part of the alternate. In the first case,
575 * there should be no possibility of an error, as we are in complete control of
576 * the alternate string. But in the second case we don't control the input
577 * portion, so there may be errors in that. Here's an example:
578 * /[abc\x{DF}def]/ui
579 * is handled specially because \x{df} folds to a sequence of more than one
580 * character, 'ss'. What is done is to create and parse an alternate string,
581 * which looks like this:
582 * /(?:\x{DF}|[abc\x{DF}def])/ui
583 * where it uses the input unchanged in the middle of something it constructs,
584 * which is a branch for the DF outside the character class, and clustering
585 * parens around the whole thing. (It knows enough to skip the DF inside the
586 * class while in this substitute parse.) 'abc' and 'def' may have errors that
587 * need to be reported. The general situation looks like this:
588 *
589 * sI tI xI eI
590 * Input: ----------------------------------------------------
591 * Constructed: ---------------------------------------------------
592 * sC tC xC eC EC
593 *
594 * The input string sI..eI is the input pattern. The string sC..EC is the
595 * constructed substitute parse string. The portions sC..tC and eC..EC are
596 * constructed by us. The portion tC..eC is an exact duplicate of the input
597 * pattern tI..eI. In the diagram, these are vertically aligned. Suppose that
598 * while parsing, we find an error at xC. We want to display a message showing
599 * the real input string. Thus we need to find the point xI in it which
600 * corresponds to xC. xC >= tC, since the portion of the string sC..tC has
601 * been constructed by us, and so shouldn't have errors. We get:
602 *
603 * xI = sI + (tI - sI) + (xC - tC)
604 *
605 * and, the offset into sI is:
606 *
607 * (xI - sI) = (tI - sI) + (xC - tC)
608 *
609 * When the substitute is constructed, we save (tI -sI) as RExC_precomp_adj,
610 * and we save tC as RExC_adjusted_start.
903c858a
KW
611 *
612 * During normal processing of the input pattern, everything points to that,
613 * with RExC_precomp_adj set to 0, and RExC_adjusted_start set to sI.
285b5ca0
KW
614 */
615
616#define tI_sI RExC_precomp_adj
617#define tC RExC_adjusted_start
618#define sC RExC_precomp
619#define xI_offset(xC) ((IV) (tI_sI + (xC - tC)))
620#define xI(xC) (sC + xI_offset(xC))
621#define eC RExC_precomp_end
622
623#define REPORT_LOCATION_ARGS(xC) \
624 UTF8fARG(UTF, \
625 (xI(xC) > eC) /* Don't run off end */ \
626 ? eC - sC /* Length before the <--HERE */ \
627 : xI_offset(xC), \
628 sC), /* The input pattern printed up to the <--HERE */ \
629 UTF8fARG(UTF, \
630 (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */ \
631 (xI(xC) > eC) ? eC : xI(xC)) /* pattern after <--HERE */
c1d900c3 632
8a6d8ec6
HS
633/* Used to point after bad bytes for an error message, but avoid skipping
634 * past a nul byte. */
635#define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
636
b45f050a
JF
637/*
638 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
639 * arg. Show regex, up to a maximum length. If it's too long, chop and add
640 * "...".
641 */
58e23c8d 642#define _FAIL(code) STMT_START { \
bfed75c6 643 const char *ellipses = ""; \
711b303b 644 IV len = RExC_precomp_end - RExC_precomp; \
ccb2c380
MP
645 \
646 if (!SIZE_ONLY) \
a5e7bc51 647 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
648 if (len > RegexLengthToShowInErrorMessages) { \
649 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
650 len = RegexLengthToShowInErrorMessages - 10; \
651 ellipses = "..."; \
652 } \
58e23c8d 653 code; \
ccb2c380 654} STMT_END
8615cb43 655
58e23c8d 656#define FAIL(msg) _FAIL( \
147e3846 657 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
c1d900c3 658 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
659
660#define FAIL2(msg,arg) _FAIL( \
147e3846 661 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
c1d900c3 662 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 663
b45f050a 664/*
b45f050a
JF
665 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
666 */
ccb2c380 667#define Simple_vFAIL(m) STMT_START { \
ccb2c380 668 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
d528642a 669 m, REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 670} STMT_END
b45f050a
JF
671
672/*
673 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
674 */
ccb2c380
MP
675#define vFAIL(m) STMT_START { \
676 if (!SIZE_ONLY) \
a5e7bc51 677 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
678 Simple_vFAIL(m); \
679} STMT_END
b45f050a
JF
680
681/*
682 * Like Simple_vFAIL(), but accepts two arguments.
683 */
ccb2c380 684#define Simple_vFAIL2(m,a1) STMT_START { \
d528642a
KW
685 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
686 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 687} STMT_END
b45f050a
JF
688
689/*
690 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
691 */
ccb2c380
MP
692#define vFAIL2(m,a1) STMT_START { \
693 if (!SIZE_ONLY) \
a5e7bc51 694 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
695 Simple_vFAIL2(m, a1); \
696} STMT_END
b45f050a
JF
697
698
699/*
700 * Like Simple_vFAIL(), but accepts three arguments.
701 */
ccb2c380 702#define Simple_vFAIL3(m, a1, a2) STMT_START { \
c1d900c3 703 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 704 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 705} STMT_END
b45f050a
JF
706
707/*
708 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
709 */
ccb2c380
MP
710#define vFAIL3(m,a1,a2) STMT_START { \
711 if (!SIZE_ONLY) \
a5e7bc51 712 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
713 Simple_vFAIL3(m, a1, a2); \
714} STMT_END
b45f050a
JF
715
716/*
717 * Like Simple_vFAIL(), but accepts four arguments.
718 */
ccb2c380 719#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
d528642a
KW
720 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
721 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 722} STMT_END
b45f050a 723
95db3ffa
KW
724#define vFAIL4(m,a1,a2,a3) STMT_START { \
725 if (!SIZE_ONLY) \
726 SAVEFREESV(RExC_rx_sv); \
727 Simple_vFAIL4(m, a1, a2, a3); \
728} STMT_END
729
946095af 730/* A specialized version of vFAIL2 that works with UTF8f */
d528642a
KW
731#define vFAIL2utf8f(m, a1) STMT_START { \
732 if (!SIZE_ONLY) \
733 SAVEFREESV(RExC_rx_sv); \
734 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
735 REPORT_LOCATION_ARGS(RExC_parse)); \
946095af
BF
736} STMT_END
737
3ba22297 738#define vFAIL3utf8f(m, a1, a2) STMT_START { \
3ba22297
KW
739 if (!SIZE_ONLY) \
740 SAVEFREESV(RExC_rx_sv); \
741 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 742 REPORT_LOCATION_ARGS(RExC_parse)); \
3ba22297
KW
743} STMT_END
744
499333dc
KW
745/* These have asserts in them because of [perl #122671] Many warnings in
746 * regcomp.c can occur twice. If they get output in pass1 and later in that
747 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
748 * would get output again. So they should be output in pass2, and these
749 * asserts make sure new warnings follow that paradigm. */
946095af 750
5e0a247b
KW
751/* m is not necessarily a "literal string", in this macro */
752#define reg_warn_non_literal_string(loc, m) STMT_START { \
d528642a
KW
753 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
754 "%s" REPORT_LOCATION, \
755 m, REPORT_LOCATION_ARGS(loc)); \
5e0a247b
KW
756} STMT_END
757
668c081a 758#define ckWARNreg(loc,m) STMT_START { \
d528642a
KW
759 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
760 m REPORT_LOCATION, \
761 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
762} STMT_END
763
b927b7e9 764#define vWARN(loc, m) STMT_START { \
d528642a
KW
765 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
766 m REPORT_LOCATION, \
767 REPORT_LOCATION_ARGS(loc)); \
b927b7e9
KW
768} STMT_END
769
0d6106aa 770#define vWARN_dep(loc, m) STMT_START { \
d528642a
KW
771 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
772 m REPORT_LOCATION, \
773 REPORT_LOCATION_ARGS(loc)); \
0d6106aa
KW
774} STMT_END
775
147508a2 776#define ckWARNdep(loc,m) STMT_START { \
d528642a
KW
777 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
778 m REPORT_LOCATION, \
779 REPORT_LOCATION_ARGS(loc)); \
147508a2
KW
780} STMT_END
781
d528642a
KW
782#define ckWARNregdep(loc,m) STMT_START { \
783 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
784 WARN_REGEXP), \
785 m REPORT_LOCATION, \
786 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
787} STMT_END
788
d528642a
KW
789#define ckWARN2reg_d(loc,m, a1) STMT_START { \
790 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
791 m REPORT_LOCATION, \
792 a1, REPORT_LOCATION_ARGS(loc)); \
2335b3d3
KW
793} STMT_END
794
d528642a
KW
795#define ckWARN2reg(loc, m, a1) STMT_START { \
796 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
797 m REPORT_LOCATION, \
798 a1, REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
799} STMT_END
800
d528642a
KW
801#define vWARN3(loc, m, a1, a2) STMT_START { \
802 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
803 m REPORT_LOCATION, \
804 a1, a2, REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
805} STMT_END
806
d528642a
KW
807#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
808 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
809 m REPORT_LOCATION, \
810 a1, a2, \
811 REPORT_LOCATION_ARGS(loc)); \
668c081a
NC
812} STMT_END
813
ccb2c380 814#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
d528642a
KW
815 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
816 m REPORT_LOCATION, \
817 a1, a2, a3, \
818 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
819} STMT_END
820
bfdc8cd3
A
821#define vWARN4dep(loc, m, a1, a2, a3) STMT_START { \
822 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN2(WARN_REGEXP,WARN_DEPRECATED), \
823 m REPORT_LOCATION, \
824 a1, a2, a3, \
825 REPORT_LOCATION_ARGS(loc)); \
826} STMT_END
827
668c081a 828#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
d528642a
KW
829 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
830 m REPORT_LOCATION, \
831 a1, a2, a3, \
832 REPORT_LOCATION_ARGS(loc)); \
668c081a
NC
833} STMT_END
834
ccb2c380 835#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
d528642a
KW
836 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
837 m REPORT_LOCATION, \
838 a1, a2, a3, a4, \
839 REPORT_LOCATION_ARGS(loc)); \
ccb2c380 840} STMT_END
9d1d55b5 841
538e84ed 842/* Macros for recording node offsets. 20001227 mjd@plover.com
fac92740
MJD
843 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
844 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
845 * Element 0 holds the number n.
07be1b83 846 * Position is 1 indexed.
fac92740 847 */
7122b237
YO
848#ifndef RE_TRACK_PATTERN_OFFSETS
849#define Set_Node_Offset_To_R(node,byte)
850#define Set_Node_Offset(node,byte)
851#define Set_Cur_Node_Offset
852#define Set_Node_Length_To_R(node,len)
853#define Set_Node_Length(node,len)
6a86c6ad 854#define Set_Node_Cur_Length(node,start)
538e84ed
KW
855#define Node_Offset(n)
856#define Node_Length(n)
7122b237
YO
857#define Set_Node_Offset_Length(node,offset,len)
858#define ProgLen(ri) ri->u.proglen
859#define SetProgLen(ri,x) ri->u.proglen = x
860#else
861#define ProgLen(ri) ri->u.offsets[0]
862#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
863#define Set_Node_Offset_To_R(node,byte) STMT_START { \
864 if (! SIZE_ONLY) { \
865 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 866 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 867 if((node) < 0) { \
538e84ed
KW
868 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
869 (int)(node)); \
ccb2c380
MP
870 } else { \
871 RExC_offsets[2*(node)-1] = (byte); \
872 } \
873 } \
874} STMT_END
875
876#define Set_Node_Offset(node,byte) \
877 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
878#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
879
880#define Set_Node_Length_To_R(node,len) STMT_START { \
881 if (! SIZE_ONLY) { \
882 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 883 __LINE__, (int)(node), (int)(len))); \
ccb2c380 884 if((node) < 0) { \
538e84ed
KW
885 Perl_croak(aTHX_ "value of node is %d in Length macro", \
886 (int)(node)); \
ccb2c380
MP
887 } else { \
888 RExC_offsets[2*(node)] = (len); \
889 } \
890 } \
891} STMT_END
892
893#define Set_Node_Length(node,len) \
894 Set_Node_Length_To_R((node)-RExC_emit_start, len)
6a86c6ad
NC
895#define Set_Node_Cur_Length(node, start) \
896 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
897
898/* Get offsets and lengths */
899#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
900#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
901
07be1b83
YO
902#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
903 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
904 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
905} STMT_END
7122b237 906#endif
07be1b83
YO
907
908#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
909#define EXPERIMENTAL_INPLACESCAN
f427392e 910#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 911
cb41e5d6
YO
912#ifdef DEBUGGING
913int
6ad9a8ab 914Perl_re_printf(pTHX_ const char *fmt, ...)
cb41e5d6
YO
915{
916 va_list ap;
917 int result;
918 PerlIO *f= Perl_debug_log;
919 PERL_ARGS_ASSERT_RE_PRINTF;
920 va_start(ap, fmt);
921 result = PerlIO_vprintf(f, fmt, ap);
922 va_end(ap);
923 return result;
924}
925
926int
7b031478 927Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
cb41e5d6
YO
928{
929 va_list ap;
930 int result;
931 PerlIO *f= Perl_debug_log;
932 PERL_ARGS_ASSERT_RE_INDENTF;
933 va_start(ap, depth);
daeb874b 934 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
cb41e5d6
YO
935 result = PerlIO_vprintf(f, fmt, ap);
936 va_end(ap);
937 return result;
938}
939#endif /* DEBUGGING */
940
7b031478 941#define DEBUG_RExC_seen() \
538e84ed 942 DEBUG_OPTIMISE_MORE_r({ \
6ad9a8ab 943 Perl_re_printf( aTHX_ "RExC_seen: "); \
538e84ed 944 \
e384d5c1 945 if (RExC_seen & REG_ZERO_LEN_SEEN) \
6ad9a8ab 946 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
538e84ed 947 \
e384d5c1 948 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
6ad9a8ab 949 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
538e84ed 950 \
e384d5c1 951 if (RExC_seen & REG_GPOS_SEEN) \
6ad9a8ab 952 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
538e84ed 953 \
e384d5c1 954 if (RExC_seen & REG_RECURSE_SEEN) \
6ad9a8ab 955 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
538e84ed 956 \
7b031478 957 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
6ad9a8ab 958 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
538e84ed 959 \
e384d5c1 960 if (RExC_seen & REG_VERBARG_SEEN) \
6ad9a8ab 961 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
538e84ed 962 \
e384d5c1 963 if (RExC_seen & REG_CUTGROUP_SEEN) \
6ad9a8ab 964 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
538e84ed 965 \
e384d5c1 966 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
6ad9a8ab 967 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
538e84ed 968 \
e384d5c1 969 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
6ad9a8ab 970 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
538e84ed 971 \
7b031478 972 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
6ad9a8ab 973 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
ee273784 974 \
6ad9a8ab 975 Perl_re_printf( aTHX_ "\n"); \
9e9ecfdf
YO
976 });
977
fdfb4f21 978#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
6ad9a8ab 979 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
fdfb4f21
YO
980
981#define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \
982 if ( ( flags ) ) { \
6ad9a8ab 983 Perl_re_printf( aTHX_ "%s", open_str); \
fdfb4f21
YO
984 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \
985 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \
986 DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \
987 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \
988 DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \
989 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \
990 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \
991 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \
992 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \
993 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \
994 DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \
995 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \
996 DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \
997 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \
998 DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \
6ad9a8ab 999 Perl_re_printf( aTHX_ "%s", close_str); \
fdfb4f21
YO
1000 }
1001
1002
304ee84b
YO
1003#define DEBUG_STUDYDATA(str,data,depth) \
1004DEBUG_OPTIMISE_MORE_r(if(data){ \
147e3846
KW
1005 Perl_re_indentf( aTHX_ "" str "Pos:%" IVdf "/%" IVdf \
1006 " Flags: 0x%" UVXf, \
7b031478 1007 depth, \
1de06328
YO
1008 (IV)((data)->pos_min), \
1009 (IV)((data)->pos_delta), \
fdfb4f21
YO
1010 (UV)((data)->flags) \
1011 ); \
1012 DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \
147e3846
KW
1013 Perl_re_printf( aTHX_ \
1014 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s", \
1de06328 1015 (IV)((data)->whilem_c), \
304ee84b
YO
1016 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
1017 is_inf ? "INF " : "" \
1de06328
YO
1018 ); \
1019 if ((data)->last_found) \
147e3846
KW
1020 Perl_re_printf( aTHX_ \
1021 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf \
1022 " %sFixed:'%s' @ %" IVdf \
1023 " %sFloat: '%s' @ %" IVdf "/%" IVdf, \
1de06328
YO
1024 SvPVX_const((data)->last_found), \
1025 (IV)((data)->last_end), \
1026 (IV)((data)->last_start_min), \
1027 (IV)((data)->last_start_max), \
1028 ((data)->longest && \
1029 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
1030 SvPVX_const((data)->longest_fixed), \
1031 (IV)((data)->offset_fixed), \
1032 ((data)->longest && \
1033 (data)->longest==&((data)->longest_float)) ? "*" : "", \
1034 SvPVX_const((data)->longest_float), \
1035 (IV)((data)->offset_float_min), \
1036 (IV)((data)->offset_float_max) \
1037 ); \
6ad9a8ab 1038 Perl_re_printf( aTHX_ "\n"); \
1de06328
YO
1039});
1040
cb41e5d6 1041
c6871b76
KW
1042/* =========================================================
1043 * BEGIN edit_distance stuff.
1044 *
1045 * This calculates how many single character changes of any type are needed to
1046 * transform a string into another one. It is taken from version 3.1 of
1047 *
1048 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1049 */
1050
1051/* Our unsorted dictionary linked list. */
1052/* Note we use UVs, not chars. */
1053
1054struct dictionary{
1055 UV key;
1056 UV value;
1057 struct dictionary* next;
1058};
1059typedef struct dictionary item;
1060
1061
1062PERL_STATIC_INLINE item*
1063push(UV key,item* curr)
1064{
1065 item* head;
1066 Newxz(head, 1, item);
1067 head->key = key;
1068 head->value = 0;
1069 head->next = curr;
1070 return head;
1071}
1072
1073
1074PERL_STATIC_INLINE item*
1075find(item* head, UV key)
1076{
1077 item* iterator = head;
1078 while (iterator){
1079 if (iterator->key == key){
1080 return iterator;
1081 }
1082 iterator = iterator->next;
1083 }
1084
1085 return NULL;
1086}
1087
1088PERL_STATIC_INLINE item*
1089uniquePush(item* head,UV key)
1090{
1091 item* iterator = head;
1092
1093 while (iterator){
1094 if (iterator->key == key) {
1095 return head;
1096 }
1097 iterator = iterator->next;
1098 }
1099
1100 return push(key,head);
1101}
1102
1103PERL_STATIC_INLINE void
1104dict_free(item* head)
1105{
1106 item* iterator = head;
1107
1108 while (iterator) {
1109 item* temp = iterator;
1110 iterator = iterator->next;
1111 Safefree(temp);
1112 }
1113
1114 head = NULL;
1115}
1116
1117/* End of Dictionary Stuff */
1118
1119/* All calculations/work are done here */
1120STATIC int
1121S_edit_distance(const UV* src,
1122 const UV* tgt,
1123 const STRLEN x, /* length of src[] */
1124 const STRLEN y, /* length of tgt[] */
1125 const SSize_t maxDistance
1126)
1127{
1128 item *head = NULL;
1129 UV swapCount,swapScore,targetCharCount,i,j;
1130 UV *scores;
1131 UV score_ceil = x + y;
1132
1133 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1134
1135 /* intialize matrix start values */
1136 Newxz(scores, ( (x + 2) * (y + 2)), UV);
1137 scores[0] = score_ceil;
1138 scores[1 * (y + 2) + 0] = score_ceil;
1139 scores[0 * (y + 2) + 1] = score_ceil;
1140 scores[1 * (y + 2) + 1] = 0;
1141 head = uniquePush(uniquePush(head,src[0]),tgt[0]);
1142
1143 /* work loops */
1144 /* i = src index */
1145 /* j = tgt index */
1146 for (i=1;i<=x;i++) {
1147 if (i < x)
1148 head = uniquePush(head,src[i]);
1149 scores[(i+1) * (y + 2) + 1] = i;
1150 scores[(i+1) * (y + 2) + 0] = score_ceil;
1151 swapCount = 0;
1152
1153 for (j=1;j<=y;j++) {
1154 if (i == 1) {
1155 if(j < y)
1156 head = uniquePush(head,tgt[j]);
1157 scores[1 * (y + 2) + (j + 1)] = j;
1158 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1159 }
1160
1161 targetCharCount = find(head,tgt[j-1])->value;
1162 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1163
1164 if (src[i-1] != tgt[j-1]){
1165 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1166 }
1167 else {
1168 swapCount = j;
1169 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1170 }
1171 }
1172
1173 find(head,src[i-1])->value = i;
1174 }
1175
1176 {
1177 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1178 dict_free(head);
1179 Safefree(scores);
1180 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1181 }
1182}
1183
1184/* END of edit_distance() stuff
1185 * ========================================================= */
1186
8e35b056
KW
1187/* is c a control character for which we have a mnemonic? */
1188#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1189
549b4e78
KW
1190STATIC const char *
1191S_cntrl_to_mnemonic(const U8 c)
1192{
1193 /* Returns the mnemonic string that represents character 'c', if one
1194 * exists; NULL otherwise. The only ones that exist for the purposes of
1195 * this routine are a few control characters */
1196
1197 switch (c) {
1198 case '\a': return "\\a";
1199 case '\b': return "\\b";
1200 case ESC_NATIVE: return "\\e";
1201 case '\f': return "\\f";
1202 case '\n': return "\\n";
1203 case '\r': return "\\r";
1204 case '\t': return "\\t";
1205 }
1206
1207 return NULL;
1208}
1209
653099ff 1210/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 1211 Update the longest found anchored substring and the longest found
653099ff
GS
1212 floating substrings if needed. */
1213
4327152a 1214STATIC void
ea3daa5d
FC
1215S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1216 SSize_t *minlenp, int is_inf)
c277df42 1217{
e1ec3a88
AL
1218 const STRLEN l = CHR_SVLEN(data->last_found);
1219 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 1220 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1221
7918f24d
NC
1222 PERL_ARGS_ASSERT_SCAN_COMMIT;
1223
c277df42 1224 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 1225 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
1226 if (*data->longest == data->longest_fixed) {
1227 data->offset_fixed = l ? data->last_start_min : data->pos_min;
1228 if (data->flags & SF_BEFORE_EOL)
b81d288d 1229 data->flags
c277df42
IZ
1230 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
1231 else
1232 data->flags &= ~SF_FIX_BEFORE_EOL;
686b73d4 1233 data->minlen_fixed=minlenp;
1de06328 1234 data->lookbehind_fixed=0;
a0ed51b3 1235 }
304ee84b 1236 else { /* *data->longest == data->longest_float */
c277df42 1237 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d 1238 data->offset_float_max = (l
646e8787
DM
1239 ? data->last_start_max
1240 : (data->pos_delta > SSize_t_MAX - data->pos_min
ea3daa5d
FC
1241 ? SSize_t_MAX
1242 : data->pos_min + data->pos_delta));
1243 if (is_inf
1244 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
1245 data->offset_float_max = SSize_t_MAX;
c277df42 1246 if (data->flags & SF_BEFORE_EOL)
b81d288d 1247 data->flags
c277df42
IZ
1248 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
1249 else
1250 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
1251 data->minlen_float=minlenp;
1252 data->lookbehind_float=0;
c277df42
IZ
1253 }
1254 }
1255 SvCUR_set(data->last_found, 0);
0eda9292 1256 {
a28509cc 1257 SV * const sv = data->last_found;
097eb12c
AL
1258 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1259 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1260 if (mg)
1261 mg->mg_len = 0;
1262 }
0eda9292 1263 }
c277df42
IZ
1264 data->last_end = -1;
1265 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 1266 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
1267}
1268
cdd87c1d
KW
1269/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1270 * list that describes which code points it matches */
1271
653099ff 1272STATIC void
3420edd7 1273S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 1274{
cdd87c1d
KW
1275 /* Set the SSC 'ssc' to match an empty string or any code point */
1276
557bd3fb 1277 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 1278
71068078 1279 assert(is_ANYOF_SYNTHETIC(ssc));
3fffb88a 1280
0854ea0b
KW
1281 /* mortalize so won't leak */
1282 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
93e92956 1283 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
653099ff
GS
1284}
1285
653099ff 1286STATIC int
dc3bf405 1287S_ssc_is_anything(const regnode_ssc *ssc)
653099ff 1288{
c144baaa
KW
1289 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1290 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1291 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1292 * in any way, so there's no point in using it */
cdd87c1d
KW
1293
1294 UV start, end;
1295 bool ret;
653099ff 1296
557bd3fb 1297 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 1298
71068078 1299 assert(is_ANYOF_SYNTHETIC(ssc));
cdd87c1d 1300
93e92956 1301 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
cdd87c1d
KW
1302 return FALSE;
1303 }
1304
1305 /* See if the list consists solely of the range 0 - Infinity */
1306 invlist_iterinit(ssc->invlist);
1307 ret = invlist_iternext(ssc->invlist, &start, &end)
1308 && start == 0
1309 && end == UV_MAX;
1310
1311 invlist_iterfinish(ssc->invlist);
1312
1313 if (ret) {
1314 return TRUE;
1315 }
1316
1317 /* If e.g., both \w and \W are set, matches everything */
e0e1be5f 1318 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1319 int i;
1320 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1321 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1322 return TRUE;
1323 }
1324 }
1325 }
1326
1327 return FALSE;
653099ff
GS
1328}
1329
653099ff 1330STATIC void
cdd87c1d 1331S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 1332{
cdd87c1d
KW
1333 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1334 * string, any code point, or any posix class under locale */
1335
557bd3fb 1336 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 1337
557bd3fb 1338 Zero(ssc, 1, regnode_ssc);
71068078 1339 set_ANYOF_SYNTHETIC(ssc);
93e92956 1340 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
3420edd7 1341 ssc_anything(ssc);
cdd87c1d 1342
2f306ab9
KW
1343 /* If any portion of the regex is to operate under locale rules that aren't
1344 * fully known at compile time, initialization includes it. The reason
1345 * this isn't done for all regexes is that the optimizer was written under
1346 * the assumption that locale was all-or-nothing. Given the complexity and
1347 * lack of documentation in the optimizer, and that there are inadequate
1348 * test cases for locale, many parts of it may not work properly, it is
1349 * safest to avoid locale unless necessary. */
cdd87c1d
KW
1350 if (RExC_contains_locale) {
1351 ANYOF_POSIXL_SETALL(ssc);
cdd87c1d
KW
1352 }
1353 else {
1354 ANYOF_POSIXL_ZERO(ssc);
1355 }
653099ff
GS
1356}
1357
b423522f 1358STATIC int
dc3bf405
BF
1359S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1360 const regnode_ssc *ssc)
b423522f
KW
1361{
1362 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1363 * to the list of code points matched, and locale posix classes; hence does
1364 * not check its flags) */
1365
1366 UV start, end;
1367 bool ret;
1368
1369 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1370
71068078 1371 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1372
1373 invlist_iterinit(ssc->invlist);
1374 ret = invlist_iternext(ssc->invlist, &start, &end)
1375 && start == 0
1376 && end == UV_MAX;
1377
1378 invlist_iterfinish(ssc->invlist);
1379
1380 if (! ret) {
1381 return FALSE;
1382 }
1383
e0e1be5f 1384 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
31f05a37 1385 return FALSE;
b423522f
KW
1386 }
1387
1388 return TRUE;
1389}
1390
1391STATIC SV*
1392S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
5c0f85ef 1393 const regnode_charclass* const node)
b423522f
KW
1394{
1395 /* Returns a mortal inversion list defining which code points are matched
1396 * by 'node', which is of type ANYOF. Handles complementing the result if
1397 * appropriate. If some code points aren't knowable at this time, the
31f05a37
KW
1398 * returned list must, and will, contain every code point that is a
1399 * possibility. */
b423522f 1400
e2506fa7 1401 SV* invlist = NULL;
1ee208c4 1402 SV* only_utf8_locale_invlist = NULL;
b423522f
KW
1403 unsigned int i;
1404 const U32 n = ARG(node);
31f05a37 1405 bool new_node_has_latin1 = FALSE;
b423522f
KW
1406
1407 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1408
1409 /* Look at the data structure created by S_set_ANYOF_arg() */
93e92956 1410 if (n != ANYOF_ONLY_HAS_BITMAP) {
b423522f
KW
1411 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1412 AV * const av = MUTABLE_AV(SvRV(rv));
1413 SV **const ary = AvARRAY(av);
1414 assert(RExC_rxi->data->what[n] == 's');
1415
1416 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1417 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1418 }
1419 else if (ary[0] && ary[0] != &PL_sv_undef) {
1420
1421 /* Here, no compile-time swash, and there are things that won't be
1422 * known until runtime -- we have to assume it could be anything */
e2506fa7 1423 invlist = sv_2mortal(_new_invlist(1));
b423522f
KW
1424 return _add_range_to_invlist(invlist, 0, UV_MAX);
1425 }
1ee208c4 1426 else if (ary[3] && ary[3] != &PL_sv_undef) {
b423522f
KW
1427
1428 /* Here no compile-time swash, and no run-time only data. Use the
1429 * node's inversion list */
1ee208c4
KW
1430 invlist = sv_2mortal(invlist_clone(ary[3]));
1431 }
1432
1433 /* Get the code points valid only under UTF-8 locales */
037715a6 1434 if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
1ee208c4
KW
1435 && ary[2] && ary[2] != &PL_sv_undef)
1436 {
1437 only_utf8_locale_invlist = ary[2];
b423522f
KW
1438 }
1439 }
1440
e2506fa7
KW
1441 if (! invlist) {
1442 invlist = sv_2mortal(_new_invlist(0));
1443 }
1444
dcb20b36
KW
1445 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1446 * code points, and an inversion list for the others, but if there are code
1447 * points that should match only conditionally on the target string being
1448 * UTF-8, those are placed in the inversion list, and not the bitmap.
1449 * Since there are circumstances under which they could match, they are
1450 * included in the SSC. But if the ANYOF node is to be inverted, we have
1451 * to exclude them here, so that when we invert below, the end result
1452 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1453 * have to do this here before we add the unconditionally matched code
1454 * points */
b423522f
KW
1455 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1456 _invlist_intersection_complement_2nd(invlist,
1457 PL_UpperLatin1,
1458 &invlist);
1459 }
1460
1461 /* Add in the points from the bit map */
dcb20b36 1462 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
b423522f 1463 if (ANYOF_BITMAP_TEST(node, i)) {
6f8848d5
TC
1464 unsigned int start = i++;
1465
1466 for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
1467 /* empty */
1468 }
1469 invlist = _add_range_to_invlist(invlist, start, i-1);
31f05a37 1470 new_node_has_latin1 = TRUE;
b423522f
KW
1471 }
1472 }
1473
1474 /* If this can match all upper Latin1 code points, have to add them
ac33c516
KW
1475 * as well. But don't add them if inverting, as when that gets done below,
1476 * it would exclude all these characters, including the ones it shouldn't
1477 * that were added just above */
1478 if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
f240c685
KW
1479 && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1480 {
b423522f
KW
1481 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1482 }
1483
1484 /* Similarly for these */
93e92956 1485 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
e0a1ff7a 1486 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
b423522f
KW
1487 }
1488
1489 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1490 _invlist_invert(invlist);
1491 }
037715a6 1492 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
31f05a37
KW
1493
1494 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1495 * locale. We can skip this if there are no 0-255 at all. */
1496 _invlist_union(invlist, PL_Latin1, &invlist);
1497 }
1498
1ee208c4
KW
1499 /* Similarly add the UTF-8 locale possible matches. These have to be
1500 * deferred until after the non-UTF-8 locale ones are taken care of just
1501 * above, or it leads to wrong results under ANYOF_INVERT */
1502 if (only_utf8_locale_invlist) {
31f05a37 1503 _invlist_union_maybe_complement_2nd(invlist,
1ee208c4 1504 only_utf8_locale_invlist,
31f05a37
KW
1505 ANYOF_FLAGS(node) & ANYOF_INVERT,
1506 &invlist);
1507 }
b423522f
KW
1508
1509 return invlist;
1510}
1511
1051e1c4 1512/* These two functions currently do the exact same thing */
557bd3fb 1513#define ssc_init_zero ssc_init
653099ff 1514
cdd87c1d
KW
1515#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1516#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1517
557bd3fb 1518/* 'AND' a given class with another one. Can create false positives. 'ssc'
93e92956
KW
1519 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1520 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1521
653099ff 1522STATIC void
b423522f 1523S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1524 const regnode_charclass *and_with)
653099ff 1525{
cdd87c1d
KW
1526 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1527 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1528
a0dd4231
KW
1529 SV* anded_cp_list;
1530 U8 anded_flags;
1e6ade67 1531
cdd87c1d 1532 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1533
71068078 1534 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1535
1536 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1537 * the code point inversion list and just the relevant flags */
71068078 1538 if (is_ANYOF_SYNTHETIC(and_with)) {
7dcac5f6 1539 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
a0dd4231 1540 anded_flags = ANYOF_FLAGS(and_with);
e9b08962
KW
1541
1542 /* XXX This is a kludge around what appears to be deficiencies in the
1543 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1544 * there are paths through the optimizer where it doesn't get weeded
1545 * out when it should. And if we don't make some extra provision for
1546 * it like the code just below, it doesn't get added when it should.
1547 * This solution is to add it only when AND'ing, which is here, and
1548 * only when what is being AND'ed is the pristine, original node
1549 * matching anything. Thus it is like adding it to ssc_anything() but
1550 * only when the result is to be AND'ed. Probably the same solution
1551 * could be adopted for the same problem we have with /l matching,
1552 * which is solved differently in S_ssc_init(), and that would lead to
1553 * fewer false positives than that solution has. But if this solution
1554 * creates bugs, the consequences are only that a warning isn't raised
1555 * that should be; while the consequences for having /l bugs is
1556 * incorrect matches */
7dcac5f6 1557 if (ssc_is_anything((regnode_ssc *)and_with)) {
f240c685 1558 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
e9b08962 1559 }
a0dd4231
KW
1560 }
1561 else {
5c0f85ef 1562 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
f240c685
KW
1563 if (OP(and_with) == ANYOFD) {
1564 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1565 }
1566 else {
1567 anded_flags = ANYOF_FLAGS(and_with)
1568 &( ANYOF_COMMON_FLAGS
108316fb
KW
1569 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1570 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
d1c40ef5
KW
1571 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
1572 anded_flags &=
1573 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1574 }
f240c685 1575 }
a0dd4231
KW
1576 }
1577
1578 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1579
1580 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1581 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1582 * 'and_with' may be inverted. When not inverted, we have the situation of
1583 * computing:
1584 * (C1 | P1) & (C2 | P2)
1585 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1586 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1587 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1588 * <= ((C1 & C2) | P1 | P2)
1589 * Alternatively, the last few steps could be:
1590 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1591 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1592 * <= (C1 | C2 | (P1 & P2))
1593 * We favor the second approach if either P1 or P2 is non-empty. This is
1594 * because these components are a barrier to doing optimizations, as what
1595 * they match cannot be known until the moment of matching as they are
1596 * dependent on the current locale, 'AND"ing them likely will reduce or
1597 * eliminate them.
1598 * But we can do better if we know that C1,P1 are in their initial state (a
1599 * frequent occurrence), each matching everything:
1600 * (<everything>) & (C2 | P2) = C2 | P2
1601 * Similarly, if C2,P2 are in their initial state (again a frequent
1602 * occurrence), the result is a no-op
1603 * (C1 | P1) & (<everything>) = C1 | P1
1604 *
1605 * Inverted, we have
1606 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1607 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1608 * <= (C1 & ~C2) | (P1 & ~P2)
1609 * */
1aa99e6b 1610
a0dd4231 1611 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
71068078 1612 && ! is_ANYOF_SYNTHETIC(and_with))
a0dd4231 1613 {
cdd87c1d 1614 unsigned int i;
8951c461 1615
cdd87c1d
KW
1616 ssc_intersection(ssc,
1617 anded_cp_list,
1618 FALSE /* Has already been inverted */
1619 );
c6b76537 1620
cdd87c1d
KW
1621 /* If either P1 or P2 is empty, the intersection will be also; can skip
1622 * the loop */
93e92956 1623 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
cdd87c1d
KW
1624 ANYOF_POSIXL_ZERO(ssc);
1625 }
e0e1be5f 1626 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1627
1628 /* Note that the Posix class component P from 'and_with' actually
1629 * looks like:
1630 * P = Pa | Pb | ... | Pn
1631 * where each component is one posix class, such as in [\w\s].
1632 * Thus
1633 * ~P = ~(Pa | Pb | ... | Pn)
1634 * = ~Pa & ~Pb & ... & ~Pn
1635 * <= ~Pa | ~Pb | ... | ~Pn
1636 * The last is something we can easily calculate, but unfortunately
1637 * is likely to have many false positives. We could do better
1638 * in some (but certainly not all) instances if two classes in
1639 * P have known relationships. For example
1640 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1641 * So
1642 * :lower: & :print: = :lower:
1643 * And similarly for classes that must be disjoint. For example,
1644 * since \s and \w can have no elements in common based on rules in
1645 * the POSIX standard,
1646 * \w & ^\S = nothing
1647 * Unfortunately, some vendor locales do not meet the Posix
1648 * standard, in particular almost everything by Microsoft.
1649 * The loop below just changes e.g., \w into \W and vice versa */
1650
1ee208c4 1651 regnode_charclass_posixl temp;
cdd87c1d
KW
1652 int add = 1; /* To calculate the index of the complement */
1653
1654 ANYOF_POSIXL_ZERO(&temp);
1655 for (i = 0; i < ANYOF_MAX; i++) {
1656 assert(i % 2 != 0
7dcac5f6
KW
1657 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1658 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
cdd87c1d 1659
7dcac5f6 1660 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
cdd87c1d
KW
1661 ANYOF_POSIXL_SET(&temp, i + add);
1662 }
1663 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1664 }
1665 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1666
cdd87c1d
KW
1667 } /* else ssc already has no posixes */
1668 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1669 in its initial state */
71068078 1670 else if (! is_ANYOF_SYNTHETIC(and_with)
7dcac5f6 1671 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
cdd87c1d
KW
1672 {
1673 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1674 * copy it over 'ssc' */
1675 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
71068078 1676 if (is_ANYOF_SYNTHETIC(and_with)) {
cdd87c1d
KW
1677 StructCopy(and_with, ssc, regnode_ssc);
1678 }
1679 else {
1680 ssc->invlist = anded_cp_list;
1681 ANYOF_POSIXL_ZERO(ssc);
93e92956 1682 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1683 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
cdd87c1d
KW
1684 }
1685 }
1686 }
e0e1be5f 1687 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
93e92956 1688 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
cdd87c1d
KW
1689 {
1690 /* One or the other of P1, P2 is non-empty. */
93e92956 1691 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1ea8b7fe
KW
1692 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1693 }
cdd87c1d
KW
1694 ssc_union(ssc, anded_cp_list, FALSE);
1695 }
1696 else { /* P1 = P2 = empty */
1697 ssc_intersection(ssc, anded_cp_list, FALSE);
1698 }
137165a6 1699 }
653099ff
GS
1700}
1701
653099ff 1702STATIC void
cdd87c1d 1703S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1704 const regnode_charclass *or_with)
653099ff 1705{
cdd87c1d
KW
1706 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1707 * another SSC or a regular ANYOF class. Can create false positives if
1708 * 'or_with' is to be inverted. */
7918f24d 1709
a0dd4231
KW
1710 SV* ored_cp_list;
1711 U8 ored_flags;
c6b76537 1712
cdd87c1d 1713 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 1714
71068078 1715 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1716
1717 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1718 * the code point inversion list and just the relevant flags */
71068078 1719 if (is_ANYOF_SYNTHETIC(or_with)) {
7dcac5f6 1720 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
a0dd4231
KW
1721 ored_flags = ANYOF_FLAGS(or_with);
1722 }
1723 else {
5c0f85ef 1724 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
eff8b7dc 1725 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
f240c685
KW
1726 if (OP(or_with) != ANYOFD) {
1727 ored_flags
1728 |= ANYOF_FLAGS(or_with)
108316fb
KW
1729 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1730 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
d1c40ef5
KW
1731 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
1732 ored_flags |=
1733 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1734 }
f240c685 1735 }
a0dd4231
KW
1736 }
1737
1738 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
1739
1740 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1741 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1742 * 'or_with' may be inverted. When not inverted, we have the simple
1743 * situation of computing:
1744 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1745 * If P1|P2 yields a situation with both a class and its complement are
1746 * set, like having both \w and \W, this matches all code points, and we
1747 * can delete these from the P component of the ssc going forward. XXX We
1748 * might be able to delete all the P components, but I (khw) am not certain
1749 * about this, and it is better to be safe.
1750 *
1751 * Inverted, we have
1752 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1753 * <= (C1 | P1) | ~C2
1754 * <= (C1 | ~C2) | P1
1755 * (which results in actually simpler code than the non-inverted case)
1756 * */
9826f543 1757
a0dd4231 1758 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
71068078 1759 && ! is_ANYOF_SYNTHETIC(or_with))
a0dd4231 1760 {
cdd87c1d 1761 /* We ignore P2, leaving P1 going forward */
1ea8b7fe 1762 } /* else Not inverted */
93e92956 1763 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1764 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
e0e1be5f 1765 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1766 unsigned int i;
1767 for (i = 0; i < ANYOF_MAX; i += 2) {
1768 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1769 {
1770 ssc_match_all_cp(ssc);
1771 ANYOF_POSIXL_CLEAR(ssc, i);
1772 ANYOF_POSIXL_CLEAR(ssc, i+1);
cdd87c1d
KW
1773 }
1774 }
1775 }
1aa99e6b 1776 }
cdd87c1d
KW
1777
1778 ssc_union(ssc,
1779 ored_cp_list,
1780 FALSE /* Already has been inverted */
1781 );
653099ff
GS
1782}
1783
b423522f
KW
1784PERL_STATIC_INLINE void
1785S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1786{
1787 PERL_ARGS_ASSERT_SSC_UNION;
1788
71068078 1789 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1790
1791 _invlist_union_maybe_complement_2nd(ssc->invlist,
1792 invlist,
1793 invert2nd,
1794 &ssc->invlist);
1795}
1796
1797PERL_STATIC_INLINE void
1798S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1799 SV* const invlist,
1800 const bool invert2nd)
1801{
1802 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1803
71068078 1804 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1805
1806 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1807 invlist,
1808 invert2nd,
1809 &ssc->invlist);
1810}
1811
1812PERL_STATIC_INLINE void
1813S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1814{
1815 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1816
71068078 1817 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1818
1819 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1820}
1821
1822PERL_STATIC_INLINE void
1823S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1824{
1825 /* AND just the single code point 'cp' into the SSC 'ssc' */
1826
1827 SV* cp_list = _new_invlist(2);
1828
1829 PERL_ARGS_ASSERT_SSC_CP_AND;
1830
71068078 1831 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1832
1833 cp_list = add_cp_to_invlist(cp_list, cp);
1834 ssc_intersection(ssc, cp_list,
1835 FALSE /* Not inverted */
1836 );
1837 SvREFCNT_dec_NN(cp_list);
1838}
1839
1840PERL_STATIC_INLINE void
dc3bf405 1841S_ssc_clear_locale(regnode_ssc *ssc)
b423522f
KW
1842{
1843 /* Set the SSC 'ssc' to not match any locale things */
b423522f
KW
1844 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1845
71068078 1846 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1847
1848 ANYOF_POSIXL_ZERO(ssc);
1849 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1850}
1851
b35552de
KW
1852#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1853
1854STATIC bool
1855S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1856{
1857 /* The synthetic start class is used to hopefully quickly winnow down
1858 * places where a pattern could start a match in the target string. If it
1859 * doesn't really narrow things down that much, there isn't much point to
1860 * having the overhead of using it. This function uses some very crude
1861 * heuristics to decide if to use the ssc or not.
1862 *
1863 * It returns TRUE if 'ssc' rules out more than half what it considers to
1864 * be the "likely" possible matches, but of course it doesn't know what the
1865 * actual things being matched are going to be; these are only guesses
1866 *
1867 * For /l matches, it assumes that the only likely matches are going to be
1868 * in the 0-255 range, uniformly distributed, so half of that is 127
1869 * For /a and /d matches, it assumes that the likely matches will be just
1870 * the ASCII range, so half of that is 63
1871 * For /u and there isn't anything matching above the Latin1 range, it
1872 * assumes that that is the only range likely to be matched, and uses
1873 * half that as the cut-off: 127. If anything matches above Latin1,
1874 * it assumes that all of Unicode could match (uniformly), except for
1875 * non-Unicode code points and things in the General Category "Other"
1876 * (unassigned, private use, surrogates, controls and formats). This
1877 * is a much large number. */
1878
b35552de
KW
1879 U32 count = 0; /* Running total of number of code points matched by
1880 'ssc' */
1881 UV start, end; /* Start and end points of current range in inversion
1882 list */
72400949
KW
1883 const U32 max_code_points = (LOC)
1884 ? 256
1885 : (( ! UNI_SEMANTICS
1886 || invlist_highest(ssc->invlist) < 256)
1887 ? 128
1888 : NON_OTHER_COUNT);
1889 const U32 max_match = max_code_points / 2;
b35552de
KW
1890
1891 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1892
1893 invlist_iterinit(ssc->invlist);
1894 while (invlist_iternext(ssc->invlist, &start, &end)) {
72400949
KW
1895 if (start >= max_code_points) {
1896 break;
b35552de 1897 }
72400949 1898 end = MIN(end, max_code_points - 1);
b35552de 1899 count += end - start + 1;
72400949 1900 if (count >= max_match) {
b35552de
KW
1901 invlist_iterfinish(ssc->invlist);
1902 return FALSE;
1903 }
1904 }
1905
1906 return TRUE;
1907}
1908
1909
b423522f
KW
1910STATIC void
1911S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1912{
1913 /* The inversion list in the SSC is marked mortal; now we need a more
1914 * permanent copy, which is stored the same way that is done in a regular
dcb20b36
KW
1915 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1916 * map */
b423522f
KW
1917
1918 SV* invlist = invlist_clone(ssc->invlist);
1919
1920 PERL_ARGS_ASSERT_SSC_FINALIZE;
1921
71068078 1922 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f 1923
a0dd4231 1924 /* The code in this file assumes that all but these flags aren't relevant
93e92956
KW
1925 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1926 * by the time we reach here */
f240c685
KW
1927 assert(! (ANYOF_FLAGS(ssc)
1928 & ~( ANYOF_COMMON_FLAGS
108316fb
KW
1929 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1930 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
a0dd4231 1931
b423522f
KW
1932 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1933
1ee208c4
KW
1934 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1935 NULL, NULL, NULL, FALSE);
b423522f 1936
85c8e306
KW
1937 /* Make sure is clone-safe */
1938 ssc->invlist = NULL;
1939
e0e1be5f 1940 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
93e92956 1941 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
e0e1be5f 1942 }
1462525b 1943
b2e90ddf
KW
1944 if (RExC_contains_locale) {
1945 OP(ssc) = ANYOFL;
1946 }
1947
1462525b 1948 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
b423522f
KW
1949}
1950
a3621e74
YO
1951#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1952#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1953#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
538e84ed
KW
1954#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1955 ? (TRIE_LIST_CUR( idx ) - 1) \
1956 : 0 )
a3621e74 1957
3dab1dad
YO
1958
1959#ifdef DEBUGGING
07be1b83 1960/*
2b8b4781
NC
1961 dump_trie(trie,widecharmap,revcharmap)
1962 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1963 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
1964
1965 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
1966 The _interim_ variants are used for debugging the interim
1967 tables that are used to generate the final compressed
1968 representation which is what dump_trie expects.
1969
486ec47a 1970 Part of the reason for their existence is to provide a form
3dab1dad 1971 of documentation as to how the different representations function.
07be1b83
YO
1972
1973*/
3dab1dad
YO
1974
1975/*
3dab1dad
YO
1976 Dumps the final compressed table form of the trie to Perl_debug_log.
1977 Used for debugging make_trie().
1978*/
b9a59e08 1979
3dab1dad 1980STATIC void
2b8b4781
NC
1981S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1982 AV *revcharmap, U32 depth)
3dab1dad
YO
1983{
1984 U32 state;
ab3bbdeb 1985 SV *sv=sv_newmortal();
55eed653 1986 int colwidth= widecharmap ? 6 : 4;
2e64971a 1987 U16 word;
3dab1dad
YO
1988 GET_RE_DEBUG_FLAGS_DECL;
1989
7918f24d 1990 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 1991
6ad9a8ab 1992 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
1e37780e 1993 depth+1, "Match","Base","Ofs" );
3dab1dad
YO
1994
1995 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 1996 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 1997 if ( tmp ) {
6ad9a8ab 1998 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 1999 colwidth,
538e84ed 2000 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2001 PL_colors[0], PL_colors[1],
2002 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2003 PERL_PV_ESCAPE_FIRSTCHAR
2004 )
ab3bbdeb 2005 );
3dab1dad
YO
2006 }
2007 }
1e37780e
YO
2008 Perl_re_printf( aTHX_ "\n");
2009 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
3dab1dad
YO
2010
2011 for( state = 0 ; state < trie->uniquecharcount ; state++ )
6ad9a8ab
YO
2012 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2013 Perl_re_printf( aTHX_ "\n");
3dab1dad 2014
1e2e3d02 2015 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 2016 const U32 base = trie->states[ state ].trans.base;
3dab1dad 2017
147e3846 2018 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
3dab1dad
YO
2019
2020 if ( trie->states[ state ].wordnum ) {
1e37780e 2021 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
3dab1dad 2022 } else {
6ad9a8ab 2023 Perl_re_printf( aTHX_ "%6s", "" );
3dab1dad
YO
2024 }
2025
147e3846 2026 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
3dab1dad
YO
2027
2028 if ( base ) {
2029 U32 ofs = 0;
2030
2031 while( ( base + ofs < trie->uniquecharcount ) ||
2032 ( base + ofs - trie->uniquecharcount < trie->lasttrans
538e84ed
KW
2033 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2034 != state))
3dab1dad
YO
2035 ofs++;
2036
147e3846 2037 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
3dab1dad
YO
2038
2039 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
538e84ed
KW
2040 if ( ( base + ofs >= trie->uniquecharcount )
2041 && ( base + ofs - trie->uniquecharcount
2042 < trie->lasttrans )
2043 && trie->trans[ base + ofs
2044 - trie->uniquecharcount ].check == state )
3dab1dad 2045 {
147e3846 2046 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
1e37780e
YO
2047 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2048 );
3dab1dad 2049 } else {
6ad9a8ab 2050 Perl_re_printf( aTHX_ "%*s",colwidth," ." );
3dab1dad
YO
2051 }
2052 }
2053
6ad9a8ab 2054 Perl_re_printf( aTHX_ "]");
3dab1dad
YO
2055
2056 }
6ad9a8ab 2057 Perl_re_printf( aTHX_ "\n" );
3dab1dad 2058 }
6ad9a8ab 2059 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
cb41e5d6 2060 depth);
2e64971a 2061 for (word=1; word <= trie->wordcount; word++) {
6ad9a8ab 2062 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2e64971a
DM
2063 (int)word, (int)(trie->wordinfo[word].prev),
2064 (int)(trie->wordinfo[word].len));
2065 }
6ad9a8ab 2066 Perl_re_printf( aTHX_ "\n" );
538e84ed 2067}
3dab1dad 2068/*
3dab1dad 2069 Dumps a fully constructed but uncompressed trie in list form.
538e84ed 2070 List tries normally only are used for construction when the number of
3dab1dad
YO
2071 possible chars (trie->uniquecharcount) is very high.
2072 Used for debugging make_trie().
2073*/
2074STATIC void
55eed653 2075S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2076 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2077 U32 depth)
3dab1dad
YO
2078{
2079 U32 state;
ab3bbdeb 2080 SV *sv=sv_newmortal();
55eed653 2081 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2082 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2083
2084 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2085
3dab1dad 2086 /* print out the table precompression. */
6ad9a8ab 2087 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
cb41e5d6 2088 depth+1 );
6ad9a8ab 2089 Perl_re_indentf( aTHX_ "%s",
cb41e5d6 2090 depth+1, "------:-----+-----------------\n" );
538e84ed 2091
3dab1dad
YO
2092 for( state=1 ; state < next_alloc ; state ++ ) {
2093 U16 charid;
538e84ed 2094
147e3846 2095 Perl_re_indentf( aTHX_ " %4" UVXf " :",
cb41e5d6 2096 depth+1, (UV)state );
3dab1dad 2097 if ( ! trie->states[ state ].wordnum ) {
6ad9a8ab 2098 Perl_re_printf( aTHX_ "%5s| ","");
3dab1dad 2099 } else {
6ad9a8ab 2100 Perl_re_printf( aTHX_ "W%4x| ",
3dab1dad
YO
2101 trie->states[ state ].wordnum
2102 );
2103 }
2104 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
538e84ed
KW
2105 SV ** const tmp = av_fetch( revcharmap,
2106 TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb 2107 if ( tmp ) {
147e3846 2108 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
ab3bbdeb 2109 colwidth,
538e84ed
KW
2110 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2111 colwidth,
2112 PL_colors[0], PL_colors[1],
2113 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2114 | PERL_PV_ESCAPE_FIRSTCHAR
ab3bbdeb 2115 ) ,
1e2e3d02
YO
2116 TRIE_LIST_ITEM(state,charid).forid,
2117 (UV)TRIE_LIST_ITEM(state,charid).newstate
2118 );
538e84ed 2119 if (!(charid % 10))
6ad9a8ab 2120 Perl_re_printf( aTHX_ "\n%*s| ",
664e119d 2121 (int)((depth * 2) + 14), "");
1e2e3d02 2122 }
ab3bbdeb 2123 }
6ad9a8ab 2124 Perl_re_printf( aTHX_ "\n");
3dab1dad 2125 }
538e84ed 2126}
3dab1dad
YO
2127
2128/*
3dab1dad 2129 Dumps a fully constructed but uncompressed trie in table form.
538e84ed
KW
2130 This is the normal DFA style state transition table, with a few
2131 twists to facilitate compression later.
3dab1dad
YO
2132 Used for debugging make_trie().
2133*/
2134STATIC void
55eed653 2135S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2136 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2137 U32 depth)
3dab1dad
YO
2138{
2139 U32 state;
2140 U16 charid;
ab3bbdeb 2141 SV *sv=sv_newmortal();
55eed653 2142 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2143 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2144
2145 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
538e84ed 2146
3dab1dad
YO
2147 /*
2148 print out the table precompression so that we can do a visual check
2149 that they are identical.
2150 */
538e84ed 2151
6ad9a8ab 2152 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
3dab1dad
YO
2153
2154 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 2155 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 2156 if ( tmp ) {
6ad9a8ab 2157 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2158 colwidth,
538e84ed 2159 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2160 PL_colors[0], PL_colors[1],
2161 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2162 PERL_PV_ESCAPE_FIRSTCHAR
2163 )
ab3bbdeb 2164 );
3dab1dad
YO
2165 }
2166 }
2167
4aaafc03
YO
2168 Perl_re_printf( aTHX_ "\n");
2169 Perl_re_indentf( aTHX_ "State+-", depth+1 );
3dab1dad
YO
2170
2171 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
6ad9a8ab 2172 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
3dab1dad
YO
2173 }
2174
6ad9a8ab 2175 Perl_re_printf( aTHX_ "\n" );
3dab1dad
YO
2176
2177 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2178
147e3846 2179 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
cb41e5d6 2180 depth+1,
3dab1dad
YO
2181 (UV)TRIE_NODENUM( state ) );
2182
2183 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
2184 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2185 if (v)
147e3846 2186 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
ab3bbdeb 2187 else
6ad9a8ab 2188 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
3dab1dad
YO
2189 }
2190 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
147e3846 2191 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
538e84ed 2192 (UV)trie->trans[ state ].check );
3dab1dad 2193 } else {
147e3846 2194 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
538e84ed 2195 (UV)trie->trans[ state ].check,
3dab1dad
YO
2196 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2197 }
2198 }
07be1b83 2199}
3dab1dad
YO
2200
2201#endif
2202
2e64971a 2203
786e8c11
YO
2204/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2205 startbranch: the first branch in the whole branch sequence
2206 first : start branch of sequence of branch-exact nodes.
2207 May be the same as startbranch
2208 last : Thing following the last branch.
2209 May be the same as tail.
2210 tail : item following the branch sequence
2211 count : words in the sequence
a4525e78 2212 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
786e8c11 2213 depth : indent depth
3dab1dad 2214
786e8c11 2215Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 2216
786e8c11
YO
2217A trie is an N'ary tree where the branches are determined by digital
2218decomposition of the key. IE, at the root node you look up the 1st character and
2219follow that branch repeat until you find the end of the branches. Nodes can be
2220marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 2221
786e8c11 2222 /he|she|his|hers/
72f13be8 2223
786e8c11
YO
2224would convert into the following structure. Numbers represent states, letters
2225following numbers represent valid transitions on the letter from that state, if
2226the number is in square brackets it represents an accepting state, otherwise it
2227will be in parenthesis.
07be1b83 2228
786e8c11
YO
2229 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2230 | |
2231 | (2)
2232 | |
2233 (1) +-i->(6)-+-s->[7]
2234 |
2235 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 2236
786e8c11
YO
2237 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2238
2239This shows that when matching against the string 'hers' we will begin at state 1
2240read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2241then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2242is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2243single traverse. We store a mapping from accepting to state to which word was
2244matched, and then when we have multiple possibilities we try to complete the
b8fda935 2245rest of the regex in the order in which they occurred in the alternation.
786e8c11
YO
2246
2247The only prior NFA like behaviour that would be changed by the TRIE support is
2248the silent ignoring of duplicate alternations which are of the form:
2249
2250 / (DUPE|DUPE) X? (?{ ... }) Y /x
2251
4b714af6 2252Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 2253and without the optimisation. With the optimisations dupes will be silently
486ec47a 2254ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
2255the following demonstrates:
2256
2257 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2258
2259which prints out 'word' three times, but
2260
2261 'words'=~/(word|word|word)(?{ print $1 })S/
2262
2263which doesnt print it out at all. This is due to other optimisations kicking in.
2264
2265Example of what happens on a structural level:
2266
486ec47a 2267The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
2268
2269 1: CURLYM[1] {1,32767}(18)
2270 5: BRANCH(8)
2271 6: EXACT <ac>(16)
2272 8: BRANCH(11)
2273 9: EXACT <ad>(16)
2274 11: BRANCH(14)
2275 12: EXACT <ab>(16)
2276 16: SUCCEED(0)
2277 17: NOTHING(18)
2278 18: END(0)
2279
2280This would be optimizable with startbranch=5, first=5, last=16, tail=16
2281and should turn into:
2282
2283 1: CURLYM[1] {1,32767}(18)
2284 5: TRIE(16)
2285 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2286 <ac>
2287 <ad>
2288 <ab>
2289 16: SUCCEED(0)
2290 17: NOTHING(18)
2291 18: END(0)
2292
2293Cases where tail != last would be like /(?foo|bar)baz/:
2294
2295 1: BRANCH(4)
2296 2: EXACT <foo>(8)
2297 4: BRANCH(7)
2298 5: EXACT <bar>(8)
2299 7: TAIL(8)
2300 8: EXACT <baz>(10)
2301 10: END(0)
2302
2303which would be optimizable with startbranch=1, first=1, last=7, tail=8
2304and would end up looking like:
2305
2306 1: TRIE(8)
2307 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2308 <foo>
2309 <bar>
2310 7: TAIL(8)
2311 8: EXACT <baz>(10)
2312 10: END(0)
2313
c80e42f3 2314 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
2315
2316is the recommended Unicode-aware way of saying
2317
2318 *(d++) = uv;
2319*/
2320
fab2782b 2321#define TRIE_STORE_REVCHAR(val) \
786e8c11 2322 STMT_START { \
73031816 2323 if (UTF) { \
668fcfea 2324 SV *zlopp = newSV(UTF8_MAXBYTES); \
88c9ea1e 2325 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
c80e42f3 2326 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
73031816
NC
2327 SvCUR_set(zlopp, kapow - flrbbbbb); \
2328 SvPOK_on(zlopp); \
2329 SvUTF8_on(zlopp); \
2330 av_push(revcharmap, zlopp); \
2331 } else { \
fab2782b 2332 char ooooff = (char)val; \
73031816
NC
2333 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2334 } \
2335 } STMT_END
786e8c11 2336
914a25d5
KW
2337/* This gets the next character from the input, folding it if not already
2338 * folded. */
2339#define TRIE_READ_CHAR STMT_START { \
2340 wordlen++; \
2341 if ( UTF ) { \
2342 /* if it is UTF then it is either already folded, or does not need \
2343 * folding */ \
1c1d615a 2344 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
2345 } \
2346 else if (folder == PL_fold_latin1) { \
7d006b13
KW
2347 /* This folder implies Unicode rules, which in the range expressible \
2348 * by not UTF is the lower case, with the two exceptions, one of \
2349 * which should have been taken care of before calling this */ \
2350 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2351 uvc = toLOWER_L1(*uc); \
2352 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2353 len = 1; \
914a25d5
KW
2354 } else { \
2355 /* raw data, will be folded later if needed */ \
2356 uvc = (U32)*uc; \
2357 len = 1; \
2358 } \
786e8c11
YO
2359} STMT_END
2360
2361
2362
2363#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2364 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
2365 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
2366 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
2367 } \
2368 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2369 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2370 TRIE_LIST_CUR( state )++; \
2371} STMT_END
07be1b83 2372
786e8c11
YO
2373#define TRIE_LIST_NEW(state) STMT_START { \
2374 Newxz( trie->states[ state ].trans.list, \
2375 4, reg_trie_trans_le ); \
2376 TRIE_LIST_CUR( state ) = 1; \
2377 TRIE_LIST_LEN( state ) = 4; \
2378} STMT_END
07be1b83 2379
786e8c11
YO
2380#define TRIE_HANDLE_WORD(state) STMT_START { \
2381 U16 dupe= trie->states[ state ].wordnum; \
2382 regnode * const noper_next = regnext( noper ); \
2383 \
786e8c11
YO
2384 DEBUG_r({ \
2385 /* store the word for dumping */ \
2386 SV* tmp; \
2387 if (OP(noper) != NOTHING) \
740cce10 2388 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 2389 else \
740cce10 2390 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 2391 av_push( trie_words, tmp ); \
786e8c11
YO
2392 }); \
2393 \
2394 curword++; \
2e64971a
DM
2395 trie->wordinfo[curword].prev = 0; \
2396 trie->wordinfo[curword].len = wordlen; \
2397 trie->wordinfo[curword].accept = state; \
786e8c11
YO
2398 \
2399 if ( noper_next < tail ) { \
2400 if (!trie->jump) \
538e84ed
KW
2401 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2402 sizeof(U16) ); \
7f69552c 2403 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
2404 if (!jumper) \
2405 jumper = noper_next; \
2406 if (!nextbranch) \
2407 nextbranch= regnext(cur); \
2408 } \
2409 \
2410 if ( dupe ) { \
2e64971a
DM
2411 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2412 /* chain, so that when the bits of chain are later */\
2413 /* linked together, the dups appear in the chain */\
2414 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2415 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
2416 } else { \
2417 /* we haven't inserted this word yet. */ \
2418 trie->states[ state ].wordnum = curword; \
2419 } \
2420} STMT_END
07be1b83 2421
3dab1dad 2422
786e8c11
YO
2423#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2424 ( ( base + charid >= ucharcount \
2425 && base + charid < ubound \
2426 && state == trie->trans[ base - ucharcount + charid ].check \
2427 && trie->trans[ base - ucharcount + charid ].next ) \
2428 ? trie->trans[ base - ucharcount + charid ].next \
2429 : ( state==1 ? special : 0 ) \
2430 )
3dab1dad 2431
8bcafbf4
YO
2432#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2433STMT_START { \
2434 TRIE_BITMAP_SET(trie, uvc); \
2435 /* store the folded codepoint */ \
2436 if ( folder ) \
2437 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2438 \
2439 if ( !UTF ) { \
2440 /* store first byte of utf8 representation of */ \
2441 /* variant codepoints */ \
2442 if (! UVCHR_IS_INVARIANT(uvc)) { \
2443 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2444 } \
2445 } \
2446} STMT_END
786e8c11
YO
2447#define MADE_TRIE 1
2448#define MADE_JUMP_TRIE 2
2449#define MADE_EXACT_TRIE 4
3dab1dad 2450
a3621e74 2451STATIC I32
538e84ed
KW
2452S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2453 regnode *first, regnode *last, regnode *tail,
2454 U32 word_count, U32 flags, U32 depth)
a3621e74
YO
2455{
2456 /* first pass, loop through and scan words */
2457 reg_trie_data *trie;
55eed653 2458 HV *widecharmap = NULL;
2b8b4781 2459 AV *revcharmap = newAV();
a3621e74 2460 regnode *cur;
a3621e74
YO
2461 STRLEN len = 0;
2462 UV uvc = 0;
2463 U16 curword = 0;
2464 U32 next_alloc = 0;
786e8c11
YO
2465 regnode *jumper = NULL;
2466 regnode *nextbranch = NULL;
7f69552c 2467 regnode *convert = NULL;
2e64971a 2468 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 2469 /* we just use folder as a flag in utf8 */
1e696034 2470 const U8 * folder = NULL;
a3621e74 2471
2b8b4781 2472#ifdef DEBUGGING
cf78de0b 2473 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2b8b4781
NC
2474 AV *trie_words = NULL;
2475 /* along with revcharmap, this only used during construction but both are
2476 * useful during debugging so we store them in the struct when debugging.
8e11feef 2477 */
2b8b4781 2478#else
cf78de0b 2479 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 2480 STRLEN trie_charcount=0;
3dab1dad 2481#endif
2b8b4781 2482 SV *re_trie_maxbuff;
a3621e74 2483 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2484
2485 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
2486#ifndef DEBUGGING
2487 PERL_UNUSED_ARG(depth);
2488#endif
a3621e74 2489
1e696034 2490 switch (flags) {
a4525e78 2491 case EXACT: case EXACTL: break;
2f7f8cb1 2492 case EXACTFA:
fab2782b 2493 case EXACTFU_SS:
a4525e78
KW
2494 case EXACTFU:
2495 case EXACTFLU8: folder = PL_fold_latin1; break;
1e696034 2496 case EXACTF: folder = PL_fold; break;
fab2782b 2497 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
2498 }
2499
c944940b 2500 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 2501 trie->refcount = 1;
3dab1dad 2502 trie->startstate = 1;
786e8c11 2503 trie->wordcount = word_count;
f8fc2ecf 2504 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 2505 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
a4525e78 2506 if (flags == EXACT || flags == EXACTL)
c944940b 2507 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
2508 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2509 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2510
a3621e74 2511 DEBUG_r({
2b8b4781 2512 trie_words = newAV();
a3621e74 2513 });
a3621e74 2514
0111c4fd 2515 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
316ebaf2 2516 assert(re_trie_maxbuff);
a3621e74 2517 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 2518 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 2519 }
df826430 2520 DEBUG_TRIE_COMPILE_r({
6ad9a8ab 2521 Perl_re_indentf( aTHX_
cb41e5d6
YO
2522 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2523 depth+1,
538e84ed
KW
2524 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2525 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
3dab1dad 2526 });
538e84ed 2527
7f69552c
YO
2528 /* Find the node we are going to overwrite */
2529 if ( first == startbranch && OP( last ) != BRANCH ) {
2530 /* whole branch chain */
2531 convert = first;
2532 } else {
2533 /* branch sub-chain */
2534 convert = NEXTOPER( first );
2535 }
538e84ed 2536
a3621e74
YO
2537 /* -- First loop and Setup --
2538
2539 We first traverse the branches and scan each word to determine if it
2540 contains widechars, and how many unique chars there are, this is
2541 important as we have to build a table with at least as many columns as we
2542 have unique chars.
2543
2544 We use an array of integers to represent the character codes 0..255
538e84ed
KW
2545 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2546 the native representation of the character value as the key and IV's for
2547 the coded index.
a3621e74
YO
2548
2549 *TODO* If we keep track of how many times each character is used we can
2550 remap the columns so that the table compression later on is more
3b753521 2551 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
2552 middle and the least common are on the outside. IMO this would be better
2553 than a most to least common mapping as theres a decent chance the most
2554 common letter will share a node with the least common, meaning the node
486ec47a 2555 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
2556 case is when we have the least common nodes twice.
2557
2558 */
2559
a3621e74 2560 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 2561 regnode *noper = NEXTOPER( cur );
944e05e3
YO
2562 const U8 *uc;
2563 const U8 *e;
bc031a7d 2564 int foldlen = 0;
07be1b83 2565 U32 wordlen = 0; /* required init */
bc031a7d
KW
2566 STRLEN minchars = 0;
2567 STRLEN maxchars = 0;
538e84ed
KW
2568 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2569 bitmap?*/
a3621e74 2570
3dab1dad 2571 if (OP(noper) == NOTHING) {
20ed8c88
YO
2572 /* skip past a NOTHING at the start of an alternation
2573 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2574 */
df826430 2575 regnode *noper_next= regnext(noper);
944e05e3
YO
2576 if (noper_next < tail)
2577 noper= noper_next;
2578 }
2579
dca5fc4c
YO
2580 if ( noper < tail &&
2581 (
2582 OP(noper) == flags ||
2583 (
2584 flags == EXACTFU &&
2585 OP(noper) == EXACTFU_SS
2586 )
2587 )
2588 ) {
944e05e3
YO
2589 uc= (U8*)STRING(noper);
2590 e= uc + STR_LEN(noper);
2591 } else {
2592 trie->minlen= 0;
2593 continue;
3dab1dad 2594 }
df826430 2595
944e05e3 2596
fab2782b 2597 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2598 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2599 regardless of encoding */
fab2782b
YO
2600 if (OP( noper ) == EXACTFU_SS) {
2601 /* false positives are ok, so just set this */
0dc4a61d 2602 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2603 }
2604 }
dca5fc4c 2605
bc031a7d
KW
2606 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2607 branch */
3dab1dad 2608 TRIE_CHARCOUNT(trie)++;
a3621e74 2609 TRIE_READ_CHAR;
645de4ce 2610
bc031a7d
KW
2611 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2612 * is in effect. Under /i, this character can match itself, or
2613 * anything that folds to it. If not under /i, it can match just
2614 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2615 * all fold to k, and all are single characters. But some folds
2616 * expand to more than one character, so for example LATIN SMALL
2617 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2618 * the string beginning at 'uc' is 'ffi', it could be matched by
2619 * three characters, or just by the one ligature character. (It
2620 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2621 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2622 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2623 * match.) The trie needs to know the minimum and maximum number
2624 * of characters that could match so that it can use size alone to
2625 * quickly reject many match attempts. The max is simple: it is
2626 * the number of folded characters in this branch (since a fold is
2627 * never shorter than what folds to it. */
2628
2629 maxchars++;
2630
2631 /* And the min is equal to the max if not under /i (indicated by
2632 * 'folder' being NULL), or there are no multi-character folds. If
2633 * there is a multi-character fold, the min is incremented just
2634 * once, for the character that folds to the sequence. Each
2635 * character in the sequence needs to be added to the list below of
2636 * characters in the trie, but we count only the first towards the
2637 * min number of characters needed. This is done through the
2638 * variable 'foldlen', which is returned by the macros that look
2639 * for these sequences as the number of bytes the sequence
2640 * occupies. Each time through the loop, we decrement 'foldlen' by
2641 * how many bytes the current char occupies. Only when it reaches
2642 * 0 do we increment 'minchars' or look for another multi-character
2643 * sequence. */
2644 if (folder == NULL) {
2645 minchars++;
2646 }
2647 else if (foldlen > 0) {
2648 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
645de4ce
KW
2649 }
2650 else {
bc031a7d
KW
2651 minchars++;
2652
2653 /* See if *uc is the beginning of a multi-character fold. If
2654 * so, we decrement the length remaining to look at, to account
2655 * for the current character this iteration. (We can use 'uc'
2656 * instead of the fold returned by TRIE_READ_CHAR because for
2657 * non-UTF, the latin1_safe macro is smart enough to account
2658 * for all the unfolded characters, and because for UTF, the
2659 * string will already have been folded earlier in the
2660 * compilation process */
2661 if (UTF) {
2662 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2663 foldlen -= UTF8SKIP(uc);
645de4ce
KW
2664 }
2665 }
bc031a7d
KW
2666 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2667 foldlen--;
2668 }
645de4ce 2669 }
bc031a7d
KW
2670
2671 /* The current character (and any potential folds) should be added
2672 * to the possible matching characters for this position in this
2673 * branch */
a3621e74 2674 if ( uvc < 256 ) {
fab2782b
YO
2675 if ( folder ) {
2676 U8 folded= folder[ (U8) uvc ];
2677 if ( !trie->charmap[ folded ] ) {
2678 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2679 TRIE_STORE_REVCHAR( folded );
2680 }
2681 }
a3621e74
YO
2682 if ( !trie->charmap[ uvc ] ) {
2683 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2684 TRIE_STORE_REVCHAR( uvc );
a3621e74 2685 }
02daf0ab 2686 if ( set_bit ) {
62012aee
KW
2687 /* store the codepoint in the bitmap, and its folded
2688 * equivalent. */
8bcafbf4 2689 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
02daf0ab
YO
2690 set_bit = 0; /* We've done our bit :-) */
2691 }
a3621e74 2692 } else {
bc031a7d
KW
2693
2694 /* XXX We could come up with the list of code points that fold
2695 * to this using PL_utf8_foldclosures, except not for
2696 * multi-char folds, as there may be multiple combinations
2697 * there that could work, which needs to wait until runtime to
2698 * resolve (The comment about LIGATURE FFI above is such an
2699 * example */
2700
a3621e74 2701 SV** svpp;
55eed653
NC
2702 if ( !widecharmap )
2703 widecharmap = newHV();
a3621e74 2704
55eed653 2705 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
2706
2707 if ( !svpp )
147e3846 2708 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
a3621e74
YO
2709
2710 if ( !SvTRUE( *svpp ) ) {
2711 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 2712 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
2713 }
2714 }
bc031a7d
KW
2715 } /* end loop through characters in this branch of the trie */
2716
2717 /* We take the min and max for this branch and combine to find the min
2718 * and max for all branches processed so far */
3dab1dad 2719 if( cur == first ) {
bc031a7d
KW
2720 trie->minlen = minchars;
2721 trie->maxlen = maxchars;
2722 } else if (minchars < trie->minlen) {
2723 trie->minlen = minchars;
2724 } else if (maxchars > trie->maxlen) {
2725 trie->maxlen = maxchars;
fab2782b 2726 }
a3621e74
YO
2727 } /* end first pass */
2728 DEBUG_TRIE_COMPILE_r(
6ad9a8ab 2729 Perl_re_indentf( aTHX_
cb41e5d6
YO
2730 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2731 depth+1,
55eed653 2732 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
2733 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2734 (int)trie->minlen, (int)trie->maxlen )
a3621e74 2735 );
a3621e74
YO
2736
2737 /*
2738 We now know what we are dealing with in terms of unique chars and
2739 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
2740 representation using a flat table will take. If it's over a reasonable
2741 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
2742 conservative but potentially much slower representation using an array
2743 of lists.
2744
2745 At the end we convert both representations into the same compressed
2746 form that will be used in regexec.c for matching with. The latter
2747 is a form that cannot be used to construct with but has memory
2748 properties similar to the list form and access properties similar
2749 to the table form making it both suitable for fast searches and
2750 small enough that its feasable to store for the duration of a program.
2751
2752 See the comment in the code where the compressed table is produced
2753 inplace from the flat tabe representation for an explanation of how
2754 the compression works.
2755
2756 */
2757
2758
2e64971a
DM
2759 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2760 prev_states[1] = 0;
2761
538e84ed
KW
2762 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2763 > SvIV(re_trie_maxbuff) )
2764 {
a3621e74
YO
2765 /*
2766 Second Pass -- Array Of Lists Representation
2767
2768 Each state will be represented by a list of charid:state records
2769 (reg_trie_trans_le) the first such element holds the CUR and LEN
2770 points of the allocated array. (See defines above).
2771
2772 We build the initial structure using the lists, and then convert
2773 it into the compressed table form which allows faster lookups
2774 (but cant be modified once converted).
a3621e74
YO
2775 */
2776
a3621e74
YO
2777 STRLEN transcount = 1;
2778
6ad9a8ab 2779 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
cb41e5d6 2780 depth+1));
686b73d4 2781
c944940b
JH
2782 trie->states = (reg_trie_state *)
2783 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2784 sizeof(reg_trie_state) );
a3621e74
YO
2785 TRIE_LIST_NEW(1);
2786 next_alloc = 2;
2787
2788 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2789
df826430 2790 regnode *noper = NEXTOPER( cur );
c445ea15
AL
2791 U32 state = 1; /* required init */
2792 U16 charid = 0; /* sanity init */
07be1b83 2793 U32 wordlen = 0; /* required init */
c445ea15 2794
df826430
YO
2795 if (OP(noper) == NOTHING) {
2796 regnode *noper_next= regnext(noper);
944e05e3
YO
2797 if (noper_next < tail)
2798 noper= noper_next;
df826430
YO
2799 }
2800
944e05e3
YO
2801 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2802 const U8 *uc= (U8*)STRING(noper);
2803 const U8 *e= uc + STR_LEN(noper);
2804
786e8c11 2805 for ( ; uc < e ; uc += len ) {
c445ea15 2806
786e8c11 2807 TRIE_READ_CHAR;
c445ea15 2808
786e8c11
YO
2809 if ( uvc < 256 ) {
2810 charid = trie->charmap[ uvc ];
c445ea15 2811 } else {
538e84ed
KW
2812 SV** const svpp = hv_fetch( widecharmap,
2813 (char*)&uvc,
2814 sizeof( UV ),
2815 0);
786e8c11
YO
2816 if ( !svpp ) {
2817 charid = 0;
2818 } else {
2819 charid=(U16)SvIV( *svpp );
2820 }
c445ea15 2821 }
538e84ed
KW
2822 /* charid is now 0 if we dont know the char read, or
2823 * nonzero if we do */
786e8c11 2824 if ( charid ) {
a3621e74 2825
786e8c11
YO
2826 U16 check;
2827 U32 newstate = 0;
a3621e74 2828
786e8c11
YO
2829 charid--;
2830 if ( !trie->states[ state ].trans.list ) {
2831 TRIE_LIST_NEW( state );
c445ea15 2832 }
538e84ed
KW
2833 for ( check = 1;
2834 check <= TRIE_LIST_USED( state );
2835 check++ )
2836 {
2837 if ( TRIE_LIST_ITEM( state, check ).forid
2838 == charid )
2839 {
786e8c11
YO
2840 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2841 break;
2842 }
2843 }
2844 if ( ! newstate ) {
2845 newstate = next_alloc++;
2e64971a 2846 prev_states[newstate] = state;
786e8c11
YO
2847 TRIE_LIST_PUSH( state, charid, newstate );
2848 transcount++;
2849 }
2850 state = newstate;
2851 } else {
147e3846 2852 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
c445ea15 2853 }
a28509cc 2854 }
c445ea15 2855 }
3dab1dad 2856 TRIE_HANDLE_WORD(state);
a3621e74
YO
2857
2858 } /* end second pass */
2859
1e2e3d02 2860 /* next alloc is the NEXT state to be allocated */
538e84ed 2861 trie->statecount = next_alloc;
c944940b
JH
2862 trie->states = (reg_trie_state *)
2863 PerlMemShared_realloc( trie->states,
2864 next_alloc
2865 * sizeof(reg_trie_state) );
a3621e74 2866
3dab1dad 2867 /* and now dump it out before we compress it */
2b8b4781
NC
2868 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2869 revcharmap, next_alloc,
2870 depth+1)
1e2e3d02 2871 );
a3621e74 2872
c944940b
JH
2873 trie->trans = (reg_trie_trans *)
2874 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
2875 {
2876 U32 state;
a3621e74
YO
2877 U32 tp = 0;
2878 U32 zp = 0;
2879
2880
2881 for( state=1 ; state < next_alloc ; state ++ ) {
2882 U32 base=0;
2883
2884 /*
2885 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 2886 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
a3621e74
YO
2887 );
2888 */
2889
2890 if (trie->states[state].trans.list) {
2891 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2892 U16 maxid=minid;
a28509cc 2893 U16 idx;
a3621e74
YO
2894
2895 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
2896 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2897 if ( forid < minid ) {
2898 minid=forid;
2899 } else if ( forid > maxid ) {
2900 maxid=forid;
2901 }
a3621e74
YO
2902 }
2903 if ( transcount < tp + maxid - minid + 1) {
2904 transcount *= 2;
c944940b
JH
2905 trie->trans = (reg_trie_trans *)
2906 PerlMemShared_realloc( trie->trans,
446bd890
NC
2907 transcount
2908 * sizeof(reg_trie_trans) );
538e84ed
KW
2909 Zero( trie->trans + (transcount / 2),
2910 transcount / 2,
2911 reg_trie_trans );
a3621e74
YO
2912 }
2913 base = trie->uniquecharcount + tp - minid;
2914 if ( maxid == minid ) {
2915 U32 set = 0;
2916 for ( ; zp < tp ; zp++ ) {
2917 if ( ! trie->trans[ zp ].next ) {
2918 base = trie->uniquecharcount + zp - minid;
538e84ed
KW
2919 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2920 1).newstate;
a3621e74
YO
2921 trie->trans[ zp ].check = state;
2922 set = 1;
2923 break;
2924 }
2925 }
2926 if ( !set ) {
538e84ed
KW
2927 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2928 1).newstate;
a3621e74
YO
2929 trie->trans[ tp ].check = state;
2930 tp++;
2931 zp = tp;
2932 }
2933 } else {
2934 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
538e84ed
KW
2935 const U32 tid = base
2936 - trie->uniquecharcount
2937 + TRIE_LIST_ITEM( state, idx ).forid;
2938 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2939 idx ).newstate;
a3621e74
YO
2940 trie->trans[ tid ].check = state;
2941 }
2942 tp += ( maxid - minid + 1 );
2943 }
2944 Safefree(trie->states[ state ].trans.list);
2945 }
2946 /*
2947 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 2948 Perl_re_printf( aTHX_ " base: %d\n",base);
a3621e74
YO
2949 );
2950 */
2951 trie->states[ state ].trans.base=base;
2952 }
cc601c31 2953 trie->lasttrans = tp + 1;
a3621e74
YO
2954 }
2955 } else {
2956 /*
2957 Second Pass -- Flat Table Representation.
2958
b423522f
KW
2959 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2960 each. We know that we will need Charcount+1 trans at most to store
2961 the data (one row per char at worst case) So we preallocate both
2962 structures assuming worst case.
a3621e74
YO
2963
2964 We then construct the trie using only the .next slots of the entry
2965 structs.
2966
b423522f
KW
2967 We use the .check field of the first entry of the node temporarily
2968 to make compression both faster and easier by keeping track of how
2969 many non zero fields are in the node.
a3621e74
YO
2970
2971 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2972 transition.
2973
b423522f
KW
2974 There are two terms at use here: state as a TRIE_NODEIDX() which is
2975 a number representing the first entry of the node, and state as a
2976 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2977 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2978 if there are 2 entrys per node. eg:
a3621e74
YO
2979
2980 A B A B
2981 1. 2 4 1. 3 7
2982 2. 0 3 3. 0 5
2983 3. 0 0 5. 0 0
2984 4. 0 0 7. 0 0
2985
b423522f
KW
2986 The table is internally in the right hand, idx form. However as we
2987 also have to deal with the states array which is indexed by nodenum
2988 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
2989
2990 */
6ad9a8ab 2991 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
cb41e5d6 2992 depth+1));
3dab1dad 2993
c944940b
JH
2994 trie->trans = (reg_trie_trans *)
2995 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2996 * trie->uniquecharcount + 1,
2997 sizeof(reg_trie_trans) );
2998 trie->states = (reg_trie_state *)
2999 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3000 sizeof(reg_trie_state) );
a3621e74
YO
3001 next_alloc = trie->uniquecharcount + 1;
3002
3dab1dad 3003
a3621e74
YO
3004 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3005
df826430 3006 regnode *noper = NEXTOPER( cur );
a3621e74
YO
3007
3008 U32 state = 1; /* required init */
3009
3010 U16 charid = 0; /* sanity init */
3011 U32 accept_state = 0; /* sanity init */
a3621e74 3012
07be1b83 3013 U32 wordlen = 0; /* required init */
a3621e74 3014
df826430
YO
3015 if (OP(noper) == NOTHING) {
3016 regnode *noper_next= regnext(noper);
944e05e3
YO
3017 if (noper_next < tail)
3018 noper= noper_next;
df826430 3019 }
fab2782b 3020
944e05e3
YO
3021 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
3022 const U8 *uc= (U8*)STRING(noper);
3023 const U8 *e= uc + STR_LEN(noper);
3024
786e8c11 3025 for ( ; uc < e ; uc += len ) {
a3621e74 3026
786e8c11 3027 TRIE_READ_CHAR;
a3621e74 3028
786e8c11
YO
3029 if ( uvc < 256 ) {
3030 charid = trie->charmap[ uvc ];
3031 } else {
538e84ed
KW
3032 SV* const * const svpp = hv_fetch( widecharmap,
3033 (char*)&uvc,
3034 sizeof( UV ),
3035 0);
786e8c11 3036 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 3037 }
786e8c11
YO
3038 if ( charid ) {
3039 charid--;
3040 if ( !trie->trans[ state + charid ].next ) {
3041 trie->trans[ state + charid ].next = next_alloc;
3042 trie->trans[ state ].check++;
2e64971a
DM
3043 prev_states[TRIE_NODENUM(next_alloc)]
3044 = TRIE_NODENUM(state);
786e8c11
YO
3045 next_alloc += trie->uniquecharcount;
3046 }
3047 state = trie->trans[ state + charid ].next;
3048 } else {
147e3846 3049 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
786e8c11 3050 }
538e84ed
KW
3051 /* charid is now 0 if we dont know the char read, or
3052 * nonzero if we do */
a3621e74 3053 }
a3621e74 3054 }
3dab1dad
YO
3055 accept_state = TRIE_NODENUM( state );
3056 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
3057
3058 } /* end second pass */
3059
3dab1dad 3060 /* and now dump it out before we compress it */
2b8b4781
NC
3061 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3062 revcharmap,
3063 next_alloc, depth+1));
a3621e74 3064
a3621e74
YO
3065 {
3066 /*
3067 * Inplace compress the table.*
3068
3069 For sparse data sets the table constructed by the trie algorithm will
3070 be mostly 0/FAIL transitions or to put it another way mostly empty.
3071 (Note that leaf nodes will not contain any transitions.)
3072
3073 This algorithm compresses the tables by eliminating most such
3074 transitions, at the cost of a modest bit of extra work during lookup:
3075
3076 - Each states[] entry contains a .base field which indicates the
3077 index in the state[] array wheres its transition data is stored.
3078
3b753521 3079 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
3080
3081 - If .base is nonzero then charid is added to it to find an entry in
3082 the trans array.
3083
3084 -If trans[states[state].base+charid].check!=state then the
3085 transition is taken to be a 0/Fail transition. Thus if there are fail
3086 transitions at the front of the node then the .base offset will point
3087 somewhere inside the previous nodes data (or maybe even into a node
3088 even earlier), but the .check field determines if the transition is
3089 valid.
3090
786e8c11 3091 XXX - wrong maybe?
a3621e74 3092 The following process inplace converts the table to the compressed
3b753521 3093 table: We first do not compress the root node 1,and mark all its
a3621e74 3094 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
3095 allows us to do a DFA construction from the compressed table later,
3096 and ensures that any .base pointers we calculate later are greater
3097 than 0.
a3621e74
YO
3098
3099 - We set 'pos' to indicate the first entry of the second node.
3100
3101 - We then iterate over the columns of the node, finding the first and
3102 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3103 and set the .check pointers accordingly, and advance pos
3104 appropriately and repreat for the next node. Note that when we copy
3105 the next pointers we have to convert them from the original
3106 NODEIDX form to NODENUM form as the former is not valid post
3107 compression.
3108
3109 - If a node has no transitions used we mark its base as 0 and do not
3110 advance the pos pointer.
3111
3112 - If a node only has one transition we use a second pointer into the
3113 structure to fill in allocated fail transitions from other states.
3114 This pointer is independent of the main pointer and scans forward
3115 looking for null transitions that are allocated to a state. When it
3116 finds one it writes the single transition into the "hole". If the
786e8c11 3117 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
3118
3119 - Once compressed we can Renew/realloc the structures to release the
3120 excess space.
3121
3122 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3123 specifically Fig 3.47 and the associated pseudocode.
3124
3125 demq
3126 */
a3b680e6 3127 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 3128 U32 state, charid;
a3621e74 3129 U32 pos = 0, zp=0;
1e2e3d02 3130 trie->statecount = laststate;
a3621e74
YO
3131
3132 for ( state = 1 ; state < laststate ; state++ ) {
3133 U8 flag = 0;
a28509cc
AL
3134 const U32 stateidx = TRIE_NODEIDX( state );
3135 const U32 o_used = trie->trans[ stateidx ].check;
3136 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
3137 trie->trans[ stateidx ].check = 0;
3138
538e84ed
KW
3139 for ( charid = 0;
3140 used && charid < trie->uniquecharcount;
3141 charid++ )
3142 {
a3621e74
YO
3143 if ( flag || trie->trans[ stateidx + charid ].next ) {
3144 if ( trie->trans[ stateidx + charid ].next ) {
3145 if (o_used == 1) {
3146 for ( ; zp < pos ; zp++ ) {
3147 if ( ! trie->trans[ zp ].next ) {
3148 break;
3149 }
3150 }
538e84ed
KW
3151 trie->states[ state ].trans.base
3152 = zp
3153 + trie->uniquecharcount
3154 - charid ;
3155 trie->trans[ zp ].next
3156 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3157 + charid ].next );
a3621e74
YO
3158 trie->trans[ zp ].check = state;
3159 if ( ++zp > pos ) pos = zp;
3160 break;
3161 }
3162 used--;
3163 }
3164 if ( !flag ) {
3165 flag = 1;
538e84ed
KW
3166 trie->states[ state ].trans.base
3167 = pos + trie->uniquecharcount - charid ;
a3621e74 3168 }
538e84ed
KW
3169 trie->trans[ pos ].next
3170 = SAFE_TRIE_NODENUM(
3171 trie->trans[ stateidx + charid ].next );
a3621e74
YO
3172 trie->trans[ pos ].check = state;
3173 pos++;
3174 }
3175 }
3176 }
cc601c31 3177 trie->lasttrans = pos + 1;
c944940b
JH
3178 trie->states = (reg_trie_state *)
3179 PerlMemShared_realloc( trie->states, laststate
3180 * sizeof(reg_trie_state) );
a3621e74 3181 DEBUG_TRIE_COMPILE_MORE_r(
147e3846 3182 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
cb41e5d6 3183 depth+1,
538e84ed
KW
3184 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3185 + 1 ),
3186 (IV)next_alloc,
3187 (IV)pos,
3188 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
a3621e74
YO
3189 );
3190
3191 } /* end table compress */
3192 }
1e2e3d02 3193 DEBUG_TRIE_COMPILE_MORE_r(
147e3846 3194 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
cb41e5d6 3195 depth+1,
1e2e3d02
YO
3196 (UV)trie->statecount,
3197 (UV)trie->lasttrans)
3198 );
cc601c31 3199 /* resize the trans array to remove unused space */
c944940b
JH
3200 trie->trans = (reg_trie_trans *)
3201 PerlMemShared_realloc( trie->trans, trie->lasttrans
3202 * sizeof(reg_trie_trans) );
a3621e74 3203
538e84ed 3204 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
3205 U8 nodetype =(U8)(flags & 0xFF);
3206 char *str=NULL;
538e84ed 3207
07be1b83 3208#ifdef DEBUGGING
e62cc96a 3209 regnode *optimize = NULL;
7122b237
YO
3210#ifdef RE_TRACK_PATTERN_OFFSETS
3211
b57a0404
JH
3212 U32 mjd_offset = 0;
3213 U32 mjd_nodelen = 0;
7122b237
YO
3214#endif /* RE_TRACK_PATTERN_OFFSETS */
3215#endif /* DEBUGGING */
a3621e74 3216 /*
3dab1dad
YO
3217 This means we convert either the first branch or the first Exact,
3218 depending on whether the thing following (in 'last') is a branch
3219 or not and whther first is the startbranch (ie is it a sub part of
3220 the alternation or is it the whole thing.)
3b753521 3221 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 3222 the whole branch sequence, including the first.
a3621e74 3223 */
3dab1dad 3224 /* Find the node we are going to overwrite */
7f69552c 3225 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 3226 /* branch sub-chain */
3dab1dad 3227 NEXT_OFF( first ) = (U16)(last - first);
7122b237 3228#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
3229 DEBUG_r({
3230 mjd_offset= Node_Offset((convert));
3231 mjd_nodelen= Node_Length((convert));
3232 });
7122b237 3233#endif
7f69552c 3234 /* whole branch chain */
7122b237
YO
3235 }
3236#ifdef RE_TRACK_PATTERN_OFFSETS
3237 else {
7f69552c
YO
3238 DEBUG_r({
3239 const regnode *nop = NEXTOPER( convert );
3240 mjd_offset= Node_Offset((nop));
3241 mjd_nodelen= Node_Length((nop));
3242 });
07be1b83
YO
3243 }
3244 DEBUG_OPTIMISE_r(
147e3846 3245 Perl_re_indentf( aTHX_ "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
cb41e5d6 3246 depth+1,
786e8c11 3247 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 3248 );
7122b237 3249#endif
538e84ed 3250 /* But first we check to see if there is a common prefix we can
3dab1dad
YO
3251 split out as an EXACT and put in front of the TRIE node. */
3252 trie->startstate= 1;
55eed653 3253 if ( trie->bitmap && !widecharmap && !trie->jump ) {
5ee57374
YO
3254 /* we want to find the first state that has more than
3255 * one transition, if that state is not the first state
3256 * then we have a common prefix which we can remove.
3257 */
3dab1dad 3258 U32 state;
1e2e3d02 3259 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 3260 U32 ofs = 0;
ee0dfd0b
YO
3261 I32 first_ofs = -1; /* keeps track of the ofs of the first
3262 transition, -1 means none */
8e11feef
RGS
3263 U32 count = 0;
3264 const U32 base = trie->states[ state ].trans.base;
a3621e74 3265
5ee57374 3266 /* does this state terminate an alternation? */
3dab1dad 3267 if ( trie->states[state].wordnum )
8e11feef 3268 count = 1;
a3621e74 3269
8e11feef 3270 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
3271 if ( ( base + ofs >= trie->uniquecharcount ) &&
3272 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
3273 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3274 {
3dab1dad 3275 if ( ++count > 1 ) {
5ee57374 3276 /* we have more than one transition */
d3d91c7c
YO
3277 SV **tmp;
3278 U8 *ch;
5ee57374
YO
3279 /* if this is the first state there is no common prefix
3280 * to extract, so we can exit */
8e11feef 3281 if ( state == 1 ) break;
d3d91c7c
YO
3282 tmp = av_fetch( revcharmap, ofs, 0);
3283 ch = (U8*)SvPV_nolen_const( *tmp );
3284
5ee57374
YO
3285 /* if we are on count 2 then we need to initialize the
3286 * bitmap, and store the previous char if there was one
3287 * in it*/
3dab1dad 3288 if ( count == 2 ) {
5ee57374 3289 /* clear the bitmap */
3dab1dad
YO
3290 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3291 DEBUG_OPTIMISE_r(
147e3846 3292 Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [",
cb41e5d6 3293 depth+1,
786e8c11 3294 (UV)state));
ee0dfd0b
YO
3295 if (first_ofs >= 0) {
3296 SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
be8e71aa 3297 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 3298
8bcafbf4 3299 TRIE_BITMAP_SET_FOLDED(trie,*ch,folder);
3dab1dad 3300 DEBUG_OPTIMISE_r(
6ad9a8ab 3301 Perl_re_printf( aTHX_ "%s", (char*)ch)
3dab1dad 3302 );
8e11feef
RGS
3303 }
3304 }
8bcafbf4
YO
3305 /* store the current firstchar in the bitmap */
3306 TRIE_BITMAP_SET_FOLDED(trie,*ch,folder);
6ad9a8ab 3307 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
8e11feef 3308 }
ee0dfd0b 3309 first_ofs = ofs;
8e11feef 3310 }
3dab1dad
YO
3311 }
3312 if ( count == 1 ) {
5ee57374
YO
3313 /* This state has only one transition, its transition is part
3314 * of a common prefix - we need to concatenate the char it
3315 * represents to what we have so far. */
ee0dfd0b 3316 SV **tmp = av_fetch( revcharmap, first_ofs, 0);
c490c714
YO
3317 STRLEN len;
3318 char *ch = SvPV( *tmp, len );
de734bd5
A
3319 DEBUG_OPTIMISE_r({
3320 SV *sv=sv_newmortal();
147e3846 3321 Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
cb41e5d6 3322 depth+1,
ee0dfd0b 3323 (UV)state, (UV)first_ofs,
538e84ed 3324 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
de734bd5
A
3325 PL_colors[0], PL_colors[1],
3326 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed 3327 PERL_PV_ESCAPE_FIRSTCHAR
de734bd5
A
3328 )
3329 );
3330 });
3dab1dad
YO
3331 if ( state==1 ) {
3332 OP( convert ) = nodetype;
3333 str=STRING(convert);
3334 STR_LEN(convert)=0;
3335 }
c490c714
YO
3336 STR_LEN(convert) += len;
3337 while (len--)
de734bd5 3338 *str++ = *ch++;
8e11feef 3339 } else {
538e84ed 3340#ifdef DEBUGGING
8e11feef 3341 if (state>1)
6ad9a8ab 3342 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
f9049ba1 3343#endif
8e11feef
RGS
3344 break;
3345 }
3346 }
2e64971a 3347 trie->prefixlen = (state-1);
3dab1dad 3348 if (str) {
8e11feef 3349 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 3350 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 3351 trie->startstate = state;
07be1b83
YO
3352 trie->minlen -= (state - 1);
3353 trie->maxlen -= (state - 1);
33809eae
JH
3354#ifdef DEBUGGING
3355 /* At least the UNICOS C compiler choked on this
3356 * being argument to DEBUG_r(), so let's just have
3357 * it right here. */
3358 if (
3359#ifdef PERL_EXT_RE_BUILD
3360 1
3361#else
3362 DEBUG_r_TEST
3363#endif
3364 ) {
3365 regnode *fix = convert;
3366 U32 word = trie->wordcount;
3367 mjd_nodelen++;
3368 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3369 while( ++fix < n ) {
3370 Set_Node_Offset_Length(fix, 0, 0);
3371 }
3372 while (word--) {
3373 SV ** const tmp = av_fetch( trie_words, word, 0 );
3374 if (tmp) {
3375 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3376 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3377 else
3378 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3379 }
3380 }
3381 }
3382#endif
8e11feef
RGS
3383 if (trie->maxlen) {
3384 convert = n;
3385 } else {
3dab1dad 3386 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 3387 DEBUG_r(optimize= n);
3dab1dad
YO
3388 }
3389 }
3390 }
538e84ed
KW
3391 if (!jumper)
3392 jumper = last;
3dab1dad 3393 if ( trie->maxlen ) {
8e11feef
RGS
3394 NEXT_OFF( convert ) = (U16)(tail - convert);
3395 ARG_SET( convert, data_slot );
538e84ed
KW
3396 /* Store the offset to the first unabsorbed branch in
3397 jump[0], which is otherwise unused by the jump logic.
786e8c11 3398 We use this when dumping a trie and during optimisation. */
538e84ed 3399 if (trie->jump)
7f69552c 3400 trie->jump[0] = (U16)(nextbranch - convert);
538e84ed 3401
6c48061a
YO
3402 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3403 * and there is a bitmap
3404 * and the first "jump target" node we found leaves enough room
3405 * then convert the TRIE node into a TRIEC node, with the bitmap
3406 * embedded inline in the opcode - this is hypothetically faster.
3407 */
3408 if ( !trie->states[trie->startstate].wordnum
3409 && trie->bitmap
3410 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
3411 {
3412 OP( convert ) = TRIEC;
3413 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 3414 PerlMemShared_free(trie->bitmap);
786e8c11 3415 trie->bitmap= NULL;
538e84ed 3416 } else
786e8c11 3417 OP( convert ) = TRIE;
a3621e74 3418
3dab1dad
YO
3419 /* store the type in the flags */
3420 convert->flags = nodetype;
a5ca303d 3421 DEBUG_r({
538e84ed
KW
3422 optimize = convert
3423 + NODE_STEP_REGNODE
a5ca303d
YO
3424 + regarglen[ OP( convert ) ];
3425 });
538e84ed 3426 /* XXX We really should free up the resource in trie now,
a5ca303d 3427 as we won't use them - (which resources?) dmq */
3dab1dad 3428 }
a3621e74 3429 /* needed for dumping*/
e62cc96a 3430 DEBUG_r(if (optimize) {
07be1b83 3431 regnode *opt = convert;
bcdf7404 3432
e62cc96a 3433 while ( ++opt < optimize) {
07be1b83
YO
3434 Set_Node_Offset_Length(opt,0,0);
3435 }
538e84ed
KW
3436 /*
3437 Try to clean up some of the debris left after the
786e8c11 3438 optimisation.
a3621e74 3439 */
786e8c11 3440 while( optimize < jumper ) {
07be1b83 3441 mjd_nodelen += Node_Length((optimize));
a3621e74 3442 OP( optimize ) = OPTIMIZED;
07be1b83 3443 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
3444 optimize++;
3445 }
07be1b83 3446 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
3447 });
3448 } /* end node insert */
2e64971a
DM
3449
3450 /* Finish populating the prev field of the wordinfo array. Walk back
3451 * from each accept state until we find another accept state, and if
3452 * so, point the first word's .prev field at the second word. If the
3453 * second already has a .prev field set, stop now. This will be the
3454 * case either if we've already processed that word's accept state,
3b753521
FN
3455 * or that state had multiple words, and the overspill words were
3456 * already linked up earlier.
2e64971a
DM
3457 */
3458 {
3459 U16 word;
3460 U32 state;
3461 U16 prev;
3462
3463 for (word=1; word <= trie->wordcount; word++) {
3464 prev = 0;
3465 if (trie->wordinfo[word].prev)
3466 continue;
3467 state = trie->wordinfo[word].accept;
3468 while (state) {
3469 state = prev_states[state];
3470 if (!state)
3471 break;
3472 prev = trie->states[state].wordnum;
3473 if (prev)
3474 break;
3475 }
3476 trie->wordinfo[word].prev = prev;