This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
For character case changing, create macros and use
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
afda64e8 84EXTERN_C const struct regexp_engine my_reg_engine;
54df2634
NC
85#else
86# include "regcomp.h"
87#endif
a687059c 88
f7e03a10 89#include "dquote_inline.h"
b992490d 90#include "invlist_inline.h"
1b0f46bf 91#include "unicode_constants.h"
04e98a4d 92
538e84ed
KW
93#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
94 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
f12c0118
KW
95#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
96 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 97#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 98#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 99
a687059c
LW
100#ifndef STATIC
101#define STATIC static
102#endif
103
3f910e62
YO
104/* this is a chain of data about sub patterns we are processing that
105 need to be handled separately/specially in study_chunk. Its so
106 we can simulate recursion without losing state. */
107struct scan_frame;
108typedef struct scan_frame {
109 regnode *last_regnode; /* last node to process in this frame */
110 regnode *next_regnode; /* next node to process when last is reached */
111 U32 prev_recursed_depth;
112 I32 stopparen; /* what stopparen do we use */
113 U32 is_top_frame; /* what flags do we use? */
114
115 struct scan_frame *this_prev_frame; /* this previous frame */
116 struct scan_frame *prev_frame; /* previous frame */
117 struct scan_frame *next_frame; /* next frame */
118} scan_frame;
119
f4ae5a27
KW
120/* Certain characters are output as a sequence with the first being a
121 * backslash. */
122#define isBACKSLASHED_PUNCT(c) \
123 ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
124
125
09b2b2e6 126struct RExC_state_t {
514a91f1
DM
127 U32 flags; /* RXf_* are we folding, multilining? */
128 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 129 char *precomp; /* uncompiled string. */
711b303b 130 char *precomp_end; /* pointer to end of uncompiled string. */
288b8c02 131 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf 132 regexp *rx; /* perl core regexp structure */
538e84ed
KW
133 regexp_internal *rxi; /* internal data for regexp object
134 pprivate field */
fac92740 135 char *start; /* Start of input for compile */
830247a4
IZ
136 char *end; /* End of input for compile */
137 char *parse; /* Input-scan pointer. */
285b5ca0
KW
138 char *adjusted_start; /* 'start', adjusted. See code use */
139 STRLEN precomp_adj; /* an offset beyond precomp. See code use */
ea3daa5d 140 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 141 regnode *emit_start; /* Start of emitted-code area */
538e84ed
KW
142 regnode *emit_bound; /* First regnode outside of the
143 allocated space */
f7c7e32a
DM
144 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
145 implies compiling, so don't emit */
9a81a976
KW
146 regnode_ssc emit_dummy; /* placeholder for emit to point to;
147 large enough for the largest
148 non-EXACTish node, so can use it as
149 scratch in pass1 */
830247a4
IZ
150 I32 naughty; /* How bad is this pattern? */
151 I32 sawback; /* Did we see \1, ...? */
152 U32 seen;
ea3daa5d 153 SSize_t size; /* Code size. */
538e84ed
KW
154 I32 npar; /* Capture buffer count, (OPEN) plus
155 one. ("par" 0 is the whole
156 pattern)*/
157 I32 nestroot; /* root parens we are in - used by
158 accept */
830247a4
IZ
159 I32 extralen;
160 I32 seen_zerolen;
40d049e4
YO
161 regnode **open_parens; /* pointers to open parens */
162 regnode **close_parens; /* pointers to close parens */
d5a00e4a 163 regnode *end_op; /* END node in program */
02daf0ab
YO
164 I32 utf8; /* whether the pattern is utf8 or not */
165 I32 orig_utf8; /* whether the pattern was originally in utf8 */
166 /* XXX use this for future optimisation of case
167 * where pattern must be upgraded to utf8. */
e40e74fe
KW
168 I32 uni_semantics; /* If a d charset modifier should use unicode
169 rules, even if the pattern is not in
170 utf8 */
81714fb9 171 HV *paren_names; /* Paren names */
538e84ed 172
40d049e4 173 regnode **recurse; /* Recurse regops */
d5a00e4a 174 I32 recurse_count; /* Number of recurse regops we have generated */
4286711a 175 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
538e84ed 176 through */
09a65838 177 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 178 I32 in_lookbehind;
4624b182 179 I32 contains_locale;
cfafade5 180 I32 contains_i;
bb3f3ed2 181 I32 override_recoding;
b6d67071
KW
182#ifdef EBCDIC
183 I32 recode_x_to_native;
184#endif
9d53c457 185 I32 in_multi_char_class;
3d2bd50a 186 struct reg_code_block *code_blocks; /* positions of literal (?{})
68e2671b 187 within pattern */
b1603ef8
DM
188 int num_code_blocks; /* size of code_blocks[] */
189 int code_index; /* next code_blocks[] slot */
ee273784 190 SSize_t maxlen; /* mininum possible number of chars in string to match */
3f910e62
YO
191 scan_frame *frame_head;
192 scan_frame *frame_last;
193 U32 frame_count;
7eec73eb 194 AV *warn_text;
dc6d7f5c 195#ifdef ADD_TO_REGEXEC
830247a4
IZ
196 char *starttry; /* -Dr: where regtry was called. */
197#define RExC_starttry (pRExC_state->starttry)
198#endif
d24ca0c5 199 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 200#ifdef DEBUGGING
be8e71aa 201 const char *lastparse;
3dab1dad 202 I32 lastnum;
1f1031fe 203 AV *paren_name_list; /* idx -> name */
d9a72fcc 204 U32 study_chunk_recursed_count;
c9f0d54c
YO
205 SV *mysv1;
206 SV *mysv2;
3dab1dad
YO
207#define RExC_lastparse (pRExC_state->lastparse)
208#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 209#define RExC_paren_name_list (pRExC_state->paren_name_list)
d9a72fcc 210#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
c9f0d54c
YO
211#define RExC_mysv (pRExC_state->mysv1)
212#define RExC_mysv1 (pRExC_state->mysv1)
213#define RExC_mysv2 (pRExC_state->mysv2)
214
3dab1dad 215#endif
512c0f5a 216 bool seen_unfolded_sharp_s;
911bd04e 217 bool strict;
da7cf1cc 218 bool study_started;
09b2b2e6 219};
830247a4 220
e2509266 221#define RExC_flags (pRExC_state->flags)
514a91f1 222#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 223#define RExC_precomp (pRExC_state->precomp)
285b5ca0
KW
224#define RExC_precomp_adj (pRExC_state->precomp_adj)
225#define RExC_adjusted_start (pRExC_state->adjusted_start)
711b303b 226#define RExC_precomp_end (pRExC_state->precomp_end)
288b8c02 227#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 228#define RExC_rx (pRExC_state->rx)
f8fc2ecf 229#define RExC_rxi (pRExC_state->rxi)
fac92740 230#define RExC_start (pRExC_state->start)
830247a4
IZ
231#define RExC_end (pRExC_state->end)
232#define RExC_parse (pRExC_state->parse)
233#define RExC_whilem_seen (pRExC_state->whilem_seen)
512c0f5a
KW
234
235/* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
236 * EXACTF node, hence was parsed under /di rules. If later in the parse,
237 * something forces the pattern into using /ui rules, the sharp s should be
238 * folded into the sequence 'ss', which takes up more space than previously
239 * calculated. This means that the sizing pass needs to be restarted. (The
240 * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node
241 * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
242 * so there is no need to resize [perl #125990]. */
243#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
244
7122b237 245#ifdef RE_TRACK_PATTERN_OFFSETS
538e84ed
KW
246#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
247 others */
7122b237 248#endif
830247a4 249#define RExC_emit (pRExC_state->emit)
f7c7e32a 250#define RExC_emit_dummy (pRExC_state->emit_dummy)
fac92740 251#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 252#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
253#define RExC_sawback (pRExC_state->sawback)
254#define RExC_seen (pRExC_state->seen)
255#define RExC_size (pRExC_state->size)
ee273784 256#define RExC_maxlen (pRExC_state->maxlen)
830247a4 257#define RExC_npar (pRExC_state->npar)
e2e6a0f1 258#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
259#define RExC_extralen (pRExC_state->extralen)
260#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 261#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 262#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 263#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
264#define RExC_open_parens (pRExC_state->open_parens)
265#define RExC_close_parens (pRExC_state->close_parens)
5bd2d46e 266#define RExC_end_op (pRExC_state->end_op)
81714fb9 267#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
268#define RExC_recurse (pRExC_state->recurse)
269#define RExC_recurse_count (pRExC_state->recurse_count)
09a65838 270#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
538e84ed
KW
271#define RExC_study_chunk_recursed_bytes \
272 (pRExC_state->study_chunk_recursed_bytes)
b57e4118 273#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 274#define RExC_contains_locale (pRExC_state->contains_locale)
cfafade5 275#define RExC_contains_i (pRExC_state->contains_i)
9d53c457 276#define RExC_override_recoding (pRExC_state->override_recoding)
b6d67071
KW
277#ifdef EBCDIC
278# define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
279#endif
9d53c457 280#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
3f910e62
YO
281#define RExC_frame_head (pRExC_state->frame_head)
282#define RExC_frame_last (pRExC_state->frame_last)
283#define RExC_frame_count (pRExC_state->frame_count)
67cdf558 284#define RExC_strict (pRExC_state->strict)
da7cf1cc 285#define RExC_study_started (pRExC_state->study_started)
7eec73eb 286#define RExC_warn_text (pRExC_state->warn_text)
830247a4 287
99807a43
HS
288/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
289 * a flag to disable back-off on the fixed/floating substrings - if it's
290 * a high complexity pattern we assume the benefit of avoiding a full match
291 * is worth the cost of checking for the substrings even if they rarely help.
292 */
293#define RExC_naughty (pRExC_state->naughty)
294#define TOO_NAUGHTY (10)
295#define MARK_NAUGHTY(add) \
296 if (RExC_naughty < TOO_NAUGHTY) \
297 RExC_naughty += (add)
298#define MARK_NAUGHTY_EXP(exp, add) \
299 if (RExC_naughty < TOO_NAUGHTY) \
300 RExC_naughty += RExC_naughty / (exp) + (add)
cde0cee5 301
a687059c
LW
302#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
303#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
412f55bb 304 ((*s) == '{' && regcurly(s)))
a687059c
LW
305
306/*
307 * Flags to be passed up and down.
308 */
a687059c 309#define WORST 0 /* Worst case. */
a3b492c3 310#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee 311
e64f369d 312/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
313 * character. (There needs to be a case: in the switch statement in regexec.c
314 * for any node marked SIMPLE.) Note that this is not the same thing as
315 * REGNODE_SIMPLE */
fda99bee 316#define SIMPLE 0x02
e64f369d 317#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
318#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
319#define TRYAGAIN 0x10 /* Weeded out a declaration. */
b97943f7
KW
320#define RESTART_PASS1 0x20 /* Need to restart sizing pass */
321#define NEED_UTF8 0x40 /* In conjunction with RESTART_PASS1, need to
322 calcuate sizes as UTF-8 */
a687059c 323
3dab1dad
YO
324#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
325
07be1b83
YO
326/* whether trie related optimizations are enabled */
327#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
328#define TRIE_STUDY_OPT
786e8c11 329#define FULL_TRIE_STUDY
07be1b83
YO
330#define TRIE_STCLASS
331#endif
1de06328
YO
332
333
40d049e4
YO
334
335#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
336#define PBITVAL(paren) (1 << ((paren) & 7))
337#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
338#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
339#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
340
82a6ada4 341#define REQUIRE_UTF8(flagp) STMT_START { \
8d9c2815 342 if (!UTF) { \
82a6ada4
KW
343 assert(PASS1); \
344 *flagp = RESTART_PASS1|NEED_UTF8; \
8d9c2815
NC
345 return NULL; \
346 } \
82a6ada4 347 } STMT_END
40d049e4 348
512c0f5a
KW
349/* Change from /d into /u rules, and restart the parse if we've already seen
350 * something whose size would increase as a result, by setting *flagp and
351 * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates
352 * we've change to /u during the parse. */
353#define REQUIRE_UNI_RULES(flagp, restart_retval) \
354 STMT_START { \
355 if (DEPENDS_SEMANTICS) { \
356 assert(PASS1); \
357 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
358 RExC_uni_semantics = 1; \
359 if (RExC_seen_unfolded_sharp_s) { \
360 *flagp |= RESTART_PASS1; \
361 return restart_retval; \
362 } \
363 } \
364 } STMT_END
365
f19b1a63
KW
366/* This converts the named class defined in regcomp.h to its equivalent class
367 * number defined in handy.h. */
368#define namedclass_to_classnum(class) ((int) ((class) / 2))
369#define classnum_to_namedclass(classnum) ((classnum) * 2)
370
de92f5e6
KW
371#define _invlist_union_complement_2nd(a, b, output) \
372 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
373#define _invlist_intersection_complement_2nd(a, b, output) \
374 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
375
1de06328
YO
376/* About scan_data_t.
377
378 During optimisation we recurse through the regexp program performing
379 various inplace (keyhole style) optimisations. In addition study_chunk
380 and scan_commit populate this data structure with information about
538e84ed 381 what strings MUST appear in the pattern. We look for the longest
3b753521 382 string that must appear at a fixed location, and we look for the
1de06328
YO
383 longest string that may appear at a floating location. So for instance
384 in the pattern:
538e84ed 385
1de06328 386 /FOO[xX]A.*B[xX]BAR/
538e84ed 387
1de06328
YO
388 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
389 strings (because they follow a .* construct). study_chunk will identify
390 both FOO and BAR as being the longest fixed and floating strings respectively.
538e84ed 391
1de06328 392 The strings can be composites, for instance
538e84ed 393
1de06328 394 /(f)(o)(o)/
538e84ed 395
1de06328 396 will result in a composite fixed substring 'foo'.
538e84ed 397
1de06328 398 For each string some basic information is maintained:
538e84ed 399
1de06328
YO
400 - offset or min_offset
401 This is the position the string must appear at, or not before.
402 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
403 characters must match before the string we are searching for.
404 Likewise when combined with minlenp and the length of the string it
538e84ed 405 tells us how many characters must appear after the string we have
1de06328 406 found.
538e84ed 407
1de06328
YO
408 - max_offset
409 Only used for floating strings. This is the rightmost point that
ea3daa5d 410 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328 411 string can occur infinitely far to the right.
538e84ed 412
1de06328 413 - minlenp
2d608413
KW
414 A pointer to the minimum number of characters of the pattern that the
415 string was found inside. This is important as in the case of positive
538e84ed 416 lookahead or positive lookbehind we can have multiple patterns
1de06328 417 involved. Consider
538e84ed 418
1de06328 419 /(?=FOO).*F/
538e84ed 420
1de06328
YO
421 The minimum length of the pattern overall is 3, the minimum length
422 of the lookahead part is 3, but the minimum length of the part that
538e84ed 423 will actually match is 1. So 'FOO's minimum length is 3, but the
1de06328 424 minimum length for the F is 1. This is important as the minimum length
538e84ed 425 is used to determine offsets in front of and behind the string being
1de06328 426 looked for. Since strings can be composites this is the length of the
486ec47a 427 pattern at the time it was committed with a scan_commit. Note that
1de06328 428 the length is calculated by study_chunk, so that the minimum lengths
538e84ed 429 are not known until the full pattern has been compiled, thus the
1de06328 430 pointer to the value.
538e84ed 431
1de06328 432 - lookbehind
538e84ed 433
1de06328 434 In the case of lookbehind the string being searched for can be
538e84ed 435 offset past the start point of the final matching string.
1de06328
YO
436 If this value was just blithely removed from the min_offset it would
437 invalidate some of the calculations for how many chars must match
438 before or after (as they are derived from min_offset and minlen and
538e84ed 439 the length of the string being searched for).
1de06328
YO
440 When the final pattern is compiled and the data is moved from the
441 scan_data_t structure into the regexp structure the information
538e84ed
KW
442 about lookbehind is factored in, with the information that would
443 have been lost precalculated in the end_shift field for the
1de06328
YO
444 associated string.
445
446 The fields pos_min and pos_delta are used to store the minimum offset
538e84ed 447 and the delta to the maximum offset at the current point in the pattern.
1de06328
YO
448
449*/
2c2d71f5
JH
450
451typedef struct scan_data_t {
1de06328
YO
452 /*I32 len_min; unused */
453 /*I32 len_delta; unused */
49f55535 454 SSize_t pos_min;
ea3daa5d 455 SSize_t pos_delta;
2c2d71f5 456 SV *last_found;
ea3daa5d 457 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 458 SSize_t last_start_min;
ea3daa5d 459 SSize_t last_start_max;
1de06328
YO
460 SV **longest; /* Either &l_fixed, or &l_float. */
461 SV *longest_fixed; /* longest fixed string found in pattern */
49f55535 462 SSize_t offset_fixed; /* offset where it starts */
ea3daa5d 463 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
464 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
465 SV *longest_float; /* longest floating string found in pattern */
49f55535 466 SSize_t offset_float_min; /* earliest point in string it can appear */
ea3daa5d
FC
467 SSize_t offset_float_max; /* latest point in string it can appear */
468 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
49f55535 469 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
2c2d71f5
JH
470 I32 flags;
471 I32 whilem_c;
49f55535 472 SSize_t *last_closep;
b8f7bb16 473 regnode_ssc *start_class;
2c2d71f5
JH
474} scan_data_t;
475
a687059c 476/*
e50aee73 477 * Forward declarations for pregcomp()'s friends.
a687059c 478 */
a0d0e21e 479
27da23d5 480static const scan_data_t zero_scan_data =
1de06328 481 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
482
483#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
484#define SF_BEFORE_SEOL 0x0001
485#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
486#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
487#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
488
44e3dfd2
BF
489#define SF_FIX_SHIFT_EOL (+2)
490#define SF_FL_SHIFT_EOL (+4)
c277df42
IZ
491
492#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
493#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
494
495#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
496#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
497#define SF_IS_INF 0x0040
498#define SF_HAS_PAR 0x0080
499#define SF_IN_PAR 0x0100
500#define SF_HAS_EVAL 0x0200
bc604ad8
KW
501
502
503/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
504 * longest substring in the pattern. When it is not set the optimiser keeps
505 * track of position, but does not keep track of the actual strings seen,
506 *
507 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
508 * /foo/i will not.
509 *
510 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
511 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
512 * turned off because of the alternation (BRANCH). */
07be1b83 513#define SCF_DO_SUBSTR 0x0400
bc604ad8 514
653099ff
GS
515#define SCF_DO_STCLASS_AND 0x0800
516#define SCF_DO_STCLASS_OR 0x1000
517#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 518#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 519
786e8c11 520#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
538e84ed 521#define SCF_SEEN_ACCEPT 0x8000
688e0391 522#define SCF_TRIE_DOING_RESTUDY 0x10000
4286711a
YO
523#define SCF_IN_DEFINE 0x20000
524
525
526
07be1b83 527
43fead97 528#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
529
530/* The enums for all these are ordered so things work out correctly */
a62b1201 531#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
538e84ed
KW
532#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
533 == REGEX_DEPENDS_CHARSET)
00b27cfc 534#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
538e84ed
KW
535#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
536 >= REGEX_UNICODE_CHARSET)
537#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
538 == REGEX_ASCII_RESTRICTED_CHARSET)
539#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
540 >= REGEX_ASCII_RESTRICTED_CHARSET)
541#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
542 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 543
43fead97 544#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 545
f2c2a6ab
KW
546/* For programs that want to be strictly Unicode compatible by dying if any
547 * attempt is made to match a non-Unicode code point against a Unicode
548 * property. */
549#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
550
93733859 551#define OOB_NAMEDCLASS -1
b8c5462f 552
8e661ac5
KW
553/* There is no code point that is out-of-bounds, so this is problematic. But
554 * its only current use is to initialize a variable that is always set before
555 * looked at. */
556#define OOB_UNICODE 0xDEADBEEF
557
a0ed51b3
LW
558#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
559#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
560
8615cb43 561
b45f050a
JF
562/* length of regex to show in messages that don't mark a position within */
563#define RegexLengthToShowInErrorMessages 127
564
565/*
566 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
567 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
568 * op/pragma/warn/regcomp.
569 */
7253e4e3
RK
570#define MARKER1 "<-- HERE" /* marker as it appears in the description */
571#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 572
538e84ed 573#define REPORT_LOCATION " in regex; marked by " MARKER1 \
147e3846 574 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
b45f050a 575
285b5ca0
KW
576/* The code in this file in places uses one level of recursion with parsing
577 * rebased to an alternate string constructed by us in memory. This can take
578 * the form of something that is completely different from the input, or
579 * something that uses the input as part of the alternate. In the first case,
580 * there should be no possibility of an error, as we are in complete control of
581 * the alternate string. But in the second case we don't control the input
582 * portion, so there may be errors in that. Here's an example:
583 * /[abc\x{DF}def]/ui
584 * is handled specially because \x{df} folds to a sequence of more than one
585 * character, 'ss'. What is done is to create and parse an alternate string,
586 * which looks like this:
587 * /(?:\x{DF}|[abc\x{DF}def])/ui
588 * where it uses the input unchanged in the middle of something it constructs,
589 * which is a branch for the DF outside the character class, and clustering
590 * parens around the whole thing. (It knows enough to skip the DF inside the
591 * class while in this substitute parse.) 'abc' and 'def' may have errors that
592 * need to be reported. The general situation looks like this:
593 *
594 * sI tI xI eI
595 * Input: ----------------------------------------------------
596 * Constructed: ---------------------------------------------------
597 * sC tC xC eC EC
598 *
599 * The input string sI..eI is the input pattern. The string sC..EC is the
600 * constructed substitute parse string. The portions sC..tC and eC..EC are
601 * constructed by us. The portion tC..eC is an exact duplicate of the input
602 * pattern tI..eI. In the diagram, these are vertically aligned. Suppose that
603 * while parsing, we find an error at xC. We want to display a message showing
604 * the real input string. Thus we need to find the point xI in it which
605 * corresponds to xC. xC >= tC, since the portion of the string sC..tC has
606 * been constructed by us, and so shouldn't have errors. We get:
607 *
608 * xI = sI + (tI - sI) + (xC - tC)
609 *
610 * and, the offset into sI is:
611 *
612 * (xI - sI) = (tI - sI) + (xC - tC)
613 *
614 * When the substitute is constructed, we save (tI -sI) as RExC_precomp_adj,
615 * and we save tC as RExC_adjusted_start.
903c858a
KW
616 *
617 * During normal processing of the input pattern, everything points to that,
618 * with RExC_precomp_adj set to 0, and RExC_adjusted_start set to sI.
285b5ca0
KW
619 */
620
621#define tI_sI RExC_precomp_adj
622#define tC RExC_adjusted_start
623#define sC RExC_precomp
624#define xI_offset(xC) ((IV) (tI_sI + (xC - tC)))
625#define xI(xC) (sC + xI_offset(xC))
626#define eC RExC_precomp_end
627
628#define REPORT_LOCATION_ARGS(xC) \
629 UTF8fARG(UTF, \
630 (xI(xC) > eC) /* Don't run off end */ \
631 ? eC - sC /* Length before the <--HERE */ \
632 : xI_offset(xC), \
633 sC), /* The input pattern printed up to the <--HERE */ \
634 UTF8fARG(UTF, \
635 (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */ \
636 (xI(xC) > eC) ? eC : xI(xC)) /* pattern after <--HERE */
c1d900c3 637
8a6d8ec6
HS
638/* Used to point after bad bytes for an error message, but avoid skipping
639 * past a nul byte. */
640#define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
641
b45f050a
JF
642/*
643 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
644 * arg. Show regex, up to a maximum length. If it's too long, chop and add
645 * "...".
646 */
58e23c8d 647#define _FAIL(code) STMT_START { \
bfed75c6 648 const char *ellipses = ""; \
711b303b 649 IV len = RExC_precomp_end - RExC_precomp; \
ccb2c380
MP
650 \
651 if (!SIZE_ONLY) \
a5e7bc51 652 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
653 if (len > RegexLengthToShowInErrorMessages) { \
654 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
655 len = RegexLengthToShowInErrorMessages - 10; \
656 ellipses = "..."; \
657 } \
58e23c8d 658 code; \
ccb2c380 659} STMT_END
8615cb43 660
58e23c8d 661#define FAIL(msg) _FAIL( \
147e3846 662 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
c1d900c3 663 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
664
665#define FAIL2(msg,arg) _FAIL( \
147e3846 666 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
c1d900c3 667 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 668
b45f050a 669/*
b45f050a
JF
670 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
671 */
ccb2c380 672#define Simple_vFAIL(m) STMT_START { \
ccb2c380 673 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
d528642a 674 m, REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 675} STMT_END
b45f050a
JF
676
677/*
678 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
679 */
ccb2c380
MP
680#define vFAIL(m) STMT_START { \
681 if (!SIZE_ONLY) \
a5e7bc51 682 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
683 Simple_vFAIL(m); \
684} STMT_END
b45f050a
JF
685
686/*
687 * Like Simple_vFAIL(), but accepts two arguments.
688 */
ccb2c380 689#define Simple_vFAIL2(m,a1) STMT_START { \
d528642a
KW
690 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
691 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 692} STMT_END
b45f050a
JF
693
694/*
695 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
696 */
ccb2c380
MP
697#define vFAIL2(m,a1) STMT_START { \
698 if (!SIZE_ONLY) \
a5e7bc51 699 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
700 Simple_vFAIL2(m, a1); \
701} STMT_END
b45f050a
JF
702
703
704/*
705 * Like Simple_vFAIL(), but accepts three arguments.
706 */
ccb2c380 707#define Simple_vFAIL3(m, a1, a2) STMT_START { \
c1d900c3 708 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 709 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 710} STMT_END
b45f050a
JF
711
712/*
713 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
714 */
ccb2c380
MP
715#define vFAIL3(m,a1,a2) STMT_START { \
716 if (!SIZE_ONLY) \
a5e7bc51 717 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
718 Simple_vFAIL3(m, a1, a2); \
719} STMT_END
b45f050a
JF
720
721/*
722 * Like Simple_vFAIL(), but accepts four arguments.
723 */
ccb2c380 724#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
d528642a
KW
725 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
726 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 727} STMT_END
b45f050a 728
95db3ffa
KW
729#define vFAIL4(m,a1,a2,a3) STMT_START { \
730 if (!SIZE_ONLY) \
731 SAVEFREESV(RExC_rx_sv); \
732 Simple_vFAIL4(m, a1, a2, a3); \
733} STMT_END
734
946095af 735/* A specialized version of vFAIL2 that works with UTF8f */
d528642a
KW
736#define vFAIL2utf8f(m, a1) STMT_START { \
737 if (!SIZE_ONLY) \
738 SAVEFREESV(RExC_rx_sv); \
739 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
740 REPORT_LOCATION_ARGS(RExC_parse)); \
946095af
BF
741} STMT_END
742
3ba22297 743#define vFAIL3utf8f(m, a1, a2) STMT_START { \
3ba22297
KW
744 if (!SIZE_ONLY) \
745 SAVEFREESV(RExC_rx_sv); \
746 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 747 REPORT_LOCATION_ARGS(RExC_parse)); \
3ba22297
KW
748} STMT_END
749
499333dc
KW
750/* These have asserts in them because of [perl #122671] Many warnings in
751 * regcomp.c can occur twice. If they get output in pass1 and later in that
752 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
753 * would get output again. So they should be output in pass2, and these
754 * asserts make sure new warnings follow that paradigm. */
946095af 755
5e0a247b
KW
756/* m is not necessarily a "literal string", in this macro */
757#define reg_warn_non_literal_string(loc, m) STMT_START { \
d528642a
KW
758 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
759 "%s" REPORT_LOCATION, \
760 m, REPORT_LOCATION_ARGS(loc)); \
5e0a247b
KW
761} STMT_END
762
668c081a 763#define ckWARNreg(loc,m) STMT_START { \
d528642a
KW
764 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
765 m REPORT_LOCATION, \
766 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
767} STMT_END
768
b927b7e9 769#define vWARN(loc, m) STMT_START { \
d528642a
KW
770 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
771 m REPORT_LOCATION, \
772 REPORT_LOCATION_ARGS(loc)); \
b927b7e9
KW
773} STMT_END
774
0d6106aa 775#define vWARN_dep(loc, m) STMT_START { \
d528642a
KW
776 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
777 m REPORT_LOCATION, \
778 REPORT_LOCATION_ARGS(loc)); \
0d6106aa
KW
779} STMT_END
780
147508a2 781#define ckWARNdep(loc,m) STMT_START { \
d528642a
KW
782 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
783 m REPORT_LOCATION, \
784 REPORT_LOCATION_ARGS(loc)); \
147508a2
KW
785} STMT_END
786
d528642a
KW
787#define ckWARNregdep(loc,m) STMT_START { \
788 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
789 WARN_REGEXP), \
790 m REPORT_LOCATION, \
791 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
792} STMT_END
793
d528642a
KW
794#define ckWARN2reg_d(loc,m, a1) STMT_START { \
795 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
796 m REPORT_LOCATION, \
797 a1, REPORT_LOCATION_ARGS(loc)); \
2335b3d3
KW
798} STMT_END
799
d528642a
KW
800#define ckWARN2reg(loc, m, a1) STMT_START { \
801 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
802 m REPORT_LOCATION, \
803 a1, REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
804} STMT_END
805
d528642a
KW
806#define vWARN3(loc, m, a1, a2) STMT_START { \
807 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
808 m REPORT_LOCATION, \
809 a1, a2, REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
810} STMT_END
811
d528642a
KW
812#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
813 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
814 m REPORT_LOCATION, \
815 a1, a2, \
816 REPORT_LOCATION_ARGS(loc)); \
668c081a
NC
817} STMT_END
818
ccb2c380 819#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
d528642a
KW
820 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
821 m REPORT_LOCATION, \
822 a1, a2, a3, \
823 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
824} STMT_END
825
668c081a 826#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
d528642a
KW
827 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
828 m REPORT_LOCATION, \
829 a1, a2, a3, \
830 REPORT_LOCATION_ARGS(loc)); \
668c081a
NC
831} STMT_END
832
ccb2c380 833#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
d528642a
KW
834 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
835 m REPORT_LOCATION, \
836 a1, a2, a3, a4, \
837 REPORT_LOCATION_ARGS(loc)); \
ccb2c380 838} STMT_END
9d1d55b5 839
538e84ed 840/* Macros for recording node offsets. 20001227 mjd@plover.com
fac92740
MJD
841 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
842 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
843 * Element 0 holds the number n.
07be1b83 844 * Position is 1 indexed.
fac92740 845 */
7122b237
YO
846#ifndef RE_TRACK_PATTERN_OFFSETS
847#define Set_Node_Offset_To_R(node,byte)
848#define Set_Node_Offset(node,byte)
849#define Set_Cur_Node_Offset
850#define Set_Node_Length_To_R(node,len)
851#define Set_Node_Length(node,len)
6a86c6ad 852#define Set_Node_Cur_Length(node,start)
538e84ed
KW
853#define Node_Offset(n)
854#define Node_Length(n)
7122b237
YO
855#define Set_Node_Offset_Length(node,offset,len)
856#define ProgLen(ri) ri->u.proglen
857#define SetProgLen(ri,x) ri->u.proglen = x
858#else
859#define ProgLen(ri) ri->u.offsets[0]
860#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
861#define Set_Node_Offset_To_R(node,byte) STMT_START { \
862 if (! SIZE_ONLY) { \
863 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 864 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 865 if((node) < 0) { \
538e84ed
KW
866 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
867 (int)(node)); \
ccb2c380
MP
868 } else { \
869 RExC_offsets[2*(node)-1] = (byte); \
870 } \
871 } \
872} STMT_END
873
874#define Set_Node_Offset(node,byte) \
875 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
876#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
877
878#define Set_Node_Length_To_R(node,len) STMT_START { \
879 if (! SIZE_ONLY) { \
880 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 881 __LINE__, (int)(node), (int)(len))); \
ccb2c380 882 if((node) < 0) { \
538e84ed
KW
883 Perl_croak(aTHX_ "value of node is %d in Length macro", \
884 (int)(node)); \
ccb2c380
MP
885 } else { \
886 RExC_offsets[2*(node)] = (len); \
887 } \
888 } \
889} STMT_END
890
891#define Set_Node_Length(node,len) \
892 Set_Node_Length_To_R((node)-RExC_emit_start, len)
6a86c6ad
NC
893#define Set_Node_Cur_Length(node, start) \
894 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
895
896/* Get offsets and lengths */
897#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
898#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
899
07be1b83
YO
900#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
901 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
902 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
903} STMT_END
7122b237 904#endif
07be1b83
YO
905
906#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
907#define EXPERIMENTAL_INPLACESCAN
f427392e 908#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 909
cb41e5d6
YO
910#ifdef DEBUGGING
911int
6ad9a8ab 912Perl_re_printf(pTHX_ const char *fmt, ...)
cb41e5d6
YO
913{
914 va_list ap;
915 int result;
916 PerlIO *f= Perl_debug_log;
917 PERL_ARGS_ASSERT_RE_PRINTF;
918 va_start(ap, fmt);
919 result = PerlIO_vprintf(f, fmt, ap);
920 va_end(ap);
921 return result;
922}
923
924int
7b031478 925Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
cb41e5d6
YO
926{
927 va_list ap;
928 int result;
929 PerlIO *f= Perl_debug_log;
930 PERL_ARGS_ASSERT_RE_INDENTF;
931 va_start(ap, depth);
daeb874b 932 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
cb41e5d6
YO
933 result = PerlIO_vprintf(f, fmt, ap);
934 va_end(ap);
935 return result;
936}
937#endif /* DEBUGGING */
938
7b031478 939#define DEBUG_RExC_seen() \
538e84ed 940 DEBUG_OPTIMISE_MORE_r({ \
6ad9a8ab 941 Perl_re_printf( aTHX_ "RExC_seen: "); \
538e84ed 942 \
e384d5c1 943 if (RExC_seen & REG_ZERO_LEN_SEEN) \
6ad9a8ab 944 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
538e84ed 945 \
e384d5c1 946 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
6ad9a8ab 947 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
538e84ed 948 \
e384d5c1 949 if (RExC_seen & REG_GPOS_SEEN) \
6ad9a8ab 950 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
538e84ed 951 \
e384d5c1 952 if (RExC_seen & REG_RECURSE_SEEN) \
6ad9a8ab 953 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
538e84ed 954 \
7b031478 955 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
6ad9a8ab 956 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
538e84ed 957 \
e384d5c1 958 if (RExC_seen & REG_VERBARG_SEEN) \
6ad9a8ab 959 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
538e84ed 960 \
e384d5c1 961 if (RExC_seen & REG_CUTGROUP_SEEN) \
6ad9a8ab 962 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
538e84ed 963 \
e384d5c1 964 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
6ad9a8ab 965 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
538e84ed 966 \
e384d5c1 967 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
6ad9a8ab 968 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
538e84ed 969 \
7b031478 970 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
6ad9a8ab 971 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
ee273784 972 \
6ad9a8ab 973 Perl_re_printf( aTHX_ "\n"); \
9e9ecfdf
YO
974 });
975
fdfb4f21 976#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
6ad9a8ab 977 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
fdfb4f21
YO
978
979#define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \
980 if ( ( flags ) ) { \
6ad9a8ab 981 Perl_re_printf( aTHX_ "%s", open_str); \
fdfb4f21
YO
982 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \
983 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \
984 DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \
985 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \
986 DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \
987 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \
988 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \
989 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \
990 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \
991 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \
992 DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \
993 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \
994 DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \
995 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \
996 DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \
6ad9a8ab 997 Perl_re_printf( aTHX_ "%s", close_str); \
fdfb4f21
YO
998 }
999
1000
304ee84b
YO
1001#define DEBUG_STUDYDATA(str,data,depth) \
1002DEBUG_OPTIMISE_MORE_r(if(data){ \
147e3846
KW
1003 Perl_re_indentf( aTHX_ "" str "Pos:%" IVdf "/%" IVdf \
1004 " Flags: 0x%" UVXf, \
7b031478 1005 depth, \
1de06328
YO
1006 (IV)((data)->pos_min), \
1007 (IV)((data)->pos_delta), \
fdfb4f21
YO
1008 (UV)((data)->flags) \
1009 ); \
1010 DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \
147e3846
KW
1011 Perl_re_printf( aTHX_ \
1012 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s", \
1de06328 1013 (IV)((data)->whilem_c), \
304ee84b
YO
1014 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
1015 is_inf ? "INF " : "" \
1de06328
YO
1016 ); \
1017 if ((data)->last_found) \
147e3846
KW
1018 Perl_re_printf( aTHX_ \
1019 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf \
1020 " %sFixed:'%s' @ %" IVdf \
1021 " %sFloat: '%s' @ %" IVdf "/%" IVdf, \
1de06328
YO
1022 SvPVX_const((data)->last_found), \
1023 (IV)((data)->last_end), \
1024 (IV)((data)->last_start_min), \
1025 (IV)((data)->last_start_max), \
1026 ((data)->longest && \
1027 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
1028 SvPVX_const((data)->longest_fixed), \
1029 (IV)((data)->offset_fixed), \
1030 ((data)->longest && \
1031 (data)->longest==&((data)->longest_float)) ? "*" : "", \
1032 SvPVX_const((data)->longest_float), \
1033 (IV)((data)->offset_float_min), \
1034 (IV)((data)->offset_float_max) \
1035 ); \
6ad9a8ab 1036 Perl_re_printf( aTHX_ "\n"); \
1de06328
YO
1037});
1038
cb41e5d6 1039
c6871b76
KW
1040/* =========================================================
1041 * BEGIN edit_distance stuff.
1042 *
1043 * This calculates how many single character changes of any type are needed to
1044 * transform a string into another one. It is taken from version 3.1 of
1045 *
1046 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1047 */
1048
1049/* Our unsorted dictionary linked list. */
1050/* Note we use UVs, not chars. */
1051
1052struct dictionary{
1053 UV key;
1054 UV value;
1055 struct dictionary* next;
1056};
1057typedef struct dictionary item;
1058
1059
1060PERL_STATIC_INLINE item*
1061push(UV key,item* curr)
1062{
1063 item* head;
1064 Newxz(head, 1, item);
1065 head->key = key;
1066 head->value = 0;
1067 head->next = curr;
1068 return head;
1069}
1070
1071
1072PERL_STATIC_INLINE item*
1073find(item* head, UV key)
1074{
1075 item* iterator = head;
1076 while (iterator){
1077 if (iterator->key == key){
1078 return iterator;
1079 }
1080 iterator = iterator->next;
1081 }
1082
1083 return NULL;
1084}
1085
1086PERL_STATIC_INLINE item*
1087uniquePush(item* head,UV key)
1088{
1089 item* iterator = head;
1090
1091 while (iterator){
1092 if (iterator->key == key) {
1093 return head;
1094 }
1095 iterator = iterator->next;
1096 }
1097
1098 return push(key,head);
1099}
1100
1101PERL_STATIC_INLINE void
1102dict_free(item* head)
1103{
1104 item* iterator = head;
1105
1106 while (iterator) {
1107 item* temp = iterator;
1108 iterator = iterator->next;
1109 Safefree(temp);
1110 }
1111
1112 head = NULL;
1113}
1114
1115/* End of Dictionary Stuff */
1116
1117/* All calculations/work are done here */
1118STATIC int
1119S_edit_distance(const UV* src,
1120 const UV* tgt,
1121 const STRLEN x, /* length of src[] */
1122 const STRLEN y, /* length of tgt[] */
1123 const SSize_t maxDistance
1124)
1125{
1126 item *head = NULL;
1127 UV swapCount,swapScore,targetCharCount,i,j;
1128 UV *scores;
1129 UV score_ceil = x + y;
1130
1131 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1132
1133 /* intialize matrix start values */
1134 Newxz(scores, ( (x + 2) * (y + 2)), UV);
1135 scores[0] = score_ceil;
1136 scores[1 * (y + 2) + 0] = score_ceil;
1137 scores[0 * (y + 2) + 1] = score_ceil;
1138 scores[1 * (y + 2) + 1] = 0;
1139 head = uniquePush(uniquePush(head,src[0]),tgt[0]);
1140
1141 /* work loops */
1142 /* i = src index */
1143 /* j = tgt index */
1144 for (i=1;i<=x;i++) {
1145 if (i < x)
1146 head = uniquePush(head,src[i]);
1147 scores[(i+1) * (y + 2) + 1] = i;
1148 scores[(i+1) * (y + 2) + 0] = score_ceil;
1149 swapCount = 0;
1150
1151 for (j=1;j<=y;j++) {
1152 if (i == 1) {
1153 if(j < y)
1154 head = uniquePush(head,tgt[j]);
1155 scores[1 * (y + 2) + (j + 1)] = j;
1156 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1157 }
1158
1159 targetCharCount = find(head,tgt[j-1])->value;
1160 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1161
1162 if (src[i-1] != tgt[j-1]){
1163 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1164 }
1165 else {
1166 swapCount = j;
1167 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1168 }
1169 }
1170
1171 find(head,src[i-1])->value = i;
1172 }
1173
1174 {
1175 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1176 dict_free(head);
1177 Safefree(scores);
1178 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1179 }
1180}
1181
1182/* END of edit_distance() stuff
1183 * ========================================================= */
1184
8e35b056
KW
1185/* is c a control character for which we have a mnemonic? */
1186#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1187
549b4e78
KW
1188STATIC const char *
1189S_cntrl_to_mnemonic(const U8 c)
1190{
1191 /* Returns the mnemonic string that represents character 'c', if one
1192 * exists; NULL otherwise. The only ones that exist for the purposes of
1193 * this routine are a few control characters */
1194
1195 switch (c) {
1196 case '\a': return "\\a";
1197 case '\b': return "\\b";
1198 case ESC_NATIVE: return "\\e";
1199 case '\f': return "\\f";
1200 case '\n': return "\\n";
1201 case '\r': return "\\r";
1202 case '\t': return "\\t";
1203 }
1204
1205 return NULL;
1206}
1207
653099ff 1208/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 1209 Update the longest found anchored substring and the longest found
653099ff
GS
1210 floating substrings if needed. */
1211
4327152a 1212STATIC void
ea3daa5d
FC
1213S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1214 SSize_t *minlenp, int is_inf)
c277df42 1215{
e1ec3a88
AL
1216 const STRLEN l = CHR_SVLEN(data->last_found);
1217 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 1218 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1219
7918f24d
NC
1220 PERL_ARGS_ASSERT_SCAN_COMMIT;
1221
c277df42 1222 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 1223 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
1224 if (*data->longest == data->longest_fixed) {
1225 data->offset_fixed = l ? data->last_start_min : data->pos_min;
1226 if (data->flags & SF_BEFORE_EOL)
b81d288d 1227 data->flags
c277df42
IZ
1228 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
1229 else
1230 data->flags &= ~SF_FIX_BEFORE_EOL;
686b73d4 1231 data->minlen_fixed=minlenp;
1de06328 1232 data->lookbehind_fixed=0;
a0ed51b3 1233 }
304ee84b 1234 else { /* *data->longest == data->longest_float */
c277df42 1235 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d 1236 data->offset_float_max = (l
646e8787
DM
1237 ? data->last_start_max
1238 : (data->pos_delta > SSize_t_MAX - data->pos_min
ea3daa5d
FC
1239 ? SSize_t_MAX
1240 : data->pos_min + data->pos_delta));
1241 if (is_inf
1242 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
1243 data->offset_float_max = SSize_t_MAX;
c277df42 1244 if (data->flags & SF_BEFORE_EOL)
b81d288d 1245 data->flags
c277df42
IZ
1246 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
1247 else
1248 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
1249 data->minlen_float=minlenp;
1250 data->lookbehind_float=0;
c277df42
IZ
1251 }
1252 }
1253 SvCUR_set(data->last_found, 0);
0eda9292 1254 {
a28509cc 1255 SV * const sv = data->last_found;
097eb12c
AL
1256 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1257 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1258 if (mg)
1259 mg->mg_len = 0;
1260 }
0eda9292 1261 }
c277df42
IZ
1262 data->last_end = -1;
1263 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 1264 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
1265}
1266
cdd87c1d
KW
1267/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1268 * list that describes which code points it matches */
1269
653099ff 1270STATIC void
3420edd7 1271S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 1272{
cdd87c1d
KW
1273 /* Set the SSC 'ssc' to match an empty string or any code point */
1274
557bd3fb 1275 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 1276
71068078 1277 assert(is_ANYOF_SYNTHETIC(ssc));
3fffb88a 1278
0854ea0b
KW
1279 /* mortalize so won't leak */
1280 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
93e92956 1281 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
653099ff
GS
1282}
1283
653099ff 1284STATIC int
dc3bf405 1285S_ssc_is_anything(const regnode_ssc *ssc)
653099ff 1286{
c144baaa
KW
1287 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1288 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1289 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1290 * in any way, so there's no point in using it */
cdd87c1d
KW
1291
1292 UV start, end;
1293 bool ret;
653099ff 1294
557bd3fb 1295 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 1296
71068078 1297 assert(is_ANYOF_SYNTHETIC(ssc));
cdd87c1d 1298
93e92956 1299 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
cdd87c1d
KW
1300 return FALSE;
1301 }
1302
1303 /* See if the list consists solely of the range 0 - Infinity */
1304 invlist_iterinit(ssc->invlist);
1305 ret = invlist_iternext(ssc->invlist, &start, &end)
1306 && start == 0
1307 && end == UV_MAX;
1308
1309 invlist_iterfinish(ssc->invlist);
1310
1311 if (ret) {
1312 return TRUE;
1313 }
1314
1315 /* If e.g., both \w and \W are set, matches everything */
e0e1be5f 1316 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1317 int i;
1318 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1319 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1320 return TRUE;
1321 }
1322 }
1323 }
1324
1325 return FALSE;
653099ff
GS
1326}
1327
653099ff 1328STATIC void
cdd87c1d 1329S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 1330{
cdd87c1d
KW
1331 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1332 * string, any code point, or any posix class under locale */
1333
557bd3fb 1334 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 1335
557bd3fb 1336 Zero(ssc, 1, regnode_ssc);
71068078 1337 set_ANYOF_SYNTHETIC(ssc);
93e92956 1338 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
3420edd7 1339 ssc_anything(ssc);
cdd87c1d 1340
2f306ab9
KW
1341 /* If any portion of the regex is to operate under locale rules that aren't
1342 * fully known at compile time, initialization includes it. The reason
1343 * this isn't done for all regexes is that the optimizer was written under
1344 * the assumption that locale was all-or-nothing. Given the complexity and
1345 * lack of documentation in the optimizer, and that there are inadequate
1346 * test cases for locale, many parts of it may not work properly, it is
1347 * safest to avoid locale unless necessary. */
cdd87c1d
KW
1348 if (RExC_contains_locale) {
1349 ANYOF_POSIXL_SETALL(ssc);
cdd87c1d
KW
1350 }
1351 else {
1352 ANYOF_POSIXL_ZERO(ssc);
1353 }
653099ff
GS
1354}
1355
b423522f 1356STATIC int
dc3bf405
BF
1357S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1358 const regnode_ssc *ssc)
b423522f
KW
1359{
1360 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1361 * to the list of code points matched, and locale posix classes; hence does
1362 * not check its flags) */
1363
1364 UV start, end;
1365 bool ret;
1366
1367 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1368
71068078 1369 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1370
1371 invlist_iterinit(ssc->invlist);
1372 ret = invlist_iternext(ssc->invlist, &start, &end)
1373 && start == 0
1374 && end == UV_MAX;
1375
1376 invlist_iterfinish(ssc->invlist);
1377
1378 if (! ret) {
1379 return FALSE;
1380 }
1381
e0e1be5f 1382 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
31f05a37 1383 return FALSE;
b423522f
KW
1384 }
1385
1386 return TRUE;
1387}
1388
1389STATIC SV*
1390S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
5c0f85ef 1391 const regnode_charclass* const node)
b423522f
KW
1392{
1393 /* Returns a mortal inversion list defining which code points are matched
1394 * by 'node', which is of type ANYOF. Handles complementing the result if
1395 * appropriate. If some code points aren't knowable at this time, the
31f05a37
KW
1396 * returned list must, and will, contain every code point that is a
1397 * possibility. */
b423522f 1398
e2506fa7 1399 SV* invlist = NULL;
1ee208c4 1400 SV* only_utf8_locale_invlist = NULL;
b423522f
KW
1401 unsigned int i;
1402 const U32 n = ARG(node);
31f05a37 1403 bool new_node_has_latin1 = FALSE;
b423522f
KW
1404
1405 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1406
1407 /* Look at the data structure created by S_set_ANYOF_arg() */
93e92956 1408 if (n != ANYOF_ONLY_HAS_BITMAP) {
b423522f
KW
1409 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1410 AV * const av = MUTABLE_AV(SvRV(rv));
1411 SV **const ary = AvARRAY(av);
1412 assert(RExC_rxi->data->what[n] == 's');
1413
1414 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1415 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1416 }
1417 else if (ary[0] && ary[0] != &PL_sv_undef) {
1418
1419 /* Here, no compile-time swash, and there are things that won't be
1420 * known until runtime -- we have to assume it could be anything */
e2506fa7 1421 invlist = sv_2mortal(_new_invlist(1));
b423522f
KW
1422 return _add_range_to_invlist(invlist, 0, UV_MAX);
1423 }
1ee208c4 1424 else if (ary[3] && ary[3] != &PL_sv_undef) {
b423522f
KW
1425
1426 /* Here no compile-time swash, and no run-time only data. Use the
1427 * node's inversion list */
1ee208c4
KW
1428 invlist = sv_2mortal(invlist_clone(ary[3]));
1429 }
1430
1431 /* Get the code points valid only under UTF-8 locales */
037715a6 1432 if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
1ee208c4
KW
1433 && ary[2] && ary[2] != &PL_sv_undef)
1434 {
1435 only_utf8_locale_invlist = ary[2];
b423522f
KW
1436 }
1437 }
1438
e2506fa7
KW
1439 if (! invlist) {
1440 invlist = sv_2mortal(_new_invlist(0));
1441 }
1442
dcb20b36
KW
1443 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1444 * code points, and an inversion list for the others, but if there are code
1445 * points that should match only conditionally on the target string being
1446 * UTF-8, those are placed in the inversion list, and not the bitmap.
1447 * Since there are circumstances under which they could match, they are
1448 * included in the SSC. But if the ANYOF node is to be inverted, we have
1449 * to exclude them here, so that when we invert below, the end result
1450 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1451 * have to do this here before we add the unconditionally matched code
1452 * points */
b423522f
KW
1453 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1454 _invlist_intersection_complement_2nd(invlist,
1455 PL_UpperLatin1,
1456 &invlist);
1457 }
1458
1459 /* Add in the points from the bit map */
dcb20b36 1460 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
b423522f 1461 if (ANYOF_BITMAP_TEST(node, i)) {
6f8848d5
TC
1462 unsigned int start = i++;
1463
1464 for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
1465 /* empty */
1466 }
1467 invlist = _add_range_to_invlist(invlist, start, i-1);
31f05a37 1468 new_node_has_latin1 = TRUE;
b423522f
KW
1469 }
1470 }
1471
1472 /* If this can match all upper Latin1 code points, have to add them
ac33c516
KW
1473 * as well. But don't add them if inverting, as when that gets done below,
1474 * it would exclude all these characters, including the ones it shouldn't
1475 * that were added just above */
1476 if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
f240c685
KW
1477 && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1478 {
b423522f
KW
1479 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1480 }
1481
1482 /* Similarly for these */
93e92956 1483 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
e0a1ff7a 1484 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
b423522f
KW
1485 }
1486
1487 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1488 _invlist_invert(invlist);
1489 }
037715a6 1490 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
31f05a37
KW
1491
1492 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1493 * locale. We can skip this if there are no 0-255 at all. */
1494 _invlist_union(invlist, PL_Latin1, &invlist);
1495 }
1496
1ee208c4
KW
1497 /* Similarly add the UTF-8 locale possible matches. These have to be
1498 * deferred until after the non-UTF-8 locale ones are taken care of just
1499 * above, or it leads to wrong results under ANYOF_INVERT */
1500 if (only_utf8_locale_invlist) {
31f05a37 1501 _invlist_union_maybe_complement_2nd(invlist,
1ee208c4 1502 only_utf8_locale_invlist,
31f05a37
KW
1503 ANYOF_FLAGS(node) & ANYOF_INVERT,
1504 &invlist);
1505 }
b423522f
KW
1506
1507 return invlist;
1508}
1509
1051e1c4 1510/* These two functions currently do the exact same thing */
557bd3fb 1511#define ssc_init_zero ssc_init
653099ff 1512
cdd87c1d
KW
1513#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1514#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1515
557bd3fb 1516/* 'AND' a given class with another one. Can create false positives. 'ssc'
93e92956
KW
1517 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1518 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1519
653099ff 1520STATIC void
b423522f 1521S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1522 const regnode_charclass *and_with)
653099ff 1523{
cdd87c1d
KW
1524 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1525 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1526
a0dd4231
KW
1527 SV* anded_cp_list;
1528 U8 anded_flags;
1e6ade67 1529
cdd87c1d 1530 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1531
71068078 1532 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1533
1534 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1535 * the code point inversion list and just the relevant flags */
71068078 1536 if (is_ANYOF_SYNTHETIC(and_with)) {
7dcac5f6 1537 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
a0dd4231 1538 anded_flags = ANYOF_FLAGS(and_with);
e9b08962
KW
1539
1540 /* XXX This is a kludge around what appears to be deficiencies in the
1541 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1542 * there are paths through the optimizer where it doesn't get weeded
1543 * out when it should. And if we don't make some extra provision for
1544 * it like the code just below, it doesn't get added when it should.
1545 * This solution is to add it only when AND'ing, which is here, and
1546 * only when what is being AND'ed is the pristine, original node
1547 * matching anything. Thus it is like adding it to ssc_anything() but
1548 * only when the result is to be AND'ed. Probably the same solution
1549 * could be adopted for the same problem we have with /l matching,
1550 * which is solved differently in S_ssc_init(), and that would lead to
1551 * fewer false positives than that solution has. But if this solution
1552 * creates bugs, the consequences are only that a warning isn't raised
1553 * that should be; while the consequences for having /l bugs is
1554 * incorrect matches */
7dcac5f6 1555 if (ssc_is_anything((regnode_ssc *)and_with)) {
f240c685 1556 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
e9b08962 1557 }
a0dd4231
KW
1558 }
1559 else {
5c0f85ef 1560 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
f240c685
KW
1561 if (OP(and_with) == ANYOFD) {
1562 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1563 }
1564 else {
1565 anded_flags = ANYOF_FLAGS(and_with)
1566 &( ANYOF_COMMON_FLAGS
108316fb
KW
1567 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1568 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
d1c40ef5
KW
1569 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
1570 anded_flags &=
1571 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1572 }
f240c685 1573 }
a0dd4231
KW
1574 }
1575
1576 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1577
1578 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1579 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1580 * 'and_with' may be inverted. When not inverted, we have the situation of
1581 * computing:
1582 * (C1 | P1) & (C2 | P2)
1583 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1584 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1585 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1586 * <= ((C1 & C2) | P1 | P2)
1587 * Alternatively, the last few steps could be:
1588 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1589 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1590 * <= (C1 | C2 | (P1 & P2))
1591 * We favor the second approach if either P1 or P2 is non-empty. This is
1592 * because these components are a barrier to doing optimizations, as what
1593 * they match cannot be known until the moment of matching as they are
1594 * dependent on the current locale, 'AND"ing them likely will reduce or
1595 * eliminate them.
1596 * But we can do better if we know that C1,P1 are in their initial state (a
1597 * frequent occurrence), each matching everything:
1598 * (<everything>) & (C2 | P2) = C2 | P2
1599 * Similarly, if C2,P2 are in their initial state (again a frequent
1600 * occurrence), the result is a no-op
1601 * (C1 | P1) & (<everything>) = C1 | P1
1602 *
1603 * Inverted, we have
1604 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1605 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1606 * <= (C1 & ~C2) | (P1 & ~P2)
1607 * */
1aa99e6b 1608
a0dd4231 1609 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
71068078 1610 && ! is_ANYOF_SYNTHETIC(and_with))
a0dd4231 1611 {
cdd87c1d 1612 unsigned int i;
8951c461 1613
cdd87c1d
KW
1614 ssc_intersection(ssc,
1615 anded_cp_list,
1616 FALSE /* Has already been inverted */
1617 );
c6b76537 1618
cdd87c1d
KW
1619 /* If either P1 or P2 is empty, the intersection will be also; can skip
1620 * the loop */
93e92956 1621 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
cdd87c1d
KW
1622 ANYOF_POSIXL_ZERO(ssc);
1623 }
e0e1be5f 1624 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1625
1626 /* Note that the Posix class component P from 'and_with' actually
1627 * looks like:
1628 * P = Pa | Pb | ... | Pn
1629 * where each component is one posix class, such as in [\w\s].
1630 * Thus
1631 * ~P = ~(Pa | Pb | ... | Pn)
1632 * = ~Pa & ~Pb & ... & ~Pn
1633 * <= ~Pa | ~Pb | ... | ~Pn
1634 * The last is something we can easily calculate, but unfortunately
1635 * is likely to have many false positives. We could do better
1636 * in some (but certainly not all) instances if two classes in
1637 * P have known relationships. For example
1638 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1639 * So
1640 * :lower: & :print: = :lower:
1641 * And similarly for classes that must be disjoint. For example,
1642 * since \s and \w can have no elements in common based on rules in
1643 * the POSIX standard,
1644 * \w & ^\S = nothing
1645 * Unfortunately, some vendor locales do not meet the Posix
1646 * standard, in particular almost everything by Microsoft.
1647 * The loop below just changes e.g., \w into \W and vice versa */
1648
1ee208c4 1649 regnode_charclass_posixl temp;
cdd87c1d
KW
1650 int add = 1; /* To calculate the index of the complement */
1651
1652 ANYOF_POSIXL_ZERO(&temp);
1653 for (i = 0; i < ANYOF_MAX; i++) {
1654 assert(i % 2 != 0
7dcac5f6
KW
1655 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1656 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
cdd87c1d 1657
7dcac5f6 1658 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
cdd87c1d
KW
1659 ANYOF_POSIXL_SET(&temp, i + add);
1660 }
1661 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1662 }
1663 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1664
cdd87c1d
KW
1665 } /* else ssc already has no posixes */
1666 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1667 in its initial state */
71068078 1668 else if (! is_ANYOF_SYNTHETIC(and_with)
7dcac5f6 1669 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
cdd87c1d
KW
1670 {
1671 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1672 * copy it over 'ssc' */
1673 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
71068078 1674 if (is_ANYOF_SYNTHETIC(and_with)) {
cdd87c1d
KW
1675 StructCopy(and_with, ssc, regnode_ssc);
1676 }
1677 else {
1678 ssc->invlist = anded_cp_list;
1679 ANYOF_POSIXL_ZERO(ssc);
93e92956 1680 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1681 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
cdd87c1d
KW
1682 }
1683 }
1684 }
e0e1be5f 1685 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
93e92956 1686 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
cdd87c1d
KW
1687 {
1688 /* One or the other of P1, P2 is non-empty. */
93e92956 1689 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1ea8b7fe
KW
1690 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1691 }
cdd87c1d
KW
1692 ssc_union(ssc, anded_cp_list, FALSE);
1693 }
1694 else { /* P1 = P2 = empty */
1695 ssc_intersection(ssc, anded_cp_list, FALSE);
1696 }
137165a6 1697 }
653099ff
GS
1698}
1699
653099ff 1700STATIC void
cdd87c1d 1701S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1702 const regnode_charclass *or_with)
653099ff 1703{
cdd87c1d
KW
1704 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1705 * another SSC or a regular ANYOF class. Can create false positives if
1706 * 'or_with' is to be inverted. */
7918f24d 1707
a0dd4231
KW
1708 SV* ored_cp_list;
1709 U8 ored_flags;
c6b76537 1710
cdd87c1d 1711 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 1712
71068078 1713 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1714
1715 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1716 * the code point inversion list and just the relevant flags */
71068078 1717 if (is_ANYOF_SYNTHETIC(or_with)) {
7dcac5f6 1718 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
a0dd4231
KW
1719 ored_flags = ANYOF_FLAGS(or_with);
1720 }
1721 else {
5c0f85ef 1722 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
eff8b7dc 1723 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
f240c685
KW
1724 if (OP(or_with) != ANYOFD) {
1725 ored_flags
1726 |= ANYOF_FLAGS(or_with)
108316fb
KW
1727 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1728 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
d1c40ef5
KW
1729 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
1730 ored_flags |=
1731 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1732 }
f240c685 1733 }
a0dd4231
KW
1734 }
1735
1736 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
1737
1738 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1739 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1740 * 'or_with' may be inverted. When not inverted, we have the simple
1741 * situation of computing:
1742 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1743 * If P1|P2 yields a situation with both a class and its complement are
1744 * set, like having both \w and \W, this matches all code points, and we
1745 * can delete these from the P component of the ssc going forward. XXX We
1746 * might be able to delete all the P components, but I (khw) am not certain
1747 * about this, and it is better to be safe.
1748 *
1749 * Inverted, we have
1750 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1751 * <= (C1 | P1) | ~C2
1752 * <= (C1 | ~C2) | P1
1753 * (which results in actually simpler code than the non-inverted case)
1754 * */
9826f543 1755
a0dd4231 1756 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
71068078 1757 && ! is_ANYOF_SYNTHETIC(or_with))
a0dd4231 1758 {
cdd87c1d 1759 /* We ignore P2, leaving P1 going forward */
1ea8b7fe 1760 } /* else Not inverted */
93e92956 1761 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1762 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
e0e1be5f 1763 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1764 unsigned int i;
1765 for (i = 0; i < ANYOF_MAX; i += 2) {
1766 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1767 {
1768 ssc_match_all_cp(ssc);
1769 ANYOF_POSIXL_CLEAR(ssc, i);
1770 ANYOF_POSIXL_CLEAR(ssc, i+1);
cdd87c1d
KW
1771 }
1772 }
1773 }
1aa99e6b 1774 }
cdd87c1d
KW
1775
1776 ssc_union(ssc,
1777 ored_cp_list,
1778 FALSE /* Already has been inverted */
1779 );
653099ff
GS
1780}
1781
b423522f
KW
1782PERL_STATIC_INLINE void
1783S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1784{
1785 PERL_ARGS_ASSERT_SSC_UNION;
1786
71068078 1787 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1788
1789 _invlist_union_maybe_complement_2nd(ssc->invlist,
1790 invlist,
1791 invert2nd,
1792 &ssc->invlist);
1793}
1794
1795PERL_STATIC_INLINE void
1796S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1797 SV* const invlist,
1798 const bool invert2nd)
1799{
1800 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1801
71068078 1802 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1803
1804 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1805 invlist,
1806 invert2nd,
1807 &ssc->invlist);
1808}
1809
1810PERL_STATIC_INLINE void
1811S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1812{
1813 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1814
71068078 1815 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1816
1817 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1818}
1819
1820PERL_STATIC_INLINE void
1821S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1822{
1823 /* AND just the single code point 'cp' into the SSC 'ssc' */
1824
1825 SV* cp_list = _new_invlist(2);
1826
1827 PERL_ARGS_ASSERT_SSC_CP_AND;
1828
71068078 1829 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1830
1831 cp_list = add_cp_to_invlist(cp_list, cp);
1832 ssc_intersection(ssc, cp_list,
1833 FALSE /* Not inverted */
1834 );
1835 SvREFCNT_dec_NN(cp_list);
1836}
1837
1838PERL_STATIC_INLINE void
dc3bf405 1839S_ssc_clear_locale(regnode_ssc *ssc)
b423522f
KW
1840{
1841 /* Set the SSC 'ssc' to not match any locale things */
b423522f
KW
1842 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1843
71068078 1844 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1845
1846 ANYOF_POSIXL_ZERO(ssc);
1847 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1848}
1849
b35552de
KW
1850#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1851
1852STATIC bool
1853S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1854{
1855 /* The synthetic start class is used to hopefully quickly winnow down
1856 * places where a pattern could start a match in the target string. If it
1857 * doesn't really narrow things down that much, there isn't much point to
1858 * having the overhead of using it. This function uses some very crude
1859 * heuristics to decide if to use the ssc or not.
1860 *
1861 * It returns TRUE if 'ssc' rules out more than half what it considers to
1862 * be the "likely" possible matches, but of course it doesn't know what the
1863 * actual things being matched are going to be; these are only guesses
1864 *
1865 * For /l matches, it assumes that the only likely matches are going to be
1866 * in the 0-255 range, uniformly distributed, so half of that is 127
1867 * For /a and /d matches, it assumes that the likely matches will be just
1868 * the ASCII range, so half of that is 63
1869 * For /u and there isn't anything matching above the Latin1 range, it
1870 * assumes that that is the only range likely to be matched, and uses
1871 * half that as the cut-off: 127. If anything matches above Latin1,
1872 * it assumes that all of Unicode could match (uniformly), except for
1873 * non-Unicode code points and things in the General Category "Other"
1874 * (unassigned, private use, surrogates, controls and formats). This
1875 * is a much large number. */
1876
b35552de
KW
1877 U32 count = 0; /* Running total of number of code points matched by
1878 'ssc' */
1879 UV start, end; /* Start and end points of current range in inversion
1880 list */
72400949
KW
1881 const U32 max_code_points = (LOC)
1882 ? 256
1883 : (( ! UNI_SEMANTICS
1884 || invlist_highest(ssc->invlist) < 256)
1885 ? 128
1886 : NON_OTHER_COUNT);
1887 const U32 max_match = max_code_points / 2;
b35552de
KW
1888
1889 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1890
1891 invlist_iterinit(ssc->invlist);
1892 while (invlist_iternext(ssc->invlist, &start, &end)) {
72400949
KW
1893 if (start >= max_code_points) {
1894 break;
b35552de 1895 }
72400949 1896 end = MIN(end, max_code_points - 1);
b35552de 1897 count += end - start + 1;
72400949 1898 if (count >= max_match) {
b35552de
KW
1899 invlist_iterfinish(ssc->invlist);
1900 return FALSE;
1901 }
1902 }
1903
1904 return TRUE;
1905}
1906
1907
b423522f
KW
1908STATIC void
1909S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1910{
1911 /* The inversion list in the SSC is marked mortal; now we need a more
1912 * permanent copy, which is stored the same way that is done in a regular
dcb20b36
KW
1913 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1914 * map */
b423522f
KW
1915
1916 SV* invlist = invlist_clone(ssc->invlist);
1917
1918 PERL_ARGS_ASSERT_SSC_FINALIZE;
1919
71068078 1920 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f 1921
a0dd4231 1922 /* The code in this file assumes that all but these flags aren't relevant
93e92956
KW
1923 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1924 * by the time we reach here */
f240c685
KW
1925 assert(! (ANYOF_FLAGS(ssc)
1926 & ~( ANYOF_COMMON_FLAGS
108316fb
KW
1927 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1928 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
a0dd4231 1929
b423522f
KW
1930 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1931
1ee208c4
KW
1932 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1933 NULL, NULL, NULL, FALSE);
b423522f 1934
85c8e306
KW
1935 /* Make sure is clone-safe */
1936 ssc->invlist = NULL;
1937
e0e1be5f 1938 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
93e92956 1939 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
e0e1be5f 1940 }
1462525b 1941
b2e90ddf
KW
1942 if (RExC_contains_locale) {
1943 OP(ssc) = ANYOFL;
1944 }
1945
1462525b 1946 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
b423522f
KW
1947}
1948
a3621e74
YO
1949#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1950#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1951#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
538e84ed
KW
1952#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1953 ? (TRIE_LIST_CUR( idx ) - 1) \
1954 : 0 )
a3621e74 1955
3dab1dad
YO
1956
1957#ifdef DEBUGGING
07be1b83 1958/*
2b8b4781
NC
1959 dump_trie(trie,widecharmap,revcharmap)
1960 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1961 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
1962
1963 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
1964 The _interim_ variants are used for debugging the interim
1965 tables that are used to generate the final compressed
1966 representation which is what dump_trie expects.
1967
486ec47a 1968 Part of the reason for their existence is to provide a form
3dab1dad 1969 of documentation as to how the different representations function.
07be1b83
YO
1970
1971*/
3dab1dad
YO
1972
1973/*
3dab1dad
YO
1974 Dumps the final compressed table form of the trie to Perl_debug_log.
1975 Used for debugging make_trie().
1976*/
b9a59e08 1977
3dab1dad 1978STATIC void
2b8b4781
NC
1979S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1980 AV *revcharmap, U32 depth)
3dab1dad
YO
1981{
1982 U32 state;
ab3bbdeb 1983 SV *sv=sv_newmortal();
55eed653 1984 int colwidth= widecharmap ? 6 : 4;
2e64971a 1985 U16 word;
3dab1dad
YO
1986 GET_RE_DEBUG_FLAGS_DECL;
1987
7918f24d 1988 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 1989
6ad9a8ab 1990 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
1e37780e 1991 depth+1, "Match","Base","Ofs" );
3dab1dad
YO
1992
1993 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 1994 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 1995 if ( tmp ) {
6ad9a8ab 1996 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 1997 colwidth,
538e84ed 1998 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1999 PL_colors[0], PL_colors[1],
2000 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2001 PERL_PV_ESCAPE_FIRSTCHAR
2002 )
ab3bbdeb 2003 );
3dab1dad
YO
2004 }
2005 }
1e37780e
YO
2006 Perl_re_printf( aTHX_ "\n");
2007 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
3dab1dad
YO
2008
2009 for( state = 0 ; state < trie->uniquecharcount ; state++ )
6ad9a8ab
YO
2010 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2011 Perl_re_printf( aTHX_ "\n");
3dab1dad 2012
1e2e3d02 2013 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 2014 const U32 base = trie->states[ state ].trans.base;
3dab1dad 2015
147e3846 2016 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
3dab1dad
YO
2017
2018 if ( trie->states[ state ].wordnum ) {
1e37780e 2019 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
3dab1dad 2020 } else {
6ad9a8ab 2021 Perl_re_printf( aTHX_ "%6s", "" );
3dab1dad
YO
2022 }
2023
147e3846 2024 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
3dab1dad
YO
2025
2026 if ( base ) {
2027 U32 ofs = 0;
2028
2029 while( ( base + ofs < trie->uniquecharcount ) ||
2030 ( base + ofs - trie->uniquecharcount < trie->lasttrans
538e84ed
KW
2031 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2032 != state))
3dab1dad
YO
2033 ofs++;
2034
147e3846 2035 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
3dab1dad
YO
2036
2037 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
538e84ed
KW
2038 if ( ( base + ofs >= trie->uniquecharcount )
2039 && ( base + ofs - trie->uniquecharcount
2040 < trie->lasttrans )
2041 && trie->trans[ base + ofs
2042 - trie->uniquecharcount ].check == state )
3dab1dad 2043 {
147e3846 2044 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
1e37780e
YO
2045 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2046 );
3dab1dad 2047 } else {
6ad9a8ab 2048 Perl_re_printf( aTHX_ "%*s",colwidth," ." );
3dab1dad
YO
2049 }
2050 }
2051
6ad9a8ab 2052 Perl_re_printf( aTHX_ "]");
3dab1dad
YO
2053
2054 }
6ad9a8ab 2055 Perl_re_printf( aTHX_ "\n" );
3dab1dad 2056 }
6ad9a8ab 2057 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
cb41e5d6 2058 depth);
2e64971a 2059 for (word=1; word <= trie->wordcount; word++) {
6ad9a8ab 2060 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2e64971a
DM
2061 (int)word, (int)(trie->wordinfo[word].prev),
2062 (int)(trie->wordinfo[word].len));
2063 }
6ad9a8ab 2064 Perl_re_printf( aTHX_ "\n" );
538e84ed 2065}
3dab1dad 2066/*
3dab1dad 2067 Dumps a fully constructed but uncompressed trie in list form.
538e84ed 2068 List tries normally only are used for construction when the number of
3dab1dad
YO
2069 possible chars (trie->uniquecharcount) is very high.
2070 Used for debugging make_trie().
2071*/
2072STATIC void
55eed653 2073S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2074 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2075 U32 depth)
3dab1dad
YO
2076{
2077 U32 state;
ab3bbdeb 2078 SV *sv=sv_newmortal();
55eed653 2079 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2080 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2081
2082 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2083
3dab1dad 2084 /* print out the table precompression. */
6ad9a8ab 2085 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
cb41e5d6 2086 depth+1 );
6ad9a8ab 2087 Perl_re_indentf( aTHX_ "%s",
cb41e5d6 2088 depth+1, "------:-----+-----------------\n" );
538e84ed 2089
3dab1dad
YO
2090 for( state=1 ; state < next_alloc ; state ++ ) {
2091 U16 charid;
538e84ed 2092
147e3846 2093 Perl_re_indentf( aTHX_ " %4" UVXf " :",
cb41e5d6 2094 depth+1, (UV)state );
3dab1dad 2095 if ( ! trie->states[ state ].wordnum ) {
6ad9a8ab 2096 Perl_re_printf( aTHX_ "%5s| ","");
3dab1dad 2097 } else {
6ad9a8ab 2098 Perl_re_printf( aTHX_ "W%4x| ",
3dab1dad
YO
2099 trie->states[ state ].wordnum
2100 );
2101 }
2102 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
538e84ed
KW
2103 SV ** const tmp = av_fetch( revcharmap,
2104 TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb 2105 if ( tmp ) {
147e3846 2106 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
ab3bbdeb 2107 colwidth,
538e84ed
KW
2108 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2109 colwidth,
2110 PL_colors[0], PL_colors[1],
2111 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2112 | PERL_PV_ESCAPE_FIRSTCHAR
ab3bbdeb 2113 ) ,
1e2e3d02
YO
2114 TRIE_LIST_ITEM(state,charid).forid,
2115 (UV)TRIE_LIST_ITEM(state,charid).newstate
2116 );
538e84ed 2117 if (!(charid % 10))
6ad9a8ab 2118 Perl_re_printf( aTHX_ "\n%*s| ",
664e119d 2119 (int)((depth * 2) + 14), "");
1e2e3d02 2120 }
ab3bbdeb 2121 }
6ad9a8ab 2122 Perl_re_printf( aTHX_ "\n");
3dab1dad 2123 }
538e84ed 2124}
3dab1dad
YO
2125
2126/*
3dab1dad 2127 Dumps a fully constructed but uncompressed trie in table form.
538e84ed
KW
2128 This is the normal DFA style state transition table, with a few
2129 twists to facilitate compression later.
3dab1dad
YO
2130 Used for debugging make_trie().
2131*/
2132STATIC void
55eed653 2133S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2134 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2135 U32 depth)
3dab1dad
YO
2136{
2137 U32 state;
2138 U16 charid;
ab3bbdeb 2139 SV *sv=sv_newmortal();
55eed653 2140 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2141 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2142
2143 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
538e84ed 2144
3dab1dad
YO
2145 /*
2146 print out the table precompression so that we can do a visual check
2147 that they are identical.
2148 */
538e84ed 2149
6ad9a8ab 2150 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
3dab1dad
YO
2151
2152 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 2153 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 2154 if ( tmp ) {
6ad9a8ab 2155 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2156 colwidth,
538e84ed 2157 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2158 PL_colors[0], PL_colors[1],
2159 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2160 PERL_PV_ESCAPE_FIRSTCHAR
2161 )
ab3bbdeb 2162 );
3dab1dad
YO
2163 }
2164 }
2165
4aaafc03
YO
2166 Perl_re_printf( aTHX_ "\n");
2167 Perl_re_indentf( aTHX_ "State+-", depth+1 );
3dab1dad
YO
2168
2169 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
6ad9a8ab 2170 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
3dab1dad
YO
2171 }
2172
6ad9a8ab 2173 Perl_re_printf( aTHX_ "\n" );
3dab1dad
YO
2174
2175 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2176
147e3846 2177 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
cb41e5d6 2178 depth+1,
3dab1dad
YO
2179 (UV)TRIE_NODENUM( state ) );
2180
2181 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
2182 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2183 if (v)
147e3846 2184 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
ab3bbdeb 2185 else
6ad9a8ab 2186 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
3dab1dad
YO
2187 }
2188 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
147e3846 2189 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
538e84ed 2190 (UV)trie->trans[ state ].check );
3dab1dad 2191 } else {
147e3846 2192 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
538e84ed 2193 (UV)trie->trans[ state ].check,
3dab1dad
YO
2194 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2195 }
2196 }
07be1b83 2197}
3dab1dad
YO
2198
2199#endif
2200
2e64971a 2201
786e8c11
YO
2202/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2203 startbranch: the first branch in the whole branch sequence
2204 first : start branch of sequence of branch-exact nodes.
2205 May be the same as startbranch
2206 last : Thing following the last branch.
2207 May be the same as tail.
2208 tail : item following the branch sequence
2209 count : words in the sequence
a4525e78 2210 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
786e8c11 2211 depth : indent depth
3dab1dad 2212
786e8c11 2213Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 2214
786e8c11
YO
2215A trie is an N'ary tree where the branches are determined by digital
2216decomposition of the key. IE, at the root node you look up the 1st character and
2217follow that branch repeat until you find the end of the branches. Nodes can be
2218marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 2219
786e8c11 2220 /he|she|his|hers/
72f13be8 2221
786e8c11
YO
2222would convert into the following structure. Numbers represent states, letters
2223following numbers represent valid transitions on the letter from that state, if
2224the number is in square brackets it represents an accepting state, otherwise it
2225will be in parenthesis.
07be1b83 2226
786e8c11
YO
2227 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2228 | |
2229 | (2)
2230 | |
2231 (1) +-i->(6)-+-s->[7]
2232 |
2233 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 2234
786e8c11
YO
2235 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2236
2237This shows that when matching against the string 'hers' we will begin at state 1
2238read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2239then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2240is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2241single traverse. We store a mapping from accepting to state to which word was
2242matched, and then when we have multiple possibilities we try to complete the
b8fda935 2243rest of the regex in the order in which they occurred in the alternation.
786e8c11
YO
2244
2245The only prior NFA like behaviour that would be changed by the TRIE support is
2246the silent ignoring of duplicate alternations which are of the form:
2247
2248 / (DUPE|DUPE) X? (?{ ... }) Y /x
2249
4b714af6 2250Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 2251and without the optimisation. With the optimisations dupes will be silently
486ec47a 2252ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
2253the following demonstrates:
2254
2255 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2256
2257which prints out 'word' three times, but
2258
2259 'words'=~/(word|word|word)(?{ print $1 })S/
2260
2261which doesnt print it out at all. This is due to other optimisations kicking in.
2262
2263Example of what happens on a structural level:
2264
486ec47a 2265The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
2266
2267 1: CURLYM[1] {1,32767}(18)
2268 5: BRANCH(8)
2269 6: EXACT <ac>(16)
2270 8: BRANCH(11)
2271 9: EXACT <ad>(16)
2272 11: BRANCH(14)
2273 12: EXACT <ab>(16)
2274 16: SUCCEED(0)
2275 17: NOTHING(18)
2276 18: END(0)
2277
2278This would be optimizable with startbranch=5, first=5, last=16, tail=16
2279and should turn into:
2280
2281 1: CURLYM[1] {1,32767}(18)
2282 5: TRIE(16)
2283 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2284 <ac>
2285 <ad>
2286 <ab>
2287 16: SUCCEED(0)
2288 17: NOTHING(18)
2289 18: END(0)
2290
2291Cases where tail != last would be like /(?foo|bar)baz/:
2292
2293 1: BRANCH(4)
2294 2: EXACT <foo>(8)
2295 4: BRANCH(7)
2296 5: EXACT <bar>(8)
2297 7: TAIL(8)
2298 8: EXACT <baz>(10)
2299 10: END(0)
2300
2301which would be optimizable with startbranch=1, first=1, last=7, tail=8
2302and would end up looking like:
2303
2304 1: TRIE(8)
2305 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2306 <foo>
2307 <bar>
2308 7: TAIL(8)
2309 8: EXACT <baz>(10)
2310 10: END(0)
2311
c80e42f3 2312 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
2313
2314is the recommended Unicode-aware way of saying
2315
2316 *(d++) = uv;
2317*/
2318
fab2782b 2319#define TRIE_STORE_REVCHAR(val) \
786e8c11 2320 STMT_START { \
73031816 2321 if (UTF) { \
668fcfea 2322 SV *zlopp = newSV(UTF8_MAXBYTES); \
88c9ea1e 2323 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
c80e42f3 2324 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
73031816
NC
2325 SvCUR_set(zlopp, kapow - flrbbbbb); \
2326 SvPOK_on(zlopp); \
2327 SvUTF8_on(zlopp); \
2328 av_push(revcharmap, zlopp); \
2329 } else { \
fab2782b 2330 char ooooff = (char)val; \
73031816
NC
2331 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2332 } \
2333 } STMT_END
786e8c11 2334
914a25d5
KW
2335/* This gets the next character from the input, folding it if not already
2336 * folded. */
2337#define TRIE_READ_CHAR STMT_START { \
2338 wordlen++; \
2339 if ( UTF ) { \
2340 /* if it is UTF then it is either already folded, or does not need \
2341 * folding */ \
1c1d615a 2342 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
2343 } \
2344 else if (folder == PL_fold_latin1) { \
7d006b13
KW
2345 /* This folder implies Unicode rules, which in the range expressible \
2346 * by not UTF is the lower case, with the two exceptions, one of \
2347 * which should have been taken care of before calling this */ \
2348 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2349 uvc = toLOWER_L1(*uc); \
2350 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2351 len = 1; \
914a25d5
KW
2352 } else { \
2353 /* raw data, will be folded later if needed */ \
2354 uvc = (U32)*uc; \
2355 len = 1; \
2356 } \
786e8c11
YO
2357} STMT_END
2358
2359
2360
2361#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2362 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
2363 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
2364 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
2365 } \
2366 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2367 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2368 TRIE_LIST_CUR( state )++; \
2369} STMT_END
07be1b83 2370
786e8c11
YO
2371#define TRIE_LIST_NEW(state) STMT_START { \
2372 Newxz( trie->states[ state ].trans.list, \
2373 4, reg_trie_trans_le ); \
2374 TRIE_LIST_CUR( state ) = 1; \
2375 TRIE_LIST_LEN( state ) = 4; \
2376} STMT_END
07be1b83 2377
786e8c11
YO
2378#define TRIE_HANDLE_WORD(state) STMT_START { \
2379 U16 dupe= trie->states[ state ].wordnum; \
2380 regnode * const noper_next = regnext( noper ); \
2381 \
786e8c11
YO
2382 DEBUG_r({ \
2383 /* store the word for dumping */ \
2384 SV* tmp; \
2385 if (OP(noper) != NOTHING) \
740cce10 2386 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 2387 else \
740cce10 2388 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 2389 av_push( trie_words, tmp ); \
786e8c11
YO
2390 }); \
2391 \
2392 curword++; \
2e64971a
DM
2393 trie->wordinfo[curword].prev = 0; \
2394 trie->wordinfo[curword].len = wordlen; \
2395 trie->wordinfo[curword].accept = state; \
786e8c11
YO
2396 \
2397 if ( noper_next < tail ) { \
2398 if (!trie->jump) \
538e84ed
KW
2399 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2400 sizeof(U16) ); \
7f69552c 2401 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
2402 if (!jumper) \
2403 jumper = noper_next; \
2404 if (!nextbranch) \
2405 nextbranch= regnext(cur); \
2406 } \
2407 \
2408 if ( dupe ) { \
2e64971a
DM
2409 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2410 /* chain, so that when the bits of chain are later */\
2411 /* linked together, the dups appear in the chain */\
2412 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2413 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
2414 } else { \
2415 /* we haven't inserted this word yet. */ \
2416 trie->states[ state ].wordnum = curword; \
2417 } \
2418} STMT_END
07be1b83 2419
3dab1dad 2420
786e8c11
YO
2421#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2422 ( ( base + charid >= ucharcount \
2423 && base + charid < ubound \
2424 && state == trie->trans[ base - ucharcount + charid ].check \
2425 && trie->trans[ base - ucharcount + charid ].next ) \
2426 ? trie->trans[ base - ucharcount + charid ].next \
2427 : ( state==1 ? special : 0 ) \
2428 )
3dab1dad 2429
8bcafbf4
YO
2430#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2431STMT_START { \
2432 TRIE_BITMAP_SET(trie, uvc); \
2433 /* store the folded codepoint */ \
2434 if ( folder ) \
2435 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2436 \
2437 if ( !UTF ) { \
2438 /* store first byte of utf8 representation of */ \
2439 /* variant codepoints */ \
2440 if (! UVCHR_IS_INVARIANT(uvc)) { \
2441 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2442 } \
2443 } \
2444} STMT_END
786e8c11
YO
2445#define MADE_TRIE 1
2446#define MADE_JUMP_TRIE 2
2447#define MADE_EXACT_TRIE 4
3dab1dad 2448
a3621e74 2449STATIC I32
538e84ed
KW
2450S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2451 regnode *first, regnode *last, regnode *tail,
2452 U32 word_count, U32 flags, U32 depth)
a3621e74
YO
2453{
2454 /* first pass, loop through and scan words */
2455 reg_trie_data *trie;
55eed653 2456 HV *widecharmap = NULL;
2b8b4781 2457 AV *revcharmap = newAV();
a3621e74 2458 regnode *cur;
a3621e74
YO
2459 STRLEN len = 0;
2460 UV uvc = 0;
2461 U16 curword = 0;
2462 U32 next_alloc = 0;
786e8c11
YO
2463 regnode *jumper = NULL;
2464 regnode *nextbranch = NULL;
7f69552c 2465 regnode *convert = NULL;
2e64971a 2466 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 2467 /* we just use folder as a flag in utf8 */
1e696034 2468 const U8 * folder = NULL;
a3621e74 2469
2b8b4781 2470#ifdef DEBUGGING
cf78de0b 2471 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2b8b4781
NC
2472 AV *trie_words = NULL;
2473 /* along with revcharmap, this only used during construction but both are
2474 * useful during debugging so we store them in the struct when debugging.
8e11feef 2475 */
2b8b4781 2476#else
cf78de0b 2477 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 2478 STRLEN trie_charcount=0;
3dab1dad 2479#endif
2b8b4781 2480 SV *re_trie_maxbuff;
a3621e74 2481 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2482
2483 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
2484#ifndef DEBUGGING
2485 PERL_UNUSED_ARG(depth);
2486#endif
a3621e74 2487
1e696034 2488 switch (flags) {
a4525e78 2489 case EXACT: case EXACTL: break;
2f7f8cb1 2490 case EXACTFA:
fab2782b 2491 case EXACTFU_SS:
a4525e78
KW
2492 case EXACTFU:
2493 case EXACTFLU8: folder = PL_fold_latin1; break;
1e696034 2494 case EXACTF: folder = PL_fold; break;
fab2782b 2495 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
2496 }
2497
c944940b 2498 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 2499 trie->refcount = 1;
3dab1dad 2500 trie->startstate = 1;
786e8c11 2501 trie->wordcount = word_count;
f8fc2ecf 2502 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 2503 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
a4525e78 2504 if (flags == EXACT || flags == EXACTL)
c944940b 2505 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
2506 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2507 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2508
a3621e74 2509 DEBUG_r({
2b8b4781 2510 trie_words = newAV();
a3621e74 2511 });
a3621e74 2512
0111c4fd 2513 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
316ebaf2 2514 assert(re_trie_maxbuff);
a3621e74 2515 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 2516 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 2517 }
df826430 2518 DEBUG_TRIE_COMPILE_r({
6ad9a8ab 2519 Perl_re_indentf( aTHX_
cb41e5d6
YO
2520 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2521 depth+1,
538e84ed
KW
2522 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2523 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
3dab1dad 2524 });
538e84ed 2525
7f69552c
YO
2526 /* Find the node we are going to overwrite */
2527 if ( first == startbranch && OP( last ) != BRANCH ) {
2528 /* whole branch chain */
2529 convert = first;
2530 } else {
2531 /* branch sub-chain */
2532 convert = NEXTOPER( first );
2533 }
538e84ed 2534
a3621e74
YO
2535 /* -- First loop and Setup --
2536
2537 We first traverse the branches and scan each word to determine if it
2538 contains widechars, and how many unique chars there are, this is
2539 important as we have to build a table with at least as many columns as we
2540 have unique chars.
2541
2542 We use an array of integers to represent the character codes 0..255
538e84ed
KW
2543 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2544 the native representation of the character value as the key and IV's for
2545 the coded index.
a3621e74
YO
2546
2547 *TODO* If we keep track of how many times each character is used we can
2548 remap the columns so that the table compression later on is more
3b753521 2549 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
2550 middle and the least common are on the outside. IMO this would be better
2551 than a most to least common mapping as theres a decent chance the most
2552 common letter will share a node with the least common, meaning the node
486ec47a 2553 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
2554 case is when we have the least common nodes twice.
2555
2556 */
2557
a3621e74 2558 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 2559 regnode *noper = NEXTOPER( cur );
944e05e3
YO
2560 const U8 *uc;
2561 const U8 *e;
bc031a7d 2562 int foldlen = 0;
07be1b83 2563 U32 wordlen = 0; /* required init */
bc031a7d
KW
2564 STRLEN minchars = 0;
2565 STRLEN maxchars = 0;
538e84ed
KW
2566 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2567 bitmap?*/
a3621e74 2568
3dab1dad 2569 if (OP(noper) == NOTHING) {
20ed8c88
YO
2570 /* skip past a NOTHING at the start of an alternation
2571 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2572 */
df826430 2573 regnode *noper_next= regnext(noper);
944e05e3
YO
2574 if (noper_next < tail)
2575 noper= noper_next;
2576 }
2577
dca5fc4c
YO
2578 if ( noper < tail &&
2579 (
2580 OP(noper) == flags ||
2581 (
2582 flags == EXACTFU &&
2583 OP(noper) == EXACTFU_SS
2584 )
2585 )
2586 ) {
944e05e3
YO
2587 uc= (U8*)STRING(noper);
2588 e= uc + STR_LEN(noper);
2589 } else {
2590 trie->minlen= 0;
2591 continue;
3dab1dad 2592 }
df826430 2593
944e05e3 2594
fab2782b 2595 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2596 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2597 regardless of encoding */
fab2782b
YO
2598 if (OP( noper ) == EXACTFU_SS) {
2599 /* false positives are ok, so just set this */
0dc4a61d 2600 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2601 }
2602 }
dca5fc4c 2603
bc031a7d
KW
2604 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2605 branch */
3dab1dad 2606 TRIE_CHARCOUNT(trie)++;
a3621e74 2607 TRIE_READ_CHAR;
645de4ce 2608
bc031a7d
KW
2609 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2610 * is in effect. Under /i, this character can match itself, or
2611 * anything that folds to it. If not under /i, it can match just
2612 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2613 * all fold to k, and all are single characters. But some folds
2614 * expand to more than one character, so for example LATIN SMALL
2615 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2616 * the string beginning at 'uc' is 'ffi', it could be matched by
2617 * three characters, or just by the one ligature character. (It
2618 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2619 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2620 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2621 * match.) The trie needs to know the minimum and maximum number
2622 * of characters that could match so that it can use size alone to
2623 * quickly reject many match attempts. The max is simple: it is
2624 * the number of folded characters in this branch (since a fold is
2625 * never shorter than what folds to it. */
2626
2627 maxchars++;
2628
2629 /* And the min is equal to the max if not under /i (indicated by
2630 * 'folder' being NULL), or there are no multi-character folds. If
2631 * there is a multi-character fold, the min is incremented just
2632 * once, for the character that folds to the sequence. Each
2633 * character in the sequence needs to be added to the list below of
2634 * characters in the trie, but we count only the first towards the
2635 * min number of characters needed. This is done through the
2636 * variable 'foldlen', which is returned by the macros that look
2637 * for these sequences as the number of bytes the sequence
2638 * occupies. Each time through the loop, we decrement 'foldlen' by
2639 * how many bytes the current char occupies. Only when it reaches
2640 * 0 do we increment 'minchars' or look for another multi-character
2641 * sequence. */
2642 if (folder == NULL) {
2643 minchars++;
2644 }
2645 else if (foldlen > 0) {
2646 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
645de4ce
KW
2647 }
2648 else {
bc031a7d
KW
2649 minchars++;
2650
2651 /* See if *uc is the beginning of a multi-character fold. If
2652 * so, we decrement the length remaining to look at, to account
2653 * for the current character this iteration. (We can use 'uc'
2654 * instead of the fold returned by TRIE_READ_CHAR because for
2655 * non-UTF, the latin1_safe macro is smart enough to account
2656 * for all the unfolded characters, and because for UTF, the
2657 * string will already have been folded earlier in the
2658 * compilation process */
2659 if (UTF) {
2660 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2661 foldlen -= UTF8SKIP(uc);
645de4ce
KW
2662 }
2663 }
bc031a7d
KW
2664 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2665 foldlen--;
2666 }
645de4ce 2667 }
bc031a7d
KW
2668
2669 /* The current character (and any potential folds) should be added
2670 * to the possible matching characters for this position in this
2671 * branch */
a3621e74 2672 if ( uvc < 256 ) {
fab2782b
YO
2673 if ( folder ) {
2674 U8 folded= folder[ (U8) uvc ];
2675 if ( !trie->charmap[ folded ] ) {
2676 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2677 TRIE_STORE_REVCHAR( folded );
2678 }
2679 }
a3621e74
YO
2680 if ( !trie->charmap[ uvc ] ) {
2681 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2682 TRIE_STORE_REVCHAR( uvc );
a3621e74 2683 }
02daf0ab 2684 if ( set_bit ) {
62012aee
KW
2685 /* store the codepoint in the bitmap, and its folded
2686 * equivalent. */
8bcafbf4 2687 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
02daf0ab
YO
2688 set_bit = 0; /* We've done our bit :-) */
2689 }
a3621e74 2690 } else {
bc031a7d
KW
2691
2692 /* XXX We could come up with the list of code points that fold
2693 * to this using PL_utf8_foldclosures, except not for
2694 * multi-char folds, as there may be multiple combinations
2695 * there that could work, which needs to wait until runtime to
2696 * resolve (The comment about LIGATURE FFI above is such an
2697 * example */
2698
a3621e74 2699 SV** svpp;
55eed653
NC
2700 if ( !widecharmap )
2701 widecharmap = newHV();
a3621e74 2702
55eed653 2703 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
2704
2705 if ( !svpp )
147e3846 2706 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
a3621e74
YO
2707
2708 if ( !SvTRUE( *svpp ) ) {
2709 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 2710 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
2711 }
2712 }
bc031a7d
KW
2713 } /* end loop through characters in this branch of the trie */
2714
2715 /* We take the min and max for this branch and combine to find the min
2716 * and max for all branches processed so far */
3dab1dad 2717 if( cur == first ) {
bc031a7d
KW
2718 trie->minlen = minchars;
2719 trie->maxlen = maxchars;
2720 } else if (minchars < trie->minlen) {
2721 trie->minlen = minchars;
2722 } else if (maxchars > trie->maxlen) {
2723 trie->maxlen = maxchars;
fab2782b 2724 }
a3621e74
YO
2725 } /* end first pass */
2726 DEBUG_TRIE_COMPILE_r(
6ad9a8ab 2727 Perl_re_indentf( aTHX_
cb41e5d6
YO
2728 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2729 depth+1,
55eed653 2730 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
2731 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2732 (int)trie->minlen, (int)trie->maxlen )
a3621e74 2733 );
a3621e74
YO
2734
2735 /*
2736 We now know what we are dealing with in terms of unique chars and
2737 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
2738 representation using a flat table will take. If it's over a reasonable
2739 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
2740 conservative but potentially much slower representation using an array
2741 of lists.
2742
2743 At the end we convert both representations into the same compressed
2744 form that will be used in regexec.c for matching with. The latter
2745 is a form that cannot be used to construct with but has memory
2746 properties similar to the list form and access properties similar
2747 to the table form making it both suitable for fast searches and
2748 small enough that its feasable to store for the duration of a program.
2749
2750 See the comment in the code where the compressed table is produced
2751 inplace from the flat tabe representation for an explanation of how
2752 the compression works.
2753
2754 */
2755
2756
2e64971a
DM
2757 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2758 prev_states[1] = 0;
2759
538e84ed
KW
2760 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2761 > SvIV(re_trie_maxbuff) )
2762 {
a3621e74
YO
2763 /*
2764 Second Pass -- Array Of Lists Representation
2765
2766 Each state will be represented by a list of charid:state records
2767 (reg_trie_trans_le) the first such element holds the CUR and LEN
2768 points of the allocated array. (See defines above).
2769
2770 We build the initial structure using the lists, and then convert
2771 it into the compressed table form which allows faster lookups
2772 (but cant be modified once converted).
a3621e74
YO
2773 */
2774
a3621e74
YO
2775 STRLEN transcount = 1;
2776
6ad9a8ab 2777 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
cb41e5d6 2778 depth+1));
686b73d4 2779
c944940b
JH
2780 trie->states = (reg_trie_state *)
2781 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2782 sizeof(reg_trie_state) );
a3621e74
YO
2783 TRIE_LIST_NEW(1);
2784 next_alloc = 2;
2785
2786 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2787
df826430 2788 regnode *noper = NEXTOPER( cur );
c445ea15
AL
2789 U32 state = 1; /* required init */
2790 U16 charid = 0; /* sanity init */
07be1b83 2791 U32 wordlen = 0; /* required init */
c445ea15 2792
df826430
YO
2793 if (OP(noper) == NOTHING) {
2794 regnode *noper_next= regnext(noper);
944e05e3
YO
2795 if (noper_next < tail)
2796 noper= noper_next;
df826430
YO
2797 }
2798
944e05e3
YO
2799 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2800 const U8 *uc= (U8*)STRING(noper);
2801 const U8 *e= uc + STR_LEN(noper);
2802
786e8c11 2803 for ( ; uc < e ; uc += len ) {
c445ea15 2804
786e8c11 2805 TRIE_READ_CHAR;
c445ea15 2806
786e8c11
YO
2807 if ( uvc < 256 ) {
2808 charid = trie->charmap[ uvc ];
c445ea15 2809 } else {
538e84ed
KW
2810 SV** const svpp = hv_fetch( widecharmap,
2811 (char*)&uvc,
2812 sizeof( UV ),
2813 0);
786e8c11
YO
2814 if ( !svpp ) {
2815 charid = 0;
2816 } else {
2817 charid=(U16)SvIV( *svpp );
2818 }
c445ea15 2819 }
538e84ed
KW
2820 /* charid is now 0 if we dont know the char read, or
2821 * nonzero if we do */
786e8c11 2822 if ( charid ) {
a3621e74 2823
786e8c11
YO
2824 U16 check;
2825 U32 newstate = 0;
a3621e74 2826
786e8c11
YO
2827 charid--;
2828 if ( !trie->states[ state ].trans.list ) {
2829 TRIE_LIST_NEW( state );
c445ea15 2830 }
538e84ed
KW
2831 for ( check = 1;
2832 check <= TRIE_LIST_USED( state );
2833 check++ )
2834 {
2835 if ( TRIE_LIST_ITEM( state, check ).forid
2836 == charid )
2837 {
786e8c11
YO
2838 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2839 break;
2840 }
2841 }
2842 if ( ! newstate ) {
2843 newstate = next_alloc++;
2e64971a 2844 prev_states[newstate] = state;
786e8c11
YO
2845 TRIE_LIST_PUSH( state, charid, newstate );
2846 transcount++;
2847 }
2848 state = newstate;
2849 } else {
147e3846 2850 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
c445ea15 2851 }
a28509cc 2852 }
c445ea15 2853 }
3dab1dad 2854 TRIE_HANDLE_WORD(state);
a3621e74
YO
2855
2856 } /* end second pass */
2857
1e2e3d02 2858 /* next alloc is the NEXT state to be allocated */
538e84ed 2859 trie->statecount = next_alloc;
c944940b
JH
2860 trie->states = (reg_trie_state *)
2861 PerlMemShared_realloc( trie->states,
2862 next_alloc
2863 * sizeof(reg_trie_state) );
a3621e74 2864
3dab1dad 2865 /* and now dump it out before we compress it */
2b8b4781
NC
2866 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2867 revcharmap, next_alloc,
2868 depth+1)
1e2e3d02 2869 );
a3621e74 2870
c944940b
JH
2871 trie->trans = (reg_trie_trans *)
2872 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
2873 {
2874 U32 state;
a3621e74
YO
2875 U32 tp = 0;
2876 U32 zp = 0;
2877
2878
2879 for( state=1 ; state < next_alloc ; state ++ ) {
2880 U32 base=0;
2881
2882 /*
2883 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 2884 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
a3621e74
YO
2885 );
2886 */
2887
2888 if (trie->states[state].trans.list) {
2889 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2890 U16 maxid=minid;
a28509cc 2891 U16 idx;
a3621e74
YO
2892
2893 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
2894 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2895 if ( forid < minid ) {
2896 minid=forid;
2897 } else if ( forid > maxid ) {
2898 maxid=forid;
2899 }
a3621e74
YO
2900 }
2901 if ( transcount < tp + maxid - minid + 1) {
2902 transcount *= 2;
c944940b
JH
2903 trie->trans = (reg_trie_trans *)
2904 PerlMemShared_realloc( trie->trans,
446bd890
NC
2905 transcount
2906 * sizeof(reg_trie_trans) );
538e84ed
KW
2907 Zero( trie->trans + (transcount / 2),
2908 transcount / 2,
2909 reg_trie_trans );
a3621e74
YO
2910 }
2911 base = trie->uniquecharcount + tp - minid;
2912 if ( maxid == minid ) {
2913 U32 set = 0;
2914 for ( ; zp < tp ; zp++ ) {
2915 if ( ! trie->trans[ zp ].next ) {
2916 base = trie->uniquecharcount + zp - minid;
538e84ed
KW
2917 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2918 1).newstate;
a3621e74
YO
2919 trie->trans[ zp ].check = state;
2920 set = 1;
2921 break;
2922 }
2923 }
2924 if ( !set ) {
538e84ed
KW
2925 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2926 1).newstate;
a3621e74
YO
2927 trie->trans[ tp ].check = state;
2928 tp++;
2929 zp = tp;
2930 }
2931 } else {
2932 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
538e84ed
KW
2933 const U32 tid = base
2934 - trie->uniquecharcount
2935 + TRIE_LIST_ITEM( state, idx ).forid;
2936 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2937 idx ).newstate;
a3621e74
YO
2938 trie->trans[ tid ].check = state;
2939 }
2940 tp += ( maxid - minid + 1 );
2941 }
2942 Safefree(trie->states[ state ].trans.list);
2943 }
2944 /*
2945 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 2946 Perl_re_printf( aTHX_ " base: %d\n",base);
a3621e74
YO
2947 );
2948 */
2949 trie->states[ state ].trans.base=base;
2950 }
cc601c31 2951 trie->lasttrans = tp + 1;
a3621e74
YO
2952 }
2953 } else {
2954 /*
2955 Second Pass -- Flat Table Representation.
2956
b423522f
KW
2957 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2958 each. We know that we will need Charcount+1 trans at most to store
2959 the data (one row per char at worst case) So we preallocate both
2960 structures assuming worst case.
a3621e74
YO
2961
2962 We then construct the trie using only the .next slots of the entry
2963 structs.
2964
b423522f
KW
2965 We use the .check field of the first entry of the node temporarily
2966 to make compression both faster and easier by keeping track of how
2967 many non zero fields are in the node.
a3621e74
YO
2968
2969 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2970 transition.
2971
b423522f
KW
2972 There are two terms at use here: state as a TRIE_NODEIDX() which is
2973 a number representing the first entry of the node, and state as a
2974 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2975 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2976 if there are 2 entrys per node. eg:
a3621e74
YO
2977
2978 A B A B
2979 1. 2 4 1. 3 7
2980 2. 0 3 3. 0 5
2981 3. 0 0 5. 0 0
2982 4. 0 0 7. 0 0
2983
b423522f
KW
2984 The table is internally in the right hand, idx form. However as we
2985 also have to deal with the states array which is indexed by nodenum
2986 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
2987
2988 */
6ad9a8ab 2989 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
cb41e5d6 2990 depth+1));
3dab1dad 2991
c944940b
JH
2992 trie->trans = (reg_trie_trans *)
2993 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2994 * trie->uniquecharcount + 1,
2995 sizeof(reg_trie_trans) );
2996 trie->states = (reg_trie_state *)
2997 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2998 sizeof(reg_trie_state) );
a3621e74
YO
2999 next_alloc = trie->uniquecharcount + 1;
3000
3dab1dad 3001
a3621e74
YO
3002 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3003
df826430 3004 regnode *noper = NEXTOPER( cur );
a3621e74
YO
3005
3006 U32 state = 1; /* required init */
3007
3008 U16 charid = 0; /* sanity init */
3009 U32 accept_state = 0; /* sanity init */
a3621e74 3010
07be1b83 3011 U32 wordlen = 0; /* required init */
a3621e74 3012
df826430
YO
3013 if (OP(noper) == NOTHING) {
3014 regnode *noper_next= regnext(noper);
944e05e3
YO
3015 if (noper_next < tail)
3016 noper= noper_next;
df826430 3017 }
fab2782b 3018
944e05e3
YO
3019 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
3020 const U8 *uc= (U8*)STRING(noper);
3021 const U8 *e= uc + STR_LEN(noper);
3022
786e8c11 3023 for ( ; uc < e ; uc += len ) {
a3621e74 3024
786e8c11 3025 TRIE_READ_CHAR;
a3621e74 3026
786e8c11
YO
3027 if ( uvc < 256 ) {
3028 charid = trie->charmap[ uvc ];
3029 } else {
538e84ed
KW
3030 SV* const * const svpp = hv_fetch( widecharmap,
3031 (char*)&uvc,
3032 sizeof( UV ),
3033 0);
786e8c11 3034 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 3035 }
786e8c11
YO
3036 if ( charid ) {
3037 charid--;
3038 if ( !trie->trans[ state + charid ].next ) {
3039 trie->trans[ state + charid ].next = next_alloc;
3040 trie->trans[ state ].check++;
2e64971a
DM
3041 prev_states[TRIE_NODENUM(next_alloc)]
3042 = TRIE_NODENUM(state);
786e8c11
YO
3043 next_alloc += trie->uniquecharcount;
3044 }
3045 state = trie->trans[ state + charid ].next;
3046 } else {
147e3846 3047 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
786e8c11 3048 }
538e84ed
KW
3049 /* charid is now 0 if we dont know the char read, or
3050 * nonzero if we do */
a3621e74 3051 }
a3621e74 3052 }
3dab1dad
YO
3053 accept_state = TRIE_NODENUM( state );
3054 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
3055
3056 } /* end second pass */
3057
3dab1dad 3058 /* and now dump it out before we compress it */
2b8b4781
NC
3059 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3060 revcharmap,
3061 next_alloc, depth+1));
a3621e74 3062
a3621e74
YO
3063 {
3064 /*
3065 * Inplace compress the table.*
3066
3067 For sparse data sets the table constructed by the trie algorithm will
3068 be mostly 0/FAIL transitions or to put it another way mostly empty.
3069 (Note that leaf nodes will not contain any transitions.)
3070
3071 This algorithm compresses the tables by eliminating most such
3072 transitions, at the cost of a modest bit of extra work during lookup:
3073
3074 - Each states[] entry contains a .base field which indicates the
3075 index in the state[] array wheres its transition data is stored.
3076
3b753521 3077 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
3078
3079 - If .base is nonzero then charid is added to it to find an entry in
3080 the trans array.
3081
3082 -If trans[states[state].base+charid].check!=state then the
3083 transition is taken to be a 0/Fail transition. Thus if there are fail
3084 transitions at the front of the node then the .base offset will point
3085 somewhere inside the previous nodes data (or maybe even into a node
3086 even earlier), but the .check field determines if the transition is
3087 valid.
3088
786e8c11 3089 XXX - wrong maybe?
a3621e74 3090 The following process inplace converts the table to the compressed
3b753521 3091 table: We first do not compress the root node 1,and mark all its
a3621e74 3092 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
3093 allows us to do a DFA construction from the compressed table later,
3094 and ensures that any .base pointers we calculate later are greater
3095 than 0.
a3621e74
YO
3096
3097 - We set 'pos' to indicate the first entry of the second node.
3098
3099 - We then iterate over the columns of the node, finding the first and
3100 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3101 and set the .check pointers accordingly, and advance pos
3102 appropriately and repreat for the next node. Note that when we copy
3103 the next pointers we have to convert them from the original
3104 NODEIDX form to NODENUM form as the former is not valid post
3105 compression.
3106
3107 - If a node has no transitions used we mark its base as 0 and do not
3108 advance the pos pointer.
3109
3110 - If a node only has one transition we use a second pointer into the
3111 structure to fill in allocated fail transitions from other states.
3112 This pointer is independent of the main pointer and scans forward
3113 looking for null transitions that are allocated to a state. When it
3114 finds one it writes the single transition into the "hole". If the
786e8c11 3115 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
3116
3117 - Once compressed we can Renew/realloc the structures to release the
3118 excess space.
3119
3120 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3121 specifically Fig 3.47 and the associated pseudocode.
3122
3123 demq
3124 */
a3b680e6 3125 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 3126 U32 state, charid;
a3621e74 3127 U32 pos = 0, zp=0;
1e2e3d02 3128 trie->statecount = laststate;
a3621e74
YO
3129
3130 for ( state = 1 ; state < laststate ; state++ ) {
3131 U8 flag = 0;
a28509cc
AL
3132 const U32 stateidx = TRIE_NODEIDX( state );
3133 const U32 o_used = trie->trans[ stateidx ].check;
3134 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
3135 trie->trans[ stateidx ].check = 0;
3136
538e84ed
KW
3137 for ( charid = 0;
3138 used && charid < trie->uniquecharcount;
3139 charid++ )
3140 {
a3621e74
YO
3141 if ( flag || trie->trans[ stateidx + charid ].next ) {
3142 if ( trie->trans[ stateidx + charid ].next ) {
3143 if (o_used == 1) {
3144 for ( ; zp < pos ; zp++ ) {
3145 if ( ! trie->trans[ zp ].next ) {
3146 break;
3147 }
3148 }
538e84ed
KW
3149 trie->states[ state ].trans.base
3150 = zp
3151 + trie->uniquecharcount
3152 - charid ;
3153 trie->trans[ zp ].next
3154 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3155 + charid ].next );
a3621e74
YO
3156 trie->trans[ zp ].check = state;
3157 if ( ++zp > pos ) pos = zp;
3158 break;
3159 }
3160 used--;
3161 }
3162 if ( !flag ) {
3163 flag = 1;
538e84ed
KW
3164 trie->states[ state ].trans.base
3165 = pos + trie->uniquecharcount - charid ;
a3621e74 3166 }
538e84ed
KW
3167 trie->trans[ pos ].next
3168 = SAFE_TRIE_NODENUM(
3169 trie->trans[ stateidx + charid ].next );
a3621e74
YO
3170 trie->trans[ pos ].check = state;
3171 pos++;
3172 }
3173 }
3174 }
cc601c31 3175 trie->lasttrans = pos + 1;
c944940b
JH
3176 trie->states = (reg_trie_state *)
3177 PerlMemShared_realloc( trie->states, laststate
3178 * sizeof(reg_trie_state) );
a3621e74 3179 DEBUG_TRIE_COMPILE_MORE_r(
147e3846 3180 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
cb41e5d6 3181 depth+1,
538e84ed
KW
3182 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3183 + 1 ),
3184 (IV)next_alloc,
3185 (IV)pos,
3186 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
a3621e74
YO
3187 );
3188
3189 } /* end table compress */
3190 }
1e2e3d02 3191 DEBUG_TRIE_COMPILE_MORE_r(
147e3846 3192 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
cb41e5d6 3193 depth+1,
1e2e3d02
YO
3194 (UV)trie->statecount,
3195 (UV)trie->lasttrans)
3196 );
cc601c31 3197 /* resize the trans array to remove unused space */
c944940b
JH
3198 trie->trans = (reg_trie_trans *)
3199 PerlMemShared_realloc( trie->trans, trie->lasttrans
3200 * sizeof(reg_trie_trans) );
a3621e74 3201
538e84ed 3202 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
3203 U8 nodetype =(U8)(flags & 0xFF);
3204 char *str=NULL;
538e84ed 3205
07be1b83 3206#ifdef DEBUGGING
e62cc96a 3207 regnode *optimize = NULL;
7122b237
YO
3208#ifdef RE_TRACK_PATTERN_OFFSETS
3209
b57a0404
JH
3210 U32 mjd_offset = 0;
3211 U32 mjd_nodelen = 0;
7122b237
YO
3212#endif /* RE_TRACK_PATTERN_OFFSETS */
3213#endif /* DEBUGGING */
a3621e74 3214 /*
3dab1dad
YO
3215 This means we convert either the first branch or the first Exact,
3216 depending on whether the thing following (in 'last') is a branch
3217 or not and whther first is the startbranch (ie is it a sub part of
3218 the alternation or is it the whole thing.)
3b753521 3219 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 3220 the whole branch sequence, including the first.
a3621e74 3221 */
3dab1dad 3222 /* Find the node we are going to overwrite */
7f69552c 3223 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 3224 /* branch sub-chain */
3dab1dad 3225 NEXT_OFF( first ) = (U16)(last - first);
7122b237 3226#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
3227 DEBUG_r({
3228 mjd_offset= Node_Offset((convert));
3229 mjd_nodelen= Node_Length((convert));
3230 });
7122b237 3231#endif
7f69552c 3232 /* whole branch chain */
7122b237
YO
3233 }
3234#ifdef RE_TRACK_PATTERN_OFFSETS
3235 else {
7f69552c
YO
3236 DEBUG_r({
3237 const regnode *nop = NEXTOPER( convert );
3238 mjd_offset= Node_Offset((nop));
3239 mjd_nodelen= Node_Length((nop));
3240 });
07be1b83
YO
3241 }
3242 DEBUG_OPTIMISE_r(
147e3846 3243 Perl_re_indentf( aTHX_ "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
cb41e5d6 3244 depth+1,
786e8c11 3245 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 3246 );
7122b237 3247#endif
538e84ed 3248 /* But first we check to see if there is a common prefix we can
3dab1dad
YO
3249 split out as an EXACT and put in front of the TRIE node. */
3250 trie->startstate= 1;
55eed653 3251 if ( trie->bitmap && !widecharmap && !trie->jump ) {
5ee57374
YO
3252 /* we want to find the first state that has more than
3253 * one transition, if that state is not the first state
3254 * then we have a common prefix which we can remove.
3255 */
3dab1dad 3256 U32 state;
1e2e3d02 3257 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 3258 U32 ofs = 0;
ee0dfd0b
YO
3259 I32 first_ofs = -1; /* keeps track of the ofs of the first
3260 transition, -1 means none */
8e11feef
RGS
3261 U32 count = 0;
3262 const U32 base = trie->states[ state ].trans.base;
a3621e74 3263
5ee57374 3264 /* does this state terminate an alternation? */
3dab1dad 3265 if ( trie->states[state].wordnum )
8e11feef 3266 count = 1;
a3621e74 3267
8e11feef 3268 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
3269 if ( ( base + ofs >= trie->uniquecharcount ) &&
3270 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
3271 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3272 {
3dab1dad 3273 if ( ++count > 1 ) {
5ee57374 3274 /* we have more than one transition */
d3d91c7c
YO
3275 SV **tmp;
3276 U8 *ch;
5ee57374
YO
3277 /* if this is the first state there is no common prefix
3278 * to extract, so we can exit */
8e11feef 3279 if ( state == 1 ) break;
d3d91c7c
YO
3280 tmp = av_fetch( revcharmap, ofs, 0);
3281 ch = (U8*)SvPV_nolen_const( *tmp );
3282
5ee57374
YO
3283 /* if we are on count 2 then we need to initialize the
3284 * bitmap, and store the previous char if there was one
3285 * in it*/
3dab1dad 3286 if ( count == 2 ) {
5ee57374 3287 /* clear the bitmap */
3dab1dad
YO
3288 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3289 DEBUG_OPTIMISE_r(
147e3846 3290 Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [",
cb41e5d6 3291 depth+1,
786e8c11 3292 (UV)state));
ee0dfd0b
YO
3293 if (first_ofs >= 0) {
3294 SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
be8e71aa 3295 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 3296
8bcafbf4 3297 TRIE_BITMAP_SET_FOLDED(trie,*ch,folder);
3dab1dad 3298 DEBUG_OPTIMISE_r(
6ad9a8ab 3299 Perl_re_printf( aTHX_ "%s", (char*)ch)
3dab1dad 3300 );
8e11feef
RGS
3301 }
3302 }
8bcafbf4
YO
3303 /* store the current firstchar in the bitmap */
3304 TRIE_BITMAP_SET_FOLDED(trie,*ch,folder);
6ad9a8ab 3305 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
8e11feef 3306 }
ee0dfd0b 3307 first_ofs = ofs;
8e11feef 3308 }
3dab1dad
YO
3309 }
3310 if ( count == 1 ) {
5ee57374
YO
3311 /* This state has only one transition, its transition is part
3312 * of a common prefix - we need to concatenate the char it
3313 * represents to what we have so far. */
ee0dfd0b 3314 SV **tmp = av_fetch( revcharmap, first_ofs, 0);
c490c714
YO
3315 STRLEN len;
3316 char *ch = SvPV( *tmp, len );
de734bd5
A
3317 DEBUG_OPTIMISE_r({
3318 SV *sv=sv_newmortal();
147e3846 3319 Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
cb41e5d6 3320 depth+1,
ee0dfd0b 3321 (UV)state, (UV)first_ofs,
538e84ed 3322 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
de734bd5
A
3323 PL_colors[0], PL_colors[1],
3324 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed 3325 PERL_PV_ESCAPE_FIRSTCHAR
de734bd5
A
3326 )
3327 );
3328 });
3dab1dad
YO
3329 if ( state==1 ) {
3330 OP( convert ) = nodetype;
3331 str=STRING(convert);
3332 STR_LEN(convert)=0;
3333 }
c490c714
YO
3334 STR_LEN(convert) += len;
3335 while (len--)
de734bd5 3336 *str++ = *ch++;
8e11feef 3337 } else {
538e84ed 3338#ifdef DEBUGGING
8e11feef 3339 if (state>1)
6ad9a8ab 3340 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
f9049ba1 3341#endif
8e11feef
RGS
3342 break;
3343 }
3344 }
2e64971a 3345 trie->prefixlen = (state-1);
3dab1dad 3346 if (str) {
8e11feef 3347 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 3348 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 3349 trie->startstate = state;
07be1b83
YO
3350 trie->minlen -= (state - 1);
3351 trie->maxlen -= (state - 1);
33809eae
JH
3352#ifdef DEBUGGING
3353 /* At least the UNICOS C compiler choked on this
3354 * being argument to DEBUG_r(), so let's just have
3355 * it right here. */
3356 if (
3357#ifdef PERL_EXT_RE_BUILD
3358 1
3359#else
3360 DEBUG_r_TEST
3361#endif
3362 ) {
3363 regnode *fix = convert;
3364 U32 word = trie->wordcount;
3365 mjd_nodelen++;
3366 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3367 while( ++fix < n ) {
3368 Set_Node_Offset_Length(fix, 0, 0);
3369 }
3370 while (word--) {
3371 SV ** const tmp = av_fetch( trie_words, word, 0 );
3372 if (tmp) {
3373 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3374 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3375 else
3376 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3377 }
3378 }
3379 }
3380#endif
8e11feef
RGS
3381 if (trie->maxlen) {
3382 convert = n;
3383 } else {
3dab1dad 3384 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 3385 DEBUG_r(optimize= n);
3dab1dad
YO
3386 }
3387 }
3388 }
538e84ed
KW
3389 if (!jumper)
3390 jumper = last;
3dab1dad 3391 if ( trie->maxlen ) {
8e11feef
RGS
3392 NEXT_OFF( convert ) = (U16)(tail - convert);
3393 ARG_SET( convert, data_slot );
538e84ed
KW
3394 /* Store the offset to the first unabsorbed branch in
3395 jump[0], which is otherwise unused by the jump logic.
786e8c11 3396 We use this when dumping a trie and during optimisation. */
538e84ed 3397 if (trie->jump)
7f69552c 3398 trie->jump[0] = (U16)(nextbranch - convert);
538e84ed 3399
6c48061a
YO
3400 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3401 * and there is a bitmap
3402 * and the first "jump target" node we found leaves enough room
3403 * then convert the TRIE node into a TRIEC node, with the bitmap
3404 * embedded inline in the opcode - this is hypothetically faster.
3405 */
3406 if ( !trie->states[trie->startstate].wordnum
3407 && trie->bitmap
3408 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
3409 {
3410 OP( convert ) = TRIEC;
3411 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 3412 PerlMemShared_free(trie->bitmap);
786e8c11 3413 trie->bitmap= NULL;
538e84ed 3414 } else
786e8c11 3415 OP( convert ) = TRIE;
a3621e74 3416
3dab1dad
YO
3417 /* store the type in the flags */
3418 convert->flags = nodetype;
a5ca303d 3419 DEBUG_r({
538e84ed
KW
3420 optimize = convert
3421 + NODE_STEP_REGNODE
a5ca303d
YO
3422 + regarglen[ OP( convert ) ];
3423 });
538e84ed 3424 /* XXX We really should free up the resource in trie now,
a5ca303d 3425 as we won't use them - (which resources?) dmq */
3dab1dad 3426 }
a3621e74 3427 /* needed for dumping*/
e62cc96a 3428 DEBUG_r(if (optimize) {
07be1b83 3429 regnode *opt = convert;
bcdf7404 3430
e62cc96a 3431 while ( ++opt < optimize) {
07be1b83
YO
3432 Set_Node_Offset_Length(opt,0,0);
3433 }
538e84ed
KW
3434 /*
3435 Try to clean up some of the debris left after the
786e8c11 3436 optimisation.
a3621e74 3437 */
786e8c11 3438 while( optimize < jumper ) {
07be1b83 3439 mjd_nodelen += Node_Length((optimize));
a3621e74 3440 OP( optimize ) = OPTIMIZED;
07be1b83 3441 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
3442 optimize++;
3443 }
07be1b83 3444 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
3445 });
3446 } /* end node insert */
2e64971a
DM
3447
3448 /* Finish populating the prev field of the wordinfo array. Walk back
3449 * from each accept state until we find another accept state, and if
3450 * so, point the first word's .prev field at the second word. If the
3451 * second already has a .prev field set, stop now. This will be the
3452 * case either if we've already processed that word's accept state,
3b753521
FN
3453 * or that state had multiple words, and the overspill words were
3454 * already linked up earlier.
2e64971a
DM
3455 */
3456 {
3457 U16 word;
3458 U32 state;
3459 U16 prev;
3460
3461 for (word=1; word <= trie->wordcount; word++) {
3462 prev = 0;
3463 if (trie->wordinfo[word].prev)
3464 continue;
3465 state = trie->wordinfo[word].accept;
3466 while (state) {
3467 state = prev_states[state];
3468 if (!state)
3469 break;
3470 prev = trie->states[state].wordnum;
3471 if (prev)
3472 break;
3473 }
3474 trie->wordinfo[word].prev = prev;