This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Allow space for NUL is UTF-8 array decls
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
afda64e8 84EXTERN_C const struct regexp_engine my_reg_engine;
54df2634
NC
85#else
86# include "regcomp.h"
87#endif
a687059c 88
f7e03a10 89#include "dquote_inline.h"
b992490d 90#include "invlist_inline.h"
1b0f46bf 91#include "unicode_constants.h"
04e98a4d 92
538e84ed
KW
93#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
94 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
f12c0118
KW
95#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
96 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 97#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 98#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 99
a687059c
LW
100#ifndef STATIC
101#define STATIC static
102#endif
103
3f910e62
YO
104/* this is a chain of data about sub patterns we are processing that
105 need to be handled separately/specially in study_chunk. Its so
106 we can simulate recursion without losing state. */
107struct scan_frame;
108typedef struct scan_frame {
109 regnode *last_regnode; /* last node to process in this frame */
110 regnode *next_regnode; /* next node to process when last is reached */
111 U32 prev_recursed_depth;
112 I32 stopparen; /* what stopparen do we use */
3f910e62
YO
113
114 struct scan_frame *this_prev_frame; /* this previous frame */
115 struct scan_frame *prev_frame; /* previous frame */
116 struct scan_frame *next_frame; /* next frame */
117} scan_frame;
118
f4ae5a27
KW
119/* Certain characters are output as a sequence with the first being a
120 * backslash. */
eb7486de 121#define isBACKSLASHED_PUNCT(c) strchr("-[]\\^", c)
f4ae5a27
KW
122
123
09b2b2e6 124struct RExC_state_t {
514a91f1
DM
125 U32 flags; /* RXf_* are we folding, multilining? */
126 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 127 char *precomp; /* uncompiled string. */
711b303b 128 char *precomp_end; /* pointer to end of uncompiled string. */
288b8c02 129 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf 130 regexp *rx; /* perl core regexp structure */
538e84ed
KW
131 regexp_internal *rxi; /* internal data for regexp object
132 pprivate field */
fac92740 133 char *start; /* Start of input for compile */
830247a4
IZ
134 char *end; /* End of input for compile */
135 char *parse; /* Input-scan pointer. */
285b5ca0
KW
136 char *adjusted_start; /* 'start', adjusted. See code use */
137 STRLEN precomp_adj; /* an offset beyond precomp. See code use */
ea3daa5d 138 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 139 regnode *emit_start; /* Start of emitted-code area */
538e84ed
KW
140 regnode *emit_bound; /* First regnode outside of the
141 allocated space */
f7c7e32a
DM
142 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
143 implies compiling, so don't emit */
9a81a976
KW
144 regnode_ssc emit_dummy; /* placeholder for emit to point to;
145 large enough for the largest
146 non-EXACTish node, so can use it as
147 scratch in pass1 */
830247a4
IZ
148 I32 naughty; /* How bad is this pattern? */
149 I32 sawback; /* Did we see \1, ...? */
150 U32 seen;
ea3daa5d 151 SSize_t size; /* Code size. */
e7f023ff 152 I32 npar; /* Capture buffer count, (OPEN) plus
538e84ed
KW
153 one. ("par" 0 is the whole
154 pattern)*/
155 I32 nestroot; /* root parens we are in - used by
156 accept */
830247a4
IZ
157 I32 extralen;
158 I32 seen_zerolen;
40d049e4
YO
159 regnode **open_parens; /* pointers to open parens */
160 regnode **close_parens; /* pointers to close parens */
d5a00e4a 161 regnode *end_op; /* END node in program */
02daf0ab
YO
162 I32 utf8; /* whether the pattern is utf8 or not */
163 I32 orig_utf8; /* whether the pattern was originally in utf8 */
164 /* XXX use this for future optimisation of case
165 * where pattern must be upgraded to utf8. */
e40e74fe
KW
166 I32 uni_semantics; /* If a d charset modifier should use unicode
167 rules, even if the pattern is not in
168 utf8 */
81714fb9 169 HV *paren_names; /* Paren names */
538e84ed 170
40d049e4 171 regnode **recurse; /* Recurse regops */
d5a00e4a 172 I32 recurse_count; /* Number of recurse regops we have generated */
4286711a 173 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
538e84ed 174 through */
09a65838 175 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 176 I32 in_lookbehind;
4624b182 177 I32 contains_locale;
bb3f3ed2 178 I32 override_recoding;
b6d67071
KW
179#ifdef EBCDIC
180 I32 recode_x_to_native;
181#endif
9d53c457 182 I32 in_multi_char_class;
1acab4c5 183 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
68e2671b 184 within pattern */
b1603ef8 185 int code_index; /* next code_blocks[] slot */
ee273784 186 SSize_t maxlen; /* mininum possible number of chars in string to match */
3f910e62
YO
187 scan_frame *frame_head;
188 scan_frame *frame_last;
189 U32 frame_count;
7eec73eb 190 AV *warn_text;
dc6d7f5c 191#ifdef ADD_TO_REGEXEC
830247a4
IZ
192 char *starttry; /* -Dr: where regtry was called. */
193#define RExC_starttry (pRExC_state->starttry)
194#endif
d24ca0c5 195 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 196#ifdef DEBUGGING
be8e71aa 197 const char *lastparse;
3dab1dad 198 I32 lastnum;
1f1031fe 199 AV *paren_name_list; /* idx -> name */
d9a72fcc 200 U32 study_chunk_recursed_count;
c9f0d54c
YO
201 SV *mysv1;
202 SV *mysv2;
3dab1dad
YO
203#define RExC_lastparse (pRExC_state->lastparse)
204#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 205#define RExC_paren_name_list (pRExC_state->paren_name_list)
d9a72fcc 206#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
c9f0d54c
YO
207#define RExC_mysv (pRExC_state->mysv1)
208#define RExC_mysv1 (pRExC_state->mysv1)
209#define RExC_mysv2 (pRExC_state->mysv2)
210
3dab1dad 211#endif
512c0f5a 212 bool seen_unfolded_sharp_s;
911bd04e 213 bool strict;
da7cf1cc 214 bool study_started;
034602eb 215 bool in_script_run;
09b2b2e6 216};
830247a4 217
e2509266 218#define RExC_flags (pRExC_state->flags)
514a91f1 219#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 220#define RExC_precomp (pRExC_state->precomp)
285b5ca0
KW
221#define RExC_precomp_adj (pRExC_state->precomp_adj)
222#define RExC_adjusted_start (pRExC_state->adjusted_start)
711b303b 223#define RExC_precomp_end (pRExC_state->precomp_end)
288b8c02 224#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 225#define RExC_rx (pRExC_state->rx)
f8fc2ecf 226#define RExC_rxi (pRExC_state->rxi)
fac92740 227#define RExC_start (pRExC_state->start)
830247a4
IZ
228#define RExC_end (pRExC_state->end)
229#define RExC_parse (pRExC_state->parse)
230#define RExC_whilem_seen (pRExC_state->whilem_seen)
512c0f5a
KW
231
232/* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
233 * EXACTF node, hence was parsed under /di rules. If later in the parse,
234 * something forces the pattern into using /ui rules, the sharp s should be
235 * folded into the sequence 'ss', which takes up more space than previously
236 * calculated. This means that the sizing pass needs to be restarted. (The
237 * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node
238 * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
239 * so there is no need to resize [perl #125990]. */
240#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
241
7122b237 242#ifdef RE_TRACK_PATTERN_OFFSETS
538e84ed
KW
243#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
244 others */
7122b237 245#endif
830247a4 246#define RExC_emit (pRExC_state->emit)
f7c7e32a 247#define RExC_emit_dummy (pRExC_state->emit_dummy)
fac92740 248#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 249#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
250#define RExC_sawback (pRExC_state->sawback)
251#define RExC_seen (pRExC_state->seen)
252#define RExC_size (pRExC_state->size)
ee273784 253#define RExC_maxlen (pRExC_state->maxlen)
830247a4 254#define RExC_npar (pRExC_state->npar)
e2e6a0f1 255#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
256#define RExC_extralen (pRExC_state->extralen)
257#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 258#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 259#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 260#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
261#define RExC_open_parens (pRExC_state->open_parens)
262#define RExC_close_parens (pRExC_state->close_parens)
5bd2d46e 263#define RExC_end_op (pRExC_state->end_op)
81714fb9 264#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
265#define RExC_recurse (pRExC_state->recurse)
266#define RExC_recurse_count (pRExC_state->recurse_count)
09a65838 267#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
538e84ed
KW
268#define RExC_study_chunk_recursed_bytes \
269 (pRExC_state->study_chunk_recursed_bytes)
b57e4118 270#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 271#define RExC_contains_locale (pRExC_state->contains_locale)
b6d67071
KW
272#ifdef EBCDIC
273# define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
274#endif
9d53c457 275#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
3f910e62
YO
276#define RExC_frame_head (pRExC_state->frame_head)
277#define RExC_frame_last (pRExC_state->frame_last)
278#define RExC_frame_count (pRExC_state->frame_count)
67cdf558 279#define RExC_strict (pRExC_state->strict)
da7cf1cc 280#define RExC_study_started (pRExC_state->study_started)
7eec73eb 281#define RExC_warn_text (pRExC_state->warn_text)
034602eb 282#define RExC_in_script_run (pRExC_state->in_script_run)
830247a4 283
99807a43
HS
284/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
285 * a flag to disable back-off on the fixed/floating substrings - if it's
286 * a high complexity pattern we assume the benefit of avoiding a full match
287 * is worth the cost of checking for the substrings even if they rarely help.
288 */
289#define RExC_naughty (pRExC_state->naughty)
290#define TOO_NAUGHTY (10)
291#define MARK_NAUGHTY(add) \
292 if (RExC_naughty < TOO_NAUGHTY) \
293 RExC_naughty += (add)
294#define MARK_NAUGHTY_EXP(exp, add) \
295 if (RExC_naughty < TOO_NAUGHTY) \
296 RExC_naughty += RExC_naughty / (exp) + (add)
cde0cee5 297
a687059c
LW
298#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
299#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
412f55bb 300 ((*s) == '{' && regcurly(s)))
a687059c
LW
301
302/*
303 * Flags to be passed up and down.
304 */
a687059c 305#define WORST 0 /* Worst case. */
a3b492c3 306#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee 307
e64f369d 308/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
309 * character. (There needs to be a case: in the switch statement in regexec.c
310 * for any node marked SIMPLE.) Note that this is not the same thing as
311 * REGNODE_SIMPLE */
fda99bee 312#define SIMPLE 0x02
e64f369d 313#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
314#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
315#define TRYAGAIN 0x10 /* Weeded out a declaration. */
b97943f7
KW
316#define RESTART_PASS1 0x20 /* Need to restart sizing pass */
317#define NEED_UTF8 0x40 /* In conjunction with RESTART_PASS1, need to
318 calcuate sizes as UTF-8 */
a687059c 319
3dab1dad
YO
320#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
321
07be1b83
YO
322/* whether trie related optimizations are enabled */
323#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
324#define TRIE_STUDY_OPT
786e8c11 325#define FULL_TRIE_STUDY
07be1b83
YO
326#define TRIE_STCLASS
327#endif
1de06328
YO
328
329
40d049e4
YO
330
331#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
332#define PBITVAL(paren) (1 << ((paren) & 7))
333#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
334#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
335#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
336
82a6ada4 337#define REQUIRE_UTF8(flagp) STMT_START { \
8d9c2815 338 if (!UTF) { \
82a6ada4
KW
339 assert(PASS1); \
340 *flagp = RESTART_PASS1|NEED_UTF8; \
8d9c2815
NC
341 return NULL; \
342 } \
82a6ada4 343 } STMT_END
40d049e4 344
512c0f5a
KW
345/* Change from /d into /u rules, and restart the parse if we've already seen
346 * something whose size would increase as a result, by setting *flagp and
347 * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates
348 * we've change to /u during the parse. */
349#define REQUIRE_UNI_RULES(flagp, restart_retval) \
350 STMT_START { \
351 if (DEPENDS_SEMANTICS) { \
352 assert(PASS1); \
353 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
354 RExC_uni_semantics = 1; \
355 if (RExC_seen_unfolded_sharp_s) { \
356 *flagp |= RESTART_PASS1; \
357 return restart_retval; \
358 } \
359 } \
360 } STMT_END
361
f19b1a63
KW
362/* This converts the named class defined in regcomp.h to its equivalent class
363 * number defined in handy.h. */
364#define namedclass_to_classnum(class) ((int) ((class) / 2))
365#define classnum_to_namedclass(classnum) ((classnum) * 2)
366
de92f5e6
KW
367#define _invlist_union_complement_2nd(a, b, output) \
368 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
369#define _invlist_intersection_complement_2nd(a, b, output) \
370 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
371
1de06328
YO
372/* About scan_data_t.
373
374 During optimisation we recurse through the regexp program performing
375 various inplace (keyhole style) optimisations. In addition study_chunk
376 and scan_commit populate this data structure with information about
538e84ed 377 what strings MUST appear in the pattern. We look for the longest
3b753521 378 string that must appear at a fixed location, and we look for the
1de06328
YO
379 longest string that may appear at a floating location. So for instance
380 in the pattern:
538e84ed 381
1de06328 382 /FOO[xX]A.*B[xX]BAR/
538e84ed 383
1de06328
YO
384 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
385 strings (because they follow a .* construct). study_chunk will identify
386 both FOO and BAR as being the longest fixed and floating strings respectively.
538e84ed 387
1de06328 388 The strings can be composites, for instance
538e84ed 389
1de06328 390 /(f)(o)(o)/
538e84ed 391
1de06328 392 will result in a composite fixed substring 'foo'.
538e84ed 393
1de06328 394 For each string some basic information is maintained:
538e84ed 395
6ea77169 396 - min_offset
1de06328
YO
397 This is the position the string must appear at, or not before.
398 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
399 characters must match before the string we are searching for.
400 Likewise when combined with minlenp and the length of the string it
538e84ed 401 tells us how many characters must appear after the string we have
1de06328 402 found.
538e84ed 403
1de06328
YO
404 - max_offset
405 Only used for floating strings. This is the rightmost point that
ea3daa5d 406 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328 407 string can occur infinitely far to the right.
6e12f832 408 For fixed strings, it is equal to min_offset.
538e84ed 409
1de06328 410 - minlenp
2d608413
KW
411 A pointer to the minimum number of characters of the pattern that the
412 string was found inside. This is important as in the case of positive
538e84ed 413 lookahead or positive lookbehind we can have multiple patterns
1de06328 414 involved. Consider
538e84ed 415
1de06328 416 /(?=FOO).*F/
538e84ed 417
1de06328
YO
418 The minimum length of the pattern overall is 3, the minimum length
419 of the lookahead part is 3, but the minimum length of the part that
538e84ed 420 will actually match is 1. So 'FOO's minimum length is 3, but the
1de06328 421 minimum length for the F is 1. This is important as the minimum length
538e84ed 422 is used to determine offsets in front of and behind the string being
1de06328 423 looked for. Since strings can be composites this is the length of the
486ec47a 424 pattern at the time it was committed with a scan_commit. Note that
1de06328 425 the length is calculated by study_chunk, so that the minimum lengths
538e84ed 426 are not known until the full pattern has been compiled, thus the
1de06328 427 pointer to the value.
538e84ed 428
1de06328 429 - lookbehind
538e84ed 430
1de06328 431 In the case of lookbehind the string being searched for can be
538e84ed 432 offset past the start point of the final matching string.
1de06328
YO
433 If this value was just blithely removed from the min_offset it would
434 invalidate some of the calculations for how many chars must match
435 before or after (as they are derived from min_offset and minlen and
538e84ed 436 the length of the string being searched for).
1de06328
YO
437 When the final pattern is compiled and the data is moved from the
438 scan_data_t structure into the regexp structure the information
538e84ed
KW
439 about lookbehind is factored in, with the information that would
440 have been lost precalculated in the end_shift field for the
1de06328
YO
441 associated string.
442
443 The fields pos_min and pos_delta are used to store the minimum offset
538e84ed 444 and the delta to the maximum offset at the current point in the pattern.
1de06328
YO
445
446*/
2c2d71f5 447
cb6aef8b
DM
448struct scan_data_substrs {
449 SV *str; /* longest substring found in pattern */
450 SSize_t min_offset; /* earliest point in string it can appear */
451 SSize_t max_offset; /* latest point in string it can appear */
452 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
453 SSize_t lookbehind; /* is the pos of the string modified by LB */
11683ecb 454 I32 flags; /* per substring SF_* and SCF_* flags */
cb6aef8b
DM
455};
456
2c2d71f5 457typedef struct scan_data_t {
1de06328
YO
458 /*I32 len_min; unused */
459 /*I32 len_delta; unused */
49f55535 460 SSize_t pos_min;
ea3daa5d 461 SSize_t pos_delta;
2c2d71f5 462 SV *last_found;
ea3daa5d 463 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 464 SSize_t last_start_min;
ea3daa5d 465 SSize_t last_start_max;
2099df82
DM
466 U8 cur_is_floating; /* whether the last_* values should be set as
467 * the next fixed (0) or floating (1)
468 * substring */
6ea77169 469
2099df82
DM
470 /* [0] is longest fixed substring so far, [1] is longest float so far */
471 struct scan_data_substrs substrs[2];
6ea77169 472
11683ecb 473 I32 flags; /* common SF_* and SCF_* flags */
2c2d71f5 474 I32 whilem_c;
49f55535 475 SSize_t *last_closep;
b8f7bb16 476 regnode_ssc *start_class;
2c2d71f5
JH
477} scan_data_t;
478
a687059c 479/*
e50aee73 480 * Forward declarations for pregcomp()'s friends.
a687059c 481 */
a0d0e21e 482
6ea77169 483static const scan_data_t zero_scan_data = {
a2ca2017 484 0, 0, NULL, 0, 0, 0, 0,
6ea77169 485 {
11683ecb
DM
486 { NULL, 0, 0, 0, 0, 0 },
487 { NULL, 0, 0, 0, 0, 0 },
6ea77169
DM
488 },
489 0, 0, NULL, NULL
490};
c277df42 491
11683ecb
DM
492/* study flags */
493
07be1b83
YO
494#define SF_BEFORE_SEOL 0x0001
495#define SF_BEFORE_MEOL 0x0002
11683ecb 496#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
c277df42 497
07be1b83
YO
498#define SF_IS_INF 0x0040
499#define SF_HAS_PAR 0x0080
500#define SF_IN_PAR 0x0100
501#define SF_HAS_EVAL 0x0200
bc604ad8
KW
502
503
504/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
505 * longest substring in the pattern. When it is not set the optimiser keeps
506 * track of position, but does not keep track of the actual strings seen,
507 *
508 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
509 * /foo/i will not.
510 *
511 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
512 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
513 * turned off because of the alternation (BRANCH). */
07be1b83 514#define SCF_DO_SUBSTR 0x0400
bc604ad8 515
653099ff
GS
516#define SCF_DO_STCLASS_AND 0x0800
517#define SCF_DO_STCLASS_OR 0x1000
518#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 519#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 520
786e8c11 521#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
538e84ed 522#define SCF_SEEN_ACCEPT 0x8000
688e0391 523#define SCF_TRIE_DOING_RESTUDY 0x10000
4286711a
YO
524#define SCF_IN_DEFINE 0x20000
525
526
527
07be1b83 528
43fead97 529#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
530
531/* The enums for all these are ordered so things work out correctly */
a62b1201 532#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
538e84ed
KW
533#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
534 == REGEX_DEPENDS_CHARSET)
00b27cfc 535#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
538e84ed
KW
536#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
537 >= REGEX_UNICODE_CHARSET)
538#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
539 == REGEX_ASCII_RESTRICTED_CHARSET)
540#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
541 >= REGEX_ASCII_RESTRICTED_CHARSET)
542#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
543 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 544
43fead97 545#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 546
f2c2a6ab
KW
547/* For programs that want to be strictly Unicode compatible by dying if any
548 * attempt is made to match a non-Unicode code point against a Unicode
549 * property. */
550#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
551
93733859 552#define OOB_NAMEDCLASS -1
b8c5462f 553
8e661ac5
KW
554/* There is no code point that is out-of-bounds, so this is problematic. But
555 * its only current use is to initialize a variable that is always set before
556 * looked at. */
557#define OOB_UNICODE 0xDEADBEEF
558
a0ed51b3 559#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
a0ed51b3 560
8615cb43 561
b45f050a
JF
562/* length of regex to show in messages that don't mark a position within */
563#define RegexLengthToShowInErrorMessages 127
564
565/*
566 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
567 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
568 * op/pragma/warn/regcomp.
569 */
7253e4e3
RK
570#define MARKER1 "<-- HERE" /* marker as it appears in the description */
571#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 572
538e84ed 573#define REPORT_LOCATION " in regex; marked by " MARKER1 \
147e3846 574 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
b45f050a 575
285b5ca0
KW
576/* The code in this file in places uses one level of recursion with parsing
577 * rebased to an alternate string constructed by us in memory. This can take
578 * the form of something that is completely different from the input, or
579 * something that uses the input as part of the alternate. In the first case,
580 * there should be no possibility of an error, as we are in complete control of
581 * the alternate string. But in the second case we don't control the input
582 * portion, so there may be errors in that. Here's an example:
583 * /[abc\x{DF}def]/ui
584 * is handled specially because \x{df} folds to a sequence of more than one
585 * character, 'ss'. What is done is to create and parse an alternate string,
586 * which looks like this:
587 * /(?:\x{DF}|[abc\x{DF}def])/ui
588 * where it uses the input unchanged in the middle of something it constructs,
589 * which is a branch for the DF outside the character class, and clustering
590 * parens around the whole thing. (It knows enough to skip the DF inside the
591 * class while in this substitute parse.) 'abc' and 'def' may have errors that
592 * need to be reported. The general situation looks like this:
593 *
594 * sI tI xI eI
595 * Input: ----------------------------------------------------
596 * Constructed: ---------------------------------------------------
597 * sC tC xC eC EC
598 *
599 * The input string sI..eI is the input pattern. The string sC..EC is the
600 * constructed substitute parse string. The portions sC..tC and eC..EC are
601 * constructed by us. The portion tC..eC is an exact duplicate of the input
602 * pattern tI..eI. In the diagram, these are vertically aligned. Suppose that
603 * while parsing, we find an error at xC. We want to display a message showing
604 * the real input string. Thus we need to find the point xI in it which
605 * corresponds to xC. xC >= tC, since the portion of the string sC..tC has
606 * been constructed by us, and so shouldn't have errors. We get:
607 *
608 * xI = sI + (tI - sI) + (xC - tC)
609 *
610 * and, the offset into sI is:
611 *
612 * (xI - sI) = (tI - sI) + (xC - tC)
613 *
614 * When the substitute is constructed, we save (tI -sI) as RExC_precomp_adj,
615 * and we save tC as RExC_adjusted_start.
903c858a
KW
616 *
617 * During normal processing of the input pattern, everything points to that,
618 * with RExC_precomp_adj set to 0, and RExC_adjusted_start set to sI.
285b5ca0
KW
619 */
620
621#define tI_sI RExC_precomp_adj
622#define tC RExC_adjusted_start
623#define sC RExC_precomp
624#define xI_offset(xC) ((IV) (tI_sI + (xC - tC)))
625#define xI(xC) (sC + xI_offset(xC))
626#define eC RExC_precomp_end
627
628#define REPORT_LOCATION_ARGS(xC) \
629 UTF8fARG(UTF, \
630 (xI(xC) > eC) /* Don't run off end */ \
631 ? eC - sC /* Length before the <--HERE */ \
e0c80300 632 : ( __ASSERT_(xI_offset(xC) >= 0) xI_offset(xC) ), \
285b5ca0
KW
633 sC), /* The input pattern printed up to the <--HERE */ \
634 UTF8fARG(UTF, \
635 (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */ \
636 (xI(xC) > eC) ? eC : xI(xC)) /* pattern after <--HERE */
c1d900c3 637
8a6d8ec6
HS
638/* Used to point after bad bytes for an error message, but avoid skipping
639 * past a nul byte. */
640#define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
641
b45f050a
JF
642/*
643 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
644 * arg. Show regex, up to a maximum length. If it's too long, chop and add
645 * "...".
646 */
58e23c8d 647#define _FAIL(code) STMT_START { \
bfed75c6 648 const char *ellipses = ""; \
711b303b 649 IV len = RExC_precomp_end - RExC_precomp; \
ccb2c380
MP
650 \
651 if (!SIZE_ONLY) \
a5e7bc51 652 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
653 if (len > RegexLengthToShowInErrorMessages) { \
654 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
655 len = RegexLengthToShowInErrorMessages - 10; \
656 ellipses = "..."; \
657 } \
58e23c8d 658 code; \
ccb2c380 659} STMT_END
8615cb43 660
58e23c8d 661#define FAIL(msg) _FAIL( \
147e3846 662 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
c1d900c3 663 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
664
665#define FAIL2(msg,arg) _FAIL( \
147e3846 666 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
c1d900c3 667 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 668
b45f050a 669/*
b45f050a
JF
670 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
671 */
ccb2c380 672#define Simple_vFAIL(m) STMT_START { \
ccb2c380 673 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
d528642a 674 m, REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 675} STMT_END
b45f050a
JF
676
677/*
678 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
679 */
ccb2c380
MP
680#define vFAIL(m) STMT_START { \
681 if (!SIZE_ONLY) \
a5e7bc51 682 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
683 Simple_vFAIL(m); \
684} STMT_END
b45f050a
JF
685
686/*
687 * Like Simple_vFAIL(), but accepts two arguments.
688 */
ccb2c380 689#define Simple_vFAIL2(m,a1) STMT_START { \
d528642a
KW
690 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
691 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 692} STMT_END
b45f050a
JF
693
694/*
695 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
696 */
ccb2c380
MP
697#define vFAIL2(m,a1) STMT_START { \
698 if (!SIZE_ONLY) \
a5e7bc51 699 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
700 Simple_vFAIL2(m, a1); \
701} STMT_END
b45f050a
JF
702
703
704/*
705 * Like Simple_vFAIL(), but accepts three arguments.
706 */
ccb2c380 707#define Simple_vFAIL3(m, a1, a2) STMT_START { \
c1d900c3 708 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 709 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 710} STMT_END
b45f050a
JF
711
712/*
713 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
714 */
ccb2c380
MP
715#define vFAIL3(m,a1,a2) STMT_START { \
716 if (!SIZE_ONLY) \
a5e7bc51 717 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
718 Simple_vFAIL3(m, a1, a2); \
719} STMT_END
b45f050a
JF
720
721/*
722 * Like Simple_vFAIL(), but accepts four arguments.
723 */
ccb2c380 724#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
d528642a
KW
725 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
726 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 727} STMT_END
b45f050a 728
95db3ffa
KW
729#define vFAIL4(m,a1,a2,a3) STMT_START { \
730 if (!SIZE_ONLY) \
731 SAVEFREESV(RExC_rx_sv); \
732 Simple_vFAIL4(m, a1, a2, a3); \
733} STMT_END
734
946095af 735/* A specialized version of vFAIL2 that works with UTF8f */
d528642a
KW
736#define vFAIL2utf8f(m, a1) STMT_START { \
737 if (!SIZE_ONLY) \
738 SAVEFREESV(RExC_rx_sv); \
739 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
740 REPORT_LOCATION_ARGS(RExC_parse)); \
946095af
BF
741} STMT_END
742
3ba22297 743#define vFAIL3utf8f(m, a1, a2) STMT_START { \
3ba22297
KW
744 if (!SIZE_ONLY) \
745 SAVEFREESV(RExC_rx_sv); \
746 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 747 REPORT_LOCATION_ARGS(RExC_parse)); \
3ba22297
KW
748} STMT_END
749
499333dc
KW
750/* These have asserts in them because of [perl #122671] Many warnings in
751 * regcomp.c can occur twice. If they get output in pass1 and later in that
752 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
753 * would get output again. So they should be output in pass2, and these
754 * asserts make sure new warnings follow that paradigm. */
946095af 755
5e0a247b
KW
756/* m is not necessarily a "literal string", in this macro */
757#define reg_warn_non_literal_string(loc, m) STMT_START { \
d528642a
KW
758 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
759 "%s" REPORT_LOCATION, \
760 m, REPORT_LOCATION_ARGS(loc)); \
5e0a247b
KW
761} STMT_END
762
668c081a 763#define ckWARNreg(loc,m) STMT_START { \
d528642a
KW
764 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
765 m REPORT_LOCATION, \
766 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
767} STMT_END
768
b927b7e9 769#define vWARN(loc, m) STMT_START { \
d528642a
KW
770 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
771 m REPORT_LOCATION, \
772 REPORT_LOCATION_ARGS(loc)); \
b927b7e9
KW
773} STMT_END
774
0d6106aa 775#define vWARN_dep(loc, m) STMT_START { \
d528642a
KW
776 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
777 m REPORT_LOCATION, \
778 REPORT_LOCATION_ARGS(loc)); \
0d6106aa
KW
779} STMT_END
780
147508a2 781#define ckWARNdep(loc,m) STMT_START { \
d528642a
KW
782 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
783 m REPORT_LOCATION, \
784 REPORT_LOCATION_ARGS(loc)); \
147508a2
KW
785} STMT_END
786
d528642a
KW
787#define ckWARNregdep(loc,m) STMT_START { \
788 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
789 WARN_REGEXP), \
790 m REPORT_LOCATION, \
791 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
792} STMT_END
793
d528642a
KW
794#define ckWARN2reg_d(loc,m, a1) STMT_START { \
795 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
796 m REPORT_LOCATION, \
797 a1, REPORT_LOCATION_ARGS(loc)); \
2335b3d3
KW
798} STMT_END
799
d528642a
KW
800#define ckWARN2reg(loc, m, a1) STMT_START { \
801 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
802 m REPORT_LOCATION, \
803 a1, REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
804} STMT_END
805
d528642a
KW
806#define vWARN3(loc, m, a1, a2) STMT_START { \
807 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
808 m REPORT_LOCATION, \
809 a1, a2, REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
810} STMT_END
811
d528642a
KW
812#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
813 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
814 m REPORT_LOCATION, \
815 a1, a2, \
816 REPORT_LOCATION_ARGS(loc)); \
668c081a
NC
817} STMT_END
818
ccb2c380 819#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
d528642a
KW
820 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
821 m REPORT_LOCATION, \
822 a1, a2, a3, \
823 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
824} STMT_END
825
668c081a 826#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
d528642a
KW
827 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
828 m REPORT_LOCATION, \
829 a1, a2, a3, \
830 REPORT_LOCATION_ARGS(loc)); \
668c081a
NC
831} STMT_END
832
ccb2c380 833#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
d528642a
KW
834 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
835 m REPORT_LOCATION, \
836 a1, a2, a3, a4, \
837 REPORT_LOCATION_ARGS(loc)); \
ccb2c380 838} STMT_END
9d1d55b5 839
538e84ed 840/* Macros for recording node offsets. 20001227 mjd@plover.com
fac92740
MJD
841 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
842 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
843 * Element 0 holds the number n.
07be1b83 844 * Position is 1 indexed.
fac92740 845 */
7122b237
YO
846#ifndef RE_TRACK_PATTERN_OFFSETS
847#define Set_Node_Offset_To_R(node,byte)
848#define Set_Node_Offset(node,byte)
849#define Set_Cur_Node_Offset
850#define Set_Node_Length_To_R(node,len)
851#define Set_Node_Length(node,len)
6a86c6ad 852#define Set_Node_Cur_Length(node,start)
538e84ed
KW
853#define Node_Offset(n)
854#define Node_Length(n)
7122b237
YO
855#define Set_Node_Offset_Length(node,offset,len)
856#define ProgLen(ri) ri->u.proglen
857#define SetProgLen(ri,x) ri->u.proglen = x
858#else
859#define ProgLen(ri) ri->u.offsets[0]
860#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
861#define Set_Node_Offset_To_R(node,byte) STMT_START { \
862 if (! SIZE_ONLY) { \
863 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 864 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 865 if((node) < 0) { \
538e84ed
KW
866 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
867 (int)(node)); \
ccb2c380
MP
868 } else { \
869 RExC_offsets[2*(node)-1] = (byte); \
870 } \
871 } \
872} STMT_END
873
874#define Set_Node_Offset(node,byte) \
875 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
876#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
877
878#define Set_Node_Length_To_R(node,len) STMT_START { \
879 if (! SIZE_ONLY) { \
880 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 881 __LINE__, (int)(node), (int)(len))); \
ccb2c380 882 if((node) < 0) { \
538e84ed
KW
883 Perl_croak(aTHX_ "value of node is %d in Length macro", \
884 (int)(node)); \
ccb2c380
MP
885 } else { \
886 RExC_offsets[2*(node)] = (len); \
887 } \
888 } \
889} STMT_END
890
891#define Set_Node_Length(node,len) \
892 Set_Node_Length_To_R((node)-RExC_emit_start, len)
6a86c6ad
NC
893#define Set_Node_Cur_Length(node, start) \
894 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
895
896/* Get offsets and lengths */
897#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
898#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
899
07be1b83
YO
900#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
901 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
902 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
903} STMT_END
7122b237 904#endif
07be1b83
YO
905
906#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
907#define EXPERIMENTAL_INPLACESCAN
f427392e 908#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 909
cb41e5d6
YO
910#ifdef DEBUGGING
911int
6ad9a8ab 912Perl_re_printf(pTHX_ const char *fmt, ...)
cb41e5d6
YO
913{
914 va_list ap;
915 int result;
916 PerlIO *f= Perl_debug_log;
917 PERL_ARGS_ASSERT_RE_PRINTF;
918 va_start(ap, fmt);
919 result = PerlIO_vprintf(f, fmt, ap);
920 va_end(ap);
921 return result;
922}
923
924int
7b031478 925Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
cb41e5d6
YO
926{
927 va_list ap;
928 int result;
929 PerlIO *f= Perl_debug_log;
930 PERL_ARGS_ASSERT_RE_INDENTF;
931 va_start(ap, depth);
daeb874b 932 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
cb41e5d6
YO
933 result = PerlIO_vprintf(f, fmt, ap);
934 va_end(ap);
935 return result;
936}
937#endif /* DEBUGGING */
938
7b031478 939#define DEBUG_RExC_seen() \
538e84ed 940 DEBUG_OPTIMISE_MORE_r({ \
6ad9a8ab 941 Perl_re_printf( aTHX_ "RExC_seen: "); \
538e84ed 942 \
e384d5c1 943 if (RExC_seen & REG_ZERO_LEN_SEEN) \
6ad9a8ab 944 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
538e84ed 945 \
e384d5c1 946 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
6ad9a8ab 947 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
538e84ed 948 \
e384d5c1 949 if (RExC_seen & REG_GPOS_SEEN) \
6ad9a8ab 950 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
538e84ed 951 \
e384d5c1 952 if (RExC_seen & REG_RECURSE_SEEN) \
6ad9a8ab 953 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
538e84ed 954 \
7b031478 955 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
6ad9a8ab 956 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
538e84ed 957 \
e384d5c1 958 if (RExC_seen & REG_VERBARG_SEEN) \
6ad9a8ab 959 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
538e84ed 960 \
e384d5c1 961 if (RExC_seen & REG_CUTGROUP_SEEN) \
6ad9a8ab 962 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
538e84ed 963 \
e384d5c1 964 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
6ad9a8ab 965 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
538e84ed 966 \
e384d5c1 967 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
6ad9a8ab 968 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
538e84ed 969 \
7b031478 970 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
6ad9a8ab 971 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
ee273784 972 \
6ad9a8ab 973 Perl_re_printf( aTHX_ "\n"); \
9e9ecfdf
YO
974 });
975
fdfb4f21 976#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
6ad9a8ab 977 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
fdfb4f21 978
fdfb4f21 979
f5a36d78
DM
980#ifdef DEBUGGING
981static void
982S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
983 const char *close_str)
984{
985 if (!flags)
986 return;
987
988 Perl_re_printf( aTHX_ "%s", open_str);
11683ecb
DM
989 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
990 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
f5a36d78
DM
991 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
992 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
993 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
994 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
995 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
996 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
997 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
998 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
999 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1000 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1001 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1002 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1003 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1004 Perl_re_printf( aTHX_ "%s", close_str);
1005}
1006
1007
1008static void
1009S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1010 U32 depth, int is_inf)
1011{
1012 GET_RE_DEBUG_FLAGS_DECL;
1013
1014 DEBUG_OPTIMISE_MORE_r({
1015 if (!data)
1016 return;
1017 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1018 depth,
1019 where,
1020 (IV)data->pos_min,
1021 (IV)data->pos_delta,
1022 (UV)data->flags
1023 );
1024
1025 S_debug_show_study_flags(aTHX_ data->flags," [","]");
fdfb4f21 1026
f5a36d78
DM
1027 Perl_re_printf( aTHX_
1028 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1029 (IV)data->whilem_c,
1030 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1031 is_inf ? "INF " : ""
1032 );
1033
11683ecb 1034 if (data->last_found) {
37b6262f 1035 int i;
11683ecb
DM
1036 Perl_re_printf(aTHX_
1037 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1038 SvPVX_const(data->last_found),
1039 (IV)data->last_end,
1040 (IV)data->last_start_min,
1041 (IV)data->last_start_max
1042 );
1043
2099df82 1044 for (i = 0; i < 2; i++) {
37b6262f
DM
1045 Perl_re_printf(aTHX_
1046 " %s%s: '%s' @ %" IVdf "/%" IVdf,
2099df82
DM
1047 data->cur_is_floating == i ? "*" : "",
1048 i ? "Float" : "Fixed",
37b6262f
DM
1049 SvPVX_const(data->substrs[i].str),
1050 (IV)data->substrs[i].min_offset,
1051 (IV)data->substrs[i].max_offset
1052 );
1053 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1054 }
11683ecb 1055 }
f5a36d78
DM
1056
1057 Perl_re_printf( aTHX_ "\n");
1058 });
1059}
1060
1061
1062static void
1063S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1064 regnode *scan, U32 depth, U32 flags)
1065{
1066 GET_RE_DEBUG_FLAGS_DECL;
1067
1068 DEBUG_OPTIMISE_r({
1069 regnode *Next;
1070
1071 if (!scan)
1072 return;
1073 Next = regnext(scan);
1074 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1075 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1076 depth,
1077 str,
1078 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1079 Next ? (REG_NODE_NUM(Next)) : 0 );
1080 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1081 Perl_re_printf( aTHX_ "\n");
1082 });
1083}
1084
1085
1086# define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1087 S_debug_studydata(aTHX_ where, data, depth, is_inf)
1088
1089# define DEBUG_PEEP(str, scan, depth, flags) \
1090 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1091
1092#else
1093# define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1094# define DEBUG_PEEP(str, scan, depth, flags) NOOP
1095#endif
1de06328 1096
cb41e5d6 1097
c6871b76
KW
1098/* =========================================================
1099 * BEGIN edit_distance stuff.
1100 *
1101 * This calculates how many single character changes of any type are needed to
1102 * transform a string into another one. It is taken from version 3.1 of
1103 *
1104 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1105 */
1106
1107/* Our unsorted dictionary linked list. */
1108/* Note we use UVs, not chars. */
1109
1110struct dictionary{
1111 UV key;
1112 UV value;
1113 struct dictionary* next;
1114};
1115typedef struct dictionary item;
1116
1117
1118PERL_STATIC_INLINE item*
1119push(UV key,item* curr)
1120{
1121 item* head;
d09f14bf 1122 Newx(head, 1, item);
c6871b76
KW
1123 head->key = key;
1124 head->value = 0;
1125 head->next = curr;
1126 return head;
1127}
1128
1129
1130PERL_STATIC_INLINE item*
1131find(item* head, UV key)
1132{
1133 item* iterator = head;
1134 while (iterator){
1135 if (iterator->key == key){
1136 return iterator;
1137 }
1138 iterator = iterator->next;
1139 }
1140
1141 return NULL;
1142}
1143
1144PERL_STATIC_INLINE item*
1145uniquePush(item* head,UV key)
1146{
1147 item* iterator = head;
1148
1149 while (iterator){
1150 if (iterator->key == key) {
1151 return head;
1152 }
1153 iterator = iterator->next;
1154 }
1155
1156 return push(key,head);
1157}
1158
1159PERL_STATIC_INLINE void
1160dict_free(item* head)
1161{
1162 item* iterator = head;
1163
1164 while (iterator) {
1165 item* temp = iterator;
1166 iterator = iterator->next;
1167 Safefree(temp);
1168 }
1169
1170 head = NULL;
1171}
1172
1173/* End of Dictionary Stuff */
1174
1175/* All calculations/work are done here */
1176STATIC int
1177S_edit_distance(const UV* src,
1178 const UV* tgt,
1179 const STRLEN x, /* length of src[] */
1180 const STRLEN y, /* length of tgt[] */
1181 const SSize_t maxDistance
1182)
1183{
1184 item *head = NULL;
1185 UV swapCount,swapScore,targetCharCount,i,j;
1186 UV *scores;
1187 UV score_ceil = x + y;
1188
1189 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1190
1191 /* intialize matrix start values */
d09f14bf 1192 Newx(scores, ( (x + 2) * (y + 2)), UV);
c6871b76
KW
1193 scores[0] = score_ceil;
1194 scores[1 * (y + 2) + 0] = score_ceil;
1195 scores[0 * (y + 2) + 1] = score_ceil;
1196 scores[1 * (y + 2) + 1] = 0;
1197 head = uniquePush(uniquePush(head,src[0]),tgt[0]);
1198
1199 /* work loops */
1200 /* i = src index */
1201 /* j = tgt index */
1202 for (i=1;i<=x;i++) {
1203 if (i < x)
1204 head = uniquePush(head,src[i]);
1205 scores[(i+1) * (y + 2) + 1] = i;
1206 scores[(i+1) * (y + 2) + 0] = score_ceil;
1207 swapCount = 0;
1208
1209 for (j=1;j<=y;j++) {
1210 if (i == 1) {
1211 if(j < y)
1212 head = uniquePush(head,tgt[j]);
1213 scores[1 * (y + 2) + (j + 1)] = j;
1214 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1215 }
1216
1217 targetCharCount = find(head,tgt[j-1])->value;
1218 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1219
1220 if (src[i-1] != tgt[j-1]){
1221 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1222 }
1223 else {
1224 swapCount = j;
1225 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1226 }
1227 }
1228
1229 find(head,src[i-1])->value = i;
1230 }
1231
1232 {
1233 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1234 dict_free(head);
1235 Safefree(scores);
1236 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1237 }
1238}
1239
1240/* END of edit_distance() stuff
1241 * ========================================================= */
1242
8e35b056
KW
1243/* is c a control character for which we have a mnemonic? */
1244#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1245
549b4e78
KW
1246STATIC const char *
1247S_cntrl_to_mnemonic(const U8 c)
1248{
1249 /* Returns the mnemonic string that represents character 'c', if one
1250 * exists; NULL otherwise. The only ones that exist for the purposes of
1251 * this routine are a few control characters */
1252
1253 switch (c) {
1254 case '\a': return "\\a";
1255 case '\b': return "\\b";
1256 case ESC_NATIVE: return "\\e";
1257 case '\f': return "\\f";
1258 case '\n': return "\\n";
1259 case '\r': return "\\r";
1260 case '\t': return "\\t";
1261 }
1262
1263 return NULL;
1264}
1265
653099ff 1266/* Mark that we cannot extend a found fixed substring at this point.
37b6262f 1267 Update the longest found anchored substring or the longest found
653099ff
GS
1268 floating substrings if needed. */
1269
4327152a 1270STATIC void
ea3daa5d
FC
1271S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1272 SSize_t *minlenp, int is_inf)
c277df42 1273{
e1ec3a88 1274 const STRLEN l = CHR_SVLEN(data->last_found);
2099df82 1275 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
a2ca2017 1276 const STRLEN old_l = CHR_SVLEN(longest_sv);
1de06328 1277 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1278
7918f24d
NC
1279 PERL_ARGS_ASSERT_SCAN_COMMIT;
1280
c277df42 1281 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
2099df82 1282 const U8 i = data->cur_is_floating;
a2ca2017 1283 SvSetMagicSV(longest_sv, data->last_found);
37b6262f
DM
1284 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1285
2099df82
DM
1286 if (!i) /* fixed */
1287 data->substrs[0].max_offset = data->substrs[0].min_offset;
1288 else { /* float */
1289 data->substrs[1].max_offset = (l
646e8787
DM
1290 ? data->last_start_max
1291 : (data->pos_delta > SSize_t_MAX - data->pos_min
ea3daa5d
FC
1292 ? SSize_t_MAX
1293 : data->pos_min + data->pos_delta));
1294 if (is_inf
2099df82
DM
1295 || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1296 data->substrs[1].max_offset = SSize_t_MAX;
37b6262f 1297 }
11683ecb 1298
37b6262f
DM
1299 if (data->flags & SF_BEFORE_EOL)
1300 data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
1301 else
1302 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1303 data->substrs[i].minlenp = minlenp;
1304 data->substrs[i].lookbehind = 0;
c277df42 1305 }
37b6262f 1306
c277df42 1307 SvCUR_set(data->last_found, 0);
0eda9292 1308 {
a28509cc 1309 SV * const sv = data->last_found;
097eb12c
AL
1310 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1311 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1312 if (mg)
1313 mg->mg_len = 0;
1314 }
0eda9292 1315 }
c277df42
IZ
1316 data->last_end = -1;
1317 data->flags &= ~SF_BEFORE_EOL;
f5a36d78 1318 DEBUG_STUDYDATA("commit", data, 0, is_inf);
c277df42
IZ
1319}
1320
cdd87c1d
KW
1321/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1322 * list that describes which code points it matches */
1323
653099ff 1324STATIC void
3420edd7 1325S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 1326{
cdd87c1d
KW
1327 /* Set the SSC 'ssc' to match an empty string or any code point */
1328
557bd3fb 1329 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 1330
71068078 1331 assert(is_ANYOF_SYNTHETIC(ssc));
3fffb88a 1332
0854ea0b
KW
1333 /* mortalize so won't leak */
1334 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
93e92956 1335 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
653099ff
GS
1336}
1337
653099ff 1338STATIC int
dc3bf405 1339S_ssc_is_anything(const regnode_ssc *ssc)
653099ff 1340{
c144baaa
KW
1341 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1342 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1343 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1344 * in any way, so there's no point in using it */
cdd87c1d
KW
1345
1346 UV start, end;
1347 bool ret;
653099ff 1348
557bd3fb 1349 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 1350
71068078 1351 assert(is_ANYOF_SYNTHETIC(ssc));
cdd87c1d 1352
93e92956 1353 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
cdd87c1d
KW
1354 return FALSE;
1355 }
1356
1357 /* See if the list consists solely of the range 0 - Infinity */
1358 invlist_iterinit(ssc->invlist);
1359 ret = invlist_iternext(ssc->invlist, &start, &end)
1360 && start == 0
1361 && end == UV_MAX;
1362
1363 invlist_iterfinish(ssc->invlist);
1364
1365 if (ret) {
1366 return TRUE;
1367 }
1368
1369 /* If e.g., both \w and \W are set, matches everything */
e0e1be5f 1370 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1371 int i;
1372 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1373 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1374 return TRUE;
1375 }
1376 }
1377 }
1378
1379 return FALSE;
653099ff
GS
1380}
1381
653099ff 1382STATIC void
cdd87c1d 1383S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 1384{
cdd87c1d
KW
1385 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1386 * string, any code point, or any posix class under locale */
1387
557bd3fb 1388 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 1389
557bd3fb 1390 Zero(ssc, 1, regnode_ssc);
71068078 1391 set_ANYOF_SYNTHETIC(ssc);
93e92956 1392 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
3420edd7 1393 ssc_anything(ssc);
cdd87c1d 1394
2f306ab9
KW
1395 /* If any portion of the regex is to operate under locale rules that aren't
1396 * fully known at compile time, initialization includes it. The reason
1397 * this isn't done for all regexes is that the optimizer was written under
1398 * the assumption that locale was all-or-nothing. Given the complexity and
1399 * lack of documentation in the optimizer, and that there are inadequate
1400 * test cases for locale, many parts of it may not work properly, it is
1401 * safest to avoid locale unless necessary. */
cdd87c1d
KW
1402 if (RExC_contains_locale) {
1403 ANYOF_POSIXL_SETALL(ssc);
cdd87c1d
KW
1404 }
1405 else {
1406 ANYOF_POSIXL_ZERO(ssc);
1407 }
653099ff
GS
1408}
1409
b423522f 1410STATIC int
dc3bf405
BF
1411S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1412 const regnode_ssc *ssc)
b423522f
KW
1413{
1414 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1415 * to the list of code points matched, and locale posix classes; hence does
1416 * not check its flags) */
1417
1418 UV start, end;
1419 bool ret;
1420
1421 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1422
71068078 1423 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1424
1425 invlist_iterinit(ssc->invlist);
1426 ret = invlist_iternext(ssc->invlist, &start, &end)
1427 && start == 0
1428 && end == UV_MAX;
1429
1430 invlist_iterfinish(ssc->invlist);
1431
1432 if (! ret) {
1433 return FALSE;
1434 }
1435
e0e1be5f 1436 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
31f05a37 1437 return FALSE;
b423522f
KW
1438 }
1439
1440 return TRUE;
1441}
1442
1443STATIC SV*
1444S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
5c0f85ef 1445 const regnode_charclass* const node)
b423522f
KW
1446{
1447 /* Returns a mortal inversion list defining which code points are matched
1448 * by 'node', which is of type ANYOF. Handles complementing the result if
1449 * appropriate. If some code points aren't knowable at this time, the
31f05a37
KW
1450 * returned list must, and will, contain every code point that is a
1451 * possibility. */
b423522f 1452
e2506fa7 1453 SV* invlist = NULL;
1ee208c4 1454 SV* only_utf8_locale_invlist = NULL;
b423522f
KW
1455 unsigned int i;
1456 const U32 n = ARG(node);
31f05a37 1457 bool new_node_has_latin1 = FALSE;
b423522f
KW
1458
1459 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1460
1461 /* Look at the data structure created by S_set_ANYOF_arg() */
93e92956 1462 if (n != ANYOF_ONLY_HAS_BITMAP) {
b423522f
KW
1463 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1464 AV * const av = MUTABLE_AV(SvRV(rv));
1465 SV **const ary = AvARRAY(av);
1466 assert(RExC_rxi->data->what[n] == 's');
1467
1468 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1469 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1470 }
1471 else if (ary[0] && ary[0] != &PL_sv_undef) {
1472
1473 /* Here, no compile-time swash, and there are things that won't be
1474 * known until runtime -- we have to assume it could be anything */
e2506fa7 1475 invlist = sv_2mortal(_new_invlist(1));
b423522f
KW
1476 return _add_range_to_invlist(invlist, 0, UV_MAX);
1477 }
1ee208c4 1478 else if (ary[3] && ary[3] != &PL_sv_undef) {
b423522f
KW
1479
1480 /* Here no compile-time swash, and no run-time only data. Use the
1481 * node's inversion list */
1ee208c4
KW
1482 invlist = sv_2mortal(invlist_clone(ary[3]));
1483 }
1484
1485 /* Get the code points valid only under UTF-8 locales */
037715a6 1486 if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
1ee208c4
KW
1487 && ary[2] && ary[2] != &PL_sv_undef)
1488 {
1489 only_utf8_locale_invlist = ary[2];
b423522f
KW
1490 }
1491 }
1492
e2506fa7
KW
1493 if (! invlist) {
1494 invlist = sv_2mortal(_new_invlist(0));
1495 }
1496
dcb20b36
KW
1497 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1498 * code points, and an inversion list for the others, but if there are code
1499 * points that should match only conditionally on the target string being
1500 * UTF-8, those are placed in the inversion list, and not the bitmap.
1501 * Since there are circumstances under which they could match, they are
1502 * included in the SSC. But if the ANYOF node is to be inverted, we have
1503 * to exclude them here, so that when we invert below, the end result
1504 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1505 * have to do this here before we add the unconditionally matched code
1506 * points */
b423522f
KW
1507 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1508 _invlist_intersection_complement_2nd(invlist,
1509 PL_UpperLatin1,
1510 &invlist);
1511 }
1512
1513 /* Add in the points from the bit map */
dcb20b36 1514 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
b423522f 1515 if (ANYOF_BITMAP_TEST(node, i)) {
6f8848d5
TC
1516 unsigned int start = i++;
1517
1518 for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
1519 /* empty */
1520 }
1521 invlist = _add_range_to_invlist(invlist, start, i-1);
31f05a37 1522 new_node_has_latin1 = TRUE;
b423522f
KW
1523 }
1524 }
1525
1526 /* If this can match all upper Latin1 code points, have to add them
ac33c516
KW
1527 * as well. But don't add them if inverting, as when that gets done below,
1528 * it would exclude all these characters, including the ones it shouldn't
1529 * that were added just above */
1530 if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
f240c685
KW
1531 && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1532 {
b423522f
KW
1533 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1534 }
1535
1536 /* Similarly for these */
93e92956 1537 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
e0a1ff7a 1538 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
b423522f
KW
1539 }
1540
1541 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1542 _invlist_invert(invlist);
1543 }
037715a6 1544 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
31f05a37
KW
1545
1546 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1547 * locale. We can skip this if there are no 0-255 at all. */
1548 _invlist_union(invlist, PL_Latin1, &invlist);
1549 }
1550
1ee208c4
KW
1551 /* Similarly add the UTF-8 locale possible matches. These have to be
1552 * deferred until after the non-UTF-8 locale ones are taken care of just
1553 * above, or it leads to wrong results under ANYOF_INVERT */
1554 if (only_utf8_locale_invlist) {
31f05a37 1555 _invlist_union_maybe_complement_2nd(invlist,
1ee208c4 1556 only_utf8_locale_invlist,
31f05a37
KW
1557 ANYOF_FLAGS(node) & ANYOF_INVERT,
1558 &invlist);
1559 }
b423522f
KW
1560
1561 return invlist;
1562}
1563
1051e1c4 1564/* These two functions currently do the exact same thing */
557bd3fb 1565#define ssc_init_zero ssc_init
653099ff 1566
cdd87c1d
KW
1567#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1568#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1569
557bd3fb 1570/* 'AND' a given class with another one. Can create false positives. 'ssc'
93e92956
KW
1571 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1572 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1573
653099ff 1574STATIC void
b423522f 1575S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1576 const regnode_charclass *and_with)
653099ff 1577{
cdd87c1d
KW
1578 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1579 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1580
a0dd4231
KW
1581 SV* anded_cp_list;
1582 U8 anded_flags;
1e6ade67 1583
cdd87c1d 1584 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1585
71068078 1586 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1587
1588 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1589 * the code point inversion list and just the relevant flags */
71068078 1590 if (is_ANYOF_SYNTHETIC(and_with)) {
7dcac5f6 1591 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
a0dd4231 1592 anded_flags = ANYOF_FLAGS(and_with);
e9b08962
KW
1593
1594 /* XXX This is a kludge around what appears to be deficiencies in the
1595 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1596 * there are paths through the optimizer where it doesn't get weeded
1597 * out when it should. And if we don't make some extra provision for
1598 * it like the code just below, it doesn't get added when it should.
1599 * This solution is to add it only when AND'ing, which is here, and
1600 * only when what is being AND'ed is the pristine, original node
1601 * matching anything. Thus it is like adding it to ssc_anything() but
1602 * only when the result is to be AND'ed. Probably the same solution
1603 * could be adopted for the same problem we have with /l matching,
1604 * which is solved differently in S_ssc_init(), and that would lead to
1605 * fewer false positives than that solution has. But if this solution
1606 * creates bugs, the consequences are only that a warning isn't raised
1607 * that should be; while the consequences for having /l bugs is
1608 * incorrect matches */
7dcac5f6 1609 if (ssc_is_anything((regnode_ssc *)and_with)) {
f240c685 1610 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
e9b08962 1611 }
a0dd4231
KW
1612 }
1613 else {
5c0f85ef 1614 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
f240c685
KW
1615 if (OP(and_with) == ANYOFD) {
1616 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1617 }
1618 else {
1619 anded_flags = ANYOF_FLAGS(and_with)
1620 &( ANYOF_COMMON_FLAGS
108316fb
KW
1621 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1622 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
d1c40ef5
KW
1623 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
1624 anded_flags &=
1625 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1626 }
f240c685 1627 }
a0dd4231
KW
1628 }
1629
1630 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1631
1632 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1633 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1634 * 'and_with' may be inverted. When not inverted, we have the situation of
1635 * computing:
1636 * (C1 | P1) & (C2 | P2)
1637 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1638 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1639 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1640 * <= ((C1 & C2) | P1 | P2)
1641 * Alternatively, the last few steps could be:
1642 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1643 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1644 * <= (C1 | C2 | (P1 & P2))
1645 * We favor the second approach if either P1 or P2 is non-empty. This is
1646 * because these components are a barrier to doing optimizations, as what
1647 * they match cannot be known until the moment of matching as they are
1648 * dependent on the current locale, 'AND"ing them likely will reduce or
1649 * eliminate them.
1650 * But we can do better if we know that C1,P1 are in their initial state (a
1651 * frequent occurrence), each matching everything:
1652 * (<everything>) & (C2 | P2) = C2 | P2
1653 * Similarly, if C2,P2 are in their initial state (again a frequent
1654 * occurrence), the result is a no-op
1655 * (C1 | P1) & (<everything>) = C1 | P1
1656 *
1657 * Inverted, we have
1658 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1659 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1660 * <= (C1 & ~C2) | (P1 & ~P2)
1661 * */
1aa99e6b 1662
a0dd4231 1663 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
71068078 1664 && ! is_ANYOF_SYNTHETIC(and_with))
a0dd4231 1665 {
cdd87c1d 1666 unsigned int i;
8951c461 1667
cdd87c1d
KW
1668 ssc_intersection(ssc,
1669 anded_cp_list,
1670 FALSE /* Has already been inverted */
1671 );
c6b76537 1672
cdd87c1d
KW
1673 /* If either P1 or P2 is empty, the intersection will be also; can skip
1674 * the loop */
93e92956 1675 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
cdd87c1d
KW
1676 ANYOF_POSIXL_ZERO(ssc);
1677 }
e0e1be5f 1678 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1679
1680 /* Note that the Posix class component P from 'and_with' actually
1681 * looks like:
1682 * P = Pa | Pb | ... | Pn
1683 * where each component is one posix class, such as in [\w\s].
1684 * Thus
1685 * ~P = ~(Pa | Pb | ... | Pn)
1686 * = ~Pa & ~Pb & ... & ~Pn
1687 * <= ~Pa | ~Pb | ... | ~Pn
1688 * The last is something we can easily calculate, but unfortunately
1689 * is likely to have many false positives. We could do better
1690 * in some (but certainly not all) instances if two classes in
1691 * P have known relationships. For example
1692 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1693 * So
1694 * :lower: & :print: = :lower:
1695 * And similarly for classes that must be disjoint. For example,
1696 * since \s and \w can have no elements in common based on rules in
1697 * the POSIX standard,
1698 * \w & ^\S = nothing
1699 * Unfortunately, some vendor locales do not meet the Posix
1700 * standard, in particular almost everything by Microsoft.
1701 * The loop below just changes e.g., \w into \W and vice versa */
1702
1ee208c4 1703 regnode_charclass_posixl temp;
cdd87c1d
KW
1704 int add = 1; /* To calculate the index of the complement */
1705
b1234259 1706 Zero(&temp, 1, regnode_charclass_posixl);
cdd87c1d
KW
1707 ANYOF_POSIXL_ZERO(&temp);
1708 for (i = 0; i < ANYOF_MAX; i++) {
1709 assert(i % 2 != 0
7dcac5f6
KW
1710 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1711 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
cdd87c1d 1712
7dcac5f6 1713 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
cdd87c1d
KW
1714 ANYOF_POSIXL_SET(&temp, i + add);
1715 }
1716 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1717 }
1718 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1719
cdd87c1d
KW
1720 } /* else ssc already has no posixes */
1721 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1722 in its initial state */
71068078 1723 else if (! is_ANYOF_SYNTHETIC(and_with)
7dcac5f6 1724 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
cdd87c1d
KW
1725 {
1726 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1727 * copy it over 'ssc' */
1728 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
71068078 1729 if (is_ANYOF_SYNTHETIC(and_with)) {
cdd87c1d
KW
1730 StructCopy(and_with, ssc, regnode_ssc);
1731 }
1732 else {
1733 ssc->invlist = anded_cp_list;
1734 ANYOF_POSIXL_ZERO(ssc);
93e92956 1735 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1736 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
cdd87c1d
KW
1737 }
1738 }
1739 }
e0e1be5f 1740 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
93e92956 1741 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
cdd87c1d
KW
1742 {
1743 /* One or the other of P1, P2 is non-empty. */
93e92956 1744 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1ea8b7fe
KW
1745 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1746 }
cdd87c1d
KW
1747 ssc_union(ssc, anded_cp_list, FALSE);
1748 }
1749 else { /* P1 = P2 = empty */
1750 ssc_intersection(ssc, anded_cp_list, FALSE);
1751 }
137165a6 1752 }
653099ff
GS
1753}
1754
653099ff 1755STATIC void
cdd87c1d 1756S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1757 const regnode_charclass *or_with)
653099ff 1758{
cdd87c1d
KW
1759 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1760 * another SSC or a regular ANYOF class. Can create false positives if
1761 * 'or_with' is to be inverted. */
7918f24d 1762
a0dd4231
KW
1763 SV* ored_cp_list;
1764 U8 ored_flags;
c6b76537 1765
cdd87c1d 1766 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 1767
71068078 1768 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1769
1770 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1771 * the code point inversion list and just the relevant flags */
71068078 1772 if (is_ANYOF_SYNTHETIC(or_with)) {
7dcac5f6 1773 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
a0dd4231
KW
1774 ored_flags = ANYOF_FLAGS(or_with);
1775 }
1776 else {
5c0f85ef 1777 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
eff8b7dc 1778 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
f240c685
KW
1779 if (OP(or_with) != ANYOFD) {
1780 ored_flags
1781 |= ANYOF_FLAGS(or_with)
108316fb
KW
1782 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1783 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
d1c40ef5
KW
1784 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
1785 ored_flags |=
1786 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1787 }
f240c685 1788 }
a0dd4231
KW
1789 }
1790
1791 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
1792
1793 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1794 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1795 * 'or_with' may be inverted. When not inverted, we have the simple
1796 * situation of computing:
1797 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1798 * If P1|P2 yields a situation with both a class and its complement are
1799 * set, like having both \w and \W, this matches all code points, and we
1800 * can delete these from the P component of the ssc going forward. XXX We
1801 * might be able to delete all the P components, but I (khw) am not certain
1802 * about this, and it is better to be safe.
1803 *
1804 * Inverted, we have
1805 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1806 * <= (C1 | P1) | ~C2
1807 * <= (C1 | ~C2) | P1
1808 * (which results in actually simpler code than the non-inverted case)
1809 * */
9826f543 1810
a0dd4231 1811 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
71068078 1812 && ! is_ANYOF_SYNTHETIC(or_with))
a0dd4231 1813 {
cdd87c1d 1814 /* We ignore P2, leaving P1 going forward */
1ea8b7fe 1815 } /* else Not inverted */
93e92956 1816 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1817 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
e0e1be5f 1818 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1819 unsigned int i;
1820 for (i = 0; i < ANYOF_MAX; i += 2) {
1821 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1822 {
1823 ssc_match_all_cp(ssc);
1824 ANYOF_POSIXL_CLEAR(ssc, i);
1825 ANYOF_POSIXL_CLEAR(ssc, i+1);
cdd87c1d
KW
1826 }
1827 }
1828 }
1aa99e6b 1829 }
cdd87c1d
KW
1830
1831 ssc_union(ssc,
1832 ored_cp_list,
1833 FALSE /* Already has been inverted */
1834 );
653099ff
GS
1835}
1836
b423522f
KW
1837PERL_STATIC_INLINE void
1838S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1839{
1840 PERL_ARGS_ASSERT_SSC_UNION;
1841
71068078 1842 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1843
1844 _invlist_union_maybe_complement_2nd(ssc->invlist,
1845 invlist,
1846 invert2nd,
1847 &ssc->invlist);
1848}
1849
1850PERL_STATIC_INLINE void
1851S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1852 SV* const invlist,
1853 const bool invert2nd)
1854{
1855 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1856
71068078 1857 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1858
1859 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1860 invlist,
1861 invert2nd,
1862 &ssc->invlist);
1863}
1864
1865PERL_STATIC_INLINE void
1866S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1867{
1868 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1869
71068078 1870 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1871
1872 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1873}
1874
1875PERL_STATIC_INLINE void
1876S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1877{
1878 /* AND just the single code point 'cp' into the SSC 'ssc' */
1879
1880 SV* cp_list = _new_invlist(2);
1881
1882 PERL_ARGS_ASSERT_SSC_CP_AND;
1883
71068078 1884 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1885
1886 cp_list = add_cp_to_invlist(cp_list, cp);
1887 ssc_intersection(ssc, cp_list,
1888 FALSE /* Not inverted */
1889 );
1890 SvREFCNT_dec_NN(cp_list);
1891}
1892
1893PERL_STATIC_INLINE void
dc3bf405 1894S_ssc_clear_locale(regnode_ssc *ssc)
b423522f
KW
1895{
1896 /* Set the SSC 'ssc' to not match any locale things */
b423522f
KW
1897 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1898
71068078 1899 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1900
1901 ANYOF_POSIXL_ZERO(ssc);
1902 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1903}
1904
b35552de
KW
1905#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1906
1907STATIC bool
1908S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1909{
1910 /* The synthetic start class is used to hopefully quickly winnow down
1911 * places where a pattern could start a match in the target string. If it
1912 * doesn't really narrow things down that much, there isn't much point to
1913 * having the overhead of using it. This function uses some very crude
1914 * heuristics to decide if to use the ssc or not.
1915 *
1916 * It returns TRUE if 'ssc' rules out more than half what it considers to
1917 * be the "likely" possible matches, but of course it doesn't know what the
1918 * actual things being matched are going to be; these are only guesses
1919 *
1920 * For /l matches, it assumes that the only likely matches are going to be
1921 * in the 0-255 range, uniformly distributed, so half of that is 127
1922 * For /a and /d matches, it assumes that the likely matches will be just
1923 * the ASCII range, so half of that is 63
1924 * For /u and there isn't anything matching above the Latin1 range, it
1925 * assumes that that is the only range likely to be matched, and uses
1926 * half that as the cut-off: 127. If anything matches above Latin1,
1927 * it assumes that all of Unicode could match (uniformly), except for
1928 * non-Unicode code points and things in the General Category "Other"
1929 * (unassigned, private use, surrogates, controls and formats). This
1930 * is a much large number. */
1931
b35552de
KW
1932 U32 count = 0; /* Running total of number of code points matched by
1933 'ssc' */
1934 UV start, end; /* Start and end points of current range in inversion
1935 list */
72400949
KW
1936 const U32 max_code_points = (LOC)
1937 ? 256
1938 : (( ! UNI_SEMANTICS
1939 || invlist_highest(ssc->invlist) < 256)
1940 ? 128
1941 : NON_OTHER_COUNT);
1942 const U32 max_match = max_code_points / 2;
b35552de
KW
1943
1944 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1945
1946 invlist_iterinit(ssc->invlist);
1947 while (invlist_iternext(ssc->invlist, &start, &end)) {
72400949
KW
1948 if (start >= max_code_points) {
1949 break;
b35552de 1950 }
72400949 1951 end = MIN(end, max_code_points - 1);
b35552de 1952 count += end - start + 1;
72400949 1953 if (count >= max_match) {
b35552de
KW
1954 invlist_iterfinish(ssc->invlist);
1955 return FALSE;
1956 }
1957 }
1958
1959 return TRUE;
1960}
1961
1962
b423522f
KW
1963STATIC void
1964S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1965{
1966 /* The inversion list in the SSC is marked mortal; now we need a more
1967 * permanent copy, which is stored the same way that is done in a regular
dcb20b36
KW
1968 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1969 * map */
b423522f
KW
1970
1971 SV* invlist = invlist_clone(ssc->invlist);
1972
1973 PERL_ARGS_ASSERT_SSC_FINALIZE;
1974
71068078 1975 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f 1976
a0dd4231 1977 /* The code in this file assumes that all but these flags aren't relevant
93e92956
KW
1978 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1979 * by the time we reach here */
f240c685
KW
1980 assert(! (ANYOF_FLAGS(ssc)
1981 & ~( ANYOF_COMMON_FLAGS
108316fb
KW
1982 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1983 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
a0dd4231 1984
b423522f
KW
1985 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1986
1ee208c4
KW
1987 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1988 NULL, NULL, NULL, FALSE);
b423522f 1989
85c8e306
KW
1990 /* Make sure is clone-safe */
1991 ssc->invlist = NULL;
1992
e0e1be5f 1993 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
93e92956 1994 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
e0e1be5f 1995 }
1462525b 1996
b2e90ddf
KW
1997 if (RExC_contains_locale) {
1998 OP(ssc) = ANYOFL;
1999 }
2000
1462525b 2001 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
b423522f
KW
2002}
2003
a3621e74
YO
2004#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2005#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2006#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
538e84ed
KW
2007#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2008 ? (TRIE_LIST_CUR( idx ) - 1) \
2009 : 0 )
a3621e74 2010
3dab1dad
YO
2011
2012#ifdef DEBUGGING
07be1b83 2013/*
2b8b4781
NC
2014 dump_trie(trie,widecharmap,revcharmap)
2015 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2016 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
2017
2018 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
2019 The _interim_ variants are used for debugging the interim
2020 tables that are used to generate the final compressed
2021 representation which is what dump_trie expects.
2022
486ec47a 2023 Part of the reason for their existence is to provide a form
3dab1dad 2024 of documentation as to how the different representations function.
07be1b83
YO
2025
2026*/
3dab1dad
YO
2027
2028/*
3dab1dad
YO
2029 Dumps the final compressed table form of the trie to Perl_debug_log.
2030 Used for debugging make_trie().
2031*/
b9a59e08 2032
3dab1dad 2033STATIC void
2b8b4781
NC
2034S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2035 AV *revcharmap, U32 depth)
3dab1dad
YO
2036{
2037 U32 state;
ab3bbdeb 2038 SV *sv=sv_newmortal();
55eed653 2039 int colwidth= widecharmap ? 6 : 4;
2e64971a 2040 U16 word;
3dab1dad
YO
2041 GET_RE_DEBUG_FLAGS_DECL;
2042
7918f24d 2043 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 2044
6ad9a8ab 2045 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
1e37780e 2046 depth+1, "Match","Base","Ofs" );
3dab1dad
YO
2047
2048 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 2049 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 2050 if ( tmp ) {
6ad9a8ab 2051 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2052 colwidth,
538e84ed 2053 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2054 PL_colors[0], PL_colors[1],
2055 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2056 PERL_PV_ESCAPE_FIRSTCHAR
2057 )
ab3bbdeb 2058 );
3dab1dad
YO
2059 }
2060 }
1e37780e
YO
2061 Perl_re_printf( aTHX_ "\n");
2062 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
3dab1dad
YO
2063
2064 for( state = 0 ; state < trie->uniquecharcount ; state++ )
6ad9a8ab
YO
2065 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2066 Perl_re_printf( aTHX_ "\n");
3dab1dad 2067
1e2e3d02 2068 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 2069 const U32 base = trie->states[ state ].trans.base;
3dab1dad 2070
147e3846 2071 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
3dab1dad
YO
2072
2073 if ( trie->states[ state ].wordnum ) {
1e37780e 2074 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
3dab1dad 2075 } else {
6ad9a8ab 2076 Perl_re_printf( aTHX_ "%6s", "" );
3dab1dad
YO
2077 }
2078
147e3846 2079 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
3dab1dad
YO
2080
2081 if ( base ) {
2082 U32 ofs = 0;
2083
2084 while( ( base + ofs < trie->uniquecharcount ) ||
2085 ( base + ofs - trie->uniquecharcount < trie->lasttrans
538e84ed
KW
2086 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2087 != state))
3dab1dad
YO
2088 ofs++;
2089
147e3846 2090 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
3dab1dad
YO
2091
2092 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
538e84ed
KW
2093 if ( ( base + ofs >= trie->uniquecharcount )
2094 && ( base + ofs - trie->uniquecharcount
2095 < trie->lasttrans )
2096 && trie->trans[ base + ofs
2097 - trie->uniquecharcount ].check == state )
3dab1dad 2098 {
147e3846 2099 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
1e37780e
YO
2100 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2101 );
3dab1dad 2102 } else {
6ad9a8ab 2103 Perl_re_printf( aTHX_ "%*s",colwidth," ." );
3dab1dad
YO
2104 }
2105 }
2106
6ad9a8ab 2107 Perl_re_printf( aTHX_ "]");
3dab1dad
YO
2108
2109 }
6ad9a8ab 2110 Perl_re_printf( aTHX_ "\n" );
3dab1dad 2111 }
6ad9a8ab 2112 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
cb41e5d6 2113 depth);
2e64971a 2114 for (word=1; word <= trie->wordcount; word++) {
6ad9a8ab 2115 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2e64971a
DM
2116 (int)word, (int)(trie->wordinfo[word].prev),
2117 (int)(trie->wordinfo[word].len));
2118 }
6ad9a8ab 2119 Perl_re_printf( aTHX_ "\n" );
538e84ed 2120}
3dab1dad 2121/*
3dab1dad 2122 Dumps a fully constructed but uncompressed trie in list form.
538e84ed 2123 List tries normally only are used for construction when the number of
3dab1dad
YO
2124 possible chars (trie->uniquecharcount) is very high.
2125 Used for debugging make_trie().
2126*/
2127STATIC void
55eed653 2128S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2129 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2130 U32 depth)
3dab1dad
YO
2131{
2132 U32 state;
ab3bbdeb 2133 SV *sv=sv_newmortal();
55eed653 2134 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2135 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2136
2137 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2138
3dab1dad 2139 /* print out the table precompression. */
6ad9a8ab 2140 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
cb41e5d6 2141 depth+1 );
6ad9a8ab 2142 Perl_re_indentf( aTHX_ "%s",
cb41e5d6 2143 depth+1, "------:-----+-----------------\n" );
538e84ed 2144
3dab1dad
YO
2145 for( state=1 ; state < next_alloc ; state ++ ) {
2146 U16 charid;
538e84ed 2147
147e3846 2148 Perl_re_indentf( aTHX_ " %4" UVXf " :",
cb41e5d6 2149 depth+1, (UV)state );
3dab1dad 2150 if ( ! trie->states[ state ].wordnum ) {
6ad9a8ab 2151 Perl_re_printf( aTHX_ "%5s| ","");
3dab1dad 2152 } else {
6ad9a8ab 2153 Perl_re_printf( aTHX_ "W%4x| ",
3dab1dad
YO
2154 trie->states[ state ].wordnum
2155 );
2156 }
2157 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
538e84ed
KW
2158 SV ** const tmp = av_fetch( revcharmap,
2159 TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb 2160 if ( tmp ) {
147e3846 2161 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
ab3bbdeb 2162 colwidth,
538e84ed
KW
2163 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2164 colwidth,
2165 PL_colors[0], PL_colors[1],
2166 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2167 | PERL_PV_ESCAPE_FIRSTCHAR
ab3bbdeb 2168 ) ,
1e2e3d02
YO
2169 TRIE_LIST_ITEM(state,charid).forid,
2170 (UV)TRIE_LIST_ITEM(state,charid).newstate
2171 );
538e84ed 2172 if (!(charid % 10))
6ad9a8ab 2173 Perl_re_printf( aTHX_ "\n%*s| ",
664e119d 2174 (int)((depth * 2) + 14), "");
1e2e3d02 2175 }
ab3bbdeb 2176 }
6ad9a8ab 2177 Perl_re_printf( aTHX_ "\n");
3dab1dad 2178 }
538e84ed 2179}
3dab1dad
YO
2180
2181/*
3dab1dad 2182 Dumps a fully constructed but uncompressed trie in table form.
538e84ed
KW
2183 This is the normal DFA style state transition table, with a few
2184 twists to facilitate compression later.
3dab1dad
YO
2185 Used for debugging make_trie().
2186*/
2187STATIC void
55eed653 2188S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2189 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2190 U32 depth)
3dab1dad
YO
2191{
2192 U32 state;
2193 U16 charid;
ab3bbdeb 2194 SV *sv=sv_newmortal();
55eed653 2195 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2196 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2197
2198 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
538e84ed 2199
3dab1dad
YO
2200 /*
2201 print out the table precompression so that we can do a visual check
2202 that they are identical.
2203 */
538e84ed 2204
6ad9a8ab 2205 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
3dab1dad
YO
2206
2207 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 2208 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 2209 if ( tmp ) {
6ad9a8ab 2210 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2211 colwidth,
538e84ed 2212 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2213 PL_colors[0], PL_colors[1],
2214 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2215 PERL_PV_ESCAPE_FIRSTCHAR
2216 )
ab3bbdeb 2217 );
3dab1dad
YO
2218 }
2219 }
2220
4aaafc03
YO
2221 Perl_re_printf( aTHX_ "\n");
2222 Perl_re_indentf( aTHX_ "State+-", depth+1 );
3dab1dad
YO
2223
2224 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
6ad9a8ab 2225 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
3dab1dad
YO
2226 }
2227
6ad9a8ab 2228 Perl_re_printf( aTHX_ "\n" );
3dab1dad
YO
2229
2230 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2231
147e3846 2232 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
cb41e5d6 2233 depth+1,
3dab1dad
YO
2234 (UV)TRIE_NODENUM( state ) );
2235
2236 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
2237 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2238 if (v)
147e3846 2239 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
ab3bbdeb 2240 else
6ad9a8ab 2241 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
3dab1dad
YO
2242 }
2243 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
147e3846 2244 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
538e84ed 2245 (UV)trie->trans[ state ].check );
3dab1dad 2246 } else {
147e3846 2247 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
538e84ed 2248 (UV)trie->trans[ state ].check,
3dab1dad
YO
2249 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2250 }
2251 }
07be1b83 2252}
3dab1dad
YO
2253
2254#endif
2255
2e64971a 2256
786e8c11
YO
2257/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2258 startbranch: the first branch in the whole branch sequence
2259 first : start branch of sequence of branch-exact nodes.
2260 May be the same as startbranch
2261 last : Thing following the last branch.
2262 May be the same as tail.
2263 tail : item following the branch sequence
2264 count : words in the sequence
a4525e78 2265 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
786e8c11 2266 depth : indent depth
3dab1dad 2267
786e8c11 2268Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 2269
786e8c11
YO
2270A trie is an N'ary tree where the branches are determined by digital
2271decomposition of the key. IE, at the root node you look up the 1st character and
2272follow that branch repeat until you find the end of the branches. Nodes can be
2273marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 2274
786e8c11 2275 /he|she|his|hers/
72f13be8 2276
786e8c11
YO
2277would convert into the following structure. Numbers represent states, letters
2278following numbers represent valid transitions on the letter from that state, if
2279the number is in square brackets it represents an accepting state, otherwise it
2280will be in parenthesis.
07be1b83 2281
786e8c11
YO
2282 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2283 | |
2284 | (2)
2285 | |
2286 (1) +-i->(6)-+-s->[7]
2287 |
2288 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 2289
786e8c11
YO
2290 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2291
2292This shows that when matching against the string 'hers' we will begin at state 1
2293read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2294then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2295is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2296single traverse. We store a mapping from accepting to state to which word was
2297matched, and then when we have multiple possibilities we try to complete the
b8fda935 2298rest of the regex in the order in which they occurred in the alternation.
786e8c11
YO
2299
2300The only prior NFA like behaviour that would be changed by the TRIE support is
2301the silent ignoring of duplicate alternations which are of the form:
2302
2303 / (DUPE|DUPE) X? (?{ ... }) Y /x
2304
4b714af6 2305Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 2306and without the optimisation. With the optimisations dupes will be silently
486ec47a 2307ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
2308the following demonstrates:
2309
2310 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2311
2312which prints out 'word' three times, but
2313
2314 'words'=~/(word|word|word)(?{ print $1 })S/
2315
2316which doesnt print it out at all. This is due to other optimisations kicking in.
2317
2318Example of what happens on a structural level:
2319
486ec47a 2320The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
2321
2322 1: CURLYM[1] {1,32767}(18)
2323 5: BRANCH(8)
2324 6: EXACT <ac>(16)
2325 8: BRANCH(11)
2326 9: EXACT <ad>(16)
2327 11: BRANCH(14)
2328 12: EXACT <ab>(16)
2329 16: SUCCEED(0)
2330 17: NOTHING(18)
2331 18: END(0)
2332
2333This would be optimizable with startbranch=5, first=5, last=16, tail=16
2334and should turn into:
2335
2336 1: CURLYM[1] {1,32767}(18)
2337 5: TRIE(16)
2338 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2339 <ac>
2340 <ad>
2341 <ab>
2342 16: SUCCEED(0)
2343 17: NOTHING(18)
2344 18: END(0)
2345
2346Cases where tail != last would be like /(?foo|bar)baz/:
2347
2348 1: BRANCH(4)
2349 2: EXACT <foo>(8)
2350 4: BRANCH(7)
2351 5: EXACT <bar>(8)
2352 7: TAIL(8)
2353 8: EXACT <baz>(10)
2354 10: END(0)
2355
2356which would be optimizable with startbranch=1, first=1, last=7, tail=8
2357and would end up looking like:
2358
2359 1: TRIE(8)
2360 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2361 <foo>
2362 <bar>
2363 7: TAIL(8)
2364 8: EXACT <baz>(10)
2365 10: END(0)
2366
c80e42f3 2367 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
2368
2369is the recommended Unicode-aware way of saying
2370
2371 *(d++) = uv;
2372*/
2373
fab2782b 2374#define TRIE_STORE_REVCHAR(val) \
786e8c11 2375 STMT_START { \
73031816 2376 if (UTF) { \
668fcfea 2377 SV *zlopp = newSV(UTF8_MAXBYTES); \
88c9ea1e 2378 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
c80e42f3 2379 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
73031816
NC
2380 SvCUR_set(zlopp, kapow - flrbbbbb); \
2381 SvPOK_on(zlopp); \
2382 SvUTF8_on(zlopp); \
2383 av_push(revcharmap, zlopp); \
2384 } else { \
fab2782b 2385 char ooooff = (char)val; \
73031816
NC
2386 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2387 } \
2388 } STMT_END
786e8c11 2389
914a25d5
KW
2390/* This gets the next character from the input, folding it if not already
2391 * folded. */
2392#define TRIE_READ_CHAR STMT_START { \
2393 wordlen++; \
2394 if ( UTF ) { \
2395 /* if it is UTF then it is either already folded, or does not need \
2396 * folding */ \
1c1d615a 2397 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
2398 } \
2399 else if (folder == PL_fold_latin1) { \
7d006b13
KW
2400 /* This folder implies Unicode rules, which in the range expressible \
2401 * by not UTF is the lower case, with the two exceptions, one of \
2402 * which should have been taken care of before calling this */ \
2403 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2404 uvc = toLOWER_L1(*uc); \
2405 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2406 len = 1; \
914a25d5
KW
2407 } else { \
2408 /* raw data, will be folded later if needed */ \
2409 uvc = (U32)*uc; \
2410 len = 1; \
2411 } \
786e8c11
YO
2412} STMT_END
2413
2414
2415
2416#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2417 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
00195859 2418 U32 ging = TRIE_LIST_LEN( state ) * 2; \
f9003953 2419 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
00195859 2420 TRIE_LIST_LEN( state ) = ging; \
786e8c11
YO
2421 } \
2422 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2423 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2424 TRIE_LIST_CUR( state )++; \
2425} STMT_END
07be1b83 2426
786e8c11 2427#define TRIE_LIST_NEW(state) STMT_START { \
d09f14bf 2428 Newx( trie->states[ state ].trans.list, \
786e8c11
YO
2429 4, reg_trie_trans_le ); \
2430 TRIE_LIST_CUR( state ) = 1; \
2431 TRIE_LIST_LEN( state ) = 4; \
2432} STMT_END
07be1b83 2433
786e8c11
YO
2434#define TRIE_HANDLE_WORD(state) STMT_START { \
2435 U16 dupe= trie->states[ state ].wordnum; \
2436 regnode * const noper_next = regnext( noper ); \
2437 \
786e8c11
YO
2438 DEBUG_r({ \
2439 /* store the word for dumping */ \
2440 SV* tmp; \
2441 if (OP(noper) != NOTHING) \
740cce10 2442 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 2443 else \
740cce10 2444 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 2445 av_push( trie_words, tmp ); \
786e8c11
YO
2446 }); \
2447 \
2448 curword++; \
2e64971a
DM
2449 trie->wordinfo[curword].prev = 0; \
2450 trie->wordinfo[curword].len = wordlen; \
2451 trie->wordinfo[curword].accept = state; \
786e8c11
YO
2452 \
2453 if ( noper_next < tail ) { \
2454 if (!trie->jump) \
538e84ed
KW
2455 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2456 sizeof(U16) ); \
7f69552c 2457 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
2458 if (!jumper) \
2459 jumper = noper_next; \
2460 if (!nextbranch) \
2461 nextbranch= regnext(cur); \
2462 } \
2463 \
2464 if ( dupe ) { \
2e64971a
DM
2465 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2466 /* chain, so that when the bits of chain are later */\
2467 /* linked together, the dups appear in the chain */\
2468 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2469 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
2470 } else { \
2471 /* we haven't inserted this word yet. */ \
2472 trie->states[ state ].wordnum = curword; \
2473 } \
2474} STMT_END
07be1b83 2475
3dab1dad 2476
786e8c11
YO
2477#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2478 ( ( base + charid >= ucharcount \
2479 && base + charid < ubound \
2480 && state == trie->trans[ base - ucharcount + charid ].check \
2481 && trie->trans[ base - ucharcount + charid ].next ) \
2482 ? trie->trans[ base - ucharcount + charid ].next \
2483 : ( state==1 ? special : 0 ) \
2484 )
3dab1dad 2485
8bcafbf4
YO
2486#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2487STMT_START { \
2488 TRIE_BITMAP_SET(trie, uvc); \
2489 /* store the folded codepoint */ \
2490 if ( folder ) \
2491 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2492 \
2493 if ( !UTF ) { \
2494 /* store first byte of utf8 representation of */ \
2495 /* variant codepoints */ \
2496 if (! UVCHR_IS_INVARIANT(uvc)) { \
2497 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2498 } \
2499 } \
2500} STMT_END
786e8c11
YO
2501#define MADE_TRIE 1
2502#define MADE_JUMP_TRIE 2
2503#define MADE_EXACT_TRIE 4
3dab1dad 2504
a3621e74 2505STATIC I32
538e84ed
KW
2506S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2507 regnode *first, regnode *last, regnode *tail,
2508 U32 word_count, U32 flags, U32 depth)
a3621e74
YO
2509{
2510 /* first pass, loop through and scan words */
2511 reg_trie_data *trie;
55eed653 2512 HV *widecharmap = NULL;
2b8b4781 2513 AV *revcharmap = newAV();
a3621e74 2514 regnode *cur;
a3621e74
YO
2515 STRLEN len = 0;
2516 UV uvc = 0;
2517 U16 curword = 0;
2518 U32 next_alloc = 0;
786e8c11
YO
2519 regnode *jumper = NULL;
2520 regnode *nextbranch = NULL;
7f69552c 2521 regnode *convert = NULL;
2e64971a 2522 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 2523 /* we just use folder as a flag in utf8 */
1e696034 2524 const U8 * folder = NULL;
a3621e74 2525
3a611511
YO
2526 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2527 * which stands for one trie structure, one hash, optionally followed
2528 * by two arrays */
2b8b4781 2529#ifdef DEBUGGING
3a611511 2530 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2b8b4781
NC
2531 AV *trie_words = NULL;
2532 /* along with revcharmap, this only used during construction but both are
2533 * useful during debugging so we store them in the struct when debugging.
8e11feef 2534 */
2b8b4781 2535#else
cf78de0b 2536 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 2537 STRLEN trie_charcount=0;
3dab1dad 2538#endif
2b8b4781 2539 SV *re_trie_maxbuff;
a3621e74 2540 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2541
2542 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
2543#ifndef DEBUGGING
2544 PERL_UNUSED_ARG(depth);
2545#endif
a3621e74 2546
1e696034 2547 switch (flags) {
a4525e78 2548 case EXACT: case EXACTL: break;
2f7f8cb1 2549 case EXACTFA:
fab2782b 2550 case EXACTFU_SS:
a4525e78
KW
2551 case EXACTFU:
2552 case EXACTFLU8: folder = PL_fold_latin1; break;
1e696034 2553 case EXACTF: folder = PL_fold; break;
fab2782b 2554 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
2555 }
2556
c944940b 2557 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 2558 trie->refcount = 1;
3dab1dad 2559 trie->startstate = 1;
786e8c11 2560 trie->wordcount = word_count;
f8fc2ecf 2561 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 2562 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
a4525e78 2563 if (flags == EXACT || flags == EXACTL)
c944940b 2564 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
2565 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2566 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2567
a3621e74 2568 DEBUG_r({
2b8b4781 2569 trie_words = newAV();
a3621e74 2570 });
a3621e74 2571
0111c4fd 2572 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
316ebaf2 2573 assert(re_trie_maxbuff);
a3621e74 2574 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 2575 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 2576 }
df826430 2577 DEBUG_TRIE_COMPILE_r({
6ad9a8ab 2578 Perl_re_indentf( aTHX_
cb41e5d6
YO
2579 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2580 depth+1,
538e84ed
KW
2581 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2582 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
3dab1dad 2583 });
538e84ed 2584
7f69552c
YO
2585 /* Find the node we are going to overwrite */
2586 if ( first == startbranch && OP( last ) != BRANCH ) {
2587 /* whole branch chain */
2588 convert = first;
2589 } else {
2590 /* branch sub-chain */
2591 convert = NEXTOPER( first );
2592 }
538e84ed 2593
a3621e74
YO
2594 /* -- First loop and Setup --
2595
2596 We first traverse the branches and scan each word to determine if it
2597 contains widechars, and how many unique chars there are, this is
2598 important as we have to build a table with at least as many columns as we
2599 have unique chars.
2600
2601 We use an array of integers to represent the character codes 0..255
538e84ed
KW
2602 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2603 the native representation of the character value as the key and IV's for
2604 the coded index.
a3621e74
YO
2605
2606 *TODO* If we keep track of how many times each character is used we can
2607 remap the columns so that the table compression later on is more
3b753521 2608 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
2609 middle and the least common are on the outside. IMO this would be better
2610 than a most to least common mapping as theres a decent chance the most
2611 common letter will share a node with the least common, meaning the node
486ec47a 2612 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
2613 case is when we have the least common nodes twice.
2614
2615 */
2616
a3621e74 2617 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 2618 regnode *noper = NEXTOPER( cur );
944e05e3
YO
2619 const U8 *uc;
2620 const U8 *e;
bc031a7d 2621 int foldlen = 0;
07be1b83 2622 U32 wordlen = 0; /* required init */
bc031a7d
KW
2623 STRLEN minchars = 0;
2624 STRLEN maxchars = 0;
538e84ed
KW
2625 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2626 bitmap?*/
a3621e74 2627
3dab1dad 2628 if (OP(noper) == NOTHING) {
20ed8c88
YO
2629 /* skip past a NOTHING at the start of an alternation
2630 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2631 */
df826430 2632 regnode *noper_next= regnext(noper);
944e05e3
YO
2633 if (noper_next < tail)
2634 noper= noper_next;
2635 }
2636
dca5fc4c
YO
2637 if ( noper < tail &&
2638 (
2639 OP(noper) == flags ||
2640 (
2641 flags == EXACTFU &&
2642 OP(noper) == EXACTFU_SS
2643 )
2644 )
2645 ) {
944e05e3
YO
2646 uc= (U8*)STRING(noper);
2647 e= uc + STR_LEN(noper);
2648 } else {
2649 trie->minlen= 0;
2650 continue;
3dab1dad 2651 }
df826430 2652
944e05e3 2653
fab2782b 2654 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2655 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2656 regardless of encoding */
fab2782b
YO
2657 if (OP( noper ) == EXACTFU_SS) {
2658 /* false positives are ok, so just set this */
0dc4a61d 2659 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2660 }
2661 }
dca5fc4c 2662
bc031a7d
KW
2663 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2664 branch */
3dab1dad 2665 TRIE_CHARCOUNT(trie)++;
a3621e74 2666 TRIE_READ_CHAR;
645de4ce 2667
bc031a7d
KW
2668 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2669 * is in effect. Under /i, this character can match itself, or
2670 * anything that folds to it. If not under /i, it can match just
2671 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2672 * all fold to k, and all are single characters. But some folds
2673 * expand to more than one character, so for example LATIN SMALL
2674 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2675 * the string beginning at 'uc' is 'ffi', it could be matched by
2676 * three characters, or just by the one ligature character. (It
2677 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2678 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2679 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2680 * match.) The trie needs to know the minimum and maximum number
2681 * of characters that could match so that it can use size alone to
2682 * quickly reject many match attempts. The max is simple: it is
2683 * the number of folded characters in this branch (since a fold is
2684 * never shorter than what folds to it. */
2685
2686 maxchars++;
2687
2688 /* And the min is equal to the max if not under /i (indicated by
2689 * 'folder' being NULL), or there are no multi-character folds. If
2690 * there is a multi-character fold, the min is incremented just
2691 * once, for the character that folds to the sequence. Each
2692 * character in the sequence needs to be added to the list below of
2693 * characters in the trie, but we count only the first towards the
2694 * min number of characters needed. This is done through the
2695 * variable 'foldlen', which is returned by the macros that look
2696 * for these sequences as the number of bytes the sequence
2697 * occupies. Each time through the loop, we decrement 'foldlen' by
2698 * how many bytes the current char occupies. Only when it reaches
2699 * 0 do we increment 'minchars' or look for another multi-character
2700 * sequence. */
2701 if (folder == NULL) {
2702 minchars++;
2703 }
2704 else if (foldlen > 0) {
2705 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
645de4ce
KW
2706 }
2707 else {
bc031a7d
KW
2708 minchars++;
2709
2710 /* See if *uc is the beginning of a multi-character fold. If
2711 * so, we decrement the length remaining to look at, to account
2712 * for the current character this iteration. (We can use 'uc'
2713 * instead of the fold returned by TRIE_READ_CHAR because for
2714 * non-UTF, the latin1_safe macro is smart enough to account
2715 * for all the unfolded characters, and because for UTF, the
2716 * string will already have been folded earlier in the
2717 * compilation process */
2718 if (UTF) {
2719 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2720 foldlen -= UTF8SKIP(uc);
645de4ce
KW
2721 }
2722 }
bc031a7d
KW
2723 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2724 foldlen--;
2725 }
645de4ce 2726 }
bc031a7d
KW
2727
2728 /* The current character (and any potential folds) should be added
2729 * to the possible matching characters for this position in this
2730 * branch */
a3621e74 2731 if ( uvc < 256 ) {
fab2782b
YO
2732 if ( folder ) {
2733 U8 folded= folder[ (U8) uvc ];
2734 if ( !trie->charmap[ folded ] ) {
2735 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2736 TRIE_STORE_REVCHAR( folded );
2737 }
2738 }
a3621e74
YO
2739 if ( !trie->charmap[ uvc ] ) {
2740 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2741 TRIE_STORE_REVCHAR( uvc );
a3621e74 2742 }
02daf0ab 2743 if ( set_bit ) {
62012aee
KW
2744 /* store the codepoint in the bitmap, and its folded
2745 * equivalent. */
8bcafbf4 2746 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
02daf0ab
YO
2747 set_bit = 0; /* We've done our bit :-) */
2748 }
a3621e74 2749 } else {
bc031a7d
KW
2750
2751 /* XXX We could come up with the list of code points that fold
2752 * to this using PL_utf8_foldclosures, except not for
2753 * multi-char folds, as there may be multiple combinations
2754 * there that could work, which needs to wait until runtime to
2755 * resolve (The comment about LIGATURE FFI above is such an
2756 * example */
2757
a3621e74 2758 SV** svpp;
55eed653
NC
2759 if ( !widecharmap )
2760 widecharmap = newHV();
a3621e74 2761
55eed653 2762 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
2763
2764 if ( !svpp )
147e3846 2765 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
a3621e74
YO
2766
2767 if ( !SvTRUE( *svpp ) ) {
2768 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 2769 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
2770 }
2771 }
bc031a7d
KW
2772 } /* end loop through characters in this branch of the trie */
2773
2774 /* We take the min and max for this branch and combine to find the min
2775 * and max for all branches processed so far */
3dab1dad 2776 if( cur == first ) {
bc031a7d
KW
2777 trie->minlen = minchars;
2778 trie->maxlen = maxchars;
2779 } else if (minchars < trie->minlen) {
2780 trie->minlen = minchars;
2781 } else if (maxchars > trie->maxlen) {
2782 trie->maxlen = maxchars;
fab2782b 2783 }
a3621e74
YO
2784 } /* end first pass */
2785 DEBUG_TRIE_COMPILE_r(
6ad9a8ab 2786 Perl_re_indentf( aTHX_
cb41e5d6
YO
2787 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2788 depth+1,
55eed653 2789 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
2790 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2791 (int)trie->minlen, (int)trie->maxlen )
a3621e74 2792 );
a3621e74
YO
2793
2794 /*
2795 We now know what we are dealing with in terms of unique chars and
2796 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
2797 representation using a flat table will take. If it's over a reasonable
2798 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
2799 conservative but potentially much slower representation using an array
2800 of lists.
2801
2802 At the end we convert both representations into the same compressed
2803 form that will be used in regexec.c for matching with. The latter
2804 is a form that cannot be used to construct with but has memory
2805 properties similar to the list form and access properties similar
2806 to the table form making it both suitable for fast searches and
2807 small enough that its feasable to store for the duration of a program.
2808
2809 See the comment in the code where the compressed table is produced
2810 inplace from the flat tabe representation for an explanation of how
2811 the compression works.
2812
2813 */
2814
2815
2e64971a
DM
2816 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2817 prev_states[1] = 0;
2818
538e84ed
KW
2819 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2820 > SvIV(re_trie_maxbuff) )
2821 {
a3621e74
YO
2822 /*
2823 Second Pass -- Array Of Lists Representation
2824
2825 Each state will be represented by a list of charid:state records
2826 (reg_trie_trans_le) the first such element holds the CUR and LEN
2827 points of the allocated array. (See defines above).
2828
2829 We build the initial structure using the lists, and then convert
2830 it into the compressed table form which allows faster lookups
2831 (but cant be modified once converted).
a3621e74
YO
2832 */
2833
a3621e74
YO
2834 STRLEN transcount = 1;
2835
6ad9a8ab 2836 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
cb41e5d6 2837 depth+1));
686b73d4 2838
c944940b
JH
2839 trie->states = (reg_trie_state *)
2840 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2841 sizeof(reg_trie_state) );
a3621e74
YO
2842 TRIE_LIST_NEW(1);
2843 next_alloc = 2;
2844
2845 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2846
df826430 2847 regnode *noper = NEXTOPER( cur );
c445ea15
AL
2848 U32 state = 1; /* required init */
2849 U16 charid = 0; /* sanity init */
07be1b83 2850 U32 wordlen = 0; /* required init */
c445ea15 2851
df826430
YO
2852 if (OP(noper) == NOTHING) {
2853 regnode *noper_next= regnext(noper);
944e05e3
YO
2854 if (noper_next < tail)
2855 noper= noper_next;
df826430
YO
2856 }
2857
944e05e3
YO
2858 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2859 const U8 *uc= (U8*)STRING(noper);
2860 const U8 *e= uc + STR_LEN(noper);
2861
786e8c11 2862 for ( ; uc < e ; uc += len ) {
c445ea15 2863
786e8c11 2864 TRIE_READ_CHAR;
c445ea15 2865
786e8c11
YO
2866 if ( uvc < 256 ) {
2867 charid = trie->charmap[ uvc ];
c445ea15 2868 } else {
538e84ed
KW
2869 SV** const svpp = hv_fetch( widecharmap,
2870 (char*)&uvc,
2871 sizeof( UV ),
2872 0);
786e8c11
YO
2873 if ( !svpp ) {
2874 charid = 0;
2875 } else {
2876 charid=(U16)SvIV( *svpp );
2877 }
c445ea15 2878 }
538e84ed
KW
2879 /* charid is now 0 if we dont know the char read, or
2880 * nonzero if we do */
786e8c11 2881 if ( charid ) {
a3621e74 2882
786e8c11
YO
2883 U16 check;
2884 U32 newstate = 0;
a3621e74 2885
786e8c11
YO
2886 charid--;
2887 if ( !trie->states[ state ].trans.list ) {
2888 TRIE_LIST_NEW( state );
c445ea15 2889 }
538e84ed
KW
2890 for ( check = 1;
2891 check <= TRIE_LIST_USED( state );
2892 check++ )
2893 {
2894 if ( TRIE_LIST_ITEM( state, check ).forid
2895 == charid )
2896 {
786e8c11
YO
2897 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2898 break;
2899 }
2900 }
2901 if ( ! newstate ) {
2902 newstate = next_alloc++;
2e64971a 2903 prev_states[newstate] = state;
786e8c11
YO
2904 TRIE_LIST_PUSH( state, charid, newstate );
2905 transcount++;
2906 }
2907 state = newstate;
2908 } else {
147e3846 2909 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
c445ea15 2910 }
a28509cc 2911 }
c445ea15 2912 }
3dab1dad 2913 TRIE_HANDLE_WORD(state);
a3621e74
YO
2914
2915 } /* end second pass */
2916
1e2e3d02 2917 /* next alloc is the NEXT state to be allocated */
538e84ed 2918 trie->statecount = next_alloc;
c944940b
JH
2919 trie->states = (reg_trie_state *)
2920 PerlMemShared_realloc( trie->states,
2921 next_alloc
2922 * sizeof(reg_trie_state) );
a3621e74 2923
3dab1dad 2924 /* and now dump it out before we compress it */
2b8b4781
NC
2925 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2926 revcharmap, next_alloc,
2927 depth+1)
1e2e3d02 2928 );
a3621e74 2929
c944940b
JH
2930 trie->trans = (reg_trie_trans *)
2931 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
2932 {
2933 U32 state;
a3621e74
YO
2934 U32 tp = 0;
2935 U32 zp = 0;
2936
2937
2938 for( state=1 ; state < next_alloc ; state ++ ) {
2939 U32 base=0;
2940
2941 /*
2942 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 2943 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
a3621e74
YO
2944 );
2945 */
2946
2947 if (trie->states[state].trans.list) {
2948 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2949 U16 maxid=minid;
a28509cc 2950 U16 idx;
a3621e74
YO
2951
2952 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
2953 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2954 if ( forid < minid ) {
2955 minid=forid;
2956 } else if ( forid > maxid ) {
2957 maxid=forid;
2958 }
a3621e74
YO
2959 }
2960 if ( transcount < tp + maxid - minid + 1) {
2961 transcount *= 2;
c944940b
JH
2962 trie->trans = (reg_trie_trans *)
2963 PerlMemShared_realloc( trie->trans,
446bd890
NC
2964 transcount
2965 * sizeof(reg_trie_trans) );
538e84ed
KW
2966 Zero( trie->trans + (transcount / 2),
2967 transcount / 2,
2968 reg_trie_trans );
a3621e74
YO
2969 }
2970 base = trie->uniquecharcount + tp - minid;
2971 if ( maxid == minid ) {
2972 U32 set = 0;
2973 for ( ; zp < tp ; zp++ ) {
2974 if ( ! trie->trans[ zp ].next ) {
2975 base = trie->uniquecharcount + zp - minid;
538e84ed
KW
2976 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2977 1).newstate;
a3621e74
YO
2978 trie->trans[ zp ].check = state;
2979 set = 1;
2980 break;
2981 }
2982 }
2983 if ( !set ) {
538e84ed
KW
2984 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2985 1).newstate;
a3621e74
YO
2986 trie->trans[ tp ].check = state;
2987 tp++;
2988 zp = tp;
2989 }
2990 } else {
2991 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
538e84ed
KW
2992 const U32 tid = base
2993 - trie->uniquecharcount
2994 + TRIE_LIST_ITEM( state, idx ).forid;
2995 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2996 idx ).newstate;
a3621e74
YO
2997 trie->trans[ tid ].check = state;
2998 }
2999 tp += ( maxid - minid + 1 );
3000 }
3001 Safefree(trie->states[ state ].trans.list);
3002 }
3003 /*
3004 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3005 Perl_re_printf( aTHX_ " base: %d\n",base);
a3621e74
YO
3006 );
3007 */
3008 trie->states[ state ].trans.base=base;
3009 }
cc601c31 3010 trie->lasttrans = tp + 1;
a3621e74
YO
3011 }
3012 } else {
3013 /*
3014 Second Pass -- Flat Table Representation.
3015
b423522f
KW
3016 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3017 each. We know that we will need Charcount+1 trans at most to store
3018 the data (one row per char at worst case) So we preallocate both
3019 structures assuming worst case.
a3621e74
YO
3020
3021 We then construct the trie using only the .next slots of the entry
3022 structs.
3023
b423522f
KW
3024 We use the .check field of the first entry of the node temporarily
3025 to make compression both faster and easier by keeping track of how
3026 many non zero fields are in the node.
a3621e74
YO
3027
3028 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3029 transition.
3030
b423522f
KW
3031 There are two terms at use here: state as a TRIE_NODEIDX() which is
3032 a number representing the first entry of the node, and state as a
3033 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3034 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3035 if there are 2 entrys per node. eg:
a3621e74
YO
3036
3037 A B A B
3038 1. 2 4 1. 3 7
3039 2. 0 3 3. 0 5
3040 3. 0 0 5. 0 0
3041 4. 0 0 7. 0 0
3042
b423522f
KW
3043 The table is internally in the right hand, idx form. However as we
3044 also have to deal with the states array which is indexed by nodenum
3045 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
3046
3047 */
6ad9a8ab 3048 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
cb41e5d6 3049 depth+1));
3dab1dad 3050
c944940b
JH
3051 trie->trans = (reg_trie_trans *)
3052 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3053 * trie->uniquecharcount + 1,
3054 sizeof(reg_trie_trans) );
3055 trie->states = (reg_trie_state *)
3056 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3057 sizeof(reg_trie_state) );
a3621e74
YO
3058 next_alloc = trie->uniquecharcount + 1;
3059
3dab1dad 3060
a3621e74
YO
3061 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3062
df826430 3063 regnode *noper = NEXTOPER( cur );
a3621e74
YO
3064
3065 U32 state = 1; /* required init */
3066
3067 U16 charid = 0; /* sanity init */
3068 U32 accept_state = 0; /* sanity init */
a3621e74 3069
07be1b83 3070 U32 wordlen = 0; /* required init */
a3621e74 3071
df826430
YO
3072 if (OP(noper) == NOTHING) {
3073 regnode *noper_next= regnext(noper);
944e05e3
YO
3074 if (noper_next < tail)
3075 noper= noper_next;
df826430 3076 }
fab2782b 3077
944e05e3
YO
3078 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
3079 const U8 *uc= (U8*)STRING(noper);
3080 const U8 *e= uc + STR_LEN(noper);
3081
786e8c11 3082 for ( ; uc < e ; uc += len ) {
a3621e74 3083
786e8c11 3084 TRIE_READ_CHAR;
a3621e74 3085
786e8c11
YO
3086 if ( uvc < 256 ) {
3087 charid = trie->charmap[ uvc ];
3088 } else {
538e84ed
KW
3089 SV* const * const svpp = hv_fetch( widecharmap,
3090 (char*)&uvc,
3091 sizeof( UV ),
3092 0);
786e8c11 3093 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 3094 }
786e8c11
YO
3095 if ( charid ) {
3096 charid--;
3097 if ( !trie->trans[ state + charid ].next ) {
3098 trie->trans[ state + charid ].next = next_alloc;
3099 trie->trans[ state ].check++;
2e64971a
DM
3100 prev_states[TRIE_NODENUM(next_alloc)]
3101 = TRIE_NODENUM(state);
786e8c11
YO
3102 next_alloc += trie->uniquecharcount;
3103 }
3104 state = trie->trans[ state + charid ].next;
3105 } else {
147e3846 3106 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
786e8c11 3107 }
538e84ed
KW
3108 /* charid is now 0 if we dont know the char read, or
3109 * nonzero if we do */
a3621e74 3110 }
a3621e74 3111 }
3dab1dad
YO
3112 accept_state = TRIE_NODENUM( state );
3113 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
3114
3115 } /* end second pass */
3116
3dab1dad 3117 /* and now dump it out before we compress it */
2b8b4781
NC
3118 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3119 revcharmap,
3120 next_alloc, depth+1));
a3621e74 3121
a3621e74
YO
3122 {
3123 /*
3124 * Inplace compress the table.*
3125
3126 For sparse data sets the table constructed by the trie algorithm will
3127 be mostly 0/FAIL transitions or to put it another way mostly empty.
3128 (Note that leaf nodes will not contain any transitions.)
3129
3130 This algorithm compresses the tables by eliminating most such
3131 transitions, at the cost of a modest bit of extra work during lookup:
3132
3133 - Each states[] entry contains a .base field which indicates the
3134 index in the state[] array wheres its transition data is stored.
3135
3b753521 3136 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
3137
3138 - If .base is nonzero then charid is added to it to find an entry in
3139 the trans array.
3140
3141 -If trans[states[state].base+charid].check!=state then the
3142 transition is taken to be a 0/Fail transition. Thus if there are fail
3143 transitions at the front of the node then the .base offset will point
3144 somewhere inside the previous nodes data (or maybe even into a node
3145 even earlier), but the .check field determines if the transition is
3146 valid.
3147
786e8c11 3148 XXX - wrong maybe?
a3621e74 3149 The following process inplace converts the table to the compressed
3b753521 3150 table: We first do not compress the root node 1,and mark all its
a3621e74 3151 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
3152 allows us to do a DFA construction from the compressed table later,
3153 and ensures that any .base pointers we calculate later are greater
3154 than 0.
a3621e74
YO
3155
3156 - We set 'pos' to indicate the first entry of the second node.
3157
3158 - We then iterate over the columns of the node, finding the first and
3159 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3160 and set the .check pointers accordingly, and advance pos
3161 appropriately and repreat for the next node. Note that when we copy
3162 the next pointers we have to convert them from the original
3163 NODEIDX form to NODENUM form as the former is not valid post
3164 compression.
3165
3166 - If a node has no transitions used we mark its base as 0 and do not
3167 advance the pos pointer.
3168
3169 - If a node only has one transition we use a second pointer into the
3170 structure to fill in allocated fail transitions from other states.
3171 This pointer is independent of the main pointer and scans forward
3172 looking for null transitions that are allocated to a state. When it
3173 finds one it writes the single transition into the "hole". If the
786e8c11 3174 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
3175
3176 - Once compressed we can Renew/realloc the structures to release the
3177 excess space.
3178
3179 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3180 specifically Fig 3.47 and the associated pseudocode.
3181
3182 demq
3183 */
a3b680e6 3184 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 3185 U32 state, charid;
a3621e74 3186 U32 pos = 0, zp=0;
1e2e3d02 3187 trie->statecount = laststate;
a3621e74
YO
3188
3189 for ( state = 1 ; state < laststate ; state++ ) {
3190 U8 flag = 0;
a28509cc
AL
3191 const U32 stateidx = TRIE_NODEIDX( state );
3192 const U32 o_used = trie->trans[ stateidx ].check;
3193 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
3194 trie->trans[ stateidx ].check = 0;
3195
538e84ed
KW
3196 for ( charid = 0;
3197 used && charid < trie->uniquecharcount;
3198 charid++ )
3199 {
a3621e74
YO
3200 if ( flag || trie->trans[ stateidx + charid ].next ) {
3201 if ( trie->trans[ stateidx + charid ].next ) {
3202 if (o_used == 1) {
3203 for ( ; zp < pos ; zp++ ) {
3204 if ( ! trie->trans[ zp ].next ) {
3205 break;
3206 }
3207 }
538e84ed
KW
3208 trie->states[ state ].trans.base
3209 = zp
3210 + trie->uniquecharcount
3211 - charid ;
3212 trie->trans[ zp ].next
3213 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3214 + charid ].next );
a3621e74
YO
3215 trie->trans[ zp ].check = state;
3216 if ( ++zp > pos ) pos = zp;
3217 break;
3218 }
3219 used--;
3220 }
3221 if ( !flag ) {
3222 flag = 1;
538e84ed
KW
3223 trie->states[ state ].trans.base
3224 = pos + trie->uniquecharcount - charid ;
a3621e74 3225 }
538e84ed
KW
3226 trie->trans[ pos ].next
3227 = SAFE_TRIE_NODENUM(
3228 trie->trans[ stateidx + charid ].next );
a3621e74
YO
3229 trie->trans[ pos ].check = state;
3230 pos++;
3231 }
3232 }
3233 }
cc601c31 3234 trie->lasttrans = pos + 1;
c944940b
JH
3235 trie->states = (reg_trie_state *)
3236 PerlMemShared_realloc( trie->states, laststate
3237 * sizeof(reg_trie_state) );
a3621e74 3238 DEBUG_TRIE_COMPILE_MORE_r(
147e3846 3239 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
cb41e5d6 3240 depth+1,
538e84ed
KW
3241 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3242 + 1 ),
3243 (IV)next_alloc,
3244 (IV)pos,
3245 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
a3621e74
YO
3246 );
3247
3248 } /* end table compress */
3249 }
1e2e3d02 3250 DEBUG_TRIE_COMPILE_MORE_r(
147e3846 3251 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
cb41e5d6 3252 depth+1,
1e2e3d02
YO
3253 (UV)trie->statecount,
3254 (UV)trie->lasttrans)
3255 );
cc601c31 3256 /* resize the trans array to remove unused space */
c944940b
JH
3257 trie->trans = (reg_trie_trans *)
3258 PerlMemShared_realloc( trie->trans, trie->lasttrans
3259 * sizeof(reg_trie_trans) );
a3621e74 3260
538e84ed 3261 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
3262 U8 nodetype =(U8)(flags & 0xFF);
3263 char *str=NULL;
538e84ed 3264
07be1b83 3265#ifdef DEBUGGING
e62cc96a 3266 regnode *optimize = NULL;
7122b237
YO
3267#ifdef RE_TRACK_PATTERN_OFFSETS
3268
b57a0404
JH
3269 U32 mjd_offset = 0;
3270 U32 mjd_nodelen = 0;
7122b237
YO
3271#endif /* RE_TRACK_PATTERN_OFFSETS */
3272#endif /* DEBUGGING */
a3621e74 3273 /*
3dab1dad
YO
3274 This means we convert either the first branch or the first Exact,
3275 depending on whether the thing following (in 'last') is a branch
3276 or not and whther first is the startbranch (ie is it a sub part of
3277 the alternation or is it the whole thing.)
3b753521 3278 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 3279 the whole branch sequence, including the first.
a3621e74 3280 */
3dab1dad 3281 /* Find the node we are going to overwrite */
7f69552c 3282 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 3283 /* branch sub-chain */
3dab1dad 3284 NEXT_OFF( first ) = (U16)(last - first);
7122b237 3285#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
3286 DEBUG_r({
3287 mjd_offset= Node_Offset((convert));
3288 mjd_nodelen= Node_Length((convert));
3289 });
7122b237 3290#endif
7f69552c 3291 /* whole branch chain */
7122b237
YO
3292 }
3293#ifdef RE_TRACK_PATTERN_OFFSETS
3294 else {
7f69552c
YO
3295 DEBUG_r({
3296 const regnode *nop = NEXTOPER( convert );
3297 mjd_offset= Node_Offset((nop));
3298 mjd_nodelen= Node_Length((nop));
3299 });
07be1b83
YO
3300 }
3301 DEBUG_OPTIMISE_r(
147e3846 3302 Perl_re_indentf( aTHX_ "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
cb41e5d6 3303 depth+1,
786e8c11 3304 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 3305 );
7122b237 3306#endif
538e84ed 3307 /* But first we check to see if there is a common prefix we can
3dab1dad
YO
3308 split out as an EXACT and put in front of the TRIE node. */
3309 trie->startstate= 1;
55eed653 3310 if ( trie->bitmap && !widecharmap && !trie->jump ) {
5ee57374
YO
3311 /* we want to find the first state that has more than
3312 * one transition, if that state is not the first state
3313 * then we have a common prefix which we can remove.
3314 */
3dab1dad 3315 U32 state;
1e2e3d02 3316 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 3317 U32 ofs = 0;
ee0dfd0b
YO
3318 I32 first_ofs = -1; /* keeps track of the ofs of the first
3319 transition, -1 means none */
8e11feef
RGS
3320 U32 count = 0;
3321 const U32 base = trie->states[ state ].trans.base;
a3621e74 3322
5ee57374 3323 /* does this state terminate an alternation? */
3dab1dad 3324 if ( trie->states[state].wordnum )
8e11feef 3325 count = 1;
a3621e74 3326
8e11feef 3327 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
3328 if ( ( base + ofs >= trie->uniquecharcount ) &&
3329 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
3330 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3331 {
3dab1dad 3332 if ( ++count > 1 ) {
5ee57374 3333 /* we have more than one transition */
d3d91c7c
YO
3334 SV **tmp;
3335 U8 *ch;
5ee57374
YO
3336 /* if this is the first state there is no common prefix
3337 * to extract, so we can exit */
8e11feef 3338 if ( state == 1 ) break;
d3d91c7c
YO
3339 tmp = av_fetch( revcharmap, ofs, 0);
3340 ch = (U8*)SvPV_nolen_const( *tmp );
3341
5ee57374
YO
3342 /* if we are on count 2 then we need to initialize the
3343 * bitmap, and store the previous char if there was one
3344 * in it*/
3dab1dad 3345 if ( count == 2 ) {
5ee57374 3346 /* clear the bitmap */
3dab1dad
YO
3347 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3348 DEBUG_OPTIMISE_r(
147e3846 3349 Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [",
cb41e5d6 3350 depth+1,
786e8c11 3351 (UV)state));
ee0dfd0b
YO
3352 if (first_ofs >= 0) {
3353 SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
be8e71aa 3354 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 3355
8bcafbf4 3356 TRIE_BITMAP_SET_FOLDED(trie,*ch,folder);
3dab1dad 3357 DEBUG_OPTIMISE_r(
6ad9a8ab 3358 Perl_re_printf( aTHX_ "%s", (char*)ch)
3dab1dad 3359 );
8e11feef
RGS
3360 }
3361 }
8bcafbf4
YO
3362 /* store the current firstchar in the bitmap */
3363 TRIE_BITMAP_SET_FOLDED(trie,*ch,folder);
6ad9a8ab 3364 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
8e11feef 3365 }
ee0dfd0b 3366 first_ofs = ofs;
8e11feef 3367 }
3dab1dad
YO
3368 }
3369 if ( count == 1 ) {
5ee57374
YO
3370 /* This state has only one transition, its transition is part
3371 * of a common prefix - we need to concatenate the char it
3372 * represents to what we have so far. */
ee0dfd0b 3373 SV **tmp = av_fetch( revcharmap, first_ofs, 0);
c490c714
YO
3374 STRLEN len;
3375 char *ch = SvPV( *tmp, len );
de734bd5
A
3376 DEBUG_OPTIMISE_r({
3377 SV *sv=sv_newmortal();
147e3846 3378 Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
cb41e5d6 3379 depth+1,
ee0dfd0b 3380 (UV)state, (UV)first_ofs,
538e84ed 3381 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
de734bd5
A
3382 PL_colors[0], PL_colors[1],
3383 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed 3384 PERL_PV_ESCAPE_FIRSTCHAR
de734bd5
A
3385 )
3386 );
3387 });
3dab1dad
YO
3388 if ( state==1 ) {
3389 OP( convert ) = nodetype;
3390 str=STRING(convert);
3391 STR_LEN(convert)=0;
3392 }
c490c714
YO
3393 STR_LEN(convert) += len;
3394 while (len--)
de734bd5 3395 *str++ = *ch++;
8e11feef 3396 } else {
538e84ed 3397#ifdef DEBUGGING
8e11feef 3398 if (state>1)
6ad9a8ab 3399 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
f9049ba1 3400#endif
8e11feef
RGS
3401 break;
3402 }
3403 }
2e64971a 3404 trie->prefixlen = (state-1);
3dab1dad 3405 if (str) {
8e11feef 3406 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 3407 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 3408 trie->startstate = state;
07be1b83
YO
3409 trie->minlen -= (state - 1);
3410 trie->maxlen -= (state - 1);
33809eae
JH
3411#ifdef DEBUGGING
3412 /* At least the UNICOS C compiler choked on this
3413 * being argument to DEBUG_r(), so let's just have
3414 * it right here. */
3415 if (
3416#ifdef PERL_EXT_RE_BUILD
3417 1
3418#else
3419 DEBUG_r_TEST
3420#endif
3421 ) {
3422 regnode *fix = convert;
3423 U32 word = trie->wordcount;
3424 mjd_nodelen++;
3425 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3426 while( ++fix < n ) {
3427 Set_Node_Offset_Length(fix, 0, 0);
3428 }
3429 while (word--) {
3430 SV ** const tmp = av_fetch( trie_words, word, 0 );
3431 if (tmp) {
3432 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3433 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3434 else
3435 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3436 }
3437 }
3438 }
3439#endif
8e11feef
RGS
3440 if (trie->maxlen) {
3441 convert = n;
3442 } else {
3dab1dad 3443 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 3444 DEBUG_r(optimize= n);
3dab1dad
YO
3445 }
3446 }
3447 }
538e84ed
KW
3448 if (!jumper)
3449 jumper = last;
3dab1dad 3450 if ( trie->maxlen ) {
8e11feef
RGS
3451 NEXT_OFF( convert ) = (U16)(tail - convert);
3452 ARG_SET( convert, data_slot );
538e84ed
KW
3453 /* Store the offset to the first unabsorbed branch in
3454 jump[0], which is otherwise unused by the jump logic.
786e8c11 3455 We use this when dumping a trie and during optimisation. */
538e84ed 3456 if (trie->jump)
7f69552c 3457 trie->jump[0] = (U16)(nextbranch - convert);
538e84ed 3458
6c48061a
YO
3459 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3460 * and there is a bitmap
3461 * and the first "jump target" node we found leaves enough room
3462 * then convert the TRIE node into a TRIEC node, with the bitmap
3463 * embedded inline in the opcode - this is hypothetically faster.
3464 */
3465 if ( !trie->states[trie->startstate].wordnum
3466 && trie->bitmap
3467 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
3468 {
3469 OP( convert ) = TRIEC;
3470 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 3471 PerlMemShared_free(trie->bitmap);
786e8c11 3472 trie->bitmap= NULL;
538e84ed 3473 } else
786e8c11 3474 OP( convert ) = TRIE;
a3621e74 3475
3dab1dad
YO
3476 /* store the type in the flags */
3477 convert->flags = nodetype;
a5ca303d 3478 DEBUG_r({
538e84ed
KW
3479 optimize = convert
3480 + NODE_STEP_REGNODE
a5ca303d
YO
3481 + regarglen[ OP( convert ) ];
3482 });
538e84ed 3483 /* XXX We really should free up the resource in trie now,
a5ca303d 3484 as we won't use them - (which resources?) dmq */
3dab1dad 3485 }