This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
give REGEXP SVs the POK flag again
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
afda64e8 84EXTERN_C const struct regexp_engine my_reg_engine;
54df2634
NC
85#else
86# include "regcomp.h"
87#endif
a687059c 88
f7e03a10 89#include "dquote_inline.h"
b992490d 90#include "invlist_inline.h"
1b0f46bf 91#include "unicode_constants.h"
04e98a4d 92
538e84ed
KW
93#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
94 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
f12c0118
KW
95#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
96 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 97#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 98#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 99
a687059c
LW
100#ifndef STATIC
101#define STATIC static
102#endif
103
3f910e62
YO
104/* this is a chain of data about sub patterns we are processing that
105 need to be handled separately/specially in study_chunk. Its so
106 we can simulate recursion without losing state. */
107struct scan_frame;
108typedef struct scan_frame {
109 regnode *last_regnode; /* last node to process in this frame */
110 regnode *next_regnode; /* next node to process when last is reached */
111 U32 prev_recursed_depth;
112 I32 stopparen; /* what stopparen do we use */
113 U32 is_top_frame; /* what flags do we use? */
114
115 struct scan_frame *this_prev_frame; /* this previous frame */
116 struct scan_frame *prev_frame; /* previous frame */
117 struct scan_frame *next_frame; /* next frame */
118} scan_frame;
119
f4ae5a27
KW
120/* Certain characters are output as a sequence with the first being a
121 * backslash. */
eb7486de 122#define isBACKSLASHED_PUNCT(c) strchr("-[]\\^", c)
f4ae5a27
KW
123
124
09b2b2e6 125struct RExC_state_t {
514a91f1
DM
126 U32 flags; /* RXf_* are we folding, multilining? */
127 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 128 char *precomp; /* uncompiled string. */
711b303b 129 char *precomp_end; /* pointer to end of uncompiled string. */
288b8c02 130 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf 131 regexp *rx; /* perl core regexp structure */
538e84ed
KW
132 regexp_internal *rxi; /* internal data for regexp object
133 pprivate field */
fac92740 134 char *start; /* Start of input for compile */
830247a4
IZ
135 char *end; /* End of input for compile */
136 char *parse; /* Input-scan pointer. */
285b5ca0
KW
137 char *adjusted_start; /* 'start', adjusted. See code use */
138 STRLEN precomp_adj; /* an offset beyond precomp. See code use */
ea3daa5d 139 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 140 regnode *emit_start; /* Start of emitted-code area */
538e84ed
KW
141 regnode *emit_bound; /* First regnode outside of the
142 allocated space */
f7c7e32a
DM
143 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
144 implies compiling, so don't emit */
9a81a976
KW
145 regnode_ssc emit_dummy; /* placeholder for emit to point to;
146 large enough for the largest
147 non-EXACTish node, so can use it as
148 scratch in pass1 */
830247a4
IZ
149 I32 naughty; /* How bad is this pattern? */
150 I32 sawback; /* Did we see \1, ...? */
151 U32 seen;
ea3daa5d 152 SSize_t size; /* Code size. */
538e84ed
KW
153 I32 npar; /* Capture buffer count, (OPEN) plus
154 one. ("par" 0 is the whole
155 pattern)*/
156 I32 nestroot; /* root parens we are in - used by
157 accept */
830247a4
IZ
158 I32 extralen;
159 I32 seen_zerolen;
40d049e4
YO
160 regnode **open_parens; /* pointers to open parens */
161 regnode **close_parens; /* pointers to close parens */
d5a00e4a 162 regnode *end_op; /* END node in program */
02daf0ab
YO
163 I32 utf8; /* whether the pattern is utf8 or not */
164 I32 orig_utf8; /* whether the pattern was originally in utf8 */
165 /* XXX use this for future optimisation of case
166 * where pattern must be upgraded to utf8. */
e40e74fe
KW
167 I32 uni_semantics; /* If a d charset modifier should use unicode
168 rules, even if the pattern is not in
169 utf8 */
81714fb9 170 HV *paren_names; /* Paren names */
538e84ed 171
40d049e4 172 regnode **recurse; /* Recurse regops */
d5a00e4a 173 I32 recurse_count; /* Number of recurse regops we have generated */
4286711a 174 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
538e84ed 175 through */
09a65838 176 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 177 I32 in_lookbehind;
4624b182 178 I32 contains_locale;
bb3f3ed2 179 I32 override_recoding;
b6d67071
KW
180#ifdef EBCDIC
181 I32 recode_x_to_native;
182#endif
9d53c457 183 I32 in_multi_char_class;
1acab4c5 184 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
68e2671b 185 within pattern */
b1603ef8 186 int code_index; /* next code_blocks[] slot */
ee273784 187 SSize_t maxlen; /* mininum possible number of chars in string to match */
3f910e62
YO
188 scan_frame *frame_head;
189 scan_frame *frame_last;
190 U32 frame_count;
7eec73eb 191 AV *warn_text;
dc6d7f5c 192#ifdef ADD_TO_REGEXEC
830247a4
IZ
193 char *starttry; /* -Dr: where regtry was called. */
194#define RExC_starttry (pRExC_state->starttry)
195#endif
d24ca0c5 196 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 197#ifdef DEBUGGING
be8e71aa 198 const char *lastparse;
3dab1dad 199 I32 lastnum;
1f1031fe 200 AV *paren_name_list; /* idx -> name */
d9a72fcc 201 U32 study_chunk_recursed_count;
c9f0d54c
YO
202 SV *mysv1;
203 SV *mysv2;
3dab1dad
YO
204#define RExC_lastparse (pRExC_state->lastparse)
205#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 206#define RExC_paren_name_list (pRExC_state->paren_name_list)
d9a72fcc 207#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
c9f0d54c
YO
208#define RExC_mysv (pRExC_state->mysv1)
209#define RExC_mysv1 (pRExC_state->mysv1)
210#define RExC_mysv2 (pRExC_state->mysv2)
211
3dab1dad 212#endif
512c0f5a 213 bool seen_unfolded_sharp_s;
911bd04e 214 bool strict;
da7cf1cc 215 bool study_started;
09b2b2e6 216};
830247a4 217
e2509266 218#define RExC_flags (pRExC_state->flags)
514a91f1 219#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 220#define RExC_precomp (pRExC_state->precomp)
285b5ca0
KW
221#define RExC_precomp_adj (pRExC_state->precomp_adj)
222#define RExC_adjusted_start (pRExC_state->adjusted_start)
711b303b 223#define RExC_precomp_end (pRExC_state->precomp_end)
288b8c02 224#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 225#define RExC_rx (pRExC_state->rx)
f8fc2ecf 226#define RExC_rxi (pRExC_state->rxi)
fac92740 227#define RExC_start (pRExC_state->start)
830247a4
IZ
228#define RExC_end (pRExC_state->end)
229#define RExC_parse (pRExC_state->parse)
230#define RExC_whilem_seen (pRExC_state->whilem_seen)
512c0f5a
KW
231
232/* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
233 * EXACTF node, hence was parsed under /di rules. If later in the parse,
234 * something forces the pattern into using /ui rules, the sharp s should be
235 * folded into the sequence 'ss', which takes up more space than previously
236 * calculated. This means that the sizing pass needs to be restarted. (The
237 * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node
238 * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
239 * so there is no need to resize [perl #125990]. */
240#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
241
7122b237 242#ifdef RE_TRACK_PATTERN_OFFSETS
538e84ed
KW
243#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
244 others */
7122b237 245#endif
830247a4 246#define RExC_emit (pRExC_state->emit)
f7c7e32a 247#define RExC_emit_dummy (pRExC_state->emit_dummy)
fac92740 248#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 249#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
250#define RExC_sawback (pRExC_state->sawback)
251#define RExC_seen (pRExC_state->seen)
252#define RExC_size (pRExC_state->size)
ee273784 253#define RExC_maxlen (pRExC_state->maxlen)
830247a4 254#define RExC_npar (pRExC_state->npar)
e2e6a0f1 255#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
256#define RExC_extralen (pRExC_state->extralen)
257#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 258#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 259#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 260#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
261#define RExC_open_parens (pRExC_state->open_parens)
262#define RExC_close_parens (pRExC_state->close_parens)
5bd2d46e 263#define RExC_end_op (pRExC_state->end_op)
81714fb9 264#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
265#define RExC_recurse (pRExC_state->recurse)
266#define RExC_recurse_count (pRExC_state->recurse_count)
09a65838 267#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
538e84ed
KW
268#define RExC_study_chunk_recursed_bytes \
269 (pRExC_state->study_chunk_recursed_bytes)
b57e4118 270#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 271#define RExC_contains_locale (pRExC_state->contains_locale)
b6d67071
KW
272#ifdef EBCDIC
273# define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
274#endif
9d53c457 275#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
3f910e62
YO
276#define RExC_frame_head (pRExC_state->frame_head)
277#define RExC_frame_last (pRExC_state->frame_last)
278#define RExC_frame_count (pRExC_state->frame_count)
67cdf558 279#define RExC_strict (pRExC_state->strict)
da7cf1cc 280#define RExC_study_started (pRExC_state->study_started)
7eec73eb 281#define RExC_warn_text (pRExC_state->warn_text)
830247a4 282
99807a43
HS
283/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
284 * a flag to disable back-off on the fixed/floating substrings - if it's
285 * a high complexity pattern we assume the benefit of avoiding a full match
286 * is worth the cost of checking for the substrings even if they rarely help.
287 */
288#define RExC_naughty (pRExC_state->naughty)
289#define TOO_NAUGHTY (10)
290#define MARK_NAUGHTY(add) \
291 if (RExC_naughty < TOO_NAUGHTY) \
292 RExC_naughty += (add)
293#define MARK_NAUGHTY_EXP(exp, add) \
294 if (RExC_naughty < TOO_NAUGHTY) \
295 RExC_naughty += RExC_naughty / (exp) + (add)
cde0cee5 296
a687059c
LW
297#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
298#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
412f55bb 299 ((*s) == '{' && regcurly(s)))
a687059c
LW
300
301/*
302 * Flags to be passed up and down.
303 */
a687059c 304#define WORST 0 /* Worst case. */
a3b492c3 305#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee 306
e64f369d 307/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
308 * character. (There needs to be a case: in the switch statement in regexec.c
309 * for any node marked SIMPLE.) Note that this is not the same thing as
310 * REGNODE_SIMPLE */
fda99bee 311#define SIMPLE 0x02
e64f369d 312#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
313#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
314#define TRYAGAIN 0x10 /* Weeded out a declaration. */
b97943f7
KW
315#define RESTART_PASS1 0x20 /* Need to restart sizing pass */
316#define NEED_UTF8 0x40 /* In conjunction with RESTART_PASS1, need to
317 calcuate sizes as UTF-8 */
a687059c 318
3dab1dad
YO
319#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
320
07be1b83
YO
321/* whether trie related optimizations are enabled */
322#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
323#define TRIE_STUDY_OPT
786e8c11 324#define FULL_TRIE_STUDY
07be1b83
YO
325#define TRIE_STCLASS
326#endif
1de06328
YO
327
328
40d049e4
YO
329
330#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
331#define PBITVAL(paren) (1 << ((paren) & 7))
332#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
333#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
334#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
335
82a6ada4 336#define REQUIRE_UTF8(flagp) STMT_START { \
8d9c2815 337 if (!UTF) { \
82a6ada4
KW
338 assert(PASS1); \
339 *flagp = RESTART_PASS1|NEED_UTF8; \
8d9c2815
NC
340 return NULL; \
341 } \
82a6ada4 342 } STMT_END
40d049e4 343
512c0f5a
KW
344/* Change from /d into /u rules, and restart the parse if we've already seen
345 * something whose size would increase as a result, by setting *flagp and
346 * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates
347 * we've change to /u during the parse. */
348#define REQUIRE_UNI_RULES(flagp, restart_retval) \
349 STMT_START { \
350 if (DEPENDS_SEMANTICS) { \
351 assert(PASS1); \
352 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
353 RExC_uni_semantics = 1; \
354 if (RExC_seen_unfolded_sharp_s) { \
355 *flagp |= RESTART_PASS1; \
356 return restart_retval; \
357 } \
358 } \
359 } STMT_END
360
f19b1a63
KW
361/* This converts the named class defined in regcomp.h to its equivalent class
362 * number defined in handy.h. */
363#define namedclass_to_classnum(class) ((int) ((class) / 2))
364#define classnum_to_namedclass(classnum) ((classnum) * 2)
365
de92f5e6
KW
366#define _invlist_union_complement_2nd(a, b, output) \
367 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
368#define _invlist_intersection_complement_2nd(a, b, output) \
369 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
370
1de06328
YO
371/* About scan_data_t.
372
373 During optimisation we recurse through the regexp program performing
374 various inplace (keyhole style) optimisations. In addition study_chunk
375 and scan_commit populate this data structure with information about
538e84ed 376 what strings MUST appear in the pattern. We look for the longest
3b753521 377 string that must appear at a fixed location, and we look for the
1de06328
YO
378 longest string that may appear at a floating location. So for instance
379 in the pattern:
538e84ed 380
1de06328 381 /FOO[xX]A.*B[xX]BAR/
538e84ed 382
1de06328
YO
383 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
384 strings (because they follow a .* construct). study_chunk will identify
385 both FOO and BAR as being the longest fixed and floating strings respectively.
538e84ed 386
1de06328 387 The strings can be composites, for instance
538e84ed 388
1de06328 389 /(f)(o)(o)/
538e84ed 390
1de06328 391 will result in a composite fixed substring 'foo'.
538e84ed 392
1de06328 393 For each string some basic information is maintained:
538e84ed 394
6ea77169 395 - min_offset
1de06328
YO
396 This is the position the string must appear at, or not before.
397 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
398 characters must match before the string we are searching for.
399 Likewise when combined with minlenp and the length of the string it
538e84ed 400 tells us how many characters must appear after the string we have
1de06328 401 found.
538e84ed 402
1de06328
YO
403 - max_offset
404 Only used for floating strings. This is the rightmost point that
ea3daa5d 405 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328 406 string can occur infinitely far to the right.
6e12f832 407 For fixed strings, it is equal to min_offset.
538e84ed 408
1de06328 409 - minlenp
2d608413
KW
410 A pointer to the minimum number of characters of the pattern that the
411 string was found inside. This is important as in the case of positive
538e84ed 412 lookahead or positive lookbehind we can have multiple patterns
1de06328 413 involved. Consider
538e84ed 414
1de06328 415 /(?=FOO).*F/
538e84ed 416
1de06328
YO
417 The minimum length of the pattern overall is 3, the minimum length
418 of the lookahead part is 3, but the minimum length of the part that
538e84ed 419 will actually match is 1. So 'FOO's minimum length is 3, but the
1de06328 420 minimum length for the F is 1. This is important as the minimum length
538e84ed 421 is used to determine offsets in front of and behind the string being
1de06328 422 looked for. Since strings can be composites this is the length of the
486ec47a 423 pattern at the time it was committed with a scan_commit. Note that
1de06328 424 the length is calculated by study_chunk, so that the minimum lengths
538e84ed 425 are not known until the full pattern has been compiled, thus the
1de06328 426 pointer to the value.
538e84ed 427
1de06328 428 - lookbehind
538e84ed 429
1de06328 430 In the case of lookbehind the string being searched for can be
538e84ed 431 offset past the start point of the final matching string.
1de06328
YO
432 If this value was just blithely removed from the min_offset it would
433 invalidate some of the calculations for how many chars must match
434 before or after (as they are derived from min_offset and minlen and
538e84ed 435 the length of the string being searched for).
1de06328
YO
436 When the final pattern is compiled and the data is moved from the
437 scan_data_t structure into the regexp structure the information
538e84ed
KW
438 about lookbehind is factored in, with the information that would
439 have been lost precalculated in the end_shift field for the
1de06328
YO
440 associated string.
441
442 The fields pos_min and pos_delta are used to store the minimum offset
538e84ed 443 and the delta to the maximum offset at the current point in the pattern.
1de06328
YO
444
445*/
2c2d71f5 446
cb6aef8b
DM
447struct scan_data_substrs {
448 SV *str; /* longest substring found in pattern */
449 SSize_t min_offset; /* earliest point in string it can appear */
450 SSize_t max_offset; /* latest point in string it can appear */
451 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
452 SSize_t lookbehind; /* is the pos of the string modified by LB */
11683ecb 453 I32 flags; /* per substring SF_* and SCF_* flags */
cb6aef8b
DM
454};
455
2c2d71f5 456typedef struct scan_data_t {
1de06328
YO
457 /*I32 len_min; unused */
458 /*I32 len_delta; unused */
49f55535 459 SSize_t pos_min;
ea3daa5d 460 SSize_t pos_delta;
2c2d71f5 461 SV *last_found;
ea3daa5d 462 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 463 SSize_t last_start_min;
ea3daa5d 464 SSize_t last_start_max;
2099df82
DM
465 U8 cur_is_floating; /* whether the last_* values should be set as
466 * the next fixed (0) or floating (1)
467 * substring */
6ea77169 468
2099df82
DM
469 /* [0] is longest fixed substring so far, [1] is longest float so far */
470 struct scan_data_substrs substrs[2];
6ea77169 471
11683ecb 472 I32 flags; /* common SF_* and SCF_* flags */
2c2d71f5 473 I32 whilem_c;
49f55535 474 SSize_t *last_closep;
b8f7bb16 475 regnode_ssc *start_class;
2c2d71f5
JH
476} scan_data_t;
477
a687059c 478/*
e50aee73 479 * Forward declarations for pregcomp()'s friends.
a687059c 480 */
a0d0e21e 481
6ea77169 482static const scan_data_t zero_scan_data = {
a2ca2017 483 0, 0, NULL, 0, 0, 0, 0,
6ea77169 484 {
11683ecb
DM
485 { NULL, 0, 0, 0, 0, 0 },
486 { NULL, 0, 0, 0, 0, 0 },
6ea77169
DM
487 },
488 0, 0, NULL, NULL
489};
c277df42 490
11683ecb
DM
491/* study flags */
492
07be1b83
YO
493#define SF_BEFORE_SEOL 0x0001
494#define SF_BEFORE_MEOL 0x0002
11683ecb 495#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
c277df42 496
07be1b83
YO
497#define SF_IS_INF 0x0040
498#define SF_HAS_PAR 0x0080
499#define SF_IN_PAR 0x0100
500#define SF_HAS_EVAL 0x0200
bc604ad8
KW
501
502
503/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
504 * longest substring in the pattern. When it is not set the optimiser keeps
505 * track of position, but does not keep track of the actual strings seen,
506 *
507 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
508 * /foo/i will not.
509 *
510 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
511 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
512 * turned off because of the alternation (BRANCH). */
07be1b83 513#define SCF_DO_SUBSTR 0x0400
bc604ad8 514
653099ff
GS
515#define SCF_DO_STCLASS_AND 0x0800
516#define SCF_DO_STCLASS_OR 0x1000
517#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 518#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 519
786e8c11 520#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
538e84ed 521#define SCF_SEEN_ACCEPT 0x8000
688e0391 522#define SCF_TRIE_DOING_RESTUDY 0x10000
4286711a
YO
523#define SCF_IN_DEFINE 0x20000
524
525
526
07be1b83 527
43fead97 528#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
529
530/* The enums for all these are ordered so things work out correctly */
a62b1201 531#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
538e84ed
KW
532#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
533 == REGEX_DEPENDS_CHARSET)
00b27cfc 534#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
538e84ed
KW
535#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
536 >= REGEX_UNICODE_CHARSET)
537#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
538 == REGEX_ASCII_RESTRICTED_CHARSET)
539#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
540 >= REGEX_ASCII_RESTRICTED_CHARSET)
541#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
542 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 543
43fead97 544#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 545
f2c2a6ab
KW
546/* For programs that want to be strictly Unicode compatible by dying if any
547 * attempt is made to match a non-Unicode code point against a Unicode
548 * property. */
549#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
550
93733859 551#define OOB_NAMEDCLASS -1
b8c5462f 552
8e661ac5
KW
553/* There is no code point that is out-of-bounds, so this is problematic. But
554 * its only current use is to initialize a variable that is always set before
555 * looked at. */
556#define OOB_UNICODE 0xDEADBEEF
557
a0ed51b3 558#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
a0ed51b3 559
8615cb43 560
b45f050a
JF
561/* length of regex to show in messages that don't mark a position within */
562#define RegexLengthToShowInErrorMessages 127
563
564/*
565 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
566 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
567 * op/pragma/warn/regcomp.
568 */
7253e4e3
RK
569#define MARKER1 "<-- HERE" /* marker as it appears in the description */
570#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 571
538e84ed 572#define REPORT_LOCATION " in regex; marked by " MARKER1 \
147e3846 573 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
b45f050a 574
285b5ca0
KW
575/* The code in this file in places uses one level of recursion with parsing
576 * rebased to an alternate string constructed by us in memory. This can take
577 * the form of something that is completely different from the input, or
578 * something that uses the input as part of the alternate. In the first case,
579 * there should be no possibility of an error, as we are in complete control of
580 * the alternate string. But in the second case we don't control the input
581 * portion, so there may be errors in that. Here's an example:
582 * /[abc\x{DF}def]/ui
583 * is handled specially because \x{df} folds to a sequence of more than one
584 * character, 'ss'. What is done is to create and parse an alternate string,
585 * which looks like this:
586 * /(?:\x{DF}|[abc\x{DF}def])/ui
587 * where it uses the input unchanged in the middle of something it constructs,
588 * which is a branch for the DF outside the character class, and clustering
589 * parens around the whole thing. (It knows enough to skip the DF inside the
590 * class while in this substitute parse.) 'abc' and 'def' may have errors that
591 * need to be reported. The general situation looks like this:
592 *
593 * sI tI xI eI
594 * Input: ----------------------------------------------------
595 * Constructed: ---------------------------------------------------
596 * sC tC xC eC EC
597 *
598 * The input string sI..eI is the input pattern. The string sC..EC is the
599 * constructed substitute parse string. The portions sC..tC and eC..EC are
600 * constructed by us. The portion tC..eC is an exact duplicate of the input
601 * pattern tI..eI. In the diagram, these are vertically aligned. Suppose that
602 * while parsing, we find an error at xC. We want to display a message showing
603 * the real input string. Thus we need to find the point xI in it which
604 * corresponds to xC. xC >= tC, since the portion of the string sC..tC has
605 * been constructed by us, and so shouldn't have errors. We get:
606 *
607 * xI = sI + (tI - sI) + (xC - tC)
608 *
609 * and, the offset into sI is:
610 *
611 * (xI - sI) = (tI - sI) + (xC - tC)
612 *
613 * When the substitute is constructed, we save (tI -sI) as RExC_precomp_adj,
614 * and we save tC as RExC_adjusted_start.
903c858a
KW
615 *
616 * During normal processing of the input pattern, everything points to that,
617 * with RExC_precomp_adj set to 0, and RExC_adjusted_start set to sI.
285b5ca0
KW
618 */
619
620#define tI_sI RExC_precomp_adj
621#define tC RExC_adjusted_start
622#define sC RExC_precomp
623#define xI_offset(xC) ((IV) (tI_sI + (xC - tC)))
624#define xI(xC) (sC + xI_offset(xC))
625#define eC RExC_precomp_end
626
627#define REPORT_LOCATION_ARGS(xC) \
628 UTF8fARG(UTF, \
629 (xI(xC) > eC) /* Don't run off end */ \
630 ? eC - sC /* Length before the <--HERE */ \
631 : xI_offset(xC), \
632 sC), /* The input pattern printed up to the <--HERE */ \
633 UTF8fARG(UTF, \
634 (xI(xC) > eC) ? 0 : eC - xI(xC), /* Length after <--HERE */ \
635 (xI(xC) > eC) ? eC : xI(xC)) /* pattern after <--HERE */
c1d900c3 636
8a6d8ec6
HS
637/* Used to point after bad bytes for an error message, but avoid skipping
638 * past a nul byte. */
639#define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
640
b45f050a
JF
641/*
642 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
643 * arg. Show regex, up to a maximum length. If it's too long, chop and add
644 * "...".
645 */
58e23c8d 646#define _FAIL(code) STMT_START { \
bfed75c6 647 const char *ellipses = ""; \
711b303b 648 IV len = RExC_precomp_end - RExC_precomp; \
ccb2c380
MP
649 \
650 if (!SIZE_ONLY) \
a5e7bc51 651 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
652 if (len > RegexLengthToShowInErrorMessages) { \
653 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
654 len = RegexLengthToShowInErrorMessages - 10; \
655 ellipses = "..."; \
656 } \
58e23c8d 657 code; \
ccb2c380 658} STMT_END
8615cb43 659
58e23c8d 660#define FAIL(msg) _FAIL( \
147e3846 661 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
c1d900c3 662 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
663
664#define FAIL2(msg,arg) _FAIL( \
147e3846 665 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
c1d900c3 666 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 667
b45f050a 668/*
b45f050a
JF
669 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
670 */
ccb2c380 671#define Simple_vFAIL(m) STMT_START { \
ccb2c380 672 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
d528642a 673 m, REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 674} STMT_END
b45f050a
JF
675
676/*
677 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
678 */
ccb2c380
MP
679#define vFAIL(m) STMT_START { \
680 if (!SIZE_ONLY) \
a5e7bc51 681 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
682 Simple_vFAIL(m); \
683} STMT_END
b45f050a
JF
684
685/*
686 * Like Simple_vFAIL(), but accepts two arguments.
687 */
ccb2c380 688#define Simple_vFAIL2(m,a1) STMT_START { \
d528642a
KW
689 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
690 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 691} STMT_END
b45f050a
JF
692
693/*
694 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
695 */
ccb2c380
MP
696#define vFAIL2(m,a1) STMT_START { \
697 if (!SIZE_ONLY) \
a5e7bc51 698 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
699 Simple_vFAIL2(m, a1); \
700} STMT_END
b45f050a
JF
701
702
703/*
704 * Like Simple_vFAIL(), but accepts three arguments.
705 */
ccb2c380 706#define Simple_vFAIL3(m, a1, a2) STMT_START { \
c1d900c3 707 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 708 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 709} STMT_END
b45f050a
JF
710
711/*
712 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
713 */
ccb2c380
MP
714#define vFAIL3(m,a1,a2) STMT_START { \
715 if (!SIZE_ONLY) \
a5e7bc51 716 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
717 Simple_vFAIL3(m, a1, a2); \
718} STMT_END
b45f050a
JF
719
720/*
721 * Like Simple_vFAIL(), but accepts four arguments.
722 */
ccb2c380 723#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
d528642a
KW
724 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
725 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 726} STMT_END
b45f050a 727
95db3ffa
KW
728#define vFAIL4(m,a1,a2,a3) STMT_START { \
729 if (!SIZE_ONLY) \
730 SAVEFREESV(RExC_rx_sv); \
731 Simple_vFAIL4(m, a1, a2, a3); \
732} STMT_END
733
946095af 734/* A specialized version of vFAIL2 that works with UTF8f */
d528642a
KW
735#define vFAIL2utf8f(m, a1) STMT_START { \
736 if (!SIZE_ONLY) \
737 SAVEFREESV(RExC_rx_sv); \
738 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
739 REPORT_LOCATION_ARGS(RExC_parse)); \
946095af
BF
740} STMT_END
741
3ba22297 742#define vFAIL3utf8f(m, a1, a2) STMT_START { \
3ba22297
KW
743 if (!SIZE_ONLY) \
744 SAVEFREESV(RExC_rx_sv); \
745 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 746 REPORT_LOCATION_ARGS(RExC_parse)); \
3ba22297
KW
747} STMT_END
748
499333dc
KW
749/* These have asserts in them because of [perl #122671] Many warnings in
750 * regcomp.c can occur twice. If they get output in pass1 and later in that
751 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
752 * would get output again. So they should be output in pass2, and these
753 * asserts make sure new warnings follow that paradigm. */
946095af 754
5e0a247b
KW
755/* m is not necessarily a "literal string", in this macro */
756#define reg_warn_non_literal_string(loc, m) STMT_START { \
d528642a
KW
757 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
758 "%s" REPORT_LOCATION, \
759 m, REPORT_LOCATION_ARGS(loc)); \
5e0a247b
KW
760} STMT_END
761
668c081a 762#define ckWARNreg(loc,m) STMT_START { \
d528642a
KW
763 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
764 m REPORT_LOCATION, \
765 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
766} STMT_END
767
b927b7e9 768#define vWARN(loc, m) STMT_START { \
d528642a
KW
769 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
770 m REPORT_LOCATION, \
771 REPORT_LOCATION_ARGS(loc)); \
b927b7e9
KW
772} STMT_END
773
0d6106aa 774#define vWARN_dep(loc, m) STMT_START { \
d528642a
KW
775 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
776 m REPORT_LOCATION, \
777 REPORT_LOCATION_ARGS(loc)); \
0d6106aa
KW
778} STMT_END
779
147508a2 780#define ckWARNdep(loc,m) STMT_START { \
d528642a
KW
781 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
782 m REPORT_LOCATION, \
783 REPORT_LOCATION_ARGS(loc)); \
147508a2
KW
784} STMT_END
785
d528642a
KW
786#define ckWARNregdep(loc,m) STMT_START { \
787 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
788 WARN_REGEXP), \
789 m REPORT_LOCATION, \
790 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
791} STMT_END
792
d528642a
KW
793#define ckWARN2reg_d(loc,m, a1) STMT_START { \
794 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
795 m REPORT_LOCATION, \
796 a1, REPORT_LOCATION_ARGS(loc)); \
2335b3d3
KW
797} STMT_END
798
d528642a
KW
799#define ckWARN2reg(loc, m, a1) STMT_START { \
800 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
801 m REPORT_LOCATION, \
802 a1, REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
803} STMT_END
804
d528642a
KW
805#define vWARN3(loc, m, a1, a2) STMT_START { \
806 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
807 m REPORT_LOCATION, \
808 a1, a2, REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
809} STMT_END
810
d528642a
KW
811#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
812 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
813 m REPORT_LOCATION, \
814 a1, a2, \
815 REPORT_LOCATION_ARGS(loc)); \
668c081a
NC
816} STMT_END
817
ccb2c380 818#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
d528642a
KW
819 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
820 m REPORT_LOCATION, \
821 a1, a2, a3, \
822 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
823} STMT_END
824
668c081a 825#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
d528642a
KW
826 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
827 m REPORT_LOCATION, \
828 a1, a2, a3, \
829 REPORT_LOCATION_ARGS(loc)); \
668c081a
NC
830} STMT_END
831
ccb2c380 832#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
d528642a
KW
833 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
834 m REPORT_LOCATION, \
835 a1, a2, a3, a4, \
836 REPORT_LOCATION_ARGS(loc)); \
ccb2c380 837} STMT_END
9d1d55b5 838
538e84ed 839/* Macros for recording node offsets. 20001227 mjd@plover.com
fac92740
MJD
840 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
841 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
842 * Element 0 holds the number n.
07be1b83 843 * Position is 1 indexed.
fac92740 844 */
7122b237
YO
845#ifndef RE_TRACK_PATTERN_OFFSETS
846#define Set_Node_Offset_To_R(node,byte)
847#define Set_Node_Offset(node,byte)
848#define Set_Cur_Node_Offset
849#define Set_Node_Length_To_R(node,len)
850#define Set_Node_Length(node,len)
6a86c6ad 851#define Set_Node_Cur_Length(node,start)
538e84ed
KW
852#define Node_Offset(n)
853#define Node_Length(n)
7122b237
YO
854#define Set_Node_Offset_Length(node,offset,len)
855#define ProgLen(ri) ri->u.proglen
856#define SetProgLen(ri,x) ri->u.proglen = x
857#else
858#define ProgLen(ri) ri->u.offsets[0]
859#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
860#define Set_Node_Offset_To_R(node,byte) STMT_START { \
861 if (! SIZE_ONLY) { \
862 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 863 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 864 if((node) < 0) { \
538e84ed
KW
865 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
866 (int)(node)); \
ccb2c380
MP
867 } else { \
868 RExC_offsets[2*(node)-1] = (byte); \
869 } \
870 } \
871} STMT_END
872
873#define Set_Node_Offset(node,byte) \
874 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
875#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
876
877#define Set_Node_Length_To_R(node,len) STMT_START { \
878 if (! SIZE_ONLY) { \
879 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 880 __LINE__, (int)(node), (int)(len))); \
ccb2c380 881 if((node) < 0) { \
538e84ed
KW
882 Perl_croak(aTHX_ "value of node is %d in Length macro", \
883 (int)(node)); \
ccb2c380
MP
884 } else { \
885 RExC_offsets[2*(node)] = (len); \
886 } \
887 } \
888} STMT_END
889
890#define Set_Node_Length(node,len) \
891 Set_Node_Length_To_R((node)-RExC_emit_start, len)
6a86c6ad
NC
892#define Set_Node_Cur_Length(node, start) \
893 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
894
895/* Get offsets and lengths */
896#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
897#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
898
07be1b83
YO
899#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
900 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
901 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
902} STMT_END
7122b237 903#endif
07be1b83
YO
904
905#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
906#define EXPERIMENTAL_INPLACESCAN
f427392e 907#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 908
cb41e5d6
YO
909#ifdef DEBUGGING
910int
6ad9a8ab 911Perl_re_printf(pTHX_ const char *fmt, ...)
cb41e5d6
YO
912{
913 va_list ap;
914 int result;
915 PerlIO *f= Perl_debug_log;
916 PERL_ARGS_ASSERT_RE_PRINTF;
917 va_start(ap, fmt);
918 result = PerlIO_vprintf(f, fmt, ap);
919 va_end(ap);
920 return result;
921}
922
923int
7b031478 924Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
cb41e5d6
YO
925{
926 va_list ap;
927 int result;
928 PerlIO *f= Perl_debug_log;
929 PERL_ARGS_ASSERT_RE_INDENTF;
930 va_start(ap, depth);
daeb874b 931 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
cb41e5d6
YO
932 result = PerlIO_vprintf(f, fmt, ap);
933 va_end(ap);
934 return result;
935}
936#endif /* DEBUGGING */
937
7b031478 938#define DEBUG_RExC_seen() \
538e84ed 939 DEBUG_OPTIMISE_MORE_r({ \
6ad9a8ab 940 Perl_re_printf( aTHX_ "RExC_seen: "); \
538e84ed 941 \
e384d5c1 942 if (RExC_seen & REG_ZERO_LEN_SEEN) \
6ad9a8ab 943 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
538e84ed 944 \
e384d5c1 945 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
6ad9a8ab 946 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
538e84ed 947 \
e384d5c1 948 if (RExC_seen & REG_GPOS_SEEN) \
6ad9a8ab 949 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
538e84ed 950 \
e384d5c1 951 if (RExC_seen & REG_RECURSE_SEEN) \
6ad9a8ab 952 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
538e84ed 953 \
7b031478 954 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
6ad9a8ab 955 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
538e84ed 956 \
e384d5c1 957 if (RExC_seen & REG_VERBARG_SEEN) \
6ad9a8ab 958 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
538e84ed 959 \
e384d5c1 960 if (RExC_seen & REG_CUTGROUP_SEEN) \
6ad9a8ab 961 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
538e84ed 962 \
e384d5c1 963 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
6ad9a8ab 964 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
538e84ed 965 \
e384d5c1 966 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
6ad9a8ab 967 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
538e84ed 968 \
7b031478 969 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
6ad9a8ab 970 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
ee273784 971 \
6ad9a8ab 972 Perl_re_printf( aTHX_ "\n"); \
9e9ecfdf
YO
973 });
974
fdfb4f21 975#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
6ad9a8ab 976 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
fdfb4f21 977
fdfb4f21 978
f5a36d78
DM
979#ifdef DEBUGGING
980static void
981S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
982 const char *close_str)
983{
984 if (!flags)
985 return;
986
987 Perl_re_printf( aTHX_ "%s", open_str);
11683ecb
DM
988 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
989 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
f5a36d78
DM
990 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
991 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
992 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
993 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
994 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
995 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
996 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
997 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
998 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
999 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1000 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1001 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1002 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1003 Perl_re_printf( aTHX_ "%s", close_str);
1004}
1005
1006
1007static void
1008S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1009 U32 depth, int is_inf)
1010{
1011 GET_RE_DEBUG_FLAGS_DECL;
1012
1013 DEBUG_OPTIMISE_MORE_r({
1014 if (!data)
1015 return;
1016 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1017 depth,
1018 where,
1019 (IV)data->pos_min,
1020 (IV)data->pos_delta,
1021 (UV)data->flags
1022 );
1023
1024 S_debug_show_study_flags(aTHX_ data->flags," [","]");
fdfb4f21 1025
f5a36d78
DM
1026 Perl_re_printf( aTHX_
1027 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1028 (IV)data->whilem_c,
1029 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1030 is_inf ? "INF " : ""
1031 );
1032
11683ecb 1033 if (data->last_found) {
37b6262f 1034 int i;
11683ecb
DM
1035 Perl_re_printf(aTHX_
1036 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1037 SvPVX_const(data->last_found),
1038 (IV)data->last_end,
1039 (IV)data->last_start_min,
1040 (IV)data->last_start_max
1041 );
1042
2099df82 1043 for (i = 0; i < 2; i++) {
37b6262f
DM
1044 Perl_re_printf(aTHX_
1045 " %s%s: '%s' @ %" IVdf "/%" IVdf,
2099df82
DM
1046 data->cur_is_floating == i ? "*" : "",
1047 i ? "Float" : "Fixed",
37b6262f
DM
1048 SvPVX_const(data->substrs[i].str),
1049 (IV)data->substrs[i].min_offset,
1050 (IV)data->substrs[i].max_offset
1051 );
1052 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1053 }
11683ecb 1054 }
f5a36d78
DM
1055
1056 Perl_re_printf( aTHX_ "\n");
1057 });
1058}
1059
1060
1061static void
1062S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1063 regnode *scan, U32 depth, U32 flags)
1064{
1065 GET_RE_DEBUG_FLAGS_DECL;
1066
1067 DEBUG_OPTIMISE_r({
1068 regnode *Next;
1069
1070 if (!scan)
1071 return;
1072 Next = regnext(scan);
1073 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1074 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1075 depth,
1076 str,
1077 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1078 Next ? (REG_NODE_NUM(Next)) : 0 );
1079 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1080 Perl_re_printf( aTHX_ "\n");
1081 });
1082}
1083
1084
1085# define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1086 S_debug_studydata(aTHX_ where, data, depth, is_inf)
1087
1088# define DEBUG_PEEP(str, scan, depth, flags) \
1089 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1090
1091#else
1092# define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1093# define DEBUG_PEEP(str, scan, depth, flags) NOOP
1094#endif
1de06328 1095
cb41e5d6 1096
c6871b76
KW
1097/* =========================================================
1098 * BEGIN edit_distance stuff.
1099 *
1100 * This calculates how many single character changes of any type are needed to
1101 * transform a string into another one. It is taken from version 3.1 of
1102 *
1103 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1104 */
1105
1106/* Our unsorted dictionary linked list. */
1107/* Note we use UVs, not chars. */
1108
1109struct dictionary{
1110 UV key;
1111 UV value;
1112 struct dictionary* next;
1113};
1114typedef struct dictionary item;
1115
1116
1117PERL_STATIC_INLINE item*
1118push(UV key,item* curr)
1119{
1120 item* head;
1121 Newxz(head, 1, item);
1122 head->key = key;
1123 head->value = 0;
1124 head->next = curr;
1125 return head;
1126}
1127
1128
1129PERL_STATIC_INLINE item*
1130find(item* head, UV key)
1131{
1132 item* iterator = head;
1133 while (iterator){
1134 if (iterator->key == key){
1135 return iterator;
1136 }
1137 iterator = iterator->next;
1138 }
1139
1140 return NULL;
1141}
1142
1143PERL_STATIC_INLINE item*
1144uniquePush(item* head,UV key)
1145{
1146 item* iterator = head;
1147
1148 while (iterator){
1149 if (iterator->key == key) {
1150 return head;
1151 }
1152 iterator = iterator->next;
1153 }
1154
1155 return push(key,head);
1156}
1157
1158PERL_STATIC_INLINE void
1159dict_free(item* head)
1160{
1161 item* iterator = head;
1162
1163 while (iterator) {
1164 item* temp = iterator;
1165 iterator = iterator->next;
1166 Safefree(temp);
1167 }
1168
1169 head = NULL;
1170}
1171
1172/* End of Dictionary Stuff */
1173
1174/* All calculations/work are done here */
1175STATIC int
1176S_edit_distance(const UV* src,
1177 const UV* tgt,
1178 const STRLEN x, /* length of src[] */
1179 const STRLEN y, /* length of tgt[] */
1180 const SSize_t maxDistance
1181)
1182{
1183 item *head = NULL;
1184 UV swapCount,swapScore,targetCharCount,i,j;
1185 UV *scores;
1186 UV score_ceil = x + y;
1187
1188 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1189
1190 /* intialize matrix start values */
1191 Newxz(scores, ( (x + 2) * (y + 2)), UV);
1192 scores[0] = score_ceil;
1193 scores[1 * (y + 2) + 0] = score_ceil;
1194 scores[0 * (y + 2) + 1] = score_ceil;
1195 scores[1 * (y + 2) + 1] = 0;
1196 head = uniquePush(uniquePush(head,src[0]),tgt[0]);
1197
1198 /* work loops */
1199 /* i = src index */
1200 /* j = tgt index */
1201 for (i=1;i<=x;i++) {
1202 if (i < x)
1203 head = uniquePush(head,src[i]);
1204 scores[(i+1) * (y + 2) + 1] = i;
1205 scores[(i+1) * (y + 2) + 0] = score_ceil;
1206 swapCount = 0;
1207
1208 for (j=1;j<=y;j++) {
1209 if (i == 1) {
1210 if(j < y)
1211 head = uniquePush(head,tgt[j]);
1212 scores[1 * (y + 2) + (j + 1)] = j;
1213 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1214 }
1215
1216 targetCharCount = find(head,tgt[j-1])->value;
1217 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1218
1219 if (src[i-1] != tgt[j-1]){
1220 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1221 }
1222 else {
1223 swapCount = j;
1224 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1225 }
1226 }
1227
1228 find(head,src[i-1])->value = i;
1229 }
1230
1231 {
1232 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1233 dict_free(head);
1234 Safefree(scores);
1235 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1236 }
1237}
1238
1239/* END of edit_distance() stuff
1240 * ========================================================= */
1241
8e35b056
KW
1242/* is c a control character for which we have a mnemonic? */
1243#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1244
549b4e78
KW
1245STATIC const char *
1246S_cntrl_to_mnemonic(const U8 c)
1247{
1248 /* Returns the mnemonic string that represents character 'c', if one
1249 * exists; NULL otherwise. The only ones that exist for the purposes of
1250 * this routine are a few control characters */
1251
1252 switch (c) {
1253 case '\a': return "\\a";
1254 case '\b': return "\\b";
1255 case ESC_NATIVE: return "\\e";
1256 case '\f': return "\\f";
1257 case '\n': return "\\n";
1258 case '\r': return "\\r";
1259 case '\t': return "\\t";
1260 }
1261
1262 return NULL;
1263}
1264
653099ff 1265/* Mark that we cannot extend a found fixed substring at this point.
37b6262f 1266 Update the longest found anchored substring or the longest found
653099ff
GS
1267 floating substrings if needed. */
1268
4327152a 1269STATIC void
ea3daa5d
FC
1270S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1271 SSize_t *minlenp, int is_inf)
c277df42 1272{
e1ec3a88 1273 const STRLEN l = CHR_SVLEN(data->last_found);
2099df82 1274 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
a2ca2017 1275 const STRLEN old_l = CHR_SVLEN(longest_sv);
1de06328 1276 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1277
7918f24d
NC
1278 PERL_ARGS_ASSERT_SCAN_COMMIT;
1279
c277df42 1280 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
2099df82 1281 const U8 i = data->cur_is_floating;
a2ca2017 1282 SvSetMagicSV(longest_sv, data->last_found);
37b6262f
DM
1283 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1284
2099df82
DM
1285 if (!i) /* fixed */
1286 data->substrs[0].max_offset = data->substrs[0].min_offset;
1287 else { /* float */
1288 data->substrs[1].max_offset = (l
646e8787
DM
1289 ? data->last_start_max
1290 : (data->pos_delta > SSize_t_MAX - data->pos_min
ea3daa5d
FC
1291 ? SSize_t_MAX
1292 : data->pos_min + data->pos_delta));
1293 if (is_inf
2099df82
DM
1294 || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1295 data->substrs[1].max_offset = SSize_t_MAX;
37b6262f 1296 }
11683ecb 1297
37b6262f
DM
1298 if (data->flags & SF_BEFORE_EOL)
1299 data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
1300 else
1301 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1302 data->substrs[i].minlenp = minlenp;
1303 data->substrs[i].lookbehind = 0;
c277df42 1304 }
37b6262f 1305
c277df42 1306 SvCUR_set(data->last_found, 0);
0eda9292 1307 {
a28509cc 1308 SV * const sv = data->last_found;
097eb12c
AL
1309 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1310 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1311 if (mg)
1312 mg->mg_len = 0;
1313 }
0eda9292 1314 }
c277df42
IZ
1315 data->last_end = -1;
1316 data->flags &= ~SF_BEFORE_EOL;
f5a36d78 1317 DEBUG_STUDYDATA("commit", data, 0, is_inf);
c277df42
IZ
1318}
1319
cdd87c1d
KW
1320/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1321 * list that describes which code points it matches */
1322
653099ff 1323STATIC void
3420edd7 1324S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 1325{
cdd87c1d
KW
1326 /* Set the SSC 'ssc' to match an empty string or any code point */
1327
557bd3fb 1328 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 1329
71068078 1330 assert(is_ANYOF_SYNTHETIC(ssc));
3fffb88a 1331
0854ea0b
KW
1332 /* mortalize so won't leak */
1333 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
93e92956 1334 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
653099ff
GS
1335}
1336
653099ff 1337STATIC int
dc3bf405 1338S_ssc_is_anything(const regnode_ssc *ssc)
653099ff 1339{
c144baaa
KW
1340 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1341 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1342 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1343 * in any way, so there's no point in using it */
cdd87c1d
KW
1344
1345 UV start, end;
1346 bool ret;
653099ff 1347
557bd3fb 1348 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 1349
71068078 1350 assert(is_ANYOF_SYNTHETIC(ssc));
cdd87c1d 1351
93e92956 1352 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
cdd87c1d
KW
1353 return FALSE;
1354 }
1355
1356 /* See if the list consists solely of the range 0 - Infinity */
1357 invlist_iterinit(ssc->invlist);
1358 ret = invlist_iternext(ssc->invlist, &start, &end)
1359 && start == 0
1360 && end == UV_MAX;
1361
1362 invlist_iterfinish(ssc->invlist);
1363
1364 if (ret) {
1365 return TRUE;
1366 }
1367
1368 /* If e.g., both \w and \W are set, matches everything */
e0e1be5f 1369 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1370 int i;
1371 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1372 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1373 return TRUE;
1374 }
1375 }
1376 }
1377
1378 return FALSE;
653099ff
GS
1379}
1380
653099ff 1381STATIC void
cdd87c1d 1382S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 1383{
cdd87c1d
KW
1384 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1385 * string, any code point, or any posix class under locale */
1386
557bd3fb 1387 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 1388
557bd3fb 1389 Zero(ssc, 1, regnode_ssc);
71068078 1390 set_ANYOF_SYNTHETIC(ssc);
93e92956 1391 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
3420edd7 1392 ssc_anything(ssc);
cdd87c1d 1393
2f306ab9
KW
1394 /* If any portion of the regex is to operate under locale rules that aren't
1395 * fully known at compile time, initialization includes it. The reason
1396 * this isn't done for all regexes is that the optimizer was written under
1397 * the assumption that locale was all-or-nothing. Given the complexity and
1398 * lack of documentation in the optimizer, and that there are inadequate
1399 * test cases for locale, many parts of it may not work properly, it is
1400 * safest to avoid locale unless necessary. */
cdd87c1d
KW
1401 if (RExC_contains_locale) {
1402 ANYOF_POSIXL_SETALL(ssc);
cdd87c1d
KW
1403 }
1404 else {
1405 ANYOF_POSIXL_ZERO(ssc);
1406 }
653099ff
GS
1407}
1408
b423522f 1409STATIC int
dc3bf405
BF
1410S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1411 const regnode_ssc *ssc)
b423522f
KW
1412{
1413 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1414 * to the list of code points matched, and locale posix classes; hence does
1415 * not check its flags) */
1416
1417 UV start, end;
1418 bool ret;
1419
1420 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1421
71068078 1422 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1423
1424 invlist_iterinit(ssc->invlist);
1425 ret = invlist_iternext(ssc->invlist, &start, &end)
1426 && start == 0
1427 && end == UV_MAX;
1428
1429 invlist_iterfinish(ssc->invlist);
1430
1431 if (! ret) {
1432 return FALSE;
1433 }
1434
e0e1be5f 1435 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
31f05a37 1436 return FALSE;
b423522f
KW
1437 }
1438
1439 return TRUE;
1440}
1441
1442STATIC SV*
1443S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
5c0f85ef 1444 const regnode_charclass* const node)
b423522f
KW
1445{
1446 /* Returns a mortal inversion list defining which code points are matched
1447 * by 'node', which is of type ANYOF. Handles complementing the result if
1448 * appropriate. If some code points aren't knowable at this time, the
31f05a37
KW
1449 * returned list must, and will, contain every code point that is a
1450 * possibility. */
b423522f 1451
e2506fa7 1452 SV* invlist = NULL;
1ee208c4 1453 SV* only_utf8_locale_invlist = NULL;
b423522f
KW
1454 unsigned int i;
1455 const U32 n = ARG(node);
31f05a37 1456 bool new_node_has_latin1 = FALSE;
b423522f
KW
1457
1458 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1459
1460 /* Look at the data structure created by S_set_ANYOF_arg() */
93e92956 1461 if (n != ANYOF_ONLY_HAS_BITMAP) {
b423522f
KW
1462 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1463 AV * const av = MUTABLE_AV(SvRV(rv));
1464 SV **const ary = AvARRAY(av);
1465 assert(RExC_rxi->data->what[n] == 's');
1466
1467 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1468 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1469 }
1470 else if (ary[0] && ary[0] != &PL_sv_undef) {
1471
1472 /* Here, no compile-time swash, and there are things that won't be
1473 * known until runtime -- we have to assume it could be anything */
e2506fa7 1474 invlist = sv_2mortal(_new_invlist(1));
b423522f
KW
1475 return _add_range_to_invlist(invlist, 0, UV_MAX);
1476 }
1ee208c4 1477 else if (ary[3] && ary[3] != &PL_sv_undef) {
b423522f
KW
1478
1479 /* Here no compile-time swash, and no run-time only data. Use the
1480 * node's inversion list */
1ee208c4
KW
1481 invlist = sv_2mortal(invlist_clone(ary[3]));
1482 }
1483
1484 /* Get the code points valid only under UTF-8 locales */
037715a6 1485 if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
1ee208c4
KW
1486 && ary[2] && ary[2] != &PL_sv_undef)
1487 {
1488 only_utf8_locale_invlist = ary[2];
b423522f
KW
1489 }
1490 }
1491
e2506fa7
KW
1492 if (! invlist) {
1493 invlist = sv_2mortal(_new_invlist(0));
1494 }
1495
dcb20b36
KW
1496 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1497 * code points, and an inversion list for the others, but if there are code
1498 * points that should match only conditionally on the target string being
1499 * UTF-8, those are placed in the inversion list, and not the bitmap.
1500 * Since there are circumstances under which they could match, they are
1501 * included in the SSC. But if the ANYOF node is to be inverted, we have
1502 * to exclude them here, so that when we invert below, the end result
1503 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1504 * have to do this here before we add the unconditionally matched code
1505 * points */
b423522f
KW
1506 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1507 _invlist_intersection_complement_2nd(invlist,
1508 PL_UpperLatin1,
1509 &invlist);
1510 }
1511
1512 /* Add in the points from the bit map */
dcb20b36 1513 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
b423522f 1514 if (ANYOF_BITMAP_TEST(node, i)) {
6f8848d5
TC
1515 unsigned int start = i++;
1516
1517 for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
1518 /* empty */
1519 }
1520 invlist = _add_range_to_invlist(invlist, start, i-1);
31f05a37 1521 new_node_has_latin1 = TRUE;
b423522f
KW
1522 }
1523 }
1524
1525 /* If this can match all upper Latin1 code points, have to add them
ac33c516
KW
1526 * as well. But don't add them if inverting, as when that gets done below,
1527 * it would exclude all these characters, including the ones it shouldn't
1528 * that were added just above */
1529 if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
f240c685
KW
1530 && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1531 {
b423522f
KW
1532 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1533 }
1534
1535 /* Similarly for these */
93e92956 1536 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
e0a1ff7a 1537 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
b423522f
KW
1538 }
1539
1540 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1541 _invlist_invert(invlist);
1542 }
037715a6 1543 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
31f05a37
KW
1544
1545 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1546 * locale. We can skip this if there are no 0-255 at all. */
1547 _invlist_union(invlist, PL_Latin1, &invlist);
1548 }
1549
1ee208c4
KW
1550 /* Similarly add the UTF-8 locale possible matches. These have to be
1551 * deferred until after the non-UTF-8 locale ones are taken care of just
1552 * above, or it leads to wrong results under ANYOF_INVERT */
1553 if (only_utf8_locale_invlist) {
31f05a37 1554 _invlist_union_maybe_complement_2nd(invlist,
1ee208c4 1555 only_utf8_locale_invlist,
31f05a37
KW
1556 ANYOF_FLAGS(node) & ANYOF_INVERT,
1557 &invlist);
1558 }
b423522f
KW
1559
1560 return invlist;
1561}
1562
1051e1c4 1563/* These two functions currently do the exact same thing */
557bd3fb 1564#define ssc_init_zero ssc_init
653099ff 1565
cdd87c1d
KW
1566#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1567#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1568
557bd3fb 1569/* 'AND' a given class with another one. Can create false positives. 'ssc'
93e92956
KW
1570 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1571 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1572
653099ff 1573STATIC void
b423522f 1574S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1575 const regnode_charclass *and_with)
653099ff 1576{
cdd87c1d
KW
1577 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1578 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1579
a0dd4231
KW
1580 SV* anded_cp_list;
1581 U8 anded_flags;
1e6ade67 1582
cdd87c1d 1583 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1584
71068078 1585 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1586
1587 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1588 * the code point inversion list and just the relevant flags */
71068078 1589 if (is_ANYOF_SYNTHETIC(and_with)) {
7dcac5f6 1590 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
a0dd4231 1591 anded_flags = ANYOF_FLAGS(and_with);
e9b08962
KW
1592
1593 /* XXX This is a kludge around what appears to be deficiencies in the
1594 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1595 * there are paths through the optimizer where it doesn't get weeded
1596 * out when it should. And if we don't make some extra provision for
1597 * it like the code just below, it doesn't get added when it should.
1598 * This solution is to add it only when AND'ing, which is here, and
1599 * only when what is being AND'ed is the pristine, original node
1600 * matching anything. Thus it is like adding it to ssc_anything() but
1601 * only when the result is to be AND'ed. Probably the same solution
1602 * could be adopted for the same problem we have with /l matching,
1603 * which is solved differently in S_ssc_init(), and that would lead to
1604 * fewer false positives than that solution has. But if this solution
1605 * creates bugs, the consequences are only that a warning isn't raised
1606 * that should be; while the consequences for having /l bugs is
1607 * incorrect matches */
7dcac5f6 1608 if (ssc_is_anything((regnode_ssc *)and_with)) {
f240c685 1609 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
e9b08962 1610 }
a0dd4231
KW
1611 }
1612 else {
5c0f85ef 1613 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
f240c685
KW
1614 if (OP(and_with) == ANYOFD) {
1615 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1616 }
1617 else {
1618 anded_flags = ANYOF_FLAGS(and_with)
1619 &( ANYOF_COMMON_FLAGS
108316fb
KW
1620 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1621 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
d1c40ef5
KW
1622 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
1623 anded_flags &=
1624 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1625 }
f240c685 1626 }
a0dd4231
KW
1627 }
1628
1629 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1630
1631 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1632 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1633 * 'and_with' may be inverted. When not inverted, we have the situation of
1634 * computing:
1635 * (C1 | P1) & (C2 | P2)
1636 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1637 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1638 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1639 * <= ((C1 & C2) | P1 | P2)
1640 * Alternatively, the last few steps could be:
1641 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1642 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1643 * <= (C1 | C2 | (P1 & P2))
1644 * We favor the second approach if either P1 or P2 is non-empty. This is
1645 * because these components are a barrier to doing optimizations, as what
1646 * they match cannot be known until the moment of matching as they are
1647 * dependent on the current locale, 'AND"ing them likely will reduce or
1648 * eliminate them.
1649 * But we can do better if we know that C1,P1 are in their initial state (a
1650 * frequent occurrence), each matching everything:
1651 * (<everything>) & (C2 | P2) = C2 | P2
1652 * Similarly, if C2,P2 are in their initial state (again a frequent
1653 * occurrence), the result is a no-op
1654 * (C1 | P1) & (<everything>) = C1 | P1
1655 *
1656 * Inverted, we have
1657 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1658 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1659 * <= (C1 & ~C2) | (P1 & ~P2)
1660 * */
1aa99e6b 1661
a0dd4231 1662 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
71068078 1663 && ! is_ANYOF_SYNTHETIC(and_with))
a0dd4231 1664 {
cdd87c1d 1665 unsigned int i;
8951c461 1666
cdd87c1d
KW
1667 ssc_intersection(ssc,
1668 anded_cp_list,
1669 FALSE /* Has already been inverted */
1670 );
c6b76537 1671
cdd87c1d
KW
1672 /* If either P1 or P2 is empty, the intersection will be also; can skip
1673 * the loop */
93e92956 1674 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
cdd87c1d
KW
1675 ANYOF_POSIXL_ZERO(ssc);
1676 }
e0e1be5f 1677 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1678
1679 /* Note that the Posix class component P from 'and_with' actually
1680 * looks like:
1681 * P = Pa | Pb | ... | Pn
1682 * where each component is one posix class, such as in [\w\s].
1683 * Thus
1684 * ~P = ~(Pa | Pb | ... | Pn)
1685 * = ~Pa & ~Pb & ... & ~Pn
1686 * <= ~Pa | ~Pb | ... | ~Pn
1687 * The last is something we can easily calculate, but unfortunately
1688 * is likely to have many false positives. We could do better
1689 * in some (but certainly not all) instances if two classes in
1690 * P have known relationships. For example
1691 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1692 * So
1693 * :lower: & :print: = :lower:
1694 * And similarly for classes that must be disjoint. For example,
1695 * since \s and \w can have no elements in common based on rules in
1696 * the POSIX standard,
1697 * \w & ^\S = nothing
1698 * Unfortunately, some vendor locales do not meet the Posix
1699 * standard, in particular almost everything by Microsoft.
1700 * The loop below just changes e.g., \w into \W and vice versa */
1701
1ee208c4 1702 regnode_charclass_posixl temp;
cdd87c1d
KW
1703 int add = 1; /* To calculate the index of the complement */
1704
1705 ANYOF_POSIXL_ZERO(&temp);
1706 for (i = 0; i < ANYOF_MAX; i++) {
1707 assert(i % 2 != 0
7dcac5f6
KW
1708 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1709 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
cdd87c1d 1710
7dcac5f6 1711 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
cdd87c1d
KW
1712 ANYOF_POSIXL_SET(&temp, i + add);
1713 }
1714 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1715 }
1716 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1717
cdd87c1d
KW
1718 } /* else ssc already has no posixes */
1719 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1720 in its initial state */
71068078 1721 else if (! is_ANYOF_SYNTHETIC(and_with)
7dcac5f6 1722 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
cdd87c1d
KW
1723 {
1724 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1725 * copy it over 'ssc' */
1726 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
71068078 1727 if (is_ANYOF_SYNTHETIC(and_with)) {
cdd87c1d
KW
1728 StructCopy(and_with, ssc, regnode_ssc);
1729 }
1730 else {
1731 ssc->invlist = anded_cp_list;
1732 ANYOF_POSIXL_ZERO(ssc);
93e92956 1733 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1734 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
cdd87c1d
KW
1735 }
1736 }
1737 }
e0e1be5f 1738 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
93e92956 1739 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
cdd87c1d
KW
1740 {
1741 /* One or the other of P1, P2 is non-empty. */
93e92956 1742 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1ea8b7fe
KW
1743 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1744 }
cdd87c1d
KW
1745 ssc_union(ssc, anded_cp_list, FALSE);
1746 }
1747 else { /* P1 = P2 = empty */
1748 ssc_intersection(ssc, anded_cp_list, FALSE);
1749 }
137165a6 1750 }
653099ff
GS
1751}
1752
653099ff 1753STATIC void
cdd87c1d 1754S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1755 const regnode_charclass *or_with)
653099ff 1756{
cdd87c1d
KW
1757 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1758 * another SSC or a regular ANYOF class. Can create false positives if
1759 * 'or_with' is to be inverted. */
7918f24d 1760
a0dd4231
KW
1761 SV* ored_cp_list;
1762 U8 ored_flags;
c6b76537 1763
cdd87c1d 1764 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 1765
71068078 1766 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1767
1768 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1769 * the code point inversion list and just the relevant flags */
71068078 1770 if (is_ANYOF_SYNTHETIC(or_with)) {
7dcac5f6 1771 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
a0dd4231
KW
1772 ored_flags = ANYOF_FLAGS(or_with);
1773 }
1774 else {
5c0f85ef 1775 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
eff8b7dc 1776 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
f240c685
KW
1777 if (OP(or_with) != ANYOFD) {
1778 ored_flags
1779 |= ANYOF_FLAGS(or_with)
108316fb
KW
1780 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1781 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
d1c40ef5
KW
1782 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
1783 ored_flags |=
1784 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1785 }
f240c685 1786 }
a0dd4231
KW
1787 }
1788
1789 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
1790
1791 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1792 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1793 * 'or_with' may be inverted. When not inverted, we have the simple
1794 * situation of computing:
1795 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1796 * If P1|P2 yields a situation with both a class and its complement are
1797 * set, like having both \w and \W, this matches all code points, and we
1798 * can delete these from the P component of the ssc going forward. XXX We
1799 * might be able to delete all the P components, but I (khw) am not certain
1800 * about this, and it is better to be safe.
1801 *
1802 * Inverted, we have
1803 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1804 * <= (C1 | P1) | ~C2
1805 * <= (C1 | ~C2) | P1
1806 * (which results in actually simpler code than the non-inverted case)
1807 * */
9826f543 1808
a0dd4231 1809 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
71068078 1810 && ! is_ANYOF_SYNTHETIC(or_with))
a0dd4231 1811 {
cdd87c1d 1812 /* We ignore P2, leaving P1 going forward */
1ea8b7fe 1813 } /* else Not inverted */
93e92956 1814 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1815 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
e0e1be5f 1816 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1817 unsigned int i;
1818 for (i = 0; i < ANYOF_MAX; i += 2) {
1819 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1820 {
1821 ssc_match_all_cp(ssc);
1822 ANYOF_POSIXL_CLEAR(ssc, i);
1823 ANYOF_POSIXL_CLEAR(ssc, i+1);
cdd87c1d
KW
1824 }
1825 }
1826 }
1aa99e6b 1827 }
cdd87c1d
KW
1828
1829 ssc_union(ssc,
1830 ored_cp_list,
1831 FALSE /* Already has been inverted */
1832 );
653099ff
GS
1833}
1834
b423522f
KW
1835PERL_STATIC_INLINE void
1836S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1837{
1838 PERL_ARGS_ASSERT_SSC_UNION;
1839
71068078 1840 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1841
1842 _invlist_union_maybe_complement_2nd(ssc->invlist,
1843 invlist,
1844 invert2nd,
1845 &ssc->invlist);
1846}
1847
1848PERL_STATIC_INLINE void
1849S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1850 SV* const invlist,
1851 const bool invert2nd)
1852{
1853 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1854
71068078 1855 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1856
1857 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1858 invlist,
1859 invert2nd,
1860 &ssc->invlist);
1861}
1862
1863PERL_STATIC_INLINE void
1864S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1865{
1866 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1867
71068078 1868 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1869
1870 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1871}
1872
1873PERL_STATIC_INLINE void
1874S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1875{
1876 /* AND just the single code point 'cp' into the SSC 'ssc' */
1877
1878 SV* cp_list = _new_invlist(2);
1879
1880 PERL_ARGS_ASSERT_SSC_CP_AND;
1881
71068078 1882 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1883
1884 cp_list = add_cp_to_invlist(cp_list, cp);
1885 ssc_intersection(ssc, cp_list,
1886 FALSE /* Not inverted */
1887 );
1888 SvREFCNT_dec_NN(cp_list);
1889}
1890
1891PERL_STATIC_INLINE void
dc3bf405 1892S_ssc_clear_locale(regnode_ssc *ssc)
b423522f
KW
1893{
1894 /* Set the SSC 'ssc' to not match any locale things */
b423522f
KW
1895 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1896
71068078 1897 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1898
1899 ANYOF_POSIXL_ZERO(ssc);
1900 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1901}
1902
b35552de
KW
1903#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1904
1905STATIC bool
1906S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1907{
1908 /* The synthetic start class is used to hopefully quickly winnow down
1909 * places where a pattern could start a match in the target string. If it
1910 * doesn't really narrow things down that much, there isn't much point to
1911 * having the overhead of using it. This function uses some very crude
1912 * heuristics to decide if to use the ssc or not.
1913 *
1914 * It returns TRUE if 'ssc' rules out more than half what it considers to
1915 * be the "likely" possible matches, but of course it doesn't know what the
1916 * actual things being matched are going to be; these are only guesses
1917 *
1918 * For /l matches, it assumes that the only likely matches are going to be
1919 * in the 0-255 range, uniformly distributed, so half of that is 127
1920 * For /a and /d matches, it assumes that the likely matches will be just
1921 * the ASCII range, so half of that is 63
1922 * For /u and there isn't anything matching above the Latin1 range, it
1923 * assumes that that is the only range likely to be matched, and uses
1924 * half that as the cut-off: 127. If anything matches above Latin1,
1925 * it assumes that all of Unicode could match (uniformly), except for
1926 * non-Unicode code points and things in the General Category "Other"
1927 * (unassigned, private use, surrogates, controls and formats). This
1928 * is a much large number. */
1929
b35552de
KW
1930 U32 count = 0; /* Running total of number of code points matched by
1931 'ssc' */
1932 UV start, end; /* Start and end points of current range in inversion
1933 list */
72400949
KW
1934 const U32 max_code_points = (LOC)
1935 ? 256
1936 : (( ! UNI_SEMANTICS
1937 || invlist_highest(ssc->invlist) < 256)
1938 ? 128
1939 : NON_OTHER_COUNT);
1940 const U32 max_match = max_code_points / 2;
b35552de
KW
1941
1942 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1943
1944 invlist_iterinit(ssc->invlist);
1945 while (invlist_iternext(ssc->invlist, &start, &end)) {
72400949
KW
1946 if (start >= max_code_points) {
1947 break;
b35552de 1948 }
72400949 1949 end = MIN(end, max_code_points - 1);
b35552de 1950 count += end - start + 1;
72400949 1951 if (count >= max_match) {
b35552de
KW
1952 invlist_iterfinish(ssc->invlist);
1953 return FALSE;
1954 }
1955 }
1956
1957 return TRUE;
1958}
1959
1960
b423522f
KW
1961STATIC void
1962S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1963{
1964 /* The inversion list in the SSC is marked mortal; now we need a more
1965 * permanent copy, which is stored the same way that is done in a regular
dcb20b36
KW
1966 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1967 * map */
b423522f
KW
1968
1969 SV* invlist = invlist_clone(ssc->invlist);
1970
1971 PERL_ARGS_ASSERT_SSC_FINALIZE;
1972
71068078 1973 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f 1974
a0dd4231 1975 /* The code in this file assumes that all but these flags aren't relevant
93e92956
KW
1976 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1977 * by the time we reach here */
f240c685
KW
1978 assert(! (ANYOF_FLAGS(ssc)
1979 & ~( ANYOF_COMMON_FLAGS
108316fb
KW
1980 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1981 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
a0dd4231 1982
b423522f
KW
1983 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1984
1ee208c4
KW
1985 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1986 NULL, NULL, NULL, FALSE);
b423522f 1987
85c8e306
KW
1988 /* Make sure is clone-safe */
1989 ssc->invlist = NULL;
1990
e0e1be5f 1991 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
93e92956 1992 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
e0e1be5f 1993 }
1462525b 1994
b2e90ddf
KW
1995 if (RExC_contains_locale) {
1996 OP(ssc) = ANYOFL;
1997 }
1998
1462525b 1999 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
b423522f
KW
2000}
2001
a3621e74
YO
2002#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2003#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2004#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
538e84ed
KW
2005#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2006 ? (TRIE_LIST_CUR( idx ) - 1) \
2007 : 0 )
a3621e74 2008
3dab1dad
YO
2009
2010#ifdef DEBUGGING
07be1b83 2011/*
2b8b4781
NC
2012 dump_trie(trie,widecharmap,revcharmap)
2013 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2014 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
2015
2016 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
2017 The _interim_ variants are used for debugging the interim
2018 tables that are used to generate the final compressed
2019 representation which is what dump_trie expects.
2020
486ec47a 2021 Part of the reason for their existence is to provide a form
3dab1dad 2022 of documentation as to how the different representations function.
07be1b83
YO
2023
2024*/
3dab1dad
YO
2025
2026/*
3dab1dad
YO
2027 Dumps the final compressed table form of the trie to Perl_debug_log.
2028 Used for debugging make_trie().
2029*/
b9a59e08 2030
3dab1dad 2031STATIC void
2b8b4781
NC
2032S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2033 AV *revcharmap, U32 depth)
3dab1dad
YO
2034{
2035 U32 state;
ab3bbdeb 2036 SV *sv=sv_newmortal();
55eed653 2037 int colwidth= widecharmap ? 6 : 4;
2e64971a 2038 U16 word;
3dab1dad
YO
2039 GET_RE_DEBUG_FLAGS_DECL;
2040
7918f24d 2041 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 2042
6ad9a8ab 2043 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
1e37780e 2044 depth+1, "Match","Base","Ofs" );
3dab1dad
YO
2045
2046 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 2047 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 2048 if ( tmp ) {
6ad9a8ab 2049 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2050 colwidth,
538e84ed 2051 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2052 PL_colors[0], PL_colors[1],
2053 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2054 PERL_PV_ESCAPE_FIRSTCHAR
2055 )
ab3bbdeb 2056 );
3dab1dad
YO
2057 }
2058 }
1e37780e
YO
2059 Perl_re_printf( aTHX_ "\n");
2060 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
3dab1dad
YO
2061
2062 for( state = 0 ; state < trie->uniquecharcount ; state++ )
6ad9a8ab
YO
2063 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2064 Perl_re_printf( aTHX_ "\n");
3dab1dad 2065
1e2e3d02 2066 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 2067 const U32 base = trie->states[ state ].trans.base;
3dab1dad 2068
147e3846 2069 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
3dab1dad
YO
2070
2071 if ( trie->states[ state ].wordnum ) {
1e37780e 2072 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
3dab1dad 2073 } else {
6ad9a8ab 2074 Perl_re_printf( aTHX_ "%6s", "" );
3dab1dad
YO
2075 }
2076
147e3846 2077 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
3dab1dad
YO
2078
2079 if ( base ) {
2080 U32 ofs = 0;
2081
2082 while( ( base + ofs < trie->uniquecharcount ) ||
2083 ( base + ofs - trie->uniquecharcount < trie->lasttrans
538e84ed
KW
2084 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2085 != state))
3dab1dad
YO
2086 ofs++;
2087
147e3846 2088 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
3dab1dad
YO
2089
2090 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
538e84ed
KW
2091 if ( ( base + ofs >= trie->uniquecharcount )
2092 && ( base + ofs - trie->uniquecharcount
2093 < trie->lasttrans )
2094 && trie->trans[ base + ofs
2095 - trie->uniquecharcount ].check == state )
3dab1dad 2096 {
147e3846 2097 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
1e37780e
YO
2098 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2099 );
3dab1dad 2100 } else {
6ad9a8ab 2101 Perl_re_printf( aTHX_ "%*s",colwidth," ." );
3dab1dad
YO
2102 }
2103 }
2104
6ad9a8ab 2105 Perl_re_printf( aTHX_ "]");
3dab1dad
YO
2106
2107 }
6ad9a8ab 2108 Perl_re_printf( aTHX_ "\n" );
3dab1dad 2109 }
6ad9a8ab 2110 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
cb41e5d6 2111 depth);
2e64971a 2112 for (word=1; word <= trie->wordcount; word++) {
6ad9a8ab 2113 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2e64971a
DM
2114 (int)word, (int)(trie->wordinfo[word].prev),
2115 (int)(trie->wordinfo[word].len));
2116 }
6ad9a8ab 2117 Perl_re_printf( aTHX_ "\n" );
538e84ed 2118}
3dab1dad 2119/*
3dab1dad 2120 Dumps a fully constructed but uncompressed trie in list form.
538e84ed 2121 List tries normally only are used for construction when the number of
3dab1dad
YO
2122 possible chars (trie->uniquecharcount) is very high.
2123 Used for debugging make_trie().
2124*/
2125STATIC void
55eed653 2126S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2127 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2128 U32 depth)
3dab1dad
YO
2129{
2130 U32 state;
ab3bbdeb 2131 SV *sv=sv_newmortal();
55eed653 2132 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2133 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2134
2135 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2136
3dab1dad 2137 /* print out the table precompression. */
6ad9a8ab 2138 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
cb41e5d6 2139 depth+1 );
6ad9a8ab 2140 Perl_re_indentf( aTHX_ "%s",
cb41e5d6 2141 depth+1, "------:-----+-----------------\n" );
538e84ed 2142
3dab1dad
YO
2143 for( state=1 ; state < next_alloc ; state ++ ) {
2144 U16 charid;
538e84ed 2145
147e3846 2146 Perl_re_indentf( aTHX_ " %4" UVXf " :",
cb41e5d6 2147 depth+1, (UV)state );
3dab1dad 2148 if ( ! trie->states[ state ].wordnum ) {
6ad9a8ab 2149 Perl_re_printf( aTHX_ "%5s| ","");
3dab1dad 2150 } else {
6ad9a8ab 2151 Perl_re_printf( aTHX_ "W%4x| ",
3dab1dad
YO
2152 trie->states[ state ].wordnum
2153 );
2154 }
2155 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
538e84ed
KW
2156 SV ** const tmp = av_fetch( revcharmap,
2157 TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb 2158 if ( tmp ) {
147e3846 2159 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
ab3bbdeb 2160 colwidth,
538e84ed
KW
2161 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2162 colwidth,
2163 PL_colors[0], PL_colors[1],
2164 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2165 | PERL_PV_ESCAPE_FIRSTCHAR
ab3bbdeb 2166 ) ,
1e2e3d02
YO
2167 TRIE_LIST_ITEM(state,charid).forid,
2168 (UV)TRIE_LIST_ITEM(state,charid).newstate
2169 );
538e84ed 2170 if (!(charid % 10))
6ad9a8ab 2171 Perl_re_printf( aTHX_ "\n%*s| ",
664e119d 2172 (int)((depth * 2) + 14), "");
1e2e3d02 2173 }
ab3bbdeb 2174 }
6ad9a8ab 2175 Perl_re_printf( aTHX_ "\n");
3dab1dad 2176 }
538e84ed 2177}
3dab1dad
YO
2178
2179/*
3dab1dad 2180 Dumps a fully constructed but uncompressed trie in table form.
538e84ed
KW
2181 This is the normal DFA style state transition table, with a few
2182 twists to facilitate compression later.
3dab1dad
YO
2183 Used for debugging make_trie().
2184*/
2185STATIC void
55eed653 2186S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2187 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2188 U32 depth)
3dab1dad
YO
2189{
2190 U32 state;
2191 U16 charid;
ab3bbdeb 2192 SV *sv=sv_newmortal();
55eed653 2193 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2194 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2195
2196 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
538e84ed 2197
3dab1dad
YO
2198 /*
2199 print out the table precompression so that we can do a visual check
2200 that they are identical.
2201 */
538e84ed 2202
6ad9a8ab 2203 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
3dab1dad
YO
2204
2205 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 2206 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 2207 if ( tmp ) {
6ad9a8ab 2208 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2209 colwidth,
538e84ed 2210 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2211 PL_colors[0], PL_colors[1],
2212 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2213 PERL_PV_ESCAPE_FIRSTCHAR
2214 )
ab3bbdeb 2215 );
3dab1dad
YO
2216 }
2217 }
2218
4aaafc03
YO
2219 Perl_re_printf( aTHX_ "\n");
2220 Perl_re_indentf( aTHX_ "State+-", depth+1 );
3dab1dad
YO
2221
2222 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
6ad9a8ab 2223 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
3dab1dad
YO
2224 }
2225
6ad9a8ab 2226 Perl_re_printf( aTHX_ "\n" );
3dab1dad
YO
2227
2228 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2229
147e3846 2230 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
cb41e5d6 2231 depth+1,
3dab1dad
YO
2232 (UV)TRIE_NODENUM( state ) );
2233
2234 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
2235 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2236 if (v)
147e3846 2237 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
ab3bbdeb 2238 else
6ad9a8ab 2239 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
3dab1dad
YO
2240 }
2241 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
147e3846 2242 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
538e84ed 2243 (UV)trie->trans[ state ].check );
3dab1dad 2244 } else {
147e3846 2245 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
538e84ed 2246 (UV)trie->trans[ state ].check,
3dab1dad
YO
2247 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2248 }
2249 }
07be1b83 2250}
3dab1dad
YO
2251
2252#endif
2253
2e64971a 2254
786e8c11
YO
2255/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2256 startbranch: the first branch in the whole branch sequence
2257 first : start branch of sequence of branch-exact nodes.
2258 May be the same as startbranch
2259 last : Thing following the last branch.
2260 May be the same as tail.
2261 tail : item following the branch sequence
2262 count : words in the sequence
a4525e78 2263 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
786e8c11 2264 depth : indent depth
3dab1dad 2265
786e8c11 2266Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 2267
786e8c11
YO
2268A trie is an N'ary tree where the branches are determined by digital
2269decomposition of the key. IE, at the root node you look up the 1st character and
2270follow that branch repeat until you find the end of the branches. Nodes can be
2271marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 2272
786e8c11 2273 /he|she|his|hers/
72f13be8 2274
786e8c11
YO
2275would convert into the following structure. Numbers represent states, letters
2276following numbers represent valid transitions on the letter from that state, if
2277the number is in square brackets it represents an accepting state, otherwise it
2278will be in parenthesis.
07be1b83 2279
786e8c11
YO
2280 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2281 | |
2282 | (2)
2283 | |
2284 (1) +-i->(6)-+-s->[7]
2285 |
2286 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 2287
786e8c11
YO
2288 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2289
2290This shows that when matching against the string 'hers' we will begin at state 1
2291read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2292then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2293is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2294single traverse. We store a mapping from accepting to state to which word was
2295matched, and then when we have multiple possibilities we try to complete the
b8fda935 2296rest of the regex in the order in which they occurred in the alternation.
786e8c11
YO
2297
2298The only prior NFA like behaviour that would be changed by the TRIE support is
2299the silent ignoring of duplicate alternations which are of the form:
2300
2301 / (DUPE|DUPE) X? (?{ ... }) Y /x
2302
4b714af6 2303Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 2304and without the optimisation. With the optimisations dupes will be silently
486ec47a 2305ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
2306the following demonstrates:
2307
2308 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2309
2310which prints out 'word' three times, but
2311
2312 'words'=~/(word|word|word)(?{ print $1 })S/
2313
2314which doesnt print it out at all. This is due to other optimisations kicking in.
2315
2316Example of what happens on a structural level:
2317
486ec47a 2318The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
2319
2320 1: CURLYM[1] {1,32767}(18)
2321 5: BRANCH(8)
2322 6: EXACT <ac>(16)
2323 8: BRANCH(11)
2324 9: EXACT <ad>(16)
2325 11: BRANCH(14)
2326 12: EXACT <ab>(16)
2327 16: SUCCEED(0)
2328 17: NOTHING(18)
2329 18: END(0)
2330
2331This would be optimizable with startbranch=5, first=5, last=16, tail=16
2332and should turn into:
2333
2334 1: CURLYM[1] {1,32767}(18)
2335 5: TRIE(16)
2336 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2337 <ac>
2338 <ad>
2339 <ab>
2340 16: SUCCEED(0)
2341 17: NOTHING(18)
2342 18: END(0)
2343
2344Cases where tail != last would be like /(?foo|bar)baz/:
2345
2346 1: BRANCH(4)
2347 2: EXACT <foo>(8)
2348 4: BRANCH(7)
2349 5: EXACT <bar>(8)
2350 7: TAIL(8)
2351 8: EXACT <baz>(10)
2352 10: END(0)
2353
2354which would be optimizable with startbranch=1, first=1, last=7, tail=8
2355and would end up looking like:
2356
2357 1: TRIE(8)
2358 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2359 <foo>
2360 <bar>
2361 7: TAIL(8)
2362 8: EXACT <baz>(10)
2363 10: END(0)
2364
c80e42f3 2365 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
2366
2367is the recommended Unicode-aware way of saying
2368
2369 *(d++) = uv;
2370*/
2371
fab2782b 2372#define TRIE_STORE_REVCHAR(val) \
786e8c11 2373 STMT_START { \
73031816 2374 if (UTF) { \
668fcfea 2375 SV *zlopp = newSV(UTF8_MAXBYTES); \
88c9ea1e 2376 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
c80e42f3 2377 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
73031816
NC
2378 SvCUR_set(zlopp, kapow - flrbbbbb); \
2379 SvPOK_on(zlopp); \
2380 SvUTF8_on(zlopp); \
2381 av_push(revcharmap, zlopp); \
2382 } else { \
fab2782b 2383 char ooooff = (char)val; \
73031816
NC
2384 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2385 } \
2386 } STMT_END
786e8c11 2387
914a25d5
KW
2388/* This gets the next character from the input, folding it if not already
2389 * folded. */
2390#define TRIE_READ_CHAR STMT_START { \
2391 wordlen++; \
2392 if ( UTF ) { \
2393 /* if it is UTF then it is either already folded, or does not need \
2394 * folding */ \
1c1d615a 2395 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
2396 } \
2397 else if (folder == PL_fold_latin1) { \
7d006b13
KW
2398 /* This folder implies Unicode rules, which in the range expressible \
2399 * by not UTF is the lower case, with the two exceptions, one of \
2400 * which should have been taken care of before calling this */ \
2401 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2402 uvc = toLOWER_L1(*uc); \
2403 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2404 len = 1; \
914a25d5
KW
2405 } else { \
2406 /* raw data, will be folded later if needed */ \
2407 uvc = (U32)*uc; \
2408 len = 1; \
2409 } \
786e8c11
YO
2410} STMT_END
2411
2412
2413
2414#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2415 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
00195859 2416 U32 ging = TRIE_LIST_LEN( state ) * 2; \
f9003953 2417 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
00195859 2418 TRIE_LIST_LEN( state ) = ging; \
786e8c11
YO
2419 } \
2420 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2421 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2422 TRIE_LIST_CUR( state )++; \
2423} STMT_END
07be1b83 2424
786e8c11
YO
2425#define TRIE_LIST_NEW(state) STMT_START { \
2426 Newxz( trie->states[ state ].trans.list, \
2427 4, reg_trie_trans_le ); \
2428 TRIE_LIST_CUR( state ) = 1; \
2429 TRIE_LIST_LEN( state ) = 4; \
2430} STMT_END
07be1b83 2431
786e8c11
YO
2432#define TRIE_HANDLE_WORD(state) STMT_START { \
2433 U16 dupe= trie->states[ state ].wordnum; \
2434 regnode * const noper_next = regnext( noper ); \
2435 \
786e8c11
YO
2436 DEBUG_r({ \
2437 /* store the word for dumping */ \
2438 SV* tmp; \
2439 if (OP(noper) != NOTHING) \
740cce10 2440 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 2441 else \
740cce10 2442 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 2443 av_push( trie_words, tmp ); \
786e8c11
YO
2444 }); \
2445 \
2446 curword++; \
2e64971a
DM
2447 trie->wordinfo[curword].prev = 0; \
2448 trie->wordinfo[curword].len = wordlen; \
2449 trie->wordinfo[curword].accept = state; \
786e8c11
YO
2450 \
2451 if ( noper_next < tail ) { \
2452 if (!trie->jump) \
538e84ed
KW
2453 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2454 sizeof(U16) ); \
7f69552c 2455 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
2456 if (!jumper) \
2457 jumper = noper_next; \
2458 if (!nextbranch) \
2459 nextbranch= regnext(cur); \
2460 } \
2461 \
2462 if ( dupe ) { \
2e64971a
DM
2463 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2464 /* chain, so that when the bits of chain are later */\
2465 /* linked together, the dups appear in the chain */\
2466 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2467 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
2468 } else { \
2469 /* we haven't inserted this word yet. */ \
2470 trie->states[ state ].wordnum = curword; \
2471 } \
2472} STMT_END
07be1b83 2473
3dab1dad 2474
786e8c11
YO
2475#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2476 ( ( base + charid >= ucharcount \
2477 && base + charid < ubound \
2478 && state == trie->trans[ base - ucharcount + charid ].check \
2479 && trie->trans[ base - ucharcount + charid ].next ) \
2480 ? trie->trans[ base - ucharcount + charid ].next \
2481 : ( state==1 ? special : 0 ) \
2482 )
3dab1dad 2483
8bcafbf4
YO
2484#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2485STMT_START { \
2486 TRIE_BITMAP_SET(trie, uvc); \
2487 /* store the folded codepoint */ \
2488 if ( folder ) \
2489 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2490 \
2491 if ( !UTF ) { \
2492 /* store first byte of utf8 representation of */ \
2493 /* variant codepoints */ \
2494 if (! UVCHR_IS_INVARIANT(uvc)) { \
2495 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2496 } \
2497 } \
2498} STMT_END
786e8c11
YO
2499#define MADE_TRIE 1
2500#define MADE_JUMP_TRIE 2
2501#define MADE_EXACT_TRIE 4
3dab1dad 2502
a3621e74 2503STATIC I32
538e84ed
KW
2504S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2505 regnode *first, regnode *last, regnode *tail,
2506 U32 word_count, U32 flags, U32 depth)
a3621e74
YO
2507{
2508 /* first pass, loop through and scan words */
2509 reg_trie_data *trie;
55eed653 2510 HV *widecharmap = NULL;
2b8b4781 2511 AV *revcharmap = newAV();
a3621e74 2512 regnode *cur;
a3621e74
YO
2513 STRLEN len = 0;
2514 UV uvc = 0;
2515 U16 curword = 0;
2516 U32 next_alloc = 0;
786e8c11
YO
2517 regnode *jumper = NULL;
2518 regnode *nextbranch = NULL;
7f69552c 2519 regnode *convert = NULL;
2e64971a 2520 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 2521 /* we just use folder as a flag in utf8 */
1e696034 2522 const U8 * folder = NULL;
a3621e74 2523
3a611511
YO
2524 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2525 * which stands for one trie structure, one hash, optionally followed
2526 * by two arrays */
2b8b4781 2527#ifdef DEBUGGING
3a611511 2528 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2b8b4781
NC
2529 AV *trie_words = NULL;
2530 /* along with revcharmap, this only used during construction but both are
2531 * useful during debugging so we store them in the struct when debugging.
8e11feef 2532 */
2b8b4781 2533#else
cf78de0b 2534 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 2535 STRLEN trie_charcount=0;
3dab1dad 2536#endif
2b8b4781 2537 SV *re_trie_maxbuff;
a3621e74 2538 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2539
2540 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
2541#ifndef DEBUGGING
2542 PERL_UNUSED_ARG(depth);
2543#endif
a3621e74 2544
1e696034 2545 switch (flags) {
a4525e78 2546 case EXACT: case EXACTL: break;
2f7f8cb1 2547 case EXACTFA:
fab2782b 2548 case EXACTFU_SS:
a4525e78
KW
2549 case EXACTFU:
2550 case EXACTFLU8: folder = PL_fold_latin1; break;
1e696034 2551 case EXACTF: folder = PL_fold; break;
fab2782b 2552 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
2553 }
2554
c944940b 2555 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 2556 trie->refcount = 1;
3dab1dad 2557 trie->startstate = 1;
786e8c11 2558 trie->wordcount = word_count;
f8fc2ecf 2559 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 2560 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
a4525e78 2561 if (flags == EXACT || flags == EXACTL)
c944940b 2562 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
2563 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2564 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2565
a3621e74 2566 DEBUG_r({
2b8b4781 2567 trie_words = newAV();
a3621e74 2568 });
a3621e74 2569
0111c4fd 2570 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
316ebaf2 2571 assert(re_trie_maxbuff);
a3621e74 2572 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 2573 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 2574 }
df826430 2575 DEBUG_TRIE_COMPILE_r({
6ad9a8ab 2576 Perl_re_indentf( aTHX_
cb41e5d6
YO
2577 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2578 depth+1,
538e84ed
KW
2579 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2580 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
3dab1dad 2581 });
538e84ed 2582
7f69552c
YO
2583 /* Find the node we are going to overwrite */
2584 if ( first == startbranch && OP( last ) != BRANCH ) {
2585 /* whole branch chain */
2586 convert = first;
2587 } else {
2588 /* branch sub-chain */
2589 convert = NEXTOPER( first );
2590 }
538e84ed 2591
a3621e74
YO
2592 /* -- First loop and Setup --
2593
2594 We first traverse the branches and scan each word to determine if it
2595 contains widechars, and how many unique chars there are, this is
2596 important as we have to build a table with at least as many columns as we
2597 have unique chars.
2598
2599 We use an array of integers to represent the character codes 0..255
538e84ed
KW
2600 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2601 the native representation of the character value as the key and IV's for
2602 the coded index.
a3621e74
YO
2603
2604 *TODO* If we keep track of how many times each character is used we can
2605 remap the columns so that the table compression later on is more
3b753521 2606 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
2607 middle and the least common are on the outside. IMO this would be better
2608 than a most to least common mapping as theres a decent chance the most
2609 common letter will share a node with the least common, meaning the node
486ec47a 2610 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
2611 case is when we have the least common nodes twice.
2612
2613 */
2614
a3621e74 2615 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 2616 regnode *noper = NEXTOPER( cur );
944e05e3
YO
2617 const U8 *uc;
2618 const U8 *e;
bc031a7d 2619 int foldlen = 0;
07be1b83 2620 U32 wordlen = 0; /* required init */
bc031a7d
KW
2621 STRLEN minchars = 0;
2622 STRLEN maxchars = 0;
538e84ed
KW
2623 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2624 bitmap?*/
a3621e74 2625
3dab1dad 2626 if (OP(noper) == NOTHING) {
20ed8c88
YO
2627 /* skip past a NOTHING at the start of an alternation
2628 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2629 */
df826430 2630 regnode *noper_next= regnext(noper);
944e05e3
YO
2631 if (noper_next < tail)
2632 noper= noper_next;
2633 }
2634
dca5fc4c
YO
2635 if ( noper < tail &&
2636 (
2637 OP(noper) == flags ||
2638 (
2639 flags == EXACTFU &&
2640 OP(noper) == EXACTFU_SS
2641 )
2642 )
2643 ) {
944e05e3
YO
2644 uc= (U8*)STRING(noper);
2645 e= uc + STR_LEN(noper);
2646 } else {
2647 trie->minlen= 0;
2648 continue;
3dab1dad 2649 }
df826430 2650
944e05e3 2651
fab2782b 2652 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2653 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2654 regardless of encoding */
fab2782b
YO
2655 if (OP( noper ) == EXACTFU_SS) {
2656 /* false positives are ok, so just set this */
0dc4a61d 2657 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2658 }
2659 }
dca5fc4c 2660
bc031a7d
KW
2661 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2662 branch */
3dab1dad 2663 TRIE_CHARCOUNT(trie)++;
a3621e74 2664 TRIE_READ_CHAR;
645de4ce 2665
bc031a7d
KW
2666 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2667 * is in effect. Under /i, this character can match itself, or
2668 * anything that folds to it. If not under /i, it can match just
2669 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2670 * all fold to k, and all are single characters. But some folds
2671 * expand to more than one character, so for example LATIN SMALL
2672 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2673 * the string beginning at 'uc' is 'ffi', it could be matched by
2674 * three characters, or just by the one ligature character. (It
2675 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2676 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2677 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2678 * match.) The trie needs to know the minimum and maximum number
2679 * of characters that could match so that it can use size alone to
2680 * quickly reject many match attempts. The max is simple: it is
2681 * the number of folded characters in this branch (since a fold is
2682 * never shorter than what folds to it. */
2683
2684 maxchars++;
2685
2686 /* And the min is equal to the max if not under /i (indicated by
2687 * 'folder' being NULL), or there are no multi-character folds. If
2688 * there is a multi-character fold, the min is incremented just
2689 * once, for the character that folds to the sequence. Each
2690 * character in the sequence needs to be added to the list below of
2691 * characters in the trie, but we count only the first towards the
2692 * min number of characters needed. This is done through the
2693 * variable 'foldlen', which is returned by the macros that look
2694 * for these sequences as the number of bytes the sequence
2695 * occupies. Each time through the loop, we decrement 'foldlen' by
2696 * how many bytes the current char occupies. Only when it reaches
2697 * 0 do we increment 'minchars' or look for another multi-character
2698 * sequence. */
2699 if (folder == NULL) {
2700 minchars++;
2701 }
2702 else if (foldlen > 0) {
2703 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
645de4ce
KW
2704 }
2705 else {
bc031a7d
KW
2706 minchars++;
2707
2708 /* See if *uc is the beginning of a multi-character fold. If
2709 * so, we decrement the length remaining to look at, to account
2710 * for the current character this iteration. (We can use 'uc'
2711 * instead of the fold returned by TRIE_READ_CHAR because for
2712 * non-UTF, the latin1_safe macro is smart enough to account
2713 * for all the unfolded characters, and because for UTF, the
2714 * string will already have been folded earlier in the
2715 * compilation process */
2716 if (UTF) {
2717 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2718 foldlen -= UTF8SKIP(uc);
645de4ce
KW
2719 }
2720 }
bc031a7d
KW
2721 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2722 foldlen--;
2723 }
645de4ce 2724 }
bc031a7d
KW
2725
2726 /* The current character (and any potential folds) should be added
2727 * to the possible matching characters for this position in this
2728 * branch */
a3621e74 2729 if ( uvc < 256 ) {
fab2782b
YO
2730 if ( folder ) {
2731 U8 folded= folder[ (U8) uvc ];
2732 if ( !trie->charmap[ folded ] ) {
2733 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2734 TRIE_STORE_REVCHAR( folded );
2735 }
2736 }
a3621e74
YO
2737 if ( !trie->charmap[ uvc ] ) {
2738 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2739 TRIE_STORE_REVCHAR( uvc );
a3621e74 2740 }
02daf0ab 2741 if ( set_bit ) {
62012aee
KW
2742 /* store the codepoint in the bitmap, and its folded
2743 * equivalent. */
8bcafbf4 2744 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
02daf0ab
YO
2745 set_bit = 0; /* We've done our bit :-) */
2746 }
a3621e74 2747 } else {
bc031a7d
KW
2748
2749 /* XXX We could come up with the list of code points that fold
2750 * to this using PL_utf8_foldclosures, except not for
2751 * multi-char folds, as there may be multiple combinations
2752 * there that could work, which needs to wait until runtime to
2753 * resolve (The comment about LIGATURE FFI above is such an
2754 * example */
2755
a3621e74 2756 SV** svpp;
55eed653
NC
2757 if ( !widecharmap )
2758 widecharmap = newHV();
a3621e74 2759
55eed653 2760 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
2761
2762 if ( !svpp )
147e3846 2763 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
a3621e74
YO
2764
2765 if ( !SvTRUE( *svpp ) ) {
2766 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 2767 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
2768 }
2769 }
bc031a7d
KW
2770 } /* end loop through characters in this branch of the trie */
2771
2772 /* We take the min and max for this branch and combine to find the min
2773 * and max for all branches processed so far */
3dab1dad 2774 if( cur == first ) {
bc031a7d
KW
2775 trie->minlen = minchars;
2776 trie->maxlen = maxchars;
2777 } else if (minchars < trie->minlen) {
2778 trie->minlen = minchars;
2779 } else if (maxchars > trie->maxlen) {
2780 trie->maxlen = maxchars;
fab2782b 2781 }
a3621e74
YO
2782 } /* end first pass */
2783 DEBUG_TRIE_COMPILE_r(
6ad9a8ab 2784 Perl_re_indentf( aTHX_
cb41e5d6
YO
2785 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2786 depth+1,
55eed653 2787 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
2788 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2789 (int)trie->minlen, (int)trie->maxlen )
a3621e74 2790 );
a3621e74
YO
2791
2792 /*
2793 We now know what we are dealing with in terms of unique chars and
2794 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
2795 representation using a flat table will take. If it's over a reasonable
2796 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
2797 conservative but potentially much slower representation using an array
2798 of lists.
2799
2800 At the end we convert both representations into the same compressed
2801 form that will be used in regexec.c for matching with. The latter
2802 is a form that cannot be used to construct with but has memory
2803 properties similar to the list form and access properties similar
2804 to the table form making it both suitable for fast searches and
2805 small enough that its feasable to store for the duration of a program.
2806
2807 See the comment in the code where the compressed table is produced
2808 inplace from the flat tabe representation for an explanation of how
2809 the compression works.
2810
2811 */
2812
2813
2e64971a
DM
2814 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2815 prev_states[1] = 0;
2816
538e84ed
KW
2817 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2818 > SvIV(re_trie_maxbuff) )
2819 {
a3621e74
YO
2820 /*
2821 Second Pass -- Array Of Lists Representation
2822
2823 Each state will be represented by a list of charid:state records
2824 (reg_trie_trans_le) the first such element holds the CUR and LEN
2825 points of the allocated array. (See defines above).
2826
2827 We build the initial structure using the lists, and then convert
2828 it into the compressed table form which allows faster lookups
2829 (but cant be modified once converted).
a3621e74
YO
2830 */
2831
a3621e74
YO
2832 STRLEN transcount = 1;
2833
6ad9a8ab 2834 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
cb41e5d6 2835 depth+1));
686b73d4 2836
c944940b
JH
2837 trie->states = (reg_trie_state *)
2838 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2839 sizeof(reg_trie_state) );
a3621e74
YO
2840 TRIE_LIST_NEW(1);
2841 next_alloc = 2;
2842
2843 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2844
df826430 2845 regnode *noper = NEXTOPER( cur );
c445ea15
AL
2846 U32 state = 1; /* required init */
2847 U16 charid = 0; /* sanity init */
07be1b83 2848 U32 wordlen = 0; /* required init */
c445ea15 2849
df826430
YO
2850 if (OP(noper) == NOTHING) {
2851 regnode *noper_next= regnext(noper);
944e05e3
YO
2852 if (noper_next < tail)
2853 noper= noper_next;
df826430
YO
2854 }
2855
944e05e3
YO
2856 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2857 const U8 *uc= (U8*)STRING(noper);
2858 const U8 *e= uc + STR_LEN(noper);
2859
786e8c11 2860 for ( ; uc < e ; uc += len ) {
c445ea15 2861
786e8c11 2862 TRIE_READ_CHAR;
c445ea15 2863
786e8c11
YO
2864 if ( uvc < 256 ) {
2865 charid = trie->charmap[ uvc ];
c445ea15 2866 } else {
538e84ed
KW
2867 SV** const svpp = hv_fetch( widecharmap,
2868 (char*)&uvc,
2869 sizeof( UV ),
2870 0);
786e8c11
YO
2871 if ( !svpp ) {
2872 charid = 0;
2873 } else {
2874 charid=(U16)SvIV( *svpp );
2875 }
c445ea15 2876 }
538e84ed
KW
2877 /* charid is now 0 if we dont know the char read, or
2878 * nonzero if we do */
786e8c11 2879 if ( charid ) {
a3621e74 2880
786e8c11
YO
2881 U16 check;
2882 U32 newstate = 0;
a3621e74 2883
786e8c11
YO
2884 charid--;
2885 if ( !trie->states[ state ].trans.list ) {
2886 TRIE_LIST_NEW( state );
c445ea15 2887 }
538e84ed
KW
2888 for ( check = 1;
2889 check <= TRIE_LIST_USED( state );
2890 check++ )
2891 {
2892 if ( TRIE_LIST_ITEM( state, check ).forid
2893 == charid )
2894 {
786e8c11
YO
2895 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2896 break;
2897 }
2898 }
2899 if ( ! newstate ) {
2900 newstate = next_alloc++;
2e64971a 2901 prev_states[newstate] = state;
786e8c11
YO
2902 TRIE_LIST_PUSH( state, charid, newstate );
2903 transcount++;
2904 }
2905 state = newstate;
2906 } else {
147e3846 2907 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
c445ea15 2908 }
a28509cc 2909 }
c445ea15 2910 }
3dab1dad 2911 TRIE_HANDLE_WORD(state);
a3621e74
YO
2912
2913 } /* end second pass */
2914
1e2e3d02 2915 /* next alloc is the NEXT state to be allocated */
538e84ed 2916 trie->statecount = next_alloc;
c944940b
JH
2917 trie->states = (reg_trie_state *)
2918 PerlMemShared_realloc( trie->states,
2919 next_alloc
2920 * sizeof(reg_trie_state) );
a3621e74 2921
3dab1dad 2922 /* and now dump it out before we compress it */
2b8b4781
NC
2923 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2924 revcharmap, next_alloc,
2925 depth+1)
1e2e3d02 2926 );
a3621e74 2927
c944940b
JH
2928 trie->trans = (reg_trie_trans *)
2929 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
2930 {
2931 U32 state;
a3621e74
YO
2932 U32 tp = 0;
2933 U32 zp = 0;
2934
2935
2936 for( state=1 ; state < next_alloc ; state ++ ) {
2937 U32 base=0;
2938
2939 /*
2940 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 2941 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
a3621e74
YO
2942 );
2943 */
2944
2945 if (trie->states[state].trans.list) {
2946 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2947 U16 maxid=minid;
a28509cc 2948 U16 idx;
a3621e74
YO
2949
2950 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
2951 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2952 if ( forid < minid ) {
2953 minid=forid;
2954 } else if ( forid > maxid ) {
2955 maxid=forid;
2956 }
a3621e74
YO
2957 }
2958 if ( transcount < tp + maxid - minid + 1) {
2959 transcount *= 2;
c944940b
JH
2960 trie->trans = (reg_trie_trans *)
2961 PerlMemShared_realloc( trie->trans,
446bd890
NC
2962 transcount
2963 * sizeof(reg_trie_trans) );
538e84ed
KW
2964 Zero( trie->trans + (transcount / 2),
2965 transcount / 2,
2966 reg_trie_trans );
a3621e74
YO
2967 }
2968 base = trie->uniquecharcount + tp - minid;
2969 if ( maxid == minid ) {
2970 U32 set = 0;
2971 for ( ; zp < tp ; zp++ ) {
2972 if ( ! trie->trans[ zp ].next ) {
2973 base = trie->uniquecharcount + zp - minid;
538e84ed
KW
2974 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2975 1).newstate;
a3621e74
YO
2976 trie->trans[ zp ].check = state;
2977 set = 1;
2978 break;
2979 }
2980 }
2981 if ( !set ) {
538e84ed
KW
2982 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2983 1).newstate;
a3621e74
YO
2984 trie->trans[ tp ].check = state;
2985 tp++;
2986 zp = tp;
2987 }
2988 } else {
2989 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
538e84ed
KW
2990 const U32 tid = base
2991 - trie->uniquecharcount
2992 + TRIE_LIST_ITEM( state, idx ).forid;
2993 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2994 idx ).newstate;
a3621e74
YO
2995 trie->trans[ tid ].check = state;
2996 }
2997 tp += ( maxid - minid + 1 );
2998 }
2999 Safefree(trie->states[ state ].trans.list);
3000 }
3001 /*
3002 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3003 Perl_re_printf( aTHX_ " base: %d\n",base);
a3621e74
YO
3004 );
3005 */
3006 trie->states[ state ].trans.base=base;
3007 }
cc601c31 3008 trie->lasttrans = tp + 1;
a3621e74
YO
3009 }
3010 } else {
3011 /*
3012 Second Pass -- Flat Table Representation.
3013
b423522f
KW
3014 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3015 each. We know that we will need Charcount+1 trans at most to store
3016 the data (one row per char at worst case) So we preallocate both
3017 structures assuming worst case.
a3621e74
YO
3018
3019 We then construct the trie using only the .next slots of the entry
3020 structs.
3021
b423522f
KW
3022 We use the .check field of the first entry of the node temporarily
3023 to make compression both faster and easier by keeping track of how
3024 many non zero fields are in the node.
a3621e74
YO
3025
3026 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3027 transition.
3028
b423522f
KW
3029 There are two terms at use here: state as a TRIE_NODEIDX() which is
3030 a number representing the first entry of the node, and state as a
3031 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3032 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3033 if there are 2 entrys per node. eg:
a3621e74
YO
3034
3035 A B A B
3036 1. 2 4 1. 3 7
3037 2. 0 3 3. 0 5
3038 3. 0 0 5. 0 0
3039 4. 0 0 7. 0 0
3040
b423522f
KW
3041 The table is internally in the right hand, idx form. However as we
3042 also have to deal with the states array which is indexed by nodenum
3043 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
3044
3045 */
6ad9a8ab 3046 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
cb41e5d6 3047 depth+1));
3dab1dad 3048
c944940b
JH
3049 trie->trans = (reg_trie_trans *)
3050 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3051 * trie->uniquecharcount + 1,
3052 sizeof(reg_trie_trans) );
3053 trie->states = (reg_trie_state *)
3054 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3055 sizeof(reg_trie_state) );
a3621e74
YO
3056 next_alloc = trie->uniquecharcount + 1;
3057
3dab1dad 3058
a3621e74
YO
3059 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3060
df826430 3061 regnode *noper = NEXTOPER( cur );
a3621e74
YO
3062
3063 U32 state = 1; /* required init */
3064
3065 U16 charid = 0; /* sanity init */
3066 U32 accept_state = 0; /* sanity init */
a3621e74 3067
07be1b83 3068 U32 wordlen = 0; /* required init */
a3621e74 3069
df826430
YO
3070 if (OP(noper) == NOTHING) {
3071 regnode *noper_next= regnext(noper);
944e05e3
YO
3072 if (noper_next < tail)
3073 noper= noper_next;
df826430 3074 }
fab2782b 3075
944e05e3
YO
3076 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
3077 const U8 *uc= (U8*)STRING(noper);
3078 const U8 *e= uc + STR_LEN(noper);
3079
786e8c11 3080 for ( ; uc < e ; uc += len ) {
a3621e74 3081
786e8c11 3082 TRIE_READ_CHAR;
a3621e74 3083
786e8c11
YO
3084 if ( uvc < 256 ) {
3085 charid = trie->charmap[ uvc ];
3086 } else {
538e84ed
KW
3087 SV* const * const svpp = hv_fetch( widecharmap,
3088 (char*)&uvc,
3089 sizeof( UV ),
3090 0);
786e8c11 3091 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 3092 }
786e8c11
YO
3093 if ( charid ) {
3094 charid--;
3095 if ( !trie->trans[ state + charid ].next ) {
3096 trie->trans[ state + charid ].next = next_alloc;
3097 trie->trans[ state ].check++;
2e64971a
DM
3098 prev_states[TRIE_NODENUM(next_alloc)]
3099 = TRIE_NODENUM(state);
786e8c11
YO
3100 next_alloc += trie->uniquecharcount;
3101 }
3102 state = trie->trans[ state + charid ].next;
3103 } else {
147e3846 3104 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
786e8c11 3105 }
538e84ed
KW
3106 /* charid is now 0 if we dont know the char read, or
3107 * nonzero if we do */
a3621e74 3108 }
a3621e74 3109 }
3dab1dad
YO
3110 accept_state = TRIE_NODENUM( state );
3111 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
3112
3113 } /* end second pass */
3114
3dab1dad 3115 /* and now dump it out before we compress it */
2b8b4781
NC
3116 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3117 revcharmap,
3118 next_alloc, depth+1));
a3621e74 3119
a3621e74
YO
3120 {
3121 /*
3122 * Inplace compress the table.*
3123
3124 For sparse data sets the table constructed by the trie algorithm will
3125 be mostly 0/FAIL transitions or to put it another way mostly empty.
3126 (Note that leaf nodes will not contain any transitions.)
3127
3128 This algorithm compresses the tables by eliminating most such
3129 transitions, at the cost of a modest bit of extra work during lookup:
3130
3131 - Each states[] entry contains a .base field which indicates the
3132 index in the state[] array wheres its transition data is stored.
3133
3b753521 3134 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
3135
3136 - If .base is nonzero then charid is added to it to find an entry in
3137 the trans array.
3138
3139 -If trans[states[state].base+charid].check!=state then the
3140 transition is taken to be a 0/Fail transition. Thus if there are fail
3141 transitions at the front of the node then the .base offset will point
3142 somewhere inside the previous nodes data (or maybe even into a node
3143 even earlier), but the .check field determines if the transition is
3144 valid.
3145
786e8c11 3146 XXX - wrong maybe?
a3621e74 3147 The following process inplace converts the table to the compressed
3b753521 3148 table: We first do not compress the root node 1,and mark all its
a3621e74 3149 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
3150 allows us to do a DFA construction from the compressed table later,
3151 and ensures that any .base pointers we calculate later are greater
3152 than 0.
a3621e74
YO
3153
3154 - We set 'pos' to indicate the first entry of the second node.
3155
3156 - We then iterate over the columns of the node, finding the first and
3157 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3158 and set the .check pointers accordingly, and advance pos
3159 appropriately and repreat for the next node. Note that when we copy
3160 the next pointers we have to convert them from the original
3161 NODEIDX form to NODENUM form as the former is not valid post
3162 compression.
3163
3164 - If a node has no transitions used we mark its base as 0 and do not
3165 advance the pos pointer.
3166
3167 - If a node only has one transition we use a second pointer into the
3168 structure to fill in allocated fail transitions from other states.
3169 This pointer is independent of the main pointer and scans forward
3170 looking for null transitions that are allocated to a state. When it
3171 finds one it writes the single transition into the "hole". If the
786e8c11 3172 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
3173
3174 - Once compressed we can Renew/realloc the structures to release the
3175 excess space.
3176
3177 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3178 specifically Fig 3.47 and the associated pseudocode.
3179
3180 demq
3181 */
a3b680e6 3182 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 3183 U32 state, charid;
a3621e74 3184 U32 pos = 0, zp=0;
1e2e3d02 3185 trie->statecount = laststate;
a3621e74
YO
3186
3187 for ( state = 1 ; state < laststate ; state++ ) {
3188 U8 flag = 0;
a28509cc
AL
3189 const U32 stateidx = TRIE_NODEIDX( state );
3190 const U32 o_used = trie->trans[ stateidx ].check;
3191 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
3192 trie->trans[ stateidx ].check = 0;
3193
538e84ed
KW
3194 for ( charid = 0;
3195 used && charid < trie->uniquecharcount;
3196 charid++ )
3197 {
a3621e74
YO
3198 if ( flag || trie->trans[ stateidx + charid ].next ) {
3199 if ( trie->trans[ stateidx + charid ].next ) {
3200 if (o_used == 1) {
3201 for ( ; zp < pos ; zp++ ) {
3202 if ( ! trie->trans[ zp ].next ) {
3203 break;
3204 }
3205 }
538e84ed
KW
3206 trie->states[ state ].trans.base
3207 = zp
3208 + trie->uniquecharcount
3209 - charid ;
3210 trie->trans[ zp ].next
3211 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3212 + charid ].next );
a3621e74
YO
3213 trie->trans[ zp ].check = state;
3214 if ( ++zp > pos ) pos = zp;
3215 break;
3216 }
3217 used--;
3218 }
3219 if ( !flag ) {
3220 flag = 1;
538e84ed
KW
3221 trie->states[ state ].trans.base
3222 = pos + trie->uniquecharcount - charid ;
a3621e74 3223 }
538e84ed
KW
3224 trie->trans[ pos ].next
3225 = SAFE_TRIE_NODENUM(
3226 trie->trans[ stateidx + charid ].next );
a3621e74
YO
3227 trie->trans[ pos ].check = state;
3228 pos++;
3229 }
3230 }
3231 }
cc601c31 3232 trie->lasttrans = pos + 1;
c944940b
JH
3233 trie->states = (reg_trie_state *)
3234 PerlMemShared_realloc( trie->states, laststate
3235 * sizeof(reg_trie_state) );
a3621e74 3236 DEBUG_TRIE_COMPILE_MORE_r(
147e3846 3237 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
cb41e5d6 3238 depth+1,
538e84ed
KW
3239 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3240 + 1 ),
3241 (IV)next_alloc,
3242 (IV)pos,
3243 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
a3621e74
YO
3244 );
3245
3246 } /* end table compress */
3247 }
1e2e3d02 3248 DEBUG_TRIE_COMPILE_MORE_r(
147e3846 3249 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
cb41e5d6 3250 depth+1,
1e2e3d02
YO
3251 (UV)trie->statecount,
3252 (UV)trie->lasttrans)
3253 );
cc601c31 3254 /* resize the trans array to remove unused space */
c944940b
JH
3255 trie->trans = (reg_trie_trans *)
3256 PerlMemShared_realloc( trie->trans, trie->lasttrans
3257 * sizeof(reg_trie_trans) );
a3621e74 3258
538e84ed 3259 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
3260 U8 nodetype =(U8)(flags & 0xFF);
3261 char *str=NULL;
538e84ed 3262
07be1b83 3263#ifdef DEBUGGING
e62cc96a 3264 regnode *optimize = NULL;
7122b237
YO
3265#ifdef RE_TRACK_PATTERN_OFFSETS
3266
b57a0404
JH
3267 U32 mjd_offset = 0;
3268 U32 mjd_nodelen = 0;
7122b237
YO
3269#endif /* RE_TRACK_PATTERN_OFFSETS */
3270#endif /* DEBUGGING */
a3621e74 3271 /*
3dab1dad
YO
3272 This means we convert either the first branch or the first Exact,
3273 depending on whether the thing following (in 'last') is a branch
3274 or not and whther first is the startbranch (ie is it a sub part of
3275 the alternation or is it the whole thing.)
3b753521 3276 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 3277 the whole branch sequence, including the first.
a3621e74 3278 */
3dab1dad 3279 /* Find the node we are going to overwrite */
7f69552c 3280 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 3281 /* branch sub-chain */
3dab1dad 3282 NEXT_OFF( first ) = (U16)(last - first);
7122b237 3283#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
3284 DEBUG_r({
3285 mjd_offset= Node_Offset((convert));
3286 mjd_nodelen= Node_Length((convert));
3287 });
7122b237 3288#endif
7f69552c 3289 /* whole branch chain */
7122b237
YO
3290 }
3291#ifdef RE_TRACK_PATTERN_OFFSETS
3292 else {
7f69552c
YO
3293 DEBUG_r({
3294 const regnode *nop = NEXTOPER( convert );
3295 mjd_offset= Node_Offset((nop));
3296 mjd_nodelen= Node_Length((nop));
3297 });
07be1b83
YO
3298 }
3299 DEBUG_OPTIMISE_r(
147e3846 3300 Perl_re_indentf( aTHX_ "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
cb41e5d6 3301 depth+1,
786e8c11 3302 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 3303 );
7122b237 3304#endif
538e84ed 3305 /* But first we check to see if there is a common prefix we can
3dab1dad
YO
3306 split out as an EXACT and put in front of the TRIE node. */
3307 trie->startstate= 1;
55eed653 3308 if ( trie->bitmap && !widecharmap && !trie->jump ) {
5ee57374
YO
3309 /* we want to find the first state that has more than
3310 * one transition, if that state is not the first state
3311 * then we have a common prefix which we can remove.
3312 */
3dab1dad 3313 U32 state;
1e2e3d02 3314 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 3315 U32 ofs = 0;
ee0dfd0b
YO
3316 I32 first_ofs = -1; /* keeps track of the ofs of the first
3317 transition, -1 means none */
8e11feef
RGS
3318 U32 count = 0;
3319 const U32 base = trie->states[ state ].trans.base;
a3621e74 3320
5ee57374 3321 /* does this state terminate an alternation? */
3dab1dad 3322 if ( trie->states[state].wordnum )
8e11feef 3323 count = 1;
a3621e74 3324
8e11feef 3325 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
3326 if ( ( base + ofs >= trie->uniquecharcount ) &&
3327 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
3328 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3329 {
3dab1dad 3330 if ( ++count > 1 ) {
5ee57374 3331 /* we have more than one transition */
d3d91c7c
YO
3332 SV **tmp;
3333 U8 *ch;
5ee57374
YO
3334 /* if this is the first state there is no common prefix
3335 * to extract, so we can exit */
8e11feef 3336 if ( state == 1 ) break;
d3d91c7c
YO
3337 tmp = av_fetch( revcharmap, ofs, 0);
3338 ch = (U8*)SvPV_nolen_const( *tmp );
3339
5ee57374
YO
3340 /* if we are on count 2 then we need to initialize the
3341 * bitmap, and store the previous char if there was one
3342 * in it*/
3dab1dad 3343 if ( count == 2 ) {
5ee57374 3344 /* clear the bitmap */
3dab1dad
YO
3345 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3346 DEBUG_OPTIMISE_r(
147e3846 3347 Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [",
cb41e5d6 3348 depth+1,
786e8c11 3349 (UV)state));
ee0dfd0b
YO
3350 if (first_ofs >= 0) {
3351 SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
be8e71aa 3352 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 3353
8bcafbf4 3354 TRIE_BITMAP_SET_FOLDED(trie,*ch,folder);
3dab1dad 3355 DEBUG_OPTIMISE_r(
6ad9a8ab 3356 Perl_re_printf( aTHX_ "%s", (char*)ch)
3dab1dad 3357 );
8e11feef
RGS
3358 }
3359 }
8bcafbf4
YO
3360 /* store the current firstchar in the bitmap */
3361 TRIE_BITMAP_SET_FOLDED(trie,*ch,folder);
6ad9a8ab 3362 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
8e11feef 3363 }
ee0dfd0b 3364 first_ofs = ofs;
8e11feef 3365 }
3dab1dad
YO
3366 }
3367 if ( count == 1 ) {
5ee57374
YO
3368 /* This state has only one transition, its transition is part
3369 * of a common prefix - we need to concatenate the char it
3370 * represents to what we have so far. */
ee0dfd0b 3371 SV **tmp = av_fetch( revcharmap, first_ofs, 0);
c490c714
YO
3372 STRLEN len;
3373 char *ch = SvPV( *tmp, len );
de734bd5
A
3374 DEBUG_OPTIMISE_r({
3375 SV *sv=sv_newmortal();
147e3846 3376 Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
cb41e5d6 3377 depth+1,
ee0dfd0b 3378 (UV)state, (UV)first_ofs,
538e84ed 3379 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
de734bd5
A
3380 PL_colors[0], PL_colors[1],
3381 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed 3382 PERL_PV_ESCAPE_FIRSTCHAR
de734bd5
A
3383 )
3384 );
3385 });
3dab1dad
YO
3386 if ( state==1 ) {
3387 OP( convert ) = nodetype;
3388 str=STRING(convert);
3389 STR_LEN(convert)=0;
3390 }
c490c714
YO
3391 STR_LEN(convert) += len;
3392 while (len--)
de734bd5 3393 *str++ = *ch++;
8e11feef 3394 } else {
538e84ed 3395#ifdef DEBUGGING
8e11feef 3396 if (state>1)
6ad9a8ab 3397 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "]\n"));
f9049ba1 3398#endif
8e11feef
RGS
3399 break;
3400 }
3401 }
2e64971a 3402 trie->prefixlen = (state-1);
3dab1dad 3403 if (str) {
8e11feef 3404 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 3405 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 3406 trie->startstate = state;
07be1b83
YO
3407 trie->minlen -= (state - 1);
3408 trie->maxlen -= (state - 1);
33809eae
JH
3409#ifdef DEBUGGING
3410 /* At least the UNICOS C compiler choked on this
3411 * being argument to DEBUG_r(), so let's just have
3412 * it right here. */
3413 if (
3414#ifdef PERL_EXT_RE_BUILD
3415 1
3416#else
3417 DEBUG_r_TEST
3418#endif
3419 ) {
3420 regnode *fix = convert;
3421 U32 word = trie->wordcount;
3422 mjd_nodelen++;
3423 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3424 while( ++fix < n ) {
3425 Set_Node_Offset_Length(fix, 0, 0);
3426 }
3427 while (word--) {
3428 SV ** const tmp = av_fetch( trie_words, word, 0 );
3429 if (tmp) {
3430 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3431 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3432 else
3433 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3434 }
3435 }
3436 }
3437#endif
8e11feef
RGS
3438 if (trie->maxlen) {
3439 convert = n;
3440 } else {
3dab1dad 3441 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 3442 DEBUG_r(optimize= n);
3dab1dad
YO
3443 }
3444 }
3445 }
538e84ed
KW
3446 if (!jumper)
3447 jumper = last;
3dab1dad 3448 if ( trie->maxlen ) {
8e11feef
RGS
3449 NEXT_OFF( convert ) = (U16)(tail - convert);
3450 ARG_SET( convert, data_slot );
538e84ed
KW
3451 /* Store the offset to the first unabsorbed branch in
3452 jump[0], which is otherwise unused by the jump logic.
786e8c11 3453 We use this when dumping a trie and during optimisation. */
538e84ed 3454 if (trie->jump)
7f69552c 3455 trie->jump[0] = (U16)(nextbranch - convert);
538e84ed 3456
6c48061a
YO
3457 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3458 * and there is a bitmap
3459 * and the first "jump target" node we found leaves enough room
3460 * then convert the TRIE node into a TRIEC node, with the bitmap
3461 * embedded inline in the opcode - this is hypothetically faster.
3462 */
3463 if ( !trie->states[trie->startstate].wordnum
3464 && trie->bitmap
3465 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
3466 {
3467 OP( convert ) = TRIEC;
3468 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 3469 PerlMemShared_free(trie->bitmap);
786e8c11 3470 trie->bitmap= NULL;
538e84ed 3471 } else
786e8c11 3472 OP( convert ) = TRIE;
a3621e74 3473
3dab1dad
YO
3474 /* store the type in the flags */
3475 convert->flags = nodetype;
a5ca303d 3476 DEBUG_r({
538e84ed
KW
3477 optimize = convert
3478 + NODE_STEP_REGNODE
a5ca303d
YO
3479 + regarglen[ OP( convert ) ];
3480 });
538e84ed 3481 /* XXX We really should free up the resource in trie now,
a5ca303d 3482 as we won't use them - (which resources?) dmq */
3dab1dad 3483 }
a3621e74 3484 /* needed for dumping*/
e62cc96a 3485 DEBUG_r(if (optimize) {
07be1b83 3486 regnode *opt = convert;
bcdf7404 3487
e62cc96a 3488 while ( ++opt < optimize) {
07be1b83
YO
3489 Set_Node_Offset_Length(opt,0,0);
3490 }
538e84ed
KW
3491 /*