This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Put common code in a macro
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
afda64e8 84EXTERN_C const struct regexp_engine my_reg_engine;
54df2634
NC
85#else
86# include "regcomp.h"
87#endif
a687059c 88
f7e03a10 89#include "dquote_inline.h"
b992490d 90#include "invlist_inline.h"
1b0f46bf 91#include "unicode_constants.h"
04e98a4d 92
538e84ed
KW
93#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
94 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
f12c0118
KW
95#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
96 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 97#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 98#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 99
a687059c
LW
100#ifndef STATIC
101#define STATIC static
102#endif
103
3f910e62
YO
104/* this is a chain of data about sub patterns we are processing that
105 need to be handled separately/specially in study_chunk. Its so
106 we can simulate recursion without losing state. */
107struct scan_frame;
108typedef struct scan_frame {
109 regnode *last_regnode; /* last node to process in this frame */
110 regnode *next_regnode; /* next node to process when last is reached */
111 U32 prev_recursed_depth;
112 I32 stopparen; /* what stopparen do we use */
3f910e62
YO
113
114 struct scan_frame *this_prev_frame; /* this previous frame */
115 struct scan_frame *prev_frame; /* previous frame */
116 struct scan_frame *next_frame; /* next frame */
117} scan_frame;
118
f4ae5a27
KW
119/* Certain characters are output as a sequence with the first being a
120 * backslash. */
eb7486de 121#define isBACKSLASHED_PUNCT(c) strchr("-[]\\^", c)
f4ae5a27
KW
122
123
09b2b2e6 124struct RExC_state_t {
514a91f1
DM
125 U32 flags; /* RXf_* are we folding, multilining? */
126 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 127 char *precomp; /* uncompiled string. */
711b303b 128 char *precomp_end; /* pointer to end of uncompiled string. */
288b8c02 129 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf 130 regexp *rx; /* perl core regexp structure */
538e84ed
KW
131 regexp_internal *rxi; /* internal data for regexp object
132 pprivate field */
fac92740 133 char *start; /* Start of input for compile */
830247a4
IZ
134 char *end; /* End of input for compile */
135 char *parse; /* Input-scan pointer. */
51684e3c
KW
136 char *copy_start; /* start of copy of input within
137 constructed parse string */
138 char *copy_start_in_input; /* Position in input string
139 corresponding to copy_start */
ea3daa5d 140 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 141 regnode *emit_start; /* Start of emitted-code area */
538e84ed
KW
142 regnode *emit_bound; /* First regnode outside of the
143 allocated space */
f55b7b26 144 regnode_offset emit; /* Code-emit pointer */
830247a4
IZ
145 I32 naughty; /* How bad is this pattern? */
146 I32 sawback; /* Did we see \1, ...? */
147 U32 seen;
88f063b4
KW
148 SSize_t size; /* Number of regnode equivalents in
149 pattern */
e7f023ff 150 I32 npar; /* Capture buffer count, (OPEN) plus
538e84ed
KW
151 one. ("par" 0 is the whole
152 pattern)*/
153 I32 nestroot; /* root parens we are in - used by
154 accept */
830247a4
IZ
155 I32 extralen;
156 I32 seen_zerolen;
f55b7b26
KW
157 regnode_offset *open_parens; /* offsets to open parens */
158 regnode_offset *close_parens; /* offsets to close parens */
d5a00e4a 159 regnode *end_op; /* END node in program */
02daf0ab
YO
160 I32 utf8; /* whether the pattern is utf8 or not */
161 I32 orig_utf8; /* whether the pattern was originally in utf8 */
162 /* XXX use this for future optimisation of case
163 * where pattern must be upgraded to utf8. */
e40e74fe
KW
164 I32 uni_semantics; /* If a d charset modifier should use unicode
165 rules, even if the pattern is not in
166 utf8 */
81714fb9 167 HV *paren_names; /* Paren names */
538e84ed 168
40d049e4 169 regnode **recurse; /* Recurse regops */
88f063b4 170 I32 recurse_count; /* Number of recurse regops we have generated */
4286711a 171 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
538e84ed 172 through */
09a65838 173 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 174 I32 in_lookbehind;
4624b182 175 I32 contains_locale;
bb3f3ed2 176 I32 override_recoding;
b6d67071
KW
177#ifdef EBCDIC
178 I32 recode_x_to_native;
179#endif
9d53c457 180 I32 in_multi_char_class;
1acab4c5 181 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
68e2671b 182 within pattern */
b1603ef8 183 int code_index; /* next code_blocks[] slot */
ee273784 184 SSize_t maxlen; /* mininum possible number of chars in string to match */
3f910e62
YO
185 scan_frame *frame_head;
186 scan_frame *frame_last;
187 U32 frame_count;
7eec73eb 188 AV *warn_text;
dc6d7f5c 189#ifdef ADD_TO_REGEXEC
830247a4
IZ
190 char *starttry; /* -Dr: where regtry was called. */
191#define RExC_starttry (pRExC_state->starttry)
192#endif
d24ca0c5 193 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 194#ifdef DEBUGGING
be8e71aa 195 const char *lastparse;
3dab1dad 196 I32 lastnum;
1f1031fe 197 AV *paren_name_list; /* idx -> name */
d9a72fcc 198 U32 study_chunk_recursed_count;
c9f0d54c
YO
199 SV *mysv1;
200 SV *mysv2;
88f063b4 201
3dab1dad
YO
202#define RExC_lastparse (pRExC_state->lastparse)
203#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 204#define RExC_paren_name_list (pRExC_state->paren_name_list)
d9a72fcc 205#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
c9f0d54c
YO
206#define RExC_mysv (pRExC_state->mysv1)
207#define RExC_mysv1 (pRExC_state->mysv1)
208#define RExC_mysv2 (pRExC_state->mysv2)
209
3dab1dad 210#endif
512c0f5a 211 bool seen_unfolded_sharp_s;
911bd04e 212 bool strict;
da7cf1cc 213 bool study_started;
034602eb 214 bool in_script_run;
f55b7b26 215 bool pass1;
09b2b2e6 216};
830247a4 217
e2509266 218#define RExC_flags (pRExC_state->flags)
514a91f1 219#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 220#define RExC_precomp (pRExC_state->precomp)
51684e3c
KW
221#define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
222#define RExC_copy_start_in_constructed (pRExC_state->copy_start)
711b303b 223#define RExC_precomp_end (pRExC_state->precomp_end)
288b8c02 224#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 225#define RExC_rx (pRExC_state->rx)
f8fc2ecf 226#define RExC_rxi (pRExC_state->rxi)
fac92740 227#define RExC_start (pRExC_state->start)
830247a4
IZ
228#define RExC_end (pRExC_state->end)
229#define RExC_parse (pRExC_state->parse)
230#define RExC_whilem_seen (pRExC_state->whilem_seen)
512c0f5a
KW
231
232/* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
233 * EXACTF node, hence was parsed under /di rules. If later in the parse,
234 * something forces the pattern into using /ui rules, the sharp s should be
235 * folded into the sequence 'ss', which takes up more space than previously
236 * calculated. This means that the sizing pass needs to be restarted. (The
237 * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node
238 * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
239 * so there is no need to resize [perl #125990]. */
240#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
241
7122b237 242#ifdef RE_TRACK_PATTERN_OFFSETS
44f36cd3 243# define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
538e84ed 244 others */
7122b237 245#endif
830247a4 246#define RExC_emit (pRExC_state->emit)
f55b7b26 247#define RExC_pass1 (pRExC_state->pass1)
fac92740 248#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 249#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
250#define RExC_sawback (pRExC_state->sawback)
251#define RExC_seen (pRExC_state->seen)
252#define RExC_size (pRExC_state->size)
ee273784 253#define RExC_maxlen (pRExC_state->maxlen)
830247a4 254#define RExC_npar (pRExC_state->npar)
e2e6a0f1 255#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
256#define RExC_extralen (pRExC_state->extralen)
257#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 258#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 259#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 260#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
261#define RExC_open_parens (pRExC_state->open_parens)
262#define RExC_close_parens (pRExC_state->close_parens)
5bd2d46e 263#define RExC_end_op (pRExC_state->end_op)
81714fb9 264#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
265#define RExC_recurse (pRExC_state->recurse)
266#define RExC_recurse_count (pRExC_state->recurse_count)
09a65838 267#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
538e84ed
KW
268#define RExC_study_chunk_recursed_bytes \
269 (pRExC_state->study_chunk_recursed_bytes)
b57e4118 270#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 271#define RExC_contains_locale (pRExC_state->contains_locale)
b6d67071
KW
272#ifdef EBCDIC
273# define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
274#endif
9d53c457 275#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
3f910e62
YO
276#define RExC_frame_head (pRExC_state->frame_head)
277#define RExC_frame_last (pRExC_state->frame_last)
278#define RExC_frame_count (pRExC_state->frame_count)
67cdf558 279#define RExC_strict (pRExC_state->strict)
da7cf1cc 280#define RExC_study_started (pRExC_state->study_started)
7eec73eb 281#define RExC_warn_text (pRExC_state->warn_text)
034602eb 282#define RExC_in_script_run (pRExC_state->in_script_run)
830247a4 283
99807a43
HS
284/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
285 * a flag to disable back-off on the fixed/floating substrings - if it's
286 * a high complexity pattern we assume the benefit of avoiding a full match
287 * is worth the cost of checking for the substrings even if they rarely help.
288 */
289#define RExC_naughty (pRExC_state->naughty)
290#define TOO_NAUGHTY (10)
291#define MARK_NAUGHTY(add) \
292 if (RExC_naughty < TOO_NAUGHTY) \
293 RExC_naughty += (add)
294#define MARK_NAUGHTY_EXP(exp, add) \
295 if (RExC_naughty < TOO_NAUGHTY) \
296 RExC_naughty += RExC_naughty / (exp) + (add)
cde0cee5 297
a687059c
LW
298#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
299#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
412f55bb 300 ((*s) == '{' && regcurly(s)))
a687059c
LW
301
302/*
303 * Flags to be passed up and down.
304 */
a687059c 305#define WORST 0 /* Worst case. */
a3b492c3 306#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee 307
e64f369d 308/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
309 * character. (There needs to be a case: in the switch statement in regexec.c
310 * for any node marked SIMPLE.) Note that this is not the same thing as
311 * REGNODE_SIMPLE */
fda99bee 312#define SIMPLE 0x02
e64f369d 313#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
314#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
315#define TRYAGAIN 0x10 /* Weeded out a declaration. */
c36361b2
KW
316#define RESTART_PARSE 0x20 /* Need to redo the parse */
317#define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
318 calcuate sizes as UTF-8 */
a687059c 319
3dab1dad
YO
320#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
321
07be1b83
YO
322/* whether trie related optimizations are enabled */
323#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
324#define TRIE_STUDY_OPT
786e8c11 325#define FULL_TRIE_STUDY
07be1b83
YO
326#define TRIE_STCLASS
327#endif
1de06328
YO
328
329
40d049e4
YO
330
331#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
332#define PBITVAL(paren) (1 << ((paren) & 7))
333#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
334#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
335#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
336
82a6ada4 337#define REQUIRE_UTF8(flagp) STMT_START { \
8d9c2815 338 if (!UTF) { \
82a6ada4 339 assert(PASS1); \
c36361b2 340 *flagp = RESTART_PARSE|NEED_UTF8; \
f55b7b26 341 return 0; \
8d9c2815 342 } \
82a6ada4 343 } STMT_END
40d049e4 344
512c0f5a
KW
345/* Change from /d into /u rules, and restart the parse if we've already seen
346 * something whose size would increase as a result, by setting *flagp and
347 * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates
142ece99 348 * we've changed to /u during the parse. */
512c0f5a
KW
349#define REQUIRE_UNI_RULES(flagp, restart_retval) \
350 STMT_START { \
351 if (DEPENDS_SEMANTICS) { \
352 assert(PASS1); \
353 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
354 RExC_uni_semantics = 1; \
355 if (RExC_seen_unfolded_sharp_s) { \
c36361b2 356 *flagp |= RESTART_PARSE; \
512c0f5a
KW
357 return restart_retval; \
358 } \
359 } \
360 } STMT_END
361
fd7fc6ae 362/* Executes a return statement with the value 'X', if 'flags' contains any of
c36361b2 363 * 'RESTART_PARSE', 'NEED_UTF8', or 'extra'. If so, *flagp is set to those
fd7fc6ae
YO
364 * flags */
365#define RETURN_X_ON_RESTART_OR_FLAGS(X, flags, flagp, extra) \
366 STMT_START { \
c36361b2
KW
367 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
368 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
fd7fc6ae
YO
369 return X; \
370 } \
371 } STMT_END
372
5d555bb8 373#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
f55b7b26 374 RETURN_X_ON_RESTART_OR_FLAGS(0,flags,flagp,extra)
fd7fc6ae
YO
375
376#define RETURN_X_ON_RESTART(X, flags,flagp) \
377 RETURN_X_ON_RESTART_OR_FLAGS( X, flags, flagp, 0)
378
379
5d555bb8 380#define RETURN_FAIL_ON_RESTART_FLAGP_OR_FLAGS(flagp,extra) \
c36361b2 381 if (*(flagp) & (RESTART_PARSE|(extra))) return 0
fd7fc6ae 382
c36361b2 383#define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
fd7fc6ae 384
5d555bb8 385#define RETURN_FAIL_ON_RESTART(flags,flagp) \
f55b7b26 386 RETURN_X_ON_RESTART(0, flags,flagp)
5d555bb8 387#define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
88f063b4 388 RETURN_FAIL_ON_RESTART_FLAGP_OR_FLAGS(flagp, 0)
fd7fc6ae 389
f19b1a63
KW
390/* This converts the named class defined in regcomp.h to its equivalent class
391 * number defined in handy.h. */
392#define namedclass_to_classnum(class) ((int) ((class) / 2))
393#define classnum_to_namedclass(classnum) ((classnum) * 2)
394
de92f5e6
KW
395#define _invlist_union_complement_2nd(a, b, output) \
396 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
397#define _invlist_intersection_complement_2nd(a, b, output) \
398 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
399
1de06328
YO
400/* About scan_data_t.
401
402 During optimisation we recurse through the regexp program performing
403 various inplace (keyhole style) optimisations. In addition study_chunk
404 and scan_commit populate this data structure with information about
538e84ed 405 what strings MUST appear in the pattern. We look for the longest
3b753521 406 string that must appear at a fixed location, and we look for the
1de06328
YO
407 longest string that may appear at a floating location. So for instance
408 in the pattern:
538e84ed 409
1de06328 410 /FOO[xX]A.*B[xX]BAR/
538e84ed 411
1de06328
YO
412 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
413 strings (because they follow a .* construct). study_chunk will identify
414 both FOO and BAR as being the longest fixed and floating strings respectively.
538e84ed 415
1de06328 416 The strings can be composites, for instance
538e84ed 417
1de06328 418 /(f)(o)(o)/
538e84ed 419
1de06328 420 will result in a composite fixed substring 'foo'.
538e84ed 421
1de06328 422 For each string some basic information is maintained:
538e84ed 423
6ea77169 424 - min_offset
1de06328
YO
425 This is the position the string must appear at, or not before.
426 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
427 characters must match before the string we are searching for.
428 Likewise when combined with minlenp and the length of the string it
538e84ed 429 tells us how many characters must appear after the string we have
1de06328 430 found.
538e84ed 431
1de06328
YO
432 - max_offset
433 Only used for floating strings. This is the rightmost point that
ea3daa5d 434 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328 435 string can occur infinitely far to the right.
6e12f832 436 For fixed strings, it is equal to min_offset.
538e84ed 437
1de06328 438 - minlenp
2d608413
KW
439 A pointer to the minimum number of characters of the pattern that the
440 string was found inside. This is important as in the case of positive
538e84ed 441 lookahead or positive lookbehind we can have multiple patterns
1de06328 442 involved. Consider
538e84ed 443
1de06328 444 /(?=FOO).*F/
538e84ed 445
1de06328
YO
446 The minimum length of the pattern overall is 3, the minimum length
447 of the lookahead part is 3, but the minimum length of the part that
538e84ed 448 will actually match is 1. So 'FOO's minimum length is 3, but the
1de06328 449 minimum length for the F is 1. This is important as the minimum length
538e84ed 450 is used to determine offsets in front of and behind the string being
1de06328 451 looked for. Since strings can be composites this is the length of the
486ec47a 452 pattern at the time it was committed with a scan_commit. Note that
1de06328 453 the length is calculated by study_chunk, so that the minimum lengths
538e84ed 454 are not known until the full pattern has been compiled, thus the
1de06328 455 pointer to the value.
538e84ed 456
1de06328 457 - lookbehind
538e84ed 458
1de06328 459 In the case of lookbehind the string being searched for can be
538e84ed 460 offset past the start point of the final matching string.
1de06328
YO
461 If this value was just blithely removed from the min_offset it would
462 invalidate some of the calculations for how many chars must match
463 before or after (as they are derived from min_offset and minlen and
538e84ed 464 the length of the string being searched for).
1de06328
YO
465 When the final pattern is compiled and the data is moved from the
466 scan_data_t structure into the regexp structure the information
538e84ed
KW
467 about lookbehind is factored in, with the information that would
468 have been lost precalculated in the end_shift field for the
1de06328
YO
469 associated string.
470
471 The fields pos_min and pos_delta are used to store the minimum offset
538e84ed 472 and the delta to the maximum offset at the current point in the pattern.
1de06328
YO
473
474*/
2c2d71f5 475
cb6aef8b
DM
476struct scan_data_substrs {
477 SV *str; /* longest substring found in pattern */
478 SSize_t min_offset; /* earliest point in string it can appear */
479 SSize_t max_offset; /* latest point in string it can appear */
480 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
481 SSize_t lookbehind; /* is the pos of the string modified by LB */
11683ecb 482 I32 flags; /* per substring SF_* and SCF_* flags */
cb6aef8b
DM
483};
484
2c2d71f5 485typedef struct scan_data_t {
1de06328
YO
486 /*I32 len_min; unused */
487 /*I32 len_delta; unused */
49f55535 488 SSize_t pos_min;
ea3daa5d 489 SSize_t pos_delta;
2c2d71f5 490 SV *last_found;
ea3daa5d 491 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 492 SSize_t last_start_min;
ea3daa5d 493 SSize_t last_start_max;
2099df82
DM
494 U8 cur_is_floating; /* whether the last_* values should be set as
495 * the next fixed (0) or floating (1)
496 * substring */
6ea77169 497
2099df82
DM
498 /* [0] is longest fixed substring so far, [1] is longest float so far */
499 struct scan_data_substrs substrs[2];
6ea77169 500
11683ecb 501 I32 flags; /* common SF_* and SCF_* flags */
2c2d71f5 502 I32 whilem_c;
49f55535 503 SSize_t *last_closep;
b8f7bb16 504 regnode_ssc *start_class;
2c2d71f5
JH
505} scan_data_t;
506
a687059c 507/*
e50aee73 508 * Forward declarations for pregcomp()'s friends.
a687059c 509 */
a0d0e21e 510
6ea77169 511static const scan_data_t zero_scan_data = {
a2ca2017 512 0, 0, NULL, 0, 0, 0, 0,
6ea77169 513 {
11683ecb
DM
514 { NULL, 0, 0, 0, 0, 0 },
515 { NULL, 0, 0, 0, 0, 0 },
6ea77169
DM
516 },
517 0, 0, NULL, NULL
518};
c277df42 519
11683ecb
DM
520/* study flags */
521
07be1b83
YO
522#define SF_BEFORE_SEOL 0x0001
523#define SF_BEFORE_MEOL 0x0002
11683ecb 524#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
c277df42 525
07be1b83
YO
526#define SF_IS_INF 0x0040
527#define SF_HAS_PAR 0x0080
528#define SF_IN_PAR 0x0100
529#define SF_HAS_EVAL 0x0200
bc604ad8
KW
530
531
532/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
533 * longest substring in the pattern. When it is not set the optimiser keeps
534 * track of position, but does not keep track of the actual strings seen,
535 *
536 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
537 * /foo/i will not.
538 *
539 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
540 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
541 * turned off because of the alternation (BRANCH). */
07be1b83 542#define SCF_DO_SUBSTR 0x0400
bc604ad8 543
653099ff
GS
544#define SCF_DO_STCLASS_AND 0x0800
545#define SCF_DO_STCLASS_OR 0x1000
546#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 547#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 548
786e8c11 549#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
538e84ed 550#define SCF_SEEN_ACCEPT 0x8000
688e0391 551#define SCF_TRIE_DOING_RESTUDY 0x10000
4286711a
YO
552#define SCF_IN_DEFINE 0x20000
553
554
555
07be1b83 556
43fead97 557#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
558
559/* The enums for all these are ordered so things work out correctly */
a62b1201 560#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
538e84ed
KW
561#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
562 == REGEX_DEPENDS_CHARSET)
00b27cfc 563#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
538e84ed
KW
564#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
565 >= REGEX_UNICODE_CHARSET)
566#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
567 == REGEX_ASCII_RESTRICTED_CHARSET)
568#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
569 >= REGEX_ASCII_RESTRICTED_CHARSET)
570#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
571 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 572
43fead97 573#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 574
f2c2a6ab
KW
575/* For programs that want to be strictly Unicode compatible by dying if any
576 * attempt is made to match a non-Unicode code point against a Unicode
577 * property. */
578#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
579
93733859 580#define OOB_NAMEDCLASS -1
b8c5462f 581
8e661ac5
KW
582/* There is no code point that is out-of-bounds, so this is problematic. But
583 * its only current use is to initialize a variable that is always set before
584 * looked at. */
585#define OOB_UNICODE 0xDEADBEEF
586
a0ed51b3 587#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
a0ed51b3 588
8615cb43 589
b45f050a
JF
590/* length of regex to show in messages that don't mark a position within */
591#define RegexLengthToShowInErrorMessages 127
592
593/*
594 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
595 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
596 * op/pragma/warn/regcomp.
597 */
7253e4e3
RK
598#define MARKER1 "<-- HERE" /* marker as it appears in the description */
599#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 600
538e84ed 601#define REPORT_LOCATION " in regex; marked by " MARKER1 \
147e3846 602 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
b45f050a 603
285b5ca0
KW
604/* The code in this file in places uses one level of recursion with parsing
605 * rebased to an alternate string constructed by us in memory. This can take
606 * the form of something that is completely different from the input, or
607 * something that uses the input as part of the alternate. In the first case,
608 * there should be no possibility of an error, as we are in complete control of
51684e3c
KW
609 * the alternate string. But in the second case we don't completely control
610 * the input portion, so there may be errors in that. Here's an example:
285b5ca0
KW
611 * /[abc\x{DF}def]/ui
612 * is handled specially because \x{df} folds to a sequence of more than one
51684e3c 613 * character: 'ss'. What is done is to create and parse an alternate string,
285b5ca0
KW
614 * which looks like this:
615 * /(?:\x{DF}|[abc\x{DF}def])/ui
616 * where it uses the input unchanged in the middle of something it constructs,
617 * which is a branch for the DF outside the character class, and clustering
618 * parens around the whole thing. (It knows enough to skip the DF inside the
619 * class while in this substitute parse.) 'abc' and 'def' may have errors that
620 * need to be reported. The general situation looks like this:
621 *
51684e3c 622 * |<------- identical ------>|
285b5ca0 623 * sI tI xI eI
51684e3c 624 * Input: ---------------------------------------------------------------
285b5ca0
KW
625 * Constructed: ---------------------------------------------------
626 * sC tC xC eC EC
51684e3c 627 * |<------- identical ------>|
285b5ca0 628 *
51684e3c
KW
629 * sI..eI is the portion of the input pattern we are concerned with here.
630 * sC..EC is the constructed substitute parse string.
631 * sC..tC is constructed by us
632 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
633 * In the diagram, these are vertically aligned.
634 * eC..EC is also constructed by us.
635 * xC is the position in the substitute parse string where we found a
636 * problem.
637 * xI is the position in the original pattern corresponding to xC.
285b5ca0 638 *
51684e3c
KW
639 * We want to display a message showing the real input string. Thus we need to
640 * translate from xC to xI. We know that xC >= tC, since the portion of the
641 * string sC..tC has been constructed by us, and so shouldn't have errors. We
642 * get:
643 * xI = tI + (xC - tC)
285b5ca0 644 *
51684e3c
KW
645 * When the substitute parse is constructed, the code needs to set:
646 * RExC_start (sC)
647 * RExC_end (eC)
648 * RExC_copy_start_in_input (tI)
649 * RExC_copy_start_in_constructed (tC)
650 * and restore them when done.
285b5ca0 651 *
51684e3c
KW
652 * During normal processing of the input pattern, both
653 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
654 * sI, so that xC equals xI.
285b5ca0
KW
655 */
656
51684e3c
KW
657#define sI RExC_precomp
658#define eI RExC_precomp_end
659#define sC RExC_start
660#define eC RExC_end
661#define tI RExC_copy_start_in_input
662#define tC RExC_copy_start_in_constructed
663#define xI(xC) (tI + (xC - tC))
664#define xI_offset(xC) (xI(xC) - sI)
285b5ca0
KW
665
666#define REPORT_LOCATION_ARGS(xC) \
667 UTF8fARG(UTF, \
51684e3c 668 (xI(xC) > eI) /* Don't run off end */ \
285b5ca0 669 ? eC - sC /* Length before the <--HERE */ \
bc929b61
KW
670 : ((xI_offset(xC) >= 0) \
671 ? xI_offset(xC) \
672 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
673 IVdf " trying to output message for " \
674 " pattern %.*s", \
51684e3c 675 __FILE__, __LINE__, (IV) xI_offset(xC), \
bc929b61 676 ((int) (eC - sC)), sC), 0)), \
51684e3c 677 sI), /* The input pattern printed up to the <--HERE */ \
285b5ca0 678 UTF8fARG(UTF, \
51684e3c
KW
679 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \
680 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */
c1d900c3 681
8a6d8ec6
HS
682/* Used to point after bad bytes for an error message, but avoid skipping
683 * past a nul byte. */
684#define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
685
b45f050a
JF
686/*
687 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
688 * arg. Show regex, up to a maximum length. If it's too long, chop and add
689 * "...".
690 */
58e23c8d 691#define _FAIL(code) STMT_START { \
bfed75c6 692 const char *ellipses = ""; \
88f063b4 693 IV len = RExC_precomp_end - RExC_precomp; \
ccb2c380
MP
694 \
695 if (!SIZE_ONLY) \
a5e7bc51 696 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
697 if (len > RegexLengthToShowInErrorMessages) { \
698 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
699 len = RegexLengthToShowInErrorMessages - 10; \
700 ellipses = "..."; \
701 } \
58e23c8d 702 code; \
ccb2c380 703} STMT_END
8615cb43 704
58e23c8d 705#define FAIL(msg) _FAIL( \
147e3846 706 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
c1d900c3 707 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
708
709#define FAIL2(msg,arg) _FAIL( \
147e3846 710 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
c1d900c3 711 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 712
b45f050a 713/*
b45f050a
JF
714 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
715 */
ccb2c380 716#define Simple_vFAIL(m) STMT_START { \
ccb2c380 717 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
d528642a 718 m, REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 719} STMT_END
b45f050a
JF
720
721/*
722 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
723 */
ccb2c380
MP
724#define vFAIL(m) STMT_START { \
725 if (!SIZE_ONLY) \
a5e7bc51 726 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
727 Simple_vFAIL(m); \
728} STMT_END
b45f050a
JF
729
730/*
731 * Like Simple_vFAIL(), but accepts two arguments.
732 */
ccb2c380 733#define Simple_vFAIL2(m,a1) STMT_START { \
d528642a
KW
734 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
735 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 736} STMT_END
b45f050a
JF
737
738/*
739 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
740 */
ccb2c380
MP
741#define vFAIL2(m,a1) STMT_START { \
742 if (!SIZE_ONLY) \
a5e7bc51 743 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
744 Simple_vFAIL2(m, a1); \
745} STMT_END
b45f050a
JF
746
747
748/*
749 * Like Simple_vFAIL(), but accepts three arguments.
750 */
ccb2c380 751#define Simple_vFAIL3(m, a1, a2) STMT_START { \
c1d900c3 752 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 753 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 754} STMT_END
b45f050a
JF
755
756/*
757 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
758 */
ccb2c380
MP
759#define vFAIL3(m,a1,a2) STMT_START { \
760 if (!SIZE_ONLY) \
a5e7bc51 761 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
762 Simple_vFAIL3(m, a1, a2); \
763} STMT_END
b45f050a
JF
764
765/*
766 * Like Simple_vFAIL(), but accepts four arguments.
767 */
ccb2c380 768#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
d528642a
KW
769 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
770 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 771} STMT_END
b45f050a 772
95db3ffa
KW
773#define vFAIL4(m,a1,a2,a3) STMT_START { \
774 if (!SIZE_ONLY) \
775 SAVEFREESV(RExC_rx_sv); \
776 Simple_vFAIL4(m, a1, a2, a3); \
777} STMT_END
778
946095af 779/* A specialized version of vFAIL2 that works with UTF8f */
d528642a
KW
780#define vFAIL2utf8f(m, a1) STMT_START { \
781 if (!SIZE_ONLY) \
782 SAVEFREESV(RExC_rx_sv); \
783 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
784 REPORT_LOCATION_ARGS(RExC_parse)); \
946095af
BF
785} STMT_END
786
3ba22297 787#define vFAIL3utf8f(m, a1, a2) STMT_START { \
3ba22297
KW
788 if (!SIZE_ONLY) \
789 SAVEFREESV(RExC_rx_sv); \
790 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 791 REPORT_LOCATION_ARGS(RExC_parse)); \
3ba22297
KW
792} STMT_END
793
f8220fe2 794/* This has an assert in it because of [perl #122671] Many warnings in
499333dc
KW
795 * regcomp.c can occur twice. If they get output in pass1 and later in that
796 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
f8220fe2
KW
797 * would get output again. So they should be output in pass2, and this
798 * assert makes sure new warnings follow that paradigm. */
799#define _WARN_HELPER(loc, warns, code) \
800 STMT_START { \
801 __ASSERT_(PASS2) code; \
802 } STMT_END
946095af 803
5e0a247b 804/* m is not necessarily a "literal string", in this macro */
f8220fe2
KW
805#define reg_warn_non_literal_string(loc, m) \
806 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
807 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 808 "%s" REPORT_LOCATION, \
f8220fe2 809 m, REPORT_LOCATION_ARGS(loc)))
5e0a247b 810
f8220fe2
KW
811#define ckWARNreg(loc,m) \
812 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
813 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 814 m REPORT_LOCATION, \
f8220fe2 815 REPORT_LOCATION_ARGS(loc)))
ccb2c380 816
f8220fe2
KW
817#define vWARN(loc, m) \
818 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
819 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 820 m REPORT_LOCATION, \
f8220fe2 821 REPORT_LOCATION_ARGS(loc))) \
b927b7e9 822
f8220fe2
KW
823#define vWARN_dep(loc, m) \
824 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
825 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
d528642a 826 m REPORT_LOCATION, \
f8220fe2 827 REPORT_LOCATION_ARGS(loc)))
0d6106aa 828
f8220fe2
KW
829#define ckWARNdep(loc,m) \
830 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
831 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
d528642a 832 m REPORT_LOCATION, \
f8220fe2 833 REPORT_LOCATION_ARGS(loc)))
147508a2 834
f8220fe2
KW
835#define ckWARNregdep(loc,m) \
836 _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
837 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
d528642a
KW
838 WARN_REGEXP), \
839 m REPORT_LOCATION, \
f8220fe2 840 REPORT_LOCATION_ARGS(loc)))
ccb2c380 841
f8220fe2
KW
842#define ckWARN2reg_d(loc,m, a1) \
843 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
844 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
d528642a 845 m REPORT_LOCATION, \
f8220fe2 846 a1, REPORT_LOCATION_ARGS(loc)))
2335b3d3 847
f8220fe2
KW
848#define ckWARN2reg(loc, m, a1) \
849 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
850 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 851 m REPORT_LOCATION, \
f8220fe2 852 a1, REPORT_LOCATION_ARGS(loc)))
ccb2c380 853
f8220fe2
KW
854#define vWARN3(loc, m, a1, a2) \
855 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
856 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 857 m REPORT_LOCATION, \
f8220fe2 858 a1, a2, REPORT_LOCATION_ARGS(loc)))
ccb2c380 859
f8220fe2
KW
860#define ckWARN3reg(loc, m, a1, a2) \
861 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
862 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
863 m REPORT_LOCATION, \
864 a1, a2, \
f8220fe2 865 REPORT_LOCATION_ARGS(loc)))
668c081a 866
f8220fe2
KW
867#define vWARN4(loc, m, a1, a2, a3) \
868 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
869 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
870 m REPORT_LOCATION, \
871 a1, a2, a3, \
f8220fe2 872 REPORT_LOCATION_ARGS(loc)))
ccb2c380 873
f8220fe2
KW
874#define ckWARN4reg(loc, m, a1, a2, a3) \
875 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
876 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
877 m REPORT_LOCATION, \
878 a1, a2, a3, \
f8220fe2 879 REPORT_LOCATION_ARGS(loc)))
668c081a 880
f8220fe2
KW
881#define vWARN5(loc, m, a1, a2, a3, a4) \
882 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
883 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
884 m REPORT_LOCATION, \
885 a1, a2, a3, a4, \
f8220fe2 886 REPORT_LOCATION_ARGS(loc)))
9d1d55b5 887
1a322bb4
KW
888/* Convert between a pointer to a node and its offset from the beginning of the
889 * program */
890#define REGNODE_p(offset) (RExC_emit_start + (offset))
891#define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
892
538e84ed 893/* Macros for recording node offsets. 20001227 mjd@plover.com
fac92740
MJD
894 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
895 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
896 * Element 0 holds the number n.
07be1b83 897 * Position is 1 indexed.
fac92740 898 */
7122b237 899#ifndef RE_TRACK_PATTERN_OFFSETS
d7c442ce 900#define Set_Node_Offset_To_R(offset,byte)
7122b237
YO
901#define Set_Node_Offset(node,byte)
902#define Set_Cur_Node_Offset
903#define Set_Node_Length_To_R(node,len)
904#define Set_Node_Length(node,len)
6a86c6ad 905#define Set_Node_Cur_Length(node,start)
538e84ed
KW
906#define Node_Offset(n)
907#define Node_Length(n)
7122b237
YO
908#define Set_Node_Offset_Length(node,offset,len)
909#define ProgLen(ri) ri->u.proglen
910#define SetProgLen(ri,x) ri->u.proglen = x
911#else
912#define ProgLen(ri) ri->u.offsets[0]
913#define SetProgLen(ri,x) ri->u.offsets[0] = x
d7c442ce 914#define Set_Node_Offset_To_R(offset,byte) STMT_START { \
ccb2c380
MP
915 if (! SIZE_ONLY) { \
916 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
d7c442ce
KW
917 __LINE__, (int)(offset), (int)(byte))); \
918 if((offset) < 0) { \
538e84ed 919 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
d7c442ce 920 (int)(offset)); \
ccb2c380 921 } else { \
d7c442ce 922 RExC_offsets[2*(offset)-1] = (byte); \
ccb2c380
MP
923 } \
924 } \
925} STMT_END
926
88f063b4 927#define Set_Node_Offset(node,byte) \
1a322bb4 928 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
ccb2c380
MP
929#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
930
931#define Set_Node_Length_To_R(node,len) STMT_START { \
932 if (! SIZE_ONLY) { \
933 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 934 __LINE__, (int)(node), (int)(len))); \
ccb2c380 935 if((node) < 0) { \
538e84ed
KW
936 Perl_croak(aTHX_ "value of node is %d in Length macro", \
937 (int)(node)); \
ccb2c380
MP
938 } else { \
939 RExC_offsets[2*(node)] = (len); \
940 } \
941 } \
942} STMT_END
943
944#define Set_Node_Length(node,len) \
1a322bb4 945 Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
6a86c6ad
NC
946#define Set_Node_Cur_Length(node, start) \
947 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
948
949/* Get offsets and lengths */
1a322bb4
KW
950#define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
951#define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
fac92740 952
07be1b83 953#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
1a322bb4
KW
954 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \
955 Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \
07be1b83 956} STMT_END
7122b237 957#endif
07be1b83
YO
958
959#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
960#define EXPERIMENTAL_INPLACESCAN
f427392e 961#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 962
cb41e5d6
YO
963#ifdef DEBUGGING
964int
6ad9a8ab 965Perl_re_printf(pTHX_ const char *fmt, ...)
cb41e5d6
YO
966{
967 va_list ap;
968 int result;
969 PerlIO *f= Perl_debug_log;
970 PERL_ARGS_ASSERT_RE_PRINTF;
971 va_start(ap, fmt);
972 result = PerlIO_vprintf(f, fmt, ap);
973 va_end(ap);
974 return result;
975}
976
977int
7b031478 978Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
cb41e5d6
YO
979{
980 va_list ap;
981 int result;
982 PerlIO *f= Perl_debug_log;
983 PERL_ARGS_ASSERT_RE_INDENTF;
984 va_start(ap, depth);
daeb874b 985 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
cb41e5d6
YO
986 result = PerlIO_vprintf(f, fmt, ap);
987 va_end(ap);
988 return result;
989}
990#endif /* DEBUGGING */
991
7b031478 992#define DEBUG_RExC_seen() \
538e84ed 993 DEBUG_OPTIMISE_MORE_r({ \
88f063b4 994 Perl_re_printf( aTHX_ "RExC_seen: "); \
538e84ed 995 \
e384d5c1 996 if (RExC_seen & REG_ZERO_LEN_SEEN) \
88f063b4 997 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
538e84ed 998 \
e384d5c1 999 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
88f063b4 1000 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
538e84ed 1001 \
e384d5c1 1002 if (RExC_seen & REG_GPOS_SEEN) \
88f063b4 1003 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
538e84ed 1004 \
e384d5c1 1005 if (RExC_seen & REG_RECURSE_SEEN) \
88f063b4 1006 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
538e84ed 1007 \
7b031478 1008 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
88f063b4 1009 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
538e84ed 1010 \
e384d5c1 1011 if (RExC_seen & REG_VERBARG_SEEN) \
88f063b4 1012 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
538e84ed 1013 \
e384d5c1 1014 if (RExC_seen & REG_CUTGROUP_SEEN) \
88f063b4 1015 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
538e84ed 1016 \
e384d5c1 1017 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
88f063b4 1018 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
538e84ed 1019 \
e384d5c1 1020 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
88f063b4 1021 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
538e84ed 1022 \
7b031478 1023 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
88f063b4 1024 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
ee273784 1025 \
88f063b4 1026 Perl_re_printf( aTHX_ "\n"); \
9e9ecfdf
YO
1027 });
1028
fdfb4f21 1029#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
6ad9a8ab 1030 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
fdfb4f21 1031
fdfb4f21 1032
f5a36d78
DM
1033#ifdef DEBUGGING
1034static void
1035S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1036 const char *close_str)
1037{
1038 if (!flags)
1039 return;
1040
1041 Perl_re_printf( aTHX_ "%s", open_str);
11683ecb
DM
1042 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1043 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
f5a36d78
DM
1044 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1045 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1046 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1047 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1048 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1049 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1050 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1051 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1052 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1053 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1054 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1055 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1056 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1057 Perl_re_printf( aTHX_ "%s", close_str);
1058}
1059
1060
1061static void
1062S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1063 U32 depth, int is_inf)
1064{
1065 GET_RE_DEBUG_FLAGS_DECL;
1066
1067 DEBUG_OPTIMISE_MORE_r({
1068 if (!data)
1069 return;
1070 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1071 depth,
1072 where,
1073 (IV)data->pos_min,
1074 (IV)data->pos_delta,
1075 (UV)data->flags
1076 );
1077
1078 S_debug_show_study_flags(aTHX_ data->flags," [","]");
fdfb4f21 1079
f5a36d78
DM
1080 Perl_re_printf( aTHX_
1081 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1082 (IV)data->whilem_c,
1083 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1084 is_inf ? "INF " : ""
1085 );
1086
11683ecb 1087 if (data->last_found) {
37b6262f 1088 int i;
11683ecb
DM
1089 Perl_re_printf(aTHX_
1090 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1091 SvPVX_const(data->last_found),
1092 (IV)data->last_end,
1093 (IV)data->last_start_min,
1094 (IV)data->last_start_max
1095 );
1096
2099df82 1097 for (i = 0; i < 2; i++) {
37b6262f
DM
1098 Perl_re_printf(aTHX_
1099 " %s%s: '%s' @ %" IVdf "/%" IVdf,
2099df82
DM
1100 data->cur_is_floating == i ? "*" : "",
1101 i ? "Float" : "Fixed",
37b6262f
DM
1102 SvPVX_const(data->substrs[i].str),
1103 (IV)data->substrs[i].min_offset,
1104 (IV)data->substrs[i].max_offset
1105 );
1106 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1107 }
11683ecb 1108 }
f5a36d78
DM
1109
1110 Perl_re_printf( aTHX_ "\n");
1111 });
1112}
1113
1114
1115static void
1116S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1117 regnode *scan, U32 depth, U32 flags)
1118{
1119 GET_RE_DEBUG_FLAGS_DECL;
1120
1121 DEBUG_OPTIMISE_r({
1122 regnode *Next;
1123
1124 if (!scan)
1125 return;
1126 Next = regnext(scan);
1127 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1128 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1129 depth,
1130 str,
1131 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1132 Next ? (REG_NODE_NUM(Next)) : 0 );
1133 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1134 Perl_re_printf( aTHX_ "\n");
1135 });
1136}
1137
1138
1139# define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1140 S_debug_studydata(aTHX_ where, data, depth, is_inf)
1141
1142# define DEBUG_PEEP(str, scan, depth, flags) \
1143 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1144
1145#else
1146# define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1147# define DEBUG_PEEP(str, scan, depth, flags) NOOP
1148#endif
1de06328 1149
cb41e5d6 1150
c6871b76
KW
1151/* =========================================================
1152 * BEGIN edit_distance stuff.
1153 *
1154 * This calculates how many single character changes of any type are needed to
1155 * transform a string into another one. It is taken from version 3.1 of
1156 *
1157 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1158 */
1159
1160/* Our unsorted dictionary linked list. */
1161/* Note we use UVs, not chars. */
1162
1163struct dictionary{
1164 UV key;
1165 UV value;
1166 struct dictionary* next;
1167};
1168typedef struct dictionary item;
1169
1170
1171PERL_STATIC_INLINE item*
88f063b4 1172push(UV key, item* curr)
c6871b76
KW
1173{
1174 item* head;
d09f14bf 1175 Newx(head, 1, item);
c6871b76
KW
1176 head->key = key;
1177 head->value = 0;
1178 head->next = curr;
1179 return head;
1180}
1181
1182
1183PERL_STATIC_INLINE item*
1184find(item* head, UV key)
1185{
1186 item* iterator = head;
1187 while (iterator){
1188 if (iterator->key == key){
1189 return iterator;
1190 }
1191 iterator = iterator->next;
1192 }
1193
1194 return NULL;
1195}
1196
1197PERL_STATIC_INLINE item*
88f063b4 1198uniquePush(item* head, UV key)
c6871b76
KW
1199{
1200 item* iterator = head;
1201
1202 while (iterator){
1203 if (iterator->key == key) {
1204 return head;
1205 }
1206 iterator = iterator->next;
1207 }
1208
88f063b4 1209 return push(key, head);
c6871b76
KW
1210}
1211
1212PERL_STATIC_INLINE void
1213dict_free(item* head)
1214{
1215 item* iterator = head;
1216
1217 while (iterator) {
1218 item* temp = iterator;
1219 iterator = iterator->next;
1220 Safefree(temp);
1221 }
1222
1223 head = NULL;
1224}
1225
1226/* End of Dictionary Stuff */
1227
1228/* All calculations/work are done here */
1229STATIC int
1230S_edit_distance(const UV* src,
1231 const UV* tgt,
1232 const STRLEN x, /* length of src[] */
1233 const STRLEN y, /* length of tgt[] */
1234 const SSize_t maxDistance
1235)
1236{
1237 item *head = NULL;
88f063b4 1238 UV swapCount, swapScore, targetCharCount, i, j;
c6871b76
KW
1239 UV *scores;
1240 UV score_ceil = x + y;
1241
1242 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1243
1244 /* intialize matrix start values */
d09f14bf 1245 Newx(scores, ( (x + 2) * (y + 2)), UV);
c6871b76
KW
1246 scores[0] = score_ceil;
1247 scores[1 * (y + 2) + 0] = score_ceil;
1248 scores[0 * (y + 2) + 1] = score_ceil;
1249 scores[1 * (y + 2) + 1] = 0;
88f063b4 1250 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
c6871b76
KW
1251
1252 /* work loops */
1253 /* i = src index */
1254 /* j = tgt index */
1255 for (i=1;i<=x;i++) {
1256 if (i < x)
88f063b4 1257 head = uniquePush(head, src[i]);
c6871b76
KW
1258 scores[(i+1) * (y + 2) + 1] = i;
1259 scores[(i+1) * (y + 2) + 0] = score_ceil;
1260 swapCount = 0;
1261
1262 for (j=1;j<=y;j++) {
1263 if (i == 1) {
1264 if(j < y)
88f063b4 1265 head = uniquePush(head, tgt[j]);
c6871b76
KW
1266 scores[1 * (y + 2) + (j + 1)] = j;
1267 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1268 }
1269
88f063b4 1270 targetCharCount = find(head, tgt[j-1])->value;
c6871b76
KW
1271 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1272
1273 if (src[i-1] != tgt[j-1]){
1274 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1275 }
1276 else {
1277 swapCount = j;
1278 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1279 }
1280 }
1281
88f063b4 1282 find(head, src[i-1])->value = i;
c6871b76
KW
1283 }
1284
1285 {
1286 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1287 dict_free(head);
1288 Safefree(scores);
1289 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1290 }
1291}
1292
1293/* END of edit_distance() stuff
1294 * ========================================================= */
1295
8e35b056
KW
1296/* is c a control character for which we have a mnemonic? */
1297#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1298
549b4e78
KW
1299STATIC const char *
1300S_cntrl_to_mnemonic(const U8 c)
1301{
1302 /* Returns the mnemonic string that represents character 'c', if one
1303 * exists; NULL otherwise. The only ones that exist for the purposes of
1304 * this routine are a few control characters */
1305
1306 switch (c) {
1307 case '\a': return "\\a";
1308 case '\b': return "\\b";
1309 case ESC_NATIVE: return "\\e";
1310 case '\f': return "\\f";
1311 case '\n': return "\\n";
1312 case '\r': return "\\r";
1313 case '\t': return "\\t";
1314 }
1315
1316 return NULL;
1317}
1318
653099ff 1319/* Mark that we cannot extend a found fixed substring at this point.
37b6262f 1320 Update the longest found anchored substring or the longest found
653099ff
GS
1321 floating substrings if needed. */
1322
4327152a 1323STATIC void
ea3daa5d
FC
1324S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1325 SSize_t *minlenp, int is_inf)
c277df42 1326{
e1ec3a88 1327 const STRLEN l = CHR_SVLEN(data->last_found);
2099df82 1328 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
a2ca2017 1329 const STRLEN old_l = CHR_SVLEN(longest_sv);
1de06328 1330 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1331
7918f24d
NC
1332 PERL_ARGS_ASSERT_SCAN_COMMIT;
1333
c277df42 1334 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
2099df82 1335 const U8 i = data->cur_is_floating;
a2ca2017 1336 SvSetMagicSV(longest_sv, data->last_found);
37b6262f
DM
1337 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1338
2099df82
DM
1339 if (!i) /* fixed */
1340 data->substrs[0].max_offset = data->substrs[0].min_offset;
1341 else { /* float */
1342 data->substrs[1].max_offset = (l
646e8787
DM
1343 ? data->last_start_max
1344 : (data->pos_delta > SSize_t_MAX - data->pos_min
ea3daa5d
FC
1345 ? SSize_t_MAX
1346 : data->pos_min + data->pos_delta));
1347 if (is_inf
2099df82
DM
1348 || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1349 data->substrs[1].max_offset = SSize_t_MAX;
37b6262f 1350 }
11683ecb 1351
37b6262f
DM
1352 if (data->flags & SF_BEFORE_EOL)
1353 data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
1354 else
1355 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1356 data->substrs[i].minlenp = minlenp;
1357 data->substrs[i].lookbehind = 0;
c277df42 1358 }
37b6262f 1359
c277df42 1360 SvCUR_set(data->last_found, 0);
0eda9292 1361 {
a28509cc 1362 SV * const sv = data->last_found;
097eb12c
AL
1363 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1364 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1365 if (mg)
1366 mg->mg_len = 0;
1367 }
0eda9292 1368 }
c277df42
IZ
1369 data->last_end = -1;
1370 data->flags &= ~SF_BEFORE_EOL;
f5a36d78 1371 DEBUG_STUDYDATA("commit", data, 0, is_inf);
c277df42
IZ
1372}
1373
cdd87c1d
KW
1374/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1375 * list that describes which code points it matches */
1376
653099ff 1377STATIC void
3420edd7 1378S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 1379{
cdd87c1d
KW
1380 /* Set the SSC 'ssc' to match an empty string or any code point */
1381
557bd3fb 1382 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 1383
71068078 1384 assert(is_ANYOF_SYNTHETIC(ssc));
3fffb88a 1385
0854ea0b
KW
1386 /* mortalize so won't leak */
1387 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
93e92956 1388 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
653099ff
GS
1389}
1390
653099ff 1391STATIC int
dc3bf405 1392S_ssc_is_anything(const regnode_ssc *ssc)
653099ff 1393{
c144baaa
KW
1394 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1395 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1396 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1397 * in any way, so there's no point in using it */
cdd87c1d
KW
1398
1399 UV start, end;
1400 bool ret;
653099ff 1401
557bd3fb 1402 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 1403
71068078 1404 assert(is_ANYOF_SYNTHETIC(ssc));
cdd87c1d 1405
93e92956 1406 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
cdd87c1d
KW
1407 return FALSE;
1408 }
1409
1410 /* See if the list consists solely of the range 0 - Infinity */
1411 invlist_iterinit(ssc->invlist);
1412 ret = invlist_iternext(ssc->invlist, &start, &end)
1413 && start == 0
1414 && end == UV_MAX;
1415
1416 invlist_iterfinish(ssc->invlist);
1417
1418 if (ret) {
1419 return TRUE;
1420 }
1421
1422 /* If e.g., both \w and \W are set, matches everything */
e0e1be5f 1423 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1424 int i;
1425 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1426 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1427 return TRUE;
1428 }
1429 }
1430 }
1431
1432 return FALSE;
653099ff
GS
1433}
1434
653099ff 1435STATIC void
cdd87c1d 1436S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 1437{
cdd87c1d
KW
1438 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1439 * string, any code point, or any posix class under locale */
1440
557bd3fb 1441 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 1442
557bd3fb 1443 Zero(ssc, 1, regnode_ssc);
71068078 1444 set_ANYOF_SYNTHETIC(ssc);
93e92956 1445 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
3420edd7 1446 ssc_anything(ssc);
cdd87c1d 1447
2f306ab9
KW
1448 /* If any portion of the regex is to operate under locale rules that aren't
1449 * fully known at compile time, initialization includes it. The reason
1450 * this isn't done for all regexes is that the optimizer was written under
1451 * the assumption that locale was all-or-nothing. Given the complexity and
1452 * lack of documentation in the optimizer, and that there are inadequate
1453 * test cases for locale, many parts of it may not work properly, it is
1454 * safest to avoid locale unless necessary. */
cdd87c1d
KW
1455 if (RExC_contains_locale) {
1456 ANYOF_POSIXL_SETALL(ssc);
cdd87c1d
KW
1457 }
1458 else {
1459 ANYOF_POSIXL_ZERO(ssc);
1460 }
653099ff
GS
1461}
1462
b423522f 1463STATIC int
dc3bf405
BF
1464S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1465 const regnode_ssc *ssc)
b423522f
KW
1466{
1467 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1468 * to the list of code points matched, and locale posix classes; hence does
1469 * not check its flags) */
1470
1471 UV start, end;
1472 bool ret;
1473
1474 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1475
71068078 1476 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1477
1478 invlist_iterinit(ssc->invlist);
1479 ret = invlist_iternext(ssc->invlist, &start, &end)
1480 && start == 0
1481 && end == UV_MAX;
1482
1483 invlist_iterfinish(ssc->invlist);
1484
1485 if (! ret) {
1486 return FALSE;
1487 }
1488
e0e1be5f 1489 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
31f05a37 1490 return FALSE;
b423522f
KW
1491 }
1492
1493 return TRUE;
1494}
1495
1496STATIC SV*
1497S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
5c0f85ef 1498 const regnode_charclass* const node)
b423522f
KW
1499{
1500 /* Returns a mortal inversion list defining which code points are matched
1501 * by 'node', which is of type ANYOF. Handles complementing the result if
1502 * appropriate. If some code points aren't knowable at this time, the
31f05a37
KW
1503 * returned list must, and will, contain every code point that is a
1504 * possibility. */
b423522f 1505
e2506fa7 1506 SV* invlist = NULL;
1ee208c4 1507 SV* only_utf8_locale_invlist = NULL;
b423522f
KW
1508 unsigned int i;
1509 const U32 n = ARG(node);
31f05a37 1510 bool new_node_has_latin1 = FALSE;
b423522f
KW
1511
1512 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1513
1514 /* Look at the data structure created by S_set_ANYOF_arg() */
93e92956 1515 if (n != ANYOF_ONLY_HAS_BITMAP) {
b423522f
KW
1516 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1517 AV * const av = MUTABLE_AV(SvRV(rv));
1518 SV **const ary = AvARRAY(av);
1519 assert(RExC_rxi->data->what[n] == 's');
1520
1521 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
28118b9c 1522 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1]), NULL));
b423522f
KW
1523 }
1524 else if (ary[0] && ary[0] != &PL_sv_undef) {
1525
1526 /* Here, no compile-time swash, and there are things that won't be
1527 * known until runtime -- we have to assume it could be anything */
e2506fa7 1528 invlist = sv_2mortal(_new_invlist(1));
b423522f
KW
1529 return _add_range_to_invlist(invlist, 0, UV_MAX);
1530 }
1ee208c4 1531 else if (ary[3] && ary[3] != &PL_sv_undef) {
b423522f
KW
1532
1533 /* Here no compile-time swash, and no run-time only data. Use the
1534 * node's inversion list */
28118b9c 1535 invlist = sv_2mortal(invlist_clone(ary[3], NULL));
1ee208c4
KW
1536 }
1537
1538 /* Get the code points valid only under UTF-8 locales */
037715a6 1539 if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
1ee208c4
KW
1540 && ary[2] && ary[2] != &PL_sv_undef)
1541 {
1542 only_utf8_locale_invlist = ary[2];
b423522f
KW
1543 }
1544 }
1545
e2506fa7
KW
1546 if (! invlist) {
1547 invlist = sv_2mortal(_new_invlist(0));
1548 }
1549
dcb20b36
KW
1550 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1551 * code points, and an inversion list for the others, but if there are code
1552 * points that should match only conditionally on the target string being
1553 * UTF-8, those are placed in the inversion list, and not the bitmap.
1554 * Since there are circumstances under which they could match, they are
1555 * included in the SSC. But if the ANYOF node is to be inverted, we have
1556 * to exclude them here, so that when we invert below, the end result
1557 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1558 * have to do this here before we add the unconditionally matched code
1559 * points */
b423522f
KW
1560 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1561 _invlist_intersection_complement_2nd(invlist,
1562 PL_UpperLatin1,
1563 &invlist);
1564 }
1565
1566 /* Add in the points from the bit map */
dcb20b36 1567 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
b423522f 1568 if (ANYOF_BITMAP_TEST(node, i)) {
6f8848d5
TC
1569 unsigned int start = i++;
1570
1571 for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
1572 /* empty */
1573 }
1574 invlist = _add_range_to_invlist(invlist, start, i-1);
31f05a37 1575 new_node_has_latin1 = TRUE;
b423522f
KW
1576 }
1577 }
1578
1579 /* If this can match all upper Latin1 code points, have to add them
ac33c516
KW
1580 * as well. But don't add them if inverting, as when that gets done below,
1581 * it would exclude all these characters, including the ones it shouldn't
1582 * that were added just above */
1583 if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
f240c685
KW
1584 && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1585 {
b423522f
KW
1586 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1587 }
1588
1589 /* Similarly for these */
93e92956 1590 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
e0a1ff7a 1591 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
b423522f
KW
1592 }
1593
1594 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1595 _invlist_invert(invlist);
1596 }
037715a6 1597 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
31f05a37
KW
1598
1599 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1600 * locale. We can skip this if there are no 0-255 at all. */
1601 _invlist_union(invlist, PL_Latin1, &invlist);
1602 }
1603
1ee208c4
KW
1604 /* Similarly add the UTF-8 locale possible matches. These have to be
1605 * deferred until after the non-UTF-8 locale ones are taken care of just
1606 * above, or it leads to wrong results under ANYOF_INVERT */
1607 if (only_utf8_locale_invlist) {
31f05a37 1608 _invlist_union_maybe_complement_2nd(invlist,
1ee208c4 1609 only_utf8_locale_invlist,
31f05a37
KW
1610 ANYOF_FLAGS(node) & ANYOF_INVERT,
1611 &invlist);
1612 }
b423522f
KW
1613
1614 return invlist;
1615}
1616
1051e1c4 1617/* These two functions currently do the exact same thing */
557bd3fb 1618#define ssc_init_zero ssc_init
653099ff 1619
cdd87c1d
KW
1620#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1621#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1622
557bd3fb 1623/* 'AND' a given class with another one. Can create false positives. 'ssc'
93e92956
KW
1624 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1625 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1626
653099ff 1627STATIC void
b423522f 1628S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1629 const regnode_charclass *and_with)
653099ff 1630{
cdd87c1d
KW
1631 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1632 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1633
a0dd4231
KW
1634 SV* anded_cp_list;
1635 U8 anded_flags;
1e6ade67 1636
cdd87c1d 1637 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1638
71068078 1639 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1640
1641 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1642 * the code point inversion list and just the relevant flags */
71068078 1643 if (is_ANYOF_SYNTHETIC(and_with)) {
7dcac5f6 1644 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
a0dd4231 1645 anded_flags = ANYOF_FLAGS(and_with);
e9b08962
KW
1646
1647 /* XXX This is a kludge around what appears to be deficiencies in the
1648 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1649 * there are paths through the optimizer where it doesn't get weeded
1650 * out when it should. And if we don't make some extra provision for
1651 * it like the code just below, it doesn't get added when it should.
1652 * This solution is to add it only when AND'ing, which is here, and
1653 * only when what is being AND'ed is the pristine, original node
1654 * matching anything. Thus it is like adding it to ssc_anything() but
1655 * only when the result is to be AND'ed. Probably the same solution
1656 * could be adopted for the same problem we have with /l matching,
1657 * which is solved differently in S_ssc_init(), and that would lead to
1658 * fewer false positives than that solution has. But if this solution
1659 * creates bugs, the consequences are only that a warning isn't raised
1660 * that should be; while the consequences for having /l bugs is
1661 * incorrect matches */
7dcac5f6 1662 if (ssc_is_anything((regnode_ssc *)and_with)) {
f240c685 1663 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
e9b08962 1664 }
a0dd4231
KW
1665 }
1666 else {
5c0f85ef 1667 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
f240c685
KW
1668 if (OP(and_with) == ANYOFD) {
1669 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1670 }
1671 else {
1672 anded_flags = ANYOF_FLAGS(and_with)
1673 &( ANYOF_COMMON_FLAGS
108316fb
KW
1674 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1675 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
d1c40ef5
KW
1676 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
1677 anded_flags &=
1678 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1679 }
f240c685 1680 }
a0dd4231
KW
1681 }
1682
1683 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1684
1685 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1686 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1687 * 'and_with' may be inverted. When not inverted, we have the situation of
1688 * computing:
1689 * (C1 | P1) & (C2 | P2)
1690 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1691 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1692 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1693 * <= ((C1 & C2) | P1 | P2)
1694 * Alternatively, the last few steps could be:
1695 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1696 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1697 * <= (C1 | C2 | (P1 & P2))
1698 * We favor the second approach if either P1 or P2 is non-empty. This is
1699 * because these components are a barrier to doing optimizations, as what
1700 * they match cannot be known until the moment of matching as they are
1701 * dependent on the current locale, 'AND"ing them likely will reduce or
1702 * eliminate them.
1703 * But we can do better if we know that C1,P1 are in their initial state (a
1704 * frequent occurrence), each matching everything:
1705 * (<everything>) & (C2 | P2) = C2 | P2
1706 * Similarly, if C2,P2 are in their initial state (again a frequent
1707 * occurrence), the result is a no-op
1708 * (C1 | P1) & (<everything>) = C1 | P1
1709 *
1710 * Inverted, we have
1711 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1712 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1713 * <= (C1 & ~C2) | (P1 & ~P2)
1714 * */
1aa99e6b 1715
a0dd4231 1716 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
71068078 1717 && ! is_ANYOF_SYNTHETIC(and_with))
a0dd4231 1718 {
cdd87c1d 1719 unsigned int i;
8951c461 1720
cdd87c1d
KW
1721 ssc_intersection(ssc,
1722 anded_cp_list,
1723 FALSE /* Has already been inverted */
1724 );
c6b76537 1725
cdd87c1d
KW
1726 /* If either P1 or P2 is empty, the intersection will be also; can skip
1727 * the loop */
93e92956 1728 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
cdd87c1d
KW
1729 ANYOF_POSIXL_ZERO(ssc);
1730 }
e0e1be5f 1731 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1732
1733 /* Note that the Posix class component P from 'and_with' actually
1734 * looks like:
1735 * P = Pa | Pb | ... | Pn
1736 * where each component is one posix class, such as in [\w\s].
1737 * Thus
1738 * ~P = ~(Pa | Pb | ... | Pn)
1739 * = ~Pa & ~Pb & ... & ~Pn
1740 * <= ~Pa | ~Pb | ... | ~Pn
1741 * The last is something we can easily calculate, but unfortunately
1742 * is likely to have many false positives. We could do better
1743 * in some (but certainly not all) instances if two classes in
1744 * P have known relationships. For example
1745 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1746 * So
1747 * :lower: & :print: = :lower:
1748 * And similarly for classes that must be disjoint. For example,
1749 * since \s and \w can have no elements in common based on rules in
1750 * the POSIX standard,
1751 * \w & ^\S = nothing
1752 * Unfortunately, some vendor locales do not meet the Posix
1753 * standard, in particular almost everything by Microsoft.
1754 * The loop below just changes e.g., \w into \W and vice versa */
1755
1ee208c4 1756 regnode_charclass_posixl temp;
cdd87c1d
KW
1757 int add = 1; /* To calculate the index of the complement */
1758
b1234259 1759 Zero(&temp, 1, regnode_charclass_posixl);
cdd87c1d
KW
1760 ANYOF_POSIXL_ZERO(&temp);
1761 for (i = 0; i < ANYOF_MAX; i++) {
1762 assert(i % 2 != 0
7dcac5f6
KW
1763 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1764 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
cdd87c1d 1765
7dcac5f6 1766 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
cdd87c1d
KW
1767 ANYOF_POSIXL_SET(&temp, i + add);
1768 }
1769 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1770 }
1771 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1772
cdd87c1d
KW
1773 } /* else ssc already has no posixes */
1774 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1775 in its initial state */
71068078 1776 else if (! is_ANYOF_SYNTHETIC(and_with)
7dcac5f6 1777 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
cdd87c1d
KW
1778 {
1779 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1780 * copy it over 'ssc' */
1781 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
71068078 1782 if (is_ANYOF_SYNTHETIC(and_with)) {
cdd87c1d
KW
1783 StructCopy(and_with, ssc, regnode_ssc);
1784 }
1785 else {
1786 ssc->invlist = anded_cp_list;
1787 ANYOF_POSIXL_ZERO(ssc);
93e92956 1788 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1789 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
cdd87c1d
KW
1790 }
1791 }
1792 }
e0e1be5f 1793 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
93e92956 1794 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
cdd87c1d
KW
1795 {
1796 /* One or the other of P1, P2 is non-empty. */
93e92956 1797 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1ea8b7fe
KW
1798 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1799 }
cdd87c1d
KW
1800 ssc_union(ssc, anded_cp_list, FALSE);
1801 }
1802 else { /* P1 = P2 = empty */
1803 ssc_intersection(ssc, anded_cp_list, FALSE);
1804 }
137165a6 1805 }
653099ff
GS
1806}
1807
653099ff 1808STATIC void
cdd87c1d 1809S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1810 const regnode_charclass *or_with)
653099ff 1811{
cdd87c1d
KW
1812 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1813 * another SSC or a regular ANYOF class. Can create false positives if
1814 * 'or_with' is to be inverted. */
7918f24d 1815
a0dd4231
KW
1816 SV* ored_cp_list;
1817 U8 ored_flags;
c6b76537 1818
cdd87c1d 1819 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 1820
71068078 1821 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1822
1823 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1824 * the code point inversion list and just the relevant flags */
71068078 1825 if (is_ANYOF_SYNTHETIC(or_with)) {
7dcac5f6 1826 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
a0dd4231
KW
1827 ored_flags = ANYOF_FLAGS(or_with);
1828 }
1829 else {
5c0f85ef 1830 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
eff8b7dc 1831 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
f240c685
KW
1832 if (OP(or_with) != ANYOFD) {
1833 ored_flags
1834 |= ANYOF_FLAGS(or_with)
108316fb
KW
1835 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1836 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
d1c40ef5
KW
1837 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
1838 ored_flags |=
1839 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1840 }
f240c685 1841 }
a0dd4231
KW
1842 }
1843
1844 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
1845
1846 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1847 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1848 * 'or_with' may be inverted. When not inverted, we have the simple
1849 * situation of computing:
1850 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1851 * If P1|P2 yields a situation with both a class and its complement are
1852 * set, like having both \w and \W, this matches all code points, and we
1853 * can delete these from the P component of the ssc going forward. XXX We
1854 * might be able to delete all the P components, but I (khw) am not certain
1855 * about this, and it is better to be safe.
1856 *
1857 * Inverted, we have
1858 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1859 * <= (C1 | P1) | ~C2
1860 * <= (C1 | ~C2) | P1
1861 * (which results in actually simpler code than the non-inverted case)
1862 * */
9826f543 1863
a0dd4231 1864 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
71068078 1865 && ! is_ANYOF_SYNTHETIC(or_with))
a0dd4231 1866 {
cdd87c1d 1867 /* We ignore P2, leaving P1 going forward */
1ea8b7fe 1868 } /* else Not inverted */
93e92956 1869 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1870 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
e0e1be5f 1871 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1872 unsigned int i;
1873 for (i = 0; i < ANYOF_MAX; i += 2) {
1874 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1875 {
1876 ssc_match_all_cp(ssc);
1877 ANYOF_POSIXL_CLEAR(ssc, i);
1878 ANYOF_POSIXL_CLEAR(ssc, i+1);
cdd87c1d
KW
1879 }
1880 }
1881 }
1aa99e6b 1882 }
cdd87c1d
KW
1883
1884 ssc_union(ssc,
1885 ored_cp_list,
1886 FALSE /* Already has been inverted */
1887 );
653099ff
GS
1888}
1889
b423522f
KW
1890PERL_STATIC_INLINE void
1891S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1892{
1893 PERL_ARGS_ASSERT_SSC_UNION;
1894
71068078 1895 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1896
1897 _invlist_union_maybe_complement_2nd(ssc->invlist,
1898 invlist,
1899 invert2nd,
1900 &ssc->invlist);
1901}
1902
1903PERL_STATIC_INLINE void
1904S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1905 SV* const invlist,
1906 const bool invert2nd)
1907{
1908 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1909
71068078 1910 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1911
1912 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1913 invlist,
1914 invert2nd,
1915 &ssc->invlist);
1916}
1917
1918PERL_STATIC_INLINE void
1919S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1920{
1921 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1922
71068078 1923 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1924
1925 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1926}
1927
1928PERL_STATIC_INLINE void
1929S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1930{
1931 /* AND just the single code point 'cp' into the SSC 'ssc' */
1932
1933 SV* cp_list = _new_invlist(2);
1934
1935 PERL_ARGS_ASSERT_SSC_CP_AND;
1936
71068078 1937 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1938
1939 cp_list = add_cp_to_invlist(cp_list, cp);
1940 ssc_intersection(ssc, cp_list,
1941 FALSE /* Not inverted */
1942 );
1943 SvREFCNT_dec_NN(cp_list);
1944}
1945
1946PERL_STATIC_INLINE void
dc3bf405 1947S_ssc_clear_locale(regnode_ssc *ssc)
b423522f
KW
1948{
1949 /* Set the SSC 'ssc' to not match any locale things */
b423522f
KW
1950 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1951
71068078 1952 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1953
1954 ANYOF_POSIXL_ZERO(ssc);
1955 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1956}
1957
b35552de
KW
1958#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1959
1960STATIC bool
1961S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1962{
1963 /* The synthetic start class is used to hopefully quickly winnow down
1964 * places where a pattern could start a match in the target string. If it
1965 * doesn't really narrow things down that much, there isn't much point to
1966 * having the overhead of using it. This function uses some very crude
1967 * heuristics to decide if to use the ssc or not.
1968 *
1969 * It returns TRUE if 'ssc' rules out more than half what it considers to
1970 * be the "likely" possible matches, but of course it doesn't know what the
1971 * actual things being matched are going to be; these are only guesses
1972 *
1973 * For /l matches, it assumes that the only likely matches are going to be
1974 * in the 0-255 range, uniformly distributed, so half of that is 127
1975 * For /a and /d matches, it assumes that the likely matches will be just
1976 * the ASCII range, so half of that is 63
1977 * For /u and there isn't anything matching above the Latin1 range, it
1978 * assumes that that is the only range likely to be matched, and uses
1979 * half that as the cut-off: 127. If anything matches above Latin1,
1980 * it assumes that all of Unicode could match (uniformly), except for
1981 * non-Unicode code points and things in the General Category "Other"
1982 * (unassigned, private use, surrogates, controls and formats). This
1983 * is a much large number. */
1984
b35552de
KW
1985 U32 count = 0; /* Running total of number of code points matched by
1986 'ssc' */
1987 UV start, end; /* Start and end points of current range in inversion
1988 list */
72400949
KW
1989 const U32 max_code_points = (LOC)
1990 ? 256
1991 : (( ! UNI_SEMANTICS
1992 || invlist_highest(ssc->invlist) < 256)
1993 ? 128
1994 : NON_OTHER_COUNT);
1995 const U32 max_match = max_code_points / 2;
b35552de
KW
1996
1997 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1998
1999 invlist_iterinit(ssc->invlist);
2000 while (invlist_iternext(ssc->invlist, &start, &end)) {
72400949
KW
2001 if (start >= max_code_points) {
2002 break;
b35552de 2003 }
72400949 2004 end = MIN(end, max_code_points - 1);
b35552de 2005 count += end - start + 1;
72400949 2006 if (count >= max_match) {
b35552de
KW
2007 invlist_iterfinish(ssc->invlist);
2008 return FALSE;
2009 }
2010 }
2011
2012 return TRUE;
2013}
2014
2015
b423522f
KW
2016STATIC void
2017S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2018{
2019 /* The inversion list in the SSC is marked mortal; now we need a more
2020 * permanent copy, which is stored the same way that is done in a regular
dcb20b36
KW
2021 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2022 * map */
b423522f 2023
28118b9c 2024 SV* invlist = invlist_clone(ssc->invlist, NULL);
b423522f
KW
2025
2026 PERL_ARGS_ASSERT_SSC_FINALIZE;
2027
71068078 2028 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f 2029
a0dd4231 2030 /* The code in this file assumes that all but these flags aren't relevant
93e92956
KW
2031 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2032 * by the time we reach here */
f240c685
KW
2033 assert(! (ANYOF_FLAGS(ssc)
2034 & ~( ANYOF_COMMON_FLAGS
108316fb
KW
2035 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2036 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
a0dd4231 2037
b423522f
KW
2038 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2039
1ee208c4
KW
2040 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
2041 NULL, NULL, NULL, FALSE);
b423522f 2042
85c8e306
KW
2043 /* Make sure is clone-safe */
2044 ssc->invlist = NULL;
2045
e0e1be5f 2046 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
93e92956 2047 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
d156f5cb 2048 OP(ssc) = ANYOFPOSIXL;
e0e1be5f 2049 }
d156f5cb 2050 else if (RExC_contains_locale) {
b2e90ddf
KW
2051 OP(ssc) = ANYOFL;
2052 }
2053
1462525b 2054 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
b423522f
KW
2055}
2056
a3621e74
YO
2057#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2058#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2059#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
538e84ed
KW
2060#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2061 ? (TRIE_LIST_CUR( idx ) - 1) \
2062 : 0 )
a3621e74 2063
3dab1dad
YO
2064
2065#ifdef DEBUGGING
07be1b83 2066/*
2b8b4781
NC
2067 dump_trie(trie,widecharmap,revcharmap)
2068 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2069 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
2070
2071 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
2072 The _interim_ variants are used for debugging the interim
2073 tables that are used to generate the final compressed
2074 representation which is what dump_trie expects.
2075
486ec47a 2076 Part of the reason for their existence is to provide a form
3dab1dad 2077 of documentation as to how the different representations function.
07be1b83
YO
2078
2079*/
3dab1dad
YO
2080
2081/*
3dab1dad
YO
2082 Dumps the final compressed table form of the trie to Perl_debug_log.
2083 Used for debugging make_trie().
2084*/
b9a59e08 2085
3dab1dad 2086STATIC void
2b8b4781
NC
2087S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2088 AV *revcharmap, U32 depth)
3dab1dad
YO
2089{
2090 U32 state;
ab3bbdeb 2091 SV *sv=sv_newmortal();
55eed653 2092 int colwidth= widecharmap ? 6 : 4;
2e64971a 2093 U16 word;
3dab1dad
YO
2094 GET_RE_DEBUG_FLAGS_DECL;
2095
7918f24d 2096 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 2097
6ad9a8ab 2098 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
1e37780e 2099 depth+1, "Match","Base","Ofs" );
3dab1dad
YO
2100
2101 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 2102 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 2103 if ( tmp ) {
6ad9a8ab 2104 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2105 colwidth,
538e84ed 2106 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2107 PL_colors[0], PL_colors[1],
2108 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2109 PERL_PV_ESCAPE_FIRSTCHAR
2110 )
ab3bbdeb 2111 );
3dab1dad
YO
2112 }
2113 }
1e37780e
YO
2114 Perl_re_printf( aTHX_ "\n");
2115 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
3dab1dad
YO
2116
2117 for( state = 0 ; state < trie->uniquecharcount ; state++ )
6ad9a8ab
YO
2118 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2119 Perl_re_printf( aTHX_ "\n");
3dab1dad 2120
1e2e3d02 2121 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 2122 const U32 base = trie->states[ state ].trans.base;
3dab1dad 2123
147e3846 2124 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
3dab1dad
YO
2125
2126 if ( trie->states[ state ].wordnum ) {
1e37780e 2127 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
3dab1dad 2128 } else {
6ad9a8ab 2129 Perl_re_printf( aTHX_ "%6s", "" );
3dab1dad
YO
2130 }
2131
147e3846 2132 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
3dab1dad
YO
2133
2134 if ( base ) {
2135 U32 ofs = 0;
2136
2137 while( ( base + ofs < trie->uniquecharcount ) ||
2138 ( base + ofs - trie->uniquecharcount < trie->lasttrans
538e84ed
KW
2139 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2140 != state))
3dab1dad
YO
2141 ofs++;
2142
147e3846 2143 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
3dab1dad
YO
2144
2145 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
538e84ed
KW
2146 if ( ( base + ofs >= trie->uniquecharcount )
2147 && ( base + ofs - trie->uniquecharcount
2148 < trie->lasttrans )
2149 && trie->trans[ base + ofs
2150 - trie->uniquecharcount ].check == state )
3dab1dad 2151 {
147e3846 2152 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
1e37780e
YO
2153 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2154 );
3dab1dad 2155 } else {
88f063b4 2156 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
3dab1dad
YO
2157 }
2158 }
2159
6ad9a8ab 2160 Perl_re_printf( aTHX_ "]");
3dab1dad
YO
2161
2162 }
6ad9a8ab 2163 Perl_re_printf( aTHX_ "\n" );
3dab1dad 2164 }
6ad9a8ab 2165 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
cb41e5d6 2166 depth);
2e64971a 2167 for (word=1; word <= trie->wordcount; word++) {
6ad9a8ab 2168 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2e64971a
DM
2169 (int)word, (int)(trie->wordinfo[word].prev),
2170 (int)(trie->wordinfo[word].len));
2171 }
6ad9a8ab 2172 Perl_re_printf( aTHX_ "\n" );
538e84ed 2173}
3dab1dad 2174/*
3dab1dad 2175 Dumps a fully constructed but uncompressed trie in list form.
538e84ed 2176 List tries normally only are used for construction when the number of
3dab1dad
YO
2177 possible chars (trie->uniquecharcount) is very high.
2178 Used for debugging make_trie().
2179*/
2180STATIC void
55eed653 2181S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2182 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2183 U32 depth)
3dab1dad
YO
2184{
2185 U32 state;
ab3bbdeb 2186 SV *sv=sv_newmortal();
55eed653 2187 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2188 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2189
2190 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2191
3dab1dad 2192 /* print out the table precompression. */
6ad9a8ab 2193 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
cb41e5d6 2194 depth+1 );
6ad9a8ab 2195 Perl_re_indentf( aTHX_ "%s",
cb41e5d6 2196 depth+1, "------:-----+-----------------\n" );
538e84ed 2197
3dab1dad
YO
2198 for( state=1 ; state < next_alloc ; state ++ ) {
2199 U16 charid;
538e84ed 2200
147e3846 2201 Perl_re_indentf( aTHX_ " %4" UVXf " :",
cb41e5d6 2202 depth+1, (UV)state );
3dab1dad 2203 if ( ! trie->states[ state ].wordnum ) {
6ad9a8ab 2204 Perl_re_printf( aTHX_ "%5s| ","");
3dab1dad 2205 } else {
6ad9a8ab 2206 Perl_re_printf( aTHX_ "W%4x| ",
3dab1dad
YO
2207 trie->states[ state ].wordnum
2208 );
2209 }
2210 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
538e84ed 2211 SV ** const tmp = av_fetch( revcharmap,
88f063b4 2212 TRIE_LIST_ITEM(state, charid).forid, 0);
ab3bbdeb 2213 if ( tmp ) {
147e3846 2214 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
ab3bbdeb 2215 colwidth,
538e84ed
KW
2216 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2217 colwidth,
2218 PL_colors[0], PL_colors[1],
2219 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2220 | PERL_PV_ESCAPE_FIRSTCHAR
ab3bbdeb 2221 ) ,
88f063b4
KW
2222 TRIE_LIST_ITEM(state, charid).forid,
2223 (UV)TRIE_LIST_ITEM(state, charid).newstate
1e2e3d02 2224 );
538e84ed 2225 if (!(charid % 10))
6ad9a8ab 2226 Perl_re_printf( aTHX_ "\n%*s| ",
664e119d 2227 (int)((depth * 2) + 14), "");
1e2e3d02 2228 }
ab3bbdeb 2229 }
6ad9a8ab 2230 Perl_re_printf( aTHX_ "\n");
3dab1dad 2231 }
538e84ed 2232}
3dab1dad
YO
2233
2234/*
3dab1dad 2235 Dumps a fully constructed but uncompressed trie in table form.
538e84ed
KW
2236 This is the normal DFA style state transition table, with a few
2237 twists to facilitate compression later.
3dab1dad
YO
2238 Used for debugging make_trie().
2239*/
2240STATIC void
55eed653 2241S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2242 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2243 U32 depth)
3dab1dad
YO
2244{
2245 U32 state;
2246 U16 charid;
ab3bbdeb 2247 SV *sv=sv_newmortal();
55eed653 2248 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2249 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2250
2251 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
538e84ed 2252
3dab1dad
YO
2253 /*
2254 print out the table precompression so that we can do a visual check
2255 that they are identical.
2256 */
538e84ed 2257
6ad9a8ab 2258 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
3dab1dad
YO
2259
2260 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 2261 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 2262 if ( tmp ) {
6ad9a8ab 2263 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2264 colwidth,
538e84ed 2265 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2266 PL_colors[0], PL_colors[1],
2267 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2268 PERL_PV_ESCAPE_FIRSTCHAR
2269 )
ab3bbdeb 2270 );
3dab1dad
YO
2271 }
2272 }
2273
4aaafc03
YO
2274 Perl_re_printf( aTHX_ "\n");
2275 Perl_re_indentf( aTHX_ "State+-", depth+1 );
3dab1dad
YO
2276
2277 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
6ad9a8ab 2278 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
3dab1dad
YO
2279 }
2280
6ad9a8ab 2281 Perl_re_printf( aTHX_ "\n" );
3dab1dad
YO
2282
2283 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2284
147e3846 2285 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
cb41e5d6 2286 depth+1,
3dab1dad
YO
2287 (UV)TRIE_NODENUM( state ) );
2288
2289 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
2290 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2291 if (v)
147e3846 2292 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
ab3bbdeb 2293 else
6ad9a8ab 2294 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
3dab1dad
YO
2295 }
2296 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
147e3846 2297 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
538e84ed 2298 (UV)trie->trans[ state ].check );
3dab1dad 2299 } else {
147e3846 2300 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
538e84ed 2301 (UV)trie->trans[ state ].check,
3dab1dad
YO
2302 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2303 }
2304 }
07be1b83 2305}
3dab1dad
YO
2306
2307#endif
2308
2e64971a 2309
786e8c11
YO
2310/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2311 startbranch: the first branch in the whole branch sequence
2312 first : start branch of sequence of branch-exact nodes.
2313 May be the same as startbranch
2314 last : Thing following the last branch.
2315 May be the same as tail.
2316 tail : item following the branch sequence
2317 count : words in the sequence
a4525e78 2318 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
786e8c11 2319 depth : indent depth
3dab1dad 2320
786e8c11 2321Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 2322
786e8c11
YO
2323A trie is an N'ary tree where the branches are determined by digital
2324decomposition of the key. IE, at the root node you look up the 1st character and
2325follow that branch repeat until you find the end of the branches. Nodes can be
2326marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 2327
786e8c11 2328 /he|she|his|hers/
72f13be8 2329
786e8c11
YO
2330would convert into the following structure. Numbers represent states, letters
2331following numbers represent valid transitions on the letter from that state, if
2332the number is in square brackets it represents an accepting state, otherwise it
2333will be in parenthesis.
07be1b83 2334
786e8c11
YO
2335 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2336 | |
2337 | (2)
2338 | |
2339 (1) +-i->(6)-+-s->[7]
2340 |
2341 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 2342
786e8c11
YO
2343 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2344
2345This shows that when matching against the string 'hers' we will begin at state 1
2346read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2347then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2348is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2349single traverse. We store a mapping from accepting to state to which word was
2350matched, and then when we have multiple possibilities we try to complete the
b8fda935 2351rest of the regex in the order in which they occurred in the alternation.
786e8c11
YO
2352
2353The only prior NFA like behaviour that would be changed by the TRIE support is
2354the silent ignoring of duplicate alternations which are of the form:
2355
2356 / (DUPE|DUPE) X? (?{ ... }) Y /x
2357
4b714af6 2358Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 2359and without the optimisation. With the optimisations dupes will be silently
486ec47a 2360ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
2361the following demonstrates:
2362
2363 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2364
2365which prints out 'word' three times, but
2366
2367 'words'=~/(word|word|word)(?{ print $1 })S/
2368
2369which doesnt print it out at all. This is due to other optimisations kicking in.
2370
2371Example of what happens on a structural level:
2372
486ec47a 2373The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
2374
2375 1: CURLYM[1] {1,32767}(18)
2376 5: BRANCH(8)
2377 6: EXACT <ac>(16)
2378 8: BRANCH(11)
2379 9: EXACT <ad>(16)
2380 11: BRANCH(14)
2381 12: EXACT <ab>(16)
2382 16: SUCCEED(0)
2383 17: NOTHING(18)
2384 18: END(0)
2385
2386This would be optimizable with startbranch=5, first=5, last=16, tail=16
2387and should turn into:
2388
2389 1: CURLYM[1] {1,32767}(18)
2390 5: TRIE(16)
2391 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2392 <ac>
2393 <ad>
2394 <ab>
2395 16: SUCCEED(0)
2396 17: NOTHING(18)
2397 18: END(0)
2398
2399Cases where tail != last would be like /(?foo|bar)baz/:
2400
2401 1: BRANCH(4)
2402 2: EXACT <foo>(8)
2403 4: BRANCH(7)
2404 5: EXACT <bar>(8)
2405 7: TAIL(8)
2406 8: EXACT <baz>(10)
2407 10: END(0)
2408
2409which would be optimizable with startbranch=1, first=1, last=7, tail=8
2410and would end up looking like:
2411
2412 1: TRIE(8)
2413 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2414 <foo>
2415 <bar>
2416 7: TAIL(8)
2417 8: EXACT <baz>(10)
2418 10: END(0)
2419
c80e42f3 2420 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
2421
2422is the recommended Unicode-aware way of saying
2423
2424 *(d++) = uv;
2425*/
2426
fab2782b 2427#define TRIE_STORE_REVCHAR(val) \
786e8c11 2428 STMT_START { \
73031816 2429 if (UTF) { \
668fcfea 2430 SV *zlopp = newSV(UTF8_MAXBYTES); \
88c9ea1e 2431 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
c80e42f3 2432 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
73031816
NC
2433 SvCUR_set(zlopp, kapow - flrbbbbb); \
2434 SvPOK_on(zlopp); \
2435 SvUTF8_on(zlopp); \
2436 av_push(revcharmap, zlopp); \
2437 } else { \
fab2782b 2438 char ooooff = (char)val; \
73031816
NC
2439 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2440 } \
2441 } STMT_END
786e8c11 2442
914a25d5
KW
2443/* This gets the next character from the input, folding it if not already
2444 * folded. */
2445#define TRIE_READ_CHAR STMT_START { \
2446 wordlen++; \
2447 if ( UTF ) { \
2448 /* if it is UTF then it is either already folded, or does not need \
2449 * folding */ \
1c1d615a 2450 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
2451 } \
2452 else if (folder == PL_fold_latin1) { \
7d006b13
KW
2453 /* This folder implies Unicode rules, which in the range expressible \
2454 * by not UTF is the lower case, with the two exceptions, one of \
2455 * which should have been taken care of before calling this */ \
2456 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2457 uvc = toLOWER_L1(*uc); \
2458 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2459 len = 1; \
914a25d5
KW
2460 } else { \
2461 /* raw data, will be folded later if needed */ \
2462 uvc = (U32)*uc; \
2463 len = 1; \
2464 } \
786e8c11
YO
2465} STMT_END
2466
2467
2468
2469#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2470 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
00195859 2471 U32 ging = TRIE_LIST_LEN( state ) * 2; \
f9003953 2472 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
00195859 2473 TRIE_LIST_LEN( state ) = ging; \
786e8c11
YO
2474 } \
2475 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2476 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2477 TRIE_LIST_CUR( state )++; \
2478} STMT_END
07be1b83 2479
786e8c11 2480#define TRIE_LIST_NEW(state) STMT_START { \
d09f14bf 2481 Newx( trie->states[ state ].trans.list, \
786e8c11
YO
2482 4, reg_trie_trans_le ); \
2483 TRIE_LIST_CUR( state ) = 1; \
2484 TRIE_LIST_LEN( state ) = 4; \
2485} STMT_END
07be1b83 2486
786e8c11
YO
2487#define TRIE_HANDLE_WORD(state) STMT_START { \
2488 U16 dupe= trie->states[ state ].wordnum; \
2489 regnode * const noper_next = regnext( noper ); \
2490 \
786e8c11
YO
2491 DEBUG_r({ \
2492 /* store the word for dumping */ \
2493 SV* tmp; \
2494 if (OP(noper) != NOTHING) \
740cce10 2495 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 2496 else \
740cce10 2497 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 2498 av_push( trie_words, tmp ); \
786e8c11
YO
2499 }); \
2500 \
2501 curword++; \
2e64971a
DM
2502 trie->wordinfo[curword].prev = 0; \
2503 trie->wordinfo[curword].len = wordlen; \
2504 trie->wordinfo[curword].accept = state; \
786e8c11
YO
2505 \
2506 if ( noper_next < tail ) { \
2507 if (!trie->jump) \
538e84ed
KW
2508 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2509 sizeof(U16) ); \
7f69552c 2510 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
2511 if (!jumper) \
2512 jumper = noper_next; \
2513 if (!nextbranch) \
2514 nextbranch= regnext(cur); \
2515 } \
2516 \
2517 if ( dupe ) { \
2e64971a
DM
2518 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2519 /* chain, so that when the bits of chain are later */\
2520 /* linked together, the dups appear in the chain */\
2521 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2522 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
2523 } else { \
2524 /* we haven't inserted this word yet. */ \
2525 trie->states[ state ].wordnum = curword; \
2526 } \
2527} STMT_END
07be1b83 2528
3dab1dad 2529
786e8c11
YO
2530#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2531 ( ( base + charid >= ucharcount \
2532 && base + charid < ubound \
2533 && state == trie->trans[ base - ucharcount + charid ].check \
2534 && trie->trans[ base - ucharcount + charid ].next ) \
2535 ? trie->trans[ base - ucharcount + charid ].next \
2536 : ( state==1 ? special : 0 ) \
2537 )
3dab1dad 2538
8bcafbf4
YO
2539#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2540STMT_START { \
2541 TRIE_BITMAP_SET(trie, uvc); \
2542 /* store the folded codepoint */ \
2543 if ( folder ) \
2544 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2545 \
2546 if ( !UTF ) { \
2547 /* store first byte of utf8 representation of */ \
2548 /* variant codepoints */ \
2549 if (! UVCHR_IS_INVARIANT(uvc)) { \
2550 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2551 } \
2552 } \
2553} STMT_END
786e8c11
YO
2554#define MADE_TRIE 1
2555#define MADE_JUMP_TRIE 2
2556#define MADE_EXACT_TRIE 4
3dab1dad 2557
a3621e74 2558STATIC I32
538e84ed
KW
2559S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2560 regnode *first, regnode *last, regnode *tail,
2561 U32 word_count, U32 flags, U32 depth)
a3621e74
YO
2562{
2563 /* first pass, loop through and scan words */
2564 reg_trie_data *trie;
55eed653 2565 HV *widecharmap = NULL;
2b8b4781 2566 AV *revcharmap = newAV();
a3621e74 2567 regnode *cur;
a3621e74
YO
2568 STRLEN len = 0;
2569 UV uvc = 0;
2570 U16 curword = 0;
2571 U32 next_alloc = 0;
786e8c11
YO
2572 regnode *jumper = NULL;
2573 regnode *nextbranch = NULL;
7f69552c 2574 regnode *convert = NULL;
2e64971a 2575 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 2576 /* we just use folder as a flag in utf8 */
1e696034 2577 const U8 * folder = NULL;
a3621e74 2578
3a611511
YO
2579 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2580 * which stands for one trie structure, one hash, optionally followed
2581 * by two arrays */
2b8b4781 2582#ifdef DEBUGGING
3a611511 2583 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2b8b4781
NC
2584 AV *trie_words = NULL;
2585 /* along with revcharmap, this only used during construction but both are
2586 * useful during debugging so we store them in the struct when debugging.
8e11feef 2587 */
2b8b4781 2588#else
cf78de0b 2589 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 2590 STRLEN trie_charcount=0;
3dab1dad 2591#endif
2b8b4781 2592 SV *re_trie_maxbuff;
a3621e74 2593 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2594
2595 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
2596#ifndef DEBUGGING
2597 PERL_UNUSED_ARG(depth);
2598#endif
a3621e74 2599
1e696034 2600 switch (flags) {
a4525e78 2601 case EXACT: case EXACTL: break;
89829bb5 2602 case EXACTFAA:
fab2782b 2603 case EXACTFU_SS:
a4525e78
KW
2604 case EXACTFU:
2605 case EXACTFLU8: folder = PL_fold_latin1; break;
1e696034 2606 case EXACTF: folder = PL_fold; break;
fab2782b 2607 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
2608 }
2609
c944940b 2610 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 2611 trie->refcount = 1;
3dab1dad 2612 trie->startstate = 1;
786e8c11 2613 trie->wordcount = word_count;
f8fc2ecf 2614 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 2615 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
a4525e78 2616 if (flags == EXACT || flags == EXACTL)
c944940b 2617 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
2618 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2619 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2620
a3621e74 2621 DEBUG_r({
2b8b4781 2622 trie_words = newAV();
a3621e74 2623 });
a3621e74 2624
0111c4fd 2625 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
316ebaf2 2626 assert(re_trie_maxbuff);
a3621e74 2627 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 2628 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 2629 }
df826430 2630 DEBUG_TRIE_COMPILE_r({
6ad9a8ab 2631 Perl_re_indentf( aTHX_
cb41e5d6
YO
2632 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2633 depth+1,
88f063b4 2634 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
538e84ed 2635 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
3dab1dad 2636 });
538e84ed 2637
7f69552c
YO
2638 /* Find the node we are going to overwrite */
2639 if ( first == startbranch && OP( last ) != BRANCH ) {
2640 /* whole branch chain */
2641 convert = first;
2642 } else {
2643 /* branch sub-chain */
2644 convert = NEXTOPER( first );
2645 }
538e84ed 2646
a3621e74
YO
2647 /* -- First loop and Setup --
2648
2649 We first traverse the branches and scan each word to determine if it
2650 contains widechars, and how many unique chars there are, this is
2651 important as we have to build a table with at least as many columns as we
2652 have unique chars.
2653
2654 We use an array of integers to represent the character codes 0..255
538e84ed
KW
2655 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2656 the native representation of the character value as the key and IV's for
2657 the coded index.
a3621e74
YO
2658
2659 *TODO* If we keep track of how many times each character is used we can
2660 remap the columns so that the table compression later on is more
3b753521 2661 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
2662 middle and the least common are on the outside. IMO this would be better
2663 than a most to least common mapping as theres a decent chance the most
2664 common letter will share a node with the least common, meaning the node
486ec47a 2665 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
2666 case is when we have the least common nodes twice.
2667
2668 */
2669
a3621e74 2670 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 2671 regnode *noper = NEXTOPER( cur );
944e05e3
YO
2672 const U8 *uc;
2673 const U8 *e;
bc031a7d 2674 int foldlen = 0;
07be1b83 2675 U32 wordlen = 0; /* required init */
bc031a7d
KW
2676 STRLEN minchars = 0;
2677 STRLEN maxchars = 0;
538e84ed
KW
2678 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2679 bitmap?*/
a3621e74 2680
3dab1dad 2681 if (OP(noper) == NOTHING) {
20ed8c88
YO
2682 /* skip past a NOTHING at the start of an alternation
2683 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2684 */
df826430 2685 regnode *noper_next= regnext(noper);
944e05e3
YO
2686 if (noper_next < tail)
2687 noper= noper_next;
2688 }
2689
dca5fc4c
YO
2690 if ( noper < tail &&
2691 (
2692 OP(noper) == flags ||
2693 (
2694 flags == EXACTFU &&
2695 OP(noper) == EXACTFU_SS
2696 )
2697 )
2698 ) {
944e05e3
YO
2699 uc= (U8*)STRING(noper);
2700 e= uc + STR_LEN(noper);
2701 } else {
2702 trie->minlen= 0;
2703 continue;
3dab1dad 2704 }
df826430 2705
944e05e3 2706
fab2782b 2707 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2708 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2709 regardless of encoding */
fab2782b
YO
2710 if (OP( noper ) == EXACTFU_SS) {
2711 /* false positives are ok, so just set this */
0dc4a61d 2712 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2713 }
2714 }
dca5fc4c 2715
bc031a7d
KW
2716 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2717 branch */
3dab1dad 2718 TRIE_CHARCOUNT(trie)++;
a3621e74 2719 TRIE_READ_CHAR;
645de4ce 2720
bc031a7d
KW
2721 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2722 * is in effect. Under /i, this character can match itself, or
2723 * anything that folds to it. If not under /i, it can match just
2724 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2725 * all fold to k, and all are single characters. But some folds
2726 * expand to more than one character, so for example LATIN SMALL
2727 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2728 * the string beginning at 'uc' is 'ffi', it could be matched by
2729 * three characters, or just by the one ligature character. (It
2730 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2731 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2732 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2733 * match.) The trie needs to know the minimum and maximum number
2734 * of characters that could match so that it can use size alone to
2735 * quickly reject many match attempts. The max is simple: it is
2736 * the number of folded characters in this branch (since a fold is
2737 * never shorter than what folds to it. */
2738
2739 maxchars++;
2740
2741 /* And the min is equal to the max if not under /i (indicated by
2742 * 'folder' being NULL), or there are no multi-character folds. If
2743 * there is a multi-character fold, the min is incremented just
2744 * once, for the character that folds to the sequence. Each
2745 * character in the sequence needs to be added to the list below of
2746 * characters in the trie, but we count only the first towards the
2747 * min number of characters needed. This is done through the
2748 * variable 'foldlen', which is returned by the macros that look
2749 * for these sequences as the number of bytes the sequence
2750 * occupies. Each time through the loop, we decrement 'foldlen' by
2751 * how many bytes the current char occupies. Only when it reaches
2752 * 0 do we increment 'minchars' or look for another multi-character
2753 * sequence. */
2754 if (folder == NULL) {
2755 minchars++;
2756 }
2757 else if (foldlen > 0) {
2758 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
645de4ce
KW
2759 }
2760 else {
bc031a7d
KW
2761 minchars++;
2762
2763 /* See if *uc is the beginning of a multi-character fold. If
2764 * so, we decrement the length remaining to look at, to account
2765 * for the current character this iteration. (We can use 'uc'
2766 * instead of the fold returned by TRIE_READ_CHAR because for
2767 * non-UTF, the latin1_safe macro is smart enough to account
2768 * for all the unfolded characters, and because for UTF, the
2769 * string will already have been folded earlier in the
2770 * compilation process */
2771 if (UTF) {
2772 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2773 foldlen -= UTF8SKIP(uc);
645de4ce
KW
2774 }
2775 }
bc031a7d
KW
2776 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2777 foldlen--;
2778 }
645de4ce 2779 }
bc031a7d
KW
2780
2781 /* The current character (and any potential folds) should be added
2782 * to the possible matching characters for this position in this
2783 * branch */
a3621e74 2784 if ( uvc < 256 ) {
fab2782b
YO
2785 if ( folder ) {
2786 U8 folded= folder[ (U8) uvc ];
2787 if ( !trie->charmap[ folded ] ) {
2788 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2789 TRIE_STORE_REVCHAR( folded );
2790 }
2791 }
a3621e74
YO
2792 if ( !trie->charmap[ uvc ] ) {
2793 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2794 TRIE_STORE_REVCHAR( uvc );
a3621e74 2795 }
02daf0ab 2796 if ( set_bit ) {
62012aee
KW
2797 /* store the codepoint in the bitmap, and its folded
2798 * equivalent. */
8bcafbf4 2799 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
02daf0ab
YO
2800 set_bit = 0; /* We've done our bit :-) */
2801 }
a3621e74 2802 } else {
bc031a7d
KW
2803
2804 /* XXX We could come up with the list of code points that fold
2805 * to this using PL_utf8_foldclosures, except not for
2806 * multi-char folds, as there may be multiple combinations
2807 * there that could work, which needs to wait until runtime to
2808 * resolve (The comment about LIGATURE FFI above is such an
2809 * example */
2810
a3621e74 2811 SV** svpp;
55eed653
NC
2812 if ( !widecharmap )
2813 widecharmap = newHV();
a3621e74 2814
55eed653 2815 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
2816
2817 if ( !svpp )
147e3846 2818 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
a3621e74
YO
2819
2820 if ( !SvTRUE( *svpp ) ) {
2821 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 2822 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
2823 }
2824 }
bc031a7d
KW
2825 } /* end loop through characters in this branch of the trie */
2826
2827 /* We take the min and max for this branch and combine to find the min
2828 * and max for all branches processed so far */
3dab1dad 2829 if( cur == first ) {
bc031a7d
KW
2830 trie->minlen = minchars;
2831 trie->maxlen = maxchars;
2832 } else if (minchars < trie->minlen) {
2833 trie->minlen = minchars;
2834 } else if (maxchars > trie->maxlen) {
2835 trie->maxlen = maxchars;
fab2782b 2836 }
a3621e74
YO
2837 } /* end first pass */
2838 DEBUG_TRIE_COMPILE_r(
6ad9a8ab 2839 Perl_re_indentf( aTHX_
cb41e5d6
YO
2840 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2841 depth+1,
55eed653 2842 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
2843 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2844 (int)trie->minlen, (int)trie->maxlen )
a3621e74 2845 );
a3621e74
YO
2846
2847 /*
2848 We now know what we are dealing with in terms of unique chars and
2849 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
2850 representation using a flat table will take. If it's over a reasonable
2851 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
2852 conservative but potentially much slower representation using an array
2853 of lists.
2854
2855 At the end we convert both representations into the same compressed
2856 form that will be used in regexec.c for matching with. The latter
2857 is a form that cannot be used to construct with but has memory
2858 properties similar to the list form and access properties similar
2859 to the table form making it both suitable for fast searches and
2860 small enough that its feasable to store for the duration of a program.
2861
2862 See the comment in the code where the compressed table is produced
2863 inplace from the flat tabe representation for an explanation of how
2864 the compression works.
2865
2866 */
2867
2868
2e64971a
DM
2869 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2870 prev_states[1] = 0;
2871
538e84ed
KW
2872 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2873 > SvIV(re_trie_maxbuff) )
2874 {
a3621e74
YO
2875 /*
2876 Second Pass -- Array Of Lists Representation
2877
2878 Each state will be represented by a list of charid:state records
2879 (reg_trie_trans_le) the first such element holds the CUR and LEN
2880 points of the allocated array. (See defines above).
2881
2882 We build the initial structure using the lists, and then convert
2883 it into the compressed table form which allows faster lookups
2884 (but cant be modified once converted).
a3621e74
YO
2885 */
2886
a3621e74
YO
2887 STRLEN transcount = 1;
2888
6ad9a8ab 2889 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
cb41e5d6 2890 depth+1));
686b73d4 2891
c944940b
JH
2892 trie->states = (reg_trie_state *)
2893 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2894 sizeof(reg_trie_state) );
a3621e74
YO
2895 TRIE_LIST_NEW(1);
2896 next_alloc = 2;
2897
2898 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2899
df826430 2900 regnode *noper = NEXTOPER( cur );
c445ea15
AL
2901 U32 state = 1; /* required init */
2902 U16 charid = 0; /* sanity init */
07be1b83 2903 U32 wordlen = 0; /* required init */
c445ea15 2904
df826430
YO
2905 if (OP(noper) == NOTHING) {
2906 regnode *noper_next= regnext(noper);
944e05e3
YO
2907 if (noper_next < tail)
2908 noper= noper_next;
df826430
YO
2909 }
2910
944e05e3
YO
2911 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2912 const U8 *uc= (U8*)STRING(noper);
2913 const U8 *e= uc + STR_LEN(noper);
2914
786e8c11 2915 for ( ; uc < e ; uc += len ) {
c445ea15 2916
786e8c11 2917 TRIE_READ_CHAR;
c445ea15 2918
786e8c11
YO
2919 if ( uvc < 256 ) {
2920 charid = trie->charmap[ uvc ];
c445ea15 2921 } else {
538e84ed
KW
2922 SV** const svpp = hv_fetch( widecharmap,
2923 (char*)&uvc,
2924 sizeof( UV ),
2925 0);
786e8c11
YO
2926 if ( !svpp ) {
2927 charid = 0;
2928 } else {
2929 charid=(U16)SvIV( *svpp );
2930 }
c445ea15 2931 }
538e84ed
KW
2932 /* charid is now 0 if we dont know the char read, or
2933 * nonzero if we do */
786e8c11 2934 if ( charid ) {
a3621e74 2935
786e8c11
YO
2936 U16 check;
2937 U32 newstate = 0;
a3621e74 2938
786e8c11
YO
2939 charid--;
2940 if ( !trie->states[ state ].trans.list ) {
2941 TRIE_LIST_NEW( state );
c445ea15 2942 }
538e84ed
KW
2943 for ( check = 1;
2944 check <= TRIE_LIST_USED( state );
2945 check++ )
2946 {
2947 if ( TRIE_LIST_ITEM( state, check ).forid
2948 == charid )
2949 {
786e8c11
YO
2950 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2951 break;
2952 }
2953 }
2954 if ( ! newstate ) {
2955 newstate = next_alloc++;
2e64971a 2956 prev_states[newstate] = state;
786e8c11
YO
2957 TRIE_LIST_PUSH( state, charid, newstate );
2958 transcount++;
2959 }
2960 state = newstate;
2961 } else {
147e3846 2962 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
c445ea15 2963 }
a28509cc 2964 }
c445ea15 2965 }
3dab1dad 2966 TRIE_HANDLE_WORD(state);
a3621e74
YO
2967
2968 } /* end second pass */
2969
1e2e3d02 2970 /* next alloc is the NEXT state to be allocated */
538e84ed 2971 trie->statecount = next_alloc;
c944940b
JH
2972 trie->states = (reg_trie_state *)
2973 PerlMemShared_realloc( trie->states,
2974 next_alloc
2975 * sizeof(reg_trie_state) );
a3621e74 2976
3dab1dad 2977 /* and now dump it out before we compress it */
2b8b4781
NC
2978 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2979 revcharmap, next_alloc,
2980 depth+1)
1e2e3d02 2981 );
a3621e74 2982
c944940b
JH
2983 trie->trans = (reg_trie_trans *)
2984 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
2985 {
2986 U32 state;
a3621e74
YO
2987 U32 tp = 0;
2988 U32 zp = 0;
2989
2990
2991 for( state=1 ; state < next_alloc ; state ++ ) {
2992 U32 base=0;
2993
2994 /*
2995 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 2996 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
a3621e74
YO
2997 );
2998 */
2999
3000 if (trie->states[state].trans.list) {
3001 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3002 U16 maxid=minid;
a28509cc 3003 U16 idx;
a3621e74
YO
3004
3005 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
3006 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3007 if ( forid < minid ) {
3008 minid=forid;
3009 } else if ( forid > maxid ) {
3010 maxid=forid;
3011 }
a3621e74
YO
3012 }
3013 if ( transcount < tp + maxid - minid + 1) {
3014 transcount *= 2;
c944940b
JH
3015 trie->trans = (reg_trie_trans *)
3016 PerlMemShared_realloc( trie->trans,
446bd890
NC
3017 transcount
3018 * sizeof(reg_trie_trans) );
538e84ed
KW
3019 Zero( trie->trans + (transcount / 2),
3020 transcount / 2,
3021 reg_trie_trans );
a3621e74
YO
3022 }
3023 base = trie->uniquecharcount + tp - minid;
3024 if ( maxid == minid ) {
3025 U32 set = 0;
3026 for ( ; zp < tp ; zp++ ) {
3027 if ( ! trie->trans[ zp ].next ) {
3028 base = trie->uniquecharcount + zp - minid;
538e84ed
KW
3029 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3030 1).newstate;
a3621e74
YO
3031 trie->trans[ zp ].check = state;
3032 set = 1;
3033 break;
3034 }
3035 }
3036 if ( !set ) {
538e84ed
KW
3037 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3038 1).newstate;
a3621e74
YO
3039 trie->trans[ tp ].check = state;
3040 tp++;
3041 zp = tp;
3042 }
3043 } else {
3044 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
538e84ed
KW
3045 const U32 tid = base
3046 - trie->uniquecharcount
3047 + TRIE_LIST_ITEM( state, idx ).forid;
3048 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3049 idx ).newstate;
a3621e74
YO
3050 trie->trans[ tid ].check = state;
3051 }
3052 tp += ( maxid - minid + 1 );
3053 }
3054 Safefree(trie->states[ state ].trans.list);
3055 }
3056 /*
3057 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3058 Perl_re_printf( aTHX_ " base: %d\n",base);
a3621e74
YO
3059 );
3060 */
3061 trie->states[ state ].trans.base=base;
3062 }
cc601c31 3063 trie->lasttrans = tp + 1;
a3621e74
YO
3064 }
3065 } else {
3066 /*
3067 Second Pass -- Flat Table Representation.
3068
b423522f
KW
3069 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3070 each. We know that we will need Charcount+1 trans at most to store
3071 the data (one row per char at worst case) So we preallocate both
3072 structures assuming worst case.
a3621e74
YO
3073
3074 We then construct the trie using only the .next slots of the entry
3075 structs.
3076
b423522f
KW
3077 We use the .check field of the first entry of the node temporarily
3078 to make compression both faster and easier by keeping track of how
3079 many non zero fields are in the node.
a3621e74
YO
3080
3081 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3082 transition.
3083
b423522f
KW
3084 There are two terms at use here: state as a TRIE_NODEIDX() which is
3085 a number representing the first entry of the node, and state as a
3086 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3087 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3088 if there are 2 entrys per node. eg:
a3621e74
YO
3089
3090 A B A B
3091 1. 2 4 1. 3 7
3092 2. 0 3 3. 0 5
3093 3. 0 0 5. 0 0
3094 4. 0 0 7. 0 0
3095
b423522f
KW
3096 The table is internally in the right hand, idx form. However as we
3097 also have to deal with the states array which is indexed by nodenum
3098 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
3099
3100 */
6ad9a8ab 3101 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
cb41e5d6 3102 depth+1));
3dab1dad 3103
c944940b
JH
3104 trie->trans = (reg_trie_trans *)
3105 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3106 * trie->uniquecharcount + 1,
3107 sizeof(reg_trie_trans) );
3108 trie->states = (reg_trie_state *)
3109 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3110 sizeof(reg_trie_state) );
a3621e74
YO
3111 next_alloc = trie->uniquecharcount + 1;
3112
3dab1dad 3113
a3621e74
YO
3114 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3115
df826430 3116 regnode *noper = NEXTOPER( cur );
a3621e74
YO
3117
3118 U32 state = 1; /* required init */
3119
3120 U16 charid = 0; /* sanity init */
3121 U32 accept_state = 0; /* sanity init */
a3621e74 3122
07be1b83 3123 U32 wordlen = 0; /* required init */
a3621e74 3124
df826430
YO
3125 if (OP(noper) == NOTHING) {
3126 regnode *noper_next= regnext(noper);
944e05e3
YO
3127 if (noper_next < tail)
3128 noper= noper_next;
df826430 3129 }
fab2782b 3130
944e05e3
YO
3131 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
3132 const U8 *uc= (U8*)STRING(noper);
3133 const U8 *e= uc + STR_LEN(noper);
3134
786e8c11 3135 for ( ; uc < e ; uc += len ) {
a3621e74 3136
786e8c11 3137 TRIE_READ_CHAR;
a3621e74 3138
786e8c11
YO
3139 if ( uvc < 256 ) {
3140 charid = trie->charmap[ uvc ];
3141 } else {
538e84ed
KW
3142 SV* const * const svpp = hv_fetch( widecharmap,
3143 (char*)&uvc,
3144 sizeof( UV ),
3145 0);
786e8c11 3146 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 3147 }
786e8c11
YO
3148 if ( charid ) {
3149 charid--;
3150 if ( !trie->trans[ state + charid ].next ) {
3151 trie->trans[ state + charid ].next = next_alloc;
3152 trie->trans[ state ].check++;
2e64971a
DM
3153 prev_states[TRIE_NODENUM(next_alloc)]
3154 = TRIE_NODENUM(state);
786e8c11
YO
3155 next_alloc += trie->uniquecharcount;
3156 }
3157 state = trie->trans[ state + charid ].next;
3158 } else {
147e3846 3159 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
786e8c11 3160 }
538e84ed
KW
3161 /* charid is now 0 if we dont know the char read, or
3162 * nonzero if we do */
a3621e74 3163 }
a3621e74 3164 }
3dab1dad
YO
3165 accept_state = TRIE_NODENUM( state );
3166 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
3167
3168 } /* end second pass */
3169
3dab1dad 3170 /* and now dump it out before we compress it */
2b8b4781
NC
3171 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3172 revcharmap,
3173 next_alloc, depth+1));
a3621e74 3174
a3621e74
YO
3175 {
3176 /*
3177 * Inplace compress the table.*
3178
3179 For sparse data sets the table constructed by the trie algorithm will
3180 be mostly 0/FAIL transitions or to put it another way mostly empty.
3181 (Note that leaf nodes will not contain any transitions.)
3182
3183 This algorithm compresses the tables by eliminating most such
3184 transitions, at the cost of a modest bit of extra work during lookup:
3185
3186 - Each states[] entry contains a .base field which indicates the
3187 index in the state[] array wheres its transition data is stored.
3188
3b753521 3189 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
3190
3191 - If .base is nonzero then charid is added to it to find an entry in
3192 the trans array.
3193
3194 -If trans[states[state].base+charid].check!=state then the
3195 transition is taken to be a 0/Fail transition. Thus if there are fail
3196 transitions at the front of the node then the .base offset will point
3197 somewhere inside the previous nodes data (or maybe even into a node
3198 even earlier), but the .check field determines if the transition is
3199 valid.
3200
786e8c11 3201 XXX - wrong maybe?
a3621e74 3202 The following process inplace converts the table to the compressed
3b753521 3203 table: We first do not compress the root node 1,and mark all its
a3621e74 3204 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
3205 allows us to do a DFA construction from the compressed table later,
3206 and ensures that any .base pointers we calculate later are greater
3207 than 0.
a3621e74
YO
3208
3209 - We set 'pos' to indicate the first entry of the second node.
3210
3211 - We then iterate over the columns of the node, finding the first and
3212 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3213 and set the .check pointers accordingly, and advance pos
3214 appropriately and repreat for the next node. Note that when we copy
3215 the next pointers we have to convert them from the original
3216 NODEIDX form to NODENUM form as the former is not valid post
3217 compression.
3218
3219 - If a node has no transitions used we mark its base as 0 and do not
3220 advance the pos pointer.
3221
3222 - If a node only has one transition we use a second pointer into the
3223 structure to fill in allocated fail transitions from other states.
3224 This pointer is independent of the main pointer and scans forward
3225 looking for null transitions that are allocated to a state. When it
3226 finds one it writes the single transition into the "hole". If the
786e8c11 3227 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
3228
3229 - Once compressed we can Renew/realloc the structures to release the
3230 excess space.
3231
3232 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3233 specifically Fig 3.47 and the associated pseudocode.
3234
3235 demq
3236 */
a3b680e6 3237 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 3238 U32 state, charid;
a3621e74 3239 U32 pos = 0, zp=0;
1e2e3d02 3240 trie->statecount = laststate;
a3621e74
YO
3241
3242 for ( state = 1 ; state < laststate ; state++ ) {
3243 U8 flag = 0;
a28509cc
AL
3244 const U32 stateidx = TRIE_NODEIDX( state );
3245 const U32 o_used = trie->trans[ stateidx ].check;
3246 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
3247 trie->trans[ stateidx ].check = 0;
3248
538e84ed
KW
3249 for ( charid = 0;
3250 used && charid < trie->uniquecharcount;
3251 charid++ )
3252 {
a3621e74
YO
3253 if ( flag || trie->trans[ stateidx + charid ].next ) {
3254 if ( trie->trans[ stateidx + charid ].next ) {
3255 if (o_used == 1) {
3256 for ( ; zp < pos ; zp++ ) {
3257 if ( ! trie->trans[ zp ].next ) {
3258 break;
3259 }
3260 }
538e84ed
KW
3261 trie->states[ state ].trans.base
3262 = zp
3263 + trie->uniquecharcount
3264 - charid ;
3265 trie->trans[ zp ].next
3266 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3267 + charid ].next );
a3621e74
YO
3268 trie->trans[ zp ].check = state;
3269 if ( ++zp > pos ) pos = zp;
3270 break;
3271 }
3272 used--;
3273 }
3274 if ( !flag ) {
3275 flag = 1;
538e84ed
KW
3276 trie->states[ state ].trans.base
3277 = pos + trie->uniquecharcount - charid ;
a3621e74 3278 }
538e84ed
KW
3279 trie->trans[ pos ].next
3280 = SAFE_TRIE_NODENUM(
3281 trie->trans[ stateidx + charid ].next );
a3621e74
YO
3282 trie->trans[ pos ].check = state;
3283 pos++;
3284 }
3285 }
3286 }
cc601c31 3287 trie->lasttrans = pos + 1;
c944940b
JH
3288 trie->states = (reg_trie_state *)
3289 PerlMemShared_realloc( trie->states, laststate
3290 * sizeof(reg_trie_state) );
a3621e74 3291 DEBUG_TRIE_COMPILE_MORE_r(
147e3846 3292 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
cb41e5d6 3293 depth+1,
538e84ed
KW
3294 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3295 + 1 ),
3296 (IV)next_alloc,
3297 (IV)pos,
3298 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
a3621e74
YO
3299 );
3300
3301 } /* end table compress */
3302 }
1e2e3d02 3303 DEBUG_TRIE_COMPILE_MORE_r(
147e3846 3304 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
cb41e5d6 3305 depth+1,
1e2e3d02
YO
3306 (UV)trie->statecount,
3307 (UV)trie->lasttrans)
3308 );
cc601c31 3309 /* resize the trans array to remove unused space */
c944940b
JH
3310 trie->trans = (reg_trie_trans *)
3311 PerlMemShared_realloc( trie->trans, trie->lasttrans
3312 * sizeof(reg_trie_trans) );
a3621e74 3313
538e84ed 3314 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
3315 U8 nodetype =(U8)(flags & 0xFF);
3316 char *str=NULL;
538e84ed 3317
07be1b83 3318#ifdef DEBUGGING
e62cc96a 3319 regnode *optimize = NULL;
7122b237
YO
3320#ifdef RE_TRACK_PATTERN_OFFSETS
3321
b57a0404
JH
3322 U32 mjd_offset = 0;
3323 U32 mjd_nodelen = 0;
7122b237
YO
3324#endif /* RE_TRACK_PATTERN_OFFSETS */
3325#endif /* DEBUGGING */
a3621e74 3326 /*
3dab1dad
YO
3327 This means we convert either the first branch or the first Exact,
3328 depending on whether the thing following (in 'last') is a branch
3329 or not and whther first is the startbranch (ie is it a sub part of
3330 the alternation or is it the whole thing.)
3b753521 3331 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 3332 the whole branch sequence, including the first.
a3621e74 3333 */
3dab1dad 3334 /* Find the node we are going to overwrite */
7f69552c 3335 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 3336 /* branch sub-chain */
3dab1dad 3337 NEXT_OFF( first ) = (U16)(last - first);
7122b237 3338#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
3339 DEBUG_r({
3340 mjd_offset= Node_Offset((convert));
3341 mjd_nodelen= Node_Length((convert));
3342 });
7122b237 3343#endif
7f69552c 3344 /* whole branch chain */
7122b237
YO
3345 }
3346#ifdef RE_TRACK_PATTERN_OFFSETS
3347 else {
7f69552c
YO
3348 DEBUG_r({
3349 const regnode *nop = NEXTOPER( convert );
3350 mjd_offset= Node_Offset((nop));
3351 mjd_nodelen= Node_Length((nop));
3352 });
07be1b83
YO
3353 }
3354 DEBUG_OPTIMISE_r(
147e3846 3355 Perl_re_indentf( aTHX_ "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
cb41e5d6 3356 depth+1,
786e8c11 3357 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 3358 );
7122b237 3359#endif
538e84ed 3360 /* But first we check to see if there is a common prefix we can
3dab1dad
YO
3361 split out as an EXACT and put in front of the TRIE node. */
3362 trie->startstate= 1;
55eed653 3363 if ( trie->bitmap && !widecharmap && !trie->jump ) {
5ee57374
YO
3364 /* we want to find the first state that has more than
3365 * one transition, if that state is not the first state
3366 * then we have a common prefix which we can remove.
3367 */
3dab1dad 3368 U32 state;
1e2e3d02 3369 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 3370 U32 ofs = 0;
ee0dfd0b
YO
3371 I32 first_ofs = -1; /* keeps track of the ofs of the first
3372 transition, -1 means none */
8e11feef
RGS
3373 U32 count = 0;
3374 const U32 base = trie->states[ state ].trans.base;
a3621e74 3375
5ee57374 3376 /* does this state terminate an alternation? */
3dab1dad 3377 if ( trie->states[state].wordnum )
8e11feef 3378 count = 1;
a3621e74 3379
8e11feef 3380 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
3381 if ( ( base + ofs >= trie->uniquecharcount ) &&
3382 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
3383 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3384 {
3dab1dad 3385 if ( ++count > 1 ) {
5ee57374 3386 /* we have more than one transition */
d3d91c7c
YO
3387 SV **tmp;
3388 U8 *ch;
5ee57374
YO
3389 /* if this is the first state there is no common prefix
3390 * to extract, so we can exit */
8e11feef 3391 if ( state == 1 ) break;
d3d91c7c
YO
3392 tmp = av_fetch( revcharmap, ofs, 0);
3393 ch = (U8*)SvPV_nolen_const( *tmp );
3394
5ee57374
YO
3395 /* if we are on count 2 then we need to initialize the
3396 * bitmap, and store the previous char if there was one
3397 * in it*/
3dab1dad 3398 if ( count == 2 ) {
5ee57374 3399 /* clear the bitmap */
3dab1dad
YO
3400 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3401 DEBUG_OPTIMISE_r(
147e3846 3402 Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [",
cb41e5d6 3403 depth+1,
786e8c11 3404 (UV)state));
ee0dfd0b
YO
3405 if (first_ofs >= 0) {
3406 SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
be8e71aa 3407 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 3408
88f063b4 3409 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3dab1dad 3410 DEBUG_OPTIMISE_r(
6ad9a8ab 3411 Perl_re_printf( aTHX_ "%s", (char*)ch)
3dab1dad 3412 );
8e11feef
RGS
3413 }
3414 }
8bcafbf4 3415 /* store the current firstchar in the bitmap */
88f063b4 3416 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
6ad9a8ab 3417 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
8e11feef 3418 }
ee0dfd0b 3419 first_ofs = ofs;
8e11feef 3420 }
3dab1dad
YO
3421 }
3422 if ( count == 1 ) {
5ee57374
YO
3423 /* This state has only one transition, its transition is part
3424 * of a common prefix - we need to concatenate the char it
3425 * represents to what we have so far. */
ee0dfd0b 3426 SV **tmp = av_fetch( revcharmap, first_ofs, 0);
c490c714
YO
3427 STRLEN len;
3428 char *ch = SvPV( *tmp, len );
de734bd5
A
3429 DEBUG_OPTIMISE_r({
3430 SV *sv=sv_newmortal();
147e3846 3431 Perl_re_indentf( aTHX_ "Prefix State: %" UVuf " Ofs:%" UVuf " Char='%s'\n",
cb41e5d6 3432 depth+1,