This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Add macro for warning experimental features
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
afda64e8 84EXTERN_C const struct regexp_engine my_reg_engine;
54df2634
NC
85#else
86# include "regcomp.h"
87#endif
a687059c 88
f7e03a10 89#include "dquote_inline.h"
b992490d 90#include "invlist_inline.h"
1b0f46bf 91#include "unicode_constants.h"
04e98a4d 92
538e84ed
KW
93#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
94 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
f12c0118
KW
95#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
96 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 97#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 98#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 99
a687059c
LW
100#ifndef STATIC
101#define STATIC static
102#endif
103
3f910e62
YO
104/* this is a chain of data about sub patterns we are processing that
105 need to be handled separately/specially in study_chunk. Its so
106 we can simulate recursion without losing state. */
107struct scan_frame;
108typedef struct scan_frame {
109 regnode *last_regnode; /* last node to process in this frame */
110 regnode *next_regnode; /* next node to process when last is reached */
111 U32 prev_recursed_depth;
112 I32 stopparen; /* what stopparen do we use */
3f910e62
YO
113
114 struct scan_frame *this_prev_frame; /* this previous frame */
115 struct scan_frame *prev_frame; /* previous frame */
116 struct scan_frame *next_frame; /* next frame */
117} scan_frame;
118
f4ae5a27
KW
119/* Certain characters are output as a sequence with the first being a
120 * backslash. */
eb7486de 121#define isBACKSLASHED_PUNCT(c) strchr("-[]\\^", c)
f4ae5a27
KW
122
123
09b2b2e6 124struct RExC_state_t {
514a91f1
DM
125 U32 flags; /* RXf_* are we folding, multilining? */
126 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 127 char *precomp; /* uncompiled string. */
711b303b 128 char *precomp_end; /* pointer to end of uncompiled string. */
288b8c02 129 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf 130 regexp *rx; /* perl core regexp structure */
538e84ed
KW
131 regexp_internal *rxi; /* internal data for regexp object
132 pprivate field */
fac92740 133 char *start; /* Start of input for compile */
830247a4
IZ
134 char *end; /* End of input for compile */
135 char *parse; /* Input-scan pointer. */
51684e3c
KW
136 char *copy_start; /* start of copy of input within
137 constructed parse string */
138 char *copy_start_in_input; /* Position in input string
139 corresponding to copy_start */
ea3daa5d 140 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 141 regnode *emit_start; /* Start of emitted-code area */
538e84ed
KW
142 regnode *emit_bound; /* First regnode outside of the
143 allocated space */
f55b7b26 144 regnode_offset emit; /* Code-emit pointer */
830247a4
IZ
145 I32 naughty; /* How bad is this pattern? */
146 I32 sawback; /* Did we see \1, ...? */
147 U32 seen;
88f063b4
KW
148 SSize_t size; /* Number of regnode equivalents in
149 pattern */
e7f023ff 150 I32 npar; /* Capture buffer count, (OPEN) plus
538e84ed
KW
151 one. ("par" 0 is the whole
152 pattern)*/
153 I32 nestroot; /* root parens we are in - used by
154 accept */
830247a4
IZ
155 I32 extralen;
156 I32 seen_zerolen;
f55b7b26
KW
157 regnode_offset *open_parens; /* offsets to open parens */
158 regnode_offset *close_parens; /* offsets to close parens */
d5a00e4a 159 regnode *end_op; /* END node in program */
02daf0ab
YO
160 I32 utf8; /* whether the pattern is utf8 or not */
161 I32 orig_utf8; /* whether the pattern was originally in utf8 */
162 /* XXX use this for future optimisation of case
163 * where pattern must be upgraded to utf8. */
e40e74fe
KW
164 I32 uni_semantics; /* If a d charset modifier should use unicode
165 rules, even if the pattern is not in
166 utf8 */
81714fb9 167 HV *paren_names; /* Paren names */
538e84ed 168
40d049e4 169 regnode **recurse; /* Recurse regops */
88f063b4 170 I32 recurse_count; /* Number of recurse regops we have generated */
4286711a 171 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
538e84ed 172 through */
09a65838 173 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 174 I32 in_lookbehind;
4624b182 175 I32 contains_locale;
bb3f3ed2 176 I32 override_recoding;
b6d67071
KW
177#ifdef EBCDIC
178 I32 recode_x_to_native;
179#endif
9d53c457 180 I32 in_multi_char_class;
1acab4c5 181 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
68e2671b 182 within pattern */
b1603ef8 183 int code_index; /* next code_blocks[] slot */
ee273784 184 SSize_t maxlen; /* mininum possible number of chars in string to match */
3f910e62
YO
185 scan_frame *frame_head;
186 scan_frame *frame_last;
187 U32 frame_count;
7eec73eb 188 AV *warn_text;
dc6d7f5c 189#ifdef ADD_TO_REGEXEC
830247a4
IZ
190 char *starttry; /* -Dr: where regtry was called. */
191#define RExC_starttry (pRExC_state->starttry)
192#endif
d24ca0c5 193 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 194#ifdef DEBUGGING
be8e71aa 195 const char *lastparse;
3dab1dad 196 I32 lastnum;
1f1031fe 197 AV *paren_name_list; /* idx -> name */
d9a72fcc 198 U32 study_chunk_recursed_count;
c9f0d54c
YO
199 SV *mysv1;
200 SV *mysv2;
88f063b4 201
3dab1dad
YO
202#define RExC_lastparse (pRExC_state->lastparse)
203#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 204#define RExC_paren_name_list (pRExC_state->paren_name_list)
d9a72fcc 205#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
c9f0d54c
YO
206#define RExC_mysv (pRExC_state->mysv1)
207#define RExC_mysv1 (pRExC_state->mysv1)
208#define RExC_mysv2 (pRExC_state->mysv2)
209
3dab1dad 210#endif
512c0f5a 211 bool seen_unfolded_sharp_s;
911bd04e 212 bool strict;
da7cf1cc 213 bool study_started;
034602eb 214 bool in_script_run;
f55b7b26 215 bool pass1;
09b2b2e6 216};
830247a4 217
e2509266 218#define RExC_flags (pRExC_state->flags)
514a91f1 219#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 220#define RExC_precomp (pRExC_state->precomp)
51684e3c
KW
221#define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
222#define RExC_copy_start_in_constructed (pRExC_state->copy_start)
711b303b 223#define RExC_precomp_end (pRExC_state->precomp_end)
288b8c02 224#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 225#define RExC_rx (pRExC_state->rx)
f8fc2ecf 226#define RExC_rxi (pRExC_state->rxi)
fac92740 227#define RExC_start (pRExC_state->start)
830247a4
IZ
228#define RExC_end (pRExC_state->end)
229#define RExC_parse (pRExC_state->parse)
230#define RExC_whilem_seen (pRExC_state->whilem_seen)
512c0f5a
KW
231
232/* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
233 * EXACTF node, hence was parsed under /di rules. If later in the parse,
234 * something forces the pattern into using /ui rules, the sharp s should be
235 * folded into the sequence 'ss', which takes up more space than previously
236 * calculated. This means that the sizing pass needs to be restarted. (The
237 * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node
238 * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
239 * so there is no need to resize [perl #125990]. */
240#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
241
7122b237 242#ifdef RE_TRACK_PATTERN_OFFSETS
44f36cd3 243# define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
538e84ed 244 others */
7122b237 245#endif
830247a4 246#define RExC_emit (pRExC_state->emit)
f55b7b26 247#define RExC_pass1 (pRExC_state->pass1)
fac92740 248#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 249#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
250#define RExC_sawback (pRExC_state->sawback)
251#define RExC_seen (pRExC_state->seen)
252#define RExC_size (pRExC_state->size)
ee273784 253#define RExC_maxlen (pRExC_state->maxlen)
830247a4 254#define RExC_npar (pRExC_state->npar)
e2e6a0f1 255#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
256#define RExC_extralen (pRExC_state->extralen)
257#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 258#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 259#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 260#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
261#define RExC_open_parens (pRExC_state->open_parens)
262#define RExC_close_parens (pRExC_state->close_parens)
5bd2d46e 263#define RExC_end_op (pRExC_state->end_op)
81714fb9 264#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
265#define RExC_recurse (pRExC_state->recurse)
266#define RExC_recurse_count (pRExC_state->recurse_count)
09a65838 267#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
538e84ed
KW
268#define RExC_study_chunk_recursed_bytes \
269 (pRExC_state->study_chunk_recursed_bytes)
b57e4118 270#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 271#define RExC_contains_locale (pRExC_state->contains_locale)
b6d67071
KW
272#ifdef EBCDIC
273# define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
274#endif
9d53c457 275#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
3f910e62
YO
276#define RExC_frame_head (pRExC_state->frame_head)
277#define RExC_frame_last (pRExC_state->frame_last)
278#define RExC_frame_count (pRExC_state->frame_count)
67cdf558 279#define RExC_strict (pRExC_state->strict)
da7cf1cc 280#define RExC_study_started (pRExC_state->study_started)
7eec73eb 281#define RExC_warn_text (pRExC_state->warn_text)
034602eb 282#define RExC_in_script_run (pRExC_state->in_script_run)
830247a4 283
99807a43
HS
284/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
285 * a flag to disable back-off on the fixed/floating substrings - if it's
286 * a high complexity pattern we assume the benefit of avoiding a full match
287 * is worth the cost of checking for the substrings even if they rarely help.
288 */
289#define RExC_naughty (pRExC_state->naughty)
290#define TOO_NAUGHTY (10)
291#define MARK_NAUGHTY(add) \
292 if (RExC_naughty < TOO_NAUGHTY) \
293 RExC_naughty += (add)
294#define MARK_NAUGHTY_EXP(exp, add) \
295 if (RExC_naughty < TOO_NAUGHTY) \
296 RExC_naughty += RExC_naughty / (exp) + (add)
cde0cee5 297
a687059c
LW
298#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
299#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
412f55bb 300 ((*s) == '{' && regcurly(s)))
a687059c
LW
301
302/*
303 * Flags to be passed up and down.
304 */
a687059c 305#define WORST 0 /* Worst case. */
a3b492c3 306#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee 307
e64f369d 308/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
309 * character. (There needs to be a case: in the switch statement in regexec.c
310 * for any node marked SIMPLE.) Note that this is not the same thing as
311 * REGNODE_SIMPLE */
fda99bee 312#define SIMPLE 0x02
e64f369d 313#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
314#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
315#define TRYAGAIN 0x10 /* Weeded out a declaration. */
c36361b2
KW
316#define RESTART_PARSE 0x20 /* Need to redo the parse */
317#define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
318 calcuate sizes as UTF-8 */
a687059c 319
3dab1dad
YO
320#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
321
07be1b83
YO
322/* whether trie related optimizations are enabled */
323#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
324#define TRIE_STUDY_OPT
786e8c11 325#define FULL_TRIE_STUDY
07be1b83
YO
326#define TRIE_STCLASS
327#endif
1de06328
YO
328
329
40d049e4
YO
330
331#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
332#define PBITVAL(paren) (1 << ((paren) & 7))
333#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
334#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
335#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
336
82a6ada4 337#define REQUIRE_UTF8(flagp) STMT_START { \
8d9c2815 338 if (!UTF) { \
82a6ada4 339 assert(PASS1); \
c36361b2 340 *flagp = RESTART_PARSE|NEED_UTF8; \
f55b7b26 341 return 0; \
8d9c2815 342 } \
82a6ada4 343 } STMT_END
40d049e4 344
512c0f5a
KW
345/* Change from /d into /u rules, and restart the parse if we've already seen
346 * something whose size would increase as a result, by setting *flagp and
347 * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates
142ece99 348 * we've changed to /u during the parse. */
512c0f5a
KW
349#define REQUIRE_UNI_RULES(flagp, restart_retval) \
350 STMT_START { \
351 if (DEPENDS_SEMANTICS) { \
352 assert(PASS1); \
353 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
354 RExC_uni_semantics = 1; \
355 if (RExC_seen_unfolded_sharp_s) { \
c36361b2 356 *flagp |= RESTART_PARSE; \
512c0f5a
KW
357 return restart_retval; \
358 } \
359 } \
360 } STMT_END
361
fd7fc6ae 362/* Executes a return statement with the value 'X', if 'flags' contains any of
c36361b2 363 * 'RESTART_PARSE', 'NEED_UTF8', or 'extra'. If so, *flagp is set to those
fd7fc6ae
YO
364 * flags */
365#define RETURN_X_ON_RESTART_OR_FLAGS(X, flags, flagp, extra) \
366 STMT_START { \
c36361b2
KW
367 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
368 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
fd7fc6ae
YO
369 return X; \
370 } \
371 } STMT_END
372
5d555bb8 373#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
f55b7b26 374 RETURN_X_ON_RESTART_OR_FLAGS(0,flags,flagp,extra)
fd7fc6ae
YO
375
376#define RETURN_X_ON_RESTART(X, flags,flagp) \
377 RETURN_X_ON_RESTART_OR_FLAGS( X, flags, flagp, 0)
378
379
5d555bb8 380#define RETURN_FAIL_ON_RESTART_FLAGP_OR_FLAGS(flagp,extra) \
c36361b2 381 if (*(flagp) & (RESTART_PARSE|(extra))) return 0
fd7fc6ae 382
c36361b2 383#define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
fd7fc6ae 384
5d555bb8 385#define RETURN_FAIL_ON_RESTART(flags,flagp) \
f55b7b26 386 RETURN_X_ON_RESTART(0, flags,flagp)
5d555bb8 387#define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
88f063b4 388 RETURN_FAIL_ON_RESTART_FLAGP_OR_FLAGS(flagp, 0)
fd7fc6ae 389
f19b1a63
KW
390/* This converts the named class defined in regcomp.h to its equivalent class
391 * number defined in handy.h. */
392#define namedclass_to_classnum(class) ((int) ((class) / 2))
393#define classnum_to_namedclass(classnum) ((classnum) * 2)
394
de92f5e6
KW
395#define _invlist_union_complement_2nd(a, b, output) \
396 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
397#define _invlist_intersection_complement_2nd(a, b, output) \
398 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
399
1de06328
YO
400/* About scan_data_t.
401
402 During optimisation we recurse through the regexp program performing
403 various inplace (keyhole style) optimisations. In addition study_chunk
404 and scan_commit populate this data structure with information about
538e84ed 405 what strings MUST appear in the pattern. We look for the longest
3b753521 406 string that must appear at a fixed location, and we look for the
1de06328
YO
407 longest string that may appear at a floating location. So for instance
408 in the pattern:
538e84ed 409
1de06328 410 /FOO[xX]A.*B[xX]BAR/
538e84ed 411
1de06328
YO
412 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
413 strings (because they follow a .* construct). study_chunk will identify
414 both FOO and BAR as being the longest fixed and floating strings respectively.
538e84ed 415
1de06328 416 The strings can be composites, for instance
538e84ed 417
1de06328 418 /(f)(o)(o)/
538e84ed 419
1de06328 420 will result in a composite fixed substring 'foo'.
538e84ed 421
1de06328 422 For each string some basic information is maintained:
538e84ed 423
6ea77169 424 - min_offset
1de06328
YO
425 This is the position the string must appear at, or not before.
426 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
427 characters must match before the string we are searching for.
428 Likewise when combined with minlenp and the length of the string it
538e84ed 429 tells us how many characters must appear after the string we have
1de06328 430 found.
538e84ed 431
1de06328
YO
432 - max_offset
433 Only used for floating strings. This is the rightmost point that
ea3daa5d 434 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328 435 string can occur infinitely far to the right.
6e12f832 436 For fixed strings, it is equal to min_offset.
538e84ed 437
1de06328 438 - minlenp
2d608413
KW
439 A pointer to the minimum number of characters of the pattern that the
440 string was found inside. This is important as in the case of positive
538e84ed 441 lookahead or positive lookbehind we can have multiple patterns
1de06328 442 involved. Consider
538e84ed 443
1de06328 444 /(?=FOO).*F/
538e84ed 445
1de06328
YO
446 The minimum length of the pattern overall is 3, the minimum length
447 of the lookahead part is 3, but the minimum length of the part that
538e84ed 448 will actually match is 1. So 'FOO's minimum length is 3, but the
1de06328 449 minimum length for the F is 1. This is important as the minimum length
538e84ed 450 is used to determine offsets in front of and behind the string being
1de06328 451 looked for. Since strings can be composites this is the length of the
486ec47a 452 pattern at the time it was committed with a scan_commit. Note that
1de06328 453 the length is calculated by study_chunk, so that the minimum lengths
538e84ed 454 are not known until the full pattern has been compiled, thus the
1de06328 455 pointer to the value.
538e84ed 456
1de06328 457 - lookbehind
538e84ed 458
1de06328 459 In the case of lookbehind the string being searched for can be
538e84ed 460 offset past the start point of the final matching string.
1de06328
YO
461 If this value was just blithely removed from the min_offset it would
462 invalidate some of the calculations for how many chars must match
463 before or after (as they are derived from min_offset and minlen and
538e84ed 464 the length of the string being searched for).
1de06328
YO
465 When the final pattern is compiled and the data is moved from the
466 scan_data_t structure into the regexp structure the information
538e84ed
KW
467 about lookbehind is factored in, with the information that would
468 have been lost precalculated in the end_shift field for the
1de06328
YO
469 associated string.
470
471 The fields pos_min and pos_delta are used to store the minimum offset
538e84ed 472 and the delta to the maximum offset at the current point in the pattern.
1de06328
YO
473
474*/
2c2d71f5 475
cb6aef8b
DM
476struct scan_data_substrs {
477 SV *str; /* longest substring found in pattern */
478 SSize_t min_offset; /* earliest point in string it can appear */
479 SSize_t max_offset; /* latest point in string it can appear */
480 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
481 SSize_t lookbehind; /* is the pos of the string modified by LB */
11683ecb 482 I32 flags; /* per substring SF_* and SCF_* flags */
cb6aef8b
DM
483};
484
2c2d71f5 485typedef struct scan_data_t {
1de06328
YO
486 /*I32 len_min; unused */
487 /*I32 len_delta; unused */
49f55535 488 SSize_t pos_min;
ea3daa5d 489 SSize_t pos_delta;
2c2d71f5 490 SV *last_found;
ea3daa5d 491 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 492 SSize_t last_start_min;
ea3daa5d 493 SSize_t last_start_max;
2099df82
DM
494 U8 cur_is_floating; /* whether the last_* values should be set as
495 * the next fixed (0) or floating (1)
496 * substring */
6ea77169 497
2099df82
DM
498 /* [0] is longest fixed substring so far, [1] is longest float so far */
499 struct scan_data_substrs substrs[2];
6ea77169 500
11683ecb 501 I32 flags; /* common SF_* and SCF_* flags */
2c2d71f5 502 I32 whilem_c;
49f55535 503 SSize_t *last_closep;
b8f7bb16 504 regnode_ssc *start_class;
2c2d71f5
JH
505} scan_data_t;
506
a687059c 507/*
e50aee73 508 * Forward declarations for pregcomp()'s friends.
a687059c 509 */
a0d0e21e 510
6ea77169 511static const scan_data_t zero_scan_data = {
a2ca2017 512 0, 0, NULL, 0, 0, 0, 0,
6ea77169 513 {
11683ecb
DM
514 { NULL, 0, 0, 0, 0, 0 },
515 { NULL, 0, 0, 0, 0, 0 },
6ea77169
DM
516 },
517 0, 0, NULL, NULL
518};
c277df42 519
11683ecb
DM
520/* study flags */
521
07be1b83
YO
522#define SF_BEFORE_SEOL 0x0001
523#define SF_BEFORE_MEOL 0x0002
11683ecb 524#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
c277df42 525
07be1b83
YO
526#define SF_IS_INF 0x0040
527#define SF_HAS_PAR 0x0080
528#define SF_IN_PAR 0x0100
529#define SF_HAS_EVAL 0x0200
bc604ad8
KW
530
531
532/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
533 * longest substring in the pattern. When it is not set the optimiser keeps
534 * track of position, but does not keep track of the actual strings seen,
535 *
536 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
537 * /foo/i will not.
538 *
539 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
540 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
541 * turned off because of the alternation (BRANCH). */
07be1b83 542#define SCF_DO_SUBSTR 0x0400
bc604ad8 543
653099ff
GS
544#define SCF_DO_STCLASS_AND 0x0800
545#define SCF_DO_STCLASS_OR 0x1000
546#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 547#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 548
786e8c11 549#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
538e84ed 550#define SCF_SEEN_ACCEPT 0x8000
688e0391 551#define SCF_TRIE_DOING_RESTUDY 0x10000
4286711a
YO
552#define SCF_IN_DEFINE 0x20000
553
554
555
07be1b83 556
43fead97 557#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
558
559/* The enums for all these are ordered so things work out correctly */
a62b1201 560#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
538e84ed
KW
561#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
562 == REGEX_DEPENDS_CHARSET)
00b27cfc 563#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
538e84ed
KW
564#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
565 >= REGEX_UNICODE_CHARSET)
566#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
567 == REGEX_ASCII_RESTRICTED_CHARSET)
568#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
569 >= REGEX_ASCII_RESTRICTED_CHARSET)
570#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
571 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 572
43fead97 573#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 574
f2c2a6ab
KW
575/* For programs that want to be strictly Unicode compatible by dying if any
576 * attempt is made to match a non-Unicode code point against a Unicode
577 * property. */
578#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
579
93733859 580#define OOB_NAMEDCLASS -1
b8c5462f 581
8e661ac5
KW
582/* There is no code point that is out-of-bounds, so this is problematic. But
583 * its only current use is to initialize a variable that is always set before
584 * looked at. */
585#define OOB_UNICODE 0xDEADBEEF
586
a0ed51b3 587#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
a0ed51b3 588
8615cb43 589
b45f050a
JF
590/* length of regex to show in messages that don't mark a position within */
591#define RegexLengthToShowInErrorMessages 127
592
593/*
594 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
595 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
596 * op/pragma/warn/regcomp.
597 */
7253e4e3
RK
598#define MARKER1 "<-- HERE" /* marker as it appears in the description */
599#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 600
538e84ed 601#define REPORT_LOCATION " in regex; marked by " MARKER1 \
147e3846 602 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
b45f050a 603
285b5ca0
KW
604/* The code in this file in places uses one level of recursion with parsing
605 * rebased to an alternate string constructed by us in memory. This can take
606 * the form of something that is completely different from the input, or
607 * something that uses the input as part of the alternate. In the first case,
608 * there should be no possibility of an error, as we are in complete control of
51684e3c
KW
609 * the alternate string. But in the second case we don't completely control
610 * the input portion, so there may be errors in that. Here's an example:
285b5ca0
KW
611 * /[abc\x{DF}def]/ui
612 * is handled specially because \x{df} folds to a sequence of more than one
51684e3c 613 * character: 'ss'. What is done is to create and parse an alternate string,
285b5ca0
KW
614 * which looks like this:
615 * /(?:\x{DF}|[abc\x{DF}def])/ui
616 * where it uses the input unchanged in the middle of something it constructs,
617 * which is a branch for the DF outside the character class, and clustering
618 * parens around the whole thing. (It knows enough to skip the DF inside the
619 * class while in this substitute parse.) 'abc' and 'def' may have errors that
620 * need to be reported. The general situation looks like this:
621 *
51684e3c 622 * |<------- identical ------>|
285b5ca0 623 * sI tI xI eI
51684e3c 624 * Input: ---------------------------------------------------------------
285b5ca0
KW
625 * Constructed: ---------------------------------------------------
626 * sC tC xC eC EC
51684e3c 627 * |<------- identical ------>|
285b5ca0 628 *
51684e3c
KW
629 * sI..eI is the portion of the input pattern we are concerned with here.
630 * sC..EC is the constructed substitute parse string.
631 * sC..tC is constructed by us
632 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
633 * In the diagram, these are vertically aligned.
634 * eC..EC is also constructed by us.
635 * xC is the position in the substitute parse string where we found a
636 * problem.
637 * xI is the position in the original pattern corresponding to xC.
285b5ca0 638 *
51684e3c
KW
639 * We want to display a message showing the real input string. Thus we need to
640 * translate from xC to xI. We know that xC >= tC, since the portion of the
641 * string sC..tC has been constructed by us, and so shouldn't have errors. We
642 * get:
643 * xI = tI + (xC - tC)
285b5ca0 644 *
51684e3c
KW
645 * When the substitute parse is constructed, the code needs to set:
646 * RExC_start (sC)
647 * RExC_end (eC)
648 * RExC_copy_start_in_input (tI)
649 * RExC_copy_start_in_constructed (tC)
650 * and restore them when done.
285b5ca0 651 *
51684e3c
KW
652 * During normal processing of the input pattern, both
653 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
654 * sI, so that xC equals xI.
285b5ca0
KW
655 */
656
51684e3c
KW
657#define sI RExC_precomp
658#define eI RExC_precomp_end
659#define sC RExC_start
660#define eC RExC_end
661#define tI RExC_copy_start_in_input
662#define tC RExC_copy_start_in_constructed
663#define xI(xC) (tI + (xC - tC))
664#define xI_offset(xC) (xI(xC) - sI)
285b5ca0
KW
665
666#define REPORT_LOCATION_ARGS(xC) \
667 UTF8fARG(UTF, \
51684e3c 668 (xI(xC) > eI) /* Don't run off end */ \
285b5ca0 669 ? eC - sC /* Length before the <--HERE */ \
bc929b61
KW
670 : ((xI_offset(xC) >= 0) \
671 ? xI_offset(xC) \
672 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
673 IVdf " trying to output message for " \
674 " pattern %.*s", \
51684e3c 675 __FILE__, __LINE__, (IV) xI_offset(xC), \
bc929b61 676 ((int) (eC - sC)), sC), 0)), \
51684e3c 677 sI), /* The input pattern printed up to the <--HERE */ \
285b5ca0 678 UTF8fARG(UTF, \
51684e3c
KW
679 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \
680 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */
c1d900c3 681
8a6d8ec6
HS
682/* Used to point after bad bytes for an error message, but avoid skipping
683 * past a nul byte. */
684#define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
685
b45f050a
JF
686/*
687 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
688 * arg. Show regex, up to a maximum length. If it's too long, chop and add
689 * "...".
690 */
58e23c8d 691#define _FAIL(code) STMT_START { \
bfed75c6 692 const char *ellipses = ""; \
88f063b4 693 IV len = RExC_precomp_end - RExC_precomp; \
ccb2c380
MP
694 \
695 if (!SIZE_ONLY) \
a5e7bc51 696 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
697 if (len > RegexLengthToShowInErrorMessages) { \
698 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
699 len = RegexLengthToShowInErrorMessages - 10; \
700 ellipses = "..."; \
701 } \
58e23c8d 702 code; \
ccb2c380 703} STMT_END
8615cb43 704
58e23c8d 705#define FAIL(msg) _FAIL( \
147e3846 706 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
c1d900c3 707 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
708
709#define FAIL2(msg,arg) _FAIL( \
147e3846 710 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
c1d900c3 711 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 712
b45f050a 713/*
b45f050a
JF
714 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
715 */
ccb2c380 716#define Simple_vFAIL(m) STMT_START { \
ccb2c380 717 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
d528642a 718 m, REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 719} STMT_END
b45f050a
JF
720
721/*
722 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
723 */
ccb2c380
MP
724#define vFAIL(m) STMT_START { \
725 if (!SIZE_ONLY) \
a5e7bc51 726 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
727 Simple_vFAIL(m); \
728} STMT_END
b45f050a
JF
729
730/*
731 * Like Simple_vFAIL(), but accepts two arguments.
732 */
ccb2c380 733#define Simple_vFAIL2(m,a1) STMT_START { \
d528642a
KW
734 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
735 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 736} STMT_END
b45f050a
JF
737
738/*
739 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
740 */
ccb2c380
MP
741#define vFAIL2(m,a1) STMT_START { \
742 if (!SIZE_ONLY) \
a5e7bc51 743 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
744 Simple_vFAIL2(m, a1); \
745} STMT_END
b45f050a
JF
746
747
748/*
749 * Like Simple_vFAIL(), but accepts three arguments.
750 */
ccb2c380 751#define Simple_vFAIL3(m, a1, a2) STMT_START { \
c1d900c3 752 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 753 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 754} STMT_END
b45f050a
JF
755
756/*
757 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
758 */
ccb2c380
MP
759#define vFAIL3(m,a1,a2) STMT_START { \
760 if (!SIZE_ONLY) \
a5e7bc51 761 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
762 Simple_vFAIL3(m, a1, a2); \
763} STMT_END
b45f050a
JF
764
765/*
766 * Like Simple_vFAIL(), but accepts four arguments.
767 */
ccb2c380 768#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
d528642a
KW
769 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
770 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 771} STMT_END
b45f050a 772
95db3ffa
KW
773#define vFAIL4(m,a1,a2,a3) STMT_START { \
774 if (!SIZE_ONLY) \
775 SAVEFREESV(RExC_rx_sv); \
776 Simple_vFAIL4(m, a1, a2, a3); \
777} STMT_END
778
946095af 779/* A specialized version of vFAIL2 that works with UTF8f */
d528642a
KW
780#define vFAIL2utf8f(m, a1) STMT_START { \
781 if (!SIZE_ONLY) \
782 SAVEFREESV(RExC_rx_sv); \
783 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
784 REPORT_LOCATION_ARGS(RExC_parse)); \
946095af
BF
785} STMT_END
786
3ba22297 787#define vFAIL3utf8f(m, a1, a2) STMT_START { \
3ba22297
KW
788 if (!SIZE_ONLY) \
789 SAVEFREESV(RExC_rx_sv); \
790 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 791 REPORT_LOCATION_ARGS(RExC_parse)); \
3ba22297
KW
792} STMT_END
793
f8220fe2 794/* This has an assert in it because of [perl #122671] Many warnings in
499333dc
KW
795 * regcomp.c can occur twice. If they get output in pass1 and later in that
796 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
f8220fe2
KW
797 * would get output again. So they should be output in pass2, and this
798 * assert makes sure new warnings follow that paradigm. */
799#define _WARN_HELPER(loc, warns, code) \
800 STMT_START { \
801 __ASSERT_(PASS2) code; \
802 } STMT_END
946095af 803
5e0a247b 804/* m is not necessarily a "literal string", in this macro */
f8220fe2
KW
805#define reg_warn_non_literal_string(loc, m) \
806 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
807 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 808 "%s" REPORT_LOCATION, \
f8220fe2 809 m, REPORT_LOCATION_ARGS(loc)))
5e0a247b 810
f8220fe2
KW
811#define ckWARNreg(loc,m) \
812 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
813 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 814 m REPORT_LOCATION, \
f8220fe2 815 REPORT_LOCATION_ARGS(loc)))
ccb2c380 816
f8220fe2
KW
817#define vWARN(loc, m) \
818 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
819 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 820 m REPORT_LOCATION, \
f8220fe2 821 REPORT_LOCATION_ARGS(loc))) \
b927b7e9 822
f8220fe2
KW
823#define vWARN_dep(loc, m) \
824 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
825 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
d528642a 826 m REPORT_LOCATION, \
f8220fe2 827 REPORT_LOCATION_ARGS(loc)))
0d6106aa 828
f8220fe2
KW
829#define ckWARNdep(loc,m) \
830 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
831 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
d528642a 832 m REPORT_LOCATION, \
f8220fe2 833 REPORT_LOCATION_ARGS(loc)))
147508a2 834
f8220fe2
KW
835#define ckWARNregdep(loc,m) \
836 _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
837 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
d528642a
KW
838 WARN_REGEXP), \
839 m REPORT_LOCATION, \
f8220fe2 840 REPORT_LOCATION_ARGS(loc)))
ccb2c380 841
f8220fe2
KW
842#define ckWARN2reg_d(loc,m, a1) \
843 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
844 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
d528642a 845 m REPORT_LOCATION, \
f8220fe2 846 a1, REPORT_LOCATION_ARGS(loc)))
2335b3d3 847
f8220fe2
KW
848#define ckWARN2reg(loc, m, a1) \
849 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
850 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 851 m REPORT_LOCATION, \
f8220fe2 852 a1, REPORT_LOCATION_ARGS(loc)))
ccb2c380 853
f8220fe2
KW
854#define vWARN3(loc, m, a1, a2) \
855 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
856 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 857 m REPORT_LOCATION, \
f8220fe2 858 a1, a2, REPORT_LOCATION_ARGS(loc)))
ccb2c380 859
f8220fe2
KW
860#define ckWARN3reg(loc, m, a1, a2) \
861 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
862 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
863 m REPORT_LOCATION, \
864 a1, a2, \
f8220fe2 865 REPORT_LOCATION_ARGS(loc)))
668c081a 866
f8220fe2
KW
867#define vWARN4(loc, m, a1, a2, a3) \
868 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
869 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
870 m REPORT_LOCATION, \
871 a1, a2, a3, \
f8220fe2 872 REPORT_LOCATION_ARGS(loc)))
ccb2c380 873
f8220fe2
KW
874#define ckWARN4reg(loc, m, a1, a2, a3) \
875 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
876 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
877 m REPORT_LOCATION, \
878 a1, a2, a3, \
f8220fe2 879 REPORT_LOCATION_ARGS(loc)))
668c081a 880
f8220fe2
KW
881#define vWARN5(loc, m, a1, a2, a3, a4) \
882 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
883 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
884 m REPORT_LOCATION, \
885 a1, a2, a3, a4, \
f8220fe2 886 REPORT_LOCATION_ARGS(loc)))
9d1d55b5 887
aea5f181
KW
888#define ckWARNexperimental(loc, class, m) \
889 _WARN_HELPER(loc, packWARN(class), \
890 Perl_ck_warner_d(aTHX_ packWARN(class), \
891 m REPORT_LOCATION, \
892 REPORT_LOCATION_ARGS(loc)))
893
1a322bb4
KW
894/* Convert between a pointer to a node and its offset from the beginning of the
895 * program */
896#define REGNODE_p(offset) (RExC_emit_start + (offset))
897#define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
898
538e84ed 899/* Macros for recording node offsets. 20001227 mjd@plover.com
fac92740
MJD
900 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
901 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
902 * Element 0 holds the number n.
07be1b83 903 * Position is 1 indexed.
fac92740 904 */
7122b237 905#ifndef RE_TRACK_PATTERN_OFFSETS
d7c442ce 906#define Set_Node_Offset_To_R(offset,byte)
7122b237
YO
907#define Set_Node_Offset(node,byte)
908#define Set_Cur_Node_Offset
909#define Set_Node_Length_To_R(node,len)
910#define Set_Node_Length(node,len)
6a86c6ad 911#define Set_Node_Cur_Length(node,start)
538e84ed
KW
912#define Node_Offset(n)
913#define Node_Length(n)
7122b237
YO
914#define Set_Node_Offset_Length(node,offset,len)
915#define ProgLen(ri) ri->u.proglen
916#define SetProgLen(ri,x) ri->u.proglen = x
917#else
918#define ProgLen(ri) ri->u.offsets[0]
919#define SetProgLen(ri,x) ri->u.offsets[0] = x
d7c442ce 920#define Set_Node_Offset_To_R(offset,byte) STMT_START { \
ccb2c380
MP
921 if (! SIZE_ONLY) { \
922 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
d7c442ce
KW
923 __LINE__, (int)(offset), (int)(byte))); \
924 if((offset) < 0) { \
538e84ed 925 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
d7c442ce 926 (int)(offset)); \
ccb2c380 927 } else { \
d7c442ce 928 RExC_offsets[2*(offset)-1] = (byte); \
ccb2c380
MP
929 } \
930 } \
931} STMT_END
932
88f063b4 933#define Set_Node_Offset(node,byte) \
1a322bb4 934 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
ccb2c380
MP
935#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
936
937#define Set_Node_Length_To_R(node,len) STMT_START { \
938 if (! SIZE_ONLY) { \
939 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 940 __LINE__, (int)(node), (int)(len))); \
ccb2c380 941 if((node) < 0) { \
538e84ed
KW
942 Perl_croak(aTHX_ "value of node is %d in Length macro", \
943 (int)(node)); \
ccb2c380
MP
944 } else { \
945 RExC_offsets[2*(node)] = (len); \
946 } \
947 } \
948} STMT_END
949
950#define Set_Node_Length(node,len) \
1a322bb4 951 Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
6a86c6ad
NC
952#define Set_Node_Cur_Length(node, start) \
953 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
954
955/* Get offsets and lengths */
1a322bb4
KW
956#define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
957#define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
fac92740 958
07be1b83 959#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
1a322bb4
KW
960 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \
961 Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \
07be1b83 962} STMT_END
7122b237 963#endif
07be1b83
YO
964
965#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
966#define EXPERIMENTAL_INPLACESCAN
f427392e 967#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 968
cb41e5d6
YO
969#ifdef DEBUGGING
970int
6ad9a8ab 971Perl_re_printf(pTHX_ const char *fmt, ...)
cb41e5d6
YO
972{
973 va_list ap;
974 int result;
975 PerlIO *f= Perl_debug_log;
976 PERL_ARGS_ASSERT_RE_PRINTF;
977 va_start(ap, fmt);
978 result = PerlIO_vprintf(f, fmt, ap);
979 va_end(ap);
980 return result;
981}
982
983int
7b031478 984Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
cb41e5d6
YO
985{
986 va_list ap;
987 int result;
988 PerlIO *f= Perl_debug_log;
989 PERL_ARGS_ASSERT_RE_INDENTF;
990 va_start(ap, depth);
daeb874b 991 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
cb41e5d6
YO
992 result = PerlIO_vprintf(f, fmt, ap);
993 va_end(ap);
994 return result;
995}
996#endif /* DEBUGGING */
997
7b031478 998#define DEBUG_RExC_seen() \
538e84ed 999 DEBUG_OPTIMISE_MORE_r({ \
88f063b4 1000 Perl_re_printf( aTHX_ "RExC_seen: "); \
538e84ed 1001 \
e384d5c1 1002 if (RExC_seen & REG_ZERO_LEN_SEEN) \
88f063b4 1003 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
538e84ed 1004 \
e384d5c1 1005 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
88f063b4 1006 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
538e84ed 1007 \
e384d5c1 1008 if (RExC_seen & REG_GPOS_SEEN) \
88f063b4 1009 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
538e84ed 1010 \
e384d5c1 1011 if (RExC_seen & REG_RECURSE_SEEN) \
88f063b4 1012 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
538e84ed 1013 \
7b031478 1014 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
88f063b4 1015 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
538e84ed 1016 \
e384d5c1 1017 if (RExC_seen & REG_VERBARG_SEEN) \
88f063b4 1018 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
538e84ed 1019 \
e384d5c1 1020 if (RExC_seen & REG_CUTGROUP_SEEN) \
88f063b4 1021 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
538e84ed 1022 \
e384d5c1 1023 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
88f063b4 1024 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
538e84ed 1025 \
e384d5c1 1026 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
88f063b4 1027 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
538e84ed 1028 \
7b031478 1029 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
88f063b4 1030 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
ee273784 1031 \
88f063b4 1032 Perl_re_printf( aTHX_ "\n"); \
9e9ecfdf
YO
1033 });
1034
fdfb4f21 1035#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
6ad9a8ab 1036 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
fdfb4f21 1037
fdfb4f21 1038
f5a36d78
DM
1039#ifdef DEBUGGING
1040static void
1041S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1042 const char *close_str)
1043{
1044 if (!flags)
1045 return;
1046
1047 Perl_re_printf( aTHX_ "%s", open_str);
11683ecb
DM
1048 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1049 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
f5a36d78
DM
1050 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1051 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1052 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1053 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1054 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1055 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1056 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1057 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1058 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1059 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1060 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1061 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1062 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1063 Perl_re_printf( aTHX_ "%s", close_str);
1064}
1065
1066
1067static void
1068S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1069 U32 depth, int is_inf)
1070{
1071 GET_RE_DEBUG_FLAGS_DECL;
1072
1073 DEBUG_OPTIMISE_MORE_r({
1074 if (!data)
1075 return;
1076 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1077 depth,
1078 where,
1079 (IV)data->pos_min,
1080 (IV)data->pos_delta,
1081 (UV)data->flags
1082 );
1083
1084 S_debug_show_study_flags(aTHX_ data->flags," [","]");
fdfb4f21 1085
f5a36d78
DM
1086 Perl_re_printf( aTHX_
1087 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1088 (IV)data->whilem_c,
1089 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1090 is_inf ? "INF " : ""
1091 );
1092
11683ecb 1093 if (data->last_found) {
37b6262f 1094 int i;
11683ecb
DM
1095 Perl_re_printf(aTHX_
1096 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1097 SvPVX_const(data->last_found),
1098 (IV)data->last_end,
1099 (IV)data->last_start_min,
1100 (IV)data->last_start_max
1101 );
1102
2099df82 1103 for (i = 0; i < 2; i++) {
37b6262f
DM
1104 Perl_re_printf(aTHX_
1105 " %s%s: '%s' @ %" IVdf "/%" IVdf,
2099df82
DM
1106 data->cur_is_floating == i ? "*" : "",
1107 i ? "Float" : "Fixed",
37b6262f
DM
1108 SvPVX_const(data->substrs[i].str),
1109 (IV)data->substrs[i].min_offset,
1110 (IV)data->substrs[i].max_offset
1111 );
1112 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1113 }
11683ecb 1114 }
f5a36d78
DM
1115
1116 Perl_re_printf( aTHX_ "\n");
1117 });
1118}
1119
1120
1121static void
1122S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1123 regnode *scan, U32 depth, U32 flags)
1124{
1125 GET_RE_DEBUG_FLAGS_DECL;
1126
1127 DEBUG_OPTIMISE_r({
1128 regnode *Next;
1129
1130 if (!scan)
1131 return;
1132 Next = regnext(scan);
1133 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1134 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1135 depth,
1136 str,
1137 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1138 Next ? (REG_NODE_NUM(Next)) : 0 );
1139 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1140 Perl_re_printf( aTHX_ "\n");
1141 });
1142}
1143
1144
1145# define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1146 S_debug_studydata(aTHX_ where, data, depth, is_inf)
1147
1148# define DEBUG_PEEP(str, scan, depth, flags) \
1149 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1150
1151#else
1152# define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1153# define DEBUG_PEEP(str, scan, depth, flags) NOOP
1154#endif
1de06328 1155
cb41e5d6 1156
c6871b76
KW
1157/* =========================================================
1158 * BEGIN edit_distance stuff.
1159 *
1160 * This calculates how many single character changes of any type are needed to
1161 * transform a string into another one. It is taken from version 3.1 of
1162 *
1163 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1164 */
1165
1166/* Our unsorted dictionary linked list. */
1167/* Note we use UVs, not chars. */
1168
1169struct dictionary{
1170 UV key;
1171 UV value;
1172 struct dictionary* next;
1173};
1174typedef struct dictionary item;
1175
1176
1177PERL_STATIC_INLINE item*
88f063b4 1178push(UV key, item* curr)
c6871b76
KW
1179{
1180 item* head;
d09f14bf 1181 Newx(head, 1, item);
c6871b76
KW
1182 head->key = key;
1183 head->value = 0;
1184 head->next = curr;
1185 return head;
1186}
1187
1188
1189PERL_STATIC_INLINE item*
1190find(item* head, UV key)
1191{
1192 item* iterator = head;
1193 while (iterator){
1194 if (iterator->key == key){
1195 return iterator;
1196 }
1197 iterator = iterator->next;
1198 }
1199
1200 return NULL;
1201}
1202
1203PERL_STATIC_INLINE item*
88f063b4 1204uniquePush(item* head, UV key)
c6871b76
KW
1205{
1206 item* iterator = head;
1207
1208 while (iterator){
1209 if (iterator->key == key) {
1210 return head;
1211 }
1212 iterator = iterator->next;
1213 }
1214
88f063b4 1215 return push(key, head);
c6871b76
KW
1216}
1217
1218PERL_STATIC_INLINE void
1219dict_free(item* head)
1220{
1221 item* iterator = head;
1222
1223 while (iterator) {
1224 item* temp = iterator;
1225 iterator = iterator->next;
1226 Safefree(temp);
1227 }
1228
1229 head = NULL;
1230}
1231
1232/* End of Dictionary Stuff */
1233
1234/* All calculations/work are done here */
1235STATIC int
1236S_edit_distance(const UV* src,
1237 const UV* tgt,
1238 const STRLEN x, /* length of src[] */
1239 const STRLEN y, /* length of tgt[] */
1240 const SSize_t maxDistance
1241)
1242{
1243 item *head = NULL;
88f063b4 1244 UV swapCount, swapScore, targetCharCount, i, j;
c6871b76
KW
1245 UV *scores;
1246 UV score_ceil = x + y;
1247
1248 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1249
1250 /* intialize matrix start values */
d09f14bf 1251 Newx(scores, ( (x + 2) * (y + 2)), UV);
c6871b76
KW
1252 scores[0] = score_ceil;
1253 scores[1 * (y + 2) + 0] = score_ceil;
1254 scores[0 * (y + 2) + 1] = score_ceil;
1255 scores[1 * (y + 2) + 1] = 0;
88f063b4 1256 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
c6871b76
KW
1257
1258 /* work loops */
1259 /* i = src index */
1260 /* j = tgt index */
1261 for (i=1;i<=x;i++) {
1262 if (i < x)
88f063b4 1263 head = uniquePush(head, src[i]);
c6871b76
KW
1264 scores[(i+1) * (y + 2) + 1] = i;
1265 scores[(i+1) * (y + 2) + 0] = score_ceil;
1266 swapCount = 0;
1267
1268 for (j=1;j<=y;j++) {
1269 if (i == 1) {
1270 if(j < y)
88f063b4 1271 head = uniquePush(head, tgt[j]);
c6871b76
KW
1272 scores[1 * (y + 2) + (j + 1)] = j;
1273 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1274 }
1275
88f063b4 1276 targetCharCount = find(head, tgt[j-1])->value;
c6871b76
KW
1277 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1278
1279 if (src[i-1] != tgt[j-1]){
1280 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1281 }
1282 else {
1283 swapCount = j;
1284 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1285 }
1286 }
1287
88f063b4 1288 find(head, src[i-1])->value = i;
c6871b76
KW
1289 }
1290
1291 {
1292 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1293 dict_free(head);
1294 Safefree(scores);
1295 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1296 }
1297}
1298
1299/* END of edit_distance() stuff
1300 * ========================================================= */
1301
8e35b056
KW
1302/* is c a control character for which we have a mnemonic? */
1303#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1304
549b4e78
KW
1305STATIC const char *
1306S_cntrl_to_mnemonic(const U8 c)
1307{
1308 /* Returns the mnemonic string that represents character 'c', if one
1309 * exists; NULL otherwise. The only ones that exist for the purposes of
1310 * this routine are a few control characters */
1311
1312 switch (c) {
1313 case '\a': return "\\a";
1314 case '\b': return "\\b";
1315 case ESC_NATIVE: return "\\e";
1316 case '\f': return "\\f";
1317 case '\n': return "\\n";
1318 case '\r': return "\\r";
1319 case '\t': return "\\t";
1320 }
1321
1322 return NULL;
1323}
1324
653099ff 1325/* Mark that we cannot extend a found fixed substring at this point.
37b6262f 1326 Update the longest found anchored substring or the longest found
653099ff
GS
1327 floating substrings if needed. */
1328
4327152a 1329STATIC void
ea3daa5d
FC
1330S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1331 SSize_t *minlenp, int is_inf)
c277df42 1332{
e1ec3a88 1333 const STRLEN l = CHR_SVLEN(data->last_found);
2099df82 1334 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
a2ca2017 1335 const STRLEN old_l = CHR_SVLEN(longest_sv);
1de06328 1336 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1337
7918f24d
NC
1338 PERL_ARGS_ASSERT_SCAN_COMMIT;
1339
c277df42 1340 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
2099df82 1341 const U8 i = data->cur_is_floating;
a2ca2017 1342 SvSetMagicSV(longest_sv, data->last_found);
37b6262f
DM
1343 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1344
2099df82
DM
1345 if (!i) /* fixed */
1346 data->substrs[0].max_offset = data->substrs[0].min_offset;
1347 else { /* float */
1348 data->substrs[1].max_offset = (l
646e8787
DM
1349 ? data->last_start_max
1350 : (data->pos_delta > SSize_t_MAX - data->pos_min
ea3daa5d
FC
1351 ? SSize_t_MAX
1352 : data->pos_min + data->pos_delta));
1353 if (is_inf
2099df82
DM
1354 || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1355 data->substrs[1].max_offset = SSize_t_MAX;
37b6262f 1356 }
11683ecb 1357
37b6262f
DM
1358 if (data->flags & SF_BEFORE_EOL)
1359 data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
1360 else
1361 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1362 data->substrs[i].minlenp = minlenp;
1363 data->substrs[i].lookbehind = 0;
c277df42 1364 }
37b6262f 1365
c277df42 1366 SvCUR_set(data->last_found, 0);
0eda9292 1367 {
a28509cc 1368 SV * const sv = data->last_found;
097eb12c
AL
1369 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1370 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1371 if (mg)
1372 mg->mg_len = 0;
1373 }
0eda9292 1374 }
c277df42
IZ
1375 data->last_end = -1;
1376 data->flags &= ~SF_BEFORE_EOL;
f5a36d78 1377 DEBUG_STUDYDATA("commit", data, 0, is_inf);
c277df42
IZ
1378}
1379
cdd87c1d
KW
1380/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1381 * list that describes which code points it matches */
1382
653099ff 1383STATIC void
3420edd7 1384S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 1385{
cdd87c1d
KW
1386 /* Set the SSC 'ssc' to match an empty string or any code point */
1387
557bd3fb 1388 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 1389
71068078 1390 assert(is_ANYOF_SYNTHETIC(ssc));
3fffb88a 1391
0854ea0b
KW
1392 /* mortalize so won't leak */
1393 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
93e92956 1394 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
653099ff
GS
1395}
1396
653099ff 1397STATIC int
dc3bf405 1398S_ssc_is_anything(const regnode_ssc *ssc)
653099ff 1399{
c144baaa
KW
1400 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1401 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1402 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1403 * in any way, so there's no point in using it */
cdd87c1d
KW
1404
1405 UV start, end;
1406 bool ret;
653099ff 1407
557bd3fb 1408 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 1409
71068078 1410 assert(is_ANYOF_SYNTHETIC(ssc));
cdd87c1d 1411
93e92956 1412 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
cdd87c1d
KW
1413 return FALSE;
1414 }
1415
1416 /* See if the list consists solely of the range 0 - Infinity */
1417 invlist_iterinit(ssc->invlist);
1418 ret = invlist_iternext(ssc->invlist, &start, &end)
1419 && start == 0
1420 && end == UV_MAX;
1421
1422 invlist_iterfinish(ssc->invlist);
1423
1424 if (ret) {
1425 return TRUE;
1426 }
1427
1428 /* If e.g., both \w and \W are set, matches everything */
e0e1be5f 1429 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1430 int i;
1431 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1432 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1433 return TRUE;
1434 }
1435 }
1436 }
1437
1438 return FALSE;
653099ff
GS
1439}
1440
653099ff 1441STATIC void
cdd87c1d 1442S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 1443{
cdd87c1d
KW
1444 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1445 * string, any code point, or any posix class under locale */
1446
557bd3fb 1447 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 1448
557bd3fb 1449 Zero(ssc, 1, regnode_ssc);
71068078 1450 set_ANYOF_SYNTHETIC(ssc);
93e92956 1451 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
3420edd7 1452 ssc_anything(ssc);
cdd87c1d 1453
2f306ab9
KW
1454 /* If any portion of the regex is to operate under locale rules that aren't
1455 * fully known at compile time, initialization includes it. The reason
1456 * this isn't done for all regexes is that the optimizer was written under
1457 * the assumption that locale was all-or-nothing. Given the complexity and
1458 * lack of documentation in the optimizer, and that there are inadequate
1459 * test cases for locale, many parts of it may not work properly, it is
1460 * safest to avoid locale unless necessary. */
cdd87c1d
KW
1461 if (RExC_contains_locale) {
1462 ANYOF_POSIXL_SETALL(ssc);
cdd87c1d
KW
1463 }
1464 else {
1465 ANYOF_POSIXL_ZERO(ssc);
1466 }
653099ff
GS
1467}
1468
b423522f 1469STATIC int
dc3bf405
BF
1470S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1471 const regnode_ssc *ssc)
b423522f
KW
1472{
1473 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1474 * to the list of code points matched, and locale posix classes; hence does
1475 * not check its flags) */
1476
1477 UV start, end;
1478 bool ret;
1479
1480 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1481
71068078 1482 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1483
1484 invlist_iterinit(ssc->invlist);
1485 ret = invlist_iternext(ssc->invlist, &start, &end)
1486 && start == 0
1487 && end == UV_MAX;
1488
1489 invlist_iterfinish(ssc->invlist);
1490
1491 if (! ret) {
1492 return FALSE;
1493 }
1494
e0e1be5f 1495 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
31f05a37 1496 return FALSE;
b423522f
KW
1497 }
1498
1499 return TRUE;
1500}
1501
1502STATIC SV*
1503S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
5c0f85ef 1504 const regnode_charclass* const node)
b423522f
KW
1505{
1506 /* Returns a mortal inversion list defining which code points are matched
1507 * by 'node', which is of type ANYOF. Handles complementing the result if
1508 * appropriate. If some code points aren't knowable at this time, the
31f05a37
KW
1509 * returned list must, and will, contain every code point that is a
1510 * possibility. */
b423522f 1511
e2506fa7 1512 SV* invlist = NULL;
1ee208c4 1513 SV* only_utf8_locale_invlist = NULL;
b423522f
KW
1514 unsigned int i;
1515 const U32 n = ARG(node);
31f05a37 1516 bool new_node_has_latin1 = FALSE;
b423522f
KW
1517
1518 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1519
1520 /* Look at the data structure created by S_set_ANYOF_arg() */
93e92956 1521 if (n != ANYOF_ONLY_HAS_BITMAP) {
b423522f
KW
1522 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1523 AV * const av = MUTABLE_AV(SvRV(rv));
1524 SV **const ary = AvARRAY(av);
1525 assert(RExC_rxi->data->what[n] == 's');
1526
1527 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
28118b9c 1528 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1]), NULL));
b423522f
KW
1529 }
1530 else if (ary[0] && ary[0] != &PL_sv_undef) {
1531
1532 /* Here, no compile-time swash, and there are things that won't be
1533 * known until runtime -- we have to assume it could be anything */
e2506fa7 1534 invlist = sv_2mortal(_new_invlist(1));
b423522f
KW
1535 return _add_range_to_invlist(invlist, 0, UV_MAX);
1536 }
1ee208c4 1537 else if (ary[3] && ary[3] != &PL_sv_undef) {
b423522f
KW
1538
1539 /* Here no compile-time swash, and no run-time only data. Use the
1540 * node's inversion list */
28118b9c 1541 invlist = sv_2mortal(invlist_clone(ary[3], NULL));
1ee208c4
KW
1542 }
1543
1544 /* Get the code points valid only under UTF-8 locales */
037715a6 1545 if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
1ee208c4
KW
1546 && ary[2] && ary[2] != &PL_sv_undef)
1547 {
1548 only_utf8_locale_invlist = ary[2];
b423522f
KW
1549 }
1550 }
1551
e2506fa7
KW
1552 if (! invlist) {
1553 invlist = sv_2mortal(_new_invlist(0));
1554 }
1555
dcb20b36
KW
1556 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1557 * code points, and an inversion list for the others, but if there are code
1558 * points that should match only conditionally on the target string being
1559 * UTF-8, those are placed in the inversion list, and not the bitmap.
1560 * Since there are circumstances under which they could match, they are
1561 * included in the SSC. But if the ANYOF node is to be inverted, we have
1562 * to exclude them here, so that when we invert below, the end result
1563 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1564 * have to do this here before we add the unconditionally matched code
1565 * points */
b423522f
KW
1566 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1567 _invlist_intersection_complement_2nd(invlist,
1568 PL_UpperLatin1,
1569 &invlist);
1570 }
1571
1572 /* Add in the points from the bit map */
dcb20b36 1573 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
b423522f 1574 if (ANYOF_BITMAP_TEST(node, i)) {
6f8848d5
TC
1575 unsigned int start = i++;
1576
1577 for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
1578 /* empty */
1579 }
1580 invlist = _add_range_to_invlist(invlist, start, i-1);
31f05a37 1581 new_node_has_latin1 = TRUE;
b423522f
KW
1582 }
1583 }
1584
1585 /* If this can match all upper Latin1 code points, have to add them
ac33c516
KW
1586 * as well. But don't add them if inverting, as when that gets done below,
1587 * it would exclude all these characters, including the ones it shouldn't
1588 * that were added just above */
1589 if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
f240c685
KW
1590 && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1591 {
b423522f
KW
1592 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1593 }
1594
1595 /* Similarly for these */
93e92956 1596 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
e0a1ff7a 1597 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
b423522f
KW
1598 }
1599
1600 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1601 _invlist_invert(invlist);
1602 }
037715a6 1603 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
31f05a37
KW
1604
1605 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1606 * locale. We can skip this if there are no 0-255 at all. */
1607 _invlist_union(invlist, PL_Latin1, &invlist);
1608 }
1609
1ee208c4
KW
1610 /* Similarly add the UTF-8 locale possible matches. These have to be
1611 * deferred until after the non-UTF-8 locale ones are taken care of just
1612 * above, or it leads to wrong results under ANYOF_INVERT */
1613 if (only_utf8_locale_invlist) {
31f05a37 1614 _invlist_union_maybe_complement_2nd(invlist,
1ee208c4 1615 only_utf8_locale_invlist,
31f05a37
KW
1616 ANYOF_FLAGS(node) & ANYOF_INVERT,
1617 &invlist);
1618 }
b423522f
KW
1619
1620 return invlist;
1621}
1622
1051e1c4 1623/* These two functions currently do the exact same thing */
557bd3fb 1624#define ssc_init_zero ssc_init
653099ff 1625
cdd87c1d
KW
1626#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1627#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1628
557bd3fb 1629/* 'AND' a given class with another one. Can create false positives. 'ssc'
93e92956
KW
1630 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1631 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1632
653099ff 1633STATIC void
b423522f 1634S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1635 const regnode_charclass *and_with)
653099ff 1636{
cdd87c1d
KW
1637 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1638 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1639
a0dd4231
KW
1640 SV* anded_cp_list;
1641 U8 anded_flags;
1e6ade67 1642
cdd87c1d 1643 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1644
71068078 1645 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1646
1647 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1648 * the code point inversion list and just the relevant flags */
71068078 1649 if (is_ANYOF_SYNTHETIC(and_with)) {
7dcac5f6 1650 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
a0dd4231 1651 anded_flags = ANYOF_FLAGS(and_with);
e9b08962
KW
1652
1653 /* XXX This is a kludge around what appears to be deficiencies in the
1654 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1655 * there are paths through the optimizer where it doesn't get weeded
1656 * out when it should. And if we don't make some extra provision for
1657 * it like the code just below, it doesn't get added when it should.
1658 * This solution is to add it only when AND'ing, which is here, and
1659 * only when what is being AND'ed is the pristine, original node
1660 * matching anything. Thus it is like adding it to ssc_anything() but
1661 * only when the result is to be AND'ed. Probably the same solution
1662 * could be adopted for the same problem we have with /l matching,
1663 * which is solved differently in S_ssc_init(), and that would lead to
1664 * fewer false positives than that solution has. But if this solution
1665 * creates bugs, the consequences are only that a warning isn't raised
1666 * that should be; while the consequences for having /l bugs is
1667 * incorrect matches */
7dcac5f6 1668 if (ssc_is_anything((regnode_ssc *)and_with)) {
f240c685 1669 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
e9b08962 1670 }
a0dd4231
KW
1671 }
1672 else {
5c0f85ef 1673 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
f240c685
KW
1674 if (OP(and_with) == ANYOFD) {
1675 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1676 }
1677 else {
1678 anded_flags = ANYOF_FLAGS(and_with)
1679 &( ANYOF_COMMON_FLAGS
108316fb
KW
1680 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1681 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
d1c40ef5
KW
1682 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
1683 anded_flags &=
1684 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1685 }
f240c685 1686 }
a0dd4231
KW
1687 }
1688
1689 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1690
1691 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1692 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1693 * 'and_with' may be inverted. When not inverted, we have the situation of
1694 * computing:
1695 * (C1 | P1) & (C2 | P2)
1696 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1697 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1698 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1699 * <= ((C1 & C2) | P1 | P2)
1700 * Alternatively, the last few steps could be:
1701 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1702 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1703 * <= (C1 | C2 | (P1 & P2))
1704 * We favor the second approach if either P1 or P2 is non-empty. This is
1705 * because these components are a barrier to doing optimizations, as what
1706 * they match cannot be known until the moment of matching as they are
1707 * dependent on the current locale, 'AND"ing them likely will reduce or
1708 * eliminate them.
1709 * But we can do better if we know that C1,P1 are in their initial state (a
1710 * frequent occurrence), each matching everything:
1711 * (<everything>) & (C2 | P2) = C2 | P2
1712 * Similarly, if C2,P2 are in their initial state (again a frequent
1713 * occurrence), the result is a no-op
1714 * (C1 | P1) & (<everything>) = C1 | P1
1715 *
1716 * Inverted, we have
1717 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1718 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1719 * <= (C1 & ~C2) | (P1 & ~P2)
1720 * */
1aa99e6b 1721
a0dd4231 1722 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
71068078 1723 && ! is_ANYOF_SYNTHETIC(and_with))
a0dd4231 1724 {
cdd87c1d 1725 unsigned int i;
8951c461 1726
cdd87c1d
KW
1727 ssc_intersection(ssc,
1728 anded_cp_list,
1729 FALSE /* Has already been inverted */
1730 );
c6b76537 1731
cdd87c1d
KW
1732 /* If either P1 or P2 is empty, the intersection will be also; can skip
1733 * the loop */
93e92956 1734 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
cdd87c1d
KW
1735 ANYOF_POSIXL_ZERO(ssc);
1736 }
e0e1be5f 1737 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1738
1739 /* Note that the Posix class component P from 'and_with' actually
1740 * looks like:
1741 * P = Pa | Pb | ... | Pn
1742 * where each component is one posix class, such as in [\w\s].
1743 * Thus
1744 * ~P = ~(Pa | Pb | ... | Pn)
1745 * = ~Pa & ~Pb & ... & ~Pn
1746 * <= ~Pa | ~Pb | ... | ~Pn
1747 * The last is something we can easily calculate, but unfortunately
1748 * is likely to have many false positives. We could do better
1749 * in some (but certainly not all) instances if two classes in
1750 * P have known relationships. For example
1751 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1752 * So
1753 * :lower: & :print: = :lower:
1754 * And similarly for classes that must be disjoint. For example,
1755 * since \s and \w can have no elements in common based on rules in
1756 * the POSIX standard,
1757 * \w & ^\S = nothing
1758 * Unfortunately, some vendor locales do not meet the Posix
1759 * standard, in particular almost everything by Microsoft.
1760 * The loop below just changes e.g., \w into \W and vice versa */
1761
1ee208c4 1762 regnode_charclass_posixl temp;
cdd87c1d
KW
1763 int add = 1; /* To calculate the index of the complement */
1764
b1234259 1765 Zero(&temp, 1, regnode_charclass_posixl);
cdd87c1d
KW
1766 ANYOF_POSIXL_ZERO(&temp);
1767 for (i = 0; i < ANYOF_MAX; i++) {
1768 assert(i % 2 != 0
7dcac5f6
KW
1769 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1770 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
cdd87c1d 1771
7dcac5f6 1772 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
cdd87c1d
KW
1773 ANYOF_POSIXL_SET(&temp, i + add);
1774 }
1775 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1776 }
1777 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1778
cdd87c1d
KW
1779 } /* else ssc already has no posixes */
1780 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1781 in its initial state */
71068078 1782 else if (! is_ANYOF_SYNTHETIC(and_with)
7dcac5f6 1783 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
cdd87c1d
KW
1784 {
1785 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1786 * copy it over 'ssc' */
1787 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
71068078 1788 if (is_ANYOF_SYNTHETIC(and_with)) {
cdd87c1d
KW
1789 StructCopy(and_with, ssc, regnode_ssc);
1790 }
1791 else {
1792 ssc->invlist = anded_cp_list;
1793 ANYOF_POSIXL_ZERO(ssc);
93e92956 1794 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1795 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
cdd87c1d
KW
1796 }
1797 }
1798 }
e0e1be5f 1799 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
93e92956 1800 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
cdd87c1d
KW
1801 {
1802 /* One or the other of P1, P2 is non-empty. */
93e92956 1803 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1ea8b7fe
KW
1804 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1805 }
cdd87c1d
KW
1806 ssc_union(ssc, anded_cp_list, FALSE);
1807 }
1808 else { /* P1 = P2 = empty */
1809 ssc_intersection(ssc, anded_cp_list, FALSE);
1810 }
137165a6 1811 }
653099ff
GS
1812}
1813
653099ff 1814STATIC void
cdd87c1d 1815S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1816 const regnode_charclass *or_with)
653099ff 1817{
cdd87c1d
KW
1818 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1819 * another SSC or a regular ANYOF class. Can create false positives if
1820 * 'or_with' is to be inverted. */
7918f24d 1821
a0dd4231
KW
1822 SV* ored_cp_list;
1823 U8 ored_flags;
c6b76537 1824
cdd87c1d 1825 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 1826
71068078 1827 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1828
1829 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1830 * the code point inversion list and just the relevant flags */
71068078 1831 if (is_ANYOF_SYNTHETIC(or_with)) {
7dcac5f6 1832 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
a0dd4231
KW
1833 ored_flags = ANYOF_FLAGS(or_with);
1834 }
1835 else {
5c0f85ef 1836 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
eff8b7dc 1837 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
f240c685
KW
1838 if (OP(or_with) != ANYOFD) {
1839 ored_flags
1840 |= ANYOF_FLAGS(or_with)
108316fb
KW
1841 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1842 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
d1c40ef5
KW
1843 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
1844 ored_flags |=
1845 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1846 }
f240c685 1847 }
a0dd4231
KW
1848 }
1849
1850 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
1851
1852 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1853 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1854 * 'or_with' may be inverted. When not inverted, we have the simple
1855 * situation of computing:
1856 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1857 * If P1|P2 yields a situation with both a class and its complement are
1858 * set, like having both \w and \W, this matches all code points, and we
1859 * can delete these from the P component of the ssc going forward. XXX We
1860 * might be able to delete all the P components, but I (khw) am not certain
1861 * about this, and it is better to be safe.
1862 *
1863 * Inverted, we have
1864 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1865 * <= (C1 | P1) | ~C2
1866 * <= (C1 | ~C2) | P1
1867 * (which results in actually simpler code than the non-inverted case)
1868 * */
9826f543 1869
a0dd4231 1870 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
71068078 1871 && ! is_ANYOF_SYNTHETIC(or_with))
a0dd4231 1872 {
cdd87c1d 1873 /* We ignore P2, leaving P1 going forward */
1ea8b7fe 1874 } /* else Not inverted */
93e92956 1875 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1876 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
e0e1be5f 1877 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1878 unsigned int i;
1879 for (i = 0; i < ANYOF_MAX; i += 2) {
1880 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1881 {
1882 ssc_match_all_cp(ssc);
1883 ANYOF_POSIXL_CLEAR(ssc, i);
1884 ANYOF_POSIXL_CLEAR(ssc, i+1);
cdd87c1d
KW
1885 }
1886 }
1887 }
1aa99e6b 1888 }
cdd87c1d
KW
1889
1890 ssc_union(ssc,
1891 ored_cp_list,
1892 FALSE /* Already has been inverted */
1893 );
653099ff
GS
1894}
1895
b423522f
KW
1896PERL_STATIC_INLINE void
1897S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1898{
1899 PERL_ARGS_ASSERT_SSC_UNION;
1900
71068078 1901 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1902
1903 _invlist_union_maybe_complement_2nd(ssc->invlist,
1904 invlist,
1905 invert2nd,
1906 &ssc->invlist);
1907}
1908
1909PERL_STATIC_INLINE void
1910S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1911 SV* const invlist,
1912 const bool invert2nd)
1913{
1914 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1915
71068078 1916 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1917
1918 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1919 invlist,
1920 invert2nd,
1921 &ssc->invlist);
1922}
1923
1924PERL_STATIC_INLINE void
1925S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1926{
1927 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1928
71068078 1929 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1930
1931 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1932}
1933
1934PERL_STATIC_INLINE void
1935S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1936{
1937 /* AND just the single code point 'cp' into the SSC 'ssc' */
1938
1939 SV* cp_list = _new_invlist(2);
1940
1941 PERL_ARGS_ASSERT_SSC_CP_AND;
1942
71068078 1943 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1944
1945 cp_list = add_cp_to_invlist(cp_list, cp);
1946 ssc_intersection(ssc, cp_list,
1947 FALSE /* Not inverted */
1948 );
1949 SvREFCNT_dec_NN(cp_list);
1950}
1951
1952PERL_STATIC_INLINE void
dc3bf405 1953S_ssc_clear_locale(regnode_ssc *ssc)
b423522f
KW
1954{
1955 /* Set the SSC 'ssc' to not match any locale things */
b423522f
KW
1956 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1957
71068078 1958 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1959
1960 ANYOF_POSIXL_ZERO(ssc);
1961 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1962}
1963
b35552de
KW
1964#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1965
1966STATIC bool
1967S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1968{
1969 /* The synthetic start class is used to hopefully quickly winnow down
1970 * places where a pattern could start a match in the target string. If it
1971 * doesn't really narrow things down that much, there isn't much point to
1972 * having the overhead of using it. This function uses some very crude
1973 * heuristics to decide if to use the ssc or not.
1974 *
1975 * It returns TRUE if 'ssc' rules out more than half what it considers to
1976 * be the "likely" possible matches, but of course it doesn't know what the
1977 * actual things being matched are going to be; these are only guesses
1978 *
1979 * For /l matches, it assumes that the only likely matches are going to be
1980 * in the 0-255 range, uniformly distributed, so half of that is 127
1981 * For /a and /d matches, it assumes that the likely matches will be just
1982 * the ASCII range, so half of that is 63
1983 * For /u and there isn't anything matching above the Latin1 range, it
1984 * assumes that that is the only range likely to be matched, and uses
1985 * half that as the cut-off: 127. If anything matches above Latin1,
1986 * it assumes that all of Unicode could match (uniformly), except for
1987 * non-Unicode code points and things in the General Category "Other"
1988 * (unassigned, private use, surrogates, controls and formats). This
1989 * is a much large number. */
1990
b35552de
KW
1991 U32 count = 0; /* Running total of number of code points matched by
1992 'ssc' */
1993 UV start, end; /* Start and end points of current range in inversion
1994 list */
72400949
KW
1995 const U32 max_code_points = (LOC)
1996 ? 256
1997 : (( ! UNI_SEMANTICS
1998 || invlist_highest(ssc->invlist) < 256)
1999 ? 128
2000 : NON_OTHER_COUNT);
2001 const U32 max_match = max_code_points / 2;
b35552de
KW
2002
2003 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2004
2005 invlist_iterinit(ssc->invlist);
2006 while (invlist_iternext(ssc->invlist, &start, &end)) {
72400949
KW
2007 if (start >= max_code_points) {
2008 break;
b35552de 2009 }
72400949 2010 end = MIN(end, max_code_points - 1);
b35552de 2011 count += end - start + 1;
72400949 2012 if (count >= max_match) {
b35552de
KW
2013 invlist_iterfinish(ssc->invlist);
2014 return FALSE;
2015 }
2016 }
2017
2018 return TRUE;
2019}
2020
2021
b423522f
KW
2022STATIC void
2023S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2024{
2025 /* The inversion list in the SSC is marked mortal; now we need a more
2026 * permanent copy, which is stored the same way that is done in a regular
dcb20b36
KW
2027 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2028 * map */
b423522f 2029
28118b9c 2030 SV* invlist = invlist_clone(ssc->invlist, NULL);
b423522f
KW
2031
2032 PERL_ARGS_ASSERT_SSC_FINALIZE;
2033
71068078 2034 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f 2035
a0dd4231 2036 /* The code in this file assumes that all but these flags aren't relevant
93e92956
KW
2037 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2038 * by the time we reach here */
f240c685
KW
2039 assert(! (ANYOF_FLAGS(ssc)
2040 & ~( ANYOF_COMMON_FLAGS
108316fb
KW
2041 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2042 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
a0dd4231 2043
b423522f
KW
2044 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2045
1ee208c4
KW
2046 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
2047 NULL, NULL, NULL, FALSE);
b423522f 2048
85c8e306
KW
2049 /* Make sure is clone-safe */
2050 ssc->invlist = NULL;
2051
e0e1be5f 2052 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
93e92956 2053 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
d156f5cb 2054 OP(ssc) = ANYOFPOSIXL;
e0e1be5f 2055 }
d156f5cb 2056 else if (RExC_contains_locale) {
b2e90ddf
KW
2057 OP(ssc) = ANYOFL;
2058 }
2059
1462525b 2060 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
b423522f
KW
2061}
2062
a3621e74
YO
2063#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2064#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2065#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
538e84ed
KW
2066#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2067 ? (TRIE_LIST_CUR( idx ) - 1) \
2068 : 0 )
a3621e74 2069
3dab1dad
YO
2070
2071#ifdef DEBUGGING
07be1b83 2072/*
2b8b4781
NC
2073 dump_trie(trie,widecharmap,revcharmap)
2074 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2075 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
2076
2077 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
2078 The _interim_ variants are used for debugging the interim
2079 tables that are used to generate the final compressed
2080 representation which is what dump_trie expects.
2081
486ec47a 2082 Part of the reason for their existence is to provide a form
3dab1dad 2083 of documentation as to how the different representations function.
07be1b83
YO
2084
2085*/
3dab1dad
YO
2086
2087/*
3dab1dad
YO
2088 Dumps the final compressed table form of the trie to Perl_debug_log.
2089 Used for debugging make_trie().
2090*/
b9a59e08 2091
3dab1dad 2092STATIC void
2b8b4781
NC
2093S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2094 AV *revcharmap, U32 depth)
3dab1dad
YO
2095{
2096 U32 state;
ab3bbdeb 2097 SV *sv=sv_newmortal();
55eed653 2098 int colwidth= widecharmap ? 6 : 4;
2e64971a 2099 U16 word;
3dab1dad
YO
2100 GET_RE_DEBUG_FLAGS_DECL;
2101
7918f24d 2102 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 2103
6ad9a8ab 2104 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
1e37780e 2105 depth+1, "Match","Base","Ofs" );
3dab1dad
YO
2106
2107 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 2108 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 2109 if ( tmp ) {
6ad9a8ab 2110 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2111 colwidth,
538e84ed 2112 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2113 PL_colors[0], PL_colors[1],
2114 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2115 PERL_PV_ESCAPE_FIRSTCHAR
2116 )
ab3bbdeb 2117 );
3dab1dad
YO
2118 }
2119 }
1e37780e
YO
2120 Perl_re_printf( aTHX_ "\n");
2121 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
3dab1dad
YO
2122
2123 for( state = 0 ; state < trie->uniquecharcount ; state++ )
6ad9a8ab
YO
2124 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2125 Perl_re_printf( aTHX_ "\n");
3dab1dad 2126
1e2e3d02 2127 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 2128 const U32 base = trie->states[ state ].trans.base;
3dab1dad 2129
147e3846 2130 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
3dab1dad
YO
2131
2132 if ( trie->states[ state ].wordnum ) {
1e37780e 2133 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
3dab1dad 2134 } else {
6ad9a8ab 2135 Perl_re_printf( aTHX_ "%6s", "" );
3dab1dad
YO
2136 }
2137
147e3846 2138 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
3dab1dad
YO
2139
2140 if ( base ) {
2141 U32 ofs = 0;
2142
2143 while( ( base + ofs < trie->uniquecharcount ) ||
2144 ( base + ofs - trie->uniquecharcount < trie->lasttrans
538e84ed
KW
2145 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2146 != state))
3dab1dad
YO
2147 ofs++;
2148
147e3846 2149 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
3dab1dad
YO
2150
2151 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
538e84ed
KW
2152 if ( ( base + ofs >= trie->uniquecharcount )
2153 && ( base + ofs - trie->uniquecharcount
2154 < trie->lasttrans )
2155 && trie->trans[ base + ofs
2156 - trie->uniquecharcount ].check == state )
3dab1dad 2157 {
147e3846 2158 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
1e37780e
YO
2159 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2160 );
3dab1dad 2161 } else {
88f063b4 2162 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
3dab1dad
YO
2163 }
2164 }
2165
6ad9a8ab 2166 Perl_re_printf( aTHX_ "]");
3dab1dad
YO
2167
2168 }
6ad9a8ab 2169 Perl_re_printf( aTHX_ "\n" );
3dab1dad 2170 }
6ad9a8ab 2171 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
cb41e5d6 2172 depth);
2e64971a 2173 for (word=1; word <= trie->wordcount; word++) {
6ad9a8ab 2174 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2e64971a
DM
2175 (int)word, (int)(trie->wordinfo[word].prev),
2176 (int)(trie->wordinfo[word].len));
2177 }
6ad9a8ab 2178 Perl_re_printf( aTHX_ "\n" );
538e84ed 2179}
3dab1dad 2180/*
3dab1dad 2181 Dumps a fully constructed but uncompressed trie in list form.
538e84ed 2182 List tries normally only are used for construction when the number of
3dab1dad
YO
2183 possible chars (trie->uniquecharcount) is very high.
2184 Used for debugging make_trie().
2185*/
2186STATIC void
55eed653 2187S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2188 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2189 U32 depth)
3dab1dad
YO
2190{
2191 U32 state;
ab3bbdeb 2192 SV *sv=sv_newmortal();
55eed653 2193 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2194 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2195
2196 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2197
3dab1dad 2198 /* print out the table precompression. */
6ad9a8ab 2199 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
cb41e5d6 2200 depth+1 );
6ad9a8ab 2201 Perl_re_indentf( aTHX_ "%s",
cb41e5d6 2202 depth+1, "------:-----+-----------------\n" );
538e84ed 2203
3dab1dad
YO
2204 for( state=1 ; state < next_alloc ; state ++ ) {
2205 U16 charid;
538e84ed 2206
147e3846 2207 Perl_re_indentf( aTHX_ " %4" UVXf " :",
cb41e5d6 2208 depth+1, (UV)state );
3dab1dad 2209 if ( ! trie->states[ state ].wordnum ) {
6ad9a8ab 2210 Perl_re_printf( aTHX_ "%5s| ","");
3dab1dad 2211 } else {
6ad9a8ab 2212 Perl_re_printf( aTHX_ "W%4x| ",
3dab1dad
YO
2213 trie->states[ state ].wordnum
2214 );
2215 }
2216 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
538e84ed 2217 SV ** const tmp = av_fetch( revcharmap,
88f063b4 2218 TRIE_LIST_ITEM(state, charid).forid, 0);
ab3bbdeb 2219 if ( tmp ) {
147e3846 2220 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
ab3bbdeb 2221 colwidth,
538e84ed
KW
2222 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2223 colwidth,
2224 PL_colors[0], PL_colors[1],
2225 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2226 | PERL_PV_ESCAPE_FIRSTCHAR
ab3bbdeb 2227 ) ,
88f063b4
KW
2228 TRIE_LIST_ITEM(state, charid).forid,
2229 (UV)TRIE_LIST_ITEM(state, charid).newstate
1e2e3d02 2230 );
538e84ed 2231 if (!(charid % 10))
6ad9a8ab 2232 Perl_re_printf( aTHX_ "\n%*s| ",
664e119d 2233 (int)((depth * 2) + 14), "");
1e2e3d02 2234 }
ab3bbdeb 2235 }
6ad9a8ab 2236 Perl_re_printf( aTHX_ "\n");
3dab1dad 2237 }
538e84ed 2238}
3dab1dad
YO
2239
2240/*
3dab1dad 2241 Dumps a fully constructed but uncompressed trie in table form.
538e84ed
KW
2242 This is the normal DFA style state transition table, with a few
2243 twists to facilitate compression later.
3dab1dad
YO
2244 Used for debugging make_trie().
2245*/
2246STATIC void
55eed653 2247S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2248 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2249 U32 depth)
3dab1dad
YO
2250{
2251 U32 state;
2252 U16 charid;
ab3bbdeb 2253 SV *sv=sv_newmortal();
55eed653 2254 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2255 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2256
2257 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
538e84ed 2258
3dab1dad
YO
2259 /*
2260 print out the table precompression so that we can do a visual check
2261 that they are identical.
2262 */
538e84ed 2263
6ad9a8ab 2264 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
3dab1dad
YO
2265
2266 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 2267 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 2268 if ( tmp ) {
6ad9a8ab 2269 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2270 colwidth,
538e84ed 2271 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2272 PL_colors[0], PL_colors[1],
2273 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2274 PERL_PV_ESCAPE_FIRSTCHAR
2275 )
ab3bbdeb 2276 );
3dab1dad
YO
2277 }
2278 }
2279
4aaafc03
YO
2280 Perl_re_printf( aTHX_ "\n");
2281 Perl_re_indentf( aTHX_ "State+-", depth+1 );
3dab1dad
YO
2282
2283 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
6ad9a8ab 2284 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
3dab1dad
YO
2285 }
2286
6ad9a8ab 2287 Perl_re_printf( aTHX_ "\n" );
3dab1dad
YO
2288
2289 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2290
147e3846 2291 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
cb41e5d6 2292 depth+1,
3dab1dad
YO
2293 (UV)TRIE_NODENUM( state ) );
2294
2295 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
2296 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2297 if (v)
147e3846 2298 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
ab3bbdeb 2299 else
6ad9a8ab 2300 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
3dab1dad
YO
2301 }
2302 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
147e3846 2303 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
538e84ed 2304 (UV)trie->trans[ state ].check );
3dab1dad 2305 } else {
147e3846 2306 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
538e84ed 2307 (UV)trie->trans[ state ].check,
3dab1dad
YO
2308 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2309 }
2310 }
07be1b83 2311}
3dab1dad
YO
2312
2313#endif
2314
2e64971a 2315
786e8c11
YO
2316/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2317 startbranch: the first branch in the whole branch sequence
2318 first : start branch of sequence of branch-exact nodes.
2319 May be the same as startbranch
2320 last : Thing following the last branch.
2321 May be the same as tail.
2322 tail : item following the branch sequence
2323 count : words in the sequence
a4525e78 2324 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
786e8c11 2325 depth : indent depth
3dab1dad 2326
786e8c11 2327Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 2328
786e8c11
YO
2329A trie is an N'ary tree where the branches are determined by digital
2330decomposition of the key. IE, at the root node you look up the 1st character and
2331follow that branch repeat until you find the end of the branches. Nodes can be
2332marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 2333
786e8c11 2334 /he|she|his|hers/
72f13be8 2335
786e8c11
YO
2336would convert into the following structure. Numbers represent states, letters
2337following numbers represent valid transitions on the letter from that state, if
2338the number is in square brackets it represents an accepting state, otherwise it
2339will be in parenthesis.
07be1b83 2340
786e8c11
YO
2341 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2342 | |
2343 | (2)
2344 | |
2345 (1) +-i->(6)-+-s->[7]
2346 |
2347 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 2348
786e8c11
YO
2349 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2350
2351This shows that when matching against the string 'hers' we will begin at state 1
2352read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2353then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2354is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2355single traverse. We store a mapping from accepting to state to which word was
2356matched, and then when we have multiple possibilities we try to complete the
b8fda935 2357rest of the regex in the order in which they occurred in the alternation.
786e8c11
YO
2358
2359The only prior NFA like behaviour that would be changed by the TRIE support is
2360the silent ignoring of duplicate alternations which are of the form:
2361
2362 / (DUPE|DUPE) X? (?{ ... }) Y /x
2363
4b714af6 2364Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 2365and without the optimisation. With the optimisations dupes will be silently
486ec47a 2366ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
2367the following demonstrates:
2368
2369 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2370
2371which prints out 'word' three times, but
2372
2373 'words'=~/(word|word|word)(?{ print $1 })S/
2374
2375which doesnt print it out at all. This is due to other optimisations kicking in.
2376
2377Example of what happens on a structural level:
2378
486ec47a 2379The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
2380
2381 1: CURLYM[1] {1,32767}(18)
2382 5: BRANCH(8)
2383 6: EXACT <ac>(16)
2384 8: BRANCH(11)
2385 9: EXACT <ad>(16)
2386 11: BRANCH(14)
2387 12: EXACT <ab>(16)
2388 16: SUCCEED(0)
2389 17: NOTHING(18)
2390 18: END(0)
2391
2392This would be optimizable with startbranch=5, first=5, last=16, tail=16
2393and should turn into:
2394
2395 1: CURLYM[1] {1,32767}(18)
2396 5: TRIE(16)
2397 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2398 <ac>
2399 <ad>
2400 <ab>
2401 16: SUCCEED(0)
2402 17: NOTHING(18)
2403 18: END(0)
2404
2405Cases where tail != last would be like /(?foo|bar)baz/:
2406
2407 1: BRANCH(4)
2408 2: EXACT <foo>(8)
2409 4: BRANCH(7)
2410 5: EXACT <bar>(8)
2411 7: TAIL(8)
2412 8: EXACT <baz>(10)
2413 10: END(0)
2414
2415which would be optimizable with startbranch=1, first=1, last=7, tail=8
2416and would end up looking like:
2417
2418 1: TRIE(8)
2419 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2420 <foo>
2421 <bar>
2422 7: TAIL(8)
2423 8: EXACT <baz>(10)
2424 10: END(0)
2425
c80e42f3 2426 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
2427
2428is the recommended Unicode-aware way of saying
2429
2430 *(d++) = uv;
2431*/
2432
fab2782b 2433#define TRIE_STORE_REVCHAR(val) \
786e8c11 2434 STMT_START { \
73031816 2435 if (UTF) { \
668fcfea 2436 SV *zlopp = newSV(UTF8_MAXBYTES); \
88c9ea1e 2437 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
c80e42f3 2438 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
73031816
NC
2439 SvCUR_set(zlopp, kapow - flrbbbbb); \
2440 SvPOK_on(zlopp); \
2441 SvUTF8_on(zlopp); \
2442 av_push(revcharmap, zlopp); \
2443 } else { \
fab2782b 2444 char ooooff = (char)val; \
73031816
NC
2445 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2446 } \
2447 } STMT_END
786e8c11 2448
914a25d5
KW
2449/* This gets the next character from the input, folding it if not already
2450 * folded. */
2451#define TRIE_READ_CHAR STMT_START { \
2452 wordlen++; \
2453 if ( UTF ) { \
2454 /* if it is UTF then it is either already folded, or does not need \
2455 * folding */ \
1c1d615a 2456 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
2457 } \
2458 else if (folder == PL_fold_latin1) { \
7d006b13
KW
2459 /* This folder implies Unicode rules, which in the range expressible \
2460 * by not UTF is the lower case, with the two exceptions, one of \
2461 * which should have been taken care of before calling this */ \
2462 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2463 uvc = toLOWER_L1(*uc); \
2464 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2465 len = 1; \
914a25d5
KW
2466 } else { \
2467 /* raw data, will be folded later if needed */ \
2468 uvc = (U32)*uc; \
2469 len = 1; \
2470 } \
786e8c11
YO
2471} STMT_END
2472
2473
2474
2475#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2476 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
00195859 2477 U32 ging = TRIE_LIST_LEN( state ) * 2; \
f9003953 2478 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
00195859 2479 TRIE_LIST_LEN( state ) = ging; \
786e8c11
YO
2480 } \
2481 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2482 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2483 TRIE_LIST_CUR( state )++; \
2484} STMT_END
07be1b83 2485
786e8c11 2486#define TRIE_LIST_NEW(state) STMT_START { \
d09f14bf 2487 Newx( trie->states[ state ].trans.list, \
786e8c11
YO
2488 4, reg_trie_trans_le ); \
2489 TRIE_LIST_CUR( state ) = 1; \
2490 TRIE_LIST_LEN( state ) = 4; \
2491} STMT_END
07be1b83 2492
786e8c11
YO
2493#define TRIE_HANDLE_WORD(state) STMT_START { \
2494 U16 dupe= trie->states[ state ].wordnum; \
2495 regnode * const noper_next = regnext( noper ); \
2496 \
786e8c11
YO
2497 DEBUG_r({ \
2498 /* store the word for dumping */ \
2499 SV* tmp; \
2500 if (OP(noper) != NOTHING) \
740cce10 2501 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 2502 else \
740cce10 2503 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 2504 av_push( trie_words, tmp ); \
786e8c11
YO
2505 }); \
2506 \
2507 curword++; \
2e64971a
DM
2508 trie->wordinfo[curword].prev = 0; \
2509 trie->wordinfo[curword].len = wordlen; \
2510 trie->wordinfo[curword].accept = state; \
786e8c11
YO
2511 \
2512 if ( noper_next < tail ) { \
2513 if (!trie->jump) \
538e84ed
KW
2514 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2515 sizeof(U16) ); \
7f69552c 2516 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
2517 if (!jumper) \
2518 jumper = noper_next; \
2519 if (!nextbranch) \
2520 nextbranch= regnext(cur); \
2521 } \
2522 \
2523 if ( dupe ) { \
2e64971a
DM
2524 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2525 /* chain, so that when the bits of chain are later */\
2526 /* linked together, the dups appear in the chain */\
2527 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2528 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
2529 } else { \
2530 /* we haven't inserted this word yet. */ \
2531 trie->states[ state ].wordnum = curword; \
2532 } \
2533} STMT_END
07be1b83 2534
3dab1dad 2535
786e8c11
YO
2536#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2537 ( ( base + charid >= ucharcount \
2538 && base + charid < ubound \
2539 && state == trie->trans[ base - ucharcount + charid ].check \
2540 && trie->trans[ base - ucharcount + charid ].next ) \
2541 ? trie->trans[ base - ucharcount + charid ].next \
2542 : ( state==1 ? special : 0 ) \
2543 )
3dab1dad 2544
8bcafbf4
YO
2545#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2546STMT_START { \
2547 TRIE_BITMAP_SET(trie, uvc); \
2548 /* store the folded codepoint */ \
2549 if ( folder ) \
2550 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2551 \
2552 if ( !UTF ) { \
2553 /* store first byte of utf8 representation of */ \
2554 /* variant codepoints */ \
2555 if (! UVCHR_IS_INVARIANT(uvc)) { \
2556 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2557 } \
2558 } \
2559} STMT_END
786e8c11
YO
2560#define MADE_TRIE 1
2561#define MADE_JUMP_TRIE 2
2562#define MADE_EXACT_TRIE 4
3dab1dad 2563
a3621e74 2564STATIC I32
538e84ed
KW
2565S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2566 regnode *first, regnode *last, regnode *tail,
2567 U32 word_count, U32 flags, U32 depth)
a3621e74
YO
2568{
2569 /* first pass, loop through and scan words */
2570 reg_trie_data *trie;
55eed653 2571 HV *widecharmap = NULL;
2b8b4781 2572 AV *revcharmap = newAV();
a3621e74 2573 regnode *cur;
a3621e74
YO
2574 STRLEN len = 0;
2575 UV uvc = 0;
2576 U16 curword = 0;
2577 U32 next_alloc = 0;
786e8c11
YO
2578 regnode *jumper = NULL;
2579 regnode *nextbranch = NULL;
7f69552c 2580 regnode *convert = NULL;
2e64971a 2581 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 2582 /* we just use folder as a flag in utf8 */
1e696034 2583 const U8 * folder = NULL;
a3621e74 2584
3a611511
YO
2585 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2586 * which stands for one trie structure, one hash, optionally followed
2587 * by two arrays */
2b8b4781 2588#ifdef DEBUGGING
3a611511 2589 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2b8b4781
NC
2590 AV *trie_words = NULL;
2591 /* along with revcharmap, this only used during construction but both are
2592 * useful during debugging so we store them in the struct when debugging.
8e11feef 2593 */
2b8b4781 2594#else
cf78de0b 2595 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 2596 STRLEN trie_charcount=0;
3dab1dad 2597#endif
2b8b4781 2598 SV *re_trie_maxbuff;
a3621e74 2599 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2600
2601 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
2602#ifndef DEBUGGING
2603 PERL_UNUSED_ARG(depth);
2604#endif
a3621e74 2605
1e696034 2606 switch (flags) {
a4525e78 2607 case EXACT: case EXACTL: break;
89829bb5 2608 case EXACTFAA:
fab2782b 2609 case EXACTFU_SS:
a4525e78
KW
2610 case EXACTFU:
2611 case EXACTFLU8: folder = PL_fold_latin1; break;
1e696034 2612 case EXACTF: folder = PL_fold; break;
fab2782b 2613 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
2614 }
2615
c944940b 2616 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 2617 trie->refcount = 1;
3dab1dad 2618 trie->startstate = 1;
786e8c11 2619 trie->wordcount = word_count;
f8fc2ecf 2620 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 2621 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
a4525e78 2622 if (flags == EXACT || flags == EXACTL)
c944940b 2623 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
2624 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2625 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2626
a3621e74 2627 DEBUG_r({
2b8b4781 2628 trie_words = newAV();
a3621e74 2629 });
a3621e74 2630
0111c4fd 2631 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
316ebaf2 2632 assert(re_trie_maxbuff);
a3621e74 2633 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 2634 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 2635 }
df826430 2636 DEBUG_TRIE_COMPILE_r({
6ad9a8ab 2637 Perl_re_indentf( aTHX_
cb41e5d6
YO
2638 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2639 depth+1,
88f063b4 2640 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
538e84ed 2641 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
3dab1dad 2642 });
538e84ed 2643
7f69552c
YO
2644 /* Find the node we are going to overwrite */
2645 if ( first == startbranch && OP( last ) != BRANCH ) {
2646 /* whole branch chain */
2647 convert = first;
2648 } else {
2649 /* branch sub-chain */
2650 convert = NEXTOPER( first );
2651 }
538e84ed 2652
a3621e74
YO
2653 /* -- First loop and Setup --
2654
2655 We first traverse the branches and scan each word to determine if it
2656 contains widechars, and how many unique chars there are, this is
2657 important as we have to build a table with at least as many columns as we
2658 have unique chars.
2659
2660 We use an array of integers to represent the character codes 0..255
538e84ed
KW
2661 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2662 the native representation of the character value as the key and IV's for
2663 the coded index.
a3621e74
YO
2664
2665 *TODO* If we keep track of how many times each character is used we can
2666 remap the columns so that the table compression later on is more
3b753521 2667 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
2668 middle and the least common are on the outside. IMO this would be better
2669 than a most to least common mapping as theres a decent chance the most
2670 common letter will share a node with the least common, meaning the node
486ec47a 2671 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
2672 case is when we have the least common nodes twice.
2673
2674 */
2675
a3621e74 2676 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 2677 regnode *noper = NEXTOPER( cur );
944e05e3
YO
2678 const U8 *uc;
2679 const U8 *e;
bc031a7d 2680 int foldlen = 0;
07be1b83 2681 U32 wordlen = 0; /* required init */
bc031a7d
KW
2682 STRLEN minchars = 0;
2683 STRLEN maxchars = 0;
538e84ed
KW
2684 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2685 bitmap?*/
a3621e74 2686
3dab1dad 2687 if (OP(noper) == NOTHING) {
20ed8c88
YO
2688 /* skip past a NOTHING at the start of an alternation
2689 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2690 */
df826430 2691 regnode *noper_next= regnext(noper);
944e05e3
YO
2692 if (noper_next < tail)
2693 noper= noper_next;
2694 }
2695
dca5fc4c
YO
2696 if ( noper < tail &&
2697 (
2698 OP(noper) == flags ||
2699 (
2700 flags == EXACTFU &&
2701 OP(noper) == EXACTFU_SS
2702 )
2703 )
2704 ) {
944e05e3
YO
2705 uc= (U8*)STRING(noper);
2706 e= uc + STR_LEN(noper);
2707 } else {
2708 trie->minlen= 0;
2709 continue;
3dab1dad 2710 }
df826430 2711
944e05e3 2712
fab2782b 2713 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2714 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2715 regardless of encoding */
fab2782b
YO
2716 if (OP( noper ) == EXACTFU_SS) {
2717 /* false positives are ok, so just set this */
0dc4a61d 2718 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2719 }
2720 }
dca5fc4c 2721
bc031a7d
KW
2722 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2723 branch */
3dab1dad 2724 TRIE_CHARCOUNT(trie)++;
a3621e74 2725 TRIE_READ_CHAR;
645de4ce 2726
bc031a7d
KW
2727 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2728 * is in effect. Under /i, this character can match itself, or
2729 * anything that folds to it. If not under /i, it can match just
2730 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2731 * all fold to k, and all are single characters. But some folds
2732 * expand to more than one character, so for example LATIN SMALL
2733 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2734 * the string beginning at 'uc' is 'ffi', it could be matched by
2735 * three characters, or just by the one ligature character. (It
2736 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2737 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2738 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2739 * match.) The trie needs to know the minimum and maximum number
2740 * of characters that could match so that it can use size alone to
2741 * quickly reject many match attempts. The max is simple: it is
2742 * the number of folded characters in this branch (since a fold is
2743 * never shorter than what folds to it. */
2744
2745 maxchars++;
2746
2747 /* And the min is equal to the max if not under /i (indicated by
2748 * 'folder' being NULL), or there are no multi-character folds. If
2749 * there is a multi-character fold, the min is incremented just
2750 * once, for the character that folds to the sequence. Each
2751 * character in the sequence needs to be added to the list below of
2752 * characters in the trie, but we count only the first towards the
2753 * min number of characters needed. This is done through the
2754 * variable 'foldlen', which is returned by the macros that look
2755 * for these sequences as the number of bytes the sequence
2756 * occupies. Each time through the loop, we decrement 'foldlen' by
2757 * how many bytes the current char occupies. Only when it reaches
2758 * 0 do we increment 'minchars' or look for another multi-character
2759 * sequence. */
2760 if (folder == NULL) {
2761 minchars++;
2762 }
2763 else if (foldlen > 0) {
2764 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
645de4ce
KW
2765 }
2766 else {
bc031a7d
KW
2767 minchars++;
2768
2769 /* See if *uc is the beginning of a multi-character fold. If
2770 * so, we decrement the length remaining to look at, to account
2771 * for the current character this iteration. (We can use 'uc'
2772 * instead of the fold returned by TRIE_READ_CHAR because for
2773 * non-UTF, the latin1_safe macro is smart enough to account
2774 * for all the unfolded characters, and because for UTF, the
2775 * string will already have been folded earlier in the
2776 * compilation process */
2777 if (UTF) {
2778 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2779 foldlen -= UTF8SKIP(uc);
645de4ce
KW
2780 }
2781 }
bc031a7d
KW
2782 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2783 foldlen--;
2784 }
645de4ce 2785 }
bc031a7d
KW
2786
2787 /* The current character (and any potential folds) should be added
2788 * to the possible matching characters for this position in this
2789 * branch */
a3621e74 2790 if ( uvc < 256 ) {
fab2782b
YO
2791 if ( folder ) {
2792 U8 folded= folder[ (U8) uvc ];
2793 if ( !trie->charmap[ folded ] ) {
2794 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2795 TRIE_STORE_REVCHAR( folded );
2796 }
2797 }
a3621e74
YO
2798 if ( !trie->charmap[ uvc ] ) {
2799 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2800 TRIE_STORE_REVCHAR( uvc );
a3621e74 2801 }
02daf0ab 2802 if ( set_bit ) {
62012aee
KW
2803 /* store the codepoint in the bitmap, and its folded
2804 * equivalent. */
8bcafbf4 2805 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
02daf0ab
YO
2806 set_bit = 0; /* We've done our bit :-) */
2807 }
a3621e74 2808 } else {
bc031a7d
KW
2809
2810 /* XXX We could come up with the list of code points that fold
2811 * to this using PL_utf8_foldclosures, except not for
2812 * multi-char folds, as there may be multiple combinations
2813 * there that could work, which needs to wait until runtime to
2814 * resolve (The comment about LIGATURE FFI above is such an
2815 * example */
2816
a3621e74 2817 SV** svpp;
55eed653
NC
2818 if ( !widecharmap )
2819 widecharmap = newHV();
a3621e74 2820
55eed653 2821 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
2822
2823 if ( !svpp )
147e3846 2824 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
a3621e74
YO
2825
2826 if ( !SvTRUE( *svpp ) ) {
2827 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 2828 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
2829 }
2830 }
bc031a7d
KW
2831 } /* end loop through characters in this branch of the trie */
2832
2833 /* We take the min and max for this branch and combine to find the min
2834 * and max for all branches processed so far */
3dab1dad 2835 if( cur == first ) {
bc031a7d
KW
2836 trie->minlen = minchars;
2837 trie->maxlen = maxchars;
2838 } else if (minchars < trie->minlen) {
2839 trie->minlen = minchars;
2840 } else if (maxchars > trie->maxlen) {
2841 trie->maxlen = maxchars;
fab2782b 2842 }
a3621e74
YO
2843 } /* end first pass */
2844 DEBUG_TRIE_COMPILE_r(
6ad9a8ab 2845 Perl_re_indentf( aTHX_
cb41e5d6
YO
2846 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2847 depth+1,
55eed653 2848 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
2849 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2850 (int)trie->minlen, (int)trie->maxlen )
a3621e74 2851 );
a3621e74
YO
2852
2853 /*
2854 We now know what we are dealing with in terms of unique chars and
2855 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
2856 representation using a flat table will take. If it's over a reasonable
2857 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
2858 conservative but potentially much slower representation using an array
2859 of lists.
2860
2861 At the end we convert both representations into the same compressed
2862 form that will be used in regexec.c for matching with. The latter
2863 is a form that cannot be used to construct with but has memory
2864 properties similar to the list form and access properties similar
2865 to the table form making it both suitable for fast searches and
2866 small enough that its feasable to store for the duration of a program.
2867
2868 See the comment in the code where the compressed table is produced
2869 inplace from the flat tabe representation for an explanation of how
2870 the compression works.
2871
2872 */
2873
2874
2e64971a
DM
2875 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2876 prev_states[1] = 0;
2877
538e84ed
KW
2878 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2879 > SvIV(re_trie_maxbuff) )
2880 {
a3621e74
YO
2881 /*
2882 Second Pass -- Array Of Lists Representation
2883
2884 Each state will be represented by a list of charid:state records
2885 (reg_trie_trans_le) the first such element holds the CUR and LEN
2886 points of the allocated array. (See defines above).
2887
2888 We build the initial structure using the lists, and then convert
2889 it into the compressed table form which allows faster lookups
2890 (but cant be modified once converted).
a3621e74
YO
2891 */
2892
a3621e74
YO
2893 STRLEN transcount = 1;
2894
6ad9a8ab 2895 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
cb41e5d6 2896 depth+1));
686b73d4 2897
c944940b
JH
2898 trie->states = (reg_trie_state *)
2899 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2900 sizeof(reg_trie_state) );
a3621e74
YO
2901 TRIE_LIST_NEW(1);
2902 next_alloc = 2;
2903
2904 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2905
df826430 2906 regnode *noper = NEXTOPER( cur );
c445ea15
AL
2907 U32 state = 1; /* required init */
2908 U16 charid = 0; /* sanity init */
07be1b83 2909 U32 wordlen = 0; /* required init */
c445ea15 2910
df826430
YO
2911 if (OP(noper) == NOTHING) {
2912 regnode *noper_next= regnext(noper);
944e05e3
YO
2913 if (noper_next < tail)
2914 noper= noper_next;
df826430
YO
2915 }
2916
944e05e3
YO
2917 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2918 const U8 *uc= (U8*)STRING(noper);
2919 const U8 *e= uc + STR_LEN(noper);
2920
786e8c11 2921 for ( ; uc < e ; uc += len ) {
c445ea15 2922
786e8c11 2923 TRIE_READ_CHAR;
c445ea15 2924
786e8c11
YO
2925 if ( uvc < 256 ) {
2926 charid = trie->charmap[ uvc ];
c445ea15 2927 } else {
538e84ed
KW
2928 SV** const svpp = hv_fetch( widecharmap,
2929 (char*)&uvc,
2930 sizeof( UV ),
2931 0);
786e8c11
YO
2932 if ( !svpp ) {
2933 charid = 0;
2934 } else {
2935 charid=(U16)SvIV( *svpp );
2936 }
c445ea15 2937 }
538e84ed
KW
2938 /* charid is now 0 if we dont know the char read, or
2939 * nonzero if we do */
786e8c11 2940 if ( charid ) {
a3621e74 2941
786e8c11
YO
2942 U16 check;
2943 U32 newstate = 0;
a3621e74 2944
786e8c11
YO
2945 charid--;
2946 if ( !trie->states[ state ].trans.list ) {
2947 TRIE_LIST_NEW( state );
c445ea15 2948 }
538e84ed
KW
2949 for ( check = 1;
2950 check <= TRIE_LIST_USED( state );
2951 check++ )
2952 {
2953 if ( TRIE_LIST_ITEM( state, check ).forid
2954 == charid )
2955 {
786e8c11
YO
2956 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2957 break;
2958 }
2959 }
2960 if ( ! newstate ) {
2961 newstate = next_alloc++;
2e64971a 2962 prev_states[newstate] = state;
786e8c11
YO
2963 TRIE_LIST_PUSH( state, charid, newstate );
2964 transcount++;
2965 }
2966 state = newstate;
2967 } else {
147e3846 2968 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
c445ea15 2969 }
a28509cc 2970 }
c445ea15 2971 }
3dab1dad 2972 TRIE_HANDLE_WORD(state);
a3621e74
YO
2973
2974 } /* end second pass */
2975
1e2e3d02 2976 /* next alloc is the NEXT state to be allocated */
538e84ed 2977 trie->statecount = next_alloc;
c944940b
JH
2978 trie->states = (reg_trie_state *)
2979 PerlMemShared_realloc( trie->states,
2980 next_alloc
2981 * sizeof(reg_trie_state) );
a3621e74 2982
3dab1dad 2983 /* and now dump it out before we compress it */
2b8b4781
NC
2984 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2985 revcharmap, next_alloc,
2986 depth+1)
1e2e3d02 2987 );
a3621e74 2988
c944940b
JH
2989 trie->trans = (reg_trie_trans *)
2990 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
2991 {
2992 U32 state;
a3621e74
YO
2993 U32 tp = 0;
2994 U32 zp = 0;
2995
2996
2997 for( state=1 ; state < next_alloc ; state ++ ) {
2998 U32 base=0;
2999
3000 /*
3001 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3002 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
a3621e74
YO
3003 );
3004 */
3005
3006 if (trie->states[state].trans.list) {
3007 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3008 U16 maxid=minid;
a28509cc 3009 U16 idx;
a3621e74
YO
3010
3011 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
3012 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3013 if ( forid < minid ) {
3014 minid=forid;
3015 } else if ( forid > maxid ) {
3016 maxid=forid;
3017 }
a3621e74
YO
3018 }
3019 if ( transcount < tp + maxid - minid + 1) {
3020 transcount *= 2;
c944940b
JH
3021 trie->trans = (reg_trie_trans *)
3022 PerlMemShared_realloc( trie->trans,
446bd890
NC
3023 transcount
3024 * sizeof(reg_trie_trans) );
538e84ed
KW
3025 Zero( trie->trans + (transcount / 2),
3026 transcount / 2,
3027 reg_trie_trans );
a3621e74
YO
3028 }
3029 base = trie->uniquecharcount + tp - minid;
3030 if ( maxid == minid ) {
3031 U32 set = 0;
3032 for ( ; zp < tp ; zp++ ) {
3033 if ( ! trie->trans[ zp ].next ) {
3034 base = trie->uniquecharcount + zp - minid;
538e84ed
KW
3035 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3036 1).newstate;
a3621e74
YO
3037 trie->trans[ zp ].check = state;
3038 set = 1;
3039 break;
3040 }
3041 }
3042 if ( !set ) {
538e84ed
KW
3043 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3044 1).newstate;
a3621e74
YO
3045 trie->trans[ tp ].check = state;
3046 tp++;
3047 zp = tp;
3048 }
3049 } else {
3050 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
538e84ed
KW
3051 const U32 tid = base
3052 - trie->uniquecharcount
3053 + TRIE_LIST_ITEM( state, idx ).forid;
3054 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3055 idx ).newstate;
a3621e74
YO
3056 trie->trans[ tid ].check = state;
3057 }
3058 tp += ( maxid - minid + 1 );
3059 }
3060 Safefree(trie->states[ state ].trans.list);
3061 }
3062 /*
3063 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3064 Perl_re_printf( aTHX_ " base: %d\n",base);
a3621e74
YO
3065 );
3066 */
3067 trie->states[ state ].trans.base=base;
3068 }
cc601c31 3069 trie->lasttrans = tp + 1;
a3621e74
YO
3070 }
3071 } else {
3072 /*
3073 Second Pass -- Flat Table Representation.
3074
b423522f
KW
3075 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3076 each. We know that we will need Charcount+1 trans at most to store
3077 the data (one row per char at worst case) So we preallocate both
3078 structures assuming worst case.
a3621e74
YO
3079
3080 We then construct the trie using only the .next slots of the entry
3081 structs.
3082
b423522f
KW
3083 We use the .check field of the first entry of the node temporarily
3084 to make compression both faster and easier by keeping track of how
3085 many non zero fields are in the node.
a3621e74
YO
3086
3087 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3088 transition.
3089
b423522f
KW
3090 There are two terms at use here: state as a TRIE_NODEIDX() which is
3091 a number representing the first entry of the node, and state as a
3092 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3093 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3094 if there are 2 entrys per node. eg:
a3621e74
YO
3095
3096 A B A B
3097 1. 2 4 1. 3 7
3098 2. 0 3 3. 0 5
3099 3. 0 0 5. 0 0
3100 4. 0 0 7. 0 0
3101
b423522f
KW
3102 The table is internally in the right hand, idx form. However as we
3103 also have to deal with the states array which is indexed by nodenum
3104 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
3105
3106 */
6ad9a8ab 3107 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
cb41e5d6 3108 depth+1));
3dab1dad 3109
c944940b
JH
3110 trie->trans = (reg_trie_trans *)
3111 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3112 * trie->uniquecharcount + 1,
3113 sizeof(reg_trie_trans) );
3114 trie->states = (reg_trie_state *)
3115 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3116 sizeof(reg_trie_state) );
a3621e74
YO
3117 next_alloc = trie->uniquecharcount + 1;
3118
3dab1dad 3119
a3621e74
YO
3120 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3121
df826430 3122 regnode *noper = NEXTOPER( cur );
a3621e74
YO
3123
3124 U32 state = 1; /* required init */
3125
3126 U16 charid = 0; /* sanity init */
3127 U32 accept_state = 0; /* sanity init */
a3621e74 3128
07be1b83 3129 U32 wordlen = 0; /* required init */
a3621e74 3130
df826430
YO
3131 if (OP(noper) == NOTHING) {
3132 regnode *noper_next= regnext(noper);
944e05e3
YO
3133 if (noper_next < tail)
3134 noper= noper_next;
df826430 3135 }
fab2782b 3136
944e05e3
YO
3137 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
3138 const U8 *uc= (U8*)STRING(noper);
3139 const U8 *e= uc + STR_LEN(noper);
3140
786e8c11 3141 for ( ; uc < e ; uc += len ) {
a3621e74 3142
786e8c11 3143 TRIE_READ_CHAR;
a3621e74 3144
786e8c11
YO
3145 if ( uvc < 256 ) {
3146 charid = trie->charmap[ uvc ];
3147 } else {
538e84ed
KW
3148 SV* const * const svpp = hv_fetch( widecharmap,
3149 (char*)&uvc,
3150 sizeof( UV ),
3151 0);
786e8c11 3152 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 3153 }
786e8c11
YO
3154 if ( charid ) {
3155 charid--;
3156 if ( !trie->trans[ state + charid ].next ) {
3157 trie->trans[ state + charid ].next = next_alloc;
3158 trie->trans[ state ].check++;
2e64971a
DM
3159 prev_states[TRIE_NODENUM(next_alloc)]
3160 = TRIE_NODENUM(state);
786e8c11
YO
3161 next_alloc += trie->uniquecharcount;
3162 }
3163 state = trie->trans[ state + charid ].next;
3164 } else {
147e3846 3165 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
786e8c11 3166 }
538e84ed
KW
3167 /* charid is now 0 if we dont know the char read, or
3168 * nonzero if we do */
a3621e74 3169 }
a3621e74 3170 }
3dab1dad
YO
3171 accept_state = TRIE_NODENUM( state );
3172 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
3173
3174 } /* end second pass */
3175
3dab1dad 3176 /* and now dump it out before we compress it */
2b8b4781
NC
3177 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3178 revcharmap,
3179 next_alloc, depth+1));
a3621e74 3180
a3621e74
YO
3181 {
3182 /*
3183 * Inplace compress the table.*
3184
3185 For sparse data sets the table constructed by the trie algorithm will
3186 be mostly 0/FAIL transitions or to put it another way mostly empty.
3187 (Note that leaf nodes will not contain any transitions.)
3188
3189 This algorithm compresses the tables by eliminating most such
3190 transitions, at the cost of a modest bit of extra work during lookup:
3191
3192 - Each states[] entry contains a .base field which indicates the
3193 index in the state[] array wheres its transition data is stored.
3194
3b753521 3195 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
3196
3197 - If .base is nonzero then charid is added to it to find an entry in
3198 the trans array.
3199
3200 -If trans[states[state].base+charid].check!=state then the
3201 transition is taken to be a 0/Fail transition. Thus if there are fail
3202 transitions at the front of the node then the .base offset will point
3203 somewhere inside the previous nodes data (or maybe even into a node
3204 even earlier), but the .check field determines if the transition is
3205 valid.
3206
786e8c11 3207 XXX - wrong maybe?
a3621e74 3208 The following process inplace converts the table to the compressed
3b753521 3209 table: We first do not compress the root node 1,and mark all its
a3621e74 3210 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
3211 allows us to do a DFA construction from the compressed table later,
3212 and ensures that any .base pointers we calculate later are greater
3213 than 0.
a3621e74
YO
3214
3215 - We set 'pos' to indicate the first entry of the second node.
3216
3217 - We then iterate over the columns of the node, finding the first and
3218 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3219 and set the .check pointers accordingly, and advance pos
3220 appropriately and repreat for the next node. Note that when we copy
3221 the next pointers we have to convert them from the original
3222 NODEIDX form to NODENUM form as the former is not valid post
3223 compression.
3224
3225 - If a node has no transitions used we mark its base as 0 and do not
3226 advance the pos pointer.
3227
3228 - If a node only has one transition we use a second pointer into the
3229 structure to fill in allocated fail transitions from other states.
3230 This pointer is independent of the main pointer and scans forward
3231 looking for null transitions that are allocated to a state. When it
3232 finds one it writes the single transition into the "hole". If the
786e8c11 3233 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
3234
3235 - Once compressed we can Renew/realloc the structures to release the
3236 excess space.
3237
3238 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3239 specifically Fig 3.47 and the associated pseudocode.
3240
3241 demq
3242 */
a3b680e6 3243 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 3244 U32 state, charid;
a3621e74 3245 U32 pos = 0, zp=0;
1e2e3d02 3246 trie->statecount = laststate;
a3621e74
YO
3247
3248 for ( state = 1 ; state < laststate ; state++ ) {
3249 U8 flag = 0;
a28509cc
AL
3250 const U32 stateidx = TRIE_NODEIDX( state );
3251 const U32 o_used = trie->trans[ stateidx ].check;
3252 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
3253 trie->trans[ stateidx ].check = 0;
3254
538e84ed
KW
3255 for ( charid = 0;
3256 used && charid < trie->uniquecharcount;
3257 charid++ )
3258 {
a3621e74
YO
3259 if ( flag || trie->trans[ stateidx + charid ].next ) {
3260 if ( trie->trans[ stateidx + charid ].next ) {
3261 if (o_used == 1) {
3262 for ( ; zp < pos ; zp++ ) {
3263 if ( ! trie->trans[ zp ].next ) {
3264 break;
3265 }
3266 }
538e84ed
KW
3267 trie->states[ state ].trans.base
3268 = zp
3269 + trie->uniquecharcount
3270 - charid ;
3271 trie->trans[ zp ].next
3272 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3273 + charid ].next );
a3621e74
YO
3274 trie->trans[ zp ].check = state;
3275 if ( ++zp > pos ) pos = zp;
3276 break;
3277 }
3278 used--;
3279 }
3280 if ( !flag ) {
3281 flag = 1;
538e84ed
KW
3282 trie->states[ state ].trans.base
3283 = pos + trie->uniquecharcount - charid ;
a3621e74 3284 }
538e84ed
KW
3285 trie->trans[ pos ].next
3286 = SAFE_TRIE_NODENUM(
3287 trie->trans[ stateidx + charid ].next );
a3621e74
YO
3288 trie->trans[ pos ].check = state;
3289 pos++;
3290 }
3291 }
3292 }
cc601c31 3293 trie->lasttrans = pos + 1;
c944940b
JH
3294 trie->states = (reg_trie_state *)
3295 PerlMemShared_realloc( trie->states, laststate
3296 * sizeof(reg_trie_state) );
a3621e74 3297 DEBUG_TRIE_COMPILE_MORE_r(
147e3846 3298 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
cb41e5d6 3299 depth+1,
538e84ed
KW
3300 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3301 + 1 ),
3302 (IV)next_alloc,
3303 (IV)pos,
3304 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
a3621e74
YO
3305 );
3306
3307 } /* end table compress */
3308 }
1e2e3d02 3309 DEBUG_TRIE_COMPILE_MORE_r(
147e3846 3310 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
cb41e5d6 3311 depth+1,
1e2e3d02
YO
3312 (UV)trie->statecount,
3313 (UV)trie->lasttrans)
3314 );
cc601c31 3315 /* resize the trans array to remove unused space */
c944940b
JH
3316 trie->trans = (reg_trie_trans *)
3317 PerlMemShared_realloc( trie->trans, trie->lasttrans
3318 * sizeof(reg_trie_trans) );
a3621e74 3319
538e84ed 3320 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
3321 U8 nodetype =(U8)(flags & 0xFF);
3322 char *str=NULL;
538e84ed 3323
07be1b83 3324#ifdef DEBUGGING
e62cc96a 3325 regnode *optimize = NULL;
7122b237
YO
3326#ifdef RE_TRACK_PATTERN_OFFSETS
3327
b57a0404
JH
3328 U32 mjd_offset = 0;
3329 U32 mjd_nodelen = 0;
7122b237
YO
3330#endif /* RE_TRACK_PATTERN_OFFSETS */
3331#endif /* DEBUGGING */
a3621e74 3332 /*
3dab1dad
YO
3333 This means we convert either the first branch or the first Exact,
3334 depending on whether the thing following (in 'last') is a branch
3335 or not and whther first is the startbranch (ie is it a sub part of
3336 the alternation or is it the whole thing.)
3b753521 3337 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 3338 the whole branch sequence, including the first.
a3621e74 3339 */
3dab1dad 3340 /* Find the node we are going to overwrite */
7f69552c 3341 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 3342 /* branch sub-chain */
3dab1dad 3343 NEXT_OFF( first ) = (U16)(last - first);
7122b237 3344#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
3345 DEBUG_r({
3346 mjd_offset= Node_Offset((convert));
3347 mjd_nodelen= Node_Length((convert));
3348 });
7122b237 3349#endif
7f69552c 3350 /* whole branch chain */
7122b237
YO
3351 }
3352#ifdef RE_TRACK_PATTERN_OFFSETS
3353 else {
7f69552c
YO
3354 DEBUG_r({
3355 const regnode *nop = NEXTOPER( convert );
3356 mjd_offset= Node_Offset((nop));
3357 mjd_nodelen= Node_Length((nop));
3358 });
07be1b83
YO
3359 }
3360 DEBUG_OPTIMISE_r(
147e3846 3361 Perl_re_indentf( aTHX_ "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
cb41e5d6 3362 depth+1,
786e8c11 3363 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 3364 );
7122b237 3365#endif
538e84ed 3366 /* But first we check to see if there is a common prefix we can
3dab1dad
YO
3367 split out as an EXACT and put in front of the TRIE node. */
3368 trie->startstate= 1;
55eed653 3369 if ( trie->bitmap && !widecharmap && !trie->jump ) {
5ee57374
YO
3370 /* we want to find the first state that has more than
3371 * one transition, if that state is not the first state
3372 * then we have a common prefix which we can remove.
3373 */
3dab1dad 3374 U32 state;
1e2e3d02 3375 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 3376 U32 ofs = 0;
ee0dfd0b
YO
3377 I32 first_ofs = -1; /* keeps track of the ofs of the first
3378 transition, -1 means none */
8e11feef
RGS
3379 U32 count = 0;
3380 const U32 base = trie->states[ state ].trans.base;
a3621e74 3381
5ee57374 3382 /* does this state terminate an alternation? */
3dab1dad 3383 if ( trie->states[state].wordnum )
8e11feef 3384 count = 1;
a3621e74 3385
8e11feef 3386 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
3387 if ( ( base + ofs >= trie->uniquecharcount ) &&
3388 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
3389 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3390 {
3dab1dad 3391 if ( ++count > 1 ) {
5ee57374 3392 /* we have more than one transition */
d3d91c7c
YO
3393 SV **tmp;
3394 U8 *ch;
5ee57374
YO
3395 /* if this is the first state there is no common prefix
3396 * to extract, so we can exit */
8e11feef 3397 if ( state == 1 ) break;
d3d91c7c
YO
3398 tmp = av_fetch( revcharmap, ofs, 0);
3399 ch = (U8*)SvPV_nolen_const( *tmp );
3400
5ee57374
YO
3401 /* if we are on count 2 then we need to initialize the
3402 * bitmap, and store the previous char if there was one
3403 * in it*/
3dab1dad 3404 if ( count == 2 ) {
5ee57374 3405 /* clear the bitmap */
3dab1dad
YO
3406 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3407 DEBUG_OPTIMISE_r(
147e3846 3408 Perl_re_indentf( aTHX_ "New Start State=%" UVuf " Class: [",
cb41e5d6 3409 depth+1,
786e8c11 3410 (UV)state));
ee0dfd0b
YO
3411 if (first_ofs >= 0) {
3412 SV ** const tmp = av_fetch( revcharmap, first_ofs, 0);
be8e71aa 3413 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 3414
88f063b4 3415 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
3dab1dad 3416 DEBUG_OPTIMISE_r(
6ad9a8ab 3417 Perl_re_printf( aTHX_ "%s", (char*)ch)
3dab1dad 3418 );
8e11feef
RGS
3419 }
3420 }
8bcafbf4 3421 /* store the current firstchar in the bitmap */
88f063b4 3422 TRIE_BITMAP_SET_FOLDED(trie,*ch, folder);
6ad9a8ab 3423 DEBUG_OPTIMISE_r(Perl_re_printf( aTHX_ "%s", ch));
8e11feef 3424 }
ee0dfd0b 3425 first_ofs = ofs;
8e11feef 3426 }
3dab1dad
YO
3427 }
3428 if ( count == 1 ) {
5ee57374
YO
3429 /* This state has only one transition, its transition is part
3430 * of a common prefix - we need to concatenate the char it
3431 * represents to what we have so far. */
ee0dfd0b 3432 SV **tmp = av_fetch( revcharmap, first_ofs, 0);
c490c714
YO
3433 STRLEN len;
3434 char *ch = SvPV( *tmp, len );