This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Move fcn call out of loop
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
afda64e8 84EXTERN_C const struct regexp_engine my_reg_engine;
54df2634
NC
85#else
86# include "regcomp.h"
87#endif
a687059c 88
f7e03a10 89#include "dquote_inline.h"
b992490d 90#include "invlist_inline.h"
1b0f46bf 91#include "unicode_constants.h"
04e98a4d 92
538e84ed
KW
93#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
94 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
f12c0118
KW
95#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
96 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 97#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 98#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 99
a687059c
LW
100#ifndef STATIC
101#define STATIC static
102#endif
103
3f910e62
YO
104/* this is a chain of data about sub patterns we are processing that
105 need to be handled separately/specially in study_chunk. Its so
106 we can simulate recursion without losing state. */
107struct scan_frame;
108typedef struct scan_frame {
109 regnode *last_regnode; /* last node to process in this frame */
110 regnode *next_regnode; /* next node to process when last is reached */
111 U32 prev_recursed_depth;
112 I32 stopparen; /* what stopparen do we use */
3f910e62
YO
113
114 struct scan_frame *this_prev_frame; /* this previous frame */
115 struct scan_frame *prev_frame; /* previous frame */
116 struct scan_frame *next_frame; /* next frame */
117} scan_frame;
118
f4ae5a27
KW
119/* Certain characters are output as a sequence with the first being a
120 * backslash. */
eb7486de 121#define isBACKSLASHED_PUNCT(c) strchr("-[]\\^", c)
f4ae5a27
KW
122
123
09b2b2e6 124struct RExC_state_t {
514a91f1
DM
125 U32 flags; /* RXf_* are we folding, multilining? */
126 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 127 char *precomp; /* uncompiled string. */
711b303b 128 char *precomp_end; /* pointer to end of uncompiled string. */
288b8c02 129 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf 130 regexp *rx; /* perl core regexp structure */
538e84ed
KW
131 regexp_internal *rxi; /* internal data for regexp object
132 pprivate field */
fac92740 133 char *start; /* Start of input for compile */
830247a4
IZ
134 char *end; /* End of input for compile */
135 char *parse; /* Input-scan pointer. */
51684e3c
KW
136 char *copy_start; /* start of copy of input within
137 constructed parse string */
138 char *copy_start_in_input; /* Position in input string
139 corresponding to copy_start */
ea3daa5d 140 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 141 regnode *emit_start; /* Start of emitted-code area */
538e84ed
KW
142 regnode *emit_bound; /* First regnode outside of the
143 allocated space */
f55b7b26 144 regnode_offset emit; /* Code-emit pointer */
830247a4
IZ
145 I32 naughty; /* How bad is this pattern? */
146 I32 sawback; /* Did we see \1, ...? */
147 U32 seen;
88f063b4
KW
148 SSize_t size; /* Number of regnode equivalents in
149 pattern */
e7f023ff 150 I32 npar; /* Capture buffer count, (OPEN) plus
538e84ed
KW
151 one. ("par" 0 is the whole
152 pattern)*/
1533d1d2
KW
153 I32 total_par; /* Capture buffer count after parse
154 completed, (OPEN) plus one. ("par" 0
155 is the whole pattern)*/
538e84ed
KW
156 I32 nestroot; /* root parens we are in - used by
157 accept */
830247a4
IZ
158 I32 extralen;
159 I32 seen_zerolen;
f55b7b26
KW
160 regnode_offset *open_parens; /* offsets to open parens */
161 regnode_offset *close_parens; /* offsets to close parens */
d5a00e4a 162 regnode *end_op; /* END node in program */
02daf0ab
YO
163 I32 utf8; /* whether the pattern is utf8 or not */
164 I32 orig_utf8; /* whether the pattern was originally in utf8 */
165 /* XXX use this for future optimisation of case
166 * where pattern must be upgraded to utf8. */
e40e74fe
KW
167 I32 uni_semantics; /* If a d charset modifier should use unicode
168 rules, even if the pattern is not in
169 utf8 */
81714fb9 170 HV *paren_names; /* Paren names */
538e84ed 171
40d049e4 172 regnode **recurse; /* Recurse regops */
88f063b4 173 I32 recurse_count; /* Number of recurse regops we have generated */
4286711a 174 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
538e84ed 175 through */
09a65838 176 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 177 I32 in_lookbehind;
4624b182 178 I32 contains_locale;
bb3f3ed2 179 I32 override_recoding;
b6d67071
KW
180#ifdef EBCDIC
181 I32 recode_x_to_native;
182#endif
9d53c457 183 I32 in_multi_char_class;
1acab4c5 184 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
68e2671b 185 within pattern */
b1603ef8 186 int code_index; /* next code_blocks[] slot */
ee273784 187 SSize_t maxlen; /* mininum possible number of chars in string to match */
3f910e62
YO
188 scan_frame *frame_head;
189 scan_frame *frame_last;
190 U32 frame_count;
7eec73eb 191 AV *warn_text;
dc6d7f5c 192#ifdef ADD_TO_REGEXEC
830247a4
IZ
193 char *starttry; /* -Dr: where regtry was called. */
194#define RExC_starttry (pRExC_state->starttry)
195#endif
d24ca0c5 196 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 197#ifdef DEBUGGING
be8e71aa 198 const char *lastparse;
3dab1dad 199 I32 lastnum;
1f1031fe 200 AV *paren_name_list; /* idx -> name */
d9a72fcc 201 U32 study_chunk_recursed_count;
c9f0d54c
YO
202 SV *mysv1;
203 SV *mysv2;
88f063b4 204
3dab1dad
YO
205#define RExC_lastparse (pRExC_state->lastparse)
206#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 207#define RExC_paren_name_list (pRExC_state->paren_name_list)
d9a72fcc 208#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
c9f0d54c
YO
209#define RExC_mysv (pRExC_state->mysv1)
210#define RExC_mysv1 (pRExC_state->mysv1)
211#define RExC_mysv2 (pRExC_state->mysv2)
212
3dab1dad 213#endif
512c0f5a 214 bool seen_unfolded_sharp_s;
911bd04e 215 bool strict;
da7cf1cc 216 bool study_started;
034602eb 217 bool in_script_run;
f55b7b26 218 bool pass1;
09b2b2e6 219};
830247a4 220
e2509266 221#define RExC_flags (pRExC_state->flags)
514a91f1 222#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 223#define RExC_precomp (pRExC_state->precomp)
51684e3c
KW
224#define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
225#define RExC_copy_start_in_constructed (pRExC_state->copy_start)
711b303b 226#define RExC_precomp_end (pRExC_state->precomp_end)
288b8c02 227#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 228#define RExC_rx (pRExC_state->rx)
f8fc2ecf 229#define RExC_rxi (pRExC_state->rxi)
fac92740 230#define RExC_start (pRExC_state->start)
830247a4
IZ
231#define RExC_end (pRExC_state->end)
232#define RExC_parse (pRExC_state->parse)
233#define RExC_whilem_seen (pRExC_state->whilem_seen)
512c0f5a
KW
234
235/* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
236 * EXACTF node, hence was parsed under /di rules. If later in the parse,
237 * something forces the pattern into using /ui rules, the sharp s should be
238 * folded into the sequence 'ss', which takes up more space than previously
239 * calculated. This means that the sizing pass needs to be restarted. (The
240 * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node
241 * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
242 * so there is no need to resize [perl #125990]. */
243#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
244
7122b237 245#ifdef RE_TRACK_PATTERN_OFFSETS
44f36cd3 246# define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
538e84ed 247 others */
7122b237 248#endif
830247a4 249#define RExC_emit (pRExC_state->emit)
f55b7b26 250#define RExC_pass1 (pRExC_state->pass1)
fac92740 251#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 252#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
253#define RExC_sawback (pRExC_state->sawback)
254#define RExC_seen (pRExC_state->seen)
255#define RExC_size (pRExC_state->size)
ee273784 256#define RExC_maxlen (pRExC_state->maxlen)
830247a4 257#define RExC_npar (pRExC_state->npar)
1533d1d2 258#define RExC_total_parens (pRExC_state->total_par)
e2e6a0f1 259#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
260#define RExC_extralen (pRExC_state->extralen)
261#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 262#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 263#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 264#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
265#define RExC_open_parens (pRExC_state->open_parens)
266#define RExC_close_parens (pRExC_state->close_parens)
5bd2d46e 267#define RExC_end_op (pRExC_state->end_op)
81714fb9 268#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
269#define RExC_recurse (pRExC_state->recurse)
270#define RExC_recurse_count (pRExC_state->recurse_count)
09a65838 271#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
538e84ed
KW
272#define RExC_study_chunk_recursed_bytes \
273 (pRExC_state->study_chunk_recursed_bytes)
b57e4118 274#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 275#define RExC_contains_locale (pRExC_state->contains_locale)
b6d67071
KW
276#ifdef EBCDIC
277# define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
278#endif
9d53c457 279#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
3f910e62
YO
280#define RExC_frame_head (pRExC_state->frame_head)
281#define RExC_frame_last (pRExC_state->frame_last)
282#define RExC_frame_count (pRExC_state->frame_count)
67cdf558 283#define RExC_strict (pRExC_state->strict)
da7cf1cc 284#define RExC_study_started (pRExC_state->study_started)
7eec73eb 285#define RExC_warn_text (pRExC_state->warn_text)
034602eb 286#define RExC_in_script_run (pRExC_state->in_script_run)
cb9ad9c2 287#define RExC_use_BRANCHJ (!SIZE_ONLY && RExC_extralen)
830247a4 288
99807a43
HS
289/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
290 * a flag to disable back-off on the fixed/floating substrings - if it's
291 * a high complexity pattern we assume the benefit of avoiding a full match
292 * is worth the cost of checking for the substrings even if they rarely help.
293 */
294#define RExC_naughty (pRExC_state->naughty)
295#define TOO_NAUGHTY (10)
296#define MARK_NAUGHTY(add) \
297 if (RExC_naughty < TOO_NAUGHTY) \
298 RExC_naughty += (add)
299#define MARK_NAUGHTY_EXP(exp, add) \
300 if (RExC_naughty < TOO_NAUGHTY) \
301 RExC_naughty += RExC_naughty / (exp) + (add)
cde0cee5 302
a687059c
LW
303#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
304#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
412f55bb 305 ((*s) == '{' && regcurly(s)))
a687059c
LW
306
307/*
308 * Flags to be passed up and down.
309 */
a687059c 310#define WORST 0 /* Worst case. */
a3b492c3 311#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee 312
e64f369d 313/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
314 * character. (There needs to be a case: in the switch statement in regexec.c
315 * for any node marked SIMPLE.) Note that this is not the same thing as
316 * REGNODE_SIMPLE */
fda99bee 317#define SIMPLE 0x02
e64f369d 318#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
319#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
320#define TRYAGAIN 0x10 /* Weeded out a declaration. */
c36361b2
KW
321#define RESTART_PARSE 0x20 /* Need to redo the parse */
322#define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
323 calcuate sizes as UTF-8 */
a687059c 324
3dab1dad
YO
325#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
326
07be1b83
YO
327/* whether trie related optimizations are enabled */
328#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
329#define TRIE_STUDY_OPT
786e8c11 330#define FULL_TRIE_STUDY
07be1b83
YO
331#define TRIE_STCLASS
332#endif
1de06328
YO
333
334
40d049e4
YO
335
336#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
337#define PBITVAL(paren) (1 << ((paren) & 7))
338#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
339#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
340#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
341
82a6ada4 342#define REQUIRE_UTF8(flagp) STMT_START { \
8d9c2815 343 if (!UTF) { \
82a6ada4 344 assert(PASS1); \
c36361b2 345 *flagp = RESTART_PARSE|NEED_UTF8; \
f55b7b26 346 return 0; \
8d9c2815 347 } \
82a6ada4 348 } STMT_END
40d049e4 349
512c0f5a
KW
350/* Change from /d into /u rules, and restart the parse if we've already seen
351 * something whose size would increase as a result, by setting *flagp and
352 * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates
142ece99 353 * we've changed to /u during the parse. */
512c0f5a
KW
354#define REQUIRE_UNI_RULES(flagp, restart_retval) \
355 STMT_START { \
356 if (DEPENDS_SEMANTICS) { \
357 assert(PASS1); \
358 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
359 RExC_uni_semantics = 1; \
360 if (RExC_seen_unfolded_sharp_s) { \
c36361b2 361 *flagp |= RESTART_PARSE; \
512c0f5a
KW
362 return restart_retval; \
363 } \
364 } \
365 } STMT_END
366
fd7fc6ae 367/* Executes a return statement with the value 'X', if 'flags' contains any of
c36361b2 368 * 'RESTART_PARSE', 'NEED_UTF8', or 'extra'. If so, *flagp is set to those
fd7fc6ae
YO
369 * flags */
370#define RETURN_X_ON_RESTART_OR_FLAGS(X, flags, flagp, extra) \
371 STMT_START { \
c36361b2
KW
372 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
373 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
fd7fc6ae
YO
374 return X; \
375 } \
376 } STMT_END
377
5d555bb8 378#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
f55b7b26 379 RETURN_X_ON_RESTART_OR_FLAGS(0,flags,flagp,extra)
fd7fc6ae
YO
380
381#define RETURN_X_ON_RESTART(X, flags,flagp) \
382 RETURN_X_ON_RESTART_OR_FLAGS( X, flags, flagp, 0)
383
384
5d555bb8 385#define RETURN_FAIL_ON_RESTART_FLAGP_OR_FLAGS(flagp,extra) \
c36361b2 386 if (*(flagp) & (RESTART_PARSE|(extra))) return 0
fd7fc6ae 387
c36361b2 388#define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
fd7fc6ae 389
5d555bb8 390#define RETURN_FAIL_ON_RESTART(flags,flagp) \
f55b7b26 391 RETURN_X_ON_RESTART(0, flags,flagp)
5d555bb8 392#define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
88f063b4 393 RETURN_FAIL_ON_RESTART_FLAGP_OR_FLAGS(flagp, 0)
fd7fc6ae 394
f19b1a63
KW
395/* This converts the named class defined in regcomp.h to its equivalent class
396 * number defined in handy.h. */
397#define namedclass_to_classnum(class) ((int) ((class) / 2))
398#define classnum_to_namedclass(classnum) ((classnum) * 2)
399
de92f5e6
KW
400#define _invlist_union_complement_2nd(a, b, output) \
401 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
402#define _invlist_intersection_complement_2nd(a, b, output) \
403 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
404
1de06328
YO
405/* About scan_data_t.
406
407 During optimisation we recurse through the regexp program performing
408 various inplace (keyhole style) optimisations. In addition study_chunk
409 and scan_commit populate this data structure with information about
538e84ed 410 what strings MUST appear in the pattern. We look for the longest
3b753521 411 string that must appear at a fixed location, and we look for the
1de06328
YO
412 longest string that may appear at a floating location. So for instance
413 in the pattern:
538e84ed 414
1de06328 415 /FOO[xX]A.*B[xX]BAR/
538e84ed 416
1de06328
YO
417 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
418 strings (because they follow a .* construct). study_chunk will identify
419 both FOO and BAR as being the longest fixed and floating strings respectively.
538e84ed 420
1de06328 421 The strings can be composites, for instance
538e84ed 422
1de06328 423 /(f)(o)(o)/
538e84ed 424
1de06328 425 will result in a composite fixed substring 'foo'.
538e84ed 426
1de06328 427 For each string some basic information is maintained:
538e84ed 428
6ea77169 429 - min_offset
1de06328
YO
430 This is the position the string must appear at, or not before.
431 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
432 characters must match before the string we are searching for.
433 Likewise when combined with minlenp and the length of the string it
538e84ed 434 tells us how many characters must appear after the string we have
1de06328 435 found.
538e84ed 436
1de06328
YO
437 - max_offset
438 Only used for floating strings. This is the rightmost point that
ea3daa5d 439 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328 440 string can occur infinitely far to the right.
6e12f832 441 For fixed strings, it is equal to min_offset.
538e84ed 442
1de06328 443 - minlenp
2d608413
KW
444 A pointer to the minimum number of characters of the pattern that the
445 string was found inside. This is important as in the case of positive
538e84ed 446 lookahead or positive lookbehind we can have multiple patterns
1de06328 447 involved. Consider
538e84ed 448
1de06328 449 /(?=FOO).*F/
538e84ed 450
1de06328
YO
451 The minimum length of the pattern overall is 3, the minimum length
452 of the lookahead part is 3, but the minimum length of the part that
538e84ed 453 will actually match is 1. So 'FOO's minimum length is 3, but the
1de06328 454 minimum length for the F is 1. This is important as the minimum length
538e84ed 455 is used to determine offsets in front of and behind the string being
1de06328 456 looked for. Since strings can be composites this is the length of the
486ec47a 457 pattern at the time it was committed with a scan_commit. Note that
1de06328 458 the length is calculated by study_chunk, so that the minimum lengths
538e84ed 459 are not known until the full pattern has been compiled, thus the
1de06328 460 pointer to the value.
538e84ed 461
1de06328 462 - lookbehind
538e84ed 463
1de06328 464 In the case of lookbehind the string being searched for can be
538e84ed 465 offset past the start point of the final matching string.
1de06328
YO
466 If this value was just blithely removed from the min_offset it would
467 invalidate some of the calculations for how many chars must match
468 before or after (as they are derived from min_offset and minlen and
538e84ed 469 the length of the string being searched for).
1de06328
YO
470 When the final pattern is compiled and the data is moved from the
471 scan_data_t structure into the regexp structure the information
538e84ed
KW
472 about lookbehind is factored in, with the information that would
473 have been lost precalculated in the end_shift field for the
1de06328
YO
474 associated string.
475
476 The fields pos_min and pos_delta are used to store the minimum offset
538e84ed 477 and the delta to the maximum offset at the current point in the pattern.
1de06328
YO
478
479*/
2c2d71f5 480
cb6aef8b
DM
481struct scan_data_substrs {
482 SV *str; /* longest substring found in pattern */
483 SSize_t min_offset; /* earliest point in string it can appear */
484 SSize_t max_offset; /* latest point in string it can appear */
485 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
486 SSize_t lookbehind; /* is the pos of the string modified by LB */
11683ecb 487 I32 flags; /* per substring SF_* and SCF_* flags */
cb6aef8b
DM
488};
489
2c2d71f5 490typedef struct scan_data_t {
1de06328
YO
491 /*I32 len_min; unused */
492 /*I32 len_delta; unused */
49f55535 493 SSize_t pos_min;
ea3daa5d 494 SSize_t pos_delta;
2c2d71f5 495 SV *last_found;
ea3daa5d 496 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 497 SSize_t last_start_min;
ea3daa5d 498 SSize_t last_start_max;
2099df82
DM
499 U8 cur_is_floating; /* whether the last_* values should be set as
500 * the next fixed (0) or floating (1)
501 * substring */
6ea77169 502
2099df82
DM
503 /* [0] is longest fixed substring so far, [1] is longest float so far */
504 struct scan_data_substrs substrs[2];
6ea77169 505
11683ecb 506 I32 flags; /* common SF_* and SCF_* flags */
2c2d71f5 507 I32 whilem_c;
49f55535 508 SSize_t *last_closep;
b8f7bb16 509 regnode_ssc *start_class;
2c2d71f5
JH
510} scan_data_t;
511
a687059c 512/*
e50aee73 513 * Forward declarations for pregcomp()'s friends.
a687059c 514 */
a0d0e21e 515
6ea77169 516static const scan_data_t zero_scan_data = {
a2ca2017 517 0, 0, NULL, 0, 0, 0, 0,
6ea77169 518 {
11683ecb
DM
519 { NULL, 0, 0, 0, 0, 0 },
520 { NULL, 0, 0, 0, 0, 0 },
6ea77169
DM
521 },
522 0, 0, NULL, NULL
523};
c277df42 524
11683ecb
DM
525/* study flags */
526
07be1b83
YO
527#define SF_BEFORE_SEOL 0x0001
528#define SF_BEFORE_MEOL 0x0002
11683ecb 529#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
c277df42 530
07be1b83
YO
531#define SF_IS_INF 0x0040
532#define SF_HAS_PAR 0x0080
533#define SF_IN_PAR 0x0100
534#define SF_HAS_EVAL 0x0200
bc604ad8
KW
535
536
537/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
538 * longest substring in the pattern. When it is not set the optimiser keeps
539 * track of position, but does not keep track of the actual strings seen,
540 *
541 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
542 * /foo/i will not.
543 *
544 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
545 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
546 * turned off because of the alternation (BRANCH). */
07be1b83 547#define SCF_DO_SUBSTR 0x0400
bc604ad8 548
653099ff
GS
549#define SCF_DO_STCLASS_AND 0x0800
550#define SCF_DO_STCLASS_OR 0x1000
551#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 552#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 553
786e8c11 554#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
538e84ed 555#define SCF_SEEN_ACCEPT 0x8000
688e0391 556#define SCF_TRIE_DOING_RESTUDY 0x10000
4286711a
YO
557#define SCF_IN_DEFINE 0x20000
558
559
560
07be1b83 561
43fead97 562#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
563
564/* The enums for all these are ordered so things work out correctly */
a62b1201 565#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
538e84ed
KW
566#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
567 == REGEX_DEPENDS_CHARSET)
00b27cfc 568#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
538e84ed
KW
569#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
570 >= REGEX_UNICODE_CHARSET)
571#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
572 == REGEX_ASCII_RESTRICTED_CHARSET)
573#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
574 >= REGEX_ASCII_RESTRICTED_CHARSET)
575#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
576 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 577
43fead97 578#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 579
f2c2a6ab
KW
580/* For programs that want to be strictly Unicode compatible by dying if any
581 * attempt is made to match a non-Unicode code point against a Unicode
582 * property. */
583#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
584
93733859 585#define OOB_NAMEDCLASS -1
b8c5462f 586
8e661ac5
KW
587/* There is no code point that is out-of-bounds, so this is problematic. But
588 * its only current use is to initialize a variable that is always set before
589 * looked at. */
590#define OOB_UNICODE 0xDEADBEEF
591
a0ed51b3 592#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
a0ed51b3 593
8615cb43 594
b45f050a
JF
595/* length of regex to show in messages that don't mark a position within */
596#define RegexLengthToShowInErrorMessages 127
597
598/*
599 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
600 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
601 * op/pragma/warn/regcomp.
602 */
7253e4e3
RK
603#define MARKER1 "<-- HERE" /* marker as it appears in the description */
604#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 605
538e84ed 606#define REPORT_LOCATION " in regex; marked by " MARKER1 \
147e3846 607 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
b45f050a 608
285b5ca0
KW
609/* The code in this file in places uses one level of recursion with parsing
610 * rebased to an alternate string constructed by us in memory. This can take
611 * the form of something that is completely different from the input, or
612 * something that uses the input as part of the alternate. In the first case,
613 * there should be no possibility of an error, as we are in complete control of
51684e3c
KW
614 * the alternate string. But in the second case we don't completely control
615 * the input portion, so there may be errors in that. Here's an example:
285b5ca0
KW
616 * /[abc\x{DF}def]/ui
617 * is handled specially because \x{df} folds to a sequence of more than one
51684e3c 618 * character: 'ss'. What is done is to create and parse an alternate string,
285b5ca0
KW
619 * which looks like this:
620 * /(?:\x{DF}|[abc\x{DF}def])/ui
621 * where it uses the input unchanged in the middle of something it constructs,
622 * which is a branch for the DF outside the character class, and clustering
623 * parens around the whole thing. (It knows enough to skip the DF inside the
624 * class while in this substitute parse.) 'abc' and 'def' may have errors that
625 * need to be reported. The general situation looks like this:
626 *
51684e3c 627 * |<------- identical ------>|
285b5ca0 628 * sI tI xI eI
51684e3c 629 * Input: ---------------------------------------------------------------
285b5ca0
KW
630 * Constructed: ---------------------------------------------------
631 * sC tC xC eC EC
51684e3c 632 * |<------- identical ------>|
285b5ca0 633 *
51684e3c
KW
634 * sI..eI is the portion of the input pattern we are concerned with here.
635 * sC..EC is the constructed substitute parse string.
636 * sC..tC is constructed by us
637 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
638 * In the diagram, these are vertically aligned.
639 * eC..EC is also constructed by us.
640 * xC is the position in the substitute parse string where we found a
641 * problem.
642 * xI is the position in the original pattern corresponding to xC.
285b5ca0 643 *
51684e3c
KW
644 * We want to display a message showing the real input string. Thus we need to
645 * translate from xC to xI. We know that xC >= tC, since the portion of the
646 * string sC..tC has been constructed by us, and so shouldn't have errors. We
647 * get:
648 * xI = tI + (xC - tC)
285b5ca0 649 *
51684e3c
KW
650 * When the substitute parse is constructed, the code needs to set:
651 * RExC_start (sC)
652 * RExC_end (eC)
653 * RExC_copy_start_in_input (tI)
654 * RExC_copy_start_in_constructed (tC)
655 * and restore them when done.
285b5ca0 656 *
51684e3c
KW
657 * During normal processing of the input pattern, both
658 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
659 * sI, so that xC equals xI.
285b5ca0
KW
660 */
661
51684e3c
KW
662#define sI RExC_precomp
663#define eI RExC_precomp_end
664#define sC RExC_start
665#define eC RExC_end
666#define tI RExC_copy_start_in_input
667#define tC RExC_copy_start_in_constructed
668#define xI(xC) (tI + (xC - tC))
669#define xI_offset(xC) (xI(xC) - sI)
285b5ca0
KW
670
671#define REPORT_LOCATION_ARGS(xC) \
672 UTF8fARG(UTF, \
51684e3c 673 (xI(xC) > eI) /* Don't run off end */ \
285b5ca0 674 ? eC - sC /* Length before the <--HERE */ \
bc929b61
KW
675 : ((xI_offset(xC) >= 0) \
676 ? xI_offset(xC) \
677 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
678 IVdf " trying to output message for " \
679 " pattern %.*s", \
51684e3c 680 __FILE__, __LINE__, (IV) xI_offset(xC), \
bc929b61 681 ((int) (eC - sC)), sC), 0)), \
51684e3c 682 sI), /* The input pattern printed up to the <--HERE */ \
285b5ca0 683 UTF8fARG(UTF, \
51684e3c
KW
684 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \
685 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */
c1d900c3 686
8a6d8ec6
HS
687/* Used to point after bad bytes for an error message, but avoid skipping
688 * past a nul byte. */
689#define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
690
ed68fe11
KW
691/* Set up to clean up after our imminent demise */
692#define PREPARE_TO_DIE \
693 STMT_START { \
694 if (RExC_rx_sv) \
695 SAVEFREESV(RExC_rx_sv); \
696 } STMT_END
697
b45f050a
JF
698/*
699 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
700 * arg. Show regex, up to a maximum length. If it's too long, chop and add
701 * "...".
702 */
58e23c8d 703#define _FAIL(code) STMT_START { \
bfed75c6 704 const char *ellipses = ""; \
88f063b4 705 IV len = RExC_precomp_end - RExC_precomp; \
ccb2c380 706 \
ed68fe11 707 PREPARE_TO_DIE; \
ccb2c380
MP
708 if (len > RegexLengthToShowInErrorMessages) { \
709 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
710 len = RegexLengthToShowInErrorMessages - 10; \
711 ellipses = "..."; \
712 } \
58e23c8d 713 code; \
ccb2c380 714} STMT_END
8615cb43 715
58e23c8d 716#define FAIL(msg) _FAIL( \
147e3846 717 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
c1d900c3 718 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
719
720#define FAIL2(msg,arg) _FAIL( \
147e3846 721 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
c1d900c3 722 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 723
b45f050a 724/*
b45f050a
JF
725 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
726 */
ccb2c380 727#define Simple_vFAIL(m) STMT_START { \
ccb2c380 728 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
d528642a 729 m, REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 730} STMT_END
b45f050a
JF
731
732/*
733 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
734 */
ccb2c380 735#define vFAIL(m) STMT_START { \
ed68fe11 736 PREPARE_TO_DIE; \
ccb2c380
MP
737 Simple_vFAIL(m); \
738} STMT_END
b45f050a
JF
739
740/*
741 * Like Simple_vFAIL(), but accepts two arguments.
742 */
ccb2c380 743#define Simple_vFAIL2(m,a1) STMT_START { \
d528642a
KW
744 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
745 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 746} STMT_END
b45f050a
JF
747
748/*
749 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
750 */
ccb2c380 751#define vFAIL2(m,a1) STMT_START { \
ed68fe11 752 PREPARE_TO_DIE; \
ccb2c380
MP
753 Simple_vFAIL2(m, a1); \
754} STMT_END
b45f050a
JF
755
756
757/*
758 * Like Simple_vFAIL(), but accepts three arguments.
759 */
ccb2c380 760#define Simple_vFAIL3(m, a1, a2) STMT_START { \
c1d900c3 761 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 762 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 763} STMT_END
b45f050a
JF
764
765/*
766 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
767 */
ccb2c380 768#define vFAIL3(m,a1,a2) STMT_START { \
ed68fe11 769 PREPARE_TO_DIE; \
ccb2c380
MP
770 Simple_vFAIL3(m, a1, a2); \
771} STMT_END
b45f050a
JF
772
773/*
774 * Like Simple_vFAIL(), but accepts four arguments.
775 */
ccb2c380 776#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
d528642a
KW
777 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
778 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 779} STMT_END
b45f050a 780
95db3ffa 781#define vFAIL4(m,a1,a2,a3) STMT_START { \
ed68fe11 782 PREPARE_TO_DIE; \
95db3ffa
KW
783 Simple_vFAIL4(m, a1, a2, a3); \
784} STMT_END
785
946095af 786/* A specialized version of vFAIL2 that works with UTF8f */
d528642a 787#define vFAIL2utf8f(m, a1) STMT_START { \
ed68fe11 788 PREPARE_TO_DIE; \
d528642a
KW
789 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
790 REPORT_LOCATION_ARGS(RExC_parse)); \
946095af
BF
791} STMT_END
792
3ba22297 793#define vFAIL3utf8f(m, a1, a2) STMT_START { \
ed68fe11 794 PREPARE_TO_DIE; \
3ba22297 795 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 796 REPORT_LOCATION_ARGS(RExC_parse)); \
3ba22297
KW
797} STMT_END
798
d62e749e
KW
799/* Setting this to NULL is a signal to not output warnings */
800#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE RExC_copy_start_in_constructed = NULL
801#define RESTORE_WARNINGS RExC_copy_start_in_constructed = RExC_precomp
802
338190a1
KW
803/* Outputting warnings is generally deferred until the 2nd pass. This is
804 * because the first pass can be restarted, for example if the pattern has to
805 * be converted to UTF-8. If a warning had already been output earlier in the
806 * pass, it would be re-output after the restart. Pass 2 is never restarted,
807 * so the problem simply goes away if we defer the output to that pass. See
d62e749e
KW
808 * [perl #122671]. 'RExC_copy_start_in_constructed' being NULL is a flag to
809 * not generate any warnings */
55bcf857 810#define TO_OUTPUT_WARNINGS(loc) \
d62e749e 811 (PASS2 && RExC_copy_start_in_constructed)
55bcf857 812
77ccba3c
KW
813#define UPDATE_WARNINGS_LOC(loc) NOOP
814
086f21bf 815/* 'warns' is the output of the packWARNx macro used in 'code' */
f8220fe2
KW
816#define _WARN_HELPER(loc, warns, code) \
817 STMT_START { \
d62e749e
KW
818 if (! RExC_copy_start_in_constructed) { \
819 Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \
820 " expected at '%s'", \
821 __FILE__, __LINE__, loc); \
822 } \
55bcf857 823 if (TO_OUTPUT_WARNINGS(loc)) { \
086f21bf
KW
824 if (ckDEAD(warns)) \
825 PREPARE_TO_DIE; \
338190a1 826 code; \
77ccba3c 827 UPDATE_WARNINGS_LOC(loc); \
338190a1 828 } \
f8220fe2 829 } STMT_END
946095af 830
5e0a247b 831/* m is not necessarily a "literal string", in this macro */
f8220fe2
KW
832#define reg_warn_non_literal_string(loc, m) \
833 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
834 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 835 "%s" REPORT_LOCATION, \
f8220fe2 836 m, REPORT_LOCATION_ARGS(loc)))
5e0a247b 837
f8220fe2
KW
838#define ckWARNreg(loc,m) \
839 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
840 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 841 m REPORT_LOCATION, \
f8220fe2 842 REPORT_LOCATION_ARGS(loc)))
ccb2c380 843
f8220fe2
KW
844#define vWARN(loc, m) \
845 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
846 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 847 m REPORT_LOCATION, \
f8220fe2 848 REPORT_LOCATION_ARGS(loc))) \
b927b7e9 849
f8220fe2
KW
850#define vWARN_dep(loc, m) \
851 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
852 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
d528642a 853 m REPORT_LOCATION, \
f8220fe2 854 REPORT_LOCATION_ARGS(loc)))
0d6106aa 855
f8220fe2
KW
856#define ckWARNdep(loc,m) \
857 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
858 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
d528642a 859 m REPORT_LOCATION, \
f8220fe2 860 REPORT_LOCATION_ARGS(loc)))
147508a2 861
f8220fe2
KW
862#define ckWARNregdep(loc,m) \
863 _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
864 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
d528642a
KW
865 WARN_REGEXP), \
866 m REPORT_LOCATION, \
f8220fe2 867 REPORT_LOCATION_ARGS(loc)))
ccb2c380 868
f8220fe2
KW
869#define ckWARN2reg_d(loc,m, a1) \
870 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
871 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
d528642a 872 m REPORT_LOCATION, \
f8220fe2 873 a1, REPORT_LOCATION_ARGS(loc)))
2335b3d3 874
f8220fe2
KW
875#define ckWARN2reg(loc, m, a1) \
876 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
877 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 878 m REPORT_LOCATION, \
f8220fe2 879 a1, REPORT_LOCATION_ARGS(loc)))
ccb2c380 880
f8220fe2
KW
881#define vWARN3(loc, m, a1, a2) \
882 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
883 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 884 m REPORT_LOCATION, \
f8220fe2 885 a1, a2, REPORT_LOCATION_ARGS(loc)))
ccb2c380 886
f8220fe2
KW
887#define ckWARN3reg(loc, m, a1, a2) \
888 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
889 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
890 m REPORT_LOCATION, \
891 a1, a2, \
f8220fe2 892 REPORT_LOCATION_ARGS(loc)))
668c081a 893
f8220fe2
KW
894#define vWARN4(loc, m, a1, a2, a3) \
895 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
896 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
897 m REPORT_LOCATION, \
898 a1, a2, a3, \
f8220fe2 899 REPORT_LOCATION_ARGS(loc)))
ccb2c380 900
f8220fe2
KW
901#define ckWARN4reg(loc, m, a1, a2, a3) \
902 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
903 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
904 m REPORT_LOCATION, \
905 a1, a2, a3, \
f8220fe2 906 REPORT_LOCATION_ARGS(loc)))
668c081a 907
f8220fe2
KW
908#define vWARN5(loc, m, a1, a2, a3, a4) \
909 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
910 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
911 m REPORT_LOCATION, \
912 a1, a2, a3, a4, \
f8220fe2 913 REPORT_LOCATION_ARGS(loc)))
9d1d55b5 914
aea5f181
KW
915#define ckWARNexperimental(loc, class, m) \
916 _WARN_HELPER(loc, packWARN(class), \
917 Perl_ck_warner_d(aTHX_ packWARN(class), \
918 m REPORT_LOCATION, \
919 REPORT_LOCATION_ARGS(loc)))
920
1a322bb4
KW
921/* Convert between a pointer to a node and its offset from the beginning of the
922 * program */
923#define REGNODE_p(offset) (RExC_emit_start + (offset))
924#define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
925
538e84ed 926/* Macros for recording node offsets. 20001227 mjd@plover.com
fac92740
MJD
927 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
928 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
929 * Element 0 holds the number n.
07be1b83 930 * Position is 1 indexed.
fac92740 931 */
7122b237 932#ifndef RE_TRACK_PATTERN_OFFSETS
d7c442ce 933#define Set_Node_Offset_To_R(offset,byte)
7122b237
YO
934#define Set_Node_Offset(node,byte)
935#define Set_Cur_Node_Offset
936#define Set_Node_Length_To_R(node,len)
937#define Set_Node_Length(node,len)
6a86c6ad 938#define Set_Node_Cur_Length(node,start)
538e84ed
KW
939#define Node_Offset(n)
940#define Node_Length(n)
7122b237
YO
941#define Set_Node_Offset_Length(node,offset,len)
942#define ProgLen(ri) ri->u.proglen
943#define SetProgLen(ri,x) ri->u.proglen = x
944#else
945#define ProgLen(ri) ri->u.offsets[0]
946#define SetProgLen(ri,x) ri->u.offsets[0] = x
d7c442ce 947#define Set_Node_Offset_To_R(offset,byte) STMT_START { \
ccb2c380
MP
948 if (! SIZE_ONLY) { \
949 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
d7c442ce
KW
950 __LINE__, (int)(offset), (int)(byte))); \
951 if((offset) < 0) { \
538e84ed 952 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
d7c442ce 953 (int)(offset)); \
ccb2c380 954 } else { \
d7c442ce 955 RExC_offsets[2*(offset)-1] = (byte); \
ccb2c380
MP
956 } \
957 } \
958} STMT_END
959
88f063b4 960#define Set_Node_Offset(node,byte) \
1a322bb4 961 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
ccb2c380
MP
962#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
963
964#define Set_Node_Length_To_R(node,len) STMT_START { \
965 if (! SIZE_ONLY) { \
966 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 967 __LINE__, (int)(node), (int)(len))); \
ccb2c380 968 if((node) < 0) { \
538e84ed
KW
969 Perl_croak(aTHX_ "value of node is %d in Length macro", \
970 (int)(node)); \
ccb2c380
MP
971 } else { \
972 RExC_offsets[2*(node)] = (len); \
973 } \
974 } \
975} STMT_END
976
977#define Set_Node_Length(node,len) \
1a322bb4 978 Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
6a86c6ad
NC
979#define Set_Node_Cur_Length(node, start) \
980 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
981
982/* Get offsets and lengths */
1a322bb4
KW
983#define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
984#define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
fac92740 985
07be1b83 986#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
1a322bb4
KW
987 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \
988 Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \
07be1b83 989} STMT_END
7122b237 990#endif
07be1b83
YO
991
992#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
993#define EXPERIMENTAL_INPLACESCAN
f427392e 994#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 995
cb41e5d6
YO
996#ifdef DEBUGGING
997int
6ad9a8ab 998Perl_re_printf(pTHX_ const char *fmt, ...)
cb41e5d6
YO
999{
1000 va_list ap;
1001 int result;
1002 PerlIO *f= Perl_debug_log;
1003 PERL_ARGS_ASSERT_RE_PRINTF;
1004 va_start(ap, fmt);
1005 result = PerlIO_vprintf(f, fmt, ap);
1006 va_end(ap);
1007 return result;
1008}
1009
1010int
7b031478 1011Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
cb41e5d6
YO
1012{
1013 va_list ap;
1014 int result;
1015 PerlIO *f= Perl_debug_log;
1016 PERL_ARGS_ASSERT_RE_INDENTF;
1017 va_start(ap, depth);
daeb874b 1018 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
cb41e5d6
YO
1019 result = PerlIO_vprintf(f, fmt, ap);
1020 va_end(ap);
1021 return result;
1022}
1023#endif /* DEBUGGING */
1024
7b031478 1025#define DEBUG_RExC_seen() \
538e84ed 1026 DEBUG_OPTIMISE_MORE_r({ \
88f063b4 1027 Perl_re_printf( aTHX_ "RExC_seen: "); \
538e84ed 1028 \
e384d5c1 1029 if (RExC_seen & REG_ZERO_LEN_SEEN) \
88f063b4 1030 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
538e84ed 1031 \
e384d5c1 1032 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
88f063b4 1033 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
538e84ed 1034 \
e384d5c1 1035 if (RExC_seen & REG_GPOS_SEEN) \
88f063b4 1036 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
538e84ed 1037 \
e384d5c1 1038 if (RExC_seen & REG_RECURSE_SEEN) \
88f063b4 1039 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
538e84ed 1040 \
7b031478 1041 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
88f063b4 1042 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
538e84ed 1043 \
e384d5c1 1044 if (RExC_seen & REG_VERBARG_SEEN) \
88f063b4 1045 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
538e84ed 1046 \
e384d5c1 1047 if (RExC_seen & REG_CUTGROUP_SEEN) \
88f063b4 1048 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
538e84ed 1049 \
e384d5c1 1050 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
88f063b4 1051 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
538e84ed 1052 \
e384d5c1 1053 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
88f063b4 1054 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
538e84ed 1055 \
7b031478 1056 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
88f063b4 1057 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
ee273784 1058 \
88f063b4 1059 Perl_re_printf( aTHX_ "\n"); \
9e9ecfdf
YO
1060 });
1061
fdfb4f21 1062#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
6ad9a8ab 1063 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
fdfb4f21 1064
fdfb4f21 1065
f5a36d78
DM
1066#ifdef DEBUGGING
1067static void
1068S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1069 const char *close_str)
1070{
1071 if (!flags)
1072 return;
1073
1074 Perl_re_printf( aTHX_ "%s", open_str);
11683ecb
DM
1075 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1076 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
f5a36d78
DM
1077 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1078 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1079 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1080 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1081 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1082 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1083 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1084 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1085 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1086 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1087 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1088 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1089 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1090 Perl_re_printf( aTHX_ "%s", close_str);
1091}
1092
1093
1094static void
1095S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1096 U32 depth, int is_inf)
1097{
1098 GET_RE_DEBUG_FLAGS_DECL;
1099
1100 DEBUG_OPTIMISE_MORE_r({
1101 if (!data)
1102 return;
1103 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1104 depth,
1105 where,
1106 (IV)data->pos_min,
1107 (IV)data->pos_delta,
1108 (UV)data->flags
1109 );
1110
1111 S_debug_show_study_flags(aTHX_ data->flags," [","]");
fdfb4f21 1112
f5a36d78
DM
1113 Perl_re_printf( aTHX_
1114 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1115 (IV)data->whilem_c,
1116 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1117 is_inf ? "INF " : ""
1118 );
1119
11683ecb 1120 if (data->last_found) {
37b6262f 1121 int i;
11683ecb
DM
1122 Perl_re_printf(aTHX_
1123 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1124 SvPVX_const(data->last_found),
1125 (IV)data->last_end,
1126 (IV)data->last_start_min,
1127 (IV)data->last_start_max
1128 );
1129
2099df82 1130 for (i = 0; i < 2; i++) {
37b6262f
DM
1131 Perl_re_printf(aTHX_
1132 " %s%s: '%s' @ %" IVdf "/%" IVdf,
2099df82
DM
1133 data->cur_is_floating == i ? "*" : "",
1134 i ? "Float" : "Fixed",
37b6262f
DM
1135 SvPVX_const(data->substrs[i].str),
1136 (IV)data->substrs[i].min_offset,
1137 (IV)data->substrs[i].max_offset
1138 );
1139 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1140 }
11683ecb 1141 }
f5a36d78
DM
1142
1143 Perl_re_printf( aTHX_ "\n");
1144 });
1145}
1146
1147
1148static void
1149S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1150 regnode *scan, U32 depth, U32 flags)
1151{
1152 GET_RE_DEBUG_FLAGS_DECL;
1153
1154 DEBUG_OPTIMISE_r({
1155 regnode *Next;
1156
1157 if (!scan)
1158 return;
1159 Next = regnext(scan);
1160 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1161 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1162 depth,
1163 str,
1164 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1165 Next ? (REG_NODE_NUM(Next)) : 0 );
1166 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1167 Perl_re_printf( aTHX_ "\n");
1168 });
1169}
1170
1171
1172# define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1173 S_debug_studydata(aTHX_ where, data, depth, is_inf)
1174
1175# define DEBUG_PEEP(str, scan, depth, flags) \
1176 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1177
1178#else
1179# define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1180# define DEBUG_PEEP(str, scan, depth, flags) NOOP
1181#endif
1de06328 1182
cb41e5d6 1183
c6871b76
KW
1184/* =========================================================
1185 * BEGIN edit_distance stuff.
1186 *
1187 * This calculates how many single character changes of any type are needed to
1188 * transform a string into another one. It is taken from version 3.1 of
1189 *
1190 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1191 */
1192
1193/* Our unsorted dictionary linked list. */
1194/* Note we use UVs, not chars. */
1195
1196struct dictionary{
1197 UV key;
1198 UV value;
1199 struct dictionary* next;
1200};
1201typedef struct dictionary item;
1202
1203
1204PERL_STATIC_INLINE item*
88f063b4 1205push(UV key, item* curr)
c6871b76
KW
1206{
1207 item* head;
d09f14bf 1208 Newx(head, 1, item);
c6871b76
KW
1209 head->key = key;
1210 head->value = 0;
1211 head->next = curr;
1212 return head;
1213}
1214
1215
1216PERL_STATIC_INLINE item*
1217find(item* head, UV key)
1218{
1219 item* iterator = head;
1220 while (iterator){
1221 if (iterator->key == key){
1222 return iterator;
1223 }
1224 iterator = iterator->next;
1225 }
1226
1227 return NULL;
1228}
1229
1230PERL_STATIC_INLINE item*
88f063b4 1231uniquePush(item* head, UV key)
c6871b76
KW
1232{
1233 item* iterator = head;
1234
1235 while (iterator){
1236 if (iterator->key == key) {
1237 return head;
1238 }
1239 iterator = iterator->next;
1240 }
1241
88f063b4 1242 return push(key, head);
c6871b76
KW
1243}
1244
1245PERL_STATIC_INLINE void
1246dict_free(item* head)
1247{
1248 item* iterator = head;
1249
1250 while (iterator) {
1251 item* temp = iterator;
1252 iterator = iterator->next;
1253 Safefree(temp);
1254 }
1255
1256 head = NULL;
1257}
1258
1259/* End of Dictionary Stuff */
1260
1261/* All calculations/work are done here */
1262STATIC int
1263S_edit_distance(const UV* src,
1264 const UV* tgt,
1265 const STRLEN x, /* length of src[] */
1266 const STRLEN y, /* length of tgt[] */
1267 const SSize_t maxDistance
1268)
1269{
1270 item *head = NULL;
88f063b4 1271 UV swapCount, swapScore, targetCharCount, i, j;
c6871b76
KW
1272 UV *scores;
1273 UV score_ceil = x + y;
1274
1275 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1276
1277 /* intialize matrix start values */
d09f14bf 1278 Newx(scores, ( (x + 2) * (y + 2)), UV);
c6871b76
KW
1279 scores[0] = score_ceil;
1280 scores[1 * (y + 2) + 0] = score_ceil;
1281 scores[0 * (y + 2) + 1] = score_ceil;
1282 scores[1 * (y + 2) + 1] = 0;
88f063b4 1283 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
c6871b76
KW
1284
1285 /* work loops */
1286 /* i = src index */
1287 /* j = tgt index */
1288 for (i=1;i<=x;i++) {
1289 if (i < x)
88f063b4 1290 head = uniquePush(head, src[i]);
c6871b76
KW
1291 scores[(i+1) * (y + 2) + 1] = i;
1292 scores[(i+1) * (y + 2) + 0] = score_ceil;
1293 swapCount = 0;
1294
1295 for (j=1;j<=y;j++) {
1296 if (i == 1) {
1297 if(j < y)
88f063b4 1298 head = uniquePush(head, tgt[j]);
c6871b76
KW
1299 scores[1 * (y + 2) + (j + 1)] = j;
1300 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1301 }
1302
88f063b4 1303 targetCharCount = find(head, tgt[j-1])->value;
c6871b76
KW
1304 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1305
1306 if (src[i-1] != tgt[j-1]){
1307 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1308 }
1309 else {
1310 swapCount = j;
1311 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1312 }
1313 }
1314
88f063b4 1315 find(head, src[i-1])->value = i;
c6871b76
KW
1316 }
1317
1318 {
1319 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1320 dict_free(head);
1321 Safefree(scores);
1322 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1323 }
1324}
1325
1326/* END of edit_distance() stuff
1327 * ========================================================= */
1328
8e35b056
KW
1329/* is c a control character for which we have a mnemonic? */
1330#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1331
549b4e78
KW
1332STATIC const char *
1333S_cntrl_to_mnemonic(const U8 c)
1334{
1335 /* Returns the mnemonic string that represents character 'c', if one
1336 * exists; NULL otherwise. The only ones that exist for the purposes of
1337 * this routine are a few control characters */
1338
1339 switch (c) {
1340 case '\a': return "\\a";
1341 case '\b': return "\\b";
1342 case ESC_NATIVE: return "\\e";
1343 case '\f': return "\\f";
1344 case '\n': return "\\n";
1345 case '\r': return "\\r";
1346 case '\t': return "\\t";
1347 }
1348
1349 return NULL;
1350}
1351
653099ff 1352/* Mark that we cannot extend a found fixed substring at this point.
37b6262f 1353 Update the longest found anchored substring or the longest found
653099ff
GS
1354 floating substrings if needed. */
1355
4327152a 1356STATIC void
ea3daa5d
FC
1357S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1358 SSize_t *minlenp, int is_inf)
c277df42 1359{
e1ec3a88 1360 const STRLEN l = CHR_SVLEN(data->last_found);
2099df82 1361 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
a2ca2017 1362 const STRLEN old_l = CHR_SVLEN(longest_sv);
1de06328 1363 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1364
7918f24d
NC
1365 PERL_ARGS_ASSERT_SCAN_COMMIT;
1366
c277df42 1367 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
2099df82 1368 const U8 i = data->cur_is_floating;
a2ca2017 1369 SvSetMagicSV(longest_sv, data->last_found);
37b6262f
DM
1370 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1371
2099df82
DM
1372 if (!i) /* fixed */
1373 data->substrs[0].max_offset = data->substrs[0].min_offset;
1374 else { /* float */
1375 data->substrs[1].max_offset = (l
646e8787
DM
1376 ? data->last_start_max
1377 : (data->pos_delta > SSize_t_MAX - data->pos_min
ea3daa5d
FC
1378 ? SSize_t_MAX
1379 : data->pos_min + data->pos_delta));
1380 if (is_inf
2099df82
DM
1381 || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1382 data->substrs[1].max_offset = SSize_t_MAX;
37b6262f 1383 }
11683ecb 1384
37b6262f
DM
1385 if (data->flags & SF_BEFORE_EOL)
1386 data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
1387 else
1388 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1389 data->substrs[i].minlenp = minlenp;
1390 data->substrs[i].lookbehind = 0;
c277df42 1391 }
37b6262f 1392
c277df42 1393 SvCUR_set(data->last_found, 0);
0eda9292 1394 {
a28509cc 1395 SV * const sv = data->last_found;
097eb12c
AL
1396 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1397 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1398 if (mg)
1399 mg->mg_len = 0;
1400 }
0eda9292 1401 }
c277df42
IZ
1402 data->last_end = -1;
1403 data->flags &= ~SF_BEFORE_EOL;
f5a36d78 1404 DEBUG_STUDYDATA("commit", data, 0, is_inf);
c277df42
IZ
1405}
1406
cdd87c1d
KW
1407/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1408 * list that describes which code points it matches */
1409
653099ff 1410STATIC void
3420edd7 1411S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 1412{
cdd87c1d
KW
1413 /* Set the SSC 'ssc' to match an empty string or any code point */
1414
557bd3fb 1415 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 1416
71068078 1417 assert(is_ANYOF_SYNTHETIC(ssc));
3fffb88a 1418
0854ea0b
KW
1419 /* mortalize so won't leak */
1420 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
93e92956 1421 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
653099ff
GS
1422}
1423
653099ff 1424STATIC int
dc3bf405 1425S_ssc_is_anything(const regnode_ssc *ssc)
653099ff 1426{
c144baaa
KW
1427 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1428 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1429 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1430 * in any way, so there's no point in using it */
cdd87c1d
KW
1431
1432 UV start, end;
1433 bool ret;
653099ff 1434
557bd3fb 1435 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 1436
71068078 1437 assert(is_ANYOF_SYNTHETIC(ssc));
cdd87c1d 1438
93e92956 1439 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
cdd87c1d
KW
1440 return FALSE;
1441 }
1442
1443 /* See if the list consists solely of the range 0 - Infinity */
1444 invlist_iterinit(ssc->invlist);
1445 ret = invlist_iternext(ssc->invlist, &start, &end)
1446 && start == 0
1447 && end == UV_MAX;
1448
1449 invlist_iterfinish(ssc->invlist);
1450
1451 if (ret) {
1452 return TRUE;
1453 }
1454
1455 /* If e.g., both \w and \W are set, matches everything */
e0e1be5f 1456 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1457 int i;
1458 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1459 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1460 return TRUE;
1461 }
1462 }
1463 }
1464
1465 return FALSE;
653099ff
GS
1466}
1467
653099ff 1468STATIC void
cdd87c1d 1469S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 1470{
cdd87c1d
KW
1471 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1472 * string, any code point, or any posix class under locale */
1473
557bd3fb 1474 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 1475
557bd3fb 1476 Zero(ssc, 1, regnode_ssc);
71068078 1477 set_ANYOF_SYNTHETIC(ssc);
93e92956 1478 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
3420edd7 1479 ssc_anything(ssc);
cdd87c1d 1480
2f306ab9
KW
1481 /* If any portion of the regex is to operate under locale rules that aren't
1482 * fully known at compile time, initialization includes it. The reason
1483 * this isn't done for all regexes is that the optimizer was written under
1484 * the assumption that locale was all-or-nothing. Given the complexity and
1485 * lack of documentation in the optimizer, and that there are inadequate
1486 * test cases for locale, many parts of it may not work properly, it is
1487 * safest to avoid locale unless necessary. */
cdd87c1d
KW
1488 if (RExC_contains_locale) {
1489 ANYOF_POSIXL_SETALL(ssc);
cdd87c1d
KW
1490 }
1491 else {
1492 ANYOF_POSIXL_ZERO(ssc);
1493 }
653099ff
GS
1494}
1495
b423522f 1496STATIC int
dc3bf405
BF
1497S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1498 const regnode_ssc *ssc)
b423522f
KW
1499{
1500 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1501 * to the list of code points matched, and locale posix classes; hence does
1502 * not check its flags) */
1503
1504 UV start, end;
1505 bool ret;
1506
1507 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1508
71068078 1509 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1510
1511 invlist_iterinit(ssc->invlist);
1512 ret = invlist_iternext(ssc->invlist, &start, &end)
1513 && start == 0
1514 && end == UV_MAX;
1515
1516 invlist_iterfinish(ssc->invlist);
1517
1518 if (! ret) {
1519 return FALSE;
1520 }
1521
e0e1be5f 1522 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
31f05a37 1523 return FALSE;
b423522f
KW
1524 }
1525
1526 return TRUE;
1527}
1528
1529STATIC SV*
1530S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
5c0f85ef 1531 const regnode_charclass* const node)
b423522f
KW
1532{
1533 /* Returns a mortal inversion list defining which code points are matched
1534 * by 'node', which is of type ANYOF. Handles complementing the result if
1535 * appropriate. If some code points aren't knowable at this time, the
31f05a37
KW
1536 * returned list must, and will, contain every code point that is a
1537 * possibility. */
b423522f 1538
e2506fa7 1539 SV* invlist = NULL;
1ee208c4 1540 SV* only_utf8_locale_invlist = NULL;
b423522f
KW
1541 unsigned int i;
1542 const U32 n = ARG(node);
31f05a37 1543 bool new_node_has_latin1 = FALSE;
b423522f
KW
1544
1545 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1546
1547 /* Look at the data structure created by S_set_ANYOF_arg() */
93e92956 1548 if (n != ANYOF_ONLY_HAS_BITMAP) {
b423522f
KW
1549 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1550 AV * const av = MUTABLE_AV(SvRV(rv));
1551 SV **const ary = AvARRAY(av);
1552 assert(RExC_rxi->data->what[n] == 's');
1553
1554 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
28118b9c 1555 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1]), NULL));
b423522f
KW
1556 }
1557 else if (ary[0] && ary[0] != &PL_sv_undef) {
1558
1559 /* Here, no compile-time swash, and there are things that won't be
1560 * known until runtime -- we have to assume it could be anything */
e2506fa7 1561 invlist = sv_2mortal(_new_invlist(1));
b423522f
KW
1562 return _add_range_to_invlist(invlist, 0, UV_MAX);
1563 }
1ee208c4 1564 else if (ary[3] && ary[3] != &PL_sv_undef) {
b423522f
KW
1565
1566 /* Here no compile-time swash, and no run-time only data. Use the
1567 * node's inversion list */
28118b9c 1568 invlist = sv_2mortal(invlist_clone(ary[3], NULL));
1ee208c4
KW
1569 }
1570
1571 /* Get the code points valid only under UTF-8 locales */
037715a6 1572 if ((ANYOF_FLAGS(node) & ANYOFL_FOLD)
1ee208c4
KW
1573 && ary[2] && ary[2] != &PL_sv_undef)
1574 {
1575 only_utf8_locale_invlist = ary[2];
b423522f
KW
1576 }
1577 }
1578
e2506fa7
KW
1579 if (! invlist) {
1580 invlist = sv_2mortal(_new_invlist(0));
1581 }
1582
dcb20b36
KW
1583 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1584 * code points, and an inversion list for the others, but if there are code
1585 * points that should match only conditionally on the target string being
1586 * UTF-8, those are placed in the inversion list, and not the bitmap.
1587 * Since there are circumstances under which they could match, they are
1588 * included in the SSC. But if the ANYOF node is to be inverted, we have
1589 * to exclude them here, so that when we invert below, the end result
1590 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1591 * have to do this here before we add the unconditionally matched code
1592 * points */
b423522f
KW
1593 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1594 _invlist_intersection_complement_2nd(invlist,
1595 PL_UpperLatin1,
1596 &invlist);
1597 }
1598
1599 /* Add in the points from the bit map */
dcb20b36 1600 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
b423522f 1601 if (ANYOF_BITMAP_TEST(node, i)) {
6f8848d5
TC
1602 unsigned int start = i++;
1603
1604 for (; i < NUM_ANYOF_CODE_POINTS && ANYOF_BITMAP_TEST(node, i); ++i) {
1605 /* empty */
1606 }
1607 invlist = _add_range_to_invlist(invlist, start, i-1);
31f05a37 1608 new_node_has_latin1 = TRUE;
b423522f
KW
1609 }
1610 }
1611
1612 /* If this can match all upper Latin1 code points, have to add them
ac33c516
KW
1613 * as well. But don't add them if inverting, as when that gets done below,
1614 * it would exclude all these characters, including the ones it shouldn't
1615 * that were added just above */
1616 if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
f240c685
KW
1617 && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1618 {
b423522f
KW
1619 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1620 }
1621
1622 /* Similarly for these */
93e92956 1623 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
e0a1ff7a 1624 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
b423522f
KW
1625 }
1626
1627 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1628 _invlist_invert(invlist);
1629 }
037715a6 1630 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOFL_FOLD) {
31f05a37
KW
1631
1632 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1633 * locale. We can skip this if there are no 0-255 at all. */
1634 _invlist_union(invlist, PL_Latin1, &invlist);
1635 }
1636
1ee208c4
KW
1637 /* Similarly add the UTF-8 locale possible matches. These have to be
1638 * deferred until after the non-UTF-8 locale ones are taken care of just
1639 * above, or it leads to wrong results under ANYOF_INVERT */
1640 if (only_utf8_locale_invlist) {
31f05a37 1641 _invlist_union_maybe_complement_2nd(invlist,
1ee208c4 1642 only_utf8_locale_invlist,
31f05a37
KW
1643 ANYOF_FLAGS(node) & ANYOF_INVERT,
1644 &invlist);
1645 }
b423522f
KW
1646
1647 return invlist;
1648}
1649
1051e1c4 1650/* These two functions currently do the exact same thing */
557bd3fb 1651#define ssc_init_zero ssc_init
653099ff 1652
cdd87c1d
KW
1653#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1654#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1655
557bd3fb 1656/* 'AND' a given class with another one. Can create false positives. 'ssc'
93e92956
KW
1657 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1658 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1659
653099ff 1660STATIC void
b423522f 1661S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1662 const regnode_charclass *and_with)
653099ff 1663{
cdd87c1d
KW
1664 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1665 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1666
a0dd4231
KW
1667 SV* anded_cp_list;
1668 U8 anded_flags;
1e6ade67 1669
cdd87c1d 1670 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1671
71068078 1672 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1673
1674 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1675 * the code point inversion list and just the relevant flags */
71068078 1676 if (is_ANYOF_SYNTHETIC(and_with)) {
7dcac5f6 1677 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
a0dd4231 1678 anded_flags = ANYOF_FLAGS(and_with);
e9b08962
KW
1679
1680 /* XXX This is a kludge around what appears to be deficiencies in the
1681 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1682 * there are paths through the optimizer where it doesn't get weeded
1683 * out when it should. And if we don't make some extra provision for
1684 * it like the code just below, it doesn't get added when it should.
1685 * This solution is to add it only when AND'ing, which is here, and
1686 * only when what is being AND'ed is the pristine, original node
1687 * matching anything. Thus it is like adding it to ssc_anything() but
1688 * only when the result is to be AND'ed. Probably the same solution
1689 * could be adopted for the same problem we have with /l matching,
1690 * which is solved differently in S_ssc_init(), and that would lead to
1691 * fewer false positives than that solution has. But if this solution
1692 * creates bugs, the consequences are only that a warning isn't raised
1693 * that should be; while the consequences for having /l bugs is
1694 * incorrect matches */
7dcac5f6 1695 if (ssc_is_anything((regnode_ssc *)and_with)) {
f240c685 1696 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
e9b08962 1697 }
a0dd4231
KW
1698 }
1699 else {
5c0f85ef 1700 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
f240c685
KW
1701 if (OP(and_with) == ANYOFD) {
1702 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1703 }
1704 else {
1705 anded_flags = ANYOF_FLAGS(and_with)
1706 &( ANYOF_COMMON_FLAGS
108316fb
KW
1707 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1708 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
d1c40ef5
KW
1709 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
1710 anded_flags &=
1711 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1712 }
f240c685 1713 }
a0dd4231
KW
1714 }
1715
1716 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1717
1718 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1719 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1720 * 'and_with' may be inverted. When not inverted, we have the situation of
1721 * computing:
1722 * (C1 | P1) & (C2 | P2)
1723 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1724 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1725 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1726 * <= ((C1 & C2) | P1 | P2)
1727 * Alternatively, the last few steps could be:
1728 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1729 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1730 * <= (C1 | C2 | (P1 & P2))
1731 * We favor the second approach if either P1 or P2 is non-empty. This is
1732 * because these components are a barrier to doing optimizations, as what
1733 * they match cannot be known until the moment of matching as they are
1734 * dependent on the current locale, 'AND"ing them likely will reduce or
1735 * eliminate them.
1736 * But we can do better if we know that C1,P1 are in their initial state (a
1737 * frequent occurrence), each matching everything:
1738 * (<everything>) & (C2 | P2) = C2 | P2
1739 * Similarly, if C2,P2 are in their initial state (again a frequent
1740 * occurrence), the result is a no-op
1741 * (C1 | P1) & (<everything>) = C1 | P1
1742 *
1743 * Inverted, we have
1744 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1745 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1746 * <= (C1 & ~C2) | (P1 & ~P2)
1747 * */
1aa99e6b 1748
a0dd4231 1749 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
71068078 1750 && ! is_ANYOF_SYNTHETIC(and_with))
a0dd4231 1751 {
cdd87c1d 1752 unsigned int i;
8951c461 1753
cdd87c1d
KW
1754 ssc_intersection(ssc,
1755 anded_cp_list,
1756 FALSE /* Has already been inverted */
1757 );
c6b76537 1758
cdd87c1d
KW
1759 /* If either P1 or P2 is empty, the intersection will be also; can skip
1760 * the loop */
93e92956 1761 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
cdd87c1d
KW
1762 ANYOF_POSIXL_ZERO(ssc);
1763 }
e0e1be5f 1764 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1765
1766 /* Note that the Posix class component P from 'and_with' actually
1767 * looks like:
1768 * P = Pa | Pb | ... | Pn
1769 * where each component is one posix class, such as in [\w\s].
1770 * Thus
1771 * ~P = ~(Pa | Pb | ... | Pn)
1772 * = ~Pa & ~Pb & ... & ~Pn
1773 * <= ~Pa | ~Pb | ... | ~Pn
1774 * The last is something we can easily calculate, but unfortunately
1775 * is likely to have many false positives. We could do better
1776 * in some (but certainly not all) instances if two classes in
1777 * P have known relationships. For example
1778 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1779 * So
1780 * :lower: & :print: = :lower:
1781 * And similarly for classes that must be disjoint. For example,
1782 * since \s and \w can have no elements in common based on rules in
1783 * the POSIX standard,
1784 * \w & ^\S = nothing
1785 * Unfortunately, some vendor locales do not meet the Posix
1786 * standard, in particular almost everything by Microsoft.
1787 * The loop below just changes e.g., \w into \W and vice versa */
1788
1ee208c4 1789 regnode_charclass_posixl temp;
cdd87c1d
KW
1790 int add = 1; /* To calculate the index of the complement */
1791
b1234259 1792 Zero(&temp, 1, regnode_charclass_posixl);
cdd87c1d
KW
1793 ANYOF_POSIXL_ZERO(&temp);
1794 for (i = 0; i < ANYOF_MAX; i++) {
1795 assert(i % 2 != 0
7dcac5f6
KW
1796 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1797 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
cdd87c1d 1798
7dcac5f6 1799 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
cdd87c1d
KW
1800 ANYOF_POSIXL_SET(&temp, i + add);
1801 }
1802 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1803 }
1804 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1805
cdd87c1d
KW
1806 } /* else ssc already has no posixes */
1807 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1808 in its initial state */
71068078 1809 else if (! is_ANYOF_SYNTHETIC(and_with)
7dcac5f6 1810 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
cdd87c1d
KW
1811 {
1812 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1813 * copy it over 'ssc' */
1814 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
71068078 1815 if (is_ANYOF_SYNTHETIC(and_with)) {
cdd87c1d
KW
1816 StructCopy(and_with, ssc, regnode_ssc);
1817 }
1818 else {
1819 ssc->invlist = anded_cp_list;
1820 ANYOF_POSIXL_ZERO(ssc);
93e92956 1821 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1822 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
cdd87c1d
KW
1823 }
1824 }
1825 }
e0e1be5f 1826 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
93e92956 1827 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
cdd87c1d
KW
1828 {
1829 /* One or the other of P1, P2 is non-empty. */
93e92956 1830 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1ea8b7fe
KW
1831 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1832 }
cdd87c1d
KW
1833 ssc_union(ssc, anded_cp_list, FALSE);
1834 }
1835 else { /* P1 = P2 = empty */
1836 ssc_intersection(ssc, anded_cp_list, FALSE);
1837 }
137165a6 1838 }
653099ff
GS
1839}
1840
653099ff 1841STATIC void
cdd87c1d 1842S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1843 const regnode_charclass *or_with)
653099ff 1844{
cdd87c1d
KW
1845 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1846 * another SSC or a regular ANYOF class. Can create false positives if
1847 * 'or_with' is to be inverted. */
7918f24d 1848
a0dd4231
KW
1849 SV* ored_cp_list;
1850 U8 ored_flags;
c6b76537 1851
cdd87c1d 1852 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 1853
71068078 1854 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1855
1856 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1857 * the code point inversion list and just the relevant flags */
71068078 1858 if (is_ANYOF_SYNTHETIC(or_with)) {
7dcac5f6 1859 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
a0dd4231
KW
1860 ored_flags = ANYOF_FLAGS(or_with);
1861 }
1862 else {
5c0f85ef 1863 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
eff8b7dc 1864 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
f240c685
KW
1865 if (OP(or_with) != ANYOFD) {
1866 ored_flags
1867 |= ANYOF_FLAGS(or_with)
108316fb
KW
1868 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1869 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
d1c40ef5
KW
1870 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
1871 ored_flags |=
1872 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1873 }
f240c685 1874 }
a0dd4231
KW
1875 }
1876
1877 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
1878
1879 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1880 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1881 * 'or_with' may be inverted. When not inverted, we have the simple
1882 * situation of computing:
1883 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1884 * If P1|P2 yields a situation with both a class and its complement are
1885 * set, like having both \w and \W, this matches all code points, and we
1886 * can delete these from the P component of the ssc going forward. XXX We
1887 * might be able to delete all the P components, but I (khw) am not certain
1888 * about this, and it is better to be safe.
1889 *
1890 * Inverted, we have
1891 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1892 * <= (C1 | P1) | ~C2
1893 * <= (C1 | ~C2) | P1
1894 * (which results in actually simpler code than the non-inverted case)
1895 * */
9826f543 1896
a0dd4231 1897 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
71068078 1898 && ! is_ANYOF_SYNTHETIC(or_with))
a0dd4231 1899 {
cdd87c1d 1900 /* We ignore P2, leaving P1 going forward */
1ea8b7fe 1901 } /* else Not inverted */
93e92956 1902 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1903 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
e0e1be5f 1904 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1905 unsigned int i;
1906 for (i = 0; i < ANYOF_MAX; i += 2) {
1907 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1908 {
1909 ssc_match_all_cp(ssc);
1910 ANYOF_POSIXL_CLEAR(ssc, i);
1911 ANYOF_POSIXL_CLEAR(ssc, i+1);
cdd87c1d
KW
1912 }
1913 }
1914 }
1aa99e6b 1915 }
cdd87c1d
KW
1916
1917 ssc_union(ssc,
1918 ored_cp_list,
1919 FALSE /* Already has been inverted */
1920 );
653099ff
GS
1921}
1922
b423522f
KW
1923PERL_STATIC_INLINE void
1924S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1925{
1926 PERL_ARGS_ASSERT_SSC_UNION;
1927
71068078 1928 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1929
1930 _invlist_union_maybe_complement_2nd(ssc->invlist,
1931 invlist,
1932 invert2nd,
1933 &ssc->invlist);
1934}
1935
1936PERL_STATIC_INLINE void
1937S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1938 SV* const invlist,
1939 const bool invert2nd)
1940{
1941 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1942
71068078 1943 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1944
1945 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1946 invlist,
1947 invert2nd,
1948 &ssc->invlist);
1949}
1950
1951PERL_STATIC_INLINE void
1952S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1953{
1954 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1955
71068078 1956 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1957
1958 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1959}
1960
1961PERL_STATIC_INLINE void
1962S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1963{
1964 /* AND just the single code point 'cp' into the SSC 'ssc' */
1965
1966 SV* cp_list = _new_invlist(2);
1967
1968 PERL_ARGS_ASSERT_SSC_CP_AND;
1969
71068078 1970 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1971
1972 cp_list = add_cp_to_invlist(cp_list, cp);
1973 ssc_intersection(ssc, cp_list,
1974 FALSE /* Not inverted */
1975 );
1976 SvREFCNT_dec_NN(cp_list);
1977}
1978
1979PERL_STATIC_INLINE void
dc3bf405 1980S_ssc_clear_locale(regnode_ssc *ssc)
b423522f
KW
1981{
1982 /* Set the SSC 'ssc' to not match any locale things */
b423522f
KW
1983 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1984
71068078 1985 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1986
1987 ANYOF_POSIXL_ZERO(ssc);
1988 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1989}
1990
b35552de
KW
1991#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1992
1993STATIC bool
1994S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1995{
1996 /* The synthetic start class is used to hopefully quickly winnow down
1997 * places where a pattern could start a match in the target string. If it
1998 * doesn't really narrow things down that much, there isn't much point to
1999 * having the overhead of using it. This function uses some very crude
2000 * heuristics to decide if to use the ssc or not.
2001 *
2002 * It returns TRUE if 'ssc' rules out more than half what it considers to
2003 * be the "likely" possible matches, but of course it doesn't know what the
2004 * actual things being matched are going to be; these are only guesses
2005 *
2006 * For /l matches, it assumes that the only likely matches are going to be
2007 * in the 0-255 range, uniformly distributed, so half of that is 127
2008 * For /a and /d matches, it assumes that the likely matches will be just
2009 * the ASCII range, so half of that is 63
2010 * For /u and there isn't anything matching above the Latin1 range, it
2011 * assumes that that is the only range likely to be matched, and uses
2012 * half that as the cut-off: 127. If anything matches above Latin1,
2013 * it assumes that all of Unicode could match (uniformly), except for
2014 * non-Unicode code points and things in the General Category "Other"
2015 * (unassigned, private use, surrogates, controls and formats). This
2016 * is a much large number. */
2017
b35552de
KW
2018 U32 count = 0; /* Running total of number of code points matched by
2019 'ssc' */
2020 UV start, end; /* Start and end points of current range in inversion
2021 list */
72400949
KW
2022 const U32 max_code_points = (LOC)
2023 ? 256
2024 : (( ! UNI_SEMANTICS
2025 || invlist_highest(ssc->invlist) < 256)
2026 ? 128
2027 : NON_OTHER_COUNT);
2028 const U32 max_match = max_code_points / 2;
b35552de
KW
2029
2030 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2031
2032 invlist_iterinit(ssc->invlist);
2033 while (invlist_iternext(ssc->invlist, &start, &end)) {
72400949
KW
2034 if (start >= max_code_points) {
2035 break;
b35552de 2036 }
72400949 2037 end = MIN(end, max_code_points - 1);
b35552de 2038 count += end - start + 1;
72400949 2039 if (count >= max_match) {
b35552de
KW
2040 invlist_iterfinish(ssc->invlist);
2041 return FALSE;
2042 }
2043 }
2044
2045 return TRUE;
2046}
2047
2048
b423522f
KW
2049STATIC void
2050S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2051{
2052 /* The inversion list in the SSC is marked mortal; now we need a more
2053 * permanent copy, which is stored the same way that is done in a regular
dcb20b36
KW
2054 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2055 * map */
b423522f 2056
28118b9c 2057 SV* invlist = invlist_clone(ssc->invlist, NULL);
b423522f
KW
2058
2059 PERL_ARGS_ASSERT_SSC_FINALIZE;
2060
71068078 2061 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f 2062
a0dd4231 2063 /* The code in this file assumes that all but these flags aren't relevant
93e92956
KW
2064 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2065 * by the time we reach here */
f240c685
KW
2066 assert(! (ANYOF_FLAGS(ssc)
2067 & ~( ANYOF_COMMON_FLAGS
108316fb
KW
2068 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2069 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
a0dd4231 2070
b423522f
KW
2071 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2072
1ee208c4
KW
2073 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
2074 NULL, NULL, NULL, FALSE);
b423522f 2075
85c8e306
KW
2076 /* Make sure is clone-safe */
2077 ssc->invlist = NULL;
2078
e0e1be5f 2079 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
93e92956 2080 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
d156f5cb 2081 OP(ssc) = ANYOFPOSIXL;
e0e1be5f 2082 }
d156f5cb 2083 else if (RExC_contains_locale) {
b2e90ddf
KW
2084 OP(ssc) = ANYOFL;
2085 }
2086
1462525b 2087 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
b423522f
KW
2088}
2089
a3621e74
YO
2090#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2091#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2092#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
538e84ed
KW
2093#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2094 ? (TRIE_LIST_CUR( idx ) - 1) \
2095 : 0 )
a3621e74 2096
3dab1dad
YO
2097
2098#ifdef DEBUGGING
07be1b83 2099/*
2b8b4781
NC
2100 dump_trie(trie,widecharmap,revcharmap)
2101 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2102 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
2103
2104 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
2105 The _interim_ variants are used for debugging the interim
2106 tables that are used to generate the final compressed
2107 representation which is what dump_trie expects.
2108
486ec47a 2109 Part of the reason for their existence is to provide a form
3dab1dad 2110 of documentation as to how the different representations function.
07be1b83
YO
2111
2112*/
3dab1dad
YO
2113
2114/*
3dab1dad
YO
2115 Dumps the final compressed table form of the trie to Perl_debug_log.
2116 Used for debugging make_trie().
2117*/
b9a59e08 2118
3dab1dad 2119STATIC void
2b8b4781
NC
2120S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2121 AV *revcharmap, U32 depth)
3dab1dad
YO
2122{
2123 U32 state;
ab3bbdeb 2124 SV *sv=sv_newmortal();
55eed653 2125 int colwidth= widecharmap ? 6 : 4;
2e64971a 2126 U16 word;
3dab1dad
YO
2127 GET_RE_DEBUG_FLAGS_DECL;
2128
7918f24d 2129 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 2130
6ad9a8ab 2131 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
1e37780e 2132 depth+1, "Match","Base","Ofs" );
3dab1dad
YO
2133
2134 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 2135 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 2136 if ( tmp ) {
6ad9a8ab 2137 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2138 colwidth,
538e84ed 2139 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2140 PL_colors[0], PL_colors[1],
2141 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2142 PERL_PV_ESCAPE_FIRSTCHAR
2143 )
ab3bbdeb 2144 );
3dab1dad
YO
2145 }
2146 }
1e37780e
YO
2147 Perl_re_printf( aTHX_ "\n");
2148 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
3dab1dad
YO
2149
2150 for( state = 0 ; state < trie->uniquecharcount ; state++ )
6ad9a8ab
YO
2151 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2152 Perl_re_printf( aTHX_ "\n");
3dab1dad 2153
1e2e3d02 2154 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 2155 const U32 base = trie->states[ state ].trans.base;
3dab1dad 2156
147e3846 2157 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
3dab1dad
YO
2158
2159 if ( trie->states[ state ].wordnum ) {
1e37780e 2160 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
3dab1dad 2161 } else {
6ad9a8ab 2162 Perl_re_printf( aTHX_ "%6s", "" );
3dab1dad
YO
2163 }
2164
147e3846 2165 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
3dab1dad
YO
2166
2167 if ( base ) {
2168 U32 ofs = 0;
2169
2170 while( ( base + ofs < trie->uniquecharcount ) ||
2171 ( base + ofs - trie->uniquecharcount < trie->lasttrans
538e84ed
KW
2172 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2173 != state))
3dab1dad
YO
2174 ofs++;
2175
147e3846 2176 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
3dab1dad
YO
2177
2178 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
538e84ed
KW
2179 if ( ( base + ofs >= trie->uniquecharcount )
2180 && ( base + ofs - trie->uniquecharcount
2181 < trie->lasttrans )
2182 && trie->trans[ base + ofs
2183 - trie->uniquecharcount ].check == state )
3dab1dad 2184 {
147e3846 2185 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
1e37780e
YO
2186 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2187 );
3dab1dad 2188 } else {
88f063b4 2189 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
3dab1dad
YO
2190 }
2191 }
2192
6ad9a8ab 2193 Perl_re_printf( aTHX_ "]");
3dab1dad
YO
2194
2195 }
6ad9a8ab 2196 Perl_re_printf( aTHX_ "\n" );
3dab1dad 2197 }
6ad9a8ab 2198 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
cb41e5d6 2199 depth);
2e64971a 2200 for (word=1; word <= trie->wordcount; word++) {
6ad9a8ab 2201 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2e64971a
DM
2202 (int)word, (int)(trie->wordinfo[word].prev),
2203 (int)(trie->wordinfo[word].len));
2204 }
6ad9a8ab 2205 Perl_re_printf( aTHX_ "\n" );
538e84ed 2206}
3dab1dad 2207/*
3dab1dad 2208 Dumps a fully constructed but uncompressed trie in list form.
538e84ed 2209 List tries normally only are used for construction when the number of
3dab1dad
YO
2210 possible chars (trie->uniquecharcount) is very high.
2211 Used for debugging make_trie().
2212*/
2213STATIC void
55eed653 2214S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2215 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2216 U32 depth)
3dab1dad
YO
2217{
2218 U32 state;
ab3bbdeb 2219 SV *sv=sv_newmortal();
55eed653 2220 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2221 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2222
2223 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2224
3dab1dad 2225 /* print out the table precompression. */
6ad9a8ab 2226 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
cb41e5d6 2227 depth+1 );
6ad9a8ab 2228 Perl_re_indentf( aTHX_ "%s",
cb41e5d6 2229 depth+1, "------:-----+-----------------\n" );
538e84ed 2230
3dab1dad
YO
2231 for( state=1 ; state < next_alloc ; state ++ ) {
2232 U16 charid;
538e84ed 2233
147e3846 2234 Perl_re_indentf( aTHX_ " %4" UVXf " :",
cb41e5d6 2235 depth+1, (UV)state );
3dab1dad 2236 if ( ! trie->states[ state ].wordnum ) {
6ad9a8ab 2237 Perl_re_printf( aTHX_ "%5s| ","");
3dab1dad 2238 } else {
6ad9a8ab 2239 Perl_re_printf( aTHX_ "W%4x| ",
3dab1dad
YO
2240 trie->states[ state ].wordnum
2241 );
2242 }
2243 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
538e84ed 2244 SV ** const tmp = av_fetch( revcharmap,
88f063b4 2245 TRIE_LIST_ITEM(state, charid).forid, 0);
ab3bbdeb 2246 if ( tmp ) {
147e3846 2247 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
ab3bbdeb 2248 colwidth,
538e84ed
KW
2249 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2250 colwidth,
2251 PL_colors[0], PL_colors[1],
2252 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2253 | PERL_PV_ESCAPE_FIRSTCHAR
ab3bbdeb 2254 ) ,
88f063b4
KW
2255 TRIE_LIST_ITEM(state, charid).forid,
2256 (UV)TRIE_LIST_ITEM(state, charid).newstate
1e2e3d02 2257 );
538e84ed 2258 if (!(charid % 10))
6ad9a8ab 2259 Perl_re_printf( aTHX_ "\n%*s| ",
664e119d 2260 (int)((depth * 2) + 14), "");
1e2e3d02 2261 }
ab3bbdeb 2262 }
6ad9a8ab 2263 Perl_re_printf( aTHX_ "\n");
3dab1dad 2264 }
538e84ed 2265}
3dab1dad
YO
2266
2267/*
3dab1dad 2268 Dumps a fully constructed but uncompressed trie in table form.
538e84ed
KW
2269 This is the normal DFA style state transition table, with a few
2270 twists to facilitate compression later.
3dab1dad
YO
2271 Used for debugging make_trie().
2272*/
2273STATIC void
55eed653 2274S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2275 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2276 U32 depth)
3dab1dad
YO
2277{
2278 U32 state;
2279 U16 charid;
ab3bbdeb 2280 SV *sv=sv_newmortal();
55eed653 2281 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2282 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2283
2284 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
538e84ed 2285
3dab1dad
YO
2286 /*
2287 print out the table precompression so that we can do a visual check
2288 that they are identical.
2289 */
538e84ed 2290
6ad9a8ab 2291 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
3dab1dad
YO
2292
2293 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 2294 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 2295 if ( tmp ) {
6ad9a8ab 2296 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2297 colwidth,
538e84ed 2298 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2299 PL_colors[0], PL_colors[1],
2300 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2301 PERL_PV_ESCAPE_FIRSTCHAR
2302 )
ab3bbdeb 2303 );
3dab1dad
YO
2304 }
2305 }
2306
4aaafc03
YO
2307 Perl_re_printf( aTHX_ "\n");
2308 Perl_re_indentf( aTHX_ "State+-", depth+1 );
3dab1dad
YO
2309
2310 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
6ad9a8ab 2311 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
3dab1dad
YO
2312 }
2313
6ad9a8ab 2314 Perl_re_printf( aTHX_ "\n" );
3dab1dad
YO
2315
2316 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2317
147e3846 2318 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
cb41e5d6 2319 depth+1,
3dab1dad
YO
2320 (UV)TRIE_NODENUM( state ) );
2321
2322 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
2323 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2324 if (v)
147e3846 2325 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
ab3bbdeb 2326 else
6ad9a8ab 2327 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
3dab1dad
YO
2328 }
2329 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
147e3846 2330 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
538e84ed 2331 (UV)trie->trans[ state ].check );
3dab1dad 2332 } else {
147e3846 2333 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
538e84ed 2334 (UV)trie->trans[ state ].check,
3dab1dad
YO
2335 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2336 }
2337 }
07be1b83 2338}
3dab1dad
YO
2339
2340#endif
2341
2e64971a 2342
786e8c11
YO
2343/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2344 startbranch: the first branch in the whole branch sequence
2345 first : start branch of sequence of branch-exact nodes.
2346 May be the same as startbranch
2347 last : Thing following the last branch.
2348 May be the same as tail.
2349 tail : item following the branch sequence
2350 count : words in the sequence
a4525e78 2351 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
786e8c11 2352 depth : indent depth
3dab1dad 2353
786e8c11 2354Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 2355
786e8c11
YO
2356A trie is an N'ary tree where the branches are determined by digital
2357decomposition of the key. IE, at the root node you look up the 1st character and
2358follow that branch repeat until you find the end of the branches. Nodes can be
2359marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 2360
786e8c11 2361 /he|she|his|hers/
72f13be8 2362
786e8c11
YO
2363would convert into the following structure. Numbers represent states, letters
2364following numbers represent valid transitions on the letter from that state, if
2365the number is in square brackets it represents an accepting state, otherwise it
2366will be in parenthesis.
07be1b83 2367
786e8c11
YO
2368 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2369 | |
2370 | (2)
2371 | |
2372 (1) +-i->(6)-+-s->[7]
2373 |
2374 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 2375
786e8c11
YO
2376 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2377
2378This shows that when matching against the string 'hers' we will begin at state 1
2379read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2380then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2381is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2382single traverse. We store a mapping from accepting to state to which word was
2383matched, and then when we have multiple possibilities we try to complete the
b8fda935 2384rest of the regex in the order in which they occurred in the alternation.
786e8c11
YO
2385
2386The only prior NFA like behaviour that would be changed by the TRIE support is
2387the silent ignoring of duplicate alternations which are of the form:
2388
2389 / (DUPE|DUPE) X? (?{ ... }) Y /x
2390
4b714af6 2391Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 2392and without the optimisation. With the optimisations dupes will be silently
486ec47a 2393ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
2394the following demonstrates:
2395
2396 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2397
2398which prints out 'word' three times, but
2399
2400 'words'=~/(word|word|word)(?{ print $1 })S/
2401
2402which doesnt print it out at all. This is due to other optimisations kicking in.
2403
2404Example of what happens on a structural level:
2405
486ec47a 2406The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
2407
2408 1: CURLYM[1] {1,32767}(18)
2409 5: BRANCH(8)
2410 6: EXACT <ac>(16)
2411 8: BRANCH(11)
2412 9: EXACT <ad>(16)
2413 11: BRANCH(14)
2414 12: EXACT <ab>(16)
2415 16: SUCCEED(0)
2416 17: NOTHING(18)
2417 18: END(0)
2418
2419This would be optimizable with startbranch=5, first=5, last=16, tail=16
2420and should turn into:
2421
2422 1: CURLYM[1] {1,32767}(18)
2423 5: TRIE(16)
2424 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2425 <ac>
2426 <ad>
2427 <ab>
2428 16: SUCCEED(0)
2429 17: NOTHING(18)
2430 18: END(0)
2431
2432Cases where tail != last would be like /(?foo|bar)baz/:
2433
2434 1: BRANCH(4)
2435 2: EXACT <foo>(8)
2436 4: BRANCH(7)
2437 5: EXACT <bar>(8)
2438 7: TAIL(8)
2439 8: EXACT <baz>(10)
2440 10: END(0)
2441
2442which would be optimizable with startbranch=1, first=1, last=7, tail=8
2443and would end up looking like:
2444
2445 1: TRIE(8)
2446 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2447 <foo>
2448 <bar>
2449 7: TAIL(8)
2450 8: EXACT <baz>(10)
2451 10: END(0)
2452
c80e42f3 2453 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
2454
2455is the recommended Unicode-aware way of saying
2456
2457 *(d++) = uv;
2458*/
2459
fab2782b 2460#define TRIE_STORE_REVCHAR(val) \
786e8c11 2461 STMT_START { \
73031816 2462 if (UTF) { \
668fcfea 2463 SV *zlopp = newSV(UTF8_MAXBYTES); \
88c9ea1e 2464 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
c80e42f3 2465 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
73031816
NC
2466 SvCUR_set(zlopp, kapow - flrbbbbb); \
2467 SvPOK_on(zlopp); \
2468 SvUTF8_on(zlopp); \
2469 av_push(revcharmap, zlopp); \
2470 } else { \
fab2782b 2471 char ooooff = (char)val; \
73031816
NC
2472 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2473 } \
2474 } STMT_END
786e8c11 2475
914a25d5
KW
2476/* This gets the next character from the input, folding it if not already
2477 * folded. */
2478#define TRIE_READ_CHAR STMT_START { \
2479 wordlen++; \
2480 if ( UTF ) { \
2481 /* if it is UTF then it is either already folded, or does not need \
2482 * folding */ \
1c1d615a 2483 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
2484 } \
2485 else if (folder == PL_fold_latin1) { \
7d006b13
KW
2486 /* This folder implies Unicode rules, which in the range expressible \
2487 * by not UTF is the lower case, with the two exceptions, one of \
2488 * which should have been taken care of before calling this */ \
2489 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2490 uvc = toLOWER_L1(*uc); \
2491 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2492 len = 1; \
914a25d5
KW
2493 } else { \
2494 /* raw data, will be folded later if needed */ \
2495 uvc = (U32)*uc; \
2496 len = 1; \
2497 } \
786e8c11
YO
2498} STMT_END
2499
2500
2501
2502#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2503 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
00195859 2504 U32 ging = TRIE_LIST_LEN( state ) * 2; \
f9003953 2505 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
00195859 2506 TRIE_LIST_LEN( state ) = ging; \
786e8c11
YO
2507 } \
2508 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2509 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2510 TRIE_LIST_CUR( state )++; \
2511} STMT_END
07be1b83 2512
786e8c11 2513#define TRIE_LIST_NEW(state) STMT_START { \
d09f14bf 2514 Newx( trie->states[ state ].trans.list, \
786e8c11
YO
2515 4, reg_trie_trans_le ); \
2516 TRIE_LIST_CUR( state ) = 1; \
2517 TRIE_LIST_LEN( state ) = 4; \
2518} STMT_END
07be1b83 2519
786e8c11
YO
2520#define TRIE_HANDLE_WORD(state) STMT_START { \
2521 U16 dupe= trie->states[ state ].wordnum; \
2522 regnode * const noper_next = regnext( noper ); \
2523 \
786e8c11
YO
2524 DEBUG_r({ \
2525 /* store the word for dumping */ \
2526 SV* tmp; \
2527 if (OP(noper) != NOTHING) \
740cce10 2528 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 2529 else \
740cce10 2530 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 2531 av_push( trie_words, tmp ); \
786e8c11
YO
2532 }); \
2533 \
2534 curword++; \
2e64971a
DM
2535 trie->wordinfo[curword].prev = 0; \
2536 trie->wordinfo[curword].len = wordlen; \
2537 trie->wordinfo[curword].accept = state; \
786e8c11
YO
2538 \
2539 if ( noper_next < tail ) { \
2540 if (!trie->jump) \
538e84ed
KW
2541 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2542 sizeof(U16) ); \
7f69552c 2543 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
2544 if (!jumper) \
2545 jumper = noper_next; \
2546 if (!nextbranch) \
2547 nextbranch= regnext(cur); \
2548 } \
2549 \
2550 if ( dupe ) { \
2e64971a
DM
2551 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2552 /* chain, so that when the bits of chain are later */\
2553 /* linked together, the dups appear in the chain */\
2554 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2555 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
2556 } else { \
2557 /* we haven't inserted this word yet. */ \
2558 trie->states[ state ].wordnum = curword; \
2559 } \
2560} STMT_END
07be1b83 2561
3dab1dad 2562
786e8c11
YO
2563#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2564 ( ( base + charid >= ucharcount \
2565 && base + charid < ubound \
2566 && state == trie->trans[ base - ucharcount + charid ].check \
2567 && trie->trans[ base - ucharcount + charid ].next ) \
2568 ? trie->trans[ base - ucharcount + charid ].next \
2569 : ( state==1 ? special : 0 ) \
2570 )
3dab1dad 2571
8bcafbf4
YO
2572#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2573STMT_START { \
2574 TRIE_BITMAP_SET(trie, uvc); \
2575 /* store the folded codepoint */ \
2576 if ( folder ) \
2577 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2578 \
2579 if ( !UTF ) { \
2580 /* store first byte of utf8 representation of */ \
2581 /* variant codepoints */ \
2582 if (! UVCHR_IS_INVARIANT(uvc)) { \
2583 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2584 } \
2585 } \
2586} STMT_END
786e8c11
YO
2587#define MADE_TRIE 1
2588#define MADE_JUMP_TRIE 2
2589#define MADE_EXACT_TRIE 4
3dab1dad 2590
a3621e74 2591STATIC I32
538e84ed
KW
2592S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2593 regnode *first, regnode *last, regnode *tail,
2594 U32 word_count, U32 flags, U32 depth)
a3621e74
YO
2595{
2596 /* first pass, loop through and scan words */
2597 reg_trie_data *trie;
55eed653 2598 HV *widecharmap = NULL;
2b8b4781 2599 AV *revcharmap = newAV();
a3621e74 2600 regnode *cur;
a3621e74
YO
2601 STRLEN len = 0;
2602 UV uvc = 0;
2603 U16 curword = 0;
2604 U32 next_alloc = 0;
786e8c11
YO
2605 regnode *jumper = NULL;
2606 regnode *nextbranch = NULL;
7f69552c 2607 regnode *convert = NULL;
2e64971a 2608 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 2609 /* we just use folder as a flag in utf8 */
1e696034 2610 const U8 * folder = NULL;
a3621e74 2611
3a611511
YO
2612 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2613 * which stands for one trie structure, one hash, optionally followed
2614 * by two arrays */
2b8b4781 2615#ifdef DEBUGGING
3a611511 2616 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2b8b4781
NC
2617 AV *trie_words = NULL;
2618 /* along with revcharmap, this only used during construction but both are
2619 * useful during debugging so we store them in the struct when debugging.
8e11feef 2620 */
2b8b4781 2621#else
cf78de0b 2622 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 2623 STRLEN trie_charcount=0;
3dab1dad 2624#endif
2b8b4781 2625 SV *re_trie_maxbuff;
a3621e74 2626 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2627
2628 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
2629#ifndef DEBUGGING
2630 PERL_UNUSED_ARG(depth);
2631#endif
a3621e74 2632
1e696034 2633 switch (flags) {
a4525e78 2634 case EXACT: case EXACTL: break;
89829bb5 2635 case EXACTFAA:
fab2782b 2636 case EXACTFU_SS:
a4525e78
KW
2637 case EXACTFU:
2638 case EXACTFLU8: folder = PL_fold_latin1; break;
1e696034 2639 case EXACTF: folder = PL_fold; break;
fab2782b 2640 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
2641 }
2642
c944940b 2643 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 2644 trie->refcount = 1;
3dab1dad 2645 trie->startstate = 1;
786e8c11 2646 trie->wordcount = word_count;
f8fc2ecf 2647 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 2648 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
a4525e78 2649 if (flags == EXACT || flags == EXACTL)
c944940b 2650 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
2651 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2652 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2653
a3621e74 2654 DEBUG_r({
2b8b4781 2655 trie_words = newAV();
a3621e74 2656 });
a3621e74 2657
0111c4fd 2658 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
316ebaf2 2659 assert(re_trie_maxbuff);
a3621e74 2660 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 2661 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 2662 }
df826430 2663 DEBUG_TRIE_COMPILE_r({
6ad9a8ab 2664 Perl_re_indentf( aTHX_
cb41e5d6
YO
2665 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2666 depth+1,
88f063b4 2667 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
538e84ed 2668 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
3dab1dad 2669 });
538e84ed 2670
7f69552c
YO
2671 /* Find the node we are going to overwrite */
2672 if ( first == startbranch && OP( last ) != BRANCH ) {
2673 /* whole branch chain */
2674 convert = first;
2675 } else {
2676 /* branch sub-chain */
2677 convert = NEXTOPER( first );
2678 }
538e84ed 2679
a3621e74
YO
2680 /* -- First loop and Setup --
2681
2682 We first traverse the branches and scan each word to determine if it
2683 contains widechars, and how many unique chars there are, this is
2684 important as we have to build a table with at least as many columns as we
2685 have unique chars.
2686
2687 We use an array of integers to represent the character codes 0..255
538e84ed
KW
2688 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2689 the native representation of the character value as the key and IV's for
2690 the coded index.
a3621e74
YO
2691
2692 *TODO* If we keep track of how many times each character is used we can
2693 remap the columns so that the table compression later on is more
3b753521 2694 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
2695 middle and the least common are on the outside. IMO this would be better
2696 than a most to least common mapping as theres a decent chance the most
2697 common letter will share a node with the least common, meaning the node
486ec47a 2698 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
2699 case is when we have the least common nodes twice.
2700
2701 */
2702
a3621e74 2703 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 2704 regnode *noper = NEXTOPER( cur );
944e05e3
YO
2705 const U8 *uc;
2706 const U8 *e;
bc031a7d 2707 int foldlen = 0;
07be1b83 2708 U32 wordlen = 0; /* required init */
bc031a7d
KW
2709 STRLEN minchars = 0;
2710 STRLEN maxchars = 0;
538e84ed
KW
2711 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2712 bitmap?*/
a3621e74 2713
3dab1dad 2714 if (OP(noper) == NOTHING) {
20ed8c88
YO
2715 /* skip past a NOTHING at the start of an alternation
2716 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2717 */
df826430 2718 regnode *noper_next= regnext(noper);
944e05e3
YO
2719 if (noper_next < tail)
2720 noper= noper_next;
2721 }
2722
dca5fc4c
YO
2723 if ( noper < tail &&
2724 (
2725 OP(noper) == flags ||
2726 (
2727 flags == EXACTFU &&
2728 OP(noper) == EXACTFU_SS
2729 )
2730 )
2731 ) {
944e05e3
YO
2732 uc= (U8*)STRING(noper);
2733 e= uc + STR_LEN(noper);
2734 } else {
2735 trie->minlen= 0;
2736 continue;
3dab1dad 2737 }
df826430 2738
944e05e3 2739
fab2782b 2740 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2741 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2742 regardless of encoding */
fab2782b
YO
2743 if (OP( noper ) == EXACTFU_SS) {
2744 /* false positives are ok, so just set this */
0dc4a61d 2745 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2746 }
2747 }
dca5fc4c 2748
bc031a7d
KW
2749 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2750 branch */
3dab1dad 2751 TRIE_CHARCOUNT(trie)++;
a3621e74 2752 TRIE_READ_CHAR;
645de4ce 2753
bc031a7d
KW
2754 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2755 * is in effect. Under /i, this character can match itself, or
2756 * anything that folds to it. If not under /i, it can match just
2757 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2758 * all fold to k, and all are single characters. But some folds
2759 * expand to more than one character, so for example LATIN SMALL
2760 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2761 * the string beginning at 'uc' is 'ffi', it could be matched by
2762 * three characters, or just by the one ligature character. (It
2763 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2764 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2765 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2766 * match.) The trie needs to know the minimum and maximum number
2767 * of characters that could match so that it can use size alone to
2768 * quickly reject many match attempts. The max is simple: it is
2769 * the number of folded characters in this branch (since a fold is
2770 * never shorter than what folds to it. */
2771
2772 maxchars++;
2773
2774 /* And the min is equal to the max if not under /i (indicated by
2775 * 'folder' being NULL), or there are no multi-character folds. If
2776 * there is a multi-character fold, the min is incremented just
2777 * once, for the character that folds to the sequence. Each
2778 * character in the sequence needs to be added to the list below of
2779 * characters in the trie, but we count only the first towards the
2780 * min number of characters needed. This is done through the
2781 * variable 'foldlen', which is returned by the macros that look
2782 * for these sequences as the number of bytes the sequence
2783 * occupies. Each time through the loop, we decrement 'foldlen' by
2784 * how many bytes the current char occupies. Only when it reaches
2785 * 0 do we increment 'minchars' or look for another multi-character
2786 * sequence. */
2787 if (folder == NULL) {
2788 minchars++;
2789 }
2790 else if (foldlen > 0) {
2791 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
645de4ce
KW
2792 }
2793 else {
bc031a7d
KW
2794 minchars++;
2795
2796 /* See if *uc is the beginning of a multi-character fold. If
2797 * so, we decrement the length remaining to look at, to account
2798 * for the current character this iteration. (We can use 'uc'
2799 * instead of the fold returned by TRIE_READ_CHAR because for
2800 * non-UTF, the latin1_safe macro is smart enough to account
2801 * for all the unfolded characters, and because for UTF, the
2802 * string will already have been folded earlier in the
2803 * compilation process */
2804 if (UTF) {
2805 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2806 foldlen -= UTF8SKIP(uc);
645de4ce
KW
2807 }
2808 }
bc031a7d
KW
2809 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2810 foldlen--;
2811 }
645de4ce 2812 }
bc031a7d
KW
2813
2814 /* The current character (and any potential folds) should be added
2815 * to the possible matching characters for this position in this
2816 * branch */
a3621e74 2817 if ( uvc < 256 ) {
fab2782b
YO
2818 if ( folder ) {
2819 U8 folded= folder[ (U8) uvc ];
2820 if ( !trie->charmap[ folded ] ) {
2821 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2822 TRIE_STORE_REVCHAR( folded );
2823 }
2824 }
a3621e74
YO
2825 if ( !trie->charmap[ uvc ] ) {
2826 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2827 TRIE_STORE_REVCHAR( uvc );
a3621e74 2828 }
02daf0ab 2829 if ( set_bit ) {
62012aee
KW
2830 /* store the codepoint in the bitmap, and its folded
2831 * equivalent. */
8bcafbf4 2832 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
02daf0ab
YO
2833 set_bit = 0; /* We've done our bit :-) */
2834 }
a3621e74 2835 } else {
bc031a7d
KW
2836
2837 /* XXX We could come up with the list of code points that fold
2838 * to this using PL_utf8_foldclosures, except not for
2839 * multi-char folds, as there may be multiple combinations
2840 * there that could work, which needs to wait until runtime to
2841 * resolve (The comment about LIGATURE FFI above is such an
2842 * example */
2843
a3621e74 2844 SV** svpp;
55eed653
NC
2845 if ( !widecharmap )
2846 widecharmap = newHV();
a3621e74 2847
55eed653 2848 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
2849
2850 if ( !svpp )
147e3846 2851 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
a3621e74
YO
2852
2853 if ( !SvTRUE( *svpp ) ) {
2854 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 2855 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
2856 }
2857 }
bc031a7d
KW
2858 } /* end loop through characters in this branch of the trie */
2859
2860 /* We take the min and max for this branch and combine to find the min
2861 * and max for all branches processed so far */
3dab1dad 2862 if( cur == first ) {
bc031a7d
KW
2863 trie->minlen = minchars;
2864 trie->maxlen = maxchars;
2865 } else if (minchars < trie->minlen) {
2866 trie->minlen = minchars;
2867 } else if (maxchars > trie->maxlen) {
2868 trie->maxlen = maxchars;
fab2782b 2869 }
a3621e74
YO
2870 } /* end first pass */
2871 DEBUG_TRIE_COMPILE_r(
6ad9a8ab 2872 Perl_re_indentf( aTHX_
cb41e5d6
YO
2873 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2874 depth+1,
55eed653 2875 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
2876 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2877 (int)trie->minlen, (int)trie->maxlen )
a3621e74 2878 );
a3621e74
YO
2879
2880 /*
2881 We now know what we are dealing with in terms of unique chars and
2882 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
2883 representation using a flat table will take. If it's over a reasonable
2884 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
2885 conservative but potentially much slower representation using an array
2886 of lists.
2887
2888 At the end we convert both representations into the same compressed
2889 form that will be used in regexec.c for matching with. The latter
2890 is a form that cannot be used to construct with but has memory
2891 properties similar to the list form and access properties similar
2892 to the table form making it both suitable for fast searches and
2893 small enough that its feasable to store for the duration of a program.
2894
2895 See the comment in the code where the compressed table is produced
2896 inplace from the flat tabe representation for an explanation of how
2897 the compression works.
2898
2899 */
2900
2901
2e64971a
DM
2902 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2903 prev_states[1] = 0;
2904
538e84ed
KW
2905 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2906 > SvIV(re_trie_maxbuff) )
2907 {
a3621e74
YO
2908 /*
2909 Second Pass -- Array Of Lists Representation
2910
2911 Each state will be represented by a list of charid:state records
2912 (reg_trie_trans_le) the first such element holds the CUR and LEN
2913 points of the allocated array. (See defines above).
2914
2915 We build the initial structure using the lists, and then convert
2916 it into the compressed table form which allows faster lookups
2917 (but cant be modified once converted).
a3621e74
YO
2918 */
2919
a3621e74
YO
2920 STRLEN transcount = 1;
2921
6ad9a8ab 2922 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
cb41e5d6 2923 depth+1));
686b73d4 2924
c944940b
JH
2925 trie->states = (reg_trie_state *)
2926 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2927 sizeof(reg_trie_state) );
a3621e74
YO
2928 TRIE_LIST_NEW(1);
2929 next_alloc = 2;
2930
2931 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2932
df826430 2933 regnode *noper = NEXTOPER( cur );
c445ea15
AL
2934 U32 state = 1; /* required init */
2935 U16 charid = 0; /* sanity init */
07be1b83 2936 U32 wordlen = 0; /* required init */
c445ea15 2937
df826430
YO
2938 if (OP(noper) == NOTHING) {
2939 regnode *noper_next= regnext(noper);
944e05e3
YO
2940 if (noper_next < tail)
2941 noper= noper_next;
df826430
YO
2942 }
2943
944e05e3
YO
2944 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
2945 const U8 *uc= (U8*)STRING(noper);
2946 const U8 *e= uc + STR_LEN(noper);
2947
786e8c11 2948 for ( ; uc < e ; uc += len ) {
c445ea15 2949
786e8c11 2950 TRIE_READ_CHAR;
c445ea15 2951
786e8c11
YO
2952 if ( uvc < 256 ) {
2953 charid = trie->charmap[ uvc ];
c445ea15 2954 } else {
538e84ed
KW
2955 SV** const svpp = hv_fetch( widecharmap,
2956 (char*)&uvc,
2957 sizeof( UV ),
2958 0);
786e8c11
YO
2959 if ( !svpp ) {
2960 charid = 0;
2961 } else {
2962 charid=(U16)SvIV( *svpp );
2963 }
c445ea15 2964 }
538e84ed
KW
2965 /* charid is now 0 if we dont know the char read, or
2966 * nonzero if we do */
786e8c11 2967 if ( charid ) {
a3621e74 2968
786e8c11
YO
2969 U16 check;
2970 U32 newstate = 0;
a3621e74 2971
786e8c11
YO
2972 charid--;
2973 if ( !trie->states[ state ].trans.list ) {
2974 TRIE_LIST_NEW( state );
c445ea15 2975 }
538e84ed
KW
2976 for ( check = 1;
2977 check <= TRIE_LIST_USED( state );
2978 check++ )
2979 {
2980 if ( TRIE_LIST_ITEM( state, check ).forid
2981 == charid )
2982 {
786e8c11
YO
2983 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2984 break;
2985 }
2986 }
2987 if ( ! newstate ) {
2988 newstate = next_alloc++;
2e64971a 2989 prev_states[newstate] = state;
786e8c11
YO
2990 TRIE_LIST_PUSH( state, charid, newstate );
2991 transcount++;
2992 }
2993 state = newstate;
2994 } else {
147e3846 2995 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
c445ea15 2996 }
a28509cc 2997 }
c445ea15 2998 }
3dab1dad 2999 TRIE_HANDLE_WORD(state);
a3621e74
YO
3000
3001 } /* end second pass */
3002
1e2e3d02 3003 /* next alloc is the NEXT state to be allocated */
538e84ed 3004 trie->statecount = next_alloc;
c944940b
JH
3005 trie->states = (reg_trie_state *)
3006 PerlMemShared_realloc( trie->states,
3007 next_alloc
3008 * sizeof(reg_trie_state) );
a3621e74 3009
3dab1dad 3010 /* and now dump it out before we compress it */
2b8b4781
NC
3011 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3012 revcharmap, next_alloc,
3013 depth+1)
1e2e3d02 3014 );
a3621e74 3015
c944940b
JH
3016 trie->trans = (reg_trie_trans *)
3017 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
3018 {
3019 U32 state;
a3621e74
YO
3020 U32 tp = 0;
3021 U32 zp = 0;
3022
3023
3024 for( state=1 ; state < next_alloc ; state ++ ) {
3025 U32 base=0;
3026
3027 /*
3028 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3029 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
a3621e74
YO
3030 );
3031 */
3032
3033 if (trie->states[state].trans.list) {
3034 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3035 U16 maxid=minid;
a28509cc 3036 U16 idx;
a3621e74
YO
3037
3038 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
3039 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3040 if ( forid < minid ) {
3041 minid=forid;
3042 } else if ( forid > maxid ) {
3043 maxid=forid;
3044 }
a3621e74
YO
3045 }
3046 if ( transcount < tp + maxid - minid + 1) {
3047 transcount *= 2;
c944940b
JH
3048 trie->trans = (reg_trie_trans *)
3049 PerlMemShared_realloc( trie->trans,
446bd890
NC
3050 transcount
3051 * sizeof(reg_trie_trans) );
538e84ed
KW
3052 Zero( trie->trans + (transcount / 2),
3053 transcount / 2,
3054 reg_trie_trans );
a3621e74
YO
3055 }
3056 base = trie->uniquecharcount + tp - minid;
3057 if ( maxid == minid ) {
3058 U32 set = 0;
3059 for ( ; zp < tp ; zp++ ) {
3060 if ( ! trie->trans[ zp ].next ) {
3061 base = trie->uniquecharcount + zp - minid;
538e84ed
KW
3062 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3063 1).newstate;
a3621e74
YO
3064 trie->trans[ zp ].check = state;
3065 set = 1;
3066 break;
3067 }
3068 }
3069 if ( !set ) {
538e84ed
KW
3070 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3071 1).newstate;
a3621e74
YO
3072 trie->trans[ tp ].check = state;
3073 tp++;
3074 zp = tp;
3075 }
3076 } else {
3077 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
538e84ed
KW
3078 const U32 tid = base
3079 - trie->uniquecharcount
3080 + TRIE_LIST_ITEM( state, idx ).forid;
3081 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3082 idx ).newstate;
a3621e74
YO
3083 trie->trans[ tid ].check = state;
3084 }
3085 tp += ( maxid - minid + 1 );
3086 }
3087 Safefree(trie->states[ state ].trans.list);
3088 }
3089 /*
3090 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3091 Perl_re_printf( aTHX_ " base: %d\n",base);
a3621e74
YO
3092 );
3093 */
3094 trie->states[ state ].trans.base=base;
3095 }
cc601c31 3096 trie->lasttrans = tp + 1;
a3621e74
YO
3097 }
3098 } else {
3099 /*
3100 Second Pass -- Flat Table Representation.
3101
b423522f
KW
3102 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3103 each. We know that we will need Charcount+1 trans at most to store
3104 the data (one row per char at worst case) So we preallocate both
3105 structures assuming worst case.
a3621e74
YO
3106
3107 We then construct the trie using only the .next slots of the entry
3108 structs.
3109
b423522f
KW
3110 We use the .check field of the first entry of the node temporarily
3111 to make compression both faster and easier by keeping track of how
3112 many non zero fields are in the node.
a3621e74
YO
3113
3114 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3115 transition.
3116
b423522f
KW
3117 There are two terms at use here: state as a TRIE_NODEIDX() which is
3118 a number representing the first entry of the node, and state as a
3119 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3120 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3121 if there are 2 entrys per node. eg:
a3621e74
YO
3122
3123 A B A B
3124 1. 2 4 1. 3 7
3125 2. 0 3 3. 0 5
3126 3. 0 0 5. 0 0
3127 4. 0 0 7. 0 0
3128
b423522f
KW
3129 The table is internally in the right hand, idx form. However as we
3130 also have to deal with the states array which is indexed by nodenum
3131 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
3132
3133 */
6ad9a8ab 3134 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
cb41e5d6 3135 depth+1));
3dab1dad 3136
c944940b
JH
3137 trie->trans = (reg_trie_trans *)
3138 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3139 * trie->uniquecharcount + 1,
3140 sizeof(reg_trie_trans) );
3141 trie->states = (reg_trie_state *)
3142 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3143 sizeof(reg_trie_state) );
a3621e74
YO
3144 next_alloc = trie->uniquecharcount + 1;
3145
3dab1dad 3146
a3621e74
YO
3147 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3148
df826430 3149 regnode *noper = NEXTOPER( cur );
a3621e74
YO
3150
3151 U32 state = 1; /* required init */
3152
3153 U16 charid = 0; /* sanity init */
3154 U32 accept_state = 0; /* sanity init */
a3621e74 3155
07be1b83 3156 U32 wordlen = 0; /* required init */
a3621e74 3157
df826430
YO
3158 if (OP(noper) == NOTHING) {
3159 regnode *noper_next= regnext(noper);
944e05e3
YO
3160 if (noper_next < tail)
3161 noper= noper_next;
df826430 3162 }
fab2782b 3163
944e05e3
YO
3164 if ( noper < tail && ( OP(noper) == flags || ( flags == EXACTFU && OP(noper) == EXACTFU_SS ) ) ) {
3165 const U8 *uc= (U8*)STRING(noper);
3166 const U8 *e= uc + STR_LEN(noper);
3167
786e8c11 3168 for ( ; uc < e ; uc += len ) {
a3621e74 3169
786e8c11 3170 TRIE_READ_CHAR;
a3621e74 3171
786e8c11
YO
3172 if ( uvc < 256 ) {
3173 charid = trie->charmap[ uvc ];
3174 } else {
538e84ed
KW
3175 SV* const * const svpp = hv_fetch( widecharmap,
3176 (char*)&uvc,
3177 sizeof( UV ),
3178 0);
786e8c11 3179 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 3180 }
786e8c11
YO
3181 if ( charid ) {
3182 charid--;
3183 if ( !trie->trans[ state + charid ].next ) {
3184 trie->trans[ state + charid ].next = next_alloc;
3185 trie->trans[ state ].check++;
2e64971a
DM
3186 prev_states[TRIE_NODENUM(next_alloc)]
3187 = TRIE_NODENUM(state);
786e8c11
YO
3188 next_alloc += trie->uniquecharcount;
3189 }
3190 state = trie->trans[ state + charid ].next;
3191 } else {
147e3846 3192 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
786e8c11 3193 }
538e84ed
KW
3194 /* charid is now 0 if we dont know the char read, or
3195 * nonzero if we do */
a3621e74 3196 }
a3621e74 3197 }
3dab1dad
YO
3198 accept_state = TRIE_NODENUM( state );
3199 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
3200
3201 } /* end second pass */
3202
3dab1dad 3203 /* and now dump it out before we compress it */
2b8b4781
NC
3204 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3205 revcharmap,
3206 next_alloc, depth+1));
a3621e74 3207
a3621e74
YO
3208 {
3209 /*
3210 * Inplace compress the table.*
3211
3212 For sparse data sets the table constructed by the trie algorithm will
3213 be mostly 0/FAIL transitions or to put it another way mostly empty.
3214 (Note that leaf nodes will not contain any transitions.)
3215
3216 This algorithm compresses the tables by eliminating most such
3217 transitions, at the cost of a modest bit of extra work during lookup:
3218
3219 - Each states[] entry contains a .base field which indicates the
3220 index in the state[] array wheres its transition data is stored.
3221
3b753521 3222 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
3223
3224 - If .base is nonzero then charid is added to it to find an entry in
3225 the trans array.
3226
3227 -If trans[states[state].base+charid].check!=state then the
3228 transition is taken to be a 0/Fail transition. Thus if there are fail
3229 transitions at the front of the node then the .base offset will point
3230 somewhere inside the previous nodes data (or maybe even into a node
3231 even earlier), but the .check field determines if the transition is
3232 valid.
3233
786e8c11 3234 XXX - wrong maybe?
a3621e74 3235 The following process inplace converts the table to the compressed
3b753521 3236 table: We first do not compress the root node 1,and mark all its
a3621e74 3237 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
3238 allows us to do a DFA construction from the compressed table later,
3239 and ensures that any .base pointers we calculate later are greater
3240 than 0.
a3621e74
YO
3241
3242 - We set 'pos' to indicate the first entry of the second node.
3243
3244 - We then iterate over the columns of the node, finding the first and
3245 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3246 and set the .check pointers accordingly, and advance pos
3247 appropriately and repreat for the next node. Note that when we copy
3248 the next pointers we have to convert them from the original
3249 NODEIDX form to NODENUM form as the former is not valid post
3250 compression.
3251
3252 - If a node has no transitions used we mark its base as 0 and do not
3253 advance the pos pointer.
3254
3255 - If a node only has one transition we use a second pointer into the
3256 structure to fill in allocated fail transitions from other states.
3257 This pointer is independent of the main pointer and scans forward
3258 looking for null transitions that are allocated to a state. When it
3259 finds one it writes the single transition into the "hole". If the
786e8c11 3260 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
3261
3262 - Once compressed we can Renew/realloc the structures to release the
3263 excess space.
3264
3265 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3266 specifically Fig 3.47 and the associated pseudocode.
3267
3268 demq
3269 */
a3b680e6 3270 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 3271 U32 state, charid;
a3621e74 3272 U32 pos = 0, zp=0;
1e2e3d02 3273 trie->statecount = laststate;
a3621e74
YO
3274
3275 for ( state = 1 ; state < laststate ; state++ ) {
3276 U8 flag = 0;
a28509cc
AL
3277 const U32 stateidx = TRIE_NODEIDX( state );
3278 const U32 o_used = trie->trans[ stateidx ].check;
3279 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
3280 trie->trans[ stateidx ].check = 0;
3281
538e84ed
KW
3282 for ( charid = 0;
3283 used && charid < trie->uniquecharcount;
3284 charid++ )
3285 {
a3621e74
YO
3286 if ( flag || trie->trans[ stateidx + charid ].next ) {
3287 if ( trie->trans[ stateidx + charid ].next ) {
3288 if (o_used == 1) {
3289 for ( ; zp < pos ; zp++ ) {
3290 if ( ! trie->trans[ zp ].next ) {
3291 break;
3292 }
3293 }
538e84ed
KW
3294 trie->states[ state ].trans.base
3295 = zp
3296 + trie->uniquecharcount
3297 - charid ;
3298 trie->trans[ zp ].next
3299 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3300 + charid ].next );
a3621e74
YO
3301 trie->trans[ zp ].check = state;
3302 if ( ++zp > pos ) pos = zp;
3303 break;
3304 }
3305 used--;
3306 }
3307 if ( !flag ) {
3308 flag = 1;
538e84ed
KW
3309 trie->states[ state ].trans.base
3310 = pos + trie->uniquecharcount - charid ;
a3621e74 3311 }
538e84ed
KW
3312 trie->trans[ pos ].next
3313 = SAFE_TRIE_NODENUM(
3314 trie->trans[ stateidx + charid ].next );
a3621e74
YO
3315 trie->trans[ pos ].check = state;
3316 pos++;
3317 }
3318 }
3319 }
cc601c31 3320 trie->lasttrans = pos + 1;
c944940b
JH
3321 trie->states = (reg_trie_state *)
3322 PerlMemShared_realloc( trie->states, laststate
3323 * sizeof(reg_trie_state) );
a3621e74 3324 DEBUG_TRIE_COMPILE_MORE_r(
147e3846 3325 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
cb41e5d6 3326 depth+1,
538e84ed
KW
3327 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3328 + 1 ),
3329 (IV)next_alloc,
3330 (IV)pos,
3331 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
a3621e74
YO
3332 );
3333
3334 } /* end table compress */
3335 }
1e2e3d02 3336 DEBUG_TRIE_COMPILE_MORE_r(
147e3846 3337 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
cb41e5d6 3338 depth+1,
1e2e3d02
YO
3339 (UV)trie->statecount,
3340 (UV)trie->lasttrans)
3341 );
cc601c31 3342 /* resize the trans array to remove unused space */
c944940b
JH
3343 trie->trans = (reg_trie_trans *)
3344 PerlMemShared_realloc( trie->trans, trie->lasttrans
3345 * sizeof(reg_trie_trans) );
a3621e74 3346
538e84ed 3347 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
3348 U8 nodetype =(U8)(flags & 0xFF);
3349 char *str=NULL;
538e84ed 3350
07be1b83 3351#ifdef DEBUGGING
e62cc96a 3352 regnode *optimize = NULL;
7122b237
YO
3353#ifdef RE_TRACK_PATTERN_OFFSETS
3354
b57a0404
JH
3355 U32 mjd_offset = 0;
3356 U32 mjd_nodelen = 0;
7122b237
YO
3357#endif /* RE_TRACK_PATTERN_OFFSETS */
3358#endif /* DEBUGGING */
a3621e74 3359 /*
3dab1dad
YO
3360 This means we convert either the first branch or the first Exact,
3361 depending on whether the thing following (in 'last') is a branch
3362 or not and whther first is the startbranch (ie is it a sub part of
3363 the alternation or is it the whole thing.)
3b753521 3364 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 3365 the whole branch sequence, including the first.
a3621e74 3366 */
3dab1dad 3367 /* Find the node we are going to overwrite */
7f69552c 3368 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 3369 /* branch sub-chain */
3dab1dad 3370 NEXT_OFF( first ) = (U16)(last - first);
7122b237 3371#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
3372 DEBUG_r({
3373 mjd_offset= Node_Offset((convert));
3374 mjd_nodelen= Node_Length((convert));
3375 });
7122b237 3376#endif
7f69552c 3377 /* whole branch chain */
7122b237
YO
3378 }
3379#ifdef RE_TRACK_PATTERN_OFFSETS
3380 else {
7f69552c
YO
3381 DEBUG_r({
3382 const regnode *nop = NEXTOPER( convert );
3383 mjd_offset= Node_Offset((nop));
3384 mjd_nodelen= Node_Length((nop));
3385 });
07be1b83
YO
3386 }
3387 DEBUG_OPTIMISE_r(
147e3846 3388 Perl_re_indentf( aTHX_ "MJD offset:%" UVuf " MJD length:%" UVuf "\n",
cb41e5d6 3389 depth+1,
786e8c11 3390 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 3391 );
7122b237 3392#endif
538e84ed 3393 /* But first we check to see if there is a common prefix we can
3dab1dad
YO
3394 split out as an EXACT and put in front of the TRIE node. */
3395 trie->startstate= 1;
55eed653 3396 if ( trie->bitmap && !widecharmap && !trie->jump ) {
5ee57374
YO
3397 /* we want to find the first state that has more than
3398 * one transition, if that state is not the first state
3399 * then we have a common prefix which we can remove.
3400 */
3dab1dad 3401 U32 state;
1e2e3d02 3402 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 3403 U32 ofs = 0;
ee0dfd0b
YO
3404 I32 first_ofs = -1; /* keeps track of the ofs of the first
3405 transition, -1 means none */
8e11feef
RGS
3406 U32 count = 0;
3407 const U32 base = trie->states[ state ].trans.base;
a3621e74 3408
5ee57374 3409 /* does this state terminate an alternation? */
3dab1dad 3410 if ( trie->states[state].wordnum )
8e11feef 3411 count = 1;
a3621e74 3412
8e11feef 3413 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
3414 if ( ( base + ofs >= trie->uniquecharcount ) &&
3415 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
3416 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3417 {
3dab1dad 3418 if ( ++count > 1 ) {
5ee57374 3419 /* we have more than one transition */
d3d91c7c
YO
3420 SV **tmp;
3421 U8 *ch;
5ee57374
YO
3422 /* if this is the first state there is no common prefix
3423 * to extract, so we can exit */
8e11feef 3424 if ( state == 1 ) break;
d3d91c7c
YO
3425 tmp = av_fetch( revcharmap, ofs, 0);
3426 ch = (U8*)SvPV_nolen_const( *tmp );
3427
5ee57374
YO
3428 /* if we are on count 2 then we need to initialize the
3429 * bitmap, and store the previous char if there was one
3430 * in it*/
3dab1dad 3431 if ( count == 2 ) {
5ee57374