This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
gh16947: avoid mutating regexp program only within GOSUB
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
aa2885ce
KW
73
74/* Note on debug output:
75 *
76 * This is set up so that -Dr turns on debugging like all other flags that are
77 * enabled by -DDEBUGGING. -Drv gives more verbose output. This applies to
78 * all regular expressions encountered in a program, and gives a huge amount of
79 * output for all but the shortest programs.
80 *
81 * The ability to output pattern debugging information lexically, and with much
82 * finer grained control was added, with 'use re qw(Debug ....);' available even
83 * in non-DEBUGGING builds. This is accomplished by copying the contents of
84 * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
85 * Those files are compiled and linked into the perl executable, and they are
86 * compiled essentially as if DEBUGGING were enabled, and controlled by calls
87 * to re.pm.
88 *
89 * That would normally mean linking errors when two functions of the same name
90 * are attempted to be placed into the same executable. That is solved in one
91 * of four ways:
92 * 1) Static functions aren't known outside the file they are in, so for the
93 * many functions of that type in this file, it just isn't a problem.
94 * 2) Most externally known functions are enclosed in
95 * #ifndef PERL_IN_XSUB_RE
96 * ...
97 * #endif
98 * blocks, so there is only one defintion for them in the whole
99 * executable, the one in regcomp.c (or regexec.c). The implication of
100 * that is any debugging info that comes from them is controlled only by
101 * -Dr. Further, any static function they call will also be the version
102 * in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
103 * 3) About a dozen external functions are re-#defined in ext/re/re_top.h, to
104 * have different names, so that what gets loaded in the executable is
105 * 'Perl_foo' from regcomp.c (and regexec.c), and the identical function
106 * from re_comp.c (and re_exec.c), but with the name 'my_foo' Debugging
107 * in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
108 * versions and their callees are under control of re.pm. The catch is
109 * that references to all these go through the regexp_engine structure,
110 * which is initialized in regcomp.h to the Perl_foo versions, and
111 * substituted out in lexical scopes where 'use re' is in effect to the
112 * 'my_foo' ones. That structure is public API, so it would be a hard
113 * sell to add any additional members.
114 * 4) For functions in regcomp.c and re_comp.c that are called only from,
115 * respectively, regexec.c and re_exec.c, they can have two different
116 * names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
117 * embed.fnc.
118 *
119 * The bottom line is that if you add code to one of the public functions
120 * listed in ext/re/re_top.h, debugging automagically works. But if you write
121 * a new function that needs to do debugging or there is a chain of calls from
122 * it that need to do debugging, all functions in the chain should use options
123 * 2) or 4) above.
124 *
125 * A function may have to be split so that debugging stuff is static, but it
126 * calls out to some other function that only gets compiled in regcomp.c to
127 * access data that we don't want to duplicate.
128 */
129
a687059c 130#include "EXTERN.h"
864dbfa3 131#define PERL_IN_REGCOMP_C
a687059c 132#include "perl.h"
d06ea78c 133
c277df42 134#define REG_COMP_C
54df2634
NC
135#ifdef PERL_IN_XSUB_RE
136# include "re_comp.h"
afda64e8 137EXTERN_C const struct regexp_engine my_reg_engine;
51bd1941 138EXTERN_C const struct regexp_engine wild_reg_engine;
54df2634
NC
139#else
140# include "regcomp.h"
141#endif
a687059c 142
b992490d 143#include "invlist_inline.h"
1b0f46bf 144#include "unicode_constants.h"
04e98a4d 145
538e84ed
KW
146#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
147 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
f12c0118
KW
148#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
149 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 150#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 151#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 152
a687059c
LW
153#ifndef STATIC
154#define STATIC static
155#endif
156
3f910e62
YO
157/* this is a chain of data about sub patterns we are processing that
158 need to be handled separately/specially in study_chunk. Its so
159 we can simulate recursion without losing state. */
160struct scan_frame;
161typedef struct scan_frame {
162 regnode *last_regnode; /* last node to process in this frame */
163 regnode *next_regnode; /* next node to process when last is reached */
164 U32 prev_recursed_depth;
165 I32 stopparen; /* what stopparen do we use */
089ad25d 166 bool in_gosub; /* this or an outer frame is for GOSUB */
3f910e62
YO
167
168 struct scan_frame *this_prev_frame; /* this previous frame */
169 struct scan_frame *prev_frame; /* previous frame */
170 struct scan_frame *next_frame; /* next frame */
171} scan_frame;
172
f4ae5a27
KW
173/* Certain characters are output as a sequence with the first being a
174 * backslash. */
4aada8b9 175#define isBACKSLASHED_PUNCT(c) memCHRs("-[]\\^", c)
f4ae5a27
KW
176
177
09b2b2e6 178struct RExC_state_t {
514a91f1
DM
179 U32 flags; /* RXf_* are we folding, multilining? */
180 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 181 char *precomp; /* uncompiled string. */
711b303b 182 char *precomp_end; /* pointer to end of uncompiled string. */
288b8c02 183 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf 184 regexp *rx; /* perl core regexp structure */
538e84ed
KW
185 regexp_internal *rxi; /* internal data for regexp object
186 pprivate field */
fac92740 187 char *start; /* Start of input for compile */
830247a4
IZ
188 char *end; /* End of input for compile */
189 char *parse; /* Input-scan pointer. */
51684e3c
KW
190 char *copy_start; /* start of copy of input within
191 constructed parse string */
cc16d262
KW
192 char *save_copy_start; /* Provides one level of saving
193 and restoring 'copy_start' */
51684e3c
KW
194 char *copy_start_in_input; /* Position in input string
195 corresponding to copy_start */
ea3daa5d 196 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 197 regnode *emit_start; /* Start of emitted-code area */
f55b7b26 198 regnode_offset emit; /* Code-emit pointer */
830247a4
IZ
199 I32 naughty; /* How bad is this pattern? */
200 I32 sawback; /* Did we see \1, ...? */
88f063b4
KW
201 SSize_t size; /* Number of regnode equivalents in
202 pattern */
d8d1dede
KW
203 Size_t sets_depth; /* Counts recursion depth of already-
204 compiled regex set patterns */
2475254e
RL
205 U32 seen;
206
207 I32 parens_buf_size; /* #slots malloced open/close_parens */
208 regnode_offset *open_parens; /* offsets to open parens */
209 regnode_offset *close_parens; /* offsets to close parens */
210 HV *paren_names; /* Paren names */
7c932d07
KW
211
212 /* position beyond 'precomp' of the warning message furthest away from
213 * 'precomp'. During the parse, no warnings are raised for any problems
214 * earlier in the parse than this position. This works if warnings are
215 * raised the first time a given spot is parsed, and if only one
216 * independent warning is raised for any given spot */
217 Size_t latest_warn_offset;
218
219 I32 npar; /* Capture buffer count so far in the
220 parse, (OPEN) plus one. ("par" 0 is
221 the whole pattern)*/
222 I32 total_par; /* During initial parse, is either 0,
223 or -1; the latter indicating a
224 reparse is needed. After that pass,
225 it is what 'npar' became after the
226 pass. Hence, it being > 0 indicates
227 we are in a reparse situation */
538e84ed
KW
228 I32 nestroot; /* root parens we are in - used by
229 accept */
830247a4 230 I32 seen_zerolen;
d5a00e4a 231 regnode *end_op; /* END node in program */
02daf0ab
YO
232 I32 utf8; /* whether the pattern is utf8 or not */
233 I32 orig_utf8; /* whether the pattern was originally in utf8 */
234 /* XXX use this for future optimisation of case
235 * where pattern must be upgraded to utf8. */
e40e74fe
KW
236 I32 uni_semantics; /* If a d charset modifier should use unicode
237 rules, even if the pattern is not in
238 utf8 */
538e84ed 239
88f063b4 240 I32 recurse_count; /* Number of recurse regops we have generated */
2475254e 241 regnode **recurse; /* Recurse regops */
4286711a 242 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
538e84ed 243 through */
09a65838 244 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 245 I32 in_lookbehind;
105c827d 246 I32 in_lookahead;
4624b182 247 I32 contains_locale;
bb3f3ed2 248 I32 override_recoding;
4caef9c3 249 I32 recode_x_to_native;
9d53c457 250 I32 in_multi_char_class;
2475254e 251 int code_index; /* next code_blocks[] slot */
1acab4c5 252 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
68e2671b 253 within pattern */
ee273784 254 SSize_t maxlen; /* mininum possible number of chars in string to match */
3f910e62
YO
255 scan_frame *frame_head;
256 scan_frame *frame_last;
257 U32 frame_count;
7eec73eb 258 AV *warn_text;
945961fd 259 HV *unlexed_names;
d24ca0c5 260 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 261#ifdef DEBUGGING
be8e71aa 262 const char *lastparse;
3dab1dad 263 I32 lastnum;
d9a72fcc 264 U32 study_chunk_recursed_count;
2475254e 265 AV *paren_name_list; /* idx -> name */
c9f0d54c
YO
266 SV *mysv1;
267 SV *mysv2;
88f063b4 268
3dab1dad
YO
269#define RExC_lastparse (pRExC_state->lastparse)
270#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 271#define RExC_paren_name_list (pRExC_state->paren_name_list)
d9a72fcc 272#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
c9f0d54c
YO
273#define RExC_mysv (pRExC_state->mysv1)
274#define RExC_mysv1 (pRExC_state->mysv1)
275#define RExC_mysv2 (pRExC_state->mysv2)
276
3dab1dad 277#endif
bd0b62a6 278 bool seen_d_op;
911bd04e 279 bool strict;
da7cf1cc 280 bool study_started;
034602eb 281 bool in_script_run;
7c932d07 282 bool use_BRANCHJ;
5aabce33
KW
283 bool sWARN_EXPERIMENTAL__VLB;
284 bool sWARN_EXPERIMENTAL__REGEX_SETS;
09b2b2e6 285};
830247a4 286
e2509266 287#define RExC_flags (pRExC_state->flags)
514a91f1 288#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 289#define RExC_precomp (pRExC_state->precomp)
51684e3c
KW
290#define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
291#define RExC_copy_start_in_constructed (pRExC_state->copy_start)
cc16d262 292#define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start)
711b303b 293#define RExC_precomp_end (pRExC_state->precomp_end)
288b8c02 294#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 295#define RExC_rx (pRExC_state->rx)
f8fc2ecf 296#define RExC_rxi (pRExC_state->rxi)
fac92740 297#define RExC_start (pRExC_state->start)
830247a4
IZ
298#define RExC_end (pRExC_state->end)
299#define RExC_parse (pRExC_state->parse)
7c932d07 300#define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
830247a4 301#define RExC_whilem_seen (pRExC_state->whilem_seen)
bd0b62a6
KW
302#define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
303 under /d from /u ? */
512c0f5a 304
7122b237 305#ifdef RE_TRACK_PATTERN_OFFSETS
44f36cd3 306# define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
538e84ed 307 others */
7122b237 308#endif
830247a4 309#define RExC_emit (pRExC_state->emit)
fac92740 310#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
311#define RExC_sawback (pRExC_state->sawback)
312#define RExC_seen (pRExC_state->seen)
313#define RExC_size (pRExC_state->size)
ee273784 314#define RExC_maxlen (pRExC_state->maxlen)
830247a4 315#define RExC_npar (pRExC_state->npar)
1533d1d2 316#define RExC_total_parens (pRExC_state->total_par)
0275405c 317#define RExC_parens_buf_size (pRExC_state->parens_buf_size)
e2e6a0f1 318#define RExC_nestroot (pRExC_state->nestroot)
830247a4 319#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 320#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 321#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 322#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
323#define RExC_open_parens (pRExC_state->open_parens)
324#define RExC_close_parens (pRExC_state->close_parens)
5bd2d46e 325#define RExC_end_op (pRExC_state->end_op)
81714fb9 326#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
327#define RExC_recurse (pRExC_state->recurse)
328#define RExC_recurse_count (pRExC_state->recurse_count)
d8d1dede 329#define RExC_sets_depth (pRExC_state->sets_depth)
09a65838 330#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
538e84ed
KW
331#define RExC_study_chunk_recursed_bytes \
332 (pRExC_state->study_chunk_recursed_bytes)
b57e4118 333#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
105c827d 334#define RExC_in_lookahead (pRExC_state->in_lookahead)
4624b182 335#define RExC_contains_locale (pRExC_state->contains_locale)
4caef9c3
KW
336#define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
337
b6d67071 338#ifdef EBCDIC
4caef9c3
KW
339# define SET_recode_x_to_native(x) \
340 STMT_START { RExC_recode_x_to_native = (x); } STMT_END
341#else
342# define SET_recode_x_to_native(x) NOOP
b6d67071 343#endif
4caef9c3 344
9d53c457 345#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
3f910e62
YO
346#define RExC_frame_head (pRExC_state->frame_head)
347#define RExC_frame_last (pRExC_state->frame_last)
348#define RExC_frame_count (pRExC_state->frame_count)
67cdf558 349#define RExC_strict (pRExC_state->strict)
da7cf1cc 350#define RExC_study_started (pRExC_state->study_started)
7eec73eb 351#define RExC_warn_text (pRExC_state->warn_text)
034602eb 352#define RExC_in_script_run (pRExC_state->in_script_run)
7c932d07 353#define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
5aabce33
KW
354#define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB)
355#define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS)
945961fd 356#define RExC_unlexed_names (pRExC_state->unlexed_names)
830247a4 357
99807a43
HS
358/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
359 * a flag to disable back-off on the fixed/floating substrings - if it's
360 * a high complexity pattern we assume the benefit of avoiding a full match
361 * is worth the cost of checking for the substrings even if they rarely help.
362 */
363#define RExC_naughty (pRExC_state->naughty)
364#define TOO_NAUGHTY (10)
365#define MARK_NAUGHTY(add) \
366 if (RExC_naughty < TOO_NAUGHTY) \
367 RExC_naughty += (add)
368#define MARK_NAUGHTY_EXP(exp, add) \
369 if (RExC_naughty < TOO_NAUGHTY) \
370 RExC_naughty += RExC_naughty / (exp) + (add)
cde0cee5 371
a687059c
LW
372#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
373#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
412f55bb 374 ((*s) == '{' && regcurly(s)))
a687059c
LW
375
376/*
377 * Flags to be passed up and down.
378 */
a687059c 379#define WORST 0 /* Worst case. */
c4ead770
KW
380#define HASWIDTH 0x01 /* Known to not match null strings, could match
381 non-null ones. */
fda99bee 382
e64f369d 383/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
384 * character. (There needs to be a case: in the switch statement in regexec.c
385 * for any node marked SIMPLE.) Note that this is not the same thing as
386 * REGNODE_SIMPLE */
fda99bee 387#define SIMPLE 0x02
e64f369d 388#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
389#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
390#define TRYAGAIN 0x10 /* Weeded out a declaration. */
c36361b2
KW
391#define RESTART_PARSE 0x20 /* Need to redo the parse */
392#define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
393 calcuate sizes as UTF-8 */
a687059c 394
3dab1dad
YO
395#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
396
07be1b83
YO
397/* whether trie related optimizations are enabled */
398#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
399#define TRIE_STUDY_OPT
786e8c11 400#define FULL_TRIE_STUDY
07be1b83
YO
401#define TRIE_STCLASS
402#endif
1de06328
YO
403
404
40d049e4
YO
405
406#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
407#define PBITVAL(paren) (1 << ((paren) & 7))
ac3ef366
HS
408#define PAREN_OFFSET(depth) \
409 (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
410#define PAREN_TEST(depth, paren) \
411 (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
412#define PAREN_SET(depth, paren) \
413 (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
414#define PAREN_UNSET(depth, paren) \
415 (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
40d049e4 416
82a6ada4 417#define REQUIRE_UTF8(flagp) STMT_START { \
8d9c2815 418 if (!UTF) { \
c36361b2 419 *flagp = RESTART_PARSE|NEED_UTF8; \
f55b7b26 420 return 0; \
8d9c2815 421 } \
82a6ada4 422 } STMT_END
40d049e4 423
d63d75f5 424/* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
f6e8f31e
KW
425 * a flag that indicates we need to override /d with /u as a result of
426 * something in the pattern. It should only be used in regards to calling
b2090782 427 * set_regex_charset() or get_regex_charset() */
512c0f5a
KW
428#define REQUIRE_UNI_RULES(flagp, restart_retval) \
429 STMT_START { \
430 if (DEPENDS_SEMANTICS) { \
512c0f5a
KW
431 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
432 RExC_uni_semantics = 1; \
b76e1c04 433 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \
bd0b62a6
KW
434 /* No need to restart the parse if we haven't seen \
435 * anything that differs between /u and /d, and no need \
436 * to restart immediately if we're going to reparse \
437 * anyway to count parens */ \
c36361b2 438 *flagp |= RESTART_PARSE; \
512c0f5a 439 return restart_retval; \
d1be211c 440 } \
512c0f5a
KW
441 } \
442 } STMT_END
443
7c932d07
KW
444#define REQUIRE_BRANCHJ(flagp, restart_retval) \
445 STMT_START { \
446 RExC_use_BRANCHJ = 1; \
439a3bfe
KW
447 *flagp |= RESTART_PARSE; \
448 return restart_retval; \
7c932d07
KW
449 } STMT_END
450
b76e1c04
KW
451/* Until we have completed the parse, we leave RExC_total_parens at 0 or
452 * less. After that, it must always be positive, because the whole re is
453 * considered to be surrounded by virtual parens. Setting it to negative
454 * indicates there is some construct that needs to know the actual number of
455 * parens to be properly handled. And that means an extra pass will be
456 * required after we've counted them all */
457#define ALL_PARENS_COUNTED (RExC_total_parens > 0)
7c932d07 458#define REQUIRE_PARENS_PASS \
b76e1c04
KW
459 STMT_START { /* No-op if have completed a pass */ \
460 if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \
7c932d07 461 } STMT_END
b76e1c04
KW
462#define IN_PARENS_PASS (RExC_total_parens < 0)
463
7c932d07 464
f358228e
KW
465/* This is used to return failure (zero) early from the calling function if
466 * various flags in 'flags' are set. Two flags always cause a return:
467 * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any
468 * additional flags that should cause a return; 0 if none. If the return will
469 * be done, '*flagp' is first set to be all of the flags that caused the
470 * return. */
471#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
fd7fc6ae 472 STMT_START { \
c36361b2
KW
473 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
474 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
f358228e 475 return 0; \
fd7fc6ae
YO
476 } \
477 } STMT_END
478
c36361b2 479#define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
fd7fc6ae 480
5d555bb8 481#define RETURN_FAIL_ON_RESTART(flags,flagp) \
f358228e 482 RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
5d555bb8 483#define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
f358228e 484 if (MUST_RESTART(*(flagp))) return 0
fd7fc6ae 485
f19b1a63
KW
486/* This converts the named class defined in regcomp.h to its equivalent class
487 * number defined in handy.h. */
488#define namedclass_to_classnum(class) ((int) ((class) / 2))
489#define classnum_to_namedclass(classnum) ((classnum) * 2)
490
de92f5e6
KW
491#define _invlist_union_complement_2nd(a, b, output) \
492 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
493#define _invlist_intersection_complement_2nd(a, b, output) \
494 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
495
4a808c87
KW
496/* We add a marker if we are deferring expansion of a property that is both
497 * 1) potentiallly user-defined; and
498 * 2) could also be an official Unicode property.
499 *
500 * Without this marker, any deferred expansion can only be for a user-defined
501 * one. This marker shouldn't conflict with any that could be in a legal name,
502 * and is appended to its name to indicate this. There is a string and
503 * character form */
043cdb9a
KW
504#define DEFERRED_COULD_BE_OFFICIAL_MARKERs "~"
505#define DEFERRED_COULD_BE_OFFICIAL_MARKERc '~'
1c2f3d7a 506
0069caf1
KW
507/* What is infinity for optimization purposes */
508#define OPTIMIZE_INFTY SSize_t_MAX
509
1de06328
YO
510/* About scan_data_t.
511
512 During optimisation we recurse through the regexp program performing
513 various inplace (keyhole style) optimisations. In addition study_chunk
514 and scan_commit populate this data structure with information about
538e84ed 515 what strings MUST appear in the pattern. We look for the longest
3b753521 516 string that must appear at a fixed location, and we look for the
1de06328
YO
517 longest string that may appear at a floating location. So for instance
518 in the pattern:
538e84ed 519
1de06328 520 /FOO[xX]A.*B[xX]BAR/
538e84ed 521
1de06328
YO
522 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
523 strings (because they follow a .* construct). study_chunk will identify
524 both FOO and BAR as being the longest fixed and floating strings respectively.
538e84ed 525
1de06328 526 The strings can be composites, for instance
538e84ed 527
1de06328 528 /(f)(o)(o)/
538e84ed 529
1de06328 530 will result in a composite fixed substring 'foo'.
538e84ed 531
1de06328 532 For each string some basic information is maintained:
538e84ed 533
6ea77169 534 - min_offset
1de06328
YO
535 This is the position the string must appear at, or not before.
536 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
537 characters must match before the string we are searching for.
538 Likewise when combined with minlenp and the length of the string it
538e84ed 539 tells us how many characters must appear after the string we have
1de06328 540 found.
538e84ed 541
1de06328
YO
542 - max_offset
543 Only used for floating strings. This is the rightmost point that
0069caf1 544 the string can appear at. If set to OPTIMIZE_INFTY it indicates that the
1de06328 545 string can occur infinitely far to the right.
6e12f832 546 For fixed strings, it is equal to min_offset.
538e84ed 547
1de06328 548 - minlenp
2d608413
KW
549 A pointer to the minimum number of characters of the pattern that the
550 string was found inside. This is important as in the case of positive
538e84ed 551 lookahead or positive lookbehind we can have multiple patterns
1de06328 552 involved. Consider
538e84ed 553
1de06328 554 /(?=FOO).*F/
538e84ed 555
1de06328
YO
556 The minimum length of the pattern overall is 3, the minimum length
557 of the lookahead part is 3, but the minimum length of the part that
538e84ed 558 will actually match is 1. So 'FOO's minimum length is 3, but the
1de06328 559 minimum length for the F is 1. This is important as the minimum length
538e84ed 560 is used to determine offsets in front of and behind the string being
1de06328 561 looked for. Since strings can be composites this is the length of the
486ec47a 562 pattern at the time it was committed with a scan_commit. Note that
1de06328 563 the length is calculated by study_chunk, so that the minimum lengths
538e84ed 564 are not known until the full pattern has been compiled, thus the
1de06328 565 pointer to the value.
538e84ed 566
1de06328 567 - lookbehind
538e84ed 568
1de06328 569 In the case of lookbehind the string being searched for can be
538e84ed 570 offset past the start point of the final matching string.
1de06328
YO
571 If this value was just blithely removed from the min_offset it would
572 invalidate some of the calculations for how many chars must match
573 before or after (as they are derived from min_offset and minlen and
538e84ed 574 the length of the string being searched for).
1de06328
YO
575 When the final pattern is compiled and the data is moved from the
576 scan_data_t structure into the regexp structure the information
538e84ed
KW
577 about lookbehind is factored in, with the information that would
578 have been lost precalculated in the end_shift field for the
1de06328
YO
579 associated string.
580
581 The fields pos_min and pos_delta are used to store the minimum offset
538e84ed 582 and the delta to the maximum offset at the current point in the pattern.
1de06328
YO
583
584*/
2c2d71f5 585
cb6aef8b
DM
586struct scan_data_substrs {
587 SV *str; /* longest substring found in pattern */
588 SSize_t min_offset; /* earliest point in string it can appear */
589 SSize_t max_offset; /* latest point in string it can appear */
590 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
591 SSize_t lookbehind; /* is the pos of the string modified by LB */
11683ecb 592 I32 flags; /* per substring SF_* and SCF_* flags */
cb6aef8b
DM
593};
594
2c2d71f5 595typedef struct scan_data_t {
1de06328
YO
596 /*I32 len_min; unused */
597 /*I32 len_delta; unused */
49f55535 598 SSize_t pos_min;
ea3daa5d 599 SSize_t pos_delta;
2c2d71f5 600 SV *last_found;
ea3daa5d 601 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 602 SSize_t last_start_min;
ea3daa5d 603 SSize_t last_start_max;
2099df82
DM
604 U8 cur_is_floating; /* whether the last_* values should be set as
605 * the next fixed (0) or floating (1)
606 * substring */
6ea77169 607
2099df82
DM
608 /* [0] is longest fixed substring so far, [1] is longest float so far */
609 struct scan_data_substrs substrs[2];
6ea77169 610
11683ecb 611 I32 flags; /* common SF_* and SCF_* flags */
2c2d71f5 612 I32 whilem_c;
49f55535 613 SSize_t *last_closep;
b8f7bb16 614 regnode_ssc *start_class;
2c2d71f5
JH
615} scan_data_t;
616
a687059c 617/*
e50aee73 618 * Forward declarations for pregcomp()'s friends.
a687059c 619 */
a0d0e21e 620
6ea77169 621static const scan_data_t zero_scan_data = {
a2ca2017 622 0, 0, NULL, 0, 0, 0, 0,
6ea77169 623 {
11683ecb
DM
624 { NULL, 0, 0, 0, 0, 0 },
625 { NULL, 0, 0, 0, 0, 0 },
6ea77169
DM
626 },
627 0, 0, NULL, NULL
628};
c277df42 629
11683ecb
DM
630/* study flags */
631
07be1b83
YO
632#define SF_BEFORE_SEOL 0x0001
633#define SF_BEFORE_MEOL 0x0002
11683ecb 634#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
c277df42 635
07be1b83
YO
636#define SF_IS_INF 0x0040
637#define SF_HAS_PAR 0x0080
638#define SF_IN_PAR 0x0100
639#define SF_HAS_EVAL 0x0200
bc604ad8
KW
640
641
642/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
643 * longest substring in the pattern. When it is not set the optimiser keeps
644 * track of position, but does not keep track of the actual strings seen,
645 *
646 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
647 * /foo/i will not.
648 *
649 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
650 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
651 * turned off because of the alternation (BRANCH). */
07be1b83 652#define SCF_DO_SUBSTR 0x0400
bc604ad8 653
653099ff
GS
654#define SCF_DO_STCLASS_AND 0x0800
655#define SCF_DO_STCLASS_OR 0x1000
656#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 657#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 658
786e8c11 659#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
538e84ed 660#define SCF_SEEN_ACCEPT 0x8000
688e0391 661#define SCF_TRIE_DOING_RESTUDY 0x10000
4286711a
YO
662#define SCF_IN_DEFINE 0x20000
663
664
665
07be1b83 666
43fead97 667#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
668
669/* The enums for all these are ordered so things work out correctly */
a62b1201 670#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
538e84ed
KW
671#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
672 == REGEX_DEPENDS_CHARSET)
00b27cfc 673#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
538e84ed
KW
674#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
675 >= REGEX_UNICODE_CHARSET)
676#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
677 == REGEX_ASCII_RESTRICTED_CHARSET)
678#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
679 >= REGEX_ASCII_RESTRICTED_CHARSET)
680#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
681 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 682
43fead97 683#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 684
f2c2a6ab
KW
685/* For programs that want to be strictly Unicode compatible by dying if any
686 * attempt is made to match a non-Unicode code point against a Unicode
687 * property. */
688#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
689
93733859 690#define OOB_NAMEDCLASS -1
b8c5462f 691
8e661ac5
KW
692/* There is no code point that is out-of-bounds, so this is problematic. But
693 * its only current use is to initialize a variable that is always set before
694 * looked at. */
695#define OOB_UNICODE 0xDEADBEEF
696
a0ed51b3 697#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
a0ed51b3 698
8615cb43 699
b45f050a
JF
700/* length of regex to show in messages that don't mark a position within */
701#define RegexLengthToShowInErrorMessages 127
702
703/*
704 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
705 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
706 * op/pragma/warn/regcomp.
707 */
7253e4e3
RK
708#define MARKER1 "<-- HERE" /* marker as it appears in the description */
709#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 710
538e84ed 711#define REPORT_LOCATION " in regex; marked by " MARKER1 \
147e3846 712 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
b45f050a 713
285b5ca0
KW
714/* The code in this file in places uses one level of recursion with parsing
715 * rebased to an alternate string constructed by us in memory. This can take
716 * the form of something that is completely different from the input, or
717 * something that uses the input as part of the alternate. In the first case,
718 * there should be no possibility of an error, as we are in complete control of
51684e3c
KW
719 * the alternate string. But in the second case we don't completely control
720 * the input portion, so there may be errors in that. Here's an example:
285b5ca0
KW
721 * /[abc\x{DF}def]/ui
722 * is handled specially because \x{df} folds to a sequence of more than one
51684e3c 723 * character: 'ss'. What is done is to create and parse an alternate string,
285b5ca0
KW
724 * which looks like this:
725 * /(?:\x{DF}|[abc\x{DF}def])/ui
726 * where it uses the input unchanged in the middle of something it constructs,
727 * which is a branch for the DF outside the character class, and clustering
728 * parens around the whole thing. (It knows enough to skip the DF inside the
729 * class while in this substitute parse.) 'abc' and 'def' may have errors that
730 * need to be reported. The general situation looks like this:
731 *
51684e3c 732 * |<------- identical ------>|
285b5ca0 733 * sI tI xI eI
51684e3c 734 * Input: ---------------------------------------------------------------
285b5ca0
KW
735 * Constructed: ---------------------------------------------------
736 * sC tC xC eC EC
51684e3c 737 * |<------- identical ------>|
285b5ca0 738 *
51684e3c
KW
739 * sI..eI is the portion of the input pattern we are concerned with here.
740 * sC..EC is the constructed substitute parse string.
741 * sC..tC is constructed by us
742 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
743 * In the diagram, these are vertically aligned.
744 * eC..EC is also constructed by us.
745 * xC is the position in the substitute parse string where we found a
746 * problem.
747 * xI is the position in the original pattern corresponding to xC.
285b5ca0 748 *
51684e3c
KW
749 * We want to display a message showing the real input string. Thus we need to
750 * translate from xC to xI. We know that xC >= tC, since the portion of the
751 * string sC..tC has been constructed by us, and so shouldn't have errors. We
752 * get:
753 * xI = tI + (xC - tC)
285b5ca0 754 *
51684e3c
KW
755 * When the substitute parse is constructed, the code needs to set:
756 * RExC_start (sC)
757 * RExC_end (eC)
758 * RExC_copy_start_in_input (tI)
759 * RExC_copy_start_in_constructed (tC)
760 * and restore them when done.
285b5ca0 761 *
51684e3c
KW
762 * During normal processing of the input pattern, both
763 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
764 * sI, so that xC equals xI.
285b5ca0
KW
765 */
766
51684e3c
KW
767#define sI RExC_precomp
768#define eI RExC_precomp_end
769#define sC RExC_start
770#define eC RExC_end
771#define tI RExC_copy_start_in_input
772#define tC RExC_copy_start_in_constructed
773#define xI(xC) (tI + (xC - tC))
774#define xI_offset(xC) (xI(xC) - sI)
285b5ca0
KW
775
776#define REPORT_LOCATION_ARGS(xC) \
777 UTF8fARG(UTF, \
51684e3c 778 (xI(xC) > eI) /* Don't run off end */ \
232b691f 779 ? eI - sI /* Length before the <--HERE */ \
bc929b61
KW
780 : ((xI_offset(xC) >= 0) \
781 ? xI_offset(xC) \
782 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
783 IVdf " trying to output message for " \
784 " pattern %.*s", \
51684e3c 785 __FILE__, __LINE__, (IV) xI_offset(xC), \
bc929b61 786 ((int) (eC - sC)), sC), 0)), \
51684e3c 787 sI), /* The input pattern printed up to the <--HERE */ \
285b5ca0 788 UTF8fARG(UTF, \
51684e3c
KW
789 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \
790 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */
c1d900c3 791
8a6d8ec6
HS
792/* Used to point after bad bytes for an error message, but avoid skipping
793 * past a nul byte. */
e3304382 794#define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
8a6d8ec6 795
ed68fe11
KW
796/* Set up to clean up after our imminent demise */
797#define PREPARE_TO_DIE \
798 STMT_START { \
799 if (RExC_rx_sv) \
800 SAVEFREESV(RExC_rx_sv); \
7c932d07
KW
801 if (RExC_open_parens) \
802 SAVEFREEPV(RExC_open_parens); \
803 if (RExC_close_parens) \
804 SAVEFREEPV(RExC_close_parens); \
ed68fe11
KW
805 } STMT_END
806
b45f050a
JF
807/*
808 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
809 * arg. Show regex, up to a maximum length. If it's too long, chop and add
810 * "...".
811 */
58e23c8d 812#define _FAIL(code) STMT_START { \
bfed75c6 813 const char *ellipses = ""; \
88f063b4 814 IV len = RExC_precomp_end - RExC_precomp; \
ccb2c380 815 \
ed68fe11 816 PREPARE_TO_DIE; \
ccb2c380
MP
817 if (len > RegexLengthToShowInErrorMessages) { \
818 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
819 len = RegexLengthToShowInErrorMessages - 10; \
820 ellipses = "..."; \
821 } \
58e23c8d 822 code; \
ccb2c380 823} STMT_END
8615cb43 824
58e23c8d 825#define FAIL(msg) _FAIL( \
147e3846 826 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
c1d900c3 827 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
828
829#define FAIL2(msg,arg) _FAIL( \
147e3846 830 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
c1d900c3 831 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 832
4b7718d6
KW
833#define FAIL3(msg,arg1,arg2) _FAIL( \
834 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
835 arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
836
b45f050a 837/*
b45f050a
JF
838 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
839 */
ccb2c380 840#define Simple_vFAIL(m) STMT_START { \
ccb2c380 841 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
d528642a 842 m, REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 843} STMT_END
b45f050a
JF
844
845/*
846 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
847 */
ccb2c380 848#define vFAIL(m) STMT_START { \
ed68fe11 849 PREPARE_TO_DIE; \
ccb2c380
MP
850 Simple_vFAIL(m); \
851} STMT_END
b45f050a
JF
852
853/*
854 * Like Simple_vFAIL(), but accepts two arguments.
855 */
ccb2c380 856#define Simple_vFAIL2(m,a1) STMT_START { \
c6572a6d 857 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \
d528642a 858 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 859} STMT_END
b45f050a
JF
860
861/*
862 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
863 */
ccb2c380 864#define vFAIL2(m,a1) STMT_START { \
ed68fe11 865 PREPARE_TO_DIE; \
ccb2c380
MP
866 Simple_vFAIL2(m, a1); \
867} STMT_END
b45f050a
JF
868
869
870/*
871 * Like Simple_vFAIL(), but accepts three arguments.
872 */
ccb2c380 873#define Simple_vFAIL3(m, a1, a2) STMT_START { \
c6572a6d 874 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \
d528642a 875 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 876} STMT_END
b45f050a
JF
877
878/*
879 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
880 */
ccb2c380 881#define vFAIL3(m,a1,a2) STMT_START { \
ed68fe11 882 PREPARE_TO_DIE; \
ccb2c380
MP
883 Simple_vFAIL3(m, a1, a2); \
884} STMT_END
b45f050a
JF
885
886/*
887 * Like Simple_vFAIL(), but accepts four arguments.
888 */
ccb2c380 889#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
c6572a6d 890 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3, \
d528642a 891 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 892} STMT_END
b45f050a 893
95db3ffa 894#define vFAIL4(m,a1,a2,a3) STMT_START { \
ed68fe11 895 PREPARE_TO_DIE; \
95db3ffa
KW
896 Simple_vFAIL4(m, a1, a2, a3); \
897} STMT_END
898
946095af 899/* A specialized version of vFAIL2 that works with UTF8f */
d528642a 900#define vFAIL2utf8f(m, a1) STMT_START { \
ed68fe11 901 PREPARE_TO_DIE; \
c6572a6d 902 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \
d528642a 903 REPORT_LOCATION_ARGS(RExC_parse)); \
946095af
BF
904} STMT_END
905
3ba22297 906#define vFAIL3utf8f(m, a1, a2) STMT_START { \
ed68fe11 907 PREPARE_TO_DIE; \
c6572a6d 908 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \
d528642a 909 REPORT_LOCATION_ARGS(RExC_parse)); \
3ba22297
KW
910} STMT_END
911
d62e749e 912/* Setting this to NULL is a signal to not output warnings */
cc16d262
KW
913#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \
914 STMT_START { \
915 RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\
916 RExC_copy_start_in_constructed = NULL; \
917 } STMT_END
918#define RESTORE_WARNINGS \
919 RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
d62e749e 920
7c932d07
KW
921/* Since a warning can be generated multiple times as the input is reparsed, we
922 * output it the first time we come to that point in the parse, but suppress it
923 * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not
924 * generate any warnings */
55bcf857 925#define TO_OUTPUT_WARNINGS(loc) \
7c932d07
KW
926 ( RExC_copy_start_in_constructed \
927 && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
55bcf857 928
7c932d07
KW
929/* After we've emitted a warning, we save the position in the input so we don't
930 * output it again */
931#define UPDATE_WARNINGS_LOC(loc) \
932 STMT_START { \
933 if (TO_OUTPUT_WARNINGS(loc)) { \
5d894ca5
KW
934 RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \
935 - RExC_precomp; \
7c932d07
KW
936 } \
937 } STMT_END
77ccba3c 938
086f21bf 939/* 'warns' is the output of the packWARNx macro used in 'code' */
f8220fe2
KW
940#define _WARN_HELPER(loc, warns, code) \
941 STMT_START { \
d62e749e
KW
942 if (! RExC_copy_start_in_constructed) { \
943 Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \
944 " expected at '%s'", \
945 __FILE__, __LINE__, loc); \
946 } \
55bcf857 947 if (TO_OUTPUT_WARNINGS(loc)) { \
086f21bf
KW
948 if (ckDEAD(warns)) \
949 PREPARE_TO_DIE; \
338190a1 950 code; \
77ccba3c 951 UPDATE_WARNINGS_LOC(loc); \
338190a1 952 } \
f8220fe2 953 } STMT_END
946095af 954
5e0a247b 955/* m is not necessarily a "literal string", in this macro */
58ad8160
KW
956#define warn_non_literal_string(loc, packed_warn, m) \
957 _WARN_HELPER(loc, packed_warn, \
958 Perl_warner(aTHX_ packed_warn, \
d528642a 959 "%s" REPORT_LOCATION, \
f8220fe2 960 m, REPORT_LOCATION_ARGS(loc)))
58ad8160
KW
961#define reg_warn_non_literal_string(loc, m) \
962 warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)
5e0a247b 963
eb761011
KW
964#define ckWARN2_non_literal_string(loc, packwarn, m, a1) \
965 STMT_START { \
966 char * format; \
967 Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\
968 Newx(format, format_size, char); \
969 my_strlcpy(format, m, format_size); \
970 my_strlcat(format, REPORT_LOCATION, format_size); \
971 SAVEFREEPV(format); \
972 _WARN_HELPER(loc, packwarn, \
973 Perl_ck_warner(aTHX_ packwarn, \
974 format, \
975 a1, REPORT_LOCATION_ARGS(loc))); \
976 } STMT_END
977
f8220fe2
KW
978#define ckWARNreg(loc,m) \
979 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
980 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 981 m REPORT_LOCATION, \
f8220fe2 982 REPORT_LOCATION_ARGS(loc)))
ccb2c380 983
f8220fe2
KW
984#define vWARN(loc, m) \
985 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
986 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 987 m REPORT_LOCATION, \
f8220fe2 988 REPORT_LOCATION_ARGS(loc))) \
b927b7e9 989
f8220fe2
KW
990#define vWARN_dep(loc, m) \
991 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
992 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
d528642a 993 m REPORT_LOCATION, \
f8220fe2 994 REPORT_LOCATION_ARGS(loc)))
0d6106aa 995
f8220fe2
KW
996#define ckWARNdep(loc,m) \
997 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
998 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
d528642a 999 m REPORT_LOCATION, \
f8220fe2 1000 REPORT_LOCATION_ARGS(loc)))
147508a2 1001
f8220fe2
KW
1002#define ckWARNregdep(loc,m) \
1003 _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
1004 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
d528642a
KW
1005 WARN_REGEXP), \
1006 m REPORT_LOCATION, \
f8220fe2 1007 REPORT_LOCATION_ARGS(loc)))
ccb2c380 1008
f8220fe2
KW
1009#define ckWARN2reg_d(loc,m, a1) \
1010 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1011 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
d528642a 1012 m REPORT_LOCATION, \
f8220fe2 1013 a1, REPORT_LOCATION_ARGS(loc)))
2335b3d3 1014
f8220fe2
KW
1015#define ckWARN2reg(loc, m, a1) \
1016 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1017 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 1018 m REPORT_LOCATION, \
f8220fe2 1019 a1, REPORT_LOCATION_ARGS(loc)))
ccb2c380 1020
f8220fe2
KW
1021#define vWARN3(loc, m, a1, a2) \
1022 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1023 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 1024 m REPORT_LOCATION, \
f8220fe2 1025 a1, a2, REPORT_LOCATION_ARGS(loc)))
ccb2c380 1026
f8220fe2
KW
1027#define ckWARN3reg(loc, m, a1, a2) \
1028 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1029 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
1030 m REPORT_LOCATION, \
1031 a1, a2, \
f8220fe2 1032 REPORT_LOCATION_ARGS(loc)))
668c081a 1033
f8220fe2
KW
1034#define vWARN4(loc, m, a1, a2, a3) \
1035 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1036 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
1037 m REPORT_LOCATION, \
1038 a1, a2, a3, \
f8220fe2 1039 REPORT_LOCATION_ARGS(loc)))
ccb2c380 1040
f8220fe2
KW
1041#define ckWARN4reg(loc, m, a1, a2, a3) \
1042 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1043 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
1044 m REPORT_LOCATION, \
1045 a1, a2, a3, \
f8220fe2 1046 REPORT_LOCATION_ARGS(loc)))
668c081a 1047
f8220fe2
KW
1048#define vWARN5(loc, m, a1, a2, a3, a4) \
1049 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1050 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
1051 m REPORT_LOCATION, \
1052 a1, a2, a3, a4, \
f8220fe2 1053 REPORT_LOCATION_ARGS(loc)))
9d1d55b5 1054
aea5f181 1055#define ckWARNexperimental(loc, class, m) \
5aabce33
KW
1056 STMT_START { \
1057 if (! RExC_warned_ ## class) { /* warn once per compilation */ \
1058 RExC_warned_ ## class = 1; \
1059 _WARN_HELPER(loc, packWARN(class), \
aea5f181
KW
1060 Perl_ck_warner_d(aTHX_ packWARN(class), \
1061 m REPORT_LOCATION, \
5aabce33
KW
1062 REPORT_LOCATION_ARGS(loc)));\
1063 } \
1064 } STMT_END
aea5f181 1065
1a322bb4
KW
1066/* Convert between a pointer to a node and its offset from the beginning of the
1067 * program */
1068#define REGNODE_p(offset) (RExC_emit_start + (offset))
1069#define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
1070
538e84ed 1071/* Macros for recording node offsets. 20001227 mjd@plover.com
fac92740
MJD
1072 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
1073 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
1074 * Element 0 holds the number n.
07be1b83 1075 * Position is 1 indexed.
fac92740 1076 */
7122b237 1077#ifndef RE_TRACK_PATTERN_OFFSETS
d7c442ce 1078#define Set_Node_Offset_To_R(offset,byte)
7122b237
YO
1079#define Set_Node_Offset(node,byte)
1080#define Set_Cur_Node_Offset
1081#define Set_Node_Length_To_R(node,len)
1082#define Set_Node_Length(node,len)
6a86c6ad 1083#define Set_Node_Cur_Length(node,start)
538e84ed
KW
1084#define Node_Offset(n)
1085#define Node_Length(n)
7122b237
YO
1086#define Set_Node_Offset_Length(node,offset,len)
1087#define ProgLen(ri) ri->u.proglen
1088#define SetProgLen(ri,x) ri->u.proglen = x
4cbeff97 1089#define Track_Code(code)
7122b237
YO
1090#else
1091#define ProgLen(ri) ri->u.offsets[0]
1092#define SetProgLen(ri,x) ri->u.offsets[0] = x
d7c442ce 1093#define Set_Node_Offset_To_R(offset,byte) STMT_START { \
ccb2c380 1094 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
d7c442ce
KW
1095 __LINE__, (int)(offset), (int)(byte))); \
1096 if((offset) < 0) { \
538e84ed 1097 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
d7c442ce 1098 (int)(offset)); \
ccb2c380 1099 } else { \
d7c442ce 1100 RExC_offsets[2*(offset)-1] = (byte); \
ccb2c380 1101 } \
ccb2c380
MP
1102} STMT_END
1103
88f063b4 1104#define Set_Node_Offset(node,byte) \
1a322bb4 1105 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
ccb2c380
MP
1106#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
1107
1108#define Set_Node_Length_To_R(node,len) STMT_START { \
ccb2c380 1109 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 1110 __LINE__, (int)(node), (int)(len))); \
ccb2c380 1111 if((node) < 0) { \
538e84ed
KW
1112 Perl_croak(aTHX_ "value of node is %d in Length macro", \
1113 (int)(node)); \
ccb2c380
MP
1114 } else { \
1115 RExC_offsets[2*(node)] = (len); \
1116 } \
ccb2c380
MP
1117} STMT_END
1118
1119#define Set_Node_Length(node,len) \
1a322bb4 1120 Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
6a86c6ad
NC
1121#define Set_Node_Cur_Length(node, start) \
1122 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
1123
1124/* Get offsets and lengths */
1a322bb4
KW
1125#define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1126#define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
fac92740 1127
07be1b83 1128#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
1a322bb4
KW
1129 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \
1130 Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \
07be1b83 1131} STMT_END
4cbeff97
TC
1132
1133#define Track_Code(code) STMT_START { code } STMT_END
7122b237 1134#endif
07be1b83
YO
1135
1136#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1137#define EXPERIMENTAL_INPLACESCAN
f427392e 1138#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 1139
cb41e5d6
YO
1140#ifdef DEBUGGING
1141int
6ad9a8ab 1142Perl_re_printf(pTHX_ const char *fmt, ...)
cb41e5d6
YO
1143{
1144 va_list ap;
1145 int result;
1146 PerlIO *f= Perl_debug_log;
1147 PERL_ARGS_ASSERT_RE_PRINTF;
1148 va_start(ap, fmt);
1149 result = PerlIO_vprintf(f, fmt, ap);
1150 va_end(ap);
1151 return result;
1152}
1153
1154int
7b031478 1155Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
cb41e5d6
YO
1156{
1157 va_list ap;
1158 int result;
1159 PerlIO *f= Perl_debug_log;
1160 PERL_ARGS_ASSERT_RE_INDENTF;
1161 va_start(ap, depth);
daeb874b 1162 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
cb41e5d6
YO
1163 result = PerlIO_vprintf(f, fmt, ap);
1164 va_end(ap);
1165 return result;
1166}
1167#endif /* DEBUGGING */
1168
7b031478 1169#define DEBUG_RExC_seen() \
538e84ed 1170 DEBUG_OPTIMISE_MORE_r({ \
88f063b4 1171 Perl_re_printf( aTHX_ "RExC_seen: "); \
538e84ed 1172 \
e384d5c1 1173 if (RExC_seen & REG_ZERO_LEN_SEEN) \
88f063b4 1174 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
538e84ed 1175 \
e384d5c1 1176 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
88f063b4 1177 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
538e84ed 1178 \
e384d5c1 1179 if (RExC_seen & REG_GPOS_SEEN) \
88f063b4 1180 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
538e84ed 1181 \
e384d5c1 1182 if (RExC_seen & REG_RECURSE_SEEN) \
88f063b4 1183 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
538e84ed 1184 \
7b031478 1185 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
88f063b4 1186 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
538e84ed 1187 \
e384d5c1 1188 if (RExC_seen & REG_VERBARG_SEEN) \
88f063b4 1189 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
538e84ed 1190 \
e384d5c1 1191 if (RExC_seen & REG_CUTGROUP_SEEN) \
88f063b4 1192 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
538e84ed 1193 \
e384d5c1 1194 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
88f063b4 1195 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
538e84ed 1196 \
e384d5c1 1197 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
88f063b4 1198 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
538e84ed 1199 \
7b031478 1200 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
88f063b4 1201 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
ee273784 1202 \
88f063b4 1203 Perl_re_printf( aTHX_ "\n"); \
9e9ecfdf
YO
1204 });
1205
fdfb4f21 1206#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
6ad9a8ab 1207 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
fdfb4f21 1208
fdfb4f21 1209
f5a36d78
DM
1210#ifdef DEBUGGING
1211static void
1212S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1213 const char *close_str)
1214{
1215 if (!flags)
1216 return;
1217
1218 Perl_re_printf( aTHX_ "%s", open_str);
11683ecb
DM
1219 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1220 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
f5a36d78
DM
1221 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1222 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1223 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1224 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1225 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1226 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1227 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1228 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1229 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1230 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1231 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1232 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1233 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1234 Perl_re_printf( aTHX_ "%s", close_str);
1235}
1236
1237
1238static void
1239S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1240 U32 depth, int is_inf)
1241{
271b36b1 1242 DECLARE_AND_GET_RE_DEBUG_FLAGS;
f5a36d78
DM
1243
1244 DEBUG_OPTIMISE_MORE_r({
1245 if (!data)
1246 return;
1247 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1248 depth,
1249 where,
1250 (IV)data->pos_min,
1251 (IV)data->pos_delta,
1252 (UV)data->flags
1253 );
1254
1255 S_debug_show_study_flags(aTHX_ data->flags," [","]");
fdfb4f21 1256
f5a36d78
DM
1257 Perl_re_printf( aTHX_
1258 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1259 (IV)data->whilem_c,
1260 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1261 is_inf ? "INF " : ""
1262 );
1263
11683ecb 1264 if (data->last_found) {
37b6262f 1265 int i;
11683ecb
DM
1266 Perl_re_printf(aTHX_
1267 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1268 SvPVX_const(data->last_found),
1269 (IV)data->last_end,
1270 (IV)data->last_start_min,
1271 (IV)data->last_start_max
1272 );
1273
2099df82 1274 for (i = 0; i < 2; i++) {
37b6262f
DM
1275 Perl_re_printf(aTHX_
1276 " %s%s: '%s' @ %" IVdf "/%" IVdf,
2099df82
DM
1277 data->cur_is_floating == i ? "*" : "",
1278 i ? "Float" : "Fixed",
37b6262f
DM
1279 SvPVX_const(data->substrs[i].str),
1280 (IV)data->substrs[i].min_offset,
1281 (IV)data->substrs[i].max_offset
1282 );
1283 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1284 }
11683ecb 1285 }
f5a36d78
DM
1286
1287 Perl_re_printf( aTHX_ "\n");
1288 });
1289}
1290
1291
1292static void
1293S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1294 regnode *scan, U32 depth, U32 flags)
1295{
271b36b1 1296 DECLARE_AND_GET_RE_DEBUG_FLAGS;
f5a36d78
DM
1297
1298 DEBUG_OPTIMISE_r({
1299 regnode *Next;
1300
1301 if (!scan)
1302 return;
1303 Next = regnext(scan);
1304 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1305 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1306 depth,
1307 str,
1308 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1309 Next ? (REG_NODE_NUM(Next)) : 0 );
1310 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1311 Perl_re_printf( aTHX_ "\n");
1312 });
1313}
1314
1315
1316# define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1317 S_debug_studydata(aTHX_ where, data, depth, is_inf)
1318
1319# define DEBUG_PEEP(str, scan, depth, flags) \
1320 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1321
1322#else
1323# define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1324# define DEBUG_PEEP(str, scan, depth, flags) NOOP
1325#endif
1de06328 1326
cb41e5d6 1327
c6871b76
KW
1328/* =========================================================
1329 * BEGIN edit_distance stuff.
1330 *
1331 * This calculates how many single character changes of any type are needed to
1332 * transform a string into another one. It is taken from version 3.1 of
1333 *
1334 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1335 */
1336
1337/* Our unsorted dictionary linked list. */
1338/* Note we use UVs, not chars. */
1339
1340struct dictionary{
1341 UV key;
1342 UV value;
1343 struct dictionary* next;
1344};
1345typedef struct dictionary item;
1346
1347
1348PERL_STATIC_INLINE item*
88f063b4 1349push(UV key, item* curr)
c6871b76
KW
1350{
1351 item* head;
d09f14bf 1352 Newx(head, 1, item);
c6871b76
KW
1353 head->key = key;
1354 head->value = 0;
1355 head->next = curr;
1356 return head;
1357}
1358
1359
1360PERL_STATIC_INLINE item*
1361find(item* head, UV key)
1362{
1363 item* iterator = head;
1364 while (iterator){
1365 if (iterator->key == key){
1366 return iterator;
1367 }
1368 iterator = iterator->next;
1369 }
1370
1371 return NULL;
1372}
1373
1374PERL_STATIC_INLINE item*
88f063b4 1375uniquePush(item* head, UV key)
c6871b76
KW
1376{
1377 item* iterator = head;
1378
1379 while (iterator){
1380 if (iterator->key == key) {
1381 return head;
1382 }
1383 iterator = iterator->next;
1384 }
1385
88f063b4 1386 return push(key, head);
c6871b76
KW
1387}
1388
1389PERL_STATIC_INLINE void
1390dict_free(item* head)
1391{
1392 item* iterator = head;
1393
1394 while (iterator) {
1395 item* temp = iterator;
1396 iterator = iterator->next;
1397 Safefree(temp);
1398 }
1399
1400 head = NULL;
1401}
1402
1403/* End of Dictionary Stuff */
1404
1405/* All calculations/work are done here */
1406STATIC int
1407S_edit_distance(const UV* src,
1408 const UV* tgt,
1409 const STRLEN x, /* length of src[] */
1410 const STRLEN y, /* length of tgt[] */
1411 const SSize_t maxDistance
1412)
1413{
1414 item *head = NULL;
88f063b4 1415 UV swapCount, swapScore, targetCharCount, i, j;
c6871b76
KW
1416 UV *scores;
1417 UV score_ceil = x + y;
1418
1419 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1420
1421 /* intialize matrix start values */
d09f14bf 1422 Newx(scores, ( (x + 2) * (y + 2)), UV);
c6871b76
KW
1423 scores[0] = score_ceil;
1424 scores[1 * (y + 2) + 0] = score_ceil;
1425 scores[0 * (y + 2) + 1] = score_ceil;
1426 scores[1 * (y + 2) + 1] = 0;
88f063b4 1427 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
c6871b76
KW
1428
1429 /* work loops */
1430 /* i = src index */
1431 /* j = tgt index */
1432 for (i=1;i<=x;i++) {
1433 if (i < x)
88f063b4 1434 head = uniquePush(head, src[i]);
c6871b76
KW
1435 scores[(i+1) * (y + 2) + 1] = i;
1436 scores[(i+1) * (y + 2) + 0] = score_ceil;
1437 swapCount = 0;
1438
1439 for (j=1;j<=y;j++) {
1440 if (i == 1) {
1441 if(j < y)
88f063b4 1442 head = uniquePush(head, tgt[j]);
c6871b76
KW
1443 scores[1 * (y + 2) + (j + 1)] = j;
1444 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1445 }
1446
88f063b4 1447 targetCharCount = find(head, tgt[j-1])->value;
c6871b76
KW
1448 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1449
1450 if (src[i-1] != tgt[j-1]){
1451 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1452 }
1453 else {
1454 swapCount = j;
1455 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1456 }
1457 }
1458
88f063b4 1459 find(head, src[i-1])->value = i;
c6871b76
KW
1460 }
1461
1462 {
1463 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1464 dict_free(head);
1465 Safefree(scores);
1466 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1467 }
1468}
1469
1470/* END of edit_distance() stuff
1471 * ========================================================= */
1472
653099ff 1473/* Mark that we cannot extend a found fixed substring at this point.
37b6262f 1474 Update the longest found anchored substring or the longest found
653099ff
GS
1475 floating substrings if needed. */
1476
4327152a 1477STATIC void
ea3daa5d
FC
1478S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1479 SSize_t *minlenp, int is_inf)
c277df42 1480{
e1ec3a88 1481 const STRLEN l = CHR_SVLEN(data->last_found);
2099df82 1482 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
a2ca2017 1483 const STRLEN old_l = CHR_SVLEN(longest_sv);
271b36b1 1484 DECLARE_AND_GET_RE_DEBUG_FLAGS;
b81d288d 1485
7918f24d
NC
1486 PERL_ARGS_ASSERT_SCAN_COMMIT;
1487
c277df42 1488 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
2099df82 1489 const U8 i = data->cur_is_floating;
a2ca2017 1490 SvSetMagicSV(longest_sv, data->last_found);
37b6262f
DM
1491 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1492
2099df82
DM
1493 if (!i) /* fixed */
1494 data->substrs[0].max_offset = data->substrs[0].min_offset;
1495 else { /* float */
f6231ebf
KW
1496 data->substrs[1].max_offset =
1497 (is_inf)
1498 ? OPTIMIZE_INFTY
1499 : (l
646e8787 1500 ? data->last_start_max
0069caf1
KW
1501 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
1502 ? OPTIMIZE_INFTY
ea3daa5d 1503 : data->pos_min + data->pos_delta));
37b6262f 1504 }
11683ecb 1505
5de22a40
HS
1506 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1507 data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
37b6262f
DM
1508 data->substrs[i].minlenp = minlenp;
1509 data->substrs[i].lookbehind = 0;
c277df42 1510 }
37b6262f 1511
c277df42 1512 SvCUR_set(data->last_found, 0);
0eda9292 1513 {
a28509cc 1514 SV * const sv = data->last_found;
097eb12c
AL
1515 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1516 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1517 if (mg)
1518 mg->mg_len = 0;
1519 }
0eda9292 1520 }
c277df42
IZ
1521 data->last_end = -1;
1522 data->flags &= ~SF_BEFORE_EOL;
f5a36d78 1523 DEBUG_STUDYDATA("commit", data, 0, is_inf);
c277df42
IZ
1524}
1525
cdd87c1d
KW
1526/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1527 * list that describes which code points it matches */
1528
653099ff 1529STATIC void
3420edd7 1530S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 1531{
cdd87c1d
KW
1532 /* Set the SSC 'ssc' to match an empty string or any code point */
1533
557bd3fb 1534 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 1535
71068078 1536 assert(is_ANYOF_SYNTHETIC(ssc));
3fffb88a 1537
0854ea0b
KW
1538 /* mortalize so won't leak */
1539 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
93e92956 1540 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
653099ff
GS
1541}
1542
653099ff 1543STATIC int
dc3bf405 1544S_ssc_is_anything(const regnode_ssc *ssc)
653099ff 1545{
c144baaa
KW
1546 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1547 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1548 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1549 * in any way, so there's no point in using it */
cdd87c1d
KW
1550
1551 UV start, end;
1552 bool ret;
653099ff 1553
557bd3fb 1554 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 1555
71068078 1556 assert(is_ANYOF_SYNTHETIC(ssc));
cdd87c1d 1557
93e92956 1558 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
cdd87c1d
KW
1559 return FALSE;
1560 }
1561
1562 /* See if the list consists solely of the range 0 - Infinity */
1563 invlist_iterinit(ssc->invlist);
1564 ret = invlist_iternext(ssc->invlist, &start, &end)
1565 && start == 0
1566 && end == UV_MAX;
1567
1568 invlist_iterfinish(ssc->invlist);
1569
1570 if (ret) {
1571 return TRUE;
1572 }
1573
1574 /* If e.g., both \w and \W are set, matches everything */
e0e1be5f 1575 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1576 int i;
1577 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1578 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1579 return TRUE;
1580 }
1581 }
1582 }
1583
1584 return FALSE;
653099ff
GS
1585}
1586
653099ff 1587STATIC void
cdd87c1d 1588S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 1589{
cdd87c1d
KW
1590 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1591 * string, any code point, or any posix class under locale */
1592
557bd3fb 1593 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 1594
557bd3fb 1595 Zero(ssc, 1, regnode_ssc);
71068078 1596 set_ANYOF_SYNTHETIC(ssc);
93e92956 1597 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
3420edd7 1598 ssc_anything(ssc);
cdd87c1d 1599
2f306ab9
KW
1600 /* If any portion of the regex is to operate under locale rules that aren't
1601 * fully known at compile time, initialization includes it. The reason
1602 * this isn't done for all regexes is that the optimizer was written under
1603 * the assumption that locale was all-or-nothing. Given the complexity and
1604 * lack of documentation in the optimizer, and that there are inadequate
1605 * test cases for locale, many parts of it may not work properly, it is
1606 * safest to avoid locale unless necessary. */
cdd87c1d
KW
1607 if (RExC_contains_locale) {
1608 ANYOF_POSIXL_SETALL(ssc);
cdd87c1d
KW
1609 }
1610 else {
1611 ANYOF_POSIXL_ZERO(ssc);
1612 }
653099ff
GS
1613}
1614
b423522f 1615STATIC int
dc3bf405
BF
1616S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1617 const regnode_ssc *ssc)
b423522f
KW
1618{
1619 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1620 * to the list of code points matched, and locale posix classes; hence does
1621 * not check its flags) */
1622
1623 UV start, end;
1624 bool ret;
1625
1626 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1627
71068078 1628 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1629
1630 invlist_iterinit(ssc->invlist);
1631 ret = invlist_iternext(ssc->invlist, &start, &end)
1632 && start == 0
1633 && end == UV_MAX;
1634
1635 invlist_iterfinish(ssc->invlist);
1636
1637 if (! ret) {
1638 return FALSE;
1639 }
1640
e0e1be5f 1641 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
31f05a37 1642 return FALSE;
b423522f
KW
1643 }
1644
1645 return TRUE;
1646}
1647
4ebed06a
KW
1648#define INVLIST_INDEX 0
1649#define ONLY_LOCALE_MATCHES_INDEX 1
1650#define DEFERRED_USER_DEFINED_INDEX 2
1651
b423522f
KW
1652STATIC SV*
1653S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
5c0f85ef 1654 const regnode_charclass* const node)
b423522f
KW
1655{
1656 /* Returns a mortal inversion list defining which code points are matched
1657 * by 'node', which is of type ANYOF. Handles complementing the result if
1658 * appropriate. If some code points aren't knowable at this time, the
31f05a37
KW
1659 * returned list must, and will, contain every code point that is a
1660 * possibility. */
b423522f 1661
1565c085 1662 dVAR;
e2506fa7 1663 SV* invlist = NULL;
1ee208c4 1664 SV* only_utf8_locale_invlist = NULL;
b423522f
KW
1665 unsigned int i;
1666 const U32 n = ARG(node);
31f05a37 1667 bool new_node_has_latin1 = FALSE;
2d5613be 1668 const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb))
29a889ef
KW
1669 ? 0
1670 : ANYOF_FLAGS(node);
b423522f
KW
1671
1672 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1673
1674 /* Look at the data structure created by S_set_ANYOF_arg() */
93e92956 1675 if (n != ANYOF_ONLY_HAS_BITMAP) {
b423522f
KW
1676 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1677 AV * const av = MUTABLE_AV(SvRV(rv));
1678 SV **const ary = AvARRAY(av);
1679 assert(RExC_rxi->data->what[n] == 's');
1680
4ebed06a 1681 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
b423522f 1682
6ed02bb6
KW
1683 /* Here there are things that won't be known until runtime -- we
1684 * have to assume it could be anything */
e2506fa7 1685 invlist = sv_2mortal(_new_invlist(1));
b423522f
KW
1686 return _add_range_to_invlist(invlist, 0, UV_MAX);
1687 }
4ebed06a 1688 else if (ary[INVLIST_INDEX]) {
b423522f 1689
6ed02bb6 1690 /* Use the node's inversion list */
4ebed06a 1691 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1ee208c4
KW
1692 }
1693
1694 /* Get the code points valid only under UTF-8 locales */
766d6d33 1695 if ( (flags & ANYOFL_FOLD)
4ebed06a 1696 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1ee208c4 1697 {
4ebed06a 1698 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
b423522f
KW
1699 }
1700 }
1701
e2506fa7
KW
1702 if (! invlist) {
1703 invlist = sv_2mortal(_new_invlist(0));
1704 }
1705
dcb20b36
KW
1706 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1707 * code points, and an inversion list for the others, but if there are code
1708 * points that should match only conditionally on the target string being
1709 * UTF-8, those are placed in the inversion list, and not the bitmap.
1710 * Since there are circumstances under which they could match, they are
1711 * included in the SSC. But if the ANYOF node is to be inverted, we have
1712 * to exclude them here, so that when we invert below, the end result
1713 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1714 * have to do this here before we add the unconditionally matched code
1715 * points */
766d6d33 1716 if (flags & ANYOF_INVERT) {
b423522f
KW
1717 _invlist_intersection_complement_2nd(invlist,
1718 PL_UpperLatin1,
1719 &invlist);
1720 }
1721
1722 /* Add in the points from the bit map */
2d5613be 1723 if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) {
90973738
KW
1724 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1725 if (ANYOF_BITMAP_TEST(node, i)) {
1726 unsigned int start = i++;
1727
1728 for (; i < NUM_ANYOF_CODE_POINTS
1729 && ANYOF_BITMAP_TEST(node, i); ++i)
1730 {
1731 /* empty */
1732 }
1733 invlist = _add_range_to_invlist(invlist, start, i-1);
1734 new_node_has_latin1 = TRUE;
6f8848d5 1735 }
b423522f
KW
1736 }
1737 }
1738
1739 /* If this can match all upper Latin1 code points, have to add them
ac33c516
KW
1740 * as well. But don't add them if inverting, as when that gets done below,
1741 * it would exclude all these characters, including the ones it shouldn't
1742 * that were added just above */
766d6d33
KW
1743 if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
1744 && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
f240c685 1745 {
b423522f
KW
1746 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1747 }
1748
1749 /* Similarly for these */
766d6d33 1750 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
e0a1ff7a 1751 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
b423522f
KW
1752 }
1753
766d6d33 1754 if (flags & ANYOF_INVERT) {
b423522f
KW
1755 _invlist_invert(invlist);
1756 }
766d6d33 1757 else if (flags & ANYOFL_FOLD) {
35b8412f 1758 if (new_node_has_latin1) {
31f05a37 1759
26be5fe6
KW
1760 /* Under /li, any 0-255 could fold to any other 0-255, depending on
1761 * the locale. We can skip this if there are no 0-255 at all. */
1762 _invlist_union(invlist, PL_Latin1, &invlist);
35b8412f
KW
1763
1764 invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1765 invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1766 }
1767 else {
1768 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1769 invlist = add_cp_to_invlist(invlist, 'I');
1770 }
1771 if (_invlist_contains_cp(invlist,
1772 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1773 {
1774 invlist = add_cp_to_invlist(invlist, 'i');
1775 }
1776 }
31f05a37
KW
1777 }
1778
1ee208c4
KW
1779 /* Similarly add the UTF-8 locale possible matches. These have to be
1780 * deferred until after the non-UTF-8 locale ones are taken care of just
1781 * above, or it leads to wrong results under ANYOF_INVERT */
1782 if (only_utf8_locale_invlist) {
31f05a37 1783 _invlist_union_maybe_complement_2nd(invlist,
1ee208c4 1784 only_utf8_locale_invlist,
766d6d33 1785 flags & ANYOF_INVERT,
31f05a37
KW
1786 &invlist);
1787 }
b423522f
KW
1788
1789 return invlist;
1790}
1791
1051e1c4 1792/* These two functions currently do the exact same thing */
557bd3fb 1793#define ssc_init_zero ssc_init
653099ff 1794
cdd87c1d
KW
1795#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1796#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1797
557bd3fb 1798/* 'AND' a given class with another one. Can create false positives. 'ssc'
93e92956
KW
1799 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1800 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1801
653099ff 1802STATIC void
b423522f 1803S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1804 const regnode_charclass *and_with)
653099ff 1805{
cdd87c1d
KW
1806 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1807 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1808
a0dd4231 1809 SV* anded_cp_list;
2d5613be 1810 U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb)
29a889ef
KW
1811 ? 0
1812 : ANYOF_FLAGS(and_with);
a0dd4231 1813 U8 anded_flags;
1e6ade67 1814
cdd87c1d 1815 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1816
71068078 1817 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1818
1819 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1820 * the code point inversion list and just the relevant flags */
71068078 1821 if (is_ANYOF_SYNTHETIC(and_with)) {
7dcac5f6 1822 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
765e6ecf 1823 anded_flags = and_with_flags;
e9b08962
KW
1824
1825 /* XXX This is a kludge around what appears to be deficiencies in the
1826 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1827 * there are paths through the optimizer where it doesn't get weeded
1828 * out when it should. And if we don't make some extra provision for
1829 * it like the code just below, it doesn't get added when it should.
1830 * This solution is to add it only when AND'ing, which is here, and
1831 * only when what is being AND'ed is the pristine, original node
1832 * matching anything. Thus it is like adding it to ssc_anything() but
1833 * only when the result is to be AND'ed. Probably the same solution
1834 * could be adopted for the same problem we have with /l matching,
1835 * which is solved differently in S_ssc_init(), and that would lead to
1836 * fewer false positives than that solution has. But if this solution
1837 * creates bugs, the consequences are only that a warning isn't raised
1838 * that should be; while the consequences for having /l bugs is
1839 * incorrect matches */
7dcac5f6 1840 if (ssc_is_anything((regnode_ssc *)and_with)) {
f240c685 1841 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
e9b08962 1842 }
a0dd4231
KW
1843 }
1844 else {
5c0f85ef 1845 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
f240c685 1846 if (OP(and_with) == ANYOFD) {
765e6ecf 1847 anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
f240c685
KW
1848 }
1849 else {
765e6ecf 1850 anded_flags = and_with_flags
f240c685 1851 &( ANYOF_COMMON_FLAGS
108316fb
KW
1852 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1853 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
765e6ecf 1854 if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
d1c40ef5
KW
1855 anded_flags &=
1856 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1857 }
f240c685 1858 }
a0dd4231
KW
1859 }
1860
1861 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1862
1863 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1864 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1865 * 'and_with' may be inverted. When not inverted, we have the situation of
1866 * computing:
1867 * (C1 | P1) & (C2 | P2)
1868 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1869 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1870 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1871 * <= ((C1 & C2) | P1 | P2)
1872 * Alternatively, the last few steps could be:
1873 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1874 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1875 * <= (C1 | C2 | (P1 & P2))
1876 * We favor the second approach if either P1 or P2 is non-empty. This is
1877 * because these components are a barrier to doing optimizations, as what
1878 * they match cannot be known until the moment of matching as they are
1879 * dependent on the current locale, 'AND"ing them likely will reduce or
1880 * eliminate them.
1881 * But we can do better if we know that C1,P1 are in their initial state (a
1882 * frequent occurrence), each matching everything:
1883 * (<everything>) & (C2 | P2) = C2 | P2
1884 * Similarly, if C2,P2 are in their initial state (again a frequent
1885 * occurrence), the result is a no-op
1886 * (C1 | P1) & (<everything>) = C1 | P1
1887 *
1888 * Inverted, we have
1889 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1890 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1891 * <= (C1 & ~C2) | (P1 & ~P2)
1892 * */
1aa99e6b 1893
765e6ecf 1894 if ((and_with_flags & ANYOF_INVERT)
71068078 1895 && ! is_ANYOF_SYNTHETIC(and_with))
a0dd4231 1896 {
cdd87c1d 1897 unsigned int i;
8951c461 1898
cdd87c1d
KW
1899 ssc_intersection(ssc,
1900 anded_cp_list,
1901 FALSE /* Has already been inverted */
1902 );
c6b76537 1903
cdd87c1d
KW
1904 /* If either P1 or P2 is empty, the intersection will be also; can skip
1905 * the loop */
765e6ecf 1906 if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
cdd87c1d
KW
1907 ANYOF_POSIXL_ZERO(ssc);
1908 }
e0e1be5f 1909 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1910
1911 /* Note that the Posix class component P from 'and_with' actually
1912 * looks like:
1913 * P = Pa | Pb | ... | Pn
1914 * where each component is one posix class, such as in [\w\s].
1915 * Thus
1916 * ~P = ~(Pa | Pb | ... | Pn)
1917 * = ~Pa & ~Pb & ... & ~Pn
1918 * <= ~Pa | ~Pb | ... | ~Pn
1919 * The last is something we can easily calculate, but unfortunately
1920 * is likely to have many false positives. We could do better
1921 * in some (but certainly not all) instances if two classes in
1922 * P have known relationships. For example
1923 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1924 * So
1925 * :lower: & :print: = :lower:
1926 * And similarly for classes that must be disjoint. For example,
1927 * since \s and \w can have no elements in common based on rules in
1928 * the POSIX standard,
1929 * \w & ^\S = nothing
1930 * Unfortunately, some vendor locales do not meet the Posix
1931 * standard, in particular almost everything by Microsoft.
1932 * The loop below just changes e.g., \w into \W and vice versa */
1933
1ee208c4 1934 regnode_charclass_posixl temp;
cdd87c1d
KW
1935 int add = 1; /* To calculate the index of the complement */
1936
b1234259 1937 Zero(&temp, 1, regnode_charclass_posixl);
cdd87c1d
KW
1938 ANYOF_POSIXL_ZERO(&temp);
1939 for (i = 0; i < ANYOF_MAX; i++) {
1940 assert(i % 2 != 0
7dcac5f6
KW
1941 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1942 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
cdd87c1d 1943
7dcac5f6 1944 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
cdd87c1d
KW
1945 ANYOF_POSIXL_SET(&temp, i + add);
1946 }
1947 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1948 }
1949 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1950
cdd87c1d
KW
1951 } /* else ssc already has no posixes */
1952 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1953 in its initial state */
71068078 1954 else if (! is_ANYOF_SYNTHETIC(and_with)
7dcac5f6 1955 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
cdd87c1d
KW
1956 {
1957 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1958 * copy it over 'ssc' */
1959 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
71068078 1960 if (is_ANYOF_SYNTHETIC(and_with)) {
cdd87c1d
KW
1961 StructCopy(and_with, ssc, regnode_ssc);
1962 }
1963 else {
1964 ssc->invlist = anded_cp_list;
1965 ANYOF_POSIXL_ZERO(ssc);
765e6ecf 1966 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1967 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
cdd87c1d
KW
1968 }
1969 }
1970 }
e0e1be5f 1971 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
765e6ecf 1972 || (and_with_flags & ANYOF_MATCHES_POSIXL))
cdd87c1d
KW
1973 {
1974 /* One or the other of P1, P2 is non-empty. */
765e6ecf 1975 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1ea8b7fe
KW
1976 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1977 }
cdd87c1d
KW
1978 ssc_union(ssc, anded_cp_list, FALSE);
1979 }
1980 else { /* P1 = P2 = empty */
1981 ssc_intersection(ssc, anded_cp_list, FALSE);
1982 }
137165a6 1983 }
653099ff
GS
1984}
1985
653099ff 1986STATIC void
cdd87c1d 1987S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1988 const regnode_charclass *or_with)
653099ff 1989{
cdd87c1d
KW
1990 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1991 * another SSC or a regular ANYOF class. Can create false positives if
1992 * 'or_with' is to be inverted. */
7918f24d 1993
a0dd4231
KW
1994 SV* ored_cp_list;
1995 U8 ored_flags;
2d5613be 1996 U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb)
29a889ef
KW
1997 ? 0
1998 : ANYOF_FLAGS(or_with);
c6b76537 1999
cdd87c1d 2000 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 2001
71068078 2002 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
2003
2004 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
2005 * the code point inversion list and just the relevant flags */
71068078 2006 if (is_ANYOF_SYNTHETIC(or_with)) {
7dcac5f6 2007 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
765e6ecf 2008 ored_flags = or_with_flags;
a0dd4231
KW
2009 }
2010 else {
5c0f85ef 2011 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
765e6ecf 2012 ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
f240c685
KW
2013 if (OP(or_with) != ANYOFD) {
2014 ored_flags
765e6ecf 2015 |= or_with_flags
108316fb
KW
2016 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2017 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
765e6ecf 2018 if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
d1c40ef5
KW
2019 ored_flags |=
2020 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
2021 }
f240c685 2022 }
a0dd4231
KW
2023 }
2024
2025 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
2026
2027 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
2028 * C2 is the list of code points in 'or-with'; P2, its posix classes.
2029 * 'or_with' may be inverted. When not inverted, we have the simple
2030 * situation of computing:
2031 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
2032 * If P1|P2 yields a situation with both a class and its complement are
2033 * set, like having both \w and \W, this matches all code points, and we
2034 * can delete these from the P component of the ssc going forward. XXX We
2035 * might be able to delete all the P components, but I (khw) am not certain
2036 * about this, and it is better to be safe.
2037 *
2038 * Inverted, we have
2039 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
2040 * <= (C1 | P1) | ~C2
2041 * <= (C1 | ~C2) | P1
2042 * (which results in actually simpler code than the non-inverted case)
2043 * */
9826f543 2044
765e6ecf 2045 if ((or_with_flags & ANYOF_INVERT)
71068078 2046 && ! is_ANYOF_SYNTHETIC(or_with))
a0dd4231 2047 {
cdd87c1d 2048 /* We ignore P2, leaving P1 going forward */
1ea8b7fe 2049 } /* else Not inverted */
765e6ecf 2050 else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
7dcac5f6 2051 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
e0e1be5f 2052 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
2053 unsigned int i;
2054 for (i = 0; i < ANYOF_MAX; i += 2) {
2055 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
2056 {
2057 ssc_match_all_cp(ssc);
2058 ANYOF_POSIXL_CLEAR(ssc, i);
2059 ANYOF_POSIXL_CLEAR(ssc, i+1);
cdd87c1d
KW
2060 }
2061 }
2062 }
1aa99e6b 2063 }
cdd87c1d
KW
2064
2065 ssc_union(ssc,
2066 ored_cp_list,
2067 FALSE /* Already has been inverted */
2068 );
653099ff
GS
2069}
2070
19bb2455 2071STATIC void
b423522f
KW
2072S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
2073{
2074 PERL_ARGS_ASSERT_SSC_UNION;
2075
71068078 2076 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2077
2078 _invlist_union_maybe_complement_2nd(ssc->invlist,
2079 invlist,
2080 invert2nd,
2081 &ssc->invlist);
2082}
2083
19bb2455 2084STATIC void
b423522f
KW
2085S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2086 SV* const invlist,
2087 const bool invert2nd)
2088{
2089 PERL_ARGS_ASSERT_SSC_INTERSECTION;
2090
71068078 2091 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2092
2093 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2094 invlist,
2095 invert2nd,
2096 &ssc->invlist);
2097}
2098
19bb2455 2099STATIC void
b423522f
KW
2100S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2101{
2102 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2103
71068078 2104 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2105
2106 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2107}
2108
19bb2455 2109STATIC void
b423522f
KW
2110S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2111{
2112 /* AND just the single code point 'cp' into the SSC 'ssc' */
2113
2114 SV* cp_list = _new_invlist(2);
2115
2116 PERL_ARGS_ASSERT_SSC_CP_AND;
2117
71068078 2118 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2119
2120 cp_list = add_cp_to_invlist(cp_list, cp);
2121 ssc_intersection(ssc, cp_list,
2122 FALSE /* Not inverted */
2123 );
2124 SvREFCNT_dec_NN(cp_list);
2125}
2126
19bb2455 2127STATIC void
dc3bf405 2128S_ssc_clear_locale(regnode_ssc *ssc)
b423522f
KW
2129{
2130 /* Set the SSC 'ssc' to not match any locale things */
b423522f
KW
2131 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2132
71068078 2133 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2134
2135 ANYOF_POSIXL_ZERO(ssc);
2136 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2137}
2138
b35552de
KW
2139#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2140
2141STATIC bool
2142S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2143{
2144 /* The synthetic start class is used to hopefully quickly winnow down
2145 * places where a pattern could start a match in the target string. If it
2146 * doesn't really narrow things down that much, there isn't much point to
2147 * having the overhead of using it. This function uses some very crude
2148 * heuristics to decide if to use the ssc or not.
2149 *
2150 * It returns TRUE if 'ssc' rules out more than half what it considers to
2151 * be the "likely" possible matches, but of course it doesn't know what the
2152 * actual things being matched are going to be; these are only guesses
2153 *
2154 * For /l matches, it assumes that the only likely matches are going to be
2155 * in the 0-255 range, uniformly distributed, so half of that is 127
2156 * For /a and /d matches, it assumes that the likely matches will be just
2157 * the ASCII range, so half of that is 63
2158 * For /u and there isn't anything matching above the Latin1 range, it
2159 * assumes that that is the only range likely to be matched, and uses
2160 * half that as the cut-off: 127. If anything matches above Latin1,
2161 * it assumes that all of Unicode could match (uniformly), except for
2162 * non-Unicode code points and things in the General Category "Other"
2163 * (unassigned, private use, surrogates, controls and formats). This
2164 * is a much large number. */
2165
b35552de
KW
2166 U32 count = 0; /* Running total of number of code points matched by
2167 'ssc' */
2168 UV start, end; /* Start and end points of current range in inversion
26be5fe6 2169 XXX outdated. UTF-8 locales are common, what about invert? list */
72400949
KW
2170 const U32 max_code_points = (LOC)
2171 ? 256
f6e8f31e 2172 : (( ! UNI_SEMANTICS
1a26cbcb 2173 || invlist_highest(ssc->invlist) < 256)
72400949
KW
2174 ? 128
2175 : NON_OTHER_COUNT);
2176 const U32 max_match = max_code_points / 2;
b35552de
KW
2177
2178 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2179
2180 invlist_iterinit(ssc->invlist);
2181 while (invlist_iternext(ssc->invlist, &start, &end)) {
72400949
KW
2182 if (start >= max_code_points) {
2183 break;
b35552de 2184 }
72400949 2185 end = MIN(end, max_code_points - 1);
b35552de 2186 count += end - start + 1;
72400949 2187 if (count >= max_match) {
b35552de
KW
2188 invlist_iterfinish(ssc->invlist);
2189 return FALSE;
2190 }
2191 }
2192
2193 return TRUE;
2194}
2195
2196
b423522f
KW
2197STATIC void
2198S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2199{
2200 /* The inversion list in the SSC is marked mortal; now we need a more
2201 * permanent copy, which is stored the same way that is done in a regular
dcb20b36
KW
2202 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2203 * map */
b423522f 2204
28118b9c 2205 SV* invlist = invlist_clone(ssc->invlist, NULL);
b423522f
KW
2206
2207 PERL_ARGS_ASSERT_SSC_FINALIZE;
2208
71068078 2209 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f 2210
a0dd4231 2211 /* The code in this file assumes that all but these flags aren't relevant
93e92956
KW
2212 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2213 * by the time we reach here */
f240c685
KW
2214 assert(! (ANYOF_FLAGS(ssc)
2215 & ~( ANYOF_COMMON_FLAGS
108316fb
KW
2216 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2217 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
a0dd4231 2218
b423522f
KW
2219 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2220
4c404f26 2221 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
21c3fd9d 2222 SvREFCNT_dec(invlist);
b423522f 2223
85c8e306
KW
2224 /* Make sure is clone-safe */
2225 ssc->invlist = NULL;
2226
e0e1be5f 2227 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
93e92956 2228 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
d156f5cb 2229 OP(ssc) = ANYOFPOSIXL;
e0e1be5f 2230 }
d156f5cb 2231 else if (RExC_contains_locale) {
b2e90ddf
KW
2232 OP(ssc) = ANYOFL;
2233 }
2234
1462525b 2235 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
b423522f
KW
2236}
2237
a3621e74
YO
2238#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2239#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2240#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
538e84ed
KW
2241#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2242 ? (TRIE_LIST_CUR( idx ) - 1) \
2243 : 0 )
a3621e74 2244
3dab1dad
YO
2245
2246#ifdef DEBUGGING
07be1b83 2247/*
2b8b4781
NC
2248 dump_trie(trie,widecharmap,revcharmap)
2249 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2250 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
2251
2252 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
2253 The _interim_ variants are used for debugging the interim
2254 tables that are used to generate the final compressed
2255 representation which is what dump_trie expects.
2256
486ec47a 2257 Part of the reason for their existence is to provide a form
3dab1dad 2258 of documentation as to how the different representations function.
07be1b83
YO
2259
2260*/
3dab1dad
YO
2261
2262/*
3dab1dad
YO
2263 Dumps the final compressed table form of the trie to Perl_debug_log.
2264 Used for debugging make_trie().
2265*/
b9a59e08 2266
3dab1dad 2267STATIC void
2b8b4781
NC
2268S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2269 AV *revcharmap, U32 depth)
3dab1dad
YO
2270{
2271 U32 state;
ab3bbdeb 2272 SV *sv=sv_newmortal();
55eed653 2273 int colwidth= widecharmap ? 6 : 4;
2e64971a 2274 U16 word;
271b36b1 2275 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3dab1dad 2276
7918f24d 2277 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 2278
6ad9a8ab 2279 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
1e37780e 2280 depth+1, "Match","Base","Ofs" );
3dab1dad
YO
2281
2282 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 2283 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 2284 if ( tmp ) {
6ad9a8ab 2285 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2286 colwidth,
538e84ed 2287 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2288 PL_colors[0], PL_colors[1],
2289 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2290 PERL_PV_ESCAPE_FIRSTCHAR
2291 )
ab3bbdeb 2292 );
3dab1dad
YO
2293 }
2294 }
1e37780e
YO
2295 Perl_re_printf( aTHX_ "\n");
2296 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
3dab1dad
YO
2297
2298 for( state = 0 ; state < trie->uniquecharcount ; state++ )
6ad9a8ab
YO
2299 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2300 Perl_re_printf( aTHX_ "\n");
3dab1dad 2301
1e2e3d02 2302 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 2303 const U32 base = trie->states[ state ].trans.base;
3dab1dad 2304
147e3846 2305 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
3dab1dad
YO
2306
2307 if ( trie->states[ state ].wordnum ) {
1e37780e 2308 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
3dab1dad 2309 } else {
6ad9a8ab 2310 Perl_re_printf( aTHX_ "%6s", "" );
3dab1dad
YO
2311 }
2312
147e3846 2313 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
3dab1dad
YO
2314
2315 if ( base ) {
2316 U32 ofs = 0;
2317
2318 while( ( base + ofs < trie->uniquecharcount ) ||
2319 ( base + ofs - trie->uniquecharcount < trie->lasttrans
538e84ed
KW
2320 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2321 != state))
3dab1dad
YO
2322 ofs++;
2323
147e3846 2324 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
3dab1dad
YO
2325
2326 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
538e84ed
KW
2327 if ( ( base + ofs >= trie->uniquecharcount )
2328 && ( base + ofs - trie->uniquecharcount
2329 < trie->lasttrans )
2330 && trie->trans[ base + ofs
2331 - trie->uniquecharcount ].check == state )
3dab1dad 2332 {
147e3846 2333 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
1e37780e
YO
2334 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2335 );
3dab1dad 2336 } else {
88f063b4 2337 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
3dab1dad
YO
2338 }
2339 }
2340
6ad9a8ab 2341 Perl_re_printf( aTHX_ "]");
3dab1dad
YO
2342
2343 }
6ad9a8ab 2344 Perl_re_printf( aTHX_ "\n" );
3dab1dad 2345 }
6ad9a8ab 2346 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
cb41e5d6 2347 depth);
2e64971a 2348 for (word=1; word <= trie->wordcount; word++) {
6ad9a8ab 2349 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2e64971a
DM
2350 (int)word, (int)(trie->wordinfo[word].prev),
2351 (int)(trie->wordinfo[word].len));
2352 }
6ad9a8ab 2353 Perl_re_printf( aTHX_ "\n" );
538e84ed 2354}
3dab1dad 2355/*
3dab1dad 2356 Dumps a fully constructed but uncompressed trie in list form.
538e84ed 2357 List tries normally only are used for construction when the number of
3dab1dad
YO
2358 possible chars (trie->uniquecharcount) is very high.
2359 Used for debugging make_trie().
2360*/
2361STATIC void
55eed653 2362S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2363 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2364 U32 depth)
3dab1dad
YO
2365{
2366 U32 state;
ab3bbdeb 2367 SV *sv=sv_newmortal();
55eed653 2368 int colwidth= widecharmap ? 6 : 4;
271b36b1 2369 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7918f24d
NC
2370
2371 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2372
3dab1dad 2373 /* print out the table precompression. */
6ad9a8ab 2374 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
cb41e5d6 2375 depth+1 );
6ad9a8ab 2376 Perl_re_indentf( aTHX_ "%s",
cb41e5d6 2377 depth+1, "------:-----+-----------------\n" );
538e84ed 2378
3dab1dad
YO
2379 for( state=1 ; state < next_alloc ; state ++ ) {
2380 U16 charid;
538e84ed 2381
147e3846 2382 Perl_re_indentf( aTHX_ " %4" UVXf " :",
cb41e5d6 2383 depth+1, (UV)state );
3dab1dad 2384 if ( ! trie->states[ state ].wordnum ) {
6ad9a8ab 2385 Perl_re_printf( aTHX_ "%5s| ","");
3dab1dad 2386 } else {
6ad9a8ab 2387 Perl_re_printf( aTHX_ "W%4x| ",
3dab1dad
YO
2388 trie->states[ state ].wordnum
2389 );
2390 }
2391 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
538e84ed 2392 SV ** const tmp = av_fetch( revcharmap,
88f063b4 2393 TRIE_LIST_ITEM(state, charid).forid, 0);
ab3bbdeb 2394 if ( tmp ) {
147e3846 2395 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
ab3bbdeb 2396 colwidth,
538e84ed
KW
2397 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2398 colwidth,
2399 PL_colors[0], PL_colors[1],
2400 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2401 | PERL_PV_ESCAPE_FIRSTCHAR
ab3bbdeb 2402 ) ,
88f063b4
KW
2403 TRIE_LIST_ITEM(state, charid).forid,
2404 (UV)TRIE_LIST_ITEM(state, charid).newstate
1e2e3d02 2405 );
538e84ed 2406 if (!(charid % 10))
6ad9a8ab 2407 Perl_re_printf( aTHX_ "\n%*s| ",
664e119d 2408 (int)((depth * 2) + 14), "");
1e2e3d02 2409 }
ab3bbdeb 2410 }
6ad9a8ab 2411 Perl_re_printf( aTHX_ "\n");
3dab1dad 2412 }
538e84ed 2413}
3dab1dad
YO
2414
2415/*
3dab1dad 2416 Dumps a fully constructed but uncompressed trie in table form.
538e84ed
KW
2417 This is the normal DFA style state transition table, with a few
2418 twists to facilitate compression later.
3dab1dad
YO
2419 Used for debugging make_trie().
2420*/
2421STATIC void
55eed653 2422S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2423 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2424 U32 depth)
3dab1dad
YO
2425{
2426 U32 state;
2427 U16 charid;
ab3bbdeb 2428 SV *sv=sv_newmortal();
55eed653 2429 int colwidth= widecharmap ? 6 : 4;
271b36b1 2430 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7918f24d
NC
2431
2432 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
538e84ed 2433
3dab1dad
YO
2434 /*
2435 print out the table precompression so that we can do a visual check
2436 that they are identical.
2437 */
538e84ed 2438
6ad9a8ab 2439 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
3dab1dad
YO
2440
2441 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 2442 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 2443 if ( tmp ) {
6ad9a8ab 2444 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2445 colwidth,
538e84ed 2446 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2447 PL_colors[0], PL_colors[1],
2448 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2449 PERL_PV_ESCAPE_FIRSTCHAR
2450 )
ab3bbdeb 2451 );
3dab1dad
YO
2452 }
2453 }
2454
4aaafc03
YO
2455 Perl_re_printf( aTHX_ "\n");
2456 Perl_re_indentf( aTHX_ "State+-", depth+1 );
3dab1dad
YO
2457
2458 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
6ad9a8ab 2459 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
3dab1dad
YO
2460 }
2461
6ad9a8ab 2462 Perl_re_printf( aTHX_ "\n" );
3dab1dad
YO
2463
2464 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2465
147e3846 2466 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
cb41e5d6 2467 depth+1,
3dab1dad
YO
2468 (UV)TRIE_NODENUM( state ) );
2469
2470 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
2471 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2472 if (v)
147e3846 2473 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
ab3bbdeb 2474 else
6ad9a8ab 2475 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
3dab1dad
YO
2476 }
2477 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
147e3846 2478 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
538e84ed 2479 (UV)trie->trans[ state ].check );
3dab1dad 2480 } else {
147e3846 2481 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
538e84ed 2482 (UV)trie->trans[ state ].check,
3dab1dad
YO
2483 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2484 }
2485 }
07be1b83 2486}
3dab1dad
YO
2487
2488#endif
2489
2e64971a 2490
786e8c11
YO
2491/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2492 startbranch: the first branch in the whole branch sequence
2493 first : start branch of sequence of branch-exact nodes.
2494 May be the same as startbranch
2495 last : Thing following the last branch.
2496 May be the same as tail.
2497 tail : item following the branch sequence
2498 count : words in the sequence
a4525e78 2499 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
786e8c11 2500 depth : indent depth
3dab1dad 2501
786e8c11 2502Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 2503
786e8c11
YO
2504A trie is an N'ary tree where the branches are determined by digital
2505decomposition of the key. IE, at the root node you look up the 1st character and
2506follow that branch repeat until you find the end of the branches. Nodes can be
2507marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 2508
786e8c11 2509 /he|she|his|hers/
72f13be8 2510
786e8c11
YO
2511would convert into the following structure. Numbers represent states, letters
2512following numbers represent valid transitions on the letter from that state, if
2513the number is in square brackets it represents an accepting state, otherwise it
2514will be in parenthesis.
07be1b83 2515
786e8c11
YO
2516 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2517 | |
2518 | (2)
2519 | |
2520 (1) +-i->(6)-+-s->[7]
2521 |
2522 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 2523
786e8c11
YO
2524 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2525
2526This shows that when matching against the string 'hers' we will begin at state 1
2527read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2528then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2529is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2530single traverse. We store a mapping from accepting to state to which word was
2531matched, and then when we have multiple possibilities we try to complete the
b8fda935 2532rest of the regex in the order in which they occurred in the alternation.
786e8c11
YO
2533
2534The only prior NFA like behaviour that would be changed by the TRIE support is
2535the silent ignoring of duplicate alternations which are of the form:
2536
2537 / (DUPE|DUPE) X? (?{ ... }) Y /x
2538
4b714af6 2539Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 2540and without the optimisation. With the optimisations dupes will be silently
486ec47a 2541ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
2542the following demonstrates:
2543
2544 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2545
2546which prints out 'word' three times, but
2547
2548 'words'=~/(word|word|word)(?{ print $1 })S/
2549
2550which doesnt print it out at all. This is due to other optimisations kicking in.
2551
2552Example of what happens on a structural level:
2553
486ec47a 2554The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
2555
2556 1: CURLYM[1] {1,32767}(18)
2557 5: BRANCH(8)
2558 6: EXACT <ac>(16)
2559 8: BRANCH(11)
2560 9: EXACT <ad>(16)
2561 11: BRANCH(14)
2562 12: EXACT <ab>(16)
2563 16: SUCCEED(0)
2564 17: NOTHING(18)
2565 18: END(0)
2566
2567This would be optimizable with startbranch=5, first=5, last=16, tail=16
2568and should turn into:
2569
2570 1: CURLYM[1] {1,32767}(18)
2571 5: TRIE(16)
2572 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2573 <ac>
2574 <ad>
2575 <ab>
2576 16: SUCCEED(0)
2577 17: NOTHING(18)
2578 18: END(0)
2579
2580Cases where tail != last would be like /(?foo|bar)baz/:
2581
2582 1: BRANCH(4)
2583 2: EXACT <foo>(8)
2584 4: BRANCH(7)
2585 5: EXACT <bar>(8)
2586 7: TAIL(8)
2587 8: EXACT <baz>(10)
2588 10: END(0)
2589
2590which would be optimizable with startbranch=1, first=1, last=7, tail=8
2591and would end up looking like:
2592
2593 1: TRIE(8)
2594 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2595 <foo>
2596 <bar>
2597 7: TAIL(8)
2598 8: EXACT <baz>(10)
2599 10: END(0)
2600
c80e42f3 2601 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
2602
2603is the recommended Unicode-aware way of saying
2604
2605 *(d++) = uv;
2606*/
2607
fab2782b 2608#define TRIE_STORE_REVCHAR(val) \
786e8c11 2609 STMT_START { \
73031816 2610 if (UTF) { \
668fcfea 2611 SV *zlopp = newSV(UTF8_MAXBYTES); \
88c9ea1e 2612 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1d84a256
DM
2613 unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2614 *kapow = '\0'; \
73031816
NC
2615 SvCUR_set(zlopp, kapow - flrbbbbb); \
2616 SvPOK_on(zlopp); \
2617 SvUTF8_on(zlopp); \
2618 av_push(revcharmap, zlopp); \
2619 } else { \
fab2782b 2620 char ooooff = (char)val; \
73031816
NC
2621 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2622 } \
2623 } STMT_END
786e8c11 2624
914a25d5
KW
2625/* This gets the next character from the input, folding it if not already
2626 * folded. */
2627#define TRIE_READ_CHAR STMT_START { \
2628 wordlen++; \
2629 if ( UTF ) { \
2630 /* if it is UTF then it is either already folded, or does not need \
2631 * folding */ \
1c1d615a 2632 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
2633 } \
2634 else if (folder == PL_fold_latin1) { \
7d006b13
KW
2635 /* This folder implies Unicode rules, which in the range expressible \
2636 * by not UTF is the lower case, with the two exceptions, one of \
2637 * which should have been taken care of before calling this */ \
2638 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2639 uvc = toLOWER_L1(*uc); \
2640 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2641 len = 1; \
914a25d5
KW
2642 } else { \
2643 /* raw data, will be folded later if needed */ \
2644 uvc = (U32)*uc; \
2645 len = 1; \
2646 } \
786e8c11
YO
2647} STMT_END
2648
2649
2650
2651#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2652 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
00195859 2653 U32 ging = TRIE_LIST_LEN( state ) * 2; \
f9003953 2654 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
00195859 2655 TRIE_LIST_LEN( state ) = ging; \
786e8c11
YO
2656 } \
2657 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2658 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2659 TRIE_LIST_CUR( state )++; \
2660} STMT_END
07be1b83 2661
786e8c11 2662#define TRIE_LIST_NEW(state) STMT_START { \
d09f14bf 2663 Newx( trie->states[ state ].trans.list, \
786e8c11
YO
2664 4, reg_trie_trans_le ); \
2665 TRIE_LIST_CUR( state ) = 1; \
2666 TRIE_LIST_LEN( state ) = 4; \
2667} STMT_END
07be1b83 2668
786e8c11
YO
2669#define TRIE_HANDLE_WORD(state) STMT_START { \
2670 U16 dupe= trie->states[ state ].wordnum; \
2671 regnode * const noper_next = regnext( noper ); \
2672 \
786e8c11
YO
2673 DEBUG_r({ \
2674 /* store the word for dumping */ \
2675 SV* tmp; \
2676 if (OP(noper) != NOTHING) \
740cce10 2677 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 2678 else \
740cce10 2679 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 2680 av_push( trie_words, tmp ); \
786e8c11
YO
2681 }); \
2682 \
2683 curword++; \
2e64971a
DM
2684 trie->wordinfo[curword].prev = 0; \
2685 trie->wordinfo[curword].len = wordlen; \
2686 trie->wordinfo[curword].accept = state; \
786e8c11
YO
2687 \
2688 if ( noper_next < tail ) { \
2689 if (!trie->jump) \
538e84ed
KW
2690 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2691 sizeof(U16) ); \
7f69552c 2692 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
2693 if (!jumper) \
2694 jumper = noper_next; \
2695 if (!nextbranch) \
2696 nextbranch= regnext(cur); \
2697 } \
2698 \
2699 if ( dupe ) { \
2e64971a
DM
2700 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2701 /* chain, so that when the bits of chain are later */\
2702 /* linked together, the dups appear in the chain */\
2703 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2704 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
2705 } else { \
2706 /* we haven't inserted this word yet. */ \
2707 trie->states[ state ].wordnum = curword; \
2708 } \
2709} STMT_END
07be1b83 2710
3dab1dad 2711
786e8c11
YO
2712#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2713 ( ( base + charid >= ucharcount \
2714 && base + charid < ubound \
2715 && state == trie->trans[ base - ucharcount + charid ].check \
2716 && trie->trans[ base - ucharcount + charid ].next ) \
2717 ? trie->trans[ base - ucharcount + charid ].next \
2718 : ( state==1 ? special : 0 ) \
2719 )
3dab1dad 2720
8bcafbf4
YO
2721#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2722STMT_START { \
2723 TRIE_BITMAP_SET(trie, uvc); \
2724 /* store the folded codepoint */ \
2725 if ( folder ) \
2726 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2727 \
2728 if ( !UTF ) { \
2729 /* store first byte of utf8 representation of */ \
2730 /* variant codepoints */ \
2731 if (! UVCHR_IS_INVARIANT(uvc)) { \
2732 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2733 } \
2734 } \
2735} STMT_END
786e8c11
YO
2736#define MADE_TRIE 1
2737#define MADE_JUMP_TRIE 2
2738#define MADE_EXACT_TRIE 4
3dab1dad 2739
a3621e74 2740STATIC I32
538e84ed
KW
2741S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2742 regnode *first, regnode *last, regnode *tail,
2743 U32 word_count, U32 flags, U32 depth)
a3621e74
YO
2744{
2745 /* first pass, loop through and scan words */
2746 reg_trie_data *trie;
55eed653 2747 HV *widecharmap = NULL;
2b8b4781 2748 AV *revcharmap = newAV();
a3621e74 2749 regnode *cur;
a3621e74
YO
2750 STRLEN len = 0;
2751 UV uvc = 0;
2752 U16 curword = 0;
2753 U32 next_alloc = 0;
786e8c11
YO
2754 regnode *jumper = NULL;
2755 regnode *nextbranch = NULL;
7f69552c 2756 regnode *convert = NULL;
2e64971a 2757 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 2758 /* we just use folder as a flag in utf8 */
1e696034 2759 const U8 * folder = NULL;
a3621e74 2760
3a611511
YO
2761 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2762 * which stands for one trie structure, one hash, optionally followed
2763 * by two arrays */
2b8b4781 2764#ifdef DEBUGGING
3a611511 2765 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2b8b4781
NC
2766 AV *trie_words = NULL;
2767 /* along with revcharmap, this only used during construction but both are
2768 * useful during debugging so we store them in the struct when debugging.
8e11feef 2769 */
2b8b4781 2770#else
cf78de0b 2771 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 2772 STRLEN trie_charcount=0;
3dab1dad 2773#endif
2b8b4781 2774 SV *re_trie_maxbuff;
271b36b1 2775 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7918f24d
NC
2776
2777 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
2778#ifndef DEBUGGING
2779 PERL_UNUSED_ARG(depth);
2780#endif
a3621e74 2781
1e696034 2782 switch (flags) {
3f2416ae 2783 case EXACT: case EXACT_REQ8: case EXACTL: break;
89829bb5 2784 case EXACTFAA:
0ea669f4 2785 case EXACTFUP:
a4525e78
KW
2786 case EXACTFU:
2787 case EXACTFLU8: folder = PL_fold_latin1; break;
1e696034 2788 case EXACTF: folder = PL_fold; break;
fab2782b 2789 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
2790 }
2791
c944940b 2792 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 2793 trie->refcount = 1;
3dab1dad 2794 trie->startstate = 1;
786e8c11 2795 trie->wordcount = word_count;
f8fc2ecf 2796 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 2797 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3f2416ae 2798 if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
c944940b 2799 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
2800 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2801 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2802
a3621e74 2803 DEBUG_r({
2b8b4781 2804 trie_words = newAV();
a3621e74 2805 });
a3621e74 2806
3b89859a 2807 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
316ebaf2 2808 assert(re_trie_maxbuff);
a3621e74 2809 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 2810 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 2811 }
df826430 2812 DEBUG_TRIE_COMPILE_r({
6ad9a8ab 2813 Perl_re_indentf( aTHX_
cb41e5d6
YO
2814 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2815 depth+1,
88f063b4 2816 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
538e84ed 2817 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
3dab1dad 2818 });
538e84ed 2819
7f69552c
YO
2820 /* Find the node we are going to overwrite */
2821 if ( first == startbranch && OP( last ) != BRANCH ) {
2822 /* whole branch chain */
2823 convert = first;
2824 } else {
2825 /* branch sub-chain */
2826 convert = NEXTOPER( first );
2827 }
538e84ed 2828
a3621e74
YO
2829 /* -- First loop and Setup --
2830
2831 We first traverse the branches and scan each word to determine if it
2832 contains widechars, and how many unique chars there are, this is
2833 important as we have to build a table with at least as many columns as we
2834 have unique chars.
2835
2836 We use an array of integers to represent the character codes 0..255
538e84ed
KW
2837 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2838 the native representation of the character value as the key and IV's for
2839 the coded index.
a3621e74
YO
2840
2841 *TODO* If we keep track of how many times each character is used we can
2842 remap the columns so that the table compression later on is more
3b753521 2843 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
2844 middle and the least common are on the outside. IMO this would be better
2845 than a most to least common mapping as theres a decent chance the most
2846 common letter will share a node with the least common, meaning the node
486ec47a 2847 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
2848 case is when we have the least common nodes twice.
2849
2850 */
2851
a3621e74 2852 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 2853 regnode *noper = NEXTOPER( cur );
944e05e3
YO
2854 const U8 *uc;
2855 const U8 *e;
bc031a7d 2856 int foldlen = 0;
07be1b83 2857 U32 wordlen = 0; /* required init */
bc031a7d
KW
2858 STRLEN minchars = 0;
2859 STRLEN maxchars = 0;
538e84ed
KW
2860 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2861 bitmap?*/
a3621e74 2862
3dab1dad 2863 if (OP(noper) == NOTHING) {
20ed8c88
YO
2864 /* skip past a NOTHING at the start of an alternation
2865 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
ca902fb8
YO
2866 *
2867 * If the next node is not something we are supposed to process
2868 * we will just ignore it due to the condition guarding the
2869 * next block.
20ed8c88 2870 */
ca902fb8 2871
df826430 2872 regnode *noper_next= regnext(noper);
944e05e3
YO
2873 if (noper_next < tail)
2874 noper= noper_next;
2875 }
2876
f6b4b99d
KW
2877 if ( noper < tail
2878 && ( OP(noper) == flags
3f2416ae
KW
2879 || (flags == EXACT && OP(noper) == EXACT_REQ8)
2880 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
0ea669f4 2881 || OP(noper) == EXACTFUP))))
f6b4b99d 2882 {
944e05e3
YO
2883 uc= (U8*)STRING(noper);
2884 e= uc + STR_LEN(noper);
2885 } else {
2886 trie->minlen= 0;
2887 continue;
3dab1dad 2888 }
df826430 2889
944e05e3 2890
fab2782b 2891 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2892 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2893 regardless of encoding */
0ea669f4 2894 if (OP( noper ) == EXACTFUP) {
fab2782b 2895 /* false positives are ok, so just set this */
0dc4a61d 2896 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2897 }
2898 }
dca5fc4c 2899
bc031a7d
KW
2900 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2901 branch */
3dab1dad 2902 TRIE_CHARCOUNT(trie)++;
a3621e74 2903 TRIE_READ_CHAR;
645de4ce 2904
bc031a7d
KW
2905 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2906 * is in effect. Under /i, this character can match itself, or
2907 * anything that folds to it. If not under /i, it can match just
2908 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2909 * all fold to k, and all are single characters. But some folds
2910 * expand to more than one character, so for example LATIN SMALL
2911 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2912 * the string beginning at 'uc' is 'ffi', it could be matched by
2913 * three characters, or just by the one ligature character. (It
2914 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2915 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2916 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2917 * match.) The trie needs to know the minimum and maximum number
2918 * of characters that could match so that it can use size alone to
2919 * quickly reject many match attempts. The max is simple: it is
2920 * the number of folded characters in this branch (since a fold is
2921 * never shorter than what folds to it. */
2922
2923 maxchars++;
2924
2925 /* And the min is equal to the max if not under /i (indicated by
2926 * 'folder' being NULL), or there are no multi-character folds. If
2927 * there is a multi-character fold, the min is incremented just
2928 * once, for the character that folds to the sequence. Each
2929 * character in the sequence needs to be added to the list below of
2930 * characters in the trie, but we count only the first towards the
2931 * min number of characters needed. This is done through the
2932 * variable 'foldlen', which is returned by the macros that look
2933 * for these sequences as the number of bytes the sequence
2934 * occupies. Each time through the loop, we decrement 'foldlen' by
2935 * how many bytes the current char occupies. Only when it reaches
2936 * 0 do we increment 'minchars' or look for another multi-character
2937 * sequence. */
2938 if (folder == NULL) {
2939 minchars++;
2940 }
2941 else if (foldlen > 0) {
2942 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
645de4ce
KW
2943 }
2944 else {
bc031a7d
KW
2945 minchars++;
2946
2947 /* See if *uc is the beginning of a multi-character fold. If
2948 * so, we decrement the length remaining to look at, to account
2949 * for the current character this iteration. (We can use 'uc'
2950 * instead of the fold returned by TRIE_READ_CHAR because for
2951 * non-UTF, the latin1_safe macro is smart enough to account
2952 * for all the unfolded characters, and because for UTF, the
2953 * string will already have been folded earlier in the
2954 * compilation process */
2955 if (UTF) {
2956 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2957 foldlen -= UTF8SKIP(uc);
645de4ce
KW
2958 }
2959 }
bc031a7d
KW
2960 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2961 foldlen--;
2962 }
645de4ce 2963 }
bc031a7d
KW
2964
2965 /* The current character (and any potential folds) should be added
2966 * to the possible matching characters for this position in this
2967 * branch */
a3621e74 2968 if ( uvc < 256 ) {
fab2782b
YO
2969 if ( folder ) {
2970 U8 folded= folder[ (U8) uvc ];
2971 if ( !trie->charmap[ folded ] ) {
2972 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2973 TRIE_STORE_REVCHAR( folded );
2974 }
2975 }
a3621e74
YO
2976 if ( !trie->charmap[ uvc ] ) {
2977 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2978 TRIE_STORE_REVCHAR( uvc );
a3621e74 2979 }
02daf0ab 2980 if ( set_bit ) {
62012aee
KW
2981 /* store the codepoint in the bitmap, and its folded
2982 * equivalent. */
8bcafbf4 2983 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
02daf0ab
YO
2984 set_bit = 0; /* We've done our bit :-) */
2985 }
a3621e74 2986 } else {
bc031a7d
KW
2987
2988 /* XXX We could come up with the list of code points that fold
2989 * to this using PL_utf8_foldclosures, except not for
2990 * multi-char folds, as there may be multiple combinations
2991 * there that could work, which needs to wait until runtime to
2992 * resolve (The comment about LIGATURE FFI above is such an
2993 * example */
2994
a3621e74 2995 SV** svpp;
55eed653
NC
2996 if ( !widecharmap )
2997 widecharmap = newHV();
a3621e74 2998
55eed653 2999 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
3000
3001 if ( !svpp )
147e3846 3002 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
a3621e74
YO
3003
3004 if ( !SvTRUE( *svpp ) ) {
3005 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 3006 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
3007 }
3008 }
bc031a7d
KW
3009 } /* end loop through characters in this branch of the trie */
3010
3011 /* We take the min and max for this branch and combine to find the min
3012 * and max for all branches processed so far */
3dab1dad 3013 if( cur == first ) {
bc031a7d
KW
3014 trie->minlen = minchars;
3015 trie->maxlen = maxchars;
3016 } else if (minchars < trie->minlen) {
3017 trie->minlen = minchars;
3018 } else if (maxchars > trie->maxlen) {
3019 trie->maxlen = maxchars;
fab2782b 3020 }
a3621e74
YO
3021 } /* end first pass */
3022 DEBUG_TRIE_COMPILE_r(
6ad9a8ab 3023 Perl_re_indentf( aTHX_
cb41e5d6
YO
3024 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
3025 depth+1,
55eed653 3026 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
3027 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
3028 (int)trie->minlen, (int)trie->maxlen )
a3621e74 3029 );
a3621e74
YO
3030
3031 /*
3032 We now know what we are dealing with in terms of unique chars and
3033 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
3034 representation using a flat table will take. If it's over a reasonable
3035 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
3036 conservative but potentially much slower representation using an array
3037 of lists.
3038
3039 At the end we convert both representations into the same compressed
3040 form that will be used in regexec.c for matching with. The latter
3041 is a form that cannot be used to construct with but has memory
3042 properties similar to the list form and access properties similar
3043 to the table form making it both suitable for fast searches and
3044 small enough that its feasable to store for the duration of a program.
3045
3046 See the comment in the code where the compressed table is produced
3047 inplace from the flat tabe representation for an explanation of how
3048 the compression works.
3049
3050 */
3051
3052
2e64971a
DM
3053 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
3054 prev_states[1] = 0;
3055
538e84ed
KW
3056 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
3057 > SvIV(re_trie_maxbuff) )
3058 {
a3621e74
YO
3059 /*
3060 Second Pass -- Array Of Lists Representation
3061
3062 Each state will be represented by a list of charid:state records
3063 (reg_trie_trans_le) the first such element holds the CUR and LEN
3064 points of the allocated array. (See defines above).
3065
3066 We build the initial structure using the lists, and then convert
3067 it into the compressed table form which allows faster lookups
3068 (but cant be modified once converted).
a3621e74
YO
3069 */
3070
a3621e74
YO
3071 STRLEN transcount = 1;
3072
6ad9a8ab 3073 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
cb41e5d6 3074 depth+1));
686b73d4 3075
c944940b
JH
3076 trie->states = (reg_trie_state *)
3077 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3078 sizeof(reg_trie_state) );
a3621e74
YO
3079 TRIE_LIST_NEW(1);
3080 next_alloc = 2;
3081
3082 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3083
df826430 3084 regnode *noper = NEXTOPER( cur );
c445ea15
AL
3085 U32 state = 1; /* required init */
3086 U16 charid = 0; /* sanity init */
07be1b83 3087 U32 wordlen = 0; /* required init */
c445ea15 3088
df826430
YO
3089 if (OP(noper) == NOTHING) {
3090 regnode *noper_next= regnext(noper);
944e05e3
YO
3091 if (noper_next < tail)
3092 noper= noper_next;
ca902fb8
YO
3093 /* we will undo this assignment if noper does not
3094 * point at a trieable type in the else clause of
3095 * the following statement. */
df826430
YO
3096 }
3097
f6b4b99d
KW
3098 if ( noper < tail
3099 && ( OP(noper) == flags
3f2416ae
KW
3100 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3101 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
0ea669f4 3102 || OP(noper) == EXACTFUP))))
f6b4b99d 3103 {
944e05e3
YO
3104 const U8 *uc= (U8*)STRING(noper);
3105 const U8 *e= uc + STR_LEN(noper);
3106
786e8c11 3107 for ( ; uc < e ; uc += len ) {
c445ea15 3108
786e8c11 3109 TRIE_READ_CHAR;
c445ea15 3110
786e8c11
YO
3111 if ( uvc < 256 ) {
3112 charid = trie->charmap[ uvc ];
c445ea15 3113 } else {
538e84ed
KW
3114 SV** const svpp = hv_fetch( widecharmap,
3115 (char*)&uvc,
3116 sizeof( UV ),
3117 0);
786e8c11
YO
3118 if ( !svpp ) {
3119 charid = 0;
3120 } else {
3121 charid=(U16)SvIV( *svpp );
3122 }
c445ea15 3123 }
538e84ed
KW
3124 /* charid is now 0 if we dont know the char read, or
3125 * nonzero if we do */
786e8c11 3126 if ( charid ) {
a3621e74 3127
786e8c11
YO
3128 U16 check;
3129 U32 newstate = 0;
a3621e74 3130
786e8c11
YO
3131 charid--;
3132 if ( !trie->states[ state ].trans.list ) {
3133 TRIE_LIST_NEW( state );
c445ea15 3134 }
538e84ed
KW
3135 for ( check = 1;
3136 check <= TRIE_LIST_USED( state );
3137 check++ )
3138 {
3139 if ( TRIE_LIST_ITEM( state, check ).forid
3140 == charid )
3141 {
786e8c11
YO
3142 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3143 break;
3144 }
3145 }
3146 if ( ! newstate ) {
3147 newstate = next_alloc++;
2e64971a 3148 prev_states[newstate] = state;
786e8c11
YO
3149 TRIE_LIST_PUSH( state, charid, newstate );
3150 transcount++;
3151 }
3152 state = newstate;
3153 } else {
147e3846 3154 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
c445ea15 3155 }
a28509cc 3156 }
ca902fb8
YO
3157 } else {
3158 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3159 * on a trieable type. So we need to reset noper back to point at the first regop
3160 * in the branch before we call TRIE_HANDLE_WORD()
3161 */
3162 noper= NEXTOPER(cur);
3163 }
3dab1dad 3164 TRIE_HANDLE_WORD(state);
a3621e74
YO
3165
3166 } /* end second pass */
3167
1e2e3d02 3168 /* next alloc is the NEXT state to be allocated */
538e84ed 3169 trie->statecount = next_alloc;
c944940b
JH
3170 trie->states = (reg_trie_state *)
3171 PerlMemShared_realloc( trie->states,
3172 next_alloc
3173 * sizeof(reg_trie_state) );
a3621e74 3174
3dab1dad 3175 /* and now dump it out before we compress it */
2b8b4781
NC
3176 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3177 revcharmap, next_alloc,
3178 depth+1)
1e2e3d02 3179 );
a3621e74 3180
c944940b
JH
3181 trie->trans = (reg_trie_trans *)
3182 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
3183 {
3184 U32 state;
a3621e74
YO
3185 U32 tp = 0;
3186 U32 zp = 0;
3187
3188
3189 for( state=1 ; state < next_alloc ; state ++ ) {
3190 U32 base=0;
3191
3192 /*
3193 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3194 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
a3621e74
YO
3195 );
3196 */
3197
3198 if (trie->states[state].trans.list) {
3199 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3200 U16 maxid=minid;
a28509cc 3201 U16 idx;
a3621e74
YO
3202
3203 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
3204 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3205 if ( forid < minid ) {
3206 minid=forid;
3207 } else if ( forid > maxid ) {
3208 maxid=forid;
3209 }
a3621e74
YO
3210 }
3211 if ( transcount < tp + maxid - minid + 1) {
3212 transcount *= 2;
c944940b
JH
3213 trie->trans = (reg_trie_trans *)
3214 PerlMemShared_realloc( trie->trans,
446bd890
NC
3215 transcount
3216 * sizeof(reg_trie_trans) );
538e84ed
KW
3217 Zero( trie->trans + (transcount / 2),
3218 transcount / 2,
3219 reg_trie_trans );
a3621e74
YO
3220 }
3221 base = trie->uniquecharcount + tp - minid;
3222 if ( maxid == minid ) {
3223 U32 set = 0;
3224 for ( ; zp < tp ; zp++ ) {
3225 if ( ! trie->trans[ zp ].next ) {
3226 base = trie->uniquecharcount + zp - minid;
538e84ed
KW
3227 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3228 1).newstate;
a3621e74
YO
3229 trie->trans[ zp ].check = state;
3230 set = 1;
3231 break;
3232 }
3233 }
3234 if ( !set ) {
538e84ed
KW
3235 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3236 1).newstate;
a3621e74
YO
3237 trie->trans[ tp ].check = state;
3238 tp++;
3239 zp = tp;
3240 }
3241 } else {
3242 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
538e84ed
KW
3243 const U32 tid = base
3244 - trie->uniquecharcount
3245 + TRIE_LIST_ITEM( state, idx ).forid;
3246 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3247 idx ).newstate;
a3621e74
YO
3248 trie->trans[ tid ].check = state;
3249 }
3250 tp += ( maxid - minid + 1 );
3251 }
3252 Safefree(trie->states[ state ].trans.list);
3253 }
3254 /*
3255 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3256 Perl_re_printf( aTHX_ " base: %d\n",base);
a3621e74
YO
3257 );
3258 */
3259 trie->states[ state ].trans.base=base;
3260 }
cc601c31 3261 trie->lasttrans = tp + 1;
a3621e74
YO
3262 }
3263 } else {
3264 /*
3265 Second Pass -- Flat Table Representation.
3266
b423522f
KW
3267 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3268 each. We know that we will need Charcount+1 trans at most to store
3269 the data (one row per char at worst case) So we preallocate both
3270 structures assuming worst case.
a3621e74
YO
3271
3272 We then construct the trie using only the .next slots of the entry
3273 structs.
3274
b423522f
KW
3275 We use the .check field of the first entry of the node temporarily
3276 to make compression both faster and easier by keeping track of how
3277 many non zero fields are in the node.
a3621e74
YO
3278
3279 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3280 transition.
3281
b423522f
KW
3282 There are two terms at use here: state as a TRIE_NODEIDX() which is
3283 a number representing the first entry of the node, and state as a
3284 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3285 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3286 if there are 2 entrys per node. eg:
a3621e74
YO
3287
3288 A B A B
3289 1. 2 4 1. 3 7
3290 2. 0 3 3. 0 5
3291 3. 0 0 5. 0 0
3292 4. 0 0 7. 0 0
3293
b423522f
KW
3294 The table is internally in the right hand, idx form. However as we
3295 also have to deal with the states array which is indexed by nodenum
3296 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
3297
3298 */
6ad9a8ab 3299 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
cb41e5d6 3300 depth+1));
3dab1dad 3301
c944940b
JH
3302 trie->trans = (reg_trie_trans *)
3303 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3304 * trie->uniquecharcount + 1,
3305 sizeof(reg_trie_trans) );
3306 trie->states = (reg_trie_state *)
3307 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3308 sizeof(reg_trie_state) );
a3621e74
YO
3309 next_alloc = trie->uniquecharcount + 1;
3310
3dab1dad 3311
a3621e74
YO
3312 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3313
df826430 3314 regnode *noper = NEXTOPER( cur );
a3621e74
YO
3315
3316 U32 state = 1; /* required init */
3317
3318 U16 charid = 0; /* sanity init */
3319 U32 accept_state = 0; /* sanity init */
a3621e74 3320
07be1b83 3321 U32 wordlen = 0; /* required init */
a3621e74 3322
df826430
YO
3323 if (OP(noper) == NOTHING) {
3324 regnode *noper_next= regnext(noper);
944e05e3
YO
3325 if (noper_next < tail)
3326 noper= noper_next;
ca902fb8
YO
3327 /* we will undo this assignment if noper does not
3328 * point at a trieable type in the else clause of
3329 * the following statement. */
df826430 3330 }
fab2782b 3331
f6b4b99d
KW
3332 if ( noper < tail
3333 && ( OP(noper) == flags
3f2416ae
KW
3334 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3335 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
0ea669f4 3336 || OP(noper) == EXACTFUP))))
f6b4b99d 3337 {
944e05e3
YO
3338 const U8 *uc= (U8*)STRING(noper);
3339 const U8 *e= uc + STR_LEN(noper);
3340
786e8c11 3341 for ( ; uc < e ; uc += len ) {
a3621e74 3342
786e8c11 3343 TRIE_READ_CHAR;
a3621e74 3344
786e8c11
YO
3345 if ( uvc < 256 ) {
3346 charid = trie->charmap[ uvc ];
3347 } else {
538e84ed
KW
3348 SV* const * const svpp = hv_fetch( widecharmap,
3349 (char*)&uvc,
3350 sizeof( UV ),
3351 0);
786e8c11 3352 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 3353 }
786e8c11
YO
3354 if ( charid ) {
3355 charid--;
3356 if ( !trie->trans[ state + charid ].next ) {
3357 trie->trans[ state + charid ].next = next_alloc;
3358 trie->trans[ state ].check++;
2e64971a
DM
3359 prev_states[TRIE_NODENUM(next_alloc)]
3360 = TRIE_NODENUM(state);
786e8c11
YO
3361 next_alloc += trie->uniquecharcount;
3362 }
3363 state = trie->trans[ state + charid ].next;
3364 } else {
147e3846 3365 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
786e8c11 3366 }
538e84ed
KW
3367 /* charid is now 0 if we dont know the char read, or
3368 * nonzero if we do */
a3621e74 3369 }
ca902fb8
YO
3370 } else {
3371 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3372 * on a trieable type. So we need to reset noper back to point at the first regop
3373 * in the branch before we call TRIE_HANDLE_WORD().
3374 */
3375 noper= NEXTOPER(cur);
a3621e74 3376 }
3dab1dad
YO
3377 accept_state = TRIE_NODENUM( state );
3378 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
3379
3380 } /* end second pass */
3381
3dab1dad 3382 /* and now dump it out before we compress it */
2b8b4781
NC
3383 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3384 revcharmap,
3385 next_alloc, depth+1));
a3621e74 3386
a3621e74
YO
3387 {
3388 /*
3389 * Inplace compress the table.*
3390
3391 For sparse data sets the table constructed by the trie algorithm will
3392 be mostly 0/FAIL transitions or to put it another way mostly empty.
3393 (Note that leaf nodes will not contain any transitions.)
3394
3395 This algorithm compresses the tables by eliminating most such
3396 transitions, at the cost of a modest bit of extra work during lookup:
3397
3398 - Each states[] entry contains a .base field which indicates the
3399 index in the state[] array wheres its transition data is stored.
3400
3b753521 3401 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
3402
3403 - If .base is nonzero then charid is added to it to find an entry in
3404 the trans array.
3405</