This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
study_chunk: honour mutate_ok over recursion
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
aa2885ce
KW
73
74/* Note on debug output:
75 *
76 * This is set up so that -Dr turns on debugging like all other flags that are
77 * enabled by -DDEBUGGING. -Drv gives more verbose output. This applies to
78 * all regular expressions encountered in a program, and gives a huge amount of
79 * output for all but the shortest programs.
80 *
81 * The ability to output pattern debugging information lexically, and with much
82 * finer grained control was added, with 'use re qw(Debug ....);' available even
83 * in non-DEBUGGING builds. This is accomplished by copying the contents of
84 * regcomp.c to ext/re/re_comp.c, and regexec.c is copied to ext/re/re_exec.c.
85 * Those files are compiled and linked into the perl executable, and they are
86 * compiled essentially as if DEBUGGING were enabled, and controlled by calls
87 * to re.pm.
88 *
89 * That would normally mean linking errors when two functions of the same name
90 * are attempted to be placed into the same executable. That is solved in one
91 * of four ways:
92 * 1) Static functions aren't known outside the file they are in, so for the
93 * many functions of that type in this file, it just isn't a problem.
94 * 2) Most externally known functions are enclosed in
95 * #ifndef PERL_IN_XSUB_RE
96 * ...
97 * #endif
98 * blocks, so there is only one defintion for them in the whole
99 * executable, the one in regcomp.c (or regexec.c). The implication of
100 * that is any debugging info that comes from them is controlled only by
101 * -Dr. Further, any static function they call will also be the version
102 * in regcomp.c (or regexec.c), so its debugging will also be by -Dr.
103 * 3) About a dozen external functions are re-#defined in ext/re/re_top.h, to
104 * have different names, so that what gets loaded in the executable is
105 * 'Perl_foo' from regcomp.c (and regexec.c), and the identical function
106 * from re_comp.c (and re_exec.c), but with the name 'my_foo' Debugging
107 * in the 'Perl_foo' versions is controlled by -Dr, but the 'my_foo'
108 * versions and their callees are under control of re.pm. The catch is
109 * that references to all these go through the regexp_engine structure,
110 * which is initialized in regcomp.h to the Perl_foo versions, and
111 * substituted out in lexical scopes where 'use re' is in effect to the
112 * 'my_foo' ones. That structure is public API, so it would be a hard
113 * sell to add any additional members.
114 * 4) For functions in regcomp.c and re_comp.c that are called only from,
115 * respectively, regexec.c and re_exec.c, they can have two different
116 * names, depending on #ifdef'ing PERL_IN_XSUB_RE, in both regexec.c and
117 * embed.fnc.
118 *
119 * The bottom line is that if you add code to one of the public functions
120 * listed in ext/re/re_top.h, debugging automagically works. But if you write
121 * a new function that needs to do debugging or there is a chain of calls from
122 * it that need to do debugging, all functions in the chain should use options
123 * 2) or 4) above.
124 *
125 * A function may have to be split so that debugging stuff is static, but it
126 * calls out to some other function that only gets compiled in regcomp.c to
127 * access data that we don't want to duplicate.
128 */
129
a687059c 130#include "EXTERN.h"
864dbfa3 131#define PERL_IN_REGCOMP_C
a687059c 132#include "perl.h"
d06ea78c 133
c277df42 134#define REG_COMP_C
54df2634
NC
135#ifdef PERL_IN_XSUB_RE
136# include "re_comp.h"
afda64e8 137EXTERN_C const struct regexp_engine my_reg_engine;
51bd1941 138EXTERN_C const struct regexp_engine wild_reg_engine;
54df2634
NC
139#else
140# include "regcomp.h"
141#endif
a687059c 142
b992490d 143#include "invlist_inline.h"
1b0f46bf 144#include "unicode_constants.h"
04e98a4d 145
538e84ed
KW
146#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
147 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
f12c0118
KW
148#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
149 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 150#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 151#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 152
a687059c
LW
153#ifndef STATIC
154#define STATIC static
155#endif
156
3f910e62
YO
157/* this is a chain of data about sub patterns we are processing that
158 need to be handled separately/specially in study_chunk. Its so
159 we can simulate recursion without losing state. */
160struct scan_frame;
161typedef struct scan_frame {
162 regnode *last_regnode; /* last node to process in this frame */
163 regnode *next_regnode; /* next node to process when last is reached */
164 U32 prev_recursed_depth;
165 I32 stopparen; /* what stopparen do we use */
089ad25d 166 bool in_gosub; /* this or an outer frame is for GOSUB */
3f910e62
YO
167
168 struct scan_frame *this_prev_frame; /* this previous frame */
169 struct scan_frame *prev_frame; /* previous frame */
170 struct scan_frame *next_frame; /* next frame */
171} scan_frame;
172
f4ae5a27
KW
173/* Certain characters are output as a sequence with the first being a
174 * backslash. */
4aada8b9 175#define isBACKSLASHED_PUNCT(c) memCHRs("-[]\\^", c)
f4ae5a27
KW
176
177
09b2b2e6 178struct RExC_state_t {
514a91f1
DM
179 U32 flags; /* RXf_* are we folding, multilining? */
180 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 181 char *precomp; /* uncompiled string. */
711b303b 182 char *precomp_end; /* pointer to end of uncompiled string. */
288b8c02 183 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf 184 regexp *rx; /* perl core regexp structure */
538e84ed
KW
185 regexp_internal *rxi; /* internal data for regexp object
186 pprivate field */
fac92740 187 char *start; /* Start of input for compile */
830247a4
IZ
188 char *end; /* End of input for compile */
189 char *parse; /* Input-scan pointer. */
51684e3c
KW
190 char *copy_start; /* start of copy of input within
191 constructed parse string */
cc16d262
KW
192 char *save_copy_start; /* Provides one level of saving
193 and restoring 'copy_start' */
51684e3c
KW
194 char *copy_start_in_input; /* Position in input string
195 corresponding to copy_start */
ea3daa5d 196 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 197 regnode *emit_start; /* Start of emitted-code area */
f55b7b26 198 regnode_offset emit; /* Code-emit pointer */
830247a4
IZ
199 I32 naughty; /* How bad is this pattern? */
200 I32 sawback; /* Did we see \1, ...? */
88f063b4
KW
201 SSize_t size; /* Number of regnode equivalents in
202 pattern */
d8d1dede
KW
203 Size_t sets_depth; /* Counts recursion depth of already-
204 compiled regex set patterns */
2475254e
RL
205 U32 seen;
206
207 I32 parens_buf_size; /* #slots malloced open/close_parens */
208 regnode_offset *open_parens; /* offsets to open parens */
209 regnode_offset *close_parens; /* offsets to close parens */
210 HV *paren_names; /* Paren names */
7c932d07
KW
211
212 /* position beyond 'precomp' of the warning message furthest away from
213 * 'precomp'. During the parse, no warnings are raised for any problems
214 * earlier in the parse than this position. This works if warnings are
215 * raised the first time a given spot is parsed, and if only one
216 * independent warning is raised for any given spot */
217 Size_t latest_warn_offset;
218
219 I32 npar; /* Capture buffer count so far in the
220 parse, (OPEN) plus one. ("par" 0 is
221 the whole pattern)*/
222 I32 total_par; /* During initial parse, is either 0,
223 or -1; the latter indicating a
224 reparse is needed. After that pass,
225 it is what 'npar' became after the
226 pass. Hence, it being > 0 indicates
227 we are in a reparse situation */
538e84ed
KW
228 I32 nestroot; /* root parens we are in - used by
229 accept */
830247a4 230 I32 seen_zerolen;
d5a00e4a 231 regnode *end_op; /* END node in program */
02daf0ab
YO
232 I32 utf8; /* whether the pattern is utf8 or not */
233 I32 orig_utf8; /* whether the pattern was originally in utf8 */
234 /* XXX use this for future optimisation of case
235 * where pattern must be upgraded to utf8. */
e40e74fe
KW
236 I32 uni_semantics; /* If a d charset modifier should use unicode
237 rules, even if the pattern is not in
238 utf8 */
538e84ed 239
88f063b4 240 I32 recurse_count; /* Number of recurse regops we have generated */
2475254e 241 regnode **recurse; /* Recurse regops */
4286711a 242 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
538e84ed 243 through */
09a65838 244 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 245 I32 in_lookbehind;
105c827d 246 I32 in_lookahead;
4624b182 247 I32 contains_locale;
bb3f3ed2 248 I32 override_recoding;
4caef9c3 249 I32 recode_x_to_native;
9d53c457 250 I32 in_multi_char_class;
2475254e 251 int code_index; /* next code_blocks[] slot */
1acab4c5 252 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
68e2671b 253 within pattern */
ee273784 254 SSize_t maxlen; /* mininum possible number of chars in string to match */
3f910e62
YO
255 scan_frame *frame_head;
256 scan_frame *frame_last;
257 U32 frame_count;
7eec73eb 258 AV *warn_text;
945961fd 259 HV *unlexed_names;
d24ca0c5 260 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 261#ifdef DEBUGGING
be8e71aa 262 const char *lastparse;
3dab1dad 263 I32 lastnum;
d9a72fcc 264 U32 study_chunk_recursed_count;
2475254e 265 AV *paren_name_list; /* idx -> name */
c9f0d54c
YO
266 SV *mysv1;
267 SV *mysv2;
88f063b4 268
3dab1dad
YO
269#define RExC_lastparse (pRExC_state->lastparse)
270#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 271#define RExC_paren_name_list (pRExC_state->paren_name_list)
d9a72fcc 272#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
c9f0d54c
YO
273#define RExC_mysv (pRExC_state->mysv1)
274#define RExC_mysv1 (pRExC_state->mysv1)
275#define RExC_mysv2 (pRExC_state->mysv2)
276
3dab1dad 277#endif
bd0b62a6 278 bool seen_d_op;
911bd04e 279 bool strict;
da7cf1cc 280 bool study_started;
034602eb 281 bool in_script_run;
7c932d07 282 bool use_BRANCHJ;
5aabce33
KW
283 bool sWARN_EXPERIMENTAL__VLB;
284 bool sWARN_EXPERIMENTAL__REGEX_SETS;
09b2b2e6 285};
830247a4 286
e2509266 287#define RExC_flags (pRExC_state->flags)
514a91f1 288#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 289#define RExC_precomp (pRExC_state->precomp)
51684e3c
KW
290#define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
291#define RExC_copy_start_in_constructed (pRExC_state->copy_start)
cc16d262 292#define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start)
711b303b 293#define RExC_precomp_end (pRExC_state->precomp_end)
288b8c02 294#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 295#define RExC_rx (pRExC_state->rx)
f8fc2ecf 296#define RExC_rxi (pRExC_state->rxi)
fac92740 297#define RExC_start (pRExC_state->start)
830247a4
IZ
298#define RExC_end (pRExC_state->end)
299#define RExC_parse (pRExC_state->parse)
7c932d07 300#define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
830247a4 301#define RExC_whilem_seen (pRExC_state->whilem_seen)
bd0b62a6
KW
302#define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
303 under /d from /u ? */
512c0f5a 304
7122b237 305#ifdef RE_TRACK_PATTERN_OFFSETS
44f36cd3 306# define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
538e84ed 307 others */
7122b237 308#endif
830247a4 309#define RExC_emit (pRExC_state->emit)
fac92740 310#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
311#define RExC_sawback (pRExC_state->sawback)
312#define RExC_seen (pRExC_state->seen)
313#define RExC_size (pRExC_state->size)
ee273784 314#define RExC_maxlen (pRExC_state->maxlen)
830247a4 315#define RExC_npar (pRExC_state->npar)
1533d1d2 316#define RExC_total_parens (pRExC_state->total_par)
0275405c 317#define RExC_parens_buf_size (pRExC_state->parens_buf_size)
e2e6a0f1 318#define RExC_nestroot (pRExC_state->nestroot)
830247a4 319#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 320#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 321#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 322#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
323#define RExC_open_parens (pRExC_state->open_parens)
324#define RExC_close_parens (pRExC_state->close_parens)
5bd2d46e 325#define RExC_end_op (pRExC_state->end_op)
81714fb9 326#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
327#define RExC_recurse (pRExC_state->recurse)
328#define RExC_recurse_count (pRExC_state->recurse_count)
d8d1dede 329#define RExC_sets_depth (pRExC_state->sets_depth)
09a65838 330#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
538e84ed
KW
331#define RExC_study_chunk_recursed_bytes \
332 (pRExC_state->study_chunk_recursed_bytes)
b57e4118 333#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
105c827d 334#define RExC_in_lookahead (pRExC_state->in_lookahead)
4624b182 335#define RExC_contains_locale (pRExC_state->contains_locale)
4caef9c3
KW
336#define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
337
b6d67071 338#ifdef EBCDIC
4caef9c3
KW
339# define SET_recode_x_to_native(x) \
340 STMT_START { RExC_recode_x_to_native = (x); } STMT_END
341#else
342# define SET_recode_x_to_native(x) NOOP
b6d67071 343#endif
4caef9c3 344
9d53c457 345#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
3f910e62
YO
346#define RExC_frame_head (pRExC_state->frame_head)
347#define RExC_frame_last (pRExC_state->frame_last)
348#define RExC_frame_count (pRExC_state->frame_count)
67cdf558 349#define RExC_strict (pRExC_state->strict)
da7cf1cc 350#define RExC_study_started (pRExC_state->study_started)
7eec73eb 351#define RExC_warn_text (pRExC_state->warn_text)
034602eb 352#define RExC_in_script_run (pRExC_state->in_script_run)
7c932d07 353#define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
5aabce33
KW
354#define RExC_warned_WARN_EXPERIMENTAL__VLB (pRExC_state->sWARN_EXPERIMENTAL__VLB)
355#define RExC_warned_WARN_EXPERIMENTAL__REGEX_SETS (pRExC_state->sWARN_EXPERIMENTAL__REGEX_SETS)
945961fd 356#define RExC_unlexed_names (pRExC_state->unlexed_names)
830247a4 357
99807a43
HS
358/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
359 * a flag to disable back-off on the fixed/floating substrings - if it's
360 * a high complexity pattern we assume the benefit of avoiding a full match
361 * is worth the cost of checking for the substrings even if they rarely help.
362 */
363#define RExC_naughty (pRExC_state->naughty)
364#define TOO_NAUGHTY (10)
365#define MARK_NAUGHTY(add) \
366 if (RExC_naughty < TOO_NAUGHTY) \
367 RExC_naughty += (add)
368#define MARK_NAUGHTY_EXP(exp, add) \
369 if (RExC_naughty < TOO_NAUGHTY) \
370 RExC_naughty += RExC_naughty / (exp) + (add)
cde0cee5 371
a687059c
LW
372#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
373#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
412f55bb 374 ((*s) == '{' && regcurly(s)))
a687059c
LW
375
376/*
377 * Flags to be passed up and down.
378 */
a687059c 379#define WORST 0 /* Worst case. */
c4ead770
KW
380#define HASWIDTH 0x01 /* Known to not match null strings, could match
381 non-null ones. */
fda99bee 382
e64f369d 383/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
384 * character. (There needs to be a case: in the switch statement in regexec.c
385 * for any node marked SIMPLE.) Note that this is not the same thing as
386 * REGNODE_SIMPLE */
fda99bee 387#define SIMPLE 0x02
e64f369d 388#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
389#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
390#define TRYAGAIN 0x10 /* Weeded out a declaration. */
c36361b2
KW
391#define RESTART_PARSE 0x20 /* Need to redo the parse */
392#define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
393 calcuate sizes as UTF-8 */
a687059c 394
3dab1dad
YO
395#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
396
07be1b83
YO
397/* whether trie related optimizations are enabled */
398#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
399#define TRIE_STUDY_OPT
786e8c11 400#define FULL_TRIE_STUDY
07be1b83
YO
401#define TRIE_STCLASS
402#endif
1de06328
YO
403
404
40d049e4
YO
405
406#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
407#define PBITVAL(paren) (1 << ((paren) & 7))
ac3ef366
HS
408#define PAREN_OFFSET(depth) \
409 (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
410#define PAREN_TEST(depth, paren) \
411 (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
412#define PAREN_SET(depth, paren) \
413 (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
414#define PAREN_UNSET(depth, paren) \
415 (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
40d049e4 416
82a6ada4 417#define REQUIRE_UTF8(flagp) STMT_START { \
8d9c2815 418 if (!UTF) { \
c36361b2 419 *flagp = RESTART_PARSE|NEED_UTF8; \
f55b7b26 420 return 0; \
8d9c2815 421 } \
82a6ada4 422 } STMT_END
40d049e4 423
d63d75f5 424/* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
f6e8f31e
KW
425 * a flag that indicates we need to override /d with /u as a result of
426 * something in the pattern. It should only be used in regards to calling
b2090782 427 * set_regex_charset() or get_regex_charset() */
512c0f5a
KW
428#define REQUIRE_UNI_RULES(flagp, restart_retval) \
429 STMT_START { \
430 if (DEPENDS_SEMANTICS) { \
512c0f5a
KW
431 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
432 RExC_uni_semantics = 1; \
b76e1c04 433 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \
bd0b62a6
KW
434 /* No need to restart the parse if we haven't seen \
435 * anything that differs between /u and /d, and no need \
436 * to restart immediately if we're going to reparse \
437 * anyway to count parens */ \
c36361b2 438 *flagp |= RESTART_PARSE; \
512c0f5a 439 return restart_retval; \
d1be211c 440 } \
512c0f5a
KW
441 } \
442 } STMT_END
443
7c932d07
KW
444#define REQUIRE_BRANCHJ(flagp, restart_retval) \
445 STMT_START { \
446 RExC_use_BRANCHJ = 1; \
439a3bfe
KW
447 *flagp |= RESTART_PARSE; \
448 return restart_retval; \
7c932d07
KW
449 } STMT_END
450
b76e1c04
KW
451/* Until we have completed the parse, we leave RExC_total_parens at 0 or
452 * less. After that, it must always be positive, because the whole re is
453 * considered to be surrounded by virtual parens. Setting it to negative
454 * indicates there is some construct that needs to know the actual number of
455 * parens to be properly handled. And that means an extra pass will be
456 * required after we've counted them all */
457#define ALL_PARENS_COUNTED (RExC_total_parens > 0)
7c932d07 458#define REQUIRE_PARENS_PASS \
b76e1c04
KW
459 STMT_START { /* No-op if have completed a pass */ \
460 if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \
7c932d07 461 } STMT_END
b76e1c04
KW
462#define IN_PARENS_PASS (RExC_total_parens < 0)
463
7c932d07 464
f358228e
KW
465/* This is used to return failure (zero) early from the calling function if
466 * various flags in 'flags' are set. Two flags always cause a return:
467 * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any
468 * additional flags that should cause a return; 0 if none. If the return will
469 * be done, '*flagp' is first set to be all of the flags that caused the
470 * return. */
471#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
fd7fc6ae 472 STMT_START { \
c36361b2
KW
473 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
474 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
f358228e 475 return 0; \
fd7fc6ae
YO
476 } \
477 } STMT_END
478
c36361b2 479#define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
fd7fc6ae 480
5d555bb8 481#define RETURN_FAIL_ON_RESTART(flags,flagp) \
f358228e 482 RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
5d555bb8 483#define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
f358228e 484 if (MUST_RESTART(*(flagp))) return 0
fd7fc6ae 485
f19b1a63
KW
486/* This converts the named class defined in regcomp.h to its equivalent class
487 * number defined in handy.h. */
488#define namedclass_to_classnum(class) ((int) ((class) / 2))
489#define classnum_to_namedclass(classnum) ((classnum) * 2)
490
de92f5e6
KW
491#define _invlist_union_complement_2nd(a, b, output) \
492 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
493#define _invlist_intersection_complement_2nd(a, b, output) \
494 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
495
4a808c87
KW
496/* We add a marker if we are deferring expansion of a property that is both
497 * 1) potentiallly user-defined; and
498 * 2) could also be an official Unicode property.
499 *
500 * Without this marker, any deferred expansion can only be for a user-defined
501 * one. This marker shouldn't conflict with any that could be in a legal name,
502 * and is appended to its name to indicate this. There is a string and
503 * character form */
043cdb9a
KW
504#define DEFERRED_COULD_BE_OFFICIAL_MARKERs "~"
505#define DEFERRED_COULD_BE_OFFICIAL_MARKERc '~'
1c2f3d7a 506
0069caf1
KW
507/* What is infinity for optimization purposes */
508#define OPTIMIZE_INFTY SSize_t_MAX
509
1de06328
YO
510/* About scan_data_t.
511
512 During optimisation we recurse through the regexp program performing
513 various inplace (keyhole style) optimisations. In addition study_chunk
514 and scan_commit populate this data structure with information about
538e84ed 515 what strings MUST appear in the pattern. We look for the longest
3b753521 516 string that must appear at a fixed location, and we look for the
1de06328
YO
517 longest string that may appear at a floating location. So for instance
518 in the pattern:
538e84ed 519
1de06328 520 /FOO[xX]A.*B[xX]BAR/
538e84ed 521
1de06328
YO
522 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
523 strings (because they follow a .* construct). study_chunk will identify
524 both FOO and BAR as being the longest fixed and floating strings respectively.
538e84ed 525
1de06328 526 The strings can be composites, for instance
538e84ed 527
1de06328 528 /(f)(o)(o)/
538e84ed 529
1de06328 530 will result in a composite fixed substring 'foo'.
538e84ed 531
1de06328 532 For each string some basic information is maintained:
538e84ed 533
6ea77169 534 - min_offset
1de06328
YO
535 This is the position the string must appear at, or not before.
536 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
537 characters must match before the string we are searching for.
538 Likewise when combined with minlenp and the length of the string it
538e84ed 539 tells us how many characters must appear after the string we have
1de06328 540 found.
538e84ed 541
1de06328
YO
542 - max_offset
543 Only used for floating strings. This is the rightmost point that
0069caf1 544 the string can appear at. If set to OPTIMIZE_INFTY it indicates that the
1de06328 545 string can occur infinitely far to the right.
6e12f832 546 For fixed strings, it is equal to min_offset.
538e84ed 547
1de06328 548 - minlenp
2d608413
KW
549 A pointer to the minimum number of characters of the pattern that the
550 string was found inside. This is important as in the case of positive
538e84ed 551 lookahead or positive lookbehind we can have multiple patterns
1de06328 552 involved. Consider
538e84ed 553
1de06328 554 /(?=FOO).*F/
538e84ed 555
1de06328
YO
556 The minimum length of the pattern overall is 3, the minimum length
557 of the lookahead part is 3, but the minimum length of the part that
538e84ed 558 will actually match is 1. So 'FOO's minimum length is 3, but the
1de06328 559 minimum length for the F is 1. This is important as the minimum length
538e84ed 560 is used to determine offsets in front of and behind the string being
1de06328 561 looked for. Since strings can be composites this is the length of the
486ec47a 562 pattern at the time it was committed with a scan_commit. Note that
1de06328 563 the length is calculated by study_chunk, so that the minimum lengths
538e84ed 564 are not known until the full pattern has been compiled, thus the
1de06328 565 pointer to the value.
538e84ed 566
1de06328 567 - lookbehind
538e84ed 568
1de06328 569 In the case of lookbehind the string being searched for can be
538e84ed 570 offset past the start point of the final matching string.
1de06328
YO
571 If this value was just blithely removed from the min_offset it would
572 invalidate some of the calculations for how many chars must match
573 before or after (as they are derived from min_offset and minlen and
538e84ed 574 the length of the string being searched for).
1de06328
YO
575 When the final pattern is compiled and the data is moved from the
576 scan_data_t structure into the regexp structure the information
538e84ed
KW
577 about lookbehind is factored in, with the information that would
578 have been lost precalculated in the end_shift field for the
1de06328
YO
579 associated string.
580
581 The fields pos_min and pos_delta are used to store the minimum offset
538e84ed 582 and the delta to the maximum offset at the current point in the pattern.
1de06328
YO
583
584*/
2c2d71f5 585
cb6aef8b
DM
586struct scan_data_substrs {
587 SV *str; /* longest substring found in pattern */
588 SSize_t min_offset; /* earliest point in string it can appear */
589 SSize_t max_offset; /* latest point in string it can appear */
590 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
591 SSize_t lookbehind; /* is the pos of the string modified by LB */
11683ecb 592 I32 flags; /* per substring SF_* and SCF_* flags */
cb6aef8b
DM
593};
594
2c2d71f5 595typedef struct scan_data_t {
1de06328
YO
596 /*I32 len_min; unused */
597 /*I32 len_delta; unused */
49f55535 598 SSize_t pos_min;
ea3daa5d 599 SSize_t pos_delta;
2c2d71f5 600 SV *last_found;
ea3daa5d 601 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 602 SSize_t last_start_min;
ea3daa5d 603 SSize_t last_start_max;
2099df82
DM
604 U8 cur_is_floating; /* whether the last_* values should be set as
605 * the next fixed (0) or floating (1)
606 * substring */
6ea77169 607
2099df82
DM
608 /* [0] is longest fixed substring so far, [1] is longest float so far */
609 struct scan_data_substrs substrs[2];
6ea77169 610
11683ecb 611 I32 flags; /* common SF_* and SCF_* flags */
2c2d71f5 612 I32 whilem_c;
49f55535 613 SSize_t *last_closep;
b8f7bb16 614 regnode_ssc *start_class;
2c2d71f5
JH
615} scan_data_t;
616
a687059c 617/*
e50aee73 618 * Forward declarations for pregcomp()'s friends.
a687059c 619 */
a0d0e21e 620
6ea77169 621static const scan_data_t zero_scan_data = {
a2ca2017 622 0, 0, NULL, 0, 0, 0, 0,
6ea77169 623 {
11683ecb
DM
624 { NULL, 0, 0, 0, 0, 0 },
625 { NULL, 0, 0, 0, 0, 0 },
6ea77169
DM
626 },
627 0, 0, NULL, NULL
628};
c277df42 629
11683ecb
DM
630/* study flags */
631
07be1b83
YO
632#define SF_BEFORE_SEOL 0x0001
633#define SF_BEFORE_MEOL 0x0002
11683ecb 634#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
c277df42 635
07be1b83
YO
636#define SF_IS_INF 0x0040
637#define SF_HAS_PAR 0x0080
638#define SF_IN_PAR 0x0100
639#define SF_HAS_EVAL 0x0200
bc604ad8
KW
640
641
642/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
643 * longest substring in the pattern. When it is not set the optimiser keeps
644 * track of position, but does not keep track of the actual strings seen,
645 *
646 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
647 * /foo/i will not.
648 *
649 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
650 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
651 * turned off because of the alternation (BRANCH). */
07be1b83 652#define SCF_DO_SUBSTR 0x0400
bc604ad8 653
653099ff
GS
654#define SCF_DO_STCLASS_AND 0x0800
655#define SCF_DO_STCLASS_OR 0x1000
656#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 657#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 658
786e8c11 659#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
538e84ed 660#define SCF_SEEN_ACCEPT 0x8000
688e0391 661#define SCF_TRIE_DOING_RESTUDY 0x10000
4286711a
YO
662#define SCF_IN_DEFINE 0x20000
663
664
665
07be1b83 666
43fead97 667#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
668
669/* The enums for all these are ordered so things work out correctly */
a62b1201 670#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
538e84ed
KW
671#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
672 == REGEX_DEPENDS_CHARSET)
00b27cfc 673#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
538e84ed
KW
674#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
675 >= REGEX_UNICODE_CHARSET)
676#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
677 == REGEX_ASCII_RESTRICTED_CHARSET)
678#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
679 >= REGEX_ASCII_RESTRICTED_CHARSET)
680#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
681 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 682
43fead97 683#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 684
f2c2a6ab
KW
685/* For programs that want to be strictly Unicode compatible by dying if any
686 * attempt is made to match a non-Unicode code point against a Unicode
687 * property. */
688#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
689
93733859 690#define OOB_NAMEDCLASS -1
b8c5462f 691
8e661ac5
KW
692/* There is no code point that is out-of-bounds, so this is problematic. But
693 * its only current use is to initialize a variable that is always set before
694 * looked at. */
695#define OOB_UNICODE 0xDEADBEEF
696
a0ed51b3 697#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
a0ed51b3 698
8615cb43 699
b45f050a
JF
700/* length of regex to show in messages that don't mark a position within */
701#define RegexLengthToShowInErrorMessages 127
702
703/*
704 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
705 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
706 * op/pragma/warn/regcomp.
707 */
7253e4e3
RK
708#define MARKER1 "<-- HERE" /* marker as it appears in the description */
709#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 710
538e84ed 711#define REPORT_LOCATION " in regex; marked by " MARKER1 \
147e3846 712 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
b45f050a 713
285b5ca0
KW
714/* The code in this file in places uses one level of recursion with parsing
715 * rebased to an alternate string constructed by us in memory. This can take
716 * the form of something that is completely different from the input, or
717 * something that uses the input as part of the alternate. In the first case,
718 * there should be no possibility of an error, as we are in complete control of
51684e3c
KW
719 * the alternate string. But in the second case we don't completely control
720 * the input portion, so there may be errors in that. Here's an example:
285b5ca0
KW
721 * /[abc\x{DF}def]/ui
722 * is handled specially because \x{df} folds to a sequence of more than one
51684e3c 723 * character: 'ss'. What is done is to create and parse an alternate string,
285b5ca0
KW
724 * which looks like this:
725 * /(?:\x{DF}|[abc\x{DF}def])/ui
726 * where it uses the input unchanged in the middle of something it constructs,
727 * which is a branch for the DF outside the character class, and clustering
728 * parens around the whole thing. (It knows enough to skip the DF inside the
729 * class while in this substitute parse.) 'abc' and 'def' may have errors that
730 * need to be reported. The general situation looks like this:
731 *
51684e3c 732 * |<------- identical ------>|
285b5ca0 733 * sI tI xI eI
51684e3c 734 * Input: ---------------------------------------------------------------
285b5ca0
KW
735 * Constructed: ---------------------------------------------------
736 * sC tC xC eC EC
51684e3c 737 * |<------- identical ------>|
285b5ca0 738 *
51684e3c
KW
739 * sI..eI is the portion of the input pattern we are concerned with here.
740 * sC..EC is the constructed substitute parse string.
741 * sC..tC is constructed by us
742 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
743 * In the diagram, these are vertically aligned.
744 * eC..EC is also constructed by us.
745 * xC is the position in the substitute parse string where we found a
746 * problem.
747 * xI is the position in the original pattern corresponding to xC.
285b5ca0 748 *
51684e3c
KW
749 * We want to display a message showing the real input string. Thus we need to
750 * translate from xC to xI. We know that xC >= tC, since the portion of the
751 * string sC..tC has been constructed by us, and so shouldn't have errors. We
752 * get:
753 * xI = tI + (xC - tC)
285b5ca0 754 *
51684e3c
KW
755 * When the substitute parse is constructed, the code needs to set:
756 * RExC_start (sC)
757 * RExC_end (eC)
758 * RExC_copy_start_in_input (tI)
759 * RExC_copy_start_in_constructed (tC)
760 * and restore them when done.
285b5ca0 761 *
51684e3c
KW
762 * During normal processing of the input pattern, both
763 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
764 * sI, so that xC equals xI.
285b5ca0
KW
765 */
766
51684e3c
KW
767#define sI RExC_precomp
768#define eI RExC_precomp_end
769#define sC RExC_start
770#define eC RExC_end
771#define tI RExC_copy_start_in_input
772#define tC RExC_copy_start_in_constructed
773#define xI(xC) (tI + (xC - tC))
774#define xI_offset(xC) (xI(xC) - sI)
285b5ca0
KW
775
776#define REPORT_LOCATION_ARGS(xC) \
777 UTF8fARG(UTF, \
51684e3c 778 (xI(xC) > eI) /* Don't run off end */ \
232b691f 779 ? eI - sI /* Length before the <--HERE */ \
bc929b61
KW
780 : ((xI_offset(xC) >= 0) \
781 ? xI_offset(xC) \
782 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
783 IVdf " trying to output message for " \
784 " pattern %.*s", \
51684e3c 785 __FILE__, __LINE__, (IV) xI_offset(xC), \
bc929b61 786 ((int) (eC - sC)), sC), 0)), \
51684e3c 787 sI), /* The input pattern printed up to the <--HERE */ \
285b5ca0 788 UTF8fARG(UTF, \
51684e3c
KW
789 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \
790 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */
c1d900c3 791
8a6d8ec6
HS
792/* Used to point after bad bytes for an error message, but avoid skipping
793 * past a nul byte. */
e3304382 794#define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
8a6d8ec6 795
ed68fe11
KW
796/* Set up to clean up after our imminent demise */
797#define PREPARE_TO_DIE \
798 STMT_START { \
799 if (RExC_rx_sv) \
800 SAVEFREESV(RExC_rx_sv); \
7c932d07
KW
801 if (RExC_open_parens) \
802 SAVEFREEPV(RExC_open_parens); \
803 if (RExC_close_parens) \
804 SAVEFREEPV(RExC_close_parens); \
ed68fe11
KW
805 } STMT_END
806
b45f050a
JF
807/*
808 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
809 * arg. Show regex, up to a maximum length. If it's too long, chop and add
810 * "...".
811 */
58e23c8d 812#define _FAIL(code) STMT_START { \
bfed75c6 813 const char *ellipses = ""; \
88f063b4 814 IV len = RExC_precomp_end - RExC_precomp; \
ccb2c380 815 \
ed68fe11 816 PREPARE_TO_DIE; \
ccb2c380
MP
817 if (len > RegexLengthToShowInErrorMessages) { \
818 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
819 len = RegexLengthToShowInErrorMessages - 10; \
820 ellipses = "..."; \
821 } \
58e23c8d 822 code; \
ccb2c380 823} STMT_END
8615cb43 824
58e23c8d 825#define FAIL(msg) _FAIL( \
147e3846 826 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
c1d900c3 827 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
828
829#define FAIL2(msg,arg) _FAIL( \
147e3846 830 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
c1d900c3 831 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 832
4b7718d6
KW
833#define FAIL3(msg,arg1,arg2) _FAIL( \
834 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
835 arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
836
b45f050a 837/*
b45f050a
JF
838 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
839 */
ccb2c380 840#define Simple_vFAIL(m) STMT_START { \
ccb2c380 841 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
d528642a 842 m, REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 843} STMT_END
b45f050a
JF
844
845/*
846 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
847 */
ccb2c380 848#define vFAIL(m) STMT_START { \
ed68fe11 849 PREPARE_TO_DIE; \
ccb2c380
MP
850 Simple_vFAIL(m); \
851} STMT_END
b45f050a
JF
852
853/*
854 * Like Simple_vFAIL(), but accepts two arguments.
855 */
ccb2c380 856#define Simple_vFAIL2(m,a1) STMT_START { \
c6572a6d 857 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \
d528642a 858 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 859} STMT_END
b45f050a
JF
860
861/*
862 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
863 */
ccb2c380 864#define vFAIL2(m,a1) STMT_START { \
ed68fe11 865 PREPARE_TO_DIE; \
ccb2c380
MP
866 Simple_vFAIL2(m, a1); \
867} STMT_END
b45f050a
JF
868
869
870/*
871 * Like Simple_vFAIL(), but accepts three arguments.
872 */
ccb2c380 873#define Simple_vFAIL3(m, a1, a2) STMT_START { \
c6572a6d 874 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \
d528642a 875 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 876} STMT_END
b45f050a
JF
877
878/*
879 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
880 */
ccb2c380 881#define vFAIL3(m,a1,a2) STMT_START { \
ed68fe11 882 PREPARE_TO_DIE; \
ccb2c380
MP
883 Simple_vFAIL3(m, a1, a2); \
884} STMT_END
b45f050a
JF
885
886/*
887 * Like Simple_vFAIL(), but accepts four arguments.
888 */
ccb2c380 889#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
c6572a6d 890 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, a3, \
d528642a 891 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 892} STMT_END
b45f050a 893
95db3ffa 894#define vFAIL4(m,a1,a2,a3) STMT_START { \
ed68fe11 895 PREPARE_TO_DIE; \
95db3ffa
KW
896 Simple_vFAIL4(m, a1, a2, a3); \
897} STMT_END
898
946095af 899/* A specialized version of vFAIL2 that works with UTF8f */
d528642a 900#define vFAIL2utf8f(m, a1) STMT_START { \
ed68fe11 901 PREPARE_TO_DIE; \
c6572a6d 902 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, \
d528642a 903 REPORT_LOCATION_ARGS(RExC_parse)); \
946095af
BF
904} STMT_END
905
3ba22297 906#define vFAIL3utf8f(m, a1, a2) STMT_START { \
ed68fe11 907 PREPARE_TO_DIE; \
c6572a6d 908 S_re_croak(aTHX_ UTF, m REPORT_LOCATION, a1, a2, \
d528642a 909 REPORT_LOCATION_ARGS(RExC_parse)); \
3ba22297
KW
910} STMT_END
911
d62e749e 912/* Setting this to NULL is a signal to not output warnings */
cc16d262
KW
913#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \
914 STMT_START { \
915 RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\
916 RExC_copy_start_in_constructed = NULL; \
917 } STMT_END
918#define RESTORE_WARNINGS \
919 RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
d62e749e 920
7c932d07
KW
921/* Since a warning can be generated multiple times as the input is reparsed, we
922 * output it the first time we come to that point in the parse, but suppress it
923 * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not
924 * generate any warnings */
55bcf857 925#define TO_OUTPUT_WARNINGS(loc) \
7c932d07
KW
926 ( RExC_copy_start_in_constructed \
927 && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
55bcf857 928
7c932d07
KW
929/* After we've emitted a warning, we save the position in the input so we don't
930 * output it again */
931#define UPDATE_WARNINGS_LOC(loc) \
932 STMT_START { \
933 if (TO_OUTPUT_WARNINGS(loc)) { \
5d894ca5
KW
934 RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \
935 - RExC_precomp; \
7c932d07
KW
936 } \
937 } STMT_END
77ccba3c 938
086f21bf 939/* 'warns' is the output of the packWARNx macro used in 'code' */
f8220fe2
KW
940#define _WARN_HELPER(loc, warns, code) \
941 STMT_START { \
d62e749e
KW
942 if (! RExC_copy_start_in_constructed) { \
943 Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \
944 " expected at '%s'", \
945 __FILE__, __LINE__, loc); \
946 } \
55bcf857 947 if (TO_OUTPUT_WARNINGS(loc)) { \
086f21bf
KW
948 if (ckDEAD(warns)) \
949 PREPARE_TO_DIE; \
338190a1 950 code; \
77ccba3c 951 UPDATE_WARNINGS_LOC(loc); \
338190a1 952 } \
f8220fe2 953 } STMT_END
946095af 954
5e0a247b 955/* m is not necessarily a "literal string", in this macro */
58ad8160
KW
956#define warn_non_literal_string(loc, packed_warn, m) \
957 _WARN_HELPER(loc, packed_warn, \
958 Perl_warner(aTHX_ packed_warn, \
d528642a 959 "%s" REPORT_LOCATION, \
f8220fe2 960 m, REPORT_LOCATION_ARGS(loc)))
58ad8160
KW
961#define reg_warn_non_literal_string(loc, m) \
962 warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)
5e0a247b 963
eb761011
KW
964#define ckWARN2_non_literal_string(loc, packwarn, m, a1) \
965 STMT_START { \
966 char * format; \
967 Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\
968 Newx(format, format_size, char); \
969 my_strlcpy(format, m, format_size); \
970 my_strlcat(format, REPORT_LOCATION, format_size); \
971 SAVEFREEPV(format); \
972 _WARN_HELPER(loc, packwarn, \
973 Perl_ck_warner(aTHX_ packwarn, \
974 format, \
975 a1, REPORT_LOCATION_ARGS(loc))); \
976 } STMT_END
977
f8220fe2
KW
978#define ckWARNreg(loc,m) \
979 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
980 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 981 m REPORT_LOCATION, \
f8220fe2 982 REPORT_LOCATION_ARGS(loc)))
ccb2c380 983
f8220fe2
KW
984#define vWARN(loc, m) \
985 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
986 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 987 m REPORT_LOCATION, \
f8220fe2 988 REPORT_LOCATION_ARGS(loc))) \
b927b7e9 989
f8220fe2
KW
990#define vWARN_dep(loc, m) \
991 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
992 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
d528642a 993 m REPORT_LOCATION, \
f8220fe2 994 REPORT_LOCATION_ARGS(loc)))
0d6106aa 995
f8220fe2
KW
996#define ckWARNdep(loc,m) \
997 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
998 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
d528642a 999 m REPORT_LOCATION, \
f8220fe2 1000 REPORT_LOCATION_ARGS(loc)))
147508a2 1001
f8220fe2
KW
1002#define ckWARNregdep(loc,m) \
1003 _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
1004 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
d528642a
KW
1005 WARN_REGEXP), \
1006 m REPORT_LOCATION, \
f8220fe2 1007 REPORT_LOCATION_ARGS(loc)))
ccb2c380 1008
f8220fe2
KW
1009#define ckWARN2reg_d(loc,m, a1) \
1010 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1011 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
d528642a 1012 m REPORT_LOCATION, \
f8220fe2 1013 a1, REPORT_LOCATION_ARGS(loc)))
2335b3d3 1014
f8220fe2
KW
1015#define ckWARN2reg(loc, m, a1) \
1016 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1017 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 1018 m REPORT_LOCATION, \
f8220fe2 1019 a1, REPORT_LOCATION_ARGS(loc)))
ccb2c380 1020
f8220fe2
KW
1021#define vWARN3(loc, m, a1, a2) \
1022 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1023 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 1024 m REPORT_LOCATION, \
f8220fe2 1025 a1, a2, REPORT_LOCATION_ARGS(loc)))
ccb2c380 1026
f8220fe2
KW
1027#define ckWARN3reg(loc, m, a1, a2) \
1028 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1029 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
1030 m REPORT_LOCATION, \
1031 a1, a2, \
f8220fe2 1032 REPORT_LOCATION_ARGS(loc)))
668c081a 1033
f8220fe2
KW
1034#define vWARN4(loc, m, a1, a2, a3) \
1035 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1036 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
1037 m REPORT_LOCATION, \
1038 a1, a2, a3, \
f8220fe2 1039 REPORT_LOCATION_ARGS(loc)))
ccb2c380 1040
f8220fe2
KW
1041#define ckWARN4reg(loc, m, a1, a2, a3) \
1042 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1043 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
1044 m REPORT_LOCATION, \
1045 a1, a2, a3, \
f8220fe2 1046 REPORT_LOCATION_ARGS(loc)))
668c081a 1047
f8220fe2
KW
1048#define vWARN5(loc, m, a1, a2, a3, a4) \
1049 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
1050 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
1051 m REPORT_LOCATION, \
1052 a1, a2, a3, a4, \
f8220fe2 1053 REPORT_LOCATION_ARGS(loc)))
9d1d55b5 1054
aea5f181 1055#define ckWARNexperimental(loc, class, m) \
5aabce33
KW
1056 STMT_START { \
1057 if (! RExC_warned_ ## class) { /* warn once per compilation */ \
1058 RExC_warned_ ## class = 1; \
1059 _WARN_HELPER(loc, packWARN(class), \
aea5f181
KW
1060 Perl_ck_warner_d(aTHX_ packWARN(class), \
1061 m REPORT_LOCATION, \
5aabce33
KW
1062 REPORT_LOCATION_ARGS(loc)));\
1063 } \
1064 } STMT_END
aea5f181 1065
1a322bb4
KW
1066/* Convert between a pointer to a node and its offset from the beginning of the
1067 * program */
1068#define REGNODE_p(offset) (RExC_emit_start + (offset))
1069#define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
1070
538e84ed 1071/* Macros for recording node offsets. 20001227 mjd@plover.com
fac92740
MJD
1072 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
1073 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
1074 * Element 0 holds the number n.
07be1b83 1075 * Position is 1 indexed.
fac92740 1076 */
7122b237 1077#ifndef RE_TRACK_PATTERN_OFFSETS
d7c442ce 1078#define Set_Node_Offset_To_R(offset,byte)
7122b237
YO
1079#define Set_Node_Offset(node,byte)
1080#define Set_Cur_Node_Offset
1081#define Set_Node_Length_To_R(node,len)
1082#define Set_Node_Length(node,len)
6a86c6ad 1083#define Set_Node_Cur_Length(node,start)
538e84ed
KW
1084#define Node_Offset(n)
1085#define Node_Length(n)
7122b237
YO
1086#define Set_Node_Offset_Length(node,offset,len)
1087#define ProgLen(ri) ri->u.proglen
1088#define SetProgLen(ri,x) ri->u.proglen = x
4cbeff97 1089#define Track_Code(code)
7122b237
YO
1090#else
1091#define ProgLen(ri) ri->u.offsets[0]
1092#define SetProgLen(ri,x) ri->u.offsets[0] = x
d7c442ce 1093#define Set_Node_Offset_To_R(offset,byte) STMT_START { \
ccb2c380 1094 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
d7c442ce
KW
1095 __LINE__, (int)(offset), (int)(byte))); \
1096 if((offset) < 0) { \
538e84ed 1097 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
d7c442ce 1098 (int)(offset)); \
ccb2c380 1099 } else { \
d7c442ce 1100 RExC_offsets[2*(offset)-1] = (byte); \
ccb2c380 1101 } \
ccb2c380
MP
1102} STMT_END
1103
88f063b4 1104#define Set_Node_Offset(node,byte) \
1a322bb4 1105 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
ccb2c380
MP
1106#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
1107
1108#define Set_Node_Length_To_R(node,len) STMT_START { \
ccb2c380 1109 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 1110 __LINE__, (int)(node), (int)(len))); \
ccb2c380 1111 if((node) < 0) { \
538e84ed
KW
1112 Perl_croak(aTHX_ "value of node is %d in Length macro", \
1113 (int)(node)); \
ccb2c380
MP
1114 } else { \
1115 RExC_offsets[2*(node)] = (len); \
1116 } \
ccb2c380
MP
1117} STMT_END
1118
1119#define Set_Node_Length(node,len) \
1a322bb4 1120 Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
6a86c6ad
NC
1121#define Set_Node_Cur_Length(node, start) \
1122 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
1123
1124/* Get offsets and lengths */
1a322bb4
KW
1125#define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1126#define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
fac92740 1127
07be1b83 1128#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
1a322bb4
KW
1129 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \
1130 Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \
07be1b83 1131} STMT_END
4cbeff97
TC
1132
1133#define Track_Code(code) STMT_START { code } STMT_END
7122b237 1134#endif
07be1b83
YO
1135
1136#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1137#define EXPERIMENTAL_INPLACESCAN
f427392e 1138#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 1139
cb41e5d6
YO
1140#ifdef DEBUGGING
1141int
6ad9a8ab 1142Perl_re_printf(pTHX_ const char *fmt, ...)
cb41e5d6
YO
1143{
1144 va_list ap;
1145 int result;
1146 PerlIO *f= Perl_debug_log;
1147 PERL_ARGS_ASSERT_RE_PRINTF;
1148 va_start(ap, fmt);
1149 result = PerlIO_vprintf(f, fmt, ap);
1150 va_end(ap);
1151 return result;
1152}
1153
1154int
7b031478 1155Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
cb41e5d6
YO
1156{
1157 va_list ap;
1158 int result;
1159 PerlIO *f= Perl_debug_log;
1160 PERL_ARGS_ASSERT_RE_INDENTF;
1161 va_start(ap, depth);
daeb874b 1162 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
cb41e5d6
YO
1163 result = PerlIO_vprintf(f, fmt, ap);
1164 va_end(ap);
1165 return result;
1166}
1167#endif /* DEBUGGING */
1168
7b031478 1169#define DEBUG_RExC_seen() \
538e84ed 1170 DEBUG_OPTIMISE_MORE_r({ \
88f063b4 1171 Perl_re_printf( aTHX_ "RExC_seen: "); \
538e84ed 1172 \
e384d5c1 1173 if (RExC_seen & REG_ZERO_LEN_SEEN) \
88f063b4 1174 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
538e84ed 1175 \
e384d5c1 1176 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
88f063b4 1177 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
538e84ed 1178 \
e384d5c1 1179 if (RExC_seen & REG_GPOS_SEEN) \
88f063b4 1180 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
538e84ed 1181 \
e384d5c1 1182 if (RExC_seen & REG_RECURSE_SEEN) \
88f063b4 1183 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
538e84ed 1184 \
7b031478 1185 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
88f063b4 1186 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
538e84ed 1187 \
e384d5c1 1188 if (RExC_seen & REG_VERBARG_SEEN) \
88f063b4 1189 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
538e84ed 1190 \
e384d5c1 1191 if (RExC_seen & REG_CUTGROUP_SEEN) \
88f063b4 1192 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
538e84ed 1193 \
e384d5c1 1194 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
88f063b4 1195 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
538e84ed 1196 \
e384d5c1 1197 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
88f063b4 1198 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
538e84ed 1199 \
7b031478 1200 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
88f063b4 1201 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
ee273784 1202 \
88f063b4 1203 Perl_re_printf( aTHX_ "\n"); \
9e9ecfdf
YO
1204 });
1205
fdfb4f21 1206#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
6ad9a8ab 1207 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
fdfb4f21 1208
fdfb4f21 1209
f5a36d78
DM
1210#ifdef DEBUGGING
1211static void
1212S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1213 const char *close_str)
1214{
1215 if (!flags)
1216 return;
1217
1218 Perl_re_printf( aTHX_ "%s", open_str);
11683ecb
DM
1219 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1220 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
f5a36d78
DM
1221 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1222 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1223 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1224 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1225 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1226 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1227 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1228 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1229 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1230 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1231 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1232 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1233 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1234 Perl_re_printf( aTHX_ "%s", close_str);
1235}
1236
1237
1238static void
1239S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1240 U32 depth, int is_inf)
1241{
271b36b1 1242 DECLARE_AND_GET_RE_DEBUG_FLAGS;
f5a36d78
DM
1243
1244 DEBUG_OPTIMISE_MORE_r({
1245 if (!data)
1246 return;
1247 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1248 depth,
1249 where,
1250 (IV)data->pos_min,
1251 (IV)data->pos_delta,
1252 (UV)data->flags
1253 );
1254
1255 S_debug_show_study_flags(aTHX_ data->flags," [","]");
fdfb4f21 1256
f5a36d78
DM
1257 Perl_re_printf( aTHX_
1258 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1259 (IV)data->whilem_c,
1260 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1261 is_inf ? "INF " : ""
1262 );
1263
11683ecb 1264 if (data->last_found) {
37b6262f 1265 int i;
11683ecb
DM
1266 Perl_re_printf(aTHX_
1267 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1268 SvPVX_const(data->last_found),
1269 (IV)data->last_end,
1270 (IV)data->last_start_min,
1271 (IV)data->last_start_max
1272 );
1273
2099df82 1274 for (i = 0; i < 2; i++) {
37b6262f
DM
1275 Perl_re_printf(aTHX_
1276 " %s%s: '%s' @ %" IVdf "/%" IVdf,
2099df82
DM
1277 data->cur_is_floating == i ? "*" : "",
1278 i ? "Float" : "Fixed",
37b6262f
DM
1279 SvPVX_const(data->substrs[i].str),
1280 (IV)data->substrs[i].min_offset,
1281 (IV)data->substrs[i].max_offset
1282 );
1283 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1284 }
11683ecb 1285 }
f5a36d78
DM
1286
1287 Perl_re_printf( aTHX_ "\n");
1288 });
1289}
1290
1291
1292static void
1293S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1294 regnode *scan, U32 depth, U32 flags)
1295{
271b36b1 1296 DECLARE_AND_GET_RE_DEBUG_FLAGS;
f5a36d78
DM
1297
1298 DEBUG_OPTIMISE_r({
1299 regnode *Next;
1300
1301 if (!scan)
1302 return;
1303 Next = regnext(scan);
1304 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1305 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1306 depth,
1307 str,
1308 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1309 Next ? (REG_NODE_NUM(Next)) : 0 );
1310 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1311 Perl_re_printf( aTHX_ "\n");
1312 });
1313}
1314
1315
1316# define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1317 S_debug_studydata(aTHX_ where, data, depth, is_inf)
1318
1319# define DEBUG_PEEP(str, scan, depth, flags) \
1320 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1321
1322#else
1323# define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1324# define DEBUG_PEEP(str, scan, depth, flags) NOOP
1325#endif
1de06328 1326
cb41e5d6 1327
c6871b76
KW
1328/* =========================================================
1329 * BEGIN edit_distance stuff.
1330 *
1331 * This calculates how many single character changes of any type are needed to
1332 * transform a string into another one. It is taken from version 3.1 of
1333 *
1334 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1335 */
1336
1337/* Our unsorted dictionary linked list. */
1338/* Note we use UVs, not chars. */
1339
1340struct dictionary{
1341 UV key;
1342 UV value;
1343 struct dictionary* next;
1344};
1345typedef struct dictionary item;
1346
1347
1348PERL_STATIC_INLINE item*
88f063b4 1349push(UV key, item* curr)
c6871b76
KW
1350{
1351 item* head;
d09f14bf 1352 Newx(head, 1, item);
c6871b76
KW
1353 head->key = key;
1354 head->value = 0;
1355 head->next = curr;
1356 return head;
1357}
1358
1359
1360PERL_STATIC_INLINE item*
1361find(item* head, UV key)
1362{
1363 item* iterator = head;
1364 while (iterator){
1365 if (iterator->key == key){
1366 return iterator;
1367 }
1368 iterator = iterator->next;
1369 }
1370
1371 return NULL;
1372}
1373
1374PERL_STATIC_INLINE item*
88f063b4 1375uniquePush(item* head, UV key)
c6871b76
KW
1376{
1377 item* iterator = head;
1378
1379 while (iterator){
1380 if (iterator->key == key) {
1381 return head;
1382 }
1383 iterator = iterator->next;
1384 }
1385
88f063b4 1386 return push(key, head);
c6871b76
KW
1387}
1388
1389PERL_STATIC_INLINE void
1390dict_free(item* head)
1391{
1392 item* iterator = head;
1393
1394 while (iterator) {
1395 item* temp = iterator;
1396 iterator = iterator->next;
1397 Safefree(temp);
1398 }
1399
1400 head = NULL;
1401}
1402
1403/* End of Dictionary Stuff */
1404
1405/* All calculations/work are done here */
1406STATIC int
1407S_edit_distance(const UV* src,
1408 const UV* tgt,
1409 const STRLEN x, /* length of src[] */
1410 const STRLEN y, /* length of tgt[] */
1411 const SSize_t maxDistance
1412)
1413{
1414 item *head = NULL;
88f063b4 1415 UV swapCount, swapScore, targetCharCount, i, j;
c6871b76
KW
1416 UV *scores;
1417 UV score_ceil = x + y;
1418
1419 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1420
1421 /* intialize matrix start values */
d09f14bf 1422 Newx(scores, ( (x + 2) * (y + 2)), UV);
c6871b76
KW
1423 scores[0] = score_ceil;
1424 scores[1 * (y + 2) + 0] = score_ceil;
1425 scores[0 * (y + 2) + 1] = score_ceil;
1426 scores[1 * (y + 2) + 1] = 0;
88f063b4 1427 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
c6871b76
KW
1428
1429 /* work loops */
1430 /* i = src index */
1431 /* j = tgt index */
1432 for (i=1;i<=x;i++) {
1433 if (i < x)
88f063b4 1434 head = uniquePush(head, src[i]);
c6871b76
KW
1435 scores[(i+1) * (y + 2) + 1] = i;
1436 scores[(i+1) * (y + 2) + 0] = score_ceil;
1437 swapCount = 0;
1438
1439 for (j=1;j<=y;j++) {
1440 if (i == 1) {
1441 if(j < y)
88f063b4 1442 head = uniquePush(head, tgt[j]);
c6871b76
KW
1443 scores[1 * (y + 2) + (j + 1)] = j;
1444 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1445 }
1446
88f063b4 1447 targetCharCount = find(head, tgt[j-1])->value;
c6871b76
KW
1448 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1449
1450 if (src[i-1] != tgt[j-1]){
1451 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1452 }
1453 else {
1454 swapCount = j;
1455 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1456 }
1457 }
1458
88f063b4 1459 find(head, src[i-1])->value = i;
c6871b76
KW
1460 }
1461
1462 {
1463 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1464 dict_free(head);
1465 Safefree(scores);
1466 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1467 }
1468}
1469
1470/* END of edit_distance() stuff
1471 * ========================================================= */
1472
653099ff 1473/* Mark that we cannot extend a found fixed substring at this point.
37b6262f 1474 Update the longest found anchored substring or the longest found
653099ff
GS
1475 floating substrings if needed. */
1476
4327152a 1477STATIC void
ea3daa5d
FC
1478S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1479 SSize_t *minlenp, int is_inf)
c277df42 1480{
e1ec3a88 1481 const STRLEN l = CHR_SVLEN(data->last_found);
2099df82 1482 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
a2ca2017 1483 const STRLEN old_l = CHR_SVLEN(longest_sv);
271b36b1 1484 DECLARE_AND_GET_RE_DEBUG_FLAGS;
b81d288d 1485
7918f24d
NC
1486 PERL_ARGS_ASSERT_SCAN_COMMIT;
1487
c277df42 1488 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
2099df82 1489 const U8 i = data->cur_is_floating;
a2ca2017 1490 SvSetMagicSV(longest_sv, data->last_found);
37b6262f
DM
1491 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1492
2099df82
DM
1493 if (!i) /* fixed */
1494 data->substrs[0].max_offset = data->substrs[0].min_offset;
1495 else { /* float */
f6231ebf
KW
1496 data->substrs[1].max_offset =
1497 (is_inf)
1498 ? OPTIMIZE_INFTY
1499 : (l
646e8787 1500 ? data->last_start_max
d23733db
HS
1501 /* temporary underflow guard for 5.32 */
1502 : data->pos_delta < 0 ? OPTIMIZE_INFTY
0069caf1
KW
1503 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
1504 ? OPTIMIZE_INFTY
ea3daa5d 1505 : data->pos_min + data->pos_delta));
37b6262f 1506 }
11683ecb 1507
5de22a40
HS
1508 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1509 data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
37b6262f
DM
1510 data->substrs[i].minlenp = minlenp;
1511 data->substrs[i].lookbehind = 0;
c277df42 1512 }
37b6262f 1513
c277df42 1514 SvCUR_set(data->last_found, 0);
0eda9292 1515 {
a28509cc 1516 SV * const sv = data->last_found;
097eb12c
AL
1517 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1518 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1519 if (mg)
1520 mg->mg_len = 0;
1521 }
0eda9292 1522 }
c277df42
IZ
1523 data->last_end = -1;
1524 data->flags &= ~SF_BEFORE_EOL;
f5a36d78 1525 DEBUG_STUDYDATA("commit", data, 0, is_inf);
c277df42
IZ
1526}
1527
cdd87c1d
KW
1528/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1529 * list that describes which code points it matches */
1530
653099ff 1531STATIC void
3420edd7 1532S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 1533{
cdd87c1d
KW
1534 /* Set the SSC 'ssc' to match an empty string or any code point */
1535
557bd3fb 1536 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 1537
71068078 1538 assert(is_ANYOF_SYNTHETIC(ssc));
3fffb88a 1539
0854ea0b
KW
1540 /* mortalize so won't leak */
1541 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
93e92956 1542 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
653099ff
GS
1543}
1544
653099ff 1545STATIC int
dc3bf405 1546S_ssc_is_anything(const regnode_ssc *ssc)
653099ff 1547{
c144baaa
KW
1548 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1549 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1550 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1551 * in any way, so there's no point in using it */
cdd87c1d
KW
1552
1553 UV start, end;
1554 bool ret;
653099ff 1555
557bd3fb 1556 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 1557
71068078 1558 assert(is_ANYOF_SYNTHETIC(ssc));
cdd87c1d 1559
93e92956 1560 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
cdd87c1d
KW
1561 return FALSE;
1562 }
1563
1564 /* See if the list consists solely of the range 0 - Infinity */
1565 invlist_iterinit(ssc->invlist);
1566 ret = invlist_iternext(ssc->invlist, &start, &end)
1567 && start == 0
1568 && end == UV_MAX;
1569
1570 invlist_iterfinish(ssc->invlist);
1571
1572 if (ret) {
1573 return TRUE;
1574 }
1575
1576 /* If e.g., both \w and \W are set, matches everything */
e0e1be5f 1577 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1578 int i;
1579 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1580 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1581 return TRUE;
1582 }
1583 }
1584 }
1585
1586 return FALSE;
653099ff
GS
1587}
1588
653099ff 1589STATIC void
cdd87c1d 1590S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 1591{
cdd87c1d
KW
1592 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1593 * string, any code point, or any posix class under locale */
1594
557bd3fb 1595 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 1596
557bd3fb 1597 Zero(ssc, 1, regnode_ssc);
71068078 1598 set_ANYOF_SYNTHETIC(ssc);
93e92956 1599 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
3420edd7 1600 ssc_anything(ssc);
cdd87c1d 1601
2f306ab9
KW
1602 /* If any portion of the regex is to operate under locale rules that aren't
1603 * fully known at compile time, initialization includes it. The reason
1604 * this isn't done for all regexes is that the optimizer was written under
1605 * the assumption that locale was all-or-nothing. Given the complexity and
1606 * lack of documentation in the optimizer, and that there are inadequate
1607 * test cases for locale, many parts of it may not work properly, it is
1608 * safest to avoid locale unless necessary. */
cdd87c1d
KW
1609 if (RExC_contains_locale) {
1610 ANYOF_POSIXL_SETALL(ssc);
cdd87c1d
KW
1611 }
1612 else {
1613 ANYOF_POSIXL_ZERO(ssc);
1614 }
653099ff
GS
1615}
1616
b423522f 1617STATIC int
dc3bf405
BF
1618S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1619 const regnode_ssc *ssc)
b423522f
KW
1620{
1621 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1622 * to the list of code points matched, and locale posix classes; hence does
1623 * not check its flags) */
1624
1625 UV start, end;
1626 bool ret;
1627
1628 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1629
71068078 1630 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1631
1632 invlist_iterinit(ssc->invlist);
1633 ret = invlist_iternext(ssc->invlist, &start, &end)
1634 && start == 0
1635 && end == UV_MAX;
1636
1637 invlist_iterfinish(ssc->invlist);
1638
1639 if (! ret) {
1640 return FALSE;
1641 }
1642
e0e1be5f 1643 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
31f05a37 1644 return FALSE;
b423522f
KW
1645 }
1646
1647 return TRUE;
1648}
1649
4ebed06a
KW
1650#define INVLIST_INDEX 0
1651#define ONLY_LOCALE_MATCHES_INDEX 1
1652#define DEFERRED_USER_DEFINED_INDEX 2
1653
b423522f
KW
1654STATIC SV*
1655S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
5c0f85ef 1656 const regnode_charclass* const node)
b423522f
KW
1657{
1658 /* Returns a mortal inversion list defining which code points are matched
1659 * by 'node', which is of type ANYOF. Handles complementing the result if
1660 * appropriate. If some code points aren't knowable at this time, the
31f05a37
KW
1661 * returned list must, and will, contain every code point that is a
1662 * possibility. */
b423522f 1663
1565c085 1664 dVAR;
e2506fa7 1665 SV* invlist = NULL;
1ee208c4 1666 SV* only_utf8_locale_invlist = NULL;
b423522f
KW
1667 unsigned int i;
1668 const U32 n = ARG(node);
31f05a37 1669 bool new_node_has_latin1 = FALSE;
2d5613be 1670 const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb))
29a889ef
KW
1671 ? 0
1672 : ANYOF_FLAGS(node);
b423522f
KW
1673
1674 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1675
1676 /* Look at the data structure created by S_set_ANYOF_arg() */
93e92956 1677 if (n != ANYOF_ONLY_HAS_BITMAP) {
b423522f
KW
1678 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1679 AV * const av = MUTABLE_AV(SvRV(rv));
1680 SV **const ary = AvARRAY(av);
1681 assert(RExC_rxi->data->what[n] == 's');
1682
4ebed06a 1683 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
b423522f 1684
6ed02bb6
KW
1685 /* Here there are things that won't be known until runtime -- we
1686 * have to assume it could be anything */
e2506fa7 1687 invlist = sv_2mortal(_new_invlist(1));
b423522f
KW
1688 return _add_range_to_invlist(invlist, 0, UV_MAX);
1689 }
4ebed06a 1690 else if (ary[INVLIST_INDEX]) {
b423522f 1691
6ed02bb6 1692 /* Use the node's inversion list */
4ebed06a 1693 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1ee208c4
KW
1694 }
1695
1696 /* Get the code points valid only under UTF-8 locales */
766d6d33 1697 if ( (flags & ANYOFL_FOLD)
4ebed06a 1698 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1ee208c4 1699 {
4ebed06a 1700 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
b423522f
KW
1701 }
1702 }
1703
e2506fa7
KW
1704 if (! invlist) {
1705 invlist = sv_2mortal(_new_invlist(0));
1706 }
1707
dcb20b36
KW
1708 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1709 * code points, and an inversion list for the others, but if there are code
1710 * points that should match only conditionally on the target string being
1711 * UTF-8, those are placed in the inversion list, and not the bitmap.
1712 * Since there are circumstances under which they could match, they are
1713 * included in the SSC. But if the ANYOF node is to be inverted, we have
1714 * to exclude them here, so that when we invert below, the end result
1715 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1716 * have to do this here before we add the unconditionally matched code
1717 * points */
766d6d33 1718 if (flags & ANYOF_INVERT) {
b423522f
KW
1719 _invlist_intersection_complement_2nd(invlist,
1720 PL_UpperLatin1,
1721 &invlist);
1722 }
1723
1724 /* Add in the points from the bit map */
2d5613be 1725 if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) {
90973738
KW
1726 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1727 if (ANYOF_BITMAP_TEST(node, i)) {
1728 unsigned int start = i++;
1729
1730 for (; i < NUM_ANYOF_CODE_POINTS
1731 && ANYOF_BITMAP_TEST(node, i); ++i)
1732 {
1733 /* empty */
1734 }
1735 invlist = _add_range_to_invlist(invlist, start, i-1);
1736 new_node_has_latin1 = TRUE;
6f8848d5 1737 }
b423522f
KW
1738 }
1739 }
1740
1741 /* If this can match all upper Latin1 code points, have to add them
ac33c516
KW
1742 * as well. But don't add them if inverting, as when that gets done below,
1743 * it would exclude all these characters, including the ones it shouldn't
1744 * that were added just above */
766d6d33
KW
1745 if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
1746 && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
f240c685 1747 {
b423522f
KW
1748 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1749 }
1750
1751 /* Similarly for these */
766d6d33 1752 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
e0a1ff7a 1753 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
b423522f
KW
1754 }
1755
766d6d33 1756 if (flags & ANYOF_INVERT) {
b423522f
KW
1757 _invlist_invert(invlist);
1758 }
766d6d33 1759 else if (flags & ANYOFL_FOLD) {
35b8412f 1760 if (new_node_has_latin1) {
31f05a37 1761
26be5fe6
KW
1762 /* Under /li, any 0-255 could fold to any other 0-255, depending on
1763 * the locale. We can skip this if there are no 0-255 at all. */
1764 _invlist_union(invlist, PL_Latin1, &invlist);
35b8412f
KW
1765
1766 invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1767 invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1768 }
1769 else {
1770 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1771 invlist = add_cp_to_invlist(invlist, 'I');
1772 }
1773 if (_invlist_contains_cp(invlist,
1774 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1775 {
1776 invlist = add_cp_to_invlist(invlist, 'i');
1777 }
1778 }
31f05a37
KW
1779 }
1780
1ee208c4
KW
1781 /* Similarly add the UTF-8 locale possible matches. These have to be
1782 * deferred until after the non-UTF-8 locale ones are taken care of just
1783 * above, or it leads to wrong results under ANYOF_INVERT */
1784 if (only_utf8_locale_invlist) {
31f05a37 1785 _invlist_union_maybe_complement_2nd(invlist,
1ee208c4 1786 only_utf8_locale_invlist,
766d6d33 1787 flags & ANYOF_INVERT,
31f05a37
KW
1788 &invlist);
1789 }
b423522f
KW
1790
1791 return invlist;
1792}
1793
1051e1c4 1794/* These two functions currently do the exact same thing */
557bd3fb 1795#define ssc_init_zero ssc_init
653099ff 1796
cdd87c1d
KW
1797#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1798#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1799
557bd3fb 1800/* 'AND' a given class with another one. Can create false positives. 'ssc'
93e92956
KW
1801 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1802 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1803
653099ff 1804STATIC void
b423522f 1805S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1806 const regnode_charclass *and_with)
653099ff 1807{
cdd87c1d
KW
1808 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1809 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1810
a0dd4231 1811 SV* anded_cp_list;
2d5613be 1812 U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb)
29a889ef
KW
1813 ? 0
1814 : ANYOF_FLAGS(and_with);
a0dd4231 1815 U8 anded_flags;
1e6ade67 1816
cdd87c1d 1817 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1818
71068078 1819 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1820
1821 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1822 * the code point inversion list and just the relevant flags */
71068078 1823 if (is_ANYOF_SYNTHETIC(and_with)) {
7dcac5f6 1824 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
765e6ecf 1825 anded_flags = and_with_flags;
e9b08962
KW
1826
1827 /* XXX This is a kludge around what appears to be deficiencies in the
1828 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1829 * there are paths through the optimizer where it doesn't get weeded
1830 * out when it should. And if we don't make some extra provision for
1831 * it like the code just below, it doesn't get added when it should.
1832 * This solution is to add it only when AND'ing, which is here, and
1833 * only when what is being AND'ed is the pristine, original node
1834 * matching anything. Thus it is like adding it to ssc_anything() but
1835 * only when the result is to be AND'ed. Probably the same solution
1836 * could be adopted for the same problem we have with /l matching,
1837 * which is solved differently in S_ssc_init(), and that would lead to
1838 * fewer false positives than that solution has. But if this solution
1839 * creates bugs, the consequences are only that a warning isn't raised
1840 * that should be; while the consequences for having /l bugs is
1841 * incorrect matches */
7dcac5f6 1842 if (ssc_is_anything((regnode_ssc *)and_with)) {
f240c685 1843 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
e9b08962 1844 }
a0dd4231
KW
1845 }
1846 else {
5c0f85ef 1847 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
f240c685 1848 if (OP(and_with) == ANYOFD) {
765e6ecf 1849 anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
f240c685
KW
1850 }
1851 else {
765e6ecf 1852 anded_flags = and_with_flags
f240c685 1853 &( ANYOF_COMMON_FLAGS
108316fb
KW
1854 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1855 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
765e6ecf 1856 if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
d1c40ef5
KW
1857 anded_flags &=
1858 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1859 }
f240c685 1860 }
a0dd4231
KW
1861 }
1862
1863 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1864
1865 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1866 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1867 * 'and_with' may be inverted. When not inverted, we have the situation of
1868 * computing:
1869 * (C1 | P1) & (C2 | P2)
1870 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1871 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1872 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1873 * <= ((C1 & C2) | P1 | P2)
1874 * Alternatively, the last few steps could be:
1875 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1876 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1877 * <= (C1 | C2 | (P1 & P2))
1878 * We favor the second approach if either P1 or P2 is non-empty. This is
1879 * because these components are a barrier to doing optimizations, as what
1880 * they match cannot be known until the moment of matching as they are
1881 * dependent on the current locale, 'AND"ing them likely will reduce or
1882 * eliminate them.
1883 * But we can do better if we know that C1,P1 are in their initial state (a
1884 * frequent occurrence), each matching everything:
1885 * (<everything>) & (C2 | P2) = C2 | P2
1886 * Similarly, if C2,P2 are in their initial state (again a frequent
1887 * occurrence), the result is a no-op
1888 * (C1 | P1) & (<everything>) = C1 | P1
1889 *
1890 * Inverted, we have
1891 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1892 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1893 * <= (C1 & ~C2) | (P1 & ~P2)
1894 * */
1aa99e6b 1895
765e6ecf 1896 if ((and_with_flags & ANYOF_INVERT)
71068078 1897 && ! is_ANYOF_SYNTHETIC(and_with))
a0dd4231 1898 {
cdd87c1d 1899 unsigned int i;
8951c461 1900
cdd87c1d
KW
1901 ssc_intersection(ssc,
1902 anded_cp_list,
1903 FALSE /* Has already been inverted */
1904 );
c6b76537 1905
cdd87c1d
KW
1906 /* If either P1 or P2 is empty, the intersection will be also; can skip
1907 * the loop */
765e6ecf 1908 if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
cdd87c1d
KW
1909 ANYOF_POSIXL_ZERO(ssc);
1910 }
e0e1be5f 1911 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1912
1913 /* Note that the Posix class component P from 'and_with' actually
1914 * looks like:
1915 * P = Pa | Pb | ... | Pn
1916 * where each component is one posix class, such as in [\w\s].
1917 * Thus
1918 * ~P = ~(Pa | Pb | ... | Pn)
1919 * = ~Pa & ~Pb & ... & ~Pn
1920 * <= ~Pa | ~Pb | ... | ~Pn
1921 * The last is something we can easily calculate, but unfortunately
1922 * is likely to have many false positives. We could do better
1923 * in some (but certainly not all) instances if two classes in
1924 * P have known relationships. For example
1925 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1926 * So
1927 * :lower: & :print: = :lower:
1928 * And similarly for classes that must be disjoint. For example,
1929 * since \s and \w can have no elements in common based on rules in
1930 * the POSIX standard,
1931 * \w & ^\S = nothing
1932 * Unfortunately, some vendor locales do not meet the Posix
1933 * standard, in particular almost everything by Microsoft.
1934 * The loop below just changes e.g., \w into \W and vice versa */
1935
1ee208c4 1936 regnode_charclass_posixl temp;
cdd87c1d
KW
1937 int add = 1; /* To calculate the index of the complement */
1938
b1234259 1939 Zero(&temp, 1, regnode_charclass_posixl);
cdd87c1d
KW
1940 ANYOF_POSIXL_ZERO(&temp);
1941 for (i = 0; i < ANYOF_MAX; i++) {
1942 assert(i % 2 != 0
7dcac5f6
KW
1943 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1944 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
cdd87c1d 1945
7dcac5f6 1946 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
cdd87c1d
KW
1947 ANYOF_POSIXL_SET(&temp, i + add);
1948 }
1949 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1950 }
1951 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1952
cdd87c1d
KW
1953 } /* else ssc already has no posixes */
1954 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1955 in its initial state */
71068078 1956 else if (! is_ANYOF_SYNTHETIC(and_with)
7dcac5f6 1957 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
cdd87c1d
KW
1958 {
1959 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1960 * copy it over 'ssc' */
1961 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
71068078 1962 if (is_ANYOF_SYNTHETIC(and_with)) {
cdd87c1d
KW
1963 StructCopy(and_with, ssc, regnode_ssc);
1964 }
1965 else {
1966 ssc->invlist = anded_cp_list;
1967 ANYOF_POSIXL_ZERO(ssc);
765e6ecf 1968 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1969 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
cdd87c1d
KW
1970 }
1971 }
1972 }
e0e1be5f 1973 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
765e6ecf 1974 || (and_with_flags & ANYOF_MATCHES_POSIXL))
cdd87c1d
KW
1975 {
1976 /* One or the other of P1, P2 is non-empty. */
765e6ecf 1977 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1ea8b7fe
KW
1978 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1979 }
cdd87c1d
KW
1980 ssc_union(ssc, anded_cp_list, FALSE);
1981 }
1982 else { /* P1 = P2 = empty */
1983 ssc_intersection(ssc, anded_cp_list, FALSE);
1984 }
137165a6 1985 }
653099ff
GS
1986}
1987
653099ff 1988STATIC void
cdd87c1d 1989S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1990 const regnode_charclass *or_with)
653099ff 1991{
cdd87c1d
KW
1992 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1993 * another SSC or a regular ANYOF class. Can create false positives if
1994 * 'or_with' is to be inverted. */
7918f24d 1995
a0dd4231
KW
1996 SV* ored_cp_list;
1997 U8 ored_flags;
2d5613be 1998 U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb)
29a889ef
KW
1999 ? 0
2000 : ANYOF_FLAGS(or_with);
c6b76537 2001
cdd87c1d 2002 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 2003
71068078 2004 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
2005
2006 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
2007 * the code point inversion list and just the relevant flags */
71068078 2008 if (is_ANYOF_SYNTHETIC(or_with)) {
7dcac5f6 2009 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
765e6ecf 2010 ored_flags = or_with_flags;
a0dd4231
KW
2011 }
2012 else {
5c0f85ef 2013 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
765e6ecf 2014 ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
f240c685
KW
2015 if (OP(or_with) != ANYOFD) {
2016 ored_flags
765e6ecf 2017 |= or_with_flags
108316fb
KW
2018 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2019 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
765e6ecf 2020 if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
d1c40ef5
KW
2021 ored_flags |=
2022 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
2023 }
f240c685 2024 }
a0dd4231
KW
2025 }
2026
2027 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
2028
2029 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
2030 * C2 is the list of code points in 'or-with'; P2, its posix classes.
2031 * 'or_with' may be inverted. When not inverted, we have the simple
2032 * situation of computing:
2033 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
2034 * If P1|P2 yields a situation with both a class and its complement are
2035 * set, like having both \w and \W, this matches all code points, and we
2036 * can delete these from the P component of the ssc going forward. XXX We
2037 * might be able to delete all the P components, but I (khw) am not certain
2038 * about this, and it is better to be safe.
2039 *
2040 * Inverted, we have
2041 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
2042 * <= (C1 | P1) | ~C2
2043 * <= (C1 | ~C2) | P1
2044 * (which results in actually simpler code than the non-inverted case)
2045 * */
9826f543 2046
765e6ecf 2047 if ((or_with_flags & ANYOF_INVERT)
71068078 2048 && ! is_ANYOF_SYNTHETIC(or_with))
a0dd4231 2049 {
cdd87c1d 2050 /* We ignore P2, leaving P1 going forward */
1ea8b7fe 2051 } /* else Not inverted */
765e6ecf 2052 else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
7dcac5f6 2053 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
e0e1be5f 2054 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
2055 unsigned int i;
2056 for (i = 0; i < ANYOF_MAX; i += 2) {
2057 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
2058 {
2059 ssc_match_all_cp(ssc);
2060 ANYOF_POSIXL_CLEAR(ssc, i);
2061 ANYOF_POSIXL_CLEAR(ssc, i+1);
cdd87c1d
KW
2062 }
2063 }
2064 }
1aa99e6b 2065 }
cdd87c1d
KW
2066
2067 ssc_union(ssc,
2068 ored_cp_list,
2069 FALSE /* Already has been inverted */
2070 );
653099ff
GS
2071}
2072
19bb2455 2073STATIC void
b423522f
KW
2074S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
2075{
2076 PERL_ARGS_ASSERT_SSC_UNION;
2077
71068078 2078 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2079
2080 _invlist_union_maybe_complement_2nd(ssc->invlist,
2081 invlist,
2082 invert2nd,
2083 &ssc->invlist);
2084}
2085
19bb2455 2086STATIC void
b423522f
KW
2087S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2088 SV* const invlist,
2089 const bool invert2nd)
2090{
2091 PERL_ARGS_ASSERT_SSC_INTERSECTION;
2092
71068078 2093 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2094
2095 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2096 invlist,
2097 invert2nd,
2098 &ssc->invlist);
2099}
2100
19bb2455 2101STATIC void
b423522f
KW
2102S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2103{
2104 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2105
71068078 2106 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2107
2108 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2109}
2110
19bb2455 2111STATIC void
b423522f
KW
2112S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2113{
2114 /* AND just the single code point 'cp' into the SSC 'ssc' */
2115
2116 SV* cp_list = _new_invlist(2);
2117
2118 PERL_ARGS_ASSERT_SSC_CP_AND;
2119
71068078 2120 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2121
2122 cp_list = add_cp_to_invlist(cp_list, cp);
2123 ssc_intersection(ssc, cp_list,
2124 FALSE /* Not inverted */
2125 );
2126 SvREFCNT_dec_NN(cp_list);
2127}
2128
19bb2455 2129STATIC void
dc3bf405 2130S_ssc_clear_locale(regnode_ssc *ssc)
b423522f
KW
2131{
2132 /* Set the SSC 'ssc' to not match any locale things */
b423522f
KW
2133 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2134
71068078 2135 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2136
2137 ANYOF_POSIXL_ZERO(ssc);
2138 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2139}
2140
b35552de
KW
2141#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2142
2143STATIC bool
2144S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2145{
2146 /* The synthetic start class is used to hopefully quickly winnow down
2147 * places where a pattern could start a match in the target string. If it
2148 * doesn't really narrow things down that much, there isn't much point to
2149 * having the overhead of using it. This function uses some very crude
2150 * heuristics to decide if to use the ssc or not.
2151 *
2152 * It returns TRUE if 'ssc' rules out more than half what it considers to
2153 * be the "likely" possible matches, but of course it doesn't know what the
2154 * actual things being matched are going to be; these are only guesses
2155 *
2156 * For /l matches, it assumes that the only likely matches are going to be
2157 * in the 0-255 range, uniformly distributed, so half of that is 127
2158 * For /a and /d matches, it assumes that the likely matches will be just
2159 * the ASCII range, so half of that is 63
2160 * For /u and there isn't anything matching above the Latin1 range, it
2161 * assumes that that is the only range likely to be matched, and uses
2162 * half that as the cut-off: 127. If anything matches above Latin1,
2163 * it assumes that all of Unicode could match (uniformly), except for
2164 * non-Unicode code points and things in the General Category "Other"
2165 * (unassigned, private use, surrogates, controls and formats). This
2166 * is a much large number. */
2167
b35552de
KW
2168 U32 count = 0; /* Running total of number of code points matched by
2169 'ssc' */
2170 UV start, end; /* Start and end points of current range in inversion
26be5fe6 2171 XXX outdated. UTF-8 locales are common, what about invert? list */
72400949
KW
2172 const U32 max_code_points = (LOC)
2173 ? 256
f6e8f31e 2174 : (( ! UNI_SEMANTICS
1a26cbcb 2175 || invlist_highest(ssc->invlist) < 256)
72400949
KW
2176 ? 128
2177 : NON_OTHER_COUNT);
2178 const U32 max_match = max_code_points / 2;
b35552de
KW
2179
2180 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2181
2182 invlist_iterinit(ssc->invlist);
2183 while (invlist_iternext(ssc->invlist, &start, &end)) {
72400949
KW
2184 if (start >= max_code_points) {
2185 break;
b35552de 2186 }
72400949 2187 end = MIN(end, max_code_points - 1);
b35552de 2188 count += end - start + 1;
72400949 2189 if (count >= max_match) {
b35552de
KW
2190 invlist_iterfinish(ssc->invlist);
2191 return FALSE;
2192 }
2193 }
2194
2195 return TRUE;
2196}
2197
2198
b423522f
KW
2199STATIC void
2200S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2201{
2202 /* The inversion list in the SSC is marked mortal; now we need a more
2203 * permanent copy, which is stored the same way that is done in a regular
dcb20b36
KW
2204 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2205 * map */
b423522f 2206
28118b9c 2207 SV* invlist = invlist_clone(ssc->invlist, NULL);
b423522f
KW
2208
2209 PERL_ARGS_ASSERT_SSC_FINALIZE;
2210
71068078 2211 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f 2212
a0dd4231 2213 /* The code in this file assumes that all but these flags aren't relevant
93e92956
KW
2214 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2215 * by the time we reach here */
f240c685
KW
2216 assert(! (ANYOF_FLAGS(ssc)
2217 & ~( ANYOF_COMMON_FLAGS
108316fb
KW
2218 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2219 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
a0dd4231 2220
b423522f
KW
2221 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2222
4c404f26 2223 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
21c3fd9d 2224 SvREFCNT_dec(invlist);
b423522f 2225
85c8e306
KW
2226 /* Make sure is clone-safe */
2227 ssc->invlist = NULL;
2228
e0e1be5f 2229 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
93e92956 2230 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
d156f5cb 2231 OP(ssc) = ANYOFPOSIXL;
e0e1be5f 2232 }
d156f5cb 2233 else if (RExC_contains_locale) {
b2e90ddf
KW
2234 OP(ssc) = ANYOFL;
2235 }
2236
1462525b 2237 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
b423522f
KW
2238}
2239
a3621e74
YO
2240#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2241#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2242#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
538e84ed
KW
2243#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2244 ? (TRIE_LIST_CUR( idx ) - 1) \
2245 : 0 )
a3621e74 2246
3dab1dad
YO
2247
2248#ifdef DEBUGGING
07be1b83 2249/*
2b8b4781
NC
2250 dump_trie(trie,widecharmap,revcharmap)
2251 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2252 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
2253
2254 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
2255 The _interim_ variants are used for debugging the interim
2256 tables that are used to generate the final compressed
2257 representation which is what dump_trie expects.
2258
486ec47a 2259 Part of the reason for their existence is to provide a form
3dab1dad 2260 of documentation as to how the different representations function.
07be1b83
YO
2261
2262*/
3dab1dad
YO
2263
2264/*
3dab1dad
YO
2265 Dumps the final compressed table form of the trie to Perl_debug_log.
2266 Used for debugging make_trie().
2267*/
b9a59e08 2268
3dab1dad 2269STATIC void
2b8b4781
NC
2270S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2271 AV *revcharmap, U32 depth)
3dab1dad
YO
2272{
2273 U32 state;
ab3bbdeb 2274 SV *sv=sv_newmortal();
55eed653 2275 int colwidth= widecharmap ? 6 : 4;
2e64971a 2276 U16 word;
271b36b1 2277 DECLARE_AND_GET_RE_DEBUG_FLAGS;
3dab1dad 2278
7918f24d 2279 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 2280
6ad9a8ab 2281 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
1e37780e 2282 depth+1, "Match","Base","Ofs" );
3dab1dad
YO
2283
2284 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 2285 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 2286 if ( tmp ) {
6ad9a8ab 2287 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2288 colwidth,
538e84ed 2289 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2290 PL_colors[0], PL_colors[1],
2291 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2292 PERL_PV_ESCAPE_FIRSTCHAR
2293 )
ab3bbdeb 2294 );
3dab1dad
YO
2295 }
2296 }
1e37780e
YO
2297 Perl_re_printf( aTHX_ "\n");
2298 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
3dab1dad
YO
2299
2300 for( state = 0 ; state < trie->uniquecharcount ; state++ )
6ad9a8ab
YO
2301 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2302 Perl_re_printf( aTHX_ "\n");
3dab1dad 2303
1e2e3d02 2304 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 2305 const U32 base = trie->states[ state ].trans.base;
3dab1dad 2306
147e3846 2307 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
3dab1dad
YO
2308
2309 if ( trie->states[ state ].wordnum ) {
1e37780e 2310 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
3dab1dad 2311 } else {
6ad9a8ab 2312 Perl_re_printf( aTHX_ "%6s", "" );
3dab1dad
YO
2313 }
2314
147e3846 2315 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
3dab1dad
YO
2316
2317 if ( base ) {
2318 U32 ofs = 0;
2319
2320 while( ( base + ofs < trie->uniquecharcount ) ||
2321 ( base + ofs - trie->uniquecharcount < trie->lasttrans
538e84ed
KW
2322 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2323 != state))
3dab1dad
YO
2324 ofs++;
2325
147e3846 2326 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
3dab1dad
YO
2327
2328 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
538e84ed
KW
2329 if ( ( base + ofs >= trie->uniquecharcount )
2330 && ( base + ofs - trie->uniquecharcount
2331 < trie->lasttrans )
2332 && trie->trans[ base + ofs
2333 - trie->uniquecharcount ].check == state )
3dab1dad 2334 {
147e3846 2335 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
1e37780e
YO
2336 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2337 );
3dab1dad 2338 } else {
88f063b4 2339 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
3dab1dad
YO
2340 }
2341 }
2342
6ad9a8ab 2343 Perl_re_printf( aTHX_ "]");
3dab1dad
YO
2344
2345 }
6ad9a8ab 2346 Perl_re_printf( aTHX_ "\n" );
3dab1dad 2347 }
6ad9a8ab 2348 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
cb41e5d6 2349 depth);
2e64971a 2350 for (word=1; word <= trie->wordcount; word++) {
6ad9a8ab 2351 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2e64971a
DM
2352 (int)word, (int)(trie->wordinfo[word].prev),
2353 (int)(trie->wordinfo[word].len));
2354 }
6ad9a8ab 2355 Perl_re_printf( aTHX_ "\n" );
538e84ed 2356}
3dab1dad 2357/*
3dab1dad 2358 Dumps a fully constructed but uncompressed trie in list form.
538e84ed 2359 List tries normally only are used for construction when the number of
3dab1dad
YO
2360 possible chars (trie->uniquecharcount) is very high.
2361 Used for debugging make_trie().
2362*/
2363STATIC void
55eed653 2364S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2365 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2366 U32 depth)
3dab1dad
YO
2367{
2368 U32 state;
ab3bbdeb 2369 SV *sv=sv_newmortal();
55eed653 2370 int colwidth= widecharmap ? 6 : 4;
271b36b1 2371 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7918f24d
NC
2372
2373 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2374
3dab1dad 2375 /* print out the table precompression. */
6ad9a8ab 2376 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
cb41e5d6 2377 depth+1 );
6ad9a8ab 2378 Perl_re_indentf( aTHX_ "%s",
cb41e5d6 2379 depth+1, "------:-----+-----------------\n" );
538e84ed 2380
3dab1dad
YO
2381 for( state=1 ; state < next_alloc ; state ++ ) {
2382 U16 charid;
538e84ed 2383
147e3846 2384 Perl_re_indentf( aTHX_ " %4" UVXf " :",
cb41e5d6 2385 depth+1, (UV)state );
3dab1dad 2386 if ( ! trie->states[ state ].wordnum ) {
6ad9a8ab 2387 Perl_re_printf( aTHX_ "%5s| ","");
3dab1dad 2388 } else {
6ad9a8ab 2389 Perl_re_printf( aTHX_ "W%4x| ",
3dab1dad
YO
2390 trie->states[ state ].wordnum
2391 );
2392 }
2393 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
538e84ed 2394 SV ** const tmp = av_fetch( revcharmap,
88f063b4 2395 TRIE_LIST_ITEM(state, charid).forid, 0);
ab3bbdeb 2396 if ( tmp ) {
147e3846 2397 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
ab3bbdeb 2398 colwidth,
538e84ed
KW
2399 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2400 colwidth,
2401 PL_colors[0], PL_colors[1],
2402 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2403 | PERL_PV_ESCAPE_FIRSTCHAR
ab3bbdeb 2404 ) ,
88f063b4
KW
2405 TRIE_LIST_ITEM(state, charid).forid,
2406 (UV)TRIE_LIST_ITEM(state, charid).newstate
1e2e3d02 2407 );
538e84ed 2408 if (!(charid % 10))
6ad9a8ab 2409 Perl_re_printf( aTHX_ "\n%*s| ",
664e119d 2410 (int)((depth * 2) + 14), "");
1e2e3d02 2411 }
ab3bbdeb 2412 }
6ad9a8ab 2413 Perl_re_printf( aTHX_ "\n");
3dab1dad 2414 }
538e84ed 2415}
3dab1dad
YO
2416
2417/*
3dab1dad 2418 Dumps a fully constructed but uncompressed trie in table form.
538e84ed
KW
2419 This is the normal DFA style state transition table, with a few
2420 twists to facilitate compression later.
3dab1dad
YO
2421 Used for debugging make_trie().
2422*/
2423STATIC void
55eed653 2424S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2425 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2426 U32 depth)
3dab1dad
YO
2427{
2428 U32 state;
2429 U16 charid;
ab3bbdeb 2430 SV *sv=sv_newmortal();
55eed653 2431 int colwidth= widecharmap ? 6 : 4;
271b36b1 2432 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7918f24d
NC
2433
2434 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
538e84ed 2435
3dab1dad
YO
2436 /*
2437 print out the table precompression so that we can do a visual check
2438 that they are identical.
2439 */
538e84ed 2440
6ad9a8ab 2441 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
3dab1dad
YO
2442
2443 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 2444 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 2445 if ( tmp ) {
6ad9a8ab 2446 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2447 colwidth,
538e84ed 2448 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2449 PL_colors[0], PL_colors[1],
2450 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2451 PERL_PV_ESCAPE_FIRSTCHAR
2452 )
ab3bbdeb 2453 );
3dab1dad
YO
2454 }
2455 }
2456
4aaafc03
YO
2457 Perl_re_printf( aTHX_ "\n");
2458 Perl_re_indentf( aTHX_ "State+-", depth+1 );
3dab1dad
YO
2459
2460 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
6ad9a8ab 2461 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
3dab1dad
YO
2462 }
2463
6ad9a8ab 2464 Perl_re_printf( aTHX_ "\n" );
3dab1dad
YO
2465
2466 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2467
147e3846 2468 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
cb41e5d6 2469 depth+1,
3dab1dad
YO
2470 (UV)TRIE_NODENUM( state ) );
2471
2472 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
2473 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2474 if (v)
147e3846 2475 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
ab3bbdeb 2476 else
6ad9a8ab 2477 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
3dab1dad
YO
2478 }
2479 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
147e3846 2480 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
538e84ed 2481 (UV)trie->trans[ state ].check );
3dab1dad 2482 } else {
147e3846 2483 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
538e84ed 2484 (UV)trie->trans[ state ].check,
3dab1dad
YO
2485 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2486 }
2487 }
07be1b83 2488}
3dab1dad
YO
2489
2490#endif
2491
2e64971a 2492
786e8c11
YO
2493/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2494 startbranch: the first branch in the whole branch sequence
2495 first : start branch of sequence of branch-exact nodes.
2496 May be the same as startbranch
2497 last : Thing following the last branch.
2498 May be the same as tail.
2499 tail : item following the branch sequence
2500 count : words in the sequence
a4525e78 2501 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
786e8c11 2502 depth : indent depth
3dab1dad 2503
786e8c11 2504Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 2505
786e8c11
YO
2506A trie is an N'ary tree where the branches are determined by digital
2507decomposition of the key. IE, at the root node you look up the 1st character and
2508follow that branch repeat until you find the end of the branches. Nodes can be
2509marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 2510
786e8c11 2511 /he|she|his|hers/
72f13be8 2512
786e8c11
YO
2513would convert into the following structure. Numbers represent states, letters
2514following numbers represent valid transitions on the letter from that state, if
2515the number is in square brackets it represents an accepting state, otherwise it
2516will be in parenthesis.
07be1b83 2517
786e8c11
YO
2518 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2519 | |
2520 | (2)
2521 | |
2522 (1) +-i->(6)-+-s->[7]
2523 |
2524 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 2525
786e8c11
YO
2526 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2527
2528This shows that when matching against the string 'hers' we will begin at state 1
2529read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2530then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2531is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2532single traverse. We store a mapping from accepting to state to which word was
2533matched, and then when we have multiple possibilities we try to complete the
b8fda935 2534rest of the regex in the order in which they occurred in the alternation.
786e8c11
YO
2535
2536The only prior NFA like behaviour that would be changed by the TRIE support is
2537the silent ignoring of duplicate alternations which are of the form:
2538
2539 / (DUPE|DUPE) X? (?{ ... }) Y /x
2540
4b714af6 2541Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 2542and without the optimisation. With the optimisations dupes will be silently
486ec47a 2543ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
2544the following demonstrates:
2545
2546 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2547
2548which prints out 'word' three times, but
2549
2550 'words'=~/(word|word|word)(?{ print $1 })S/
2551
2552which doesnt print it out at all. This is due to other optimisations kicking in.
2553
2554Example of what happens on a structural level:
2555
486ec47a 2556The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
2557
2558 1: CURLYM[1] {1,32767}(18)
2559 5: BRANCH(8)
2560 6: EXACT <ac>(16)
2561 8: BRANCH(11)
2562 9: EXACT <ad>(16)
2563 11: BRANCH(14)
2564 12: EXACT <ab>(16)
2565 16: SUCCEED(0)
2566 17: NOTHING(18)
2567 18: END(0)
2568
2569This would be optimizable with startbranch=5, first=5, last=16, tail=16
2570and should turn into:
2571
2572 1: CURLYM[1] {1,32767}(18)
2573 5: TRIE(16)
2574 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2575 <ac>
2576 <ad>
2577 <ab>
2578 16: SUCCEED(0)
2579 17: NOTHING(18)
2580 18: END(0)
2581
2582Cases where tail != last would be like /(?foo|bar)baz/:
2583
2584 1: BRANCH(4)
2585 2: EXACT <foo>(8)
2586 4: BRANCH(7)
2587 5: EXACT <bar>(8)
2588 7: TAIL(8)
2589 8: EXACT <baz>(10)
2590 10: END(0)
2591
2592which would be optimizable with startbranch=1, first=1, last=7, tail=8
2593and would end up looking like:
2594
2595 1: TRIE(8)
2596 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2597 <foo>
2598 <bar>
2599 7: TAIL(8)
2600 8: EXACT <baz>(10)
2601 10: END(0)
2602
c80e42f3 2603 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
2604
2605is the recommended Unicode-aware way of saying
2606
2607 *(d++) = uv;
2608*/
2609
fab2782b 2610#define TRIE_STORE_REVCHAR(val) \
786e8c11 2611 STMT_START { \
73031816 2612 if (UTF) { \
668fcfea 2613 SV *zlopp = newSV(UTF8_MAXBYTES); \
88c9ea1e 2614 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1d84a256
DM
2615 unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2616 *kapow = '\0'; \
73031816
NC
2617 SvCUR_set(zlopp, kapow - flrbbbbb); \
2618 SvPOK_on(zlopp); \
2619 SvUTF8_on(zlopp); \
2620 av_push(revcharmap, zlopp); \
2621 } else { \
fab2782b 2622 char ooooff = (char)val; \
73031816
NC
2623 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2624 } \
2625 } STMT_END
786e8c11 2626
914a25d5
KW
2627/* This gets the next character from the input, folding it if not already
2628 * folded. */
2629#define TRIE_READ_CHAR STMT_START { \
2630 wordlen++; \
2631 if ( UTF ) { \
2632 /* if it is UTF then it is either already folded, or does not need \
2633 * folding */ \
1c1d615a 2634 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
2635 } \
2636 else if (folder == PL_fold_latin1) { \
7d006b13
KW
2637 /* This folder implies Unicode rules, which in the range expressible \
2638 * by not UTF is the lower case, with the two exceptions, one of \
2639 * which should have been taken care of before calling this */ \
2640 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2641 uvc = toLOWER_L1(*uc); \
2642 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2643 len = 1; \
914a25d5
KW
2644 } else { \
2645 /* raw data, will be folded later if needed */ \
2646 uvc = (U32)*uc; \
2647 len = 1; \
2648 } \
786e8c11
YO
2649} STMT_END
2650
2651
2652
2653#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2654 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
00195859 2655 U32 ging = TRIE_LIST_LEN( state ) * 2; \
f9003953 2656 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
00195859 2657 TRIE_LIST_LEN( state ) = ging; \
786e8c11
YO
2658 } \
2659 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2660 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2661 TRIE_LIST_CUR( state )++; \
2662} STMT_END
07be1b83 2663
786e8c11 2664#define TRIE_LIST_NEW(state) STMT_START { \
d09f14bf 2665 Newx( trie->states[ state ].trans.list, \
786e8c11
YO
2666 4, reg_trie_trans_le ); \
2667 TRIE_LIST_CUR( state ) = 1; \
2668 TRIE_LIST_LEN( state ) = 4; \
2669} STMT_END
07be1b83 2670
786e8c11
YO
2671#define TRIE_HANDLE_WORD(state) STMT_START { \
2672 U16 dupe= trie->states[ state ].wordnum; \
2673 regnode * const noper_next = regnext( noper ); \
2674 \
786e8c11
YO
2675 DEBUG_r({ \
2676 /* store the word for dumping */ \
2677 SV* tmp; \
2678 if (OP(noper) != NOTHING) \
740cce10 2679 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 2680 else \
740cce10 2681 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 2682 av_push( trie_words, tmp ); \
786e8c11
YO
2683 }); \
2684 \
2685 curword++; \
2e64971a
DM
2686 trie->wordinfo[curword].prev = 0; \
2687 trie->wordinfo[curword].len = wordlen; \
2688 trie->wordinfo[curword].accept = state; \
786e8c11
YO
2689 \
2690 if ( noper_next < tail ) { \
2691 if (!trie->jump) \
538e84ed
KW
2692 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2693 sizeof(U16) ); \
7f69552c 2694 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
2695 if (!jumper) \
2696 jumper = noper_next; \
2697 if (!nextbranch) \
2698 nextbranch= regnext(cur); \
2699 } \
2700 \
2701 if ( dupe ) { \
2e64971a
DM
2702 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2703 /* chain, so that when the bits of chain are later */\
2704 /* linked together, the dups appear in the chain */\
2705 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2706 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
2707 } else { \
2708 /* we haven't inserted this word yet. */ \
2709 trie->states[ state ].wordnum = curword; \
2710 } \
2711} STMT_END
07be1b83 2712
3dab1dad 2713
786e8c11
YO
2714#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2715 ( ( base + charid >= ucharcount \
2716 && base + charid < ubound \
2717 && state == trie->trans[ base - ucharcount + charid ].check \
2718 && trie->trans[ base - ucharcount + charid ].next ) \
2719 ? trie->trans[ base - ucharcount + charid ].next \
2720 : ( state==1 ? special : 0 ) \
2721 )
3dab1dad 2722
8bcafbf4
YO
2723#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2724STMT_START { \
2725 TRIE_BITMAP_SET(trie, uvc); \
2726 /* store the folded codepoint */ \
2727 if ( folder ) \
2728 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2729 \
2730 if ( !UTF ) { \
2731 /* store first byte of utf8 representation of */ \
2732 /* variant codepoints */ \
2733 if (! UVCHR_IS_INVARIANT(uvc)) { \
2734 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2735 } \
2736 } \
2737} STMT_END
786e8c11
YO
2738#define MADE_TRIE 1
2739#define MADE_JUMP_TRIE 2
2740#define MADE_EXACT_TRIE 4
3dab1dad 2741
a3621e74 2742STATIC I32
538e84ed
KW
2743S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2744 regnode *first, regnode *last, regnode *tail,
2745 U32 word_count, U32 flags, U32 depth)
a3621e74
YO
2746{
2747 /* first pass, loop through and scan words */
2748 reg_trie_data *trie;
55eed653 2749 HV *widecharmap = NULL;
2b8b4781 2750 AV *revcharmap = newAV();
a3621e74 2751 regnode *cur;
a3621e74
YO
2752 STRLEN len = 0;
2753 UV uvc = 0;
2754 U16 curword = 0;
2755 U32 next_alloc = 0;
786e8c11
YO
2756 regnode *jumper = NULL;
2757 regnode *nextbranch = NULL;
7f69552c 2758 regnode *convert = NULL;
2e64971a 2759 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 2760 /* we just use folder as a flag in utf8 */
1e696034 2761 const U8 * folder = NULL;
a3621e74 2762
3a611511
YO
2763 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2764 * which stands for one trie structure, one hash, optionally followed
2765 * by two arrays */
2b8b4781 2766#ifdef DEBUGGING
3a611511 2767 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2b8b4781
NC
2768 AV *trie_words = NULL;
2769 /* along with revcharmap, this only used during construction but both are
2770 * useful during debugging so we store them in the struct when debugging.
8e11feef 2771 */
2b8b4781 2772#else
cf78de0b 2773 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 2774 STRLEN trie_charcount=0;
3dab1dad 2775#endif
2b8b4781 2776 SV *re_trie_maxbuff;
271b36b1 2777 DECLARE_AND_GET_RE_DEBUG_FLAGS;
7918f24d
NC
2778
2779 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
2780#ifndef DEBUGGING
2781 PERL_UNUSED_ARG(depth);
2782#endif
a3621e74 2783
1e696034 2784 switch (flags) {
3f2416ae 2785 case EXACT: case EXACT_REQ8: case EXACTL: break;
89829bb5 2786 case EXACTFAA:
0ea669f4 2787 case EXACTFUP:
a4525e78
KW
2788 case EXACTFU:
2789 case EXACTFLU8: folder = PL_fold_latin1; break;
1e696034 2790 case EXACTF: folder = PL_fold; break;
fab2782b 2791 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
2792 }
2793
c944940b 2794 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 2795 trie->refcount = 1;
3dab1dad 2796 trie->startstate = 1;
786e8c11 2797 trie->wordcount = word_count;
f8fc2ecf 2798 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 2799 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3f2416ae 2800 if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
c944940b 2801 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
2802 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2803 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2804
a3621e74 2805 DEBUG_r({
2b8b4781 2806 trie_words = newAV();
a3621e74 2807 });
a3621e74 2808
3b89859a 2809 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
316ebaf2 2810 assert(re_trie_maxbuff);
a3621e74 2811 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 2812 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 2813 }
df826430 2814 DEBUG_TRIE_COMPILE_r({
6ad9a8ab 2815 Perl_re_indentf( aTHX_
cb41e5d6
YO
2816 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2817 depth+1,
88f063b4 2818 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
538e84ed 2819 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
3dab1dad 2820 });
538e84ed 2821
7f69552c
YO
2822 /* Find the node we are going to overwrite */
2823 if ( first == startbranch && OP( last ) != BRANCH ) {
2824 /* whole branch chain */
2825 convert = first;
2826 } else {
2827 /* branch sub-chain */
2828 convert = NEXTOPER( first );
2829 }
538e84ed 2830
a3621e74
YO
2831 /* -- First loop and Setup --
2832
2833 We first traverse the branches and scan each word to determine if it
2834 contains widechars, and how many unique chars there are, this is
2835 important as we have to build a table with at least as many columns as we
2836 have unique chars.
2837
2838 We use an array of integers to represent the character codes 0..255
538e84ed
KW
2839 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2840 the native representation of the character value as the key and IV's for
2841 the coded index.
a3621e74
YO
2842
2843 *TODO* If we keep track of how many times each character is used we can
2844 remap the columns so that the table compression later on is more
3b753521 2845 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
2846 middle and the least common are on the outside. IMO this would be better
2847 than a most to least common mapping as theres a decent chance the most
2848 common letter will share a node with the least common, meaning the node
486ec47a 2849 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
2850 case is when we have the least common nodes twice.
2851
2852 */
2853
a3621e74 2854 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 2855 regnode *noper = NEXTOPER( cur );
944e05e3
YO
2856 const U8 *uc;
2857 const U8 *e;
bc031a7d 2858 int foldlen = 0;
07be1b83 2859 U32 wordlen = 0; /* required init */
bc031a7d
KW
2860 STRLEN minchars = 0;
2861 STRLEN maxchars = 0;
538e84ed
KW
2862 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2863 bitmap?*/
a3621e74 2864
3dab1dad 2865 if (OP(noper) == NOTHING) {
20ed8c88
YO
2866 /* skip past a NOTHING at the start of an alternation
2867 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
ca902fb8
YO
2868 *
2869 * If the next node is not something we are supposed to process
2870 * we will just ignore it due to the condition guarding the
2871 * next block.
20ed8c88 2872 */
ca902fb8 2873
df826430 2874 regnode *noper_next= regnext(noper);
944e05e3
YO
2875 if (noper_next < tail)
2876 noper= noper_next;
2877 }
2878
f6b4b99d
KW
2879 if ( noper < tail
2880 && ( OP(noper) == flags
3f2416ae
KW
2881 || (flags == EXACT && OP(noper) == EXACT_REQ8)
2882 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
0ea669f4 2883 || OP(noper) == EXACTFUP))))
f6b4b99d 2884 {
944e05e3
YO
2885 uc= (U8*)STRING(noper);
2886 e= uc + STR_LEN(noper);
2887 } else {
2888 trie->minlen= 0;
2889 continue;
3dab1dad 2890 }
df826430 2891
944e05e3 2892
fab2782b 2893 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2894 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2895 regardless of encoding */
0ea669f4 2896 if (OP( noper ) == EXACTFUP) {
fab2782b 2897 /* false positives are ok, so just set this */
0dc4a61d 2898 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2899 }
2900 }
dca5fc4c 2901
bc031a7d
KW
2902 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2903 branch */
3dab1dad 2904 TRIE_CHARCOUNT(trie)++;
a3621e74 2905 TRIE_READ_CHAR;
645de4ce 2906
bc031a7d
KW
2907 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2908 * is in effect. Under /i, this character can match itself, or
2909 * anything that folds to it. If not under /i, it can match just
2910 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2911 * all fold to k, and all are single characters. But some folds
2912 * expand to more than one character, so for example LATIN SMALL
2913 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2914 * the string beginning at 'uc' is 'ffi', it could be matched by
2915 * three characters, or just by the one ligature character. (It
2916 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2917 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2918 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2919 * match.) The trie needs to know the minimum and maximum number
2920 * of characters that could match so that it can use size alone to
2921 * quickly reject many match attempts. The max is simple: it is
2922 * the number of folded characters in this branch (since a fold is
2923 * never shorter than what folds to it. */
2924
2925 maxchars++;
2926
2927 /* And the min is equal to the max if not under /i (indicated by
2928 * 'folder' being NULL), or there are no multi-character folds. If
2929 * there is a multi-character fold, the min is incremented just
2930 * once, for the character that folds to the sequence. Each
2931 * character in the sequence needs to be added to the list below of
2932 * characters in the trie, but we count only the first towards the
2933 * min number of characters needed. This is done through the
2934 * variable 'foldlen', which is returned by the macros that look
2935 * for these sequences as the number of bytes the sequence
2936 * occupies. Each time through the loop, we decrement 'foldlen' by
2937 * how many bytes the current char occupies. Only when it reaches
2938 * 0 do we increment 'minchars' or look for another multi-character
2939 * sequence. */
2940 if (folder == NULL) {
2941 minchars++;
2942 }
2943 else if (foldlen > 0) {
2944 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
645de4ce
KW
2945 }
2946 else {
bc031a7d
KW
2947 minchars++;
2948
2949 /* See if *uc is the beginning of a multi-character fold. If
2950 * so, we decrement the length remaining to look at, to account
2951 * for the current character this iteration. (We can use 'uc'
2952 * instead of the fold returned by TRIE_READ_CHAR because for
2953 * non-UTF, the latin1_safe macro is smart enough to account
2954 * for all the unfolded characters, and because for UTF, the
2955 * string will already have been folded earlier in the
2956 * compilation process */
2957 if (UTF) {
2958 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2959 foldlen -= UTF8SKIP(uc);
645de4ce
KW
2960 }
2961 }
bc031a7d
KW
2962 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2963 foldlen--;
2964 }
645de4ce 2965 }
bc031a7d
KW
2966
2967 /* The current character (and any potential folds) should be added
2968 * to the possible matching characters for this position in this
2969 * branch */
a3621e74 2970 if ( uvc < 256 ) {
fab2782b
YO
2971 if ( folder ) {
2972 U8 folded= folder[ (U8) uvc ];
2973 if ( !trie->charmap[ folded ] ) {
2974 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2975 TRIE_STORE_REVCHAR( folded );
2976 }
2977 }
a3621e74
YO
2978 if ( !trie->charmap[ uvc ] ) {
2979 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2980 TRIE_STORE_REVCHAR( uvc );
a3621e74 2981 }
02daf0ab 2982 if ( set_bit ) {
62012aee
KW
2983 /* store the codepoint in the bitmap, and its folded
2984 * equivalent. */
8bcafbf4 2985 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
02daf0ab
YO
2986 set_bit = 0; /* We've done our bit :-) */
2987 }
a3621e74 2988 } else {
bc031a7d
KW
2989
2990 /* XXX We could come up with the list of code points that fold
2991 * to this using PL_utf8_foldclosures, except not for
2992 * multi-char folds, as there may be multiple combinations
2993 * there that could work, which needs to wait until runtime to
2994 * resolve (The comment about LIGATURE FFI above is such an
2995 * example */
2996
a3621e74 2997 SV** svpp;
55eed653
NC
2998 if ( !widecharmap )
2999 widecharmap = newHV();
a3621e74 3000
55eed653 3001 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
3002
3003 if ( !svpp )
147e3846 3004 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
a3621e74
YO
3005
3006 if ( !SvTRUE( *svpp ) ) {
3007 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 3008 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
3009 }
3010 }
bc031a7d
KW
3011 } /* end loop through characters in this branch of the trie */
3012
3013 /* We take the min and max for this branch and combine to find the min
3014 * and max for all branches processed so far */
3dab1dad 3015 if( cur == first ) {
bc031a7d
KW
3016 trie->minlen = minchars;
3017 trie->maxlen = maxchars;
3018 } else if (minchars < trie->minlen) {
3019 trie->minlen = minchars;
3020 } else if (maxchars > trie->maxlen) {
3021 trie->maxlen = maxchars;
fab2782b 3022 }
a3621e74
YO
3023 } /* end first pass */
3024 DEBUG_TRIE_COMPILE_r(
6ad9a8ab 3025 Perl_re_indentf( aTHX_
cb41e5d6
YO
3026 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
3027 depth+1,
55eed653 3028 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
3029 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
3030 (int)trie->minlen, (int)trie->maxlen )
a3621e74 3031 );
a3621e74
YO
3032
3033 /*
3034 We now know what we are dealing with in terms of unique chars and
3035 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
3036 representation using a flat table will take. If it's over a reasonable
3037 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
3038 conservative but potentially much slower representation using an array
3039 of lists.
3040
3041 At the end we convert both representations into the same compressed
3042 form that will be used in regexec.c for matching with. The latter
3043 is a form that cannot be used to construct with but has memory
3044 properties similar to the list form and access properties similar
3045 to the table form making it both suitable for fast searches and
3046 small enough that its feasable to store for the duration of a program.
3047
3048 See the comment in the code where the compressed table is produced
3049 inplace from the flat tabe representation for an explanation of how
3050 the compression works.
3051
3052 */
3053
3054
2e64971a
DM
3055 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
3056 prev_states[1] = 0;
3057
538e84ed
KW
3058 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
3059 > SvIV(re_trie_maxbuff) )
3060 {
a3621e74
YO
3061 /*
3062 Second Pass -- Array Of Lists Representation
3063
3064 Each state will be represented by a list of charid:state records
3065 (reg_trie_trans_le) the first such element holds the CUR and LEN
3066 points of the allocated array. (See defines above).
3067
3068 We build the initial structure using the lists, and then convert
3069 it into the compressed table form which allows faster lookups
3070 (but cant be modified once converted).
a3621e74
YO
3071 */
3072
a3621e74
YO
3073 STRLEN transcount = 1;
3074
6ad9a8ab 3075 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
cb41e5d6 3076 depth+1));
686b73d4 3077
c944940b
JH
3078 trie->states = (reg_trie_state *)
3079 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3080 sizeof(reg_trie_state) );
a3621e74
YO
3081 TRIE_LIST_NEW(1);
3082 next_alloc = 2;
3083
3084 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3085
df826430 3086 regnode *noper = NEXTOPER( cur );
c445ea15
AL
3087 U32 state = 1; /* required init */
3088 U16 charid = 0; /* sanity init */
07be1b83 3089 U32 wordlen = 0; /* required init */
c445ea15 3090
df826430
YO
3091 if (OP(noper) == NOTHING) {
3092 regnode *noper_next= regnext(noper);
944e05e3
YO
3093 if (noper_next < tail)
3094 noper= noper_next;
ca902fb8
YO
3095 /* we will undo this assignment if noper does not
3096 * point at a trieable type in the else clause of
3097 * the following statement. */
df826430
YO
3098 }
3099
f6b4b99d
KW
3100 if ( noper < tail
3101 && ( OP(noper) == flags
3f2416ae
KW
3102 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3103 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
0ea669f4 3104 || OP(noper) == EXACTFUP))))
f6b4b99d 3105 {
944e05e3
YO
3106 const U8 *uc= (U8*)STRING(noper);
3107 const U8 *e= uc + STR_LEN(noper);
3108
786e8c11 3109 for ( ; uc < e ; uc += len ) {
c445ea15 3110
786e8c11 3111 TRIE_READ_CHAR;
c445ea15 3112
786e8c11
YO
3113 if ( uvc < 256 ) {
3114 charid = trie->charmap[ uvc ];
c445ea15 3115 } else {
538e84ed
KW
3116 SV** const svpp = hv_fetch( widecharmap,
3117 (char*)&uvc,
3118 sizeof( UV ),
3119 0);
786e8c11
YO
3120 if ( !svpp ) {
3121 charid = 0;
3122 } else {
3123 charid=(U16)SvIV( *svpp );
3124 }
c445ea15 3125 }
538e84ed
KW
3126 /* charid is now 0 if we dont know the char read, or
3127 * nonzero if we do */
786e8c11 3128 if ( charid ) {
a3621e74 3129
786e8c11
YO
3130 U16 check;
3131 U32 newstate = 0;
a3621e74 3132
786e8c11
YO
3133 charid--;
3134 if ( !trie->states[ state ].trans.list ) {
3135 TRIE_LIST_NEW( state );
c445ea15 3136 }
538e84ed
KW
3137 for ( check = 1;
3138 check <= TRIE_LIST_USED( state );
3139 check++ )
3140 {
3141 if ( TRIE_LIST_ITEM( state, check ).forid
3142 == charid )
3143 {
786e8c11
YO
3144 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3145 break;
3146 }
3147 }
3148 if ( ! newstate ) {
3149 newstate = next_alloc++;
2e64971a 3150 prev_states[newstate] = state;
786e8c11
YO
3151 TRIE_LIST_PUSH( state, charid, newstate );
3152 transcount++;
3153 }
3154 state = newstate;
3155 } else {
147e3846 3156 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
c445ea15 3157 }
a28509cc 3158 }
ca902fb8
YO
3159 } else {
3160 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3161 * on a trieable type. So we need to reset noper back to point at the first regop
3162 * in the branch before we call TRIE_HANDLE_WORD()
3163 */
3164 noper= NEXTOPER(cur);
3165 }
3dab1dad 3166 TRIE_HANDLE_WORD(state);
a3621e74
YO
3167
3168 } /* end second pass */
3169
1e2e3d02 3170 /* next alloc is the NEXT state to be allocated */
538e84ed 3171 trie->statecount = next_alloc;
c944940b
JH
3172 trie->states = (reg_trie_state *)
3173 PerlMemShared_realloc( trie->states,
3174 next_alloc
3175 * sizeof(reg_trie_state) );
a3621e74 3176
3dab1dad 3177 /* and now dump it out before we compress it */
2b8b4781
NC
3178 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3179 revcharmap, next_alloc,
3180 depth+1)
1e2e3d02 3181 );
a3621e74 3182
c944940b
JH
3183 trie->trans = (reg_trie_trans *)
3184 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
3185 {
3186 U32 state;
a3621e74
YO
3187 U32 tp = 0;
3188 U32 zp = 0;
3189
3190
3191 for( state=1 ; state < next_alloc ; state ++ ) {
3192 U32 base=0;
3193
3194 /*
3195 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3196 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
a3621e74
YO
3197 );
3198 */
3199
3200 if (trie->states[state].trans.list) {
3201 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3202 U16 maxid=minid;
a28509cc 3203 U16 idx;
a3621e74
YO
3204
3205 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
3206 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3207 if ( forid < minid ) {
3208 minid=forid;
3209 } else if ( forid > maxid ) {
3210 maxid=forid;
3211 }
a3621e74
YO
3212 }
3213 if ( transcount < tp + maxid - minid + 1) {
3214 transcount *= 2;
c944940b
JH
3215 trie->trans = (reg_trie_trans *)
3216 PerlMemShared_realloc( trie->trans,
446bd890
NC
3217 transcount
3218 * sizeof(reg_trie_trans) );
538e84ed
KW
3219 Zero( trie->trans + (transcount / 2),
3220 transcount / 2,
3221 reg_trie_trans );
a3621e74
YO
3222 }
3223 base = trie->uniquecharcount + tp - minid;
3224 if ( maxid == minid ) {
3225 U32 set = 0;
3226 for ( ; zp < tp ; zp++ ) {
3227 if ( ! trie->trans[ zp ].next ) {
3228 base = trie->uniquecharcount + zp - minid;
538e84ed
KW
3229 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3230 1).newstate;
a3621e74
YO
3231 trie->trans[ zp ].check = state;
3232 set = 1;
3233 break;
3234 }
3235 }
3236 if ( !set ) {
538e84ed
KW
3237 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3238 1).newstate;
a3621e74
YO
3239 trie->trans[ tp ].check = state;
3240 tp++;
3241 zp = tp;
3242 }
3243 } else {
3244 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
538e84ed
KW
3245 const U32 tid = base
3246 - trie->uniquecharcount
3247 + TRIE_LIST_ITEM( state, idx ).forid;
3248 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3249 idx ).newstate;
a3621e74
YO
3250 trie->trans[ tid ].check = state;
3251 }
3252 tp += ( maxid - minid + 1 );
3253 }
3254 Safefree(trie->states[ state ].trans.list);
3255 }
3256 /*
3257 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3258 Perl_re_printf( aTHX_ " base: %d\n",base);
a3621e74
YO
3259 );
3260 */
3261 trie->states[ state ].trans.base=base;
3262 }
cc601c31 3263 trie->lasttrans = tp + 1;
a3621e74
YO
3264 }
3265 } else {
3266 /*
3267 Second Pass -- Flat Table Representation.
3268
b423522f
KW
3269 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3270 each. We know that we will need Charcount+1 trans at most to store
3271 the data (one row per char at worst case) So we preallocate both
3272 structures assuming worst case.
a3621e74
YO
3273
3274 We then construct the trie using only the .next slots of the entry
3275 structs.
3276
b423522f
KW
3277 We use the .check field of the first entry of the node temporarily
3278 to make compression both faster and easier by keeping track of how
3279 many non zero fields are in the node.
a3621e74
YO
3280
3281 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3282 transition.
3283
b423522f
KW
3284 There are two terms at use here: state as a TRIE_NODEIDX() which is
3285 a number representing the first entry of the node, and state as a
3286 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3287 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3288 if there are 2 entrys per node. eg:
a3621e74
YO
3289
3290 A B A B
3291 1. 2 4 1. 3 7
3292 2. 0 3 3. 0 5
3293 3. 0 0 5. 0 0
3294 4. 0 0 7. 0 0
3295
b423522f
KW
3296 The table is internally in the right hand, idx form. However as we
3297 also have to deal with the states array which is indexed by nodenum
3298 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
3299
3300 */
6ad9a8ab 3301 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
cb41e5d6 3302 depth+1));
3dab1dad 3303
c944940b
JH
3304 trie->trans = (reg_trie_trans *)
3305 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3306 * trie->uniquecharcount + 1,
3307 sizeof(reg_trie_trans) );
3308 trie->states = (reg_trie_state *)
3309 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3310 sizeof(reg_trie_state) );
a3621e74
YO
3311 next_alloc = trie->uniquecharcount + 1;
3312
3dab1dad 3313
a3621e74
YO
3314 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3315
df826430 3316 regnode *noper = NEXTOPER( cur );
a3621e74
YO
3317
3318 U32 state = 1; /* required init */
3319
3320 U16 charid = 0; /* sanity init */
3321 U32 accept_state = 0; /* sanity init */
a3621e74 3322
07be1b83 3323 U32 wordlen = 0; /* required init */
a3621e74 3324
df826430
YO
3325 if (OP(noper) == NOTHING) {
3326 regnode *noper_next= regnext(noper);
944e05e3
YO
3327 if (noper_next < tail)
3328 noper= noper_next;
ca902fb8
YO
3329 /* we will undo this assignment if noper does not
3330 * point at a trieable type in the else clause of
3331 * the following statement. */
df826430 3332 }
fab2782b 3333
f6b4b99d
KW
3334 if ( noper < tail
3335 && ( OP(noper) == flags
3f2416ae
KW
3336 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3337 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
0ea669f4 3338 || OP(noper) == EXACTFUP))))
f6b4b99d 3339 {
944e05e3
YO
3340 const U8 *uc= (U8*)STRING(noper);
3341 const U8 *e= uc + STR_LEN(noper);
3342
786e8c11 3343 for ( ; uc < e ; uc += len ) {
a3621e74 3344
786e8c11 3345 TRIE_READ_CHAR;
a3621e74 3346
786e8c11
YO
3347 if ( uvc < 256 ) {
3348 charid = trie->charmap[ uvc ];
3349 } else {
538e84ed
KW
3350 SV* const * const svpp = hv_fetch( widecharmap,
3351 (char*)&uvc,
3352 sizeof( UV ),
3353 0);
786e8c11 3354 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 3355 }
786e8c11
YO
3356 if ( charid ) {
3357 charid--;
3358 if ( !trie->trans[ state + charid ].next ) {
3359 trie->trans[ state + charid ].next = next_alloc;
3360 trie->trans[ state ].check++;
2e64971a
DM
3361 prev_states[TRIE_NODENUM(next_alloc)]
3362 = TRIE_NODENUM(state);
786e8c11
YO
3363 next_alloc += trie->uniquecharcount;
3364 }
3365 state = trie->trans[ state + charid ].next;
3366 } else {
147e3846 3367 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
786e8c11 3368 }
538e84ed
KW
3369 /* charid is now 0 if we dont know the char read, or
3370 * nonzero if we do */
a3621e74 3371 }
ca902fb8
YO
3372 } else {
3373 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3374 * on a trieable type. So we need to reset noper back to point at the first regop
3375 * in the branch before we call TRIE_HANDLE_WORD().
3376 */
3377 noper= NEXTOPER(cur);
a3621e74 3378 }
3dab1dad
YO
3379 accept_state = TRIE_NODENUM( state );
3380 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
3381
3382 } /* end second pass */
3383
3dab1dad 3384 /* and now dump it out before we compress it */
2b8b4781
NC
3385 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3386 revcharmap,
3387 next_alloc, depth+1));
a3621e74 3388
a3621e74
YO
3389 {
3390 /*
3391 * Inplace compress the table.*
3392
3393 For sparse data sets the table constructed by the trie algorithm will
3394 be mostly 0/FAIL transitions or to put it another way mostly empty.
3395 (Note that leaf nodes will not contain any transitions.)
3396
3397 This algorithm compresses the tables by eliminating most such
3398 transitions, at the cost of a modest bit of extra work during lookup:
3399
3400 - Each states[] entry contains a .base field which indicates the
3401 index in the state[] array wheres its transition data is stored.
3402
3b753521 3403 - If .base is 0 there are no valid transitions from that node.
a3621e74