This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Improve handling of nested qr/(?[...])/
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
c277df42 77#define REG_COMP_C
54df2634
NC
78#ifdef PERL_IN_XSUB_RE
79# include "re_comp.h"
afda64e8 80EXTERN_C const struct regexp_engine my_reg_engine;
54df2634
NC
81#else
82# include "regcomp.h"
83#endif
a687059c 84
b992490d 85#include "invlist_inline.h"
1b0f46bf 86#include "unicode_constants.h"
04e98a4d 87
538e84ed
KW
88#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
89 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
f12c0118
KW
90#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
91 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 92#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 93#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 94
a687059c
LW
95#ifndef STATIC
96#define STATIC static
97#endif
98
3f910e62
YO
99/* this is a chain of data about sub patterns we are processing that
100 need to be handled separately/specially in study_chunk. Its so
101 we can simulate recursion without losing state. */
102struct scan_frame;
103typedef struct scan_frame {
104 regnode *last_regnode; /* last node to process in this frame */
105 regnode *next_regnode; /* next node to process when last is reached */
106 U32 prev_recursed_depth;
107 I32 stopparen; /* what stopparen do we use */
3f910e62
YO
108
109 struct scan_frame *this_prev_frame; /* this previous frame */
110 struct scan_frame *prev_frame; /* previous frame */
111 struct scan_frame *next_frame; /* next frame */
112} scan_frame;
113
f4ae5a27
KW
114/* Certain characters are output as a sequence with the first being a
115 * backslash. */
4aada8b9 116#define isBACKSLASHED_PUNCT(c) memCHRs("-[]\\^", c)
f4ae5a27
KW
117
118
09b2b2e6 119struct RExC_state_t {
514a91f1
DM
120 U32 flags; /* RXf_* are we folding, multilining? */
121 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 122 char *precomp; /* uncompiled string. */
711b303b 123 char *precomp_end; /* pointer to end of uncompiled string. */
288b8c02 124 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf 125 regexp *rx; /* perl core regexp structure */
538e84ed
KW
126 regexp_internal *rxi; /* internal data for regexp object
127 pprivate field */
fac92740 128 char *start; /* Start of input for compile */
830247a4
IZ
129 char *end; /* End of input for compile */
130 char *parse; /* Input-scan pointer. */
51684e3c
KW
131 char *copy_start; /* start of copy of input within
132 constructed parse string */
cc16d262
KW
133 char *save_copy_start; /* Provides one level of saving
134 and restoring 'copy_start' */
51684e3c
KW
135 char *copy_start_in_input; /* Position in input string
136 corresponding to copy_start */
ea3daa5d 137 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 138 regnode *emit_start; /* Start of emitted-code area */
f55b7b26 139 regnode_offset emit; /* Code-emit pointer */
830247a4
IZ
140 I32 naughty; /* How bad is this pattern? */
141 I32 sawback; /* Did we see \1, ...? */
142 U32 seen;
88f063b4
KW
143 SSize_t size; /* Number of regnode equivalents in
144 pattern */
d8d1dede
KW
145 Size_t sets_depth; /* Counts recursion depth of already-
146 compiled regex set patterns */
7c932d07
KW
147
148 /* position beyond 'precomp' of the warning message furthest away from
149 * 'precomp'. During the parse, no warnings are raised for any problems
150 * earlier in the parse than this position. This works if warnings are
151 * raised the first time a given spot is parsed, and if only one
152 * independent warning is raised for any given spot */
153 Size_t latest_warn_offset;
154
155 I32 npar; /* Capture buffer count so far in the
156 parse, (OPEN) plus one. ("par" 0 is
157 the whole pattern)*/
158 I32 total_par; /* During initial parse, is either 0,
159 or -1; the latter indicating a
160 reparse is needed. After that pass,
161 it is what 'npar' became after the
162 pass. Hence, it being > 0 indicates
163 we are in a reparse situation */
538e84ed
KW
164 I32 nestroot; /* root parens we are in - used by
165 accept */
830247a4 166 I32 seen_zerolen;
f55b7b26
KW
167 regnode_offset *open_parens; /* offsets to open parens */
168 regnode_offset *close_parens; /* offsets to close parens */
0275405c 169 I32 parens_buf_size; /* #slots malloced open/close_parens */
d5a00e4a 170 regnode *end_op; /* END node in program */
02daf0ab
YO
171 I32 utf8; /* whether the pattern is utf8 or not */
172 I32 orig_utf8; /* whether the pattern was originally in utf8 */
173 /* XXX use this for future optimisation of case
174 * where pattern must be upgraded to utf8. */
e40e74fe
KW
175 I32 uni_semantics; /* If a d charset modifier should use unicode
176 rules, even if the pattern is not in
177 utf8 */
81714fb9 178 HV *paren_names; /* Paren names */
538e84ed 179
40d049e4 180 regnode **recurse; /* Recurse regops */
88f063b4 181 I32 recurse_count; /* Number of recurse regops we have generated */
4286711a 182 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
538e84ed 183 through */
09a65838 184 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 185 I32 in_lookbehind;
105c827d 186 I32 in_lookahead;
4624b182 187 I32 contains_locale;
bb3f3ed2 188 I32 override_recoding;
4caef9c3 189 I32 recode_x_to_native;
9d53c457 190 I32 in_multi_char_class;
1acab4c5 191 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
68e2671b 192 within pattern */
b1603ef8 193 int code_index; /* next code_blocks[] slot */
ee273784 194 SSize_t maxlen; /* mininum possible number of chars in string to match */
3f910e62
YO
195 scan_frame *frame_head;
196 scan_frame *frame_last;
197 U32 frame_count;
7eec73eb 198 AV *warn_text;
945961fd 199 HV *unlexed_names;
dc6d7f5c 200#ifdef ADD_TO_REGEXEC
830247a4
IZ
201 char *starttry; /* -Dr: where regtry was called. */
202#define RExC_starttry (pRExC_state->starttry)
203#endif
d24ca0c5 204 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 205#ifdef DEBUGGING
be8e71aa 206 const char *lastparse;
3dab1dad 207 I32 lastnum;
1f1031fe 208 AV *paren_name_list; /* idx -> name */
d9a72fcc 209 U32 study_chunk_recursed_count;
c9f0d54c
YO
210 SV *mysv1;
211 SV *mysv2;
88f063b4 212
3dab1dad
YO
213#define RExC_lastparse (pRExC_state->lastparse)
214#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 215#define RExC_paren_name_list (pRExC_state->paren_name_list)
d9a72fcc 216#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
c9f0d54c
YO
217#define RExC_mysv (pRExC_state->mysv1)
218#define RExC_mysv1 (pRExC_state->mysv1)
219#define RExC_mysv2 (pRExC_state->mysv2)
220
3dab1dad 221#endif
bd0b62a6 222 bool seen_d_op;
911bd04e 223 bool strict;
da7cf1cc 224 bool study_started;
034602eb 225 bool in_script_run;
7c932d07 226 bool use_BRANCHJ;
09b2b2e6 227};
830247a4 228
e2509266 229#define RExC_flags (pRExC_state->flags)
514a91f1 230#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 231#define RExC_precomp (pRExC_state->precomp)
51684e3c
KW
232#define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
233#define RExC_copy_start_in_constructed (pRExC_state->copy_start)
cc16d262 234#define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start)
711b303b 235#define RExC_precomp_end (pRExC_state->precomp_end)
288b8c02 236#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 237#define RExC_rx (pRExC_state->rx)
f8fc2ecf 238#define RExC_rxi (pRExC_state->rxi)
fac92740 239#define RExC_start (pRExC_state->start)
830247a4
IZ
240#define RExC_end (pRExC_state->end)
241#define RExC_parse (pRExC_state->parse)
7c932d07 242#define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
830247a4 243#define RExC_whilem_seen (pRExC_state->whilem_seen)
bd0b62a6
KW
244#define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
245 under /d from /u ? */
512c0f5a 246
7122b237 247#ifdef RE_TRACK_PATTERN_OFFSETS
44f36cd3 248# define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
538e84ed 249 others */
7122b237 250#endif
830247a4 251#define RExC_emit (pRExC_state->emit)
fac92740 252#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
253#define RExC_sawback (pRExC_state->sawback)
254#define RExC_seen (pRExC_state->seen)
255#define RExC_size (pRExC_state->size)
ee273784 256#define RExC_maxlen (pRExC_state->maxlen)
830247a4 257#define RExC_npar (pRExC_state->npar)
1533d1d2 258#define RExC_total_parens (pRExC_state->total_par)
0275405c 259#define RExC_parens_buf_size (pRExC_state->parens_buf_size)
e2e6a0f1 260#define RExC_nestroot (pRExC_state->nestroot)
830247a4 261#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 262#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 263#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 264#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
265#define RExC_open_parens (pRExC_state->open_parens)
266#define RExC_close_parens (pRExC_state->close_parens)
5bd2d46e 267#define RExC_end_op (pRExC_state->end_op)
81714fb9 268#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
269#define RExC_recurse (pRExC_state->recurse)
270#define RExC_recurse_count (pRExC_state->recurse_count)
d8d1dede 271#define RExC_sets_depth (pRExC_state->sets_depth)
09a65838 272#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
538e84ed
KW
273#define RExC_study_chunk_recursed_bytes \
274 (pRExC_state->study_chunk_recursed_bytes)
b57e4118 275#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
105c827d 276#define RExC_in_lookahead (pRExC_state->in_lookahead)
4624b182 277#define RExC_contains_locale (pRExC_state->contains_locale)
4caef9c3
KW
278#define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
279
b6d67071 280#ifdef EBCDIC
4caef9c3
KW
281# define SET_recode_x_to_native(x) \
282 STMT_START { RExC_recode_x_to_native = (x); } STMT_END
283#else
284# define SET_recode_x_to_native(x) NOOP
b6d67071 285#endif
4caef9c3 286
9d53c457 287#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
3f910e62
YO
288#define RExC_frame_head (pRExC_state->frame_head)
289#define RExC_frame_last (pRExC_state->frame_last)
290#define RExC_frame_count (pRExC_state->frame_count)
67cdf558 291#define RExC_strict (pRExC_state->strict)
da7cf1cc 292#define RExC_study_started (pRExC_state->study_started)
7eec73eb 293#define RExC_warn_text (pRExC_state->warn_text)
034602eb 294#define RExC_in_script_run (pRExC_state->in_script_run)
7c932d07 295#define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
945961fd 296#define RExC_unlexed_names (pRExC_state->unlexed_names)
830247a4 297
99807a43
HS
298/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
299 * a flag to disable back-off on the fixed/floating substrings - if it's
300 * a high complexity pattern we assume the benefit of avoiding a full match
301 * is worth the cost of checking for the substrings even if they rarely help.
302 */
303#define RExC_naughty (pRExC_state->naughty)
304#define TOO_NAUGHTY (10)
305#define MARK_NAUGHTY(add) \
306 if (RExC_naughty < TOO_NAUGHTY) \
307 RExC_naughty += (add)
308#define MARK_NAUGHTY_EXP(exp, add) \
309 if (RExC_naughty < TOO_NAUGHTY) \
310 RExC_naughty += RExC_naughty / (exp) + (add)
cde0cee5 311
a687059c
LW
312#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
313#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
412f55bb 314 ((*s) == '{' && regcurly(s)))
a687059c
LW
315
316/*
317 * Flags to be passed up and down.
318 */
a687059c 319#define WORST 0 /* Worst case. */
c4ead770
KW
320#define HASWIDTH 0x01 /* Known to not match null strings, could match
321 non-null ones. */
fda99bee 322
e64f369d 323/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
324 * character. (There needs to be a case: in the switch statement in regexec.c
325 * for any node marked SIMPLE.) Note that this is not the same thing as
326 * REGNODE_SIMPLE */
fda99bee 327#define SIMPLE 0x02
e64f369d 328#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
329#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
330#define TRYAGAIN 0x10 /* Weeded out a declaration. */
c36361b2
KW
331#define RESTART_PARSE 0x20 /* Need to redo the parse */
332#define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
333 calcuate sizes as UTF-8 */
a687059c 334
3dab1dad
YO
335#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
336
07be1b83
YO
337/* whether trie related optimizations are enabled */
338#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
339#define TRIE_STUDY_OPT
786e8c11 340#define FULL_TRIE_STUDY
07be1b83
YO
341#define TRIE_STCLASS
342#endif
1de06328
YO
343
344
40d049e4
YO
345
346#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
347#define PBITVAL(paren) (1 << ((paren) & 7))
ac3ef366
HS
348#define PAREN_OFFSET(depth) \
349 (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
350#define PAREN_TEST(depth, paren) \
351 (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
352#define PAREN_SET(depth, paren) \
353 (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
354#define PAREN_UNSET(depth, paren) \
355 (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
40d049e4 356
82a6ada4 357#define REQUIRE_UTF8(flagp) STMT_START { \
8d9c2815 358 if (!UTF) { \
c36361b2 359 *flagp = RESTART_PARSE|NEED_UTF8; \
f55b7b26 360 return 0; \
8d9c2815 361 } \
82a6ada4 362 } STMT_END
40d049e4 363
d63d75f5 364/* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
f6e8f31e
KW
365 * a flag that indicates we need to override /d with /u as a result of
366 * something in the pattern. It should only be used in regards to calling
b2090782 367 * set_regex_charset() or get_regex_charset() */
512c0f5a
KW
368#define REQUIRE_UNI_RULES(flagp, restart_retval) \
369 STMT_START { \
370 if (DEPENDS_SEMANTICS) { \
512c0f5a
KW
371 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
372 RExC_uni_semantics = 1; \
b76e1c04 373 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \
bd0b62a6
KW
374 /* No need to restart the parse if we haven't seen \
375 * anything that differs between /u and /d, and no need \
376 * to restart immediately if we're going to reparse \
377 * anyway to count parens */ \
c36361b2 378 *flagp |= RESTART_PARSE; \
512c0f5a 379 return restart_retval; \
d1be211c 380 } \
512c0f5a
KW
381 } \
382 } STMT_END
383
7c932d07
KW
384#define REQUIRE_BRANCHJ(flagp, restart_retval) \
385 STMT_START { \
386 RExC_use_BRANCHJ = 1; \
439a3bfe
KW
387 *flagp |= RESTART_PARSE; \
388 return restart_retval; \
7c932d07
KW
389 } STMT_END
390
b76e1c04
KW
391/* Until we have completed the parse, we leave RExC_total_parens at 0 or
392 * less. After that, it must always be positive, because the whole re is
393 * considered to be surrounded by virtual parens. Setting it to negative
394 * indicates there is some construct that needs to know the actual number of
395 * parens to be properly handled. And that means an extra pass will be
396 * required after we've counted them all */
397#define ALL_PARENS_COUNTED (RExC_total_parens > 0)
7c932d07 398#define REQUIRE_PARENS_PASS \
b76e1c04
KW
399 STMT_START { /* No-op if have completed a pass */ \
400 if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \
7c932d07 401 } STMT_END
b76e1c04
KW
402#define IN_PARENS_PASS (RExC_total_parens < 0)
403
7c932d07 404
f358228e
KW
405/* This is used to return failure (zero) early from the calling function if
406 * various flags in 'flags' are set. Two flags always cause a return:
407 * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any
408 * additional flags that should cause a return; 0 if none. If the return will
409 * be done, '*flagp' is first set to be all of the flags that caused the
410 * return. */
411#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
fd7fc6ae 412 STMT_START { \
c36361b2
KW
413 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
414 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
f358228e 415 return 0; \
fd7fc6ae
YO
416 } \
417 } STMT_END
418
c36361b2 419#define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
fd7fc6ae 420
5d555bb8 421#define RETURN_FAIL_ON_RESTART(flags,flagp) \
f358228e 422 RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
5d555bb8 423#define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
f358228e 424 if (MUST_RESTART(*(flagp))) return 0
fd7fc6ae 425
f19b1a63
KW
426/* This converts the named class defined in regcomp.h to its equivalent class
427 * number defined in handy.h. */
428#define namedclass_to_classnum(class) ((int) ((class) / 2))
429#define classnum_to_namedclass(classnum) ((classnum) * 2)
430
de92f5e6
KW
431#define _invlist_union_complement_2nd(a, b, output) \
432 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
433#define _invlist_intersection_complement_2nd(a, b, output) \
434 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
435
4a808c87
KW
436/* We add a marker if we are deferring expansion of a property that is both
437 * 1) potentiallly user-defined; and
438 * 2) could also be an official Unicode property.
439 *
440 * Without this marker, any deferred expansion can only be for a user-defined
441 * one. This marker shouldn't conflict with any that could be in a legal name,
442 * and is appended to its name to indicate this. There is a string and
443 * character form */
043cdb9a
KW
444#define DEFERRED_COULD_BE_OFFICIAL_MARKERs "~"
445#define DEFERRED_COULD_BE_OFFICIAL_MARKERc '~'
1c2f3d7a 446
1de06328
YO
447/* About scan_data_t.
448
449 During optimisation we recurse through the regexp program performing
450 various inplace (keyhole style) optimisations. In addition study_chunk
451 and scan_commit populate this data structure with information about
538e84ed 452 what strings MUST appear in the pattern. We look for the longest
3b753521 453 string that must appear at a fixed location, and we look for the
1de06328
YO
454 longest string that may appear at a floating location. So for instance
455 in the pattern:
538e84ed 456
1de06328 457 /FOO[xX]A.*B[xX]BAR/
538e84ed 458
1de06328
YO
459 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
460 strings (because they follow a .* construct). study_chunk will identify
461 both FOO and BAR as being the longest fixed and floating strings respectively.
538e84ed 462
1de06328 463 The strings can be composites, for instance
538e84ed 464
1de06328 465 /(f)(o)(o)/
538e84ed 466
1de06328 467 will result in a composite fixed substring 'foo'.
538e84ed 468
1de06328 469 For each string some basic information is maintained:
538e84ed 470
6ea77169 471 - min_offset
1de06328
YO
472 This is the position the string must appear at, or not before.
473 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
474 characters must match before the string we are searching for.
475 Likewise when combined with minlenp and the length of the string it
538e84ed 476 tells us how many characters must appear after the string we have
1de06328 477 found.
538e84ed 478
1de06328
YO
479 - max_offset
480 Only used for floating strings. This is the rightmost point that
ea3daa5d 481 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328 482 string can occur infinitely far to the right.
6e12f832 483 For fixed strings, it is equal to min_offset.
538e84ed 484
1de06328 485 - minlenp
2d608413
KW
486 A pointer to the minimum number of characters of the pattern that the
487 string was found inside. This is important as in the case of positive
538e84ed 488 lookahead or positive lookbehind we can have multiple patterns
1de06328 489 involved. Consider
538e84ed 490
1de06328 491 /(?=FOO).*F/
538e84ed 492
1de06328
YO
493 The minimum length of the pattern overall is 3, the minimum length
494 of the lookahead part is 3, but the minimum length of the part that
538e84ed 495 will actually match is 1. So 'FOO's minimum length is 3, but the
1de06328 496 minimum length for the F is 1. This is important as the minimum length
538e84ed 497 is used to determine offsets in front of and behind the string being
1de06328 498 looked for. Since strings can be composites this is the length of the
486ec47a 499 pattern at the time it was committed with a scan_commit. Note that
1de06328 500 the length is calculated by study_chunk, so that the minimum lengths
538e84ed 501 are not known until the full pattern has been compiled, thus the
1de06328 502 pointer to the value.
538e84ed 503
1de06328 504 - lookbehind
538e84ed 505
1de06328 506 In the case of lookbehind the string being searched for can be
538e84ed 507 offset past the start point of the final matching string.
1de06328
YO
508 If this value was just blithely removed from the min_offset it would
509 invalidate some of the calculations for how many chars must match
510 before or after (as they are derived from min_offset and minlen and
538e84ed 511 the length of the string being searched for).
1de06328
YO
512 When the final pattern is compiled and the data is moved from the
513 scan_data_t structure into the regexp structure the information
538e84ed
KW
514 about lookbehind is factored in, with the information that would
515 have been lost precalculated in the end_shift field for the
1de06328
YO
516 associated string.
517
518 The fields pos_min and pos_delta are used to store the minimum offset
538e84ed 519 and the delta to the maximum offset at the current point in the pattern.
1de06328
YO
520
521*/
2c2d71f5 522
cb6aef8b
DM
523struct scan_data_substrs {
524 SV *str; /* longest substring found in pattern */
525 SSize_t min_offset; /* earliest point in string it can appear */
526 SSize_t max_offset; /* latest point in string it can appear */
527 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
528 SSize_t lookbehind; /* is the pos of the string modified by LB */
11683ecb 529 I32 flags; /* per substring SF_* and SCF_* flags */
cb6aef8b
DM
530};
531
2c2d71f5 532typedef struct scan_data_t {
1de06328
YO
533 /*I32 len_min; unused */
534 /*I32 len_delta; unused */
49f55535 535 SSize_t pos_min;
ea3daa5d 536 SSize_t pos_delta;
2c2d71f5 537 SV *last_found;
ea3daa5d 538 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 539 SSize_t last_start_min;
ea3daa5d 540 SSize_t last_start_max;
2099df82
DM
541 U8 cur_is_floating; /* whether the last_* values should be set as
542 * the next fixed (0) or floating (1)
543 * substring */
6ea77169 544
2099df82
DM
545 /* [0] is longest fixed substring so far, [1] is longest float so far */
546 struct scan_data_substrs substrs[2];
6ea77169 547
11683ecb 548 I32 flags; /* common SF_* and SCF_* flags */
2c2d71f5 549 I32 whilem_c;
49f55535 550 SSize_t *last_closep;
b8f7bb16 551 regnode_ssc *start_class;
2c2d71f5
JH
552} scan_data_t;
553
a687059c 554/*
e50aee73 555 * Forward declarations for pregcomp()'s friends.
a687059c 556 */
a0d0e21e 557
6ea77169 558static const scan_data_t zero_scan_data = {
a2ca2017 559 0, 0, NULL, 0, 0, 0, 0,
6ea77169 560 {
11683ecb
DM
561 { NULL, 0, 0, 0, 0, 0 },
562 { NULL, 0, 0, 0, 0, 0 },
6ea77169
DM
563 },
564 0, 0, NULL, NULL
565};
c277df42 566
11683ecb
DM
567/* study flags */
568
07be1b83
YO
569#define SF_BEFORE_SEOL 0x0001
570#define SF_BEFORE_MEOL 0x0002
11683ecb 571#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
c277df42 572
07be1b83
YO
573#define SF_IS_INF 0x0040
574#define SF_HAS_PAR 0x0080
575#define SF_IN_PAR 0x0100
576#define SF_HAS_EVAL 0x0200
bc604ad8
KW
577
578
579/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
580 * longest substring in the pattern. When it is not set the optimiser keeps
581 * track of position, but does not keep track of the actual strings seen,
582 *
583 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
584 * /foo/i will not.
585 *
586 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
587 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
588 * turned off because of the alternation (BRANCH). */
07be1b83 589#define SCF_DO_SUBSTR 0x0400
bc604ad8 590
653099ff
GS
591#define SCF_DO_STCLASS_AND 0x0800
592#define SCF_DO_STCLASS_OR 0x1000
593#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 594#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 595
786e8c11 596#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
538e84ed 597#define SCF_SEEN_ACCEPT 0x8000
688e0391 598#define SCF_TRIE_DOING_RESTUDY 0x10000
4286711a
YO
599#define SCF_IN_DEFINE 0x20000
600
601
602
07be1b83 603
43fead97 604#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
605
606/* The enums for all these are ordered so things work out correctly */
a62b1201 607#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
538e84ed
KW
608#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
609 == REGEX_DEPENDS_CHARSET)
00b27cfc 610#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
538e84ed
KW
611#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
612 >= REGEX_UNICODE_CHARSET)
613#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
614 == REGEX_ASCII_RESTRICTED_CHARSET)
615#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
616 >= REGEX_ASCII_RESTRICTED_CHARSET)
617#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
618 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 619
43fead97 620#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 621
f2c2a6ab
KW
622/* For programs that want to be strictly Unicode compatible by dying if any
623 * attempt is made to match a non-Unicode code point against a Unicode
624 * property. */
625#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
626
93733859 627#define OOB_NAMEDCLASS -1
b8c5462f 628
8e661ac5
KW
629/* There is no code point that is out-of-bounds, so this is problematic. But
630 * its only current use is to initialize a variable that is always set before
631 * looked at. */
632#define OOB_UNICODE 0xDEADBEEF
633
a0ed51b3 634#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
a0ed51b3 635
8615cb43 636
b45f050a
JF
637/* length of regex to show in messages that don't mark a position within */
638#define RegexLengthToShowInErrorMessages 127
639
640/*
641 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
642 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
643 * op/pragma/warn/regcomp.
644 */
7253e4e3
RK
645#define MARKER1 "<-- HERE" /* marker as it appears in the description */
646#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 647
538e84ed 648#define REPORT_LOCATION " in regex; marked by " MARKER1 \
147e3846 649 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
b45f050a 650
285b5ca0
KW
651/* The code in this file in places uses one level of recursion with parsing
652 * rebased to an alternate string constructed by us in memory. This can take
653 * the form of something that is completely different from the input, or
654 * something that uses the input as part of the alternate. In the first case,
655 * there should be no possibility of an error, as we are in complete control of
51684e3c
KW
656 * the alternate string. But in the second case we don't completely control
657 * the input portion, so there may be errors in that. Here's an example:
285b5ca0
KW
658 * /[abc\x{DF}def]/ui
659 * is handled specially because \x{df} folds to a sequence of more than one
51684e3c 660 * character: 'ss'. What is done is to create and parse an alternate string,
285b5ca0
KW
661 * which looks like this:
662 * /(?:\x{DF}|[abc\x{DF}def])/ui
663 * where it uses the input unchanged in the middle of something it constructs,
664 * which is a branch for the DF outside the character class, and clustering
665 * parens around the whole thing. (It knows enough to skip the DF inside the
666 * class while in this substitute parse.) 'abc' and 'def' may have errors that
667 * need to be reported. The general situation looks like this:
668 *
51684e3c 669 * |<------- identical ------>|
285b5ca0 670 * sI tI xI eI
51684e3c 671 * Input: ---------------------------------------------------------------
285b5ca0
KW
672 * Constructed: ---------------------------------------------------
673 * sC tC xC eC EC
51684e3c 674 * |<------- identical ------>|
285b5ca0 675 *
51684e3c
KW
676 * sI..eI is the portion of the input pattern we are concerned with here.
677 * sC..EC is the constructed substitute parse string.
678 * sC..tC is constructed by us
679 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
680 * In the diagram, these are vertically aligned.
681 * eC..EC is also constructed by us.
682 * xC is the position in the substitute parse string where we found a
683 * problem.
684 * xI is the position in the original pattern corresponding to xC.
285b5ca0 685 *
51684e3c
KW
686 * We want to display a message showing the real input string. Thus we need to
687 * translate from xC to xI. We know that xC >= tC, since the portion of the
688 * string sC..tC has been constructed by us, and so shouldn't have errors. We
689 * get:
690 * xI = tI + (xC - tC)
285b5ca0 691 *
51684e3c
KW
692 * When the substitute parse is constructed, the code needs to set:
693 * RExC_start (sC)
694 * RExC_end (eC)
695 * RExC_copy_start_in_input (tI)
696 * RExC_copy_start_in_constructed (tC)
697 * and restore them when done.
285b5ca0 698 *
51684e3c
KW
699 * During normal processing of the input pattern, both
700 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
701 * sI, so that xC equals xI.
285b5ca0
KW
702 */
703
51684e3c
KW
704#define sI RExC_precomp
705#define eI RExC_precomp_end
706#define sC RExC_start
707#define eC RExC_end
708#define tI RExC_copy_start_in_input
709#define tC RExC_copy_start_in_constructed
710#define xI(xC) (tI + (xC - tC))
711#define xI_offset(xC) (xI(xC) - sI)
285b5ca0
KW
712
713#define REPORT_LOCATION_ARGS(xC) \
714 UTF8fARG(UTF, \
51684e3c 715 (xI(xC) > eI) /* Don't run off end */ \
232b691f 716 ? eI - sI /* Length before the <--HERE */ \
bc929b61
KW
717 : ((xI_offset(xC) >= 0) \
718 ? xI_offset(xC) \
719 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
720 IVdf " trying to output message for " \
721 " pattern %.*s", \
51684e3c 722 __FILE__, __LINE__, (IV) xI_offset(xC), \
bc929b61 723 ((int) (eC - sC)), sC), 0)), \
51684e3c 724 sI), /* The input pattern printed up to the <--HERE */ \
285b5ca0 725 UTF8fARG(UTF, \
51684e3c
KW
726 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \
727 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */
c1d900c3 728
8a6d8ec6
HS
729/* Used to point after bad bytes for an error message, but avoid skipping
730 * past a nul byte. */
e3304382 731#define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
8a6d8ec6 732
ed68fe11
KW
733/* Set up to clean up after our imminent demise */
734#define PREPARE_TO_DIE \
735 STMT_START { \
736 if (RExC_rx_sv) \
737 SAVEFREESV(RExC_rx_sv); \
7c932d07
KW
738 if (RExC_open_parens) \
739 SAVEFREEPV(RExC_open_parens); \
740 if (RExC_close_parens) \
741 SAVEFREEPV(RExC_close_parens); \
ed68fe11
KW
742 } STMT_END
743
b45f050a
JF
744/*
745 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
746 * arg. Show regex, up to a maximum length. If it's too long, chop and add
747 * "...".
748 */
58e23c8d 749#define _FAIL(code) STMT_START { \
bfed75c6 750 const char *ellipses = ""; \
88f063b4 751 IV len = RExC_precomp_end - RExC_precomp; \
ccb2c380 752 \
ed68fe11 753 PREPARE_TO_DIE; \
ccb2c380
MP
754 if (len > RegexLengthToShowInErrorMessages) { \
755 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
756 len = RegexLengthToShowInErrorMessages - 10; \
757 ellipses = "..."; \
758 } \
58e23c8d 759 code; \
ccb2c380 760} STMT_END
8615cb43 761
58e23c8d 762#define FAIL(msg) _FAIL( \
147e3846 763 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
c1d900c3 764 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
765
766#define FAIL2(msg,arg) _FAIL( \
147e3846 767 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
c1d900c3 768 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 769
4b7718d6
KW
770#define FAIL3(msg,arg1,arg2) _FAIL( \
771 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
772 arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
773
b45f050a 774/*
b45f050a
JF
775 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
776 */
ccb2c380 777#define Simple_vFAIL(m) STMT_START { \
ccb2c380 778 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
d528642a 779 m, REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 780} STMT_END
b45f050a
JF
781
782/*
783 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
784 */
ccb2c380 785#define vFAIL(m) STMT_START { \
ed68fe11 786 PREPARE_TO_DIE; \
ccb2c380
MP
787 Simple_vFAIL(m); \
788} STMT_END
b45f050a
JF
789
790/*
791 * Like Simple_vFAIL(), but accepts two arguments.
792 */
ccb2c380 793#define Simple_vFAIL2(m,a1) STMT_START { \
d528642a
KW
794 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
795 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 796} STMT_END
b45f050a
JF
797
798/*
799 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
800 */
ccb2c380 801#define vFAIL2(m,a1) STMT_START { \
ed68fe11 802 PREPARE_TO_DIE; \
ccb2c380
MP
803 Simple_vFAIL2(m, a1); \
804} STMT_END
b45f050a
JF
805
806
807/*
808 * Like Simple_vFAIL(), but accepts three arguments.
809 */
ccb2c380 810#define Simple_vFAIL3(m, a1, a2) STMT_START { \
c1d900c3 811 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 812 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 813} STMT_END
b45f050a
JF
814
815/*
816 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
817 */
ccb2c380 818#define vFAIL3(m,a1,a2) STMT_START { \
ed68fe11 819 PREPARE_TO_DIE; \
ccb2c380
MP
820 Simple_vFAIL3(m, a1, a2); \
821} STMT_END
b45f050a
JF
822
823/*
824 * Like Simple_vFAIL(), but accepts four arguments.
825 */
ccb2c380 826#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
d528642a
KW
827 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
828 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 829} STMT_END
b45f050a 830
95db3ffa 831#define vFAIL4(m,a1,a2,a3) STMT_START { \
ed68fe11 832 PREPARE_TO_DIE; \
95db3ffa
KW
833 Simple_vFAIL4(m, a1, a2, a3); \
834} STMT_END
835
946095af 836/* A specialized version of vFAIL2 that works with UTF8f */
d528642a 837#define vFAIL2utf8f(m, a1) STMT_START { \
ed68fe11 838 PREPARE_TO_DIE; \
d528642a
KW
839 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
840 REPORT_LOCATION_ARGS(RExC_parse)); \
946095af
BF
841} STMT_END
842
3ba22297 843#define vFAIL3utf8f(m, a1, a2) STMT_START { \
ed68fe11 844 PREPARE_TO_DIE; \
3ba22297 845 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 846 REPORT_LOCATION_ARGS(RExC_parse)); \
3ba22297
KW
847} STMT_END
848
d62e749e 849/* Setting this to NULL is a signal to not output warnings */
cc16d262
KW
850#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \
851 STMT_START { \
852 RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\
853 RExC_copy_start_in_constructed = NULL; \
854 } STMT_END
855#define RESTORE_WARNINGS \
856 RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
d62e749e 857
7c932d07
KW
858/* Since a warning can be generated multiple times as the input is reparsed, we
859 * output it the first time we come to that point in the parse, but suppress it
860 * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not
861 * generate any warnings */
55bcf857 862#define TO_OUTPUT_WARNINGS(loc) \
7c932d07
KW
863 ( RExC_copy_start_in_constructed \
864 && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
55bcf857 865
7c932d07
KW
866/* After we've emitted a warning, we save the position in the input so we don't
867 * output it again */
868#define UPDATE_WARNINGS_LOC(loc) \
869 STMT_START { \
870 if (TO_OUTPUT_WARNINGS(loc)) { \
5d894ca5
KW
871 RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \
872 - RExC_precomp; \
7c932d07
KW
873 } \
874 } STMT_END
77ccba3c 875
086f21bf 876/* 'warns' is the output of the packWARNx macro used in 'code' */
f8220fe2
KW
877#define _WARN_HELPER(loc, warns, code) \
878 STMT_START { \
d62e749e
KW
879 if (! RExC_copy_start_in_constructed) { \
880 Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \
881 " expected at '%s'", \
882 __FILE__, __LINE__, loc); \
883 } \
55bcf857 884 if (TO_OUTPUT_WARNINGS(loc)) { \
086f21bf
KW
885 if (ckDEAD(warns)) \
886 PREPARE_TO_DIE; \
338190a1 887 code; \
77ccba3c 888 UPDATE_WARNINGS_LOC(loc); \
338190a1 889 } \
f8220fe2 890 } STMT_END
946095af 891
5e0a247b 892/* m is not necessarily a "literal string", in this macro */
58ad8160
KW
893#define warn_non_literal_string(loc, packed_warn, m) \
894 _WARN_HELPER(loc, packed_warn, \
895 Perl_warner(aTHX_ packed_warn, \
d528642a 896 "%s" REPORT_LOCATION, \
f8220fe2 897 m, REPORT_LOCATION_ARGS(loc)))
58ad8160
KW
898#define reg_warn_non_literal_string(loc, m) \
899 warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)
5e0a247b 900
eb761011
KW
901#define ckWARN2_non_literal_string(loc, packwarn, m, a1) \
902 STMT_START { \
903 char * format; \
904 Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\
905 Newx(format, format_size, char); \
906 my_strlcpy(format, m, format_size); \
907 my_strlcat(format, REPORT_LOCATION, format_size); \
908 SAVEFREEPV(format); \
909 _WARN_HELPER(loc, packwarn, \
910 Perl_ck_warner(aTHX_ packwarn, \
911 format, \
912 a1, REPORT_LOCATION_ARGS(loc))); \
913 } STMT_END
914
f8220fe2
KW
915#define ckWARNreg(loc,m) \
916 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
917 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 918 m REPORT_LOCATION, \
f8220fe2 919 REPORT_LOCATION_ARGS(loc)))
ccb2c380 920
f8220fe2
KW
921#define vWARN(loc, m) \
922 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
923 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 924 m REPORT_LOCATION, \
f8220fe2 925 REPORT_LOCATION_ARGS(loc))) \
b927b7e9 926
f8220fe2
KW
927#define vWARN_dep(loc, m) \
928 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
929 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
d528642a 930 m REPORT_LOCATION, \
f8220fe2 931 REPORT_LOCATION_ARGS(loc)))
0d6106aa 932
f8220fe2
KW
933#define ckWARNdep(loc,m) \
934 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
935 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
d528642a 936 m REPORT_LOCATION, \
f8220fe2 937 REPORT_LOCATION_ARGS(loc)))
147508a2 938
f8220fe2
KW
939#define ckWARNregdep(loc,m) \
940 _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
941 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
d528642a
KW
942 WARN_REGEXP), \
943 m REPORT_LOCATION, \
f8220fe2 944 REPORT_LOCATION_ARGS(loc)))
ccb2c380 945
f8220fe2
KW
946#define ckWARN2reg_d(loc,m, a1) \
947 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
948 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
d528642a 949 m REPORT_LOCATION, \
f8220fe2 950 a1, REPORT_LOCATION_ARGS(loc)))
2335b3d3 951
f8220fe2
KW
952#define ckWARN2reg(loc, m, a1) \
953 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
954 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 955 m REPORT_LOCATION, \
f8220fe2 956 a1, REPORT_LOCATION_ARGS(loc)))
ccb2c380 957
f8220fe2
KW
958#define vWARN3(loc, m, a1, a2) \
959 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
960 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 961 m REPORT_LOCATION, \
f8220fe2 962 a1, a2, REPORT_LOCATION_ARGS(loc)))
ccb2c380 963
f8220fe2
KW
964#define ckWARN3reg(loc, m, a1, a2) \
965 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
966 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
967 m REPORT_LOCATION, \
968 a1, a2, \
f8220fe2 969 REPORT_LOCATION_ARGS(loc)))
668c081a 970
f8220fe2
KW
971#define vWARN4(loc, m, a1, a2, a3) \
972 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
973 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
974 m REPORT_LOCATION, \
975 a1, a2, a3, \
f8220fe2 976 REPORT_LOCATION_ARGS(loc)))
ccb2c380 977
f8220fe2
KW
978#define ckWARN4reg(loc, m, a1, a2, a3) \
979 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
980 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
981 m REPORT_LOCATION, \
982 a1, a2, a3, \
f8220fe2 983 REPORT_LOCATION_ARGS(loc)))
668c081a 984
f8220fe2
KW
985#define vWARN5(loc, m, a1, a2, a3, a4) \
986 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
987 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
988 m REPORT_LOCATION, \
989 a1, a2, a3, a4, \
f8220fe2 990 REPORT_LOCATION_ARGS(loc)))
9d1d55b5 991
aea5f181
KW
992#define ckWARNexperimental(loc, class, m) \
993 _WARN_HELPER(loc, packWARN(class), \
994 Perl_ck_warner_d(aTHX_ packWARN(class), \
995 m REPORT_LOCATION, \
996 REPORT_LOCATION_ARGS(loc)))
997
1a322bb4
KW
998/* Convert between a pointer to a node and its offset from the beginning of the
999 * program */
1000#define REGNODE_p(offset) (RExC_emit_start + (offset))
1001#define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
1002
538e84ed 1003/* Macros for recording node offsets. 20001227 mjd@plover.com
fac92740
MJD
1004 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
1005 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
1006 * Element 0 holds the number n.
07be1b83 1007 * Position is 1 indexed.
fac92740 1008 */
7122b237 1009#ifndef RE_TRACK_PATTERN_OFFSETS
d7c442ce 1010#define Set_Node_Offset_To_R(offset,byte)
7122b237
YO
1011#define Set_Node_Offset(node,byte)
1012#define Set_Cur_Node_Offset
1013#define Set_Node_Length_To_R(node,len)
1014#define Set_Node_Length(node,len)
6a86c6ad 1015#define Set_Node_Cur_Length(node,start)
538e84ed
KW
1016#define Node_Offset(n)
1017#define Node_Length(n)
7122b237
YO
1018#define Set_Node_Offset_Length(node,offset,len)
1019#define ProgLen(ri) ri->u.proglen
1020#define SetProgLen(ri,x) ri->u.proglen = x
4cbeff97 1021#define Track_Code(code)
7122b237
YO
1022#else
1023#define ProgLen(ri) ri->u.offsets[0]
1024#define SetProgLen(ri,x) ri->u.offsets[0] = x
d7c442ce 1025#define Set_Node_Offset_To_R(offset,byte) STMT_START { \
ccb2c380 1026 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
d7c442ce
KW
1027 __LINE__, (int)(offset), (int)(byte))); \
1028 if((offset) < 0) { \
538e84ed 1029 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
d7c442ce 1030 (int)(offset)); \
ccb2c380 1031 } else { \
d7c442ce 1032 RExC_offsets[2*(offset)-1] = (byte); \
ccb2c380 1033 } \
ccb2c380
MP
1034} STMT_END
1035
88f063b4 1036#define Set_Node_Offset(node,byte) \
1a322bb4 1037 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
ccb2c380
MP
1038#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
1039
1040#define Set_Node_Length_To_R(node,len) STMT_START { \
ccb2c380 1041 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 1042 __LINE__, (int)(node), (int)(len))); \
ccb2c380 1043 if((node) < 0) { \
538e84ed
KW
1044 Perl_croak(aTHX_ "value of node is %d in Length macro", \
1045 (int)(node)); \
ccb2c380
MP
1046 } else { \
1047 RExC_offsets[2*(node)] = (len); \
1048 } \
ccb2c380
MP
1049} STMT_END
1050
1051#define Set_Node_Length(node,len) \
1a322bb4 1052 Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
6a86c6ad
NC
1053#define Set_Node_Cur_Length(node, start) \
1054 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
1055
1056/* Get offsets and lengths */
1a322bb4
KW
1057#define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1058#define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
fac92740 1059
07be1b83 1060#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
1a322bb4
KW
1061 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \
1062 Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \
07be1b83 1063} STMT_END
4cbeff97
TC
1064
1065#define Track_Code(code) STMT_START { code } STMT_END
7122b237 1066#endif
07be1b83
YO
1067
1068#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1069#define EXPERIMENTAL_INPLACESCAN
f427392e 1070#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 1071
cb41e5d6
YO
1072#ifdef DEBUGGING
1073int
6ad9a8ab 1074Perl_re_printf(pTHX_ const char *fmt, ...)
cb41e5d6
YO
1075{
1076 va_list ap;
1077 int result;
1078 PerlIO *f= Perl_debug_log;
1079 PERL_ARGS_ASSERT_RE_PRINTF;
1080 va_start(ap, fmt);
1081 result = PerlIO_vprintf(f, fmt, ap);
1082 va_end(ap);
1083 return result;
1084}
1085
1086int
7b031478 1087Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
cb41e5d6
YO
1088{
1089 va_list ap;
1090 int result;
1091 PerlIO *f= Perl_debug_log;
1092 PERL_ARGS_ASSERT_RE_INDENTF;
1093 va_start(ap, depth);
daeb874b 1094 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
cb41e5d6
YO
1095 result = PerlIO_vprintf(f, fmt, ap);
1096 va_end(ap);
1097 return result;
1098}
1099#endif /* DEBUGGING */
1100
7b031478 1101#define DEBUG_RExC_seen() \
538e84ed 1102 DEBUG_OPTIMISE_MORE_r({ \
88f063b4 1103 Perl_re_printf( aTHX_ "RExC_seen: "); \
538e84ed 1104 \
e384d5c1 1105 if (RExC_seen & REG_ZERO_LEN_SEEN) \
88f063b4 1106 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
538e84ed 1107 \
e384d5c1 1108 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
88f063b4 1109 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
538e84ed 1110 \
e384d5c1 1111 if (RExC_seen & REG_GPOS_SEEN) \
88f063b4 1112 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
538e84ed 1113 \
e384d5c1 1114 if (RExC_seen & REG_RECURSE_SEEN) \
88f063b4 1115 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
538e84ed 1116 \
7b031478 1117 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
88f063b4 1118 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
538e84ed 1119 \
e384d5c1 1120 if (RExC_seen & REG_VERBARG_SEEN) \
88f063b4 1121 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
538e84ed 1122 \
e384d5c1 1123 if (RExC_seen & REG_CUTGROUP_SEEN) \
88f063b4 1124 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
538e84ed 1125 \
e384d5c1 1126 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
88f063b4 1127 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
538e84ed 1128 \
e384d5c1 1129 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
88f063b4 1130 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
538e84ed 1131 \
7b031478 1132 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
88f063b4 1133 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
ee273784 1134 \
88f063b4 1135 Perl_re_printf( aTHX_ "\n"); \
9e9ecfdf
YO
1136 });
1137
fdfb4f21 1138#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
6ad9a8ab 1139 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
fdfb4f21 1140
fdfb4f21 1141
f5a36d78
DM
1142#ifdef DEBUGGING
1143static void
1144S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1145 const char *close_str)
1146{
1147 if (!flags)
1148 return;
1149
1150 Perl_re_printf( aTHX_ "%s", open_str);
11683ecb
DM
1151 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1152 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
f5a36d78
DM
1153 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1154 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1155 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1156 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1157 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1158 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1159 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1160 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1161 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1162 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1163 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1164 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1165 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1166 Perl_re_printf( aTHX_ "%s", close_str);
1167}
1168
1169
1170static void
1171S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1172 U32 depth, int is_inf)
1173{
1174 GET_RE_DEBUG_FLAGS_DECL;
1175
1176 DEBUG_OPTIMISE_MORE_r({
1177 if (!data)
1178 return;
1179 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1180 depth,
1181 where,
1182 (IV)data->pos_min,
1183 (IV)data->pos_delta,
1184 (UV)data->flags
1185 );
1186
1187 S_debug_show_study_flags(aTHX_ data->flags," [","]");
fdfb4f21 1188
f5a36d78
DM
1189 Perl_re_printf( aTHX_
1190 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1191 (IV)data->whilem_c,
1192 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1193 is_inf ? "INF " : ""
1194 );
1195
11683ecb 1196 if (data->last_found) {
37b6262f 1197 int i;
11683ecb
DM
1198 Perl_re_printf(aTHX_
1199 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1200 SvPVX_const(data->last_found),
1201 (IV)data->last_end,
1202 (IV)data->last_start_min,
1203 (IV)data->last_start_max
1204 );
1205
2099df82 1206 for (i = 0; i < 2; i++) {
37b6262f
DM
1207 Perl_re_printf(aTHX_
1208 " %s%s: '%s' @ %" IVdf "/%" IVdf,
2099df82
DM
1209 data->cur_is_floating == i ? "*" : "",
1210 i ? "Float" : "Fixed",
37b6262f
DM
1211 SvPVX_const(data->substrs[i].str),
1212 (IV)data->substrs[i].min_offset,
1213 (IV)data->substrs[i].max_offset
1214 );
1215 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1216 }
11683ecb 1217 }
f5a36d78
DM
1218
1219 Perl_re_printf( aTHX_ "\n");
1220 });
1221}
1222
1223
1224static void
1225S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1226 regnode *scan, U32 depth, U32 flags)
1227{
1228 GET_RE_DEBUG_FLAGS_DECL;
1229
1230 DEBUG_OPTIMISE_r({
1231 regnode *Next;
1232
1233 if (!scan)
1234 return;
1235 Next = regnext(scan);
1236 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1237 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1238 depth,
1239 str,
1240 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1241 Next ? (REG_NODE_NUM(Next)) : 0 );
1242 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1243 Perl_re_printf( aTHX_ "\n");
1244 });
1245}
1246
1247
1248# define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1249 S_debug_studydata(aTHX_ where, data, depth, is_inf)
1250
1251# define DEBUG_PEEP(str, scan, depth, flags) \
1252 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1253
1254#else
1255# define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1256# define DEBUG_PEEP(str, scan, depth, flags) NOOP
1257#endif
1de06328 1258
cb41e5d6 1259
c6871b76
KW
1260/* =========================================================
1261 * BEGIN edit_distance stuff.
1262 *
1263 * This calculates how many single character changes of any type are needed to
1264 * transform a string into another one. It is taken from version 3.1 of
1265 *
1266 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1267 */
1268
1269/* Our unsorted dictionary linked list. */
1270/* Note we use UVs, not chars. */
1271
1272struct dictionary{
1273 UV key;
1274 UV value;
1275 struct dictionary* next;
1276};
1277typedef struct dictionary item;
1278
1279
1280PERL_STATIC_INLINE item*
88f063b4 1281push(UV key, item* curr)
c6871b76
KW
1282{
1283 item* head;
d09f14bf 1284 Newx(head, 1, item);
c6871b76
KW
1285 head->key = key;
1286 head->value = 0;
1287 head->next = curr;
1288 return head;
1289}
1290
1291
1292PERL_STATIC_INLINE item*
1293find(item* head, UV key)
1294{
1295 item* iterator = head;
1296 while (iterator){
1297 if (iterator->key == key){
1298 return iterator;
1299 }
1300 iterator = iterator->next;
1301 }
1302
1303 return NULL;
1304}
1305
1306PERL_STATIC_INLINE item*
88f063b4 1307uniquePush(item* head, UV key)
c6871b76
KW
1308{
1309 item* iterator = head;
1310
1311 while (iterator){
1312 if (iterator->key == key) {
1313 return head;
1314 }
1315 iterator = iterator->next;
1316 }
1317
88f063b4 1318 return push(key, head);
c6871b76
KW
1319}
1320
1321PERL_STATIC_INLINE void
1322dict_free(item* head)
1323{
1324 item* iterator = head;
1325
1326 while (iterator) {
1327 item* temp = iterator;
1328 iterator = iterator->next;
1329 Safefree(temp);
1330 }
1331
1332 head = NULL;
1333}
1334
1335/* End of Dictionary Stuff */
1336
1337/* All calculations/work are done here */
1338STATIC int
1339S_edit_distance(const UV* src,
1340 const UV* tgt,
1341 const STRLEN x, /* length of src[] */
1342 const STRLEN y, /* length of tgt[] */
1343 const SSize_t maxDistance
1344)
1345{
1346 item *head = NULL;
88f063b4 1347 UV swapCount, swapScore, targetCharCount, i, j;
c6871b76
KW
1348 UV *scores;
1349 UV score_ceil = x + y;
1350
1351 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1352
1353 /* intialize matrix start values */
d09f14bf 1354 Newx(scores, ( (x + 2) * (y + 2)), UV);
c6871b76
KW
1355 scores[0] = score_ceil;
1356 scores[1 * (y + 2) + 0] = score_ceil;
1357 scores[0 * (y + 2) + 1] = score_ceil;
1358 scores[1 * (y + 2) + 1] = 0;
88f063b4 1359 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
c6871b76
KW
1360
1361 /* work loops */
1362 /* i = src index */
1363 /* j = tgt index */
1364 for (i=1;i<=x;i++) {
1365 if (i < x)
88f063b4 1366 head = uniquePush(head, src[i]);
c6871b76
KW
1367 scores[(i+1) * (y + 2) + 1] = i;
1368 scores[(i+1) * (y + 2) + 0] = score_ceil;
1369 swapCount = 0;
1370
1371 for (j=1;j<=y;j++) {
1372 if (i == 1) {
1373 if(j < y)
88f063b4 1374 head = uniquePush(head, tgt[j]);
c6871b76
KW
1375 scores[1 * (y + 2) + (j + 1)] = j;
1376 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1377 }
1378
88f063b4 1379 targetCharCount = find(head, tgt[j-1])->value;
c6871b76
KW
1380 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1381
1382 if (src[i-1] != tgt[j-1]){
1383 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1384 }
1385 else {
1386 swapCount = j;
1387 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1388 }
1389 }
1390
88f063b4 1391 find(head, src[i-1])->value = i;
c6871b76
KW
1392 }
1393
1394 {
1395 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1396 dict_free(head);
1397 Safefree(scores);
1398 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1399 }
1400}
1401
1402/* END of edit_distance() stuff
1403 * ========================================================= */
1404
653099ff 1405/* Mark that we cannot extend a found fixed substring at this point.
37b6262f 1406 Update the longest found anchored substring or the longest found
653099ff
GS
1407 floating substrings if needed. */
1408
4327152a 1409STATIC void
ea3daa5d
FC
1410S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1411 SSize_t *minlenp, int is_inf)
c277df42 1412{
e1ec3a88 1413 const STRLEN l = CHR_SVLEN(data->last_found);
2099df82 1414 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
a2ca2017 1415 const STRLEN old_l = CHR_SVLEN(longest_sv);
1de06328 1416 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1417
7918f24d
NC
1418 PERL_ARGS_ASSERT_SCAN_COMMIT;
1419
c277df42 1420 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
2099df82 1421 const U8 i = data->cur_is_floating;
a2ca2017 1422 SvSetMagicSV(longest_sv, data->last_found);
37b6262f
DM
1423 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1424
2099df82
DM
1425 if (!i) /* fixed */
1426 data->substrs[0].max_offset = data->substrs[0].min_offset;
1427 else { /* float */
1428 data->substrs[1].max_offset = (l
646e8787
DM
1429 ? data->last_start_max
1430 : (data->pos_delta > SSize_t_MAX - data->pos_min
ea3daa5d
FC
1431 ? SSize_t_MAX
1432 : data->pos_min + data->pos_delta));
1433 if (is_inf
2099df82
DM
1434 || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1435 data->substrs[1].max_offset = SSize_t_MAX;
37b6262f 1436 }
11683ecb 1437
37b6262f
DM
1438 if (data->flags & SF_BEFORE_EOL)
1439 data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
1440 else
1441 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1442 data->substrs[i].minlenp = minlenp;
1443 data->substrs[i].lookbehind = 0;
c277df42 1444 }
37b6262f 1445
c277df42 1446 SvCUR_set(data->last_found, 0);
0eda9292 1447 {
a28509cc 1448 SV * const sv = data->last_found;
097eb12c
AL
1449 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1450 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1451 if (mg)
1452 mg->mg_len = 0;
1453 }
0eda9292 1454 }
c277df42
IZ
1455 data->last_end = -1;
1456 data->flags &= ~SF_BEFORE_EOL;
f5a36d78 1457 DEBUG_STUDYDATA("commit", data, 0, is_inf);
c277df42
IZ
1458}
1459
cdd87c1d
KW
1460/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1461 * list that describes which code points it matches */
1462
653099ff 1463STATIC void
3420edd7 1464S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 1465{
cdd87c1d
KW
1466 /* Set the SSC 'ssc' to match an empty string or any code point */
1467
557bd3fb 1468 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 1469
71068078 1470 assert(is_ANYOF_SYNTHETIC(ssc));
3fffb88a 1471
0854ea0b
KW
1472 /* mortalize so won't leak */
1473 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
93e92956 1474 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
653099ff
GS
1475}
1476
653099ff 1477STATIC int
dc3bf405 1478S_ssc_is_anything(const regnode_ssc *ssc)
653099ff 1479{
c144baaa
KW
1480 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1481 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1482 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1483 * in any way, so there's no point in using it */
cdd87c1d
KW
1484
1485 UV start, end;
1486 bool ret;
653099ff 1487
557bd3fb 1488 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 1489
71068078 1490 assert(is_ANYOF_SYNTHETIC(ssc));
cdd87c1d 1491
93e92956 1492 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
cdd87c1d
KW
1493 return FALSE;
1494 }
1495
1496 /* See if the list consists solely of the range 0 - Infinity */
1497 invlist_iterinit(ssc->invlist);
1498 ret = invlist_iternext(ssc->invlist, &start, &end)
1499 && start == 0
1500 && end == UV_MAX;
1501
1502 invlist_iterfinish(ssc->invlist);
1503
1504 if (ret) {
1505 return TRUE;
1506 }
1507
1508 /* If e.g., both \w and \W are set, matches everything */
e0e1be5f 1509 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1510 int i;
1511 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1512 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1513 return TRUE;
1514 }
1515 }
1516 }
1517
1518 return FALSE;
653099ff
GS
1519}
1520
653099ff 1521STATIC void
cdd87c1d 1522S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 1523{
cdd87c1d
KW
1524 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1525 * string, any code point, or any posix class under locale */
1526
557bd3fb 1527 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 1528
557bd3fb 1529 Zero(ssc, 1, regnode_ssc);
71068078 1530 set_ANYOF_SYNTHETIC(ssc);
93e92956 1531 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
3420edd7 1532 ssc_anything(ssc);
cdd87c1d 1533
2f306ab9
KW
1534 /* If any portion of the regex is to operate under locale rules that aren't
1535 * fully known at compile time, initialization includes it. The reason
1536 * this isn't done for all regexes is that the optimizer was written under
1537 * the assumption that locale was all-or-nothing. Given the complexity and
1538 * lack of documentation in the optimizer, and that there are inadequate
1539 * test cases for locale, many parts of it may not work properly, it is
1540 * safest to avoid locale unless necessary. */
cdd87c1d
KW
1541 if (RExC_contains_locale) {
1542 ANYOF_POSIXL_SETALL(ssc);
cdd87c1d
KW
1543 }
1544 else {
1545 ANYOF_POSIXL_ZERO(ssc);
1546 }
653099ff
GS
1547}
1548
b423522f 1549STATIC int
dc3bf405
BF
1550S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1551 const regnode_ssc *ssc)
b423522f
KW
1552{
1553 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1554 * to the list of code points matched, and locale posix classes; hence does
1555 * not check its flags) */
1556
1557 UV start, end;
1558 bool ret;
1559
1560 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1561
71068078 1562 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1563
1564 invlist_iterinit(ssc->invlist);
1565 ret = invlist_iternext(ssc->invlist, &start, &end)
1566 && start == 0
1567 && end == UV_MAX;
1568
1569 invlist_iterfinish(ssc->invlist);
1570
1571 if (! ret) {
1572 return FALSE;
1573 }
1574
e0e1be5f 1575 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
31f05a37 1576 return FALSE;
b423522f
KW
1577 }
1578
1579 return TRUE;
1580}
1581
4ebed06a
KW
1582#define INVLIST_INDEX 0
1583#define ONLY_LOCALE_MATCHES_INDEX 1
1584#define DEFERRED_USER_DEFINED_INDEX 2
1585
b423522f
KW
1586STATIC SV*
1587S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
5c0f85ef 1588 const regnode_charclass* const node)
b423522f
KW
1589{
1590 /* Returns a mortal inversion list defining which code points are matched
1591 * by 'node', which is of type ANYOF. Handles complementing the result if
1592 * appropriate. If some code points aren't knowable at this time, the
31f05a37
KW
1593 * returned list must, and will, contain every code point that is a
1594 * possibility. */
b423522f 1595
1565c085 1596 dVAR;
e2506fa7 1597 SV* invlist = NULL;
1ee208c4 1598 SV* only_utf8_locale_invlist = NULL;
b423522f
KW
1599 unsigned int i;
1600 const U32 n = ARG(node);
31f05a37 1601 bool new_node_has_latin1 = FALSE;
2d5613be 1602 const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb))
29a889ef
KW
1603 ? 0
1604 : ANYOF_FLAGS(node);
b423522f
KW
1605
1606 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1607
1608 /* Look at the data structure created by S_set_ANYOF_arg() */
93e92956 1609 if (n != ANYOF_ONLY_HAS_BITMAP) {
b423522f
KW
1610 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1611 AV * const av = MUTABLE_AV(SvRV(rv));
1612 SV **const ary = AvARRAY(av);
1613 assert(RExC_rxi->data->what[n] == 's');
1614
4ebed06a 1615 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
b423522f 1616
6ed02bb6
KW
1617 /* Here there are things that won't be known until runtime -- we
1618 * have to assume it could be anything */
e2506fa7 1619 invlist = sv_2mortal(_new_invlist(1));
b423522f
KW
1620 return _add_range_to_invlist(invlist, 0, UV_MAX);
1621 }
4ebed06a 1622 else if (ary[INVLIST_INDEX]) {
b423522f 1623
6ed02bb6 1624 /* Use the node's inversion list */
4ebed06a 1625 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1ee208c4
KW
1626 }
1627
1628 /* Get the code points valid only under UTF-8 locales */
766d6d33 1629 if ( (flags & ANYOFL_FOLD)
4ebed06a 1630 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1ee208c4 1631 {
4ebed06a 1632 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
b423522f
KW
1633 }
1634 }
1635
e2506fa7
KW
1636 if (! invlist) {
1637 invlist = sv_2mortal(_new_invlist(0));
1638 }
1639
dcb20b36
KW
1640 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1641 * code points, and an inversion list for the others, but if there are code
1642 * points that should match only conditionally on the target string being
1643 * UTF-8, those are placed in the inversion list, and not the bitmap.
1644 * Since there are circumstances under which they could match, they are
1645 * included in the SSC. But if the ANYOF node is to be inverted, we have
1646 * to exclude them here, so that when we invert below, the end result
1647 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1648 * have to do this here before we add the unconditionally matched code
1649 * points */
766d6d33 1650 if (flags & ANYOF_INVERT) {
b423522f
KW
1651 _invlist_intersection_complement_2nd(invlist,
1652 PL_UpperLatin1,
1653 &invlist);
1654 }
1655
1656 /* Add in the points from the bit map */
2d5613be 1657 if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) {
90973738
KW
1658 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1659 if (ANYOF_BITMAP_TEST(node, i)) {
1660 unsigned int start = i++;
1661
1662 for (; i < NUM_ANYOF_CODE_POINTS
1663 && ANYOF_BITMAP_TEST(node, i); ++i)
1664 {
1665 /* empty */
1666 }
1667 invlist = _add_range_to_invlist(invlist, start, i-1);
1668 new_node_has_latin1 = TRUE;
6f8848d5 1669 }
b423522f
KW
1670 }
1671 }
1672
1673 /* If this can match all upper Latin1 code points, have to add them
ac33c516
KW
1674 * as well. But don't add them if inverting, as when that gets done below,
1675 * it would exclude all these characters, including the ones it shouldn't
1676 * that were added just above */
766d6d33
KW
1677 if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
1678 && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
f240c685 1679 {
b423522f
KW
1680 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1681 }
1682
1683 /* Similarly for these */
766d6d33 1684 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
e0a1ff7a 1685 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
b423522f
KW
1686 }
1687
766d6d33 1688 if (flags & ANYOF_INVERT) {
b423522f
KW
1689 _invlist_invert(invlist);
1690 }
766d6d33 1691 else if (flags & ANYOFL_FOLD) {
35b8412f 1692 if (new_node_has_latin1) {
31f05a37 1693
26be5fe6
KW
1694 /* Under /li, any 0-255 could fold to any other 0-255, depending on
1695 * the locale. We can skip this if there are no 0-255 at all. */
1696 _invlist_union(invlist, PL_Latin1, &invlist);
35b8412f
KW
1697
1698 invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1699 invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1700 }
1701 else {
1702 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1703 invlist = add_cp_to_invlist(invlist, 'I');
1704 }
1705 if (_invlist_contains_cp(invlist,
1706 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1707 {
1708 invlist = add_cp_to_invlist(invlist, 'i');
1709 }
1710 }
31f05a37
KW
1711 }
1712
1ee208c4
KW
1713 /* Similarly add the UTF-8 locale possible matches. These have to be
1714 * deferred until after the non-UTF-8 locale ones are taken care of just
1715 * above, or it leads to wrong results under ANYOF_INVERT */
1716 if (only_utf8_locale_invlist) {
31f05a37 1717 _invlist_union_maybe_complement_2nd(invlist,
1ee208c4 1718 only_utf8_locale_invlist,
766d6d33 1719 flags & ANYOF_INVERT,
31f05a37
KW
1720 &invlist);
1721 }
b423522f
KW
1722
1723 return invlist;
1724}
1725
1051e1c4 1726/* These two functions currently do the exact same thing */
557bd3fb 1727#define ssc_init_zero ssc_init
653099ff 1728
cdd87c1d
KW
1729#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1730#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1731
557bd3fb 1732/* 'AND' a given class with another one. Can create false positives. 'ssc'
93e92956
KW
1733 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1734 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1735
653099ff 1736STATIC void
b423522f 1737S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1738 const regnode_charclass *and_with)
653099ff 1739{
cdd87c1d
KW
1740 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1741 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1742
a0dd4231 1743 SV* anded_cp_list;
2d5613be 1744 U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb)
29a889ef
KW
1745 ? 0
1746 : ANYOF_FLAGS(and_with);
a0dd4231 1747 U8 anded_flags;
1e6ade67 1748
cdd87c1d 1749 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1750
71068078 1751 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1752
1753 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1754 * the code point inversion list and just the relevant flags */
71068078 1755 if (is_ANYOF_SYNTHETIC(and_with)) {
7dcac5f6 1756 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
765e6ecf 1757 anded_flags = and_with_flags;
e9b08962
KW
1758
1759 /* XXX This is a kludge around what appears to be deficiencies in the
1760 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1761 * there are paths through the optimizer where it doesn't get weeded
1762 * out when it should. And if we don't make some extra provision for
1763 * it like the code just below, it doesn't get added when it should.
1764 * This solution is to add it only when AND'ing, which is here, and
1765 * only when what is being AND'ed is the pristine, original node
1766 * matching anything. Thus it is like adding it to ssc_anything() but
1767 * only when the result is to be AND'ed. Probably the same solution
1768 * could be adopted for the same problem we have with /l matching,
1769 * which is solved differently in S_ssc_init(), and that would lead to
1770 * fewer false positives than that solution has. But if this solution
1771 * creates bugs, the consequences are only that a warning isn't raised
1772 * that should be; while the consequences for having /l bugs is
1773 * incorrect matches */
7dcac5f6 1774 if (ssc_is_anything((regnode_ssc *)and_with)) {
f240c685 1775 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
e9b08962 1776 }
a0dd4231
KW
1777 }
1778 else {
5c0f85ef 1779 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
f240c685 1780 if (OP(and_with) == ANYOFD) {
765e6ecf 1781 anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
f240c685
KW
1782 }
1783 else {
765e6ecf 1784 anded_flags = and_with_flags
f240c685 1785 &( ANYOF_COMMON_FLAGS
108316fb
KW
1786 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1787 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
765e6ecf 1788 if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
d1c40ef5
KW
1789 anded_flags &=
1790 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1791 }
f240c685 1792 }
a0dd4231
KW
1793 }
1794
1795 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1796
1797 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1798 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1799 * 'and_with' may be inverted. When not inverted, we have the situation of
1800 * computing:
1801 * (C1 | P1) & (C2 | P2)
1802 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1803 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1804 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1805 * <= ((C1 & C2) | P1 | P2)
1806 * Alternatively, the last few steps could be:
1807 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1808 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1809 * <= (C1 | C2 | (P1 & P2))
1810 * We favor the second approach if either P1 or P2 is non-empty. This is
1811 * because these components are a barrier to doing optimizations, as what
1812 * they match cannot be known until the moment of matching as they are
1813 * dependent on the current locale, 'AND"ing them likely will reduce or
1814 * eliminate them.
1815 * But we can do better if we know that C1,P1 are in their initial state (a
1816 * frequent occurrence), each matching everything:
1817 * (<everything>) & (C2 | P2) = C2 | P2
1818 * Similarly, if C2,P2 are in their initial state (again a frequent
1819 * occurrence), the result is a no-op
1820 * (C1 | P1) & (<everything>) = C1 | P1
1821 *
1822 * Inverted, we have
1823 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1824 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1825 * <= (C1 & ~C2) | (P1 & ~P2)
1826 * */
1aa99e6b 1827
765e6ecf 1828 if ((and_with_flags & ANYOF_INVERT)
71068078 1829 && ! is_ANYOF_SYNTHETIC(and_with))
a0dd4231 1830 {
cdd87c1d 1831 unsigned int i;
8951c461 1832
cdd87c1d
KW
1833 ssc_intersection(ssc,
1834 anded_cp_list,
1835 FALSE /* Has already been inverted */
1836 );
c6b76537 1837
cdd87c1d
KW
1838 /* If either P1 or P2 is empty, the intersection will be also; can skip
1839 * the loop */
765e6ecf 1840 if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
cdd87c1d
KW
1841 ANYOF_POSIXL_ZERO(ssc);
1842 }
e0e1be5f 1843 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1844
1845 /* Note that the Posix class component P from 'and_with' actually
1846 * looks like:
1847 * P = Pa | Pb | ... | Pn
1848 * where each component is one posix class, such as in [\w\s].
1849 * Thus
1850 * ~P = ~(Pa | Pb | ... | Pn)
1851 * = ~Pa & ~Pb & ... & ~Pn
1852 * <= ~Pa | ~Pb | ... | ~Pn
1853 * The last is something we can easily calculate, but unfortunately
1854 * is likely to have many false positives. We could do better
1855 * in some (but certainly not all) instances if two classes in
1856 * P have known relationships. For example
1857 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1858 * So
1859 * :lower: & :print: = :lower:
1860 * And similarly for classes that must be disjoint. For example,
1861 * since \s and \w can have no elements in common based on rules in
1862 * the POSIX standard,
1863 * \w & ^\S = nothing
1864 * Unfortunately, some vendor locales do not meet the Posix
1865 * standard, in particular almost everything by Microsoft.
1866 * The loop below just changes e.g., \w into \W and vice versa */
1867
1ee208c4 1868 regnode_charclass_posixl temp;
cdd87c1d
KW
1869 int add = 1; /* To calculate the index of the complement */
1870
b1234259 1871 Zero(&temp, 1, regnode_charclass_posixl);
cdd87c1d
KW
1872 ANYOF_POSIXL_ZERO(&temp);
1873 for (i = 0; i < ANYOF_MAX; i++) {
1874 assert(i % 2 != 0
7dcac5f6
KW
1875 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1876 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
cdd87c1d 1877
7dcac5f6 1878 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
cdd87c1d
KW
1879 ANYOF_POSIXL_SET(&temp, i + add);
1880 }
1881 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1882 }
1883 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1884
cdd87c1d
KW
1885 } /* else ssc already has no posixes */
1886 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1887 in its initial state */
71068078 1888 else if (! is_ANYOF_SYNTHETIC(and_with)
7dcac5f6 1889 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
cdd87c1d
KW
1890 {
1891 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1892 * copy it over 'ssc' */
1893 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
71068078 1894 if (is_ANYOF_SYNTHETIC(and_with)) {
cdd87c1d
KW
1895 StructCopy(and_with, ssc, regnode_ssc);
1896 }
1897 else {
1898 ssc->invlist = anded_cp_list;
1899 ANYOF_POSIXL_ZERO(ssc);
765e6ecf 1900 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1901 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
cdd87c1d
KW
1902 }
1903 }
1904 }
e0e1be5f 1905 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
765e6ecf 1906 || (and_with_flags & ANYOF_MATCHES_POSIXL))
cdd87c1d
KW
1907 {
1908 /* One or the other of P1, P2 is non-empty. */
765e6ecf 1909 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1ea8b7fe
KW
1910 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1911 }
cdd87c1d
KW
1912 ssc_union(ssc, anded_cp_list, FALSE);
1913 }
1914 else { /* P1 = P2 = empty */
1915 ssc_intersection(ssc, anded_cp_list, FALSE);
1916 }
137165a6 1917 }
653099ff
GS
1918}
1919
653099ff 1920STATIC void
cdd87c1d 1921S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1922 const regnode_charclass *or_with)
653099ff 1923{
cdd87c1d
KW
1924 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1925 * another SSC or a regular ANYOF class. Can create false positives if
1926 * 'or_with' is to be inverted. */
7918f24d 1927
a0dd4231
KW
1928 SV* ored_cp_list;
1929 U8 ored_flags;
2d5613be 1930 U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb)
29a889ef
KW
1931 ? 0
1932 : ANYOF_FLAGS(or_with);
c6b76537 1933
cdd87c1d 1934 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 1935
71068078 1936 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1937
1938 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1939 * the code point inversion list and just the relevant flags */
71068078 1940 if (is_ANYOF_SYNTHETIC(or_with)) {
7dcac5f6 1941 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
765e6ecf 1942 ored_flags = or_with_flags;
a0dd4231
KW
1943 }
1944 else {
5c0f85ef 1945 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
765e6ecf 1946 ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
f240c685
KW
1947 if (OP(or_with) != ANYOFD) {
1948 ored_flags
765e6ecf 1949 |= or_with_flags
108316fb
KW
1950 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1951 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
765e6ecf 1952 if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
d1c40ef5
KW
1953 ored_flags |=
1954 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1955 }
f240c685 1956 }
a0dd4231
KW
1957 }
1958
1959 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
1960
1961 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1962 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1963 * 'or_with' may be inverted. When not inverted, we have the simple
1964 * situation of computing:
1965 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1966 * If P1|P2 yields a situation with both a class and its complement are
1967 * set, like having both \w and \W, this matches all code points, and we
1968 * can delete these from the P component of the ssc going forward. XXX We
1969 * might be able to delete all the P components, but I (khw) am not certain
1970 * about this, and it is better to be safe.
1971 *
1972 * Inverted, we have
1973 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1974 * <= (C1 | P1) | ~C2
1975 * <= (C1 | ~C2) | P1
1976 * (which results in actually simpler code than the non-inverted case)
1977 * */
9826f543 1978
765e6ecf 1979 if ((or_with_flags & ANYOF_INVERT)
71068078 1980 && ! is_ANYOF_SYNTHETIC(or_with))
a0dd4231 1981 {
cdd87c1d 1982 /* We ignore P2, leaving P1 going forward */
1ea8b7fe 1983 } /* else Not inverted */
765e6ecf 1984 else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1985 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
e0e1be5f 1986 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1987 unsigned int i;
1988 for (i = 0; i < ANYOF_MAX; i += 2) {
1989 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1990 {
1991 ssc_match_all_cp(ssc);
1992 ANYOF_POSIXL_CLEAR(ssc, i);
1993 ANYOF_POSIXL_CLEAR(ssc, i+1);
cdd87c1d
KW
1994 }
1995 }
1996 }
1aa99e6b 1997 }
cdd87c1d
KW
1998
1999 ssc_union(ssc,
2000 ored_cp_list,
2001 FALSE /* Already has been inverted */
2002 );
653099ff
GS
2003}
2004
b423522f
KW
2005PERL_STATIC_INLINE void
2006S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
2007{
2008 PERL_ARGS_ASSERT_SSC_UNION;
2009
71068078 2010 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2011
2012 _invlist_union_maybe_complement_2nd(ssc->invlist,
2013 invlist,
2014 invert2nd,
2015 &ssc->invlist);
2016}
2017
2018PERL_STATIC_INLINE void
2019S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2020 SV* const invlist,
2021 const bool invert2nd)
2022{
2023 PERL_ARGS_ASSERT_SSC_INTERSECTION;
2024
71068078 2025 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2026
2027 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2028 invlist,
2029 invert2nd,
2030 &ssc->invlist);
2031}
2032
2033PERL_STATIC_INLINE void
2034S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2035{
2036 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2037
71068078 2038 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2039
2040 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2041}
2042
2043PERL_STATIC_INLINE void
2044S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2045{
2046 /* AND just the single code point 'cp' into the SSC 'ssc' */
2047
2048 SV* cp_list = _new_invlist(2);
2049
2050 PERL_ARGS_ASSERT_SSC_CP_AND;
2051
71068078 2052 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2053
2054 cp_list = add_cp_to_invlist(cp_list, cp);
2055 ssc_intersection(ssc, cp_list,
2056 FALSE /* Not inverted */
2057 );
2058 SvREFCNT_dec_NN(cp_list);
2059}
2060
2061PERL_STATIC_INLINE void
dc3bf405 2062S_ssc_clear_locale(regnode_ssc *ssc)
b423522f
KW
2063{
2064 /* Set the SSC 'ssc' to not match any locale things */
b423522f
KW
2065 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2066
71068078 2067 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2068
2069 ANYOF_POSIXL_ZERO(ssc);
2070 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2071}
2072
b35552de
KW
2073#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2074
2075STATIC bool
2076S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2077{
2078 /* The synthetic start class is used to hopefully quickly winnow down
2079 * places where a pattern could start a match in the target string. If it
2080 * doesn't really narrow things down that much, there isn't much point to
2081 * having the overhead of using it. This function uses some very crude
2082 * heuristics to decide if to use the ssc or not.
2083 *
2084 * It returns TRUE if 'ssc' rules out more than half what it considers to
2085 * be the "likely" possible matches, but of course it doesn't know what the
2086 * actual things being matched are going to be; these are only guesses
2087 *
2088 * For /l matches, it assumes that the only likely matches are going to be
2089 * in the 0-255 range, uniformly distributed, so half of that is 127
2090 * For /a and /d matches, it assumes that the likely matches will be just
2091 * the ASCII range, so half of that is 63
2092 * For /u and there isn't anything matching above the Latin1 range, it
2093 * assumes that that is the only range likely to be matched, and uses
2094 * half that as the cut-off: 127. If anything matches above Latin1,
2095 * it assumes that all of Unicode could match (uniformly), except for
2096 * non-Unicode code points and things in the General Category "Other"
2097 * (unassigned, private use, surrogates, controls and formats). This
2098 * is a much large number. */
2099
b35552de
KW
2100 U32 count = 0; /* Running total of number of code points matched by
2101 'ssc' */
2102 UV start, end; /* Start and end points of current range in inversion
26be5fe6 2103 XXX outdated. UTF-8 locales are common, what about invert? list */
72400949
KW
2104 const U32 max_code_points = (LOC)
2105 ? 256
f6e8f31e 2106 : (( ! UNI_SEMANTICS
1a26cbcb 2107 || invlist_highest(ssc->invlist) < 256)
72400949
KW
2108 ? 128
2109 : NON_OTHER_COUNT);
2110 const U32 max_match = max_code_points / 2;
b35552de
KW
2111
2112 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2113
2114 invlist_iterinit(ssc->invlist);
2115 while (invlist_iternext(ssc->invlist, &start, &end)) {
72400949
KW
2116 if (start >= max_code_points) {
2117 break;
b35552de 2118 }
72400949 2119 end = MIN(end, max_code_points - 1);
b35552de 2120 count += end - start + 1;
72400949 2121 if (count >= max_match) {
b35552de
KW
2122 invlist_iterfinish(ssc->invlist);
2123 return FALSE;
2124 }
2125 }
2126
2127 return TRUE;
2128}
2129
2130
b423522f
KW
2131STATIC void
2132S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2133{
2134 /* The inversion list in the SSC is marked mortal; now we need a more
2135 * permanent copy, which is stored the same way that is done in a regular
dcb20b36
KW
2136 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2137 * map */
b423522f 2138
28118b9c 2139 SV* invlist = invlist_clone(ssc->invlist, NULL);
b423522f
KW
2140
2141 PERL_ARGS_ASSERT_SSC_FINALIZE;
2142
71068078 2143 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f 2144
a0dd4231 2145 /* The code in this file assumes that all but these flags aren't relevant
93e92956
KW
2146 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2147 * by the time we reach here */
f240c685
KW
2148 assert(! (ANYOF_FLAGS(ssc)
2149 & ~( ANYOF_COMMON_FLAGS
108316fb
KW
2150 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2151 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
a0dd4231 2152
b423522f
KW
2153 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2154
4c404f26 2155 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
21c3fd9d 2156 SvREFCNT_dec(invlist);
b423522f 2157
85c8e306
KW
2158 /* Make sure is clone-safe */
2159 ssc->invlist = NULL;
2160
e0e1be5f 2161 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
93e92956 2162 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
d156f5cb 2163 OP(ssc) = ANYOFPOSIXL;
e0e1be5f 2164 }
d156f5cb 2165 else if (RExC_contains_locale) {
b2e90ddf
KW
2166 OP(ssc) = ANYOFL;
2167 }
2168
1462525b 2169 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
b423522f
KW
2170}
2171
a3621e74
YO
2172#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2173#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2174#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
538e84ed
KW
2175#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2176 ? (TRIE_LIST_CUR( idx ) - 1) \
2177 : 0 )
a3621e74 2178
3dab1dad
YO
2179
2180#ifdef DEBUGGING
07be1b83 2181/*
2b8b4781
NC
2182 dump_trie(trie,widecharmap,revcharmap)
2183 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2184 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
2185
2186 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
2187 The _interim_ variants are used for debugging the interim
2188 tables that are used to generate the final compressed
2189 representation which is what dump_trie expects.
2190
486ec47a 2191 Part of the reason for their existence is to provide a form
3dab1dad 2192 of documentation as to how the different representations function.
07be1b83
YO
2193
2194*/
3dab1dad
YO
2195
2196/*
3dab1dad
YO
2197 Dumps the final compressed table form of the trie to Perl_debug_log.
2198 Used for debugging make_trie().
2199*/
b9a59e08 2200
3dab1dad 2201STATIC void
2b8b4781
NC
2202S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2203 AV *revcharmap, U32 depth)
3dab1dad
YO
2204{
2205 U32 state;
ab3bbdeb 2206 SV *sv=sv_newmortal();
55eed653 2207 int colwidth= widecharmap ? 6 : 4;
2e64971a 2208 U16 word;
3dab1dad
YO
2209 GET_RE_DEBUG_FLAGS_DECL;
2210
7918f24d 2211 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 2212
6ad9a8ab 2213 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
1e37780e 2214 depth+1, "Match","Base","Ofs" );
3dab1dad
YO
2215
2216 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 2217 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 2218 if ( tmp ) {
6ad9a8ab 2219 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2220 colwidth,
538e84ed 2221 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2222 PL_colors[0], PL_colors[1],
2223 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2224 PERL_PV_ESCAPE_FIRSTCHAR
2225 )
ab3bbdeb 2226 );
3dab1dad
YO
2227 }
2228 }
1e37780e
YO
2229 Perl_re_printf( aTHX_ "\n");
2230 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
3dab1dad
YO
2231
2232 for( state = 0 ; state < trie->uniquecharcount ; state++ )
6ad9a8ab
YO
2233 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2234 Perl_re_printf( aTHX_ "\n");
3dab1dad 2235
1e2e3d02 2236 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 2237 const U32 base = trie->states[ state ].trans.base;
3dab1dad 2238
147e3846 2239 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
3dab1dad
YO
2240
2241 if ( trie->states[ state ].wordnum ) {
1e37780e 2242 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
3dab1dad 2243 } else {
6ad9a8ab 2244 Perl_re_printf( aTHX_ "%6s", "" );
3dab1dad
YO
2245 }
2246
147e3846 2247 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
3dab1dad
YO
2248
2249 if ( base ) {
2250 U32 ofs = 0;
2251
2252 while( ( base + ofs < trie->uniquecharcount ) ||
2253 ( base + ofs - trie->uniquecharcount < trie->lasttrans
538e84ed
KW
2254 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2255 != state))
3dab1dad
YO
2256 ofs++;
2257
147e3846 2258 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
3dab1dad
YO
2259
2260 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
538e84ed
KW
2261 if ( ( base + ofs >= trie->uniquecharcount )
2262 && ( base + ofs - trie->uniquecharcount
2263 < trie->lasttrans )
2264 && trie->trans[ base + ofs
2265 - trie->uniquecharcount ].check == state )
3dab1dad 2266 {
147e3846 2267 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
1e37780e
YO
2268 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2269 );
3dab1dad 2270 } else {
88f063b4 2271 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
3dab1dad
YO
2272 }
2273 }
2274
6ad9a8ab 2275 Perl_re_printf( aTHX_ "]");
3dab1dad
YO
2276
2277 }
6ad9a8ab 2278 Perl_re_printf( aTHX_ "\n" );
3dab1dad 2279 }
6ad9a8ab 2280 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
cb41e5d6 2281 depth);
2e64971a 2282 for (word=1; word <= trie->wordcount; word++) {
6ad9a8ab 2283 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2e64971a
DM
2284 (int)word, (int)(trie->wordinfo[word].prev),
2285 (int)(trie->wordinfo[word].len));
2286 }
6ad9a8ab 2287 Perl_re_printf( aTHX_ "\n" );
538e84ed 2288}
3dab1dad 2289/*
3dab1dad 2290 Dumps a fully constructed but uncompressed trie in list form.
538e84ed 2291 List tries normally only are used for construction when the number of
3dab1dad
YO
2292 possible chars (trie->uniquecharcount) is very high.
2293 Used for debugging make_trie().
2294*/
2295STATIC void
55eed653 2296S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2297 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2298 U32 depth)
3dab1dad
YO
2299{
2300 U32 state;
ab3bbdeb 2301 SV *sv=sv_newmortal();
55eed653 2302 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2303 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2304
2305 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2306
3dab1dad 2307 /* print out the table precompression. */
6ad9a8ab 2308 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
cb41e5d6 2309 depth+1 );
6ad9a8ab 2310 Perl_re_indentf( aTHX_ "%s",
cb41e5d6 2311 depth+1, "------:-----+-----------------\n" );
538e84ed 2312
3dab1dad
YO
2313 for( state=1 ; state < next_alloc ; state ++ ) {
2314 U16 charid;
538e84ed 2315
147e3846 2316 Perl_re_indentf( aTHX_ " %4" UVXf " :",
cb41e5d6 2317 depth+1, (UV)state );
3dab1dad 2318 if ( ! trie->states[ state ].wordnum ) {
6ad9a8ab 2319 Perl_re_printf( aTHX_ "%5s| ","");
3dab1dad 2320 } else {
6ad9a8ab 2321 Perl_re_printf( aTHX_ "W%4x| ",
3dab1dad
YO
2322 trie->states[ state ].wordnum
2323 );
2324 }
2325 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
538e84ed 2326 SV ** const tmp = av_fetch( revcharmap,
88f063b4 2327 TRIE_LIST_ITEM(state, charid).forid, 0);
ab3bbdeb 2328 if ( tmp ) {
147e3846 2329 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
ab3bbdeb 2330 colwidth,
538e84ed
KW
2331 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2332 colwidth,
2333 PL_colors[0], PL_colors[1],
2334 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2335 | PERL_PV_ESCAPE_FIRSTCHAR
ab3bbdeb 2336 ) ,
88f063b4
KW
2337 TRIE_LIST_ITEM(state, charid).forid,
2338 (UV)TRIE_LIST_ITEM(state, charid).newstate
1e2e3d02 2339 );
538e84ed 2340 if (!(charid % 10))
6ad9a8ab 2341 Perl_re_printf( aTHX_ "\n%*s| ",
664e119d 2342 (int)((depth * 2) + 14), "");
1e2e3d02 2343 }
ab3bbdeb 2344 }
6ad9a8ab 2345 Perl_re_printf( aTHX_ "\n");
3dab1dad 2346 }
538e84ed 2347}
3dab1dad
YO
2348
2349/*
3dab1dad 2350 Dumps a fully constructed but uncompressed trie in table form.
538e84ed
KW
2351 This is the normal DFA style state transition table, with a few
2352 twists to facilitate compression later.
3dab1dad
YO
2353 Used for debugging make_trie().
2354*/
2355STATIC void
55eed653 2356S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2357 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2358 U32 depth)
3dab1dad
YO
2359{
2360 U32 state;
2361 U16 charid;
ab3bbdeb 2362 SV *sv=sv_newmortal();
55eed653 2363 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2364 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2365
2366 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
538e84ed 2367
3dab1dad
YO
2368 /*
2369 print out the table precompression so that we can do a visual check
2370 that they are identical.
2371 */
538e84ed 2372
6ad9a8ab 2373 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
3dab1dad
YO
2374
2375 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 2376 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 2377 if ( tmp ) {
6ad9a8ab 2378 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2379 colwidth,
538e84ed 2380 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2381 PL_colors[0], PL_colors[1],
2382 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2383 PERL_PV_ESCAPE_FIRSTCHAR
2384 )
ab3bbdeb 2385 );
3dab1dad
YO
2386 }
2387 }
2388
4aaafc03
YO
2389 Perl_re_printf( aTHX_ "\n");
2390 Perl_re_indentf( aTHX_ "State+-", depth+1 );
3dab1dad
YO
2391
2392 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
6ad9a8ab 2393 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
3dab1dad
YO
2394 }
2395
6ad9a8ab 2396 Perl_re_printf( aTHX_ "\n" );
3dab1dad
YO
2397
2398 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2399
147e3846 2400 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
cb41e5d6 2401 depth+1,
3dab1dad
YO
2402 (UV)TRIE_NODENUM( state ) );
2403
2404 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
2405 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2406 if (v)
147e3846 2407 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
ab3bbdeb 2408 else
6ad9a8ab 2409 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
3dab1dad
YO
2410 }
2411 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
147e3846 2412 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
538e84ed 2413 (UV)trie->trans[ state ].check );
3dab1dad 2414 } else {
147e3846 2415 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
538e84ed 2416 (UV)trie->trans[ state ].check,
3dab1dad
YO
2417 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2418 }
2419 }
07be1b83 2420}
3dab1dad
YO
2421
2422#endif
2423
2e64971a 2424
786e8c11
YO
2425/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2426 startbranch: the first branch in the whole branch sequence
2427 first : start branch of sequence of branch-exact nodes.
2428 May be the same as startbranch
2429 last : Thing following the last branch.
2430 May be the same as tail.
2431 tail : item following the branch sequence
2432 count : words in the sequence
a4525e78 2433 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
786e8c11 2434 depth : indent depth
3dab1dad 2435
786e8c11 2436Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 2437
786e8c11
YO
2438A trie is an N'ary tree where the branches are determined by digital
2439decomposition of the key. IE, at the root node you look up the 1st character and
2440follow that branch repeat until you find the end of the branches. Nodes can be
2441marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 2442
786e8c11 2443 /he|she|his|hers/
72f13be8 2444
786e8c11
YO
2445would convert into the following structure. Numbers represent states, letters
2446following numbers represent valid transitions on the letter from that state, if
2447the number is in square brackets it represents an accepting state, otherwise it
2448will be in parenthesis.
07be1b83 2449
786e8c11
YO
2450 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2451 | |
2452 | (2)
2453 | |
2454 (1) +-i->(6)-+-s->[7]
2455 |
2456 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 2457
786e8c11
YO
2458 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2459
2460This shows that when matching against the string 'hers' we will begin at state 1
2461read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2462then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2463is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2464single traverse. We store a mapping from accepting to state to which word was
2465matched, and then when we have multiple possibilities we try to complete the
b8fda935 2466rest of the regex in the order in which they occurred in the alternation.
786e8c11
YO
2467
2468The only prior NFA like behaviour that would be changed by the TRIE support is
2469the silent ignoring of duplicate alternations which are of the form:
2470
2471 / (DUPE|DUPE) X? (?{ ... }) Y /x
2472
4b714af6 2473Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 2474and without the optimisation. With the optimisations dupes will be silently
486ec47a 2475ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
2476the following demonstrates:
2477
2478 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2479
2480which prints out 'word' three times, but
2481
2482 'words'=~/(word|word|word)(?{ print $1 })S/
2483
2484which doesnt print it out at all. This is due to other optimisations kicking in.
2485
2486Example of what happens on a structural level:
2487
486ec47a 2488The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
2489
2490 1: CURLYM[1] {1,32767}(18)
2491 5: BRANCH(8)
2492 6: EXACT <ac>(16)
2493 8: BRANCH(11)
2494 9: EXACT <ad>(16)
2495 11: BRANCH(14)
2496 12: EXACT <ab>(16)
2497 16: SUCCEED(0)
2498 17: NOTHING(18)
2499 18: END(0)
2500
2501This would be optimizable with startbranch=5, first=5, last=16, tail=16
2502and should turn into:
2503
2504 1: CURLYM[1] {1,32767}(18)
2505 5: TRIE(16)
2506 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2507 <ac>
2508 <ad>
2509 <ab>
2510 16: SUCCEED(0)
2511 17: NOTHING(18)
2512 18: END(0)
2513
2514Cases where tail != last would be like /(?foo|bar)baz/:
2515
2516 1: BRANCH(4)
2517 2: EXACT <foo>(8)
2518 4: BRANCH(7)
2519 5: EXACT <bar>(8)
2520 7: TAIL(8)
2521 8: EXACT <baz>(10)
2522 10: END(0)
2523
2524which would be optimizable with startbranch=1, first=1, last=7, tail=8
2525and would end up looking like:
2526
2527 1: TRIE(8)
2528 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2529 <foo>
2530 <bar>
2531 7: TAIL(8)
2532 8: EXACT <baz>(10)
2533 10: END(0)
2534
c80e42f3 2535 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
2536
2537is the recommended Unicode-aware way of saying
2538
2539 *(d++) = uv;
2540*/
2541
fab2782b 2542#define TRIE_STORE_REVCHAR(val) \
786e8c11 2543 STMT_START { \
73031816 2544 if (UTF) { \
668fcfea 2545 SV *zlopp = newSV(UTF8_MAXBYTES); \
88c9ea1e 2546 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1d84a256
DM
2547 unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2548 *kapow = '\0'; \
73031816
NC
2549 SvCUR_set(zlopp, kapow - flrbbbbb); \
2550 SvPOK_on(zlopp); \
2551 SvUTF8_on(zlopp); \
2552 av_push(revcharmap, zlopp); \
2553 } else { \
fab2782b 2554 char ooooff = (char)val; \
73031816
NC
2555 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2556 } \
2557 } STMT_END
786e8c11 2558
914a25d5
KW
2559/* This gets the next character from the input, folding it if not already
2560 * folded. */
2561#define TRIE_READ_CHAR STMT_START { \
2562 wordlen++; \
2563 if ( UTF ) { \
2564 /* if it is UTF then it is either already folded, or does not need \
2565 * folding */ \
1c1d615a 2566 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
2567 } \
2568 else if (folder == PL_fold_latin1) { \
7d006b13
KW
2569 /* This folder implies Unicode rules, which in the range expressible \
2570 * by not UTF is the lower case, with the two exceptions, one of \
2571 * which should have been taken care of before calling this */ \
2572 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2573 uvc = toLOWER_L1(*uc); \
2574 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2575 len = 1; \
914a25d5
KW
2576 } else { \
2577 /* raw data, will be folded later if needed */ \
2578 uvc = (U32)*uc; \
2579 len = 1; \
2580 } \
786e8c11
YO
2581} STMT_END
2582
2583
2584
2585#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2586 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
00195859 2587 U32 ging = TRIE_LIST_LEN( state ) * 2; \
f9003953 2588 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
00195859 2589 TRIE_LIST_LEN( state ) = ging; \
786e8c11
YO
2590 } \
2591 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2592 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2593 TRIE_LIST_CUR( state )++; \
2594} STMT_END
07be1b83 2595
786e8c11 2596#define TRIE_LIST_NEW(state) STMT_START { \
d09f14bf 2597 Newx( trie->states[ state ].trans.list, \
786e8c11
YO
2598 4, reg_trie_trans_le ); \
2599 TRIE_LIST_CUR( state ) = 1; \
2600 TRIE_LIST_LEN( state ) = 4; \
2601} STMT_END
07be1b83 2602
786e8c11
YO
2603#define TRIE_HANDLE_WORD(state) STMT_START { \
2604 U16 dupe= trie->states[ state ].wordnum; \
2605 regnode * const noper_next = regnext( noper ); \
2606 \
786e8c11
YO
2607 DEBUG_r({ \
2608 /* store the word for dumping */ \
2609 SV* tmp; \
2610 if (OP(noper) != NOTHING) \
740cce10 2611 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 2612 else \
740cce10 2613 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 2614 av_push( trie_words, tmp ); \
786e8c11
YO
2615 }); \
2616 \
2617 curword++; \
2e64971a
DM
2618 trie->wordinfo[curword].prev = 0; \
2619 trie->wordinfo[curword].len = wordlen; \
2620 trie->wordinfo[curword].accept = state; \
786e8c11
YO
2621 \
2622 if ( noper_next < tail ) { \
2623 if (!trie->jump) \
538e84ed
KW
2624 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2625 sizeof(U16) ); \
7f69552c 2626 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
2627 if (!jumper) \
2628 jumper = noper_next; \
2629 if (!nextbranch) \
2630 nextbranch= regnext(cur); \
2631 } \
2632 \
2633 if ( dupe ) { \
2e64971a
DM
2634 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2635 /* chain, so that when the bits of chain are later */\
2636 /* linked together, the dups appear in the chain */\
2637 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2638 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
2639 } else { \
2640 /* we haven't inserted this word yet. */ \
2641 trie->states[ state ].wordnum = curword; \
2642 } \
2643} STMT_END
07be1b83 2644
3dab1dad 2645
786e8c11
YO
2646#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2647 ( ( base + charid >= ucharcount \
2648 && base + charid < ubound \
2649 && state == trie->trans[ base - ucharcount + charid ].check \
2650 && trie->trans[ base - ucharcount + charid ].next ) \
2651 ? trie->trans[ base - ucharcount + charid ].next \
2652 : ( state==1 ? special : 0 ) \
2653 )
3dab1dad 2654
8bcafbf4
YO
2655#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2656STMT_START { \
2657 TRIE_BITMAP_SET(trie, uvc); \
2658 /* store the folded codepoint */ \
2659 if ( folder ) \
2660 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2661 \
2662 if ( !UTF ) { \
2663 /* store first byte of utf8 representation of */ \
2664 /* variant codepoints */ \
2665 if (! UVCHR_IS_INVARIANT(uvc)) { \
2666 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2667 } \
2668 } \
2669} STMT_END
786e8c11
YO
2670#define MADE_TRIE 1
2671#define MADE_JUMP_TRIE 2
2672#define MADE_EXACT_TRIE 4
3dab1dad 2673
a3621e74 2674STATIC I32
538e84ed
KW
2675S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2676 regnode *first, regnode *last, regnode *tail,
2677 U32 word_count, U32 flags, U32 depth)
a3621e74
YO
2678{
2679 /* first pass, loop through and scan words */
2680 reg_trie_data *trie;
55eed653 2681 HV *widecharmap = NULL;
2b8b4781 2682 AV *revcharmap = newAV();
a3621e74 2683 regnode *cur;
a3621e74
YO
2684 STRLEN len = 0;
2685 UV uvc = 0;
2686 U16 curword = 0;
2687 U32 next_alloc = 0;
786e8c11
YO
2688 regnode *jumper = NULL;
2689 regnode *nextbranch = NULL;
7f69552c 2690 regnode *convert = NULL;
2e64971a 2691 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 2692 /* we just use folder as a flag in utf8 */
1e696034 2693 const U8 * folder = NULL;
a3621e74 2694
3a611511
YO
2695 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2696 * which stands for one trie structure, one hash, optionally followed
2697 * by two arrays */
2b8b4781 2698#ifdef DEBUGGING
3a611511 2699 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2b8b4781
NC
2700 AV *trie_words = NULL;
2701 /* along with revcharmap, this only used during construction but both are
2702 * useful during debugging so we store them in the struct when debugging.
8e11feef 2703 */
2b8b4781 2704#else
cf78de0b 2705 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 2706 STRLEN trie_charcount=0;
3dab1dad 2707#endif
2b8b4781 2708 SV *re_trie_maxbuff;
a3621e74 2709 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2710
2711 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
2712#ifndef DEBUGGING
2713 PERL_UNUSED_ARG(depth);
2714#endif
a3621e74 2715
1e696034 2716 switch (flags) {
3f2416ae 2717 case EXACT: case EXACT_REQ8: case EXACTL: break;
89829bb5 2718 case EXACTFAA:
0ea669f4 2719 case EXACTFUP:
a4525e78
KW
2720 case EXACTFU:
2721 case EXACTFLU8: folder = PL_fold_latin1; break;
1e696034 2722 case EXACTF: folder = PL_fold; break;
fab2782b 2723 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
2724 }
2725
c944940b 2726 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 2727 trie->refcount = 1;
3dab1dad 2728 trie->startstate = 1;
786e8c11 2729 trie->wordcount = word_count;
f8fc2ecf 2730 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 2731 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3f2416ae 2732 if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
c944940b 2733 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
2734 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2735 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2736
a3621e74 2737 DEBUG_r({
2b8b4781 2738 trie_words = newAV();
a3621e74 2739 });
a3621e74 2740
3b89859a 2741 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
316ebaf2 2742 assert(re_trie_maxbuff);
a3621e74 2743 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 2744 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 2745 }
df826430 2746 DEBUG_TRIE_COMPILE_r({
6ad9a8ab 2747 Perl_re_indentf( aTHX_
cb41e5d6
YO
2748 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2749 depth+1,
88f063b4 2750 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
538e84ed 2751 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
3dab1dad 2752 });
538e84ed 2753
7f69552c
YO
2754 /* Find the node we are going to overwrite */
2755 if ( first == startbranch && OP( last ) != BRANCH ) {
2756 /* whole branch chain */
2757 convert = first;
2758 } else {
2759 /* branch sub-chain */
2760 convert = NEXTOPER( first );
2761 }
538e84ed 2762
a3621e74
YO
2763 /* -- First loop and Setup --
2764
2765 We first traverse the branches and scan each word to determine if it
2766 contains widechars, and how many unique chars there are, this is
2767 important as we have to build a table with at least as many columns as we
2768 have unique chars.
2769
2770 We use an array of integers to represent the character codes 0..255
538e84ed
KW
2771 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2772 the native representation of the character value as the key and IV's for
2773 the coded index.
a3621e74
YO
2774
2775 *TODO* If we keep track of how many times each character is used we can
2776 remap the columns so that the table compression later on is more
3b753521 2777 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
2778 middle and the least common are on the outside. IMO this would be better
2779 than a most to least common mapping as theres a decent chance the most
2780 common letter will share a node with the least common, meaning the node
486ec47a 2781 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
2782 case is when we have the least common nodes twice.
2783
2784 */
2785
a3621e74 2786 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 2787 regnode *noper = NEXTOPER( cur );
944e05e3
YO
2788 const U8 *uc;
2789 const U8 *e;
bc031a7d 2790 int foldlen = 0;
07be1b83 2791 U32 wordlen = 0; /* required init */
bc031a7d
KW
2792 STRLEN minchars = 0;
2793 STRLEN maxchars = 0;
538e84ed
KW
2794 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2795 bitmap?*/
a3621e74 2796
3dab1dad 2797 if (OP(noper) == NOTHING) {
20ed8c88
YO
2798 /* skip past a NOTHING at the start of an alternation
2799 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
ca902fb8
YO
2800 *
2801 * If the next node is not something we are supposed to process
2802 * we will just ignore it due to the condition guarding the
2803 * next block.
20ed8c88 2804 */
ca902fb8 2805
df826430 2806 regnode *noper_next= regnext(noper);
944e05e3
YO
2807 if (noper_next < tail)
2808 noper= noper_next;
2809 }
2810
f6b4b99d
KW
2811 if ( noper < tail
2812 && ( OP(noper) == flags
3f2416ae
KW
2813 || (flags == EXACT && OP(noper) == EXACT_REQ8)
2814 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
0ea669f4 2815 || OP(noper) == EXACTFUP))))
f6b4b99d 2816 {
944e05e3
YO
2817 uc= (U8*)STRING(noper);
2818 e= uc + STR_LEN(noper);
2819 } else {
2820 trie->minlen= 0;
2821 continue;
3dab1dad 2822 }
df826430 2823
944e05e3 2824
fab2782b 2825 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2826 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2827 regardless of encoding */
0ea669f4 2828 if (OP( noper ) == EXACTFUP) {
fab2782b 2829 /* false positives are ok, so just set this */
0dc4a61d 2830 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2831 }
2832 }
dca5fc4c 2833
bc031a7d
KW
2834 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2835 branch */
3dab1dad 2836 TRIE_CHARCOUNT(trie)++;
a3621e74 2837 TRIE_READ_CHAR;
645de4ce 2838
bc031a7d
KW
2839 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2840 * is in effect. Under /i, this character can match itself, or
2841 * anything that folds to it. If not under /i, it can match just
2842 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2843 * all fold to k, and all are single characters. But some folds
2844 * expand to more than one character, so for example LATIN SMALL
2845 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2846 * the string beginning at 'uc' is 'ffi', it could be matched by
2847 * three characters, or just by the one ligature character. (It
2848 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2849 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2850 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2851 * match.) The trie needs to know the minimum and maximum number
2852 * of characters that could match so that it can use size alone to
2853 * quickly reject many match attempts. The max is simple: it is
2854 * the number of folded characters in this branch (since a fold is
2855 * never shorter than what folds to it. */
2856
2857 maxchars++;
2858
2859 /* And the min is equal to the max if not under /i (indicated by
2860 * 'folder' being NULL), or there are no multi-character folds. If
2861 * there is a multi-character fold, the min is incremented just
2862 * once, for the character that folds to the sequence. Each
2863 * character in the sequence needs to be added to the list below of
2864 * characters in the trie, but we count only the first towards the
2865 * min number of characters needed. This is done through the
2866 * variable 'foldlen', which is returned by the macros that look
2867 * for these sequences as the number of bytes the sequence
2868 * occupies. Each time through the loop, we decrement 'foldlen' by
2869 * how many bytes the current char occupies. Only when it reaches
2870 * 0 do we increment 'minchars' or look for another multi-character
2871 * sequence. */
2872 if (folder == NULL) {
2873 minchars++;
2874 }
2875 else if (foldlen > 0) {
2876 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
645de4ce
KW
2877 }
2878 else {
bc031a7d
KW
2879 minchars++;
2880
2881 /* See if *uc is the beginning of a multi-character fold. If
2882 * so, we decrement the length remaining to look at, to account
2883 * for the current character this iteration. (We can use 'uc'
2884 * instead of the fold returned by TRIE_READ_CHAR because for
2885 * non-UTF, the latin1_safe macro is smart enough to account
2886 * for all the unfolded characters, and because for UTF, the
2887 * string will already have been folded earlier in the
2888 * compilation process */
2889 if (UTF) {
2890 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2891 foldlen -= UTF8SKIP(uc);
645de4ce
KW
2892 }
2893 }
bc031a7d
KW
2894 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2895 foldlen--;
2896 }
645de4ce 2897 }
bc031a7d
KW
2898
2899 /* The current character (and any potential folds) should be added
2900 * to the possible matching characters for this position in this
2901 * branch */
a3621e74 2902 if ( uvc < 256 ) {
fab2782b
YO
2903 if ( folder ) {
2904 U8 folded= folder[ (U8) uvc ];
2905 if ( !trie->charmap[ folded ] ) {
2906 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2907 TRIE_STORE_REVCHAR( folded );
2908 }
2909 }
a3621e74
YO
2910 if ( !trie->charmap[ uvc ] ) {
2911 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2912 TRIE_STORE_REVCHAR( uvc );
a3621e74 2913 }
02daf0ab 2914 if ( set_bit ) {
62012aee
KW
2915 /* store the codepoint in the bitmap, and its folded
2916 * equivalent. */
8bcafbf4 2917 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
02daf0ab
YO
2918 set_bit = 0; /* We've done our bit :-) */
2919 }
a3621e74 2920 } else {
bc031a7d
KW
2921
2922 /* XXX We could come up with the list of code points that fold
2923 * to this using PL_utf8_foldclosures, except not for
2924 * multi-char folds, as there may be multiple combinations
2925 * there that could work, which needs to wait until runtime to
2926 * resolve (The comment about LIGATURE FFI above is such an
2927 * example */
2928
a3621e74 2929 SV** svpp;
55eed653
NC
2930 if ( !widecharmap )
2931 widecharmap = newHV();
a3621e74 2932
55eed653 2933 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
2934
2935 if ( !svpp )
147e3846 2936 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
a3621e74
YO
2937
2938 if ( !SvTRUE( *svpp ) ) {
2939 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 2940 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
2941 }
2942 }
bc031a7d
KW
2943 } /* end loop through characters in this branch of the trie */
2944
2945 /* We take the min and max for this branch and combine to find the min
2946 * and max for all branches processed so far */
3dab1dad 2947 if( cur == first ) {
bc031a7d
KW
2948 trie->minlen = minchars;
2949 trie->maxlen = maxchars;
2950 } else if (minchars < trie->minlen) {
2951 trie->minlen = minchars;
2952 } else if (maxchars > trie->maxlen) {
2953 trie->maxlen = maxchars;
fab2782b 2954 }
a3621e74
YO
2955 } /* end first pass */
2956 DEBUG_TRIE_COMPILE_r(
6ad9a8ab 2957 Perl_re_indentf( aTHX_
cb41e5d6
YO
2958 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2959 depth+1,
55eed653 2960 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
2961 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2962 (int)trie->minlen, (int)trie->maxlen )
a3621e74 2963 );
a3621e74
YO
2964
2965 /*
2966 We now know what we are dealing with in terms of unique chars and
2967 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
2968 representation using a flat table will take. If it's over a reasonable
2969 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
2970 conservative but potentially much slower representation using an array
2971 of lists.
2972
2973 At the end we convert both representations into the same compressed
2974 form that will be used in regexec.c for matching with. The latter
2975 is a form that cannot be used to construct with but has memory
2976 properties similar to the list form and access properties similar
2977 to the table form making it both suitable for fast searches and
2978 small enough that its feasable to store for the duration of a program.
2979
2980 See the comment in the code where the compressed table is produced
2981 inplace from the flat tabe representation for an explanation of how
2982 the compression works.
2983
2984 */
2985
2986
2e64971a
DM
2987 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2988 prev_states[1] = 0;
2989
538e84ed
KW
2990 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2991 > SvIV(re_trie_maxbuff) )
2992 {
a3621e74
YO
2993 /*
2994 Second Pass -- Array Of Lists Representation
2995
2996 Each state will be represented by a list of charid:state records
2997 (reg_trie_trans_le) the first such element holds the CUR and LEN
2998 points of the allocated array. (See defines above).
2999
3000 We build the initial structure using the lists, and then convert
3001 it into the compressed table form which allows faster lookups
3002 (but cant be modified once converted).
a3621e74
YO
3003 */
3004
a3621e74
YO
3005 STRLEN transcount = 1;
3006
6ad9a8ab 3007 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
cb41e5d6 3008 depth+1));
686b73d4 3009
c944940b
JH
3010 trie->states = (reg_trie_state *)
3011 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3012 sizeof(reg_trie_state) );
a3621e74
YO
3013 TRIE_LIST_NEW(1);
3014 next_alloc = 2;
3015
3016 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3017
df826430 3018 regnode *noper = NEXTOPER( cur );
c445ea15
AL
3019 U32 state = 1; /* required init */
3020 U16 charid = 0; /* sanity init */
07be1b83 3021 U32 wordlen = 0; /* required init */
c445ea15 3022
df826430
YO
3023 if (OP(noper) == NOTHING) {
3024 regnode *noper_next= regnext(noper);
944e05e3
YO
3025 if (noper_next < tail)
3026 noper= noper_next;
ca902fb8
YO
3027 /* we will undo this assignment if noper does not
3028 * point at a trieable type in the else clause of
3029 * the following statement. */
df826430
YO
3030 }
3031
f6b4b99d
KW
3032 if ( noper < tail
3033 && ( OP(noper) == flags
3f2416ae
KW
3034 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3035 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
0ea669f4 3036 || OP(noper) == EXACTFUP))))
f6b4b99d 3037 {
944e05e3
YO
3038 const U8 *uc= (U8*)STRING(noper);
3039 const U8 *e= uc + STR_LEN(noper);
3040
786e8c11 3041 for ( ; uc < e ; uc += len ) {
c445ea15 3042
786e8c11 3043 TRIE_READ_CHAR;
c445ea15 3044
786e8c11
YO
3045 if ( uvc < 256 ) {
3046 charid = trie->charmap[ uvc ];
c445ea15 3047 } else {
538e84ed
KW
3048 SV** const svpp = hv_fetch( widecharmap,
3049 (char*)&uvc,
3050 sizeof( UV ),
3051 0);
786e8c11
YO
3052 if ( !svpp ) {
3053 charid = 0;
3054 } else {
3055 charid=(U16)SvIV( *svpp );
3056 }
c445ea15 3057 }
538e84ed
KW
3058 /* charid is now 0 if we dont know the char read, or
3059 * nonzero if we do */
786e8c11 3060 if ( charid ) {
a3621e74 3061
786e8c11
YO
3062 U16 check;
3063 U32 newstate = 0;
a3621e74 3064
786e8c11
YO
3065 charid--;
3066 if ( !trie->states[ state ].trans.list ) {
3067 TRIE_LIST_NEW( state );
c445ea15 3068 }
538e84ed
KW
3069 for ( check = 1;
3070 check <= TRIE_LIST_USED( state );
3071 check++ )
3072 {
3073 if ( TRIE_LIST_ITEM( state, check ).forid
3074 == charid )
3075 {
786e8c11
YO
3076 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3077 break;
3078 }
3079 }
3080 if ( ! newstate ) {
3081 newstate = next_alloc++;
2e64971a 3082 prev_states[newstate] = state;
786e8c11
YO
3083 TRIE_LIST_PUSH( state, charid, newstate );
3084 transcount++;
3085 }
3086 state = newstate;
3087 } else {
147e3846 3088 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
c445ea15 3089 }
a28509cc 3090 }
ca902fb8
YO
3091 } else {
3092 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3093 * on a trieable type. So we need to reset noper back to point at the first regop
3094 * in the branch before we call TRIE_HANDLE_WORD()
3095 */
3096 noper= NEXTOPER(cur);
3097 }
3dab1dad 3098 TRIE_HANDLE_WORD(state);
a3621e74
YO
3099
3100 } /* end second pass */
3101
1e2e3d02 3102 /* next alloc is the NEXT state to be allocated */
538e84ed 3103 trie->statecount = next_alloc;
c944940b
JH
3104 trie->states = (reg_trie_state *)
3105 PerlMemShared_realloc( trie->states,
3106 next_alloc
3107 * sizeof(reg_trie_state) );
a3621e74 3108
3dab1dad 3109 /* and now dump it out before we compress it */
2b8b4781
NC
3110 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3111 revcharmap, next_alloc,
3112 depth+1)
1e2e3d02 3113 );
a3621e74 3114
c944940b
JH
3115 trie->trans = (reg_trie_trans *)
3116 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
3117 {
3118 U32 state;
a3621e74
YO
3119 U32 tp = 0;
3120 U32 zp = 0;
3121
3122
3123 for( state=1 ; state < next_alloc ; state ++ ) {
3124 U32 base=0;
3125
3126 /*
3127 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3128 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
a3621e74
YO
3129 );
3130 */
3131
3132 if (trie->states[state].trans.list) {
3133 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3134 U16 maxid=minid;
a28509cc 3135 U16 idx;
a3621e74
YO
3136
3137 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
3138 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3139 if ( forid < minid ) {
3140 minid=forid;
3141 } else if ( forid > maxid ) {
3142 maxid=forid;
3143 }
a3621e74
YO
3144 }
3145 if ( transcount < tp + maxid - minid + 1) {
3146 transcount *= 2;
c944940b
JH
3147 trie->trans = (reg_trie_trans *)
3148 PerlMemShared_realloc( trie->trans,
446bd890
NC
3149 transcount
3150 * sizeof(reg_trie_trans) );
538e84ed
KW
3151 Zero( trie->trans + (transcount / 2),
3152 transcount / 2,
3153 reg_trie_trans );
a3621e74
YO
3154 }
3155 base = trie->uniquecharcount + tp - minid;
3156 if ( maxid == minid ) {
3157 U32 set = 0;
3158 for ( ; zp < tp ; zp++ ) {
3159 if ( ! trie->trans[ zp ].next ) {
3160 base = trie->uniquecharcount + zp - minid;
538e84ed
KW
3161 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3162 1).newstate;
a3621e74
YO
3163 trie->trans[ zp ].check = state;
3164 set = 1;
3165 break;
3166 }
3167 }
3168 if ( !set ) {
538e84ed
KW
3169 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3170 1).newstate;
a3621e74
YO
3171 trie->trans[ tp ].check = state;
3172 tp++;
3173 zp = tp;
3174 }
3175 } else {
3176 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
538e84ed
KW
3177 const U32 tid = base
3178 - trie->uniquecharcount
3179 + TRIE_LIST_ITEM( state, idx ).forid;
3180 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3181 idx ).newstate;
a3621e74
YO
3182 trie->trans[ tid ].check = state;
3183 }
3184 tp += ( maxid - minid + 1 );
3185 }
3186 Safefree(trie->states[ state ].trans.list);
3187 }
3188 /*
3189 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3190 Perl_re_printf( aTHX_ " base: %d\n",base);
a3621e74
YO
3191 );
3192 */
3193 trie->states[ state ].trans.base=base;
3194 }
cc601c31 3195 trie->lasttrans = tp + 1;
a3621e74
YO
3196 }
3197 } else {
3198 /*
3199 Second Pass -- Flat Table Representation.
3200
b423522f
KW
3201 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3202 each. We know that we will need Charcount+1 trans at most to store
3203 the data (one row per char at worst case) So we preallocate both
3204 structures assuming worst case.
a3621e74
YO
3205
3206 We then construct the trie using only the .next slots of the entry
3207 structs.
3208
b423522f
KW
3209 We use the .check field of the first entry of the node temporarily
3210 to make compression both faster and easier by keeping track of how
3211 many non zero fields are in the node.
a3621e74
YO
3212
3213 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3214 transition.
3215
b423522f
KW
3216 There are two terms at use here: state as a TRIE_NODEIDX() which is
3217 a number representing the first entry of the node, and state as a
3218 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3219 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3220 if there are 2 entrys per node. eg:
a3621e74
YO
3221
3222 A B A B
3223 1. 2 4 1. 3 7
3224 2. 0 3 3. 0 5
3225 3. 0 0 5. 0 0
3226 4. 0 0 7. 0 0
3227
b423522f
KW
3228 The table is internally in the right hand, idx form. However as we
3229 also have to deal with the states array which is indexed by nodenum
3230 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
3231
3232 */
6ad9a8ab 3233 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
cb41e5d6 3234 depth+1));
3dab1dad 3235
c944940b
JH
3236 trie->trans = (reg_trie_trans *)
3237 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3238 * trie->uniquecharcount + 1,
3239 sizeof(reg_trie_trans) );
3240 trie->states = (reg_trie_state *)
3241 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3242 sizeof(reg_trie_state) );
a3621e74
YO
3243 next_alloc = trie->uniquecharcount + 1;
3244
3dab1dad 3245
a3621e74
YO
3246 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3247
df826430 3248 regnode *noper = NEXTOPER( cur );
a3621e74
YO
3249
3250 U32 state = 1; /* required init */
3251
3252 U16 charid = 0; /* sanity init */
3253 U32 accept_state = 0; /* sanity init */
a3621e74 3254
07be1b83 3255 U32 wordlen = 0; /* required init */
a3621e74 3256
df826430
YO
3257 if (OP(noper) == NOTHING) {
3258 regnode *noper_next= regnext(noper);
944e05e3
YO
3259 if (noper_next < tail)
3260 noper= noper_next;
ca902fb8
YO
3261 /* we will undo this assignment if noper does not
3262 * point at a trieable type in the else clause of
3263 * the following statement. */
df826430 3264 }
fab2782b 3265
f6b4b99d
KW
3266 if ( noper < tail
3267 && ( OP(noper) == flags
3f2416ae
KW
3268 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3269 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
0ea669f4 3270 || OP(noper) == EXACTFUP))))
f6b4b99d 3271 {
944e05e3
YO
3272 const U8 *uc= (U8*)STRING(noper);
3273 const U8 *e= uc + STR_LEN(noper);
3274
786e8c11 3275 for ( ; uc < e ; uc += len ) {
a3621e74 3276
786e8c11 3277 TRIE_READ_CHAR;
a3621e74 3278
786e8c11
YO
3279 if ( uvc < 256 ) {
3280 charid = trie->charmap[ uvc ];
3281 } else {
538e84ed
KW
3282 SV* const * const svpp = hv_fetch( widecharmap,
3283 (char*)&uvc,
3284 sizeof( UV ),
3285 0);
786e8c11 3286 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 3287 }
786e8c11
YO
3288 if ( charid ) {
3289 charid--;
3290 if ( !trie->trans[ state + charid ].next ) {
3291 trie->trans[ state + charid ].next = next_alloc;
3292 trie->trans[ state ].check++;
2e64971a
DM
3293 prev_states[TRIE_NODENUM(next_alloc)]
3294 = TRIE_NODENUM(state);
786e8c11
YO
3295 next_alloc += trie->uniquecharcount;
3296 }
3297 state = trie->trans[ state + charid ].next;
3298 } else {
147e3846 3299 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
786e8c11 3300 }
538e84ed
KW
3301 /* charid is now 0 if we dont know the char read, or
3302 * nonzero if we do */
a3621e74 3303 }
ca902fb8
YO
3304 } else {
3305 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3306 * on a trieable type. So we need to reset noper back to point at the first regop
3307 * in the branch before we call TRIE_HANDLE_WORD().
3308 */
3309 noper= NEXTOPER(cur);
a3621e74 3310 }
3dab1dad
YO
3311 accept_state = TRIE_NODENUM( state );
3312 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
3313
3314 } /* end second pass */
3315
3dab1dad 3316 /* and now dump it out before we compress it */
2b8b4781
NC
3317 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3318 revcharmap,
3319 next_alloc, depth+1));
a3621e74 3320
a3621e74
YO
3321 {
3322 /*
3323 * Inplace compress the table.*
3324
3325 For sparse data sets the table constructed by the trie algorithm will
3326 be mostly 0/FAIL transitions or to put it another way mostly empty.
3327 (Note that leaf nodes will not contain any transitions.)
3328
3329 This algorithm compresses the tables by eliminating most such
3330 transitions, at the cost of a modest bit of extra work during lookup:
3331
3332 - Each states[] entry contains a .base field which indicates the
3333 index in the state[] array wheres its transition data is stored.
3334
3b753521 3335 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
3336
3337 - If .base is nonzero then charid is added to it to find an entry in
3338 the trans array.
3339
3340 -If trans[states[state].base+charid].check!=state then the
3341 transition is taken to be a 0/Fail transition. Thus if there are fail
3342 transitions at the front of the node then the .base offset will point
3343 somewhere inside the previous nodes data (or maybe even into a node
3344 even earlier), but the .check field determines if the transition is
3345 valid.
3346
786e8c11 3347 XXX - wrong maybe?
a3621e74 3348 The following process inplace converts the table to the compressed
3b753521 3349 table: We first do not compress the root node 1,and mark all its
a3621e74 3350 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
3351 allows us to do a DFA construction from the compressed table later,
3352 and ensures that any .base pointers we calculate later are greater
3353 than 0.
a3621e74
YO
3354
3355 - We set 'pos' to indicate the first entry of the second node.
3356
3357 - We then iterate over the columns of the node, finding the first and
3358 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3359 and set the .check pointers accordingly, and advance pos
3360 appropriately and repreat for the next node. Note that when we copy
3361 the next pointers we have to convert them from the original
3362 NODEIDX form to NODENUM form as the former is not valid post
3363 compression.
3364
3365 - If a node has no transitions used we mark its base as 0 and do not
3366 advance the pos pointer.
3367
3368 - If a node only has one transition we use a second pointer into the
3369 structure to fill in allocated fail transitions from other states.
3370 This pointer is independent of the main pointer and scans forward
3371 looking for null transitions that are allocated to a state. When it
3372 finds one it writes the single transition into the "hole". If the
786e8c11 3373 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
3374
3375 - Once compressed we can Renew/realloc the structures to release the
3376 excess space.
3377
3378 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3379 specifically Fig 3.47 and the associated pseudocode.
3380
3381 demq
3382 */
a3b680e6 3383 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 3384 U32 state, charid;
a3621e74 3385 U32 pos = 0, zp=0;
1e2e3d02 3386 trie->statecount = laststate;
a3621e74
YO
3387
3388 for ( state = 1 ; state < laststate ; state++ ) {
3389 U8 flag = 0;
a28509cc
AL
3390 const U32 stateidx = TRIE_NODEIDX( state );
3391 const U32 o_used = trie->trans[ stateidx ].check;
3392 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
3393 trie->trans[ stateidx ].check = 0;
3394
538e84ed
KW
3395 for ( charid = 0;
3396 used && charid < trie->uniquecharcount;
3397 charid++ )
3398 {
a3621e74
YO
3399 if ( flag || trie->trans[ stateidx + charid ].next ) {
3400 if ( trie->trans[ stateidx + charid ].next ) {
3401 if (o_used == 1) {
3402 for ( ; zp < pos ; zp++ ) {
3403 if ( ! trie->trans[ zp ].next ) {
3404 break;
3405 }
3406 }
538e84ed
KW
3407 trie->states[ state ].trans.base
3408 = zp
3409 + trie->uniquecharcount
3410 - charid ;
3411 trie->trans[ zp ].next