This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
(toke|regcomp).c: Use common fcn to handle \0 problems
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
c277df42 77#define REG_COMP_C
54df2634
NC
78#ifdef PERL_IN_XSUB_RE
79# include "re_comp.h"
afda64e8 80EXTERN_C const struct regexp_engine my_reg_engine;
54df2634
NC
81#else
82# include "regcomp.h"
83#endif
a687059c 84
f7e03a10 85#include "dquote_inline.h"
b992490d 86#include "invlist_inline.h"
1b0f46bf 87#include "unicode_constants.h"
04e98a4d 88
538e84ed
KW
89#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
90 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
f12c0118
KW
91#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
92 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 93#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 94#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 95
a687059c
LW
96#ifndef STATIC
97#define STATIC static
98#endif
99
3f910e62
YO
100/* this is a chain of data about sub patterns we are processing that
101 need to be handled separately/specially in study_chunk. Its so
102 we can simulate recursion without losing state. */
103struct scan_frame;
104typedef struct scan_frame {
105 regnode *last_regnode; /* last node to process in this frame */
106 regnode *next_regnode; /* next node to process when last is reached */
107 U32 prev_recursed_depth;
108 I32 stopparen; /* what stopparen do we use */
3f910e62
YO
109
110 struct scan_frame *this_prev_frame; /* this previous frame */
111 struct scan_frame *prev_frame; /* previous frame */
112 struct scan_frame *next_frame; /* next frame */
113} scan_frame;
114
f4ae5a27
KW
115/* Certain characters are output as a sequence with the first being a
116 * backslash. */
4aada8b9 117#define isBACKSLASHED_PUNCT(c) memCHRs("-[]\\^", c)
f4ae5a27
KW
118
119
09b2b2e6 120struct RExC_state_t {
514a91f1
DM
121 U32 flags; /* RXf_* are we folding, multilining? */
122 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 123 char *precomp; /* uncompiled string. */
711b303b 124 char *precomp_end; /* pointer to end of uncompiled string. */
288b8c02 125 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf 126 regexp *rx; /* perl core regexp structure */
538e84ed
KW
127 regexp_internal *rxi; /* internal data for regexp object
128 pprivate field */
fac92740 129 char *start; /* Start of input for compile */
830247a4
IZ
130 char *end; /* End of input for compile */
131 char *parse; /* Input-scan pointer. */
51684e3c
KW
132 char *copy_start; /* start of copy of input within
133 constructed parse string */
cc16d262
KW
134 char *save_copy_start; /* Provides one level of saving
135 and restoring 'copy_start' */
51684e3c
KW
136 char *copy_start_in_input; /* Position in input string
137 corresponding to copy_start */
ea3daa5d 138 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 139 regnode *emit_start; /* Start of emitted-code area */
f55b7b26 140 regnode_offset emit; /* Code-emit pointer */
830247a4
IZ
141 I32 naughty; /* How bad is this pattern? */
142 I32 sawback; /* Did we see \1, ...? */
143 U32 seen;
88f063b4
KW
144 SSize_t size; /* Number of regnode equivalents in
145 pattern */
7c932d07
KW
146
147 /* position beyond 'precomp' of the warning message furthest away from
148 * 'precomp'. During the parse, no warnings are raised for any problems
149 * earlier in the parse than this position. This works if warnings are
150 * raised the first time a given spot is parsed, and if only one
151 * independent warning is raised for any given spot */
152 Size_t latest_warn_offset;
153
154 I32 npar; /* Capture buffer count so far in the
155 parse, (OPEN) plus one. ("par" 0 is
156 the whole pattern)*/
157 I32 total_par; /* During initial parse, is either 0,
158 or -1; the latter indicating a
159 reparse is needed. After that pass,
160 it is what 'npar' became after the
161 pass. Hence, it being > 0 indicates
162 we are in a reparse situation */
538e84ed
KW
163 I32 nestroot; /* root parens we are in - used by
164 accept */
830247a4 165 I32 seen_zerolen;
f55b7b26
KW
166 regnode_offset *open_parens; /* offsets to open parens */
167 regnode_offset *close_parens; /* offsets to close parens */
0275405c 168 I32 parens_buf_size; /* #slots malloced open/close_parens */
d5a00e4a 169 regnode *end_op; /* END node in program */
02daf0ab
YO
170 I32 utf8; /* whether the pattern is utf8 or not */
171 I32 orig_utf8; /* whether the pattern was originally in utf8 */
172 /* XXX use this for future optimisation of case
173 * where pattern must be upgraded to utf8. */
e40e74fe
KW
174 I32 uni_semantics; /* If a d charset modifier should use unicode
175 rules, even if the pattern is not in
176 utf8 */
81714fb9 177 HV *paren_names; /* Paren names */
538e84ed 178
40d049e4 179 regnode **recurse; /* Recurse regops */
88f063b4 180 I32 recurse_count; /* Number of recurse regops we have generated */
4286711a 181 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
538e84ed 182 through */
09a65838 183 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 184 I32 in_lookbehind;
105c827d 185 I32 in_lookahead;
4624b182 186 I32 contains_locale;
bb3f3ed2 187 I32 override_recoding;
4caef9c3 188 I32 recode_x_to_native;
9d53c457 189 I32 in_multi_char_class;
1acab4c5 190 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
68e2671b 191 within pattern */
b1603ef8 192 int code_index; /* next code_blocks[] slot */
ee273784 193 SSize_t maxlen; /* mininum possible number of chars in string to match */
3f910e62
YO
194 scan_frame *frame_head;
195 scan_frame *frame_last;
196 U32 frame_count;
7eec73eb 197 AV *warn_text;
945961fd 198 HV *unlexed_names;
dc6d7f5c 199#ifdef ADD_TO_REGEXEC
830247a4
IZ
200 char *starttry; /* -Dr: where regtry was called. */
201#define RExC_starttry (pRExC_state->starttry)
202#endif
d24ca0c5 203 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 204#ifdef DEBUGGING
be8e71aa 205 const char *lastparse;
3dab1dad 206 I32 lastnum;
1f1031fe 207 AV *paren_name_list; /* idx -> name */
d9a72fcc 208 U32 study_chunk_recursed_count;
c9f0d54c
YO
209 SV *mysv1;
210 SV *mysv2;
88f063b4 211
3dab1dad
YO
212#define RExC_lastparse (pRExC_state->lastparse)
213#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 214#define RExC_paren_name_list (pRExC_state->paren_name_list)
d9a72fcc 215#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
c9f0d54c
YO
216#define RExC_mysv (pRExC_state->mysv1)
217#define RExC_mysv1 (pRExC_state->mysv1)
218#define RExC_mysv2 (pRExC_state->mysv2)
219
3dab1dad 220#endif
bd0b62a6 221 bool seen_d_op;
911bd04e 222 bool strict;
da7cf1cc 223 bool study_started;
034602eb 224 bool in_script_run;
7c932d07 225 bool use_BRANCHJ;
09b2b2e6 226};
830247a4 227
e2509266 228#define RExC_flags (pRExC_state->flags)
514a91f1 229#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 230#define RExC_precomp (pRExC_state->precomp)
51684e3c
KW
231#define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
232#define RExC_copy_start_in_constructed (pRExC_state->copy_start)
cc16d262 233#define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start)
711b303b 234#define RExC_precomp_end (pRExC_state->precomp_end)
288b8c02 235#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 236#define RExC_rx (pRExC_state->rx)
f8fc2ecf 237#define RExC_rxi (pRExC_state->rxi)
fac92740 238#define RExC_start (pRExC_state->start)
830247a4
IZ
239#define RExC_end (pRExC_state->end)
240#define RExC_parse (pRExC_state->parse)
7c932d07 241#define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
830247a4 242#define RExC_whilem_seen (pRExC_state->whilem_seen)
bd0b62a6
KW
243#define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
244 under /d from /u ? */
512c0f5a 245
7122b237 246#ifdef RE_TRACK_PATTERN_OFFSETS
44f36cd3 247# define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
538e84ed 248 others */
7122b237 249#endif
830247a4 250#define RExC_emit (pRExC_state->emit)
fac92740 251#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
252#define RExC_sawback (pRExC_state->sawback)
253#define RExC_seen (pRExC_state->seen)
254#define RExC_size (pRExC_state->size)
ee273784 255#define RExC_maxlen (pRExC_state->maxlen)
830247a4 256#define RExC_npar (pRExC_state->npar)
1533d1d2 257#define RExC_total_parens (pRExC_state->total_par)
0275405c 258#define RExC_parens_buf_size (pRExC_state->parens_buf_size)
e2e6a0f1 259#define RExC_nestroot (pRExC_state->nestroot)
830247a4 260#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 261#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 262#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 263#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
264#define RExC_open_parens (pRExC_state->open_parens)
265#define RExC_close_parens (pRExC_state->close_parens)
5bd2d46e 266#define RExC_end_op (pRExC_state->end_op)
81714fb9 267#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
268#define RExC_recurse (pRExC_state->recurse)
269#define RExC_recurse_count (pRExC_state->recurse_count)
09a65838 270#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
538e84ed
KW
271#define RExC_study_chunk_recursed_bytes \
272 (pRExC_state->study_chunk_recursed_bytes)
b57e4118 273#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
105c827d 274#define RExC_in_lookahead (pRExC_state->in_lookahead)
4624b182 275#define RExC_contains_locale (pRExC_state->contains_locale)
4caef9c3
KW
276#define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
277
b6d67071 278#ifdef EBCDIC
4caef9c3
KW
279# define SET_recode_x_to_native(x) \
280 STMT_START { RExC_recode_x_to_native = (x); } STMT_END
281#else
282# define SET_recode_x_to_native(x) NOOP
b6d67071 283#endif
4caef9c3 284
9d53c457 285#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
3f910e62
YO
286#define RExC_frame_head (pRExC_state->frame_head)
287#define RExC_frame_last (pRExC_state->frame_last)
288#define RExC_frame_count (pRExC_state->frame_count)
67cdf558 289#define RExC_strict (pRExC_state->strict)
da7cf1cc 290#define RExC_study_started (pRExC_state->study_started)
7eec73eb 291#define RExC_warn_text (pRExC_state->warn_text)
034602eb 292#define RExC_in_script_run (pRExC_state->in_script_run)
7c932d07 293#define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
945961fd 294#define RExC_unlexed_names (pRExC_state->unlexed_names)
830247a4 295
99807a43
HS
296/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
297 * a flag to disable back-off on the fixed/floating substrings - if it's
298 * a high complexity pattern we assume the benefit of avoiding a full match
299 * is worth the cost of checking for the substrings even if they rarely help.
300 */
301#define RExC_naughty (pRExC_state->naughty)
302#define TOO_NAUGHTY (10)
303#define MARK_NAUGHTY(add) \
304 if (RExC_naughty < TOO_NAUGHTY) \
305 RExC_naughty += (add)
306#define MARK_NAUGHTY_EXP(exp, add) \
307 if (RExC_naughty < TOO_NAUGHTY) \
308 RExC_naughty += RExC_naughty / (exp) + (add)
cde0cee5 309
a687059c
LW
310#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
311#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
412f55bb 312 ((*s) == '{' && regcurly(s)))
a687059c
LW
313
314/*
315 * Flags to be passed up and down.
316 */
a687059c 317#define WORST 0 /* Worst case. */
c4ead770
KW
318#define HASWIDTH 0x01 /* Known to not match null strings, could match
319 non-null ones. */
fda99bee 320
e64f369d 321/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
322 * character. (There needs to be a case: in the switch statement in regexec.c
323 * for any node marked SIMPLE.) Note that this is not the same thing as
324 * REGNODE_SIMPLE */
fda99bee 325#define SIMPLE 0x02
e64f369d 326#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
327#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
328#define TRYAGAIN 0x10 /* Weeded out a declaration. */
c36361b2
KW
329#define RESTART_PARSE 0x20 /* Need to redo the parse */
330#define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
331 calcuate sizes as UTF-8 */
a687059c 332
3dab1dad
YO
333#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
334
07be1b83
YO
335/* whether trie related optimizations are enabled */
336#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
337#define TRIE_STUDY_OPT
786e8c11 338#define FULL_TRIE_STUDY
07be1b83
YO
339#define TRIE_STCLASS
340#endif
1de06328
YO
341
342
40d049e4
YO
343
344#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
345#define PBITVAL(paren) (1 << ((paren) & 7))
ac3ef366
HS
346#define PAREN_OFFSET(depth) \
347 (RExC_study_chunk_recursed + (depth) * RExC_study_chunk_recursed_bytes)
348#define PAREN_TEST(depth, paren) \
349 (PBYTE(PAREN_OFFSET(depth), paren) & PBITVAL(paren))
350#define PAREN_SET(depth, paren) \
351 (PBYTE(PAREN_OFFSET(depth), paren) |= PBITVAL(paren))
352#define PAREN_UNSET(depth, paren) \
353 (PBYTE(PAREN_OFFSET(depth), paren) &= ~PBITVAL(paren))
40d049e4 354
82a6ada4 355#define REQUIRE_UTF8(flagp) STMT_START { \
8d9c2815 356 if (!UTF) { \
c36361b2 357 *flagp = RESTART_PARSE|NEED_UTF8; \
f55b7b26 358 return 0; \
8d9c2815 359 } \
82a6ada4 360 } STMT_END
40d049e4 361
d63d75f5 362/* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
f6e8f31e
KW
363 * a flag that indicates we need to override /d with /u as a result of
364 * something in the pattern. It should only be used in regards to calling
b2090782 365 * set_regex_charset() or get_regex_charset() */
512c0f5a
KW
366#define REQUIRE_UNI_RULES(flagp, restart_retval) \
367 STMT_START { \
368 if (DEPENDS_SEMANTICS) { \
512c0f5a
KW
369 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
370 RExC_uni_semantics = 1; \
b76e1c04 371 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \
bd0b62a6
KW
372 /* No need to restart the parse if we haven't seen \
373 * anything that differs between /u and /d, and no need \
374 * to restart immediately if we're going to reparse \
375 * anyway to count parens */ \
c36361b2 376 *flagp |= RESTART_PARSE; \
512c0f5a 377 return restart_retval; \
d1be211c 378 } \
512c0f5a
KW
379 } \
380 } STMT_END
381
7c932d07
KW
382#define REQUIRE_BRANCHJ(flagp, restart_retval) \
383 STMT_START { \
384 RExC_use_BRANCHJ = 1; \
439a3bfe
KW
385 *flagp |= RESTART_PARSE; \
386 return restart_retval; \
7c932d07
KW
387 } STMT_END
388
b76e1c04
KW
389/* Until we have completed the parse, we leave RExC_total_parens at 0 or
390 * less. After that, it must always be positive, because the whole re is
391 * considered to be surrounded by virtual parens. Setting it to negative
392 * indicates there is some construct that needs to know the actual number of
393 * parens to be properly handled. And that means an extra pass will be
394 * required after we've counted them all */
395#define ALL_PARENS_COUNTED (RExC_total_parens > 0)
7c932d07 396#define REQUIRE_PARENS_PASS \
b76e1c04
KW
397 STMT_START { /* No-op if have completed a pass */ \
398 if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \
7c932d07 399 } STMT_END
b76e1c04
KW
400#define IN_PARENS_PASS (RExC_total_parens < 0)
401
7c932d07 402
f358228e
KW
403/* This is used to return failure (zero) early from the calling function if
404 * various flags in 'flags' are set. Two flags always cause a return:
405 * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any
406 * additional flags that should cause a return; 0 if none. If the return will
407 * be done, '*flagp' is first set to be all of the flags that caused the
408 * return. */
409#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
fd7fc6ae 410 STMT_START { \
c36361b2
KW
411 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
412 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
f358228e 413 return 0; \
fd7fc6ae
YO
414 } \
415 } STMT_END
416
c36361b2 417#define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
fd7fc6ae 418
5d555bb8 419#define RETURN_FAIL_ON_RESTART(flags,flagp) \
f358228e 420 RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
5d555bb8 421#define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
f358228e 422 if (MUST_RESTART(*(flagp))) return 0
fd7fc6ae 423
f19b1a63
KW
424/* This converts the named class defined in regcomp.h to its equivalent class
425 * number defined in handy.h. */
426#define namedclass_to_classnum(class) ((int) ((class) / 2))
427#define classnum_to_namedclass(classnum) ((classnum) * 2)
428
de92f5e6
KW
429#define _invlist_union_complement_2nd(a, b, output) \
430 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
431#define _invlist_intersection_complement_2nd(a, b, output) \
432 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
433
4a808c87
KW
434/* We add a marker if we are deferring expansion of a property that is both
435 * 1) potentiallly user-defined; and
436 * 2) could also be an official Unicode property.
437 *
438 * Without this marker, any deferred expansion can only be for a user-defined
439 * one. This marker shouldn't conflict with any that could be in a legal name,
440 * and is appended to its name to indicate this. There is a string and
441 * character form */
043cdb9a
KW
442#define DEFERRED_COULD_BE_OFFICIAL_MARKERs "~"
443#define DEFERRED_COULD_BE_OFFICIAL_MARKERc '~'
1c2f3d7a 444
1de06328
YO
445/* About scan_data_t.
446
447 During optimisation we recurse through the regexp program performing
448 various inplace (keyhole style) optimisations. In addition study_chunk
449 and scan_commit populate this data structure with information about
538e84ed 450 what strings MUST appear in the pattern. We look for the longest
3b753521 451 string that must appear at a fixed location, and we look for the
1de06328
YO
452 longest string that may appear at a floating location. So for instance
453 in the pattern:
538e84ed 454
1de06328 455 /FOO[xX]A.*B[xX]BAR/
538e84ed 456
1de06328
YO
457 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
458 strings (because they follow a .* construct). study_chunk will identify
459 both FOO and BAR as being the longest fixed and floating strings respectively.
538e84ed 460
1de06328 461 The strings can be composites, for instance
538e84ed 462
1de06328 463 /(f)(o)(o)/
538e84ed 464
1de06328 465 will result in a composite fixed substring 'foo'.
538e84ed 466
1de06328 467 For each string some basic information is maintained:
538e84ed 468
6ea77169 469 - min_offset
1de06328
YO
470 This is the position the string must appear at, or not before.
471 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
472 characters must match before the string we are searching for.
473 Likewise when combined with minlenp and the length of the string it
538e84ed 474 tells us how many characters must appear after the string we have
1de06328 475 found.
538e84ed 476
1de06328
YO
477 - max_offset
478 Only used for floating strings. This is the rightmost point that
ea3daa5d 479 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328 480 string can occur infinitely far to the right.
6e12f832 481 For fixed strings, it is equal to min_offset.
538e84ed 482
1de06328 483 - minlenp
2d608413
KW
484 A pointer to the minimum number of characters of the pattern that the
485 string was found inside. This is important as in the case of positive
538e84ed 486 lookahead or positive lookbehind we can have multiple patterns
1de06328 487 involved. Consider
538e84ed 488
1de06328 489 /(?=FOO).*F/
538e84ed 490
1de06328
YO
491 The minimum length of the pattern overall is 3, the minimum length
492 of the lookahead part is 3, but the minimum length of the part that
538e84ed 493 will actually match is 1. So 'FOO's minimum length is 3, but the
1de06328 494 minimum length for the F is 1. This is important as the minimum length
538e84ed 495 is used to determine offsets in front of and behind the string being
1de06328 496 looked for. Since strings can be composites this is the length of the
486ec47a 497 pattern at the time it was committed with a scan_commit. Note that
1de06328 498 the length is calculated by study_chunk, so that the minimum lengths
538e84ed 499 are not known until the full pattern has been compiled, thus the
1de06328 500 pointer to the value.
538e84ed 501
1de06328 502 - lookbehind
538e84ed 503
1de06328 504 In the case of lookbehind the string being searched for can be
538e84ed 505 offset past the start point of the final matching string.
1de06328
YO
506 If this value was just blithely removed from the min_offset it would
507 invalidate some of the calculations for how many chars must match
508 before or after (as they are derived from min_offset and minlen and
538e84ed 509 the length of the string being searched for).
1de06328
YO
510 When the final pattern is compiled and the data is moved from the
511 scan_data_t structure into the regexp structure the information
538e84ed
KW
512 about lookbehind is factored in, with the information that would
513 have been lost precalculated in the end_shift field for the
1de06328
YO
514 associated string.
515
516 The fields pos_min and pos_delta are used to store the minimum offset
538e84ed 517 and the delta to the maximum offset at the current point in the pattern.
1de06328
YO
518
519*/
2c2d71f5 520
cb6aef8b
DM
521struct scan_data_substrs {
522 SV *str; /* longest substring found in pattern */
523 SSize_t min_offset; /* earliest point in string it can appear */
524 SSize_t max_offset; /* latest point in string it can appear */
525 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
526 SSize_t lookbehind; /* is the pos of the string modified by LB */
11683ecb 527 I32 flags; /* per substring SF_* and SCF_* flags */
cb6aef8b
DM
528};
529
2c2d71f5 530typedef struct scan_data_t {
1de06328
YO
531 /*I32 len_min; unused */
532 /*I32 len_delta; unused */
49f55535 533 SSize_t pos_min;
ea3daa5d 534 SSize_t pos_delta;
2c2d71f5 535 SV *last_found;
ea3daa5d 536 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 537 SSize_t last_start_min;
ea3daa5d 538 SSize_t last_start_max;
2099df82
DM
539 U8 cur_is_floating; /* whether the last_* values should be set as
540 * the next fixed (0) or floating (1)
541 * substring */
6ea77169 542
2099df82
DM
543 /* [0] is longest fixed substring so far, [1] is longest float so far */
544 struct scan_data_substrs substrs[2];
6ea77169 545
11683ecb 546 I32 flags; /* common SF_* and SCF_* flags */
2c2d71f5 547 I32 whilem_c;
49f55535 548 SSize_t *last_closep;
b8f7bb16 549 regnode_ssc *start_class;
2c2d71f5
JH
550} scan_data_t;
551
a687059c 552/*
e50aee73 553 * Forward declarations for pregcomp()'s friends.
a687059c 554 */
a0d0e21e 555
6ea77169 556static const scan_data_t zero_scan_data = {
a2ca2017 557 0, 0, NULL, 0, 0, 0, 0,
6ea77169 558 {
11683ecb
DM
559 { NULL, 0, 0, 0, 0, 0 },
560 { NULL, 0, 0, 0, 0, 0 },
6ea77169
DM
561 },
562 0, 0, NULL, NULL
563};
c277df42 564
11683ecb
DM
565/* study flags */
566
07be1b83
YO
567#define SF_BEFORE_SEOL 0x0001
568#define SF_BEFORE_MEOL 0x0002
11683ecb 569#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
c277df42 570
07be1b83
YO
571#define SF_IS_INF 0x0040
572#define SF_HAS_PAR 0x0080
573#define SF_IN_PAR 0x0100
574#define SF_HAS_EVAL 0x0200
bc604ad8
KW
575
576
577/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
578 * longest substring in the pattern. When it is not set the optimiser keeps
579 * track of position, but does not keep track of the actual strings seen,
580 *
581 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
582 * /foo/i will not.
583 *
584 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
585 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
586 * turned off because of the alternation (BRANCH). */
07be1b83 587#define SCF_DO_SUBSTR 0x0400
bc604ad8 588
653099ff
GS
589#define SCF_DO_STCLASS_AND 0x0800
590#define SCF_DO_STCLASS_OR 0x1000
591#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 592#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 593
786e8c11 594#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
538e84ed 595#define SCF_SEEN_ACCEPT 0x8000
688e0391 596#define SCF_TRIE_DOING_RESTUDY 0x10000
4286711a
YO
597#define SCF_IN_DEFINE 0x20000
598
599
600
07be1b83 601
43fead97 602#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
603
604/* The enums for all these are ordered so things work out correctly */
a62b1201 605#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
538e84ed
KW
606#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
607 == REGEX_DEPENDS_CHARSET)
00b27cfc 608#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
538e84ed
KW
609#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
610 >= REGEX_UNICODE_CHARSET)
611#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
612 == REGEX_ASCII_RESTRICTED_CHARSET)
613#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
614 >= REGEX_ASCII_RESTRICTED_CHARSET)
615#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
616 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 617
43fead97 618#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 619
f2c2a6ab
KW
620/* For programs that want to be strictly Unicode compatible by dying if any
621 * attempt is made to match a non-Unicode code point against a Unicode
622 * property. */
623#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
624
93733859 625#define OOB_NAMEDCLASS -1
b8c5462f 626
8e661ac5
KW
627/* There is no code point that is out-of-bounds, so this is problematic. But
628 * its only current use is to initialize a variable that is always set before
629 * looked at. */
630#define OOB_UNICODE 0xDEADBEEF
631
a0ed51b3 632#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
a0ed51b3 633
8615cb43 634
b45f050a
JF
635/* length of regex to show in messages that don't mark a position within */
636#define RegexLengthToShowInErrorMessages 127
637
638/*
639 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
640 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
641 * op/pragma/warn/regcomp.
642 */
7253e4e3
RK
643#define MARKER1 "<-- HERE" /* marker as it appears in the description */
644#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 645
538e84ed 646#define REPORT_LOCATION " in regex; marked by " MARKER1 \
147e3846 647 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
b45f050a 648
285b5ca0
KW
649/* The code in this file in places uses one level of recursion with parsing
650 * rebased to an alternate string constructed by us in memory. This can take
651 * the form of something that is completely different from the input, or
652 * something that uses the input as part of the alternate. In the first case,
653 * there should be no possibility of an error, as we are in complete control of
51684e3c
KW
654 * the alternate string. But in the second case we don't completely control
655 * the input portion, so there may be errors in that. Here's an example:
285b5ca0
KW
656 * /[abc\x{DF}def]/ui
657 * is handled specially because \x{df} folds to a sequence of more than one
51684e3c 658 * character: 'ss'. What is done is to create and parse an alternate string,
285b5ca0
KW
659 * which looks like this:
660 * /(?:\x{DF}|[abc\x{DF}def])/ui
661 * where it uses the input unchanged in the middle of something it constructs,
662 * which is a branch for the DF outside the character class, and clustering
663 * parens around the whole thing. (It knows enough to skip the DF inside the
664 * class while in this substitute parse.) 'abc' and 'def' may have errors that
665 * need to be reported. The general situation looks like this:
666 *
51684e3c 667 * |<------- identical ------>|
285b5ca0 668 * sI tI xI eI
51684e3c 669 * Input: ---------------------------------------------------------------
285b5ca0
KW
670 * Constructed: ---------------------------------------------------
671 * sC tC xC eC EC
51684e3c 672 * |<------- identical ------>|
285b5ca0 673 *
51684e3c
KW
674 * sI..eI is the portion of the input pattern we are concerned with here.
675 * sC..EC is the constructed substitute parse string.
676 * sC..tC is constructed by us
677 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
678 * In the diagram, these are vertically aligned.
679 * eC..EC is also constructed by us.
680 * xC is the position in the substitute parse string where we found a
681 * problem.
682 * xI is the position in the original pattern corresponding to xC.
285b5ca0 683 *
51684e3c
KW
684 * We want to display a message showing the real input string. Thus we need to
685 * translate from xC to xI. We know that xC >= tC, since the portion of the
686 * string sC..tC has been constructed by us, and so shouldn't have errors. We
687 * get:
688 * xI = tI + (xC - tC)
285b5ca0 689 *
51684e3c
KW
690 * When the substitute parse is constructed, the code needs to set:
691 * RExC_start (sC)
692 * RExC_end (eC)
693 * RExC_copy_start_in_input (tI)
694 * RExC_copy_start_in_constructed (tC)
695 * and restore them when done.
285b5ca0 696 *
51684e3c
KW
697 * During normal processing of the input pattern, both
698 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
699 * sI, so that xC equals xI.
285b5ca0
KW
700 */
701
51684e3c
KW
702#define sI RExC_precomp
703#define eI RExC_precomp_end
704#define sC RExC_start
705#define eC RExC_end
706#define tI RExC_copy_start_in_input
707#define tC RExC_copy_start_in_constructed
708#define xI(xC) (tI + (xC - tC))
709#define xI_offset(xC) (xI(xC) - sI)
285b5ca0
KW
710
711#define REPORT_LOCATION_ARGS(xC) \
712 UTF8fARG(UTF, \
51684e3c 713 (xI(xC) > eI) /* Don't run off end */ \
232b691f 714 ? eI - sI /* Length before the <--HERE */ \
bc929b61
KW
715 : ((xI_offset(xC) >= 0) \
716 ? xI_offset(xC) \
717 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
718 IVdf " trying to output message for " \
719 " pattern %.*s", \
51684e3c 720 __FILE__, __LINE__, (IV) xI_offset(xC), \
bc929b61 721 ((int) (eC - sC)), sC), 0)), \
51684e3c 722 sI), /* The input pattern printed up to the <--HERE */ \
285b5ca0 723 UTF8fARG(UTF, \
51684e3c
KW
724 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \
725 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */
c1d900c3 726
8a6d8ec6
HS
727/* Used to point after bad bytes for an error message, but avoid skipping
728 * past a nul byte. */
e3304382 729#define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
8a6d8ec6 730
ed68fe11
KW
731/* Set up to clean up after our imminent demise */
732#define PREPARE_TO_DIE \
733 STMT_START { \
734 if (RExC_rx_sv) \
735 SAVEFREESV(RExC_rx_sv); \
7c932d07
KW
736 if (RExC_open_parens) \
737 SAVEFREEPV(RExC_open_parens); \
738 if (RExC_close_parens) \
739 SAVEFREEPV(RExC_close_parens); \
ed68fe11
KW
740 } STMT_END
741
b45f050a
JF
742/*
743 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
744 * arg. Show regex, up to a maximum length. If it's too long, chop and add
745 * "...".
746 */
58e23c8d 747#define _FAIL(code) STMT_START { \
bfed75c6 748 const char *ellipses = ""; \
88f063b4 749 IV len = RExC_precomp_end - RExC_precomp; \
ccb2c380 750 \
ed68fe11 751 PREPARE_TO_DIE; \
ccb2c380
MP
752 if (len > RegexLengthToShowInErrorMessages) { \
753 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
754 len = RegexLengthToShowInErrorMessages - 10; \
755 ellipses = "..."; \
756 } \
58e23c8d 757 code; \
ccb2c380 758} STMT_END
8615cb43 759
58e23c8d 760#define FAIL(msg) _FAIL( \
147e3846 761 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
c1d900c3 762 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
763
764#define FAIL2(msg,arg) _FAIL( \
147e3846 765 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
c1d900c3 766 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 767
4b7718d6
KW
768#define FAIL3(msg,arg1,arg2) _FAIL( \
769 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
770 arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
771
b45f050a 772/*
b45f050a
JF
773 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
774 */
ccb2c380 775#define Simple_vFAIL(m) STMT_START { \
ccb2c380 776 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
d528642a 777 m, REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 778} STMT_END
b45f050a
JF
779
780/*
781 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
782 */
ccb2c380 783#define vFAIL(m) STMT_START { \
ed68fe11 784 PREPARE_TO_DIE; \
ccb2c380
MP
785 Simple_vFAIL(m); \
786} STMT_END
b45f050a
JF
787
788/*
789 * Like Simple_vFAIL(), but accepts two arguments.
790 */
ccb2c380 791#define Simple_vFAIL2(m,a1) STMT_START { \
d528642a
KW
792 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
793 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 794} STMT_END
b45f050a
JF
795
796/*
797 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
798 */
ccb2c380 799#define vFAIL2(m,a1) STMT_START { \
ed68fe11 800 PREPARE_TO_DIE; \
ccb2c380
MP
801 Simple_vFAIL2(m, a1); \
802} STMT_END
b45f050a
JF
803
804
805/*
806 * Like Simple_vFAIL(), but accepts three arguments.
807 */
ccb2c380 808#define Simple_vFAIL3(m, a1, a2) STMT_START { \
c1d900c3 809 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 810 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 811} STMT_END
b45f050a
JF
812
813/*
814 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
815 */
ccb2c380 816#define vFAIL3(m,a1,a2) STMT_START { \
ed68fe11 817 PREPARE_TO_DIE; \
ccb2c380
MP
818 Simple_vFAIL3(m, a1, a2); \
819} STMT_END
b45f050a
JF
820
821/*
822 * Like Simple_vFAIL(), but accepts four arguments.
823 */
ccb2c380 824#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
d528642a
KW
825 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
826 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 827} STMT_END
b45f050a 828
95db3ffa 829#define vFAIL4(m,a1,a2,a3) STMT_START { \
ed68fe11 830 PREPARE_TO_DIE; \
95db3ffa
KW
831 Simple_vFAIL4(m, a1, a2, a3); \
832} STMT_END
833
946095af 834/* A specialized version of vFAIL2 that works with UTF8f */
d528642a 835#define vFAIL2utf8f(m, a1) STMT_START { \
ed68fe11 836 PREPARE_TO_DIE; \
d528642a
KW
837 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
838 REPORT_LOCATION_ARGS(RExC_parse)); \
946095af
BF
839} STMT_END
840
3ba22297 841#define vFAIL3utf8f(m, a1, a2) STMT_START { \
ed68fe11 842 PREPARE_TO_DIE; \
3ba22297 843 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 844 REPORT_LOCATION_ARGS(RExC_parse)); \
3ba22297
KW
845} STMT_END
846
d62e749e 847/* Setting this to NULL is a signal to not output warnings */
cc16d262
KW
848#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \
849 STMT_START { \
850 RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\
851 RExC_copy_start_in_constructed = NULL; \
852 } STMT_END
853#define RESTORE_WARNINGS \
854 RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
d62e749e 855
7c932d07
KW
856/* Since a warning can be generated multiple times as the input is reparsed, we
857 * output it the first time we come to that point in the parse, but suppress it
858 * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not
859 * generate any warnings */
55bcf857 860#define TO_OUTPUT_WARNINGS(loc) \
7c932d07
KW
861 ( RExC_copy_start_in_constructed \
862 && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
55bcf857 863
7c932d07
KW
864/* After we've emitted a warning, we save the position in the input so we don't
865 * output it again */
866#define UPDATE_WARNINGS_LOC(loc) \
867 STMT_START { \
868 if (TO_OUTPUT_WARNINGS(loc)) { \
5d894ca5
KW
869 RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \
870 - RExC_precomp; \
7c932d07
KW
871 } \
872 } STMT_END
77ccba3c 873
086f21bf 874/* 'warns' is the output of the packWARNx macro used in 'code' */
f8220fe2
KW
875#define _WARN_HELPER(loc, warns, code) \
876 STMT_START { \
d62e749e
KW
877 if (! RExC_copy_start_in_constructed) { \
878 Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \
879 " expected at '%s'", \
880 __FILE__, __LINE__, loc); \
881 } \
55bcf857 882 if (TO_OUTPUT_WARNINGS(loc)) { \
086f21bf
KW
883 if (ckDEAD(warns)) \
884 PREPARE_TO_DIE; \
338190a1 885 code; \
77ccba3c 886 UPDATE_WARNINGS_LOC(loc); \
338190a1 887 } \
f8220fe2 888 } STMT_END
946095af 889
5e0a247b 890/* m is not necessarily a "literal string", in this macro */
58ad8160
KW
891#define warn_non_literal_string(loc, packed_warn, m) \
892 _WARN_HELPER(loc, packed_warn, \
893 Perl_warner(aTHX_ packed_warn, \
d528642a 894 "%s" REPORT_LOCATION, \
f8220fe2 895 m, REPORT_LOCATION_ARGS(loc)))
58ad8160
KW
896#define reg_warn_non_literal_string(loc, m) \
897 warn_non_literal_string(loc, packWARN(WARN_REGEXP), m)
5e0a247b 898
eb761011
KW
899#define ckWARN2_non_literal_string(loc, packwarn, m, a1) \
900 STMT_START { \
901 char * format; \
902 Size_t format_size = strlen(m) + strlen(REPORT_LOCATION)+ 1;\
903 Newx(format, format_size, char); \
904 my_strlcpy(format, m, format_size); \
905 my_strlcat(format, REPORT_LOCATION, format_size); \
906 SAVEFREEPV(format); \
907 _WARN_HELPER(loc, packwarn, \
908 Perl_ck_warner(aTHX_ packwarn, \
909 format, \
910 a1, REPORT_LOCATION_ARGS(loc))); \
911 } STMT_END
912
f8220fe2
KW
913#define ckWARNreg(loc,m) \
914 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
915 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 916 m REPORT_LOCATION, \
f8220fe2 917 REPORT_LOCATION_ARGS(loc)))
ccb2c380 918
f8220fe2
KW
919#define vWARN(loc, m) \
920 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
921 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 922 m REPORT_LOCATION, \
f8220fe2 923 REPORT_LOCATION_ARGS(loc))) \
b927b7e9 924
f8220fe2
KW
925#define vWARN_dep(loc, m) \
926 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
927 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
d528642a 928 m REPORT_LOCATION, \
f8220fe2 929 REPORT_LOCATION_ARGS(loc)))
0d6106aa 930
f8220fe2
KW
931#define ckWARNdep(loc,m) \
932 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
933 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
d528642a 934 m REPORT_LOCATION, \
f8220fe2 935 REPORT_LOCATION_ARGS(loc)))
147508a2 936
f8220fe2
KW
937#define ckWARNregdep(loc,m) \
938 _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
939 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
d528642a
KW
940 WARN_REGEXP), \
941 m REPORT_LOCATION, \
f8220fe2 942 REPORT_LOCATION_ARGS(loc)))
ccb2c380 943
f8220fe2
KW
944#define ckWARN2reg_d(loc,m, a1) \
945 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
946 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
d528642a 947 m REPORT_LOCATION, \
f8220fe2 948 a1, REPORT_LOCATION_ARGS(loc)))
2335b3d3 949
f8220fe2
KW
950#define ckWARN2reg(loc, m, a1) \
951 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
952 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 953 m REPORT_LOCATION, \
f8220fe2 954 a1, REPORT_LOCATION_ARGS(loc)))
ccb2c380 955
f8220fe2
KW
956#define vWARN3(loc, m, a1, a2) \
957 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
958 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 959 m REPORT_LOCATION, \
f8220fe2 960 a1, a2, REPORT_LOCATION_ARGS(loc)))
ccb2c380 961
f8220fe2
KW
962#define ckWARN3reg(loc, m, a1, a2) \
963 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
964 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
965 m REPORT_LOCATION, \
966 a1, a2, \
f8220fe2 967 REPORT_LOCATION_ARGS(loc)))
668c081a 968
f8220fe2
KW
969#define vWARN4(loc, m, a1, a2, a3) \
970 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
971 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
972 m REPORT_LOCATION, \
973 a1, a2, a3, \
f8220fe2 974 REPORT_LOCATION_ARGS(loc)))
ccb2c380 975
f8220fe2
KW
976#define ckWARN4reg(loc, m, a1, a2, a3) \
977 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
978 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
979 m REPORT_LOCATION, \
980 a1, a2, a3, \
f8220fe2 981 REPORT_LOCATION_ARGS(loc)))
668c081a 982
f8220fe2
KW
983#define vWARN5(loc, m, a1, a2, a3, a4) \
984 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
985 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
986 m REPORT_LOCATION, \
987 a1, a2, a3, a4, \
f8220fe2 988 REPORT_LOCATION_ARGS(loc)))
9d1d55b5 989
aea5f181
KW
990#define ckWARNexperimental(loc, class, m) \
991 _WARN_HELPER(loc, packWARN(class), \
992 Perl_ck_warner_d(aTHX_ packWARN(class), \
993 m REPORT_LOCATION, \
994 REPORT_LOCATION_ARGS(loc)))
995
1a322bb4
KW
996/* Convert between a pointer to a node and its offset from the beginning of the
997 * program */
998#define REGNODE_p(offset) (RExC_emit_start + (offset))
999#define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
1000
538e84ed 1001/* Macros for recording node offsets. 20001227 mjd@plover.com
fac92740
MJD
1002 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
1003 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
1004 * Element 0 holds the number n.
07be1b83 1005 * Position is 1 indexed.
fac92740 1006 */
7122b237 1007#ifndef RE_TRACK_PATTERN_OFFSETS
d7c442ce 1008#define Set_Node_Offset_To_R(offset,byte)
7122b237
YO
1009#define Set_Node_Offset(node,byte)
1010#define Set_Cur_Node_Offset
1011#define Set_Node_Length_To_R(node,len)
1012#define Set_Node_Length(node,len)
6a86c6ad 1013#define Set_Node_Cur_Length(node,start)
538e84ed
KW
1014#define Node_Offset(n)
1015#define Node_Length(n)
7122b237
YO
1016#define Set_Node_Offset_Length(node,offset,len)
1017#define ProgLen(ri) ri->u.proglen
1018#define SetProgLen(ri,x) ri->u.proglen = x
4cbeff97 1019#define Track_Code(code)
7122b237
YO
1020#else
1021#define ProgLen(ri) ri->u.offsets[0]
1022#define SetProgLen(ri,x) ri->u.offsets[0] = x
d7c442ce 1023#define Set_Node_Offset_To_R(offset,byte) STMT_START { \
ccb2c380 1024 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
d7c442ce
KW
1025 __LINE__, (int)(offset), (int)(byte))); \
1026 if((offset) < 0) { \
538e84ed 1027 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
d7c442ce 1028 (int)(offset)); \
ccb2c380 1029 } else { \
d7c442ce 1030 RExC_offsets[2*(offset)-1] = (byte); \
ccb2c380 1031 } \
ccb2c380
MP
1032} STMT_END
1033
88f063b4 1034#define Set_Node_Offset(node,byte) \
1a322bb4 1035 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
ccb2c380
MP
1036#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
1037
1038#define Set_Node_Length_To_R(node,len) STMT_START { \
ccb2c380 1039 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 1040 __LINE__, (int)(node), (int)(len))); \
ccb2c380 1041 if((node) < 0) { \
538e84ed
KW
1042 Perl_croak(aTHX_ "value of node is %d in Length macro", \
1043 (int)(node)); \
ccb2c380
MP
1044 } else { \
1045 RExC_offsets[2*(node)] = (len); \
1046 } \
ccb2c380
MP
1047} STMT_END
1048
1049#define Set_Node_Length(node,len) \
1a322bb4 1050 Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
6a86c6ad
NC
1051#define Set_Node_Cur_Length(node, start) \
1052 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
1053
1054/* Get offsets and lengths */
1a322bb4
KW
1055#define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1056#define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
fac92740 1057
07be1b83 1058#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
1a322bb4
KW
1059 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \
1060 Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \
07be1b83 1061} STMT_END
4cbeff97
TC
1062
1063#define Track_Code(code) STMT_START { code } STMT_END
7122b237 1064#endif
07be1b83
YO
1065
1066#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1067#define EXPERIMENTAL_INPLACESCAN
f427392e 1068#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 1069
cb41e5d6
YO
1070#ifdef DEBUGGING
1071int
6ad9a8ab 1072Perl_re_printf(pTHX_ const char *fmt, ...)
cb41e5d6
YO
1073{
1074 va_list ap;
1075 int result;
1076 PerlIO *f= Perl_debug_log;
1077 PERL_ARGS_ASSERT_RE_PRINTF;
1078 va_start(ap, fmt);
1079 result = PerlIO_vprintf(f, fmt, ap);
1080 va_end(ap);
1081 return result;
1082}
1083
1084int
7b031478 1085Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
cb41e5d6
YO
1086{
1087 va_list ap;
1088 int result;
1089 PerlIO *f= Perl_debug_log;
1090 PERL_ARGS_ASSERT_RE_INDENTF;
1091 va_start(ap, depth);
daeb874b 1092 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
cb41e5d6
YO
1093 result = PerlIO_vprintf(f, fmt, ap);
1094 va_end(ap);
1095 return result;
1096}
1097#endif /* DEBUGGING */
1098
7b031478 1099#define DEBUG_RExC_seen() \
538e84ed 1100 DEBUG_OPTIMISE_MORE_r({ \
88f063b4 1101 Perl_re_printf( aTHX_ "RExC_seen: "); \
538e84ed 1102 \
e384d5c1 1103 if (RExC_seen & REG_ZERO_LEN_SEEN) \
88f063b4 1104 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
538e84ed 1105 \
e384d5c1 1106 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
88f063b4 1107 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
538e84ed 1108 \
e384d5c1 1109 if (RExC_seen & REG_GPOS_SEEN) \
88f063b4 1110 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
538e84ed 1111 \
e384d5c1 1112 if (RExC_seen & REG_RECURSE_SEEN) \
88f063b4 1113 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
538e84ed 1114 \
7b031478 1115 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
88f063b4 1116 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
538e84ed 1117 \
e384d5c1 1118 if (RExC_seen & REG_VERBARG_SEEN) \
88f063b4 1119 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
538e84ed 1120 \
e384d5c1 1121 if (RExC_seen & REG_CUTGROUP_SEEN) \
88f063b4 1122 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
538e84ed 1123 \
e384d5c1 1124 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
88f063b4 1125 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
538e84ed 1126 \
e384d5c1 1127 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
88f063b4 1128 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
538e84ed 1129 \
7b031478 1130 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
88f063b4 1131 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
ee273784 1132 \
88f063b4 1133 Perl_re_printf( aTHX_ "\n"); \
9e9ecfdf
YO
1134 });
1135
fdfb4f21 1136#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
6ad9a8ab 1137 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
fdfb4f21 1138
fdfb4f21 1139
f5a36d78
DM
1140#ifdef DEBUGGING
1141static void
1142S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1143 const char *close_str)
1144{
1145 if (!flags)
1146 return;
1147
1148 Perl_re_printf( aTHX_ "%s", open_str);
11683ecb
DM
1149 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1150 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
f5a36d78
DM
1151 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1152 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1153 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1154 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1155 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1156 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1157 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1158 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1159 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1160 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1161 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1162 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1163 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1164 Perl_re_printf( aTHX_ "%s", close_str);
1165}
1166
1167
1168static void
1169S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1170 U32 depth, int is_inf)
1171{
1172 GET_RE_DEBUG_FLAGS_DECL;
1173
1174 DEBUG_OPTIMISE_MORE_r({
1175 if (!data)
1176 return;
1177 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1178 depth,
1179 where,
1180 (IV)data->pos_min,
1181 (IV)data->pos_delta,
1182 (UV)data->flags
1183 );
1184
1185 S_debug_show_study_flags(aTHX_ data->flags," [","]");
fdfb4f21 1186
f5a36d78
DM
1187 Perl_re_printf( aTHX_
1188 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1189 (IV)data->whilem_c,
1190 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1191 is_inf ? "INF " : ""
1192 );
1193
11683ecb 1194 if (data->last_found) {
37b6262f 1195 int i;
11683ecb
DM
1196 Perl_re_printf(aTHX_
1197 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1198 SvPVX_const(data->last_found),
1199 (IV)data->last_end,
1200 (IV)data->last_start_min,
1201 (IV)data->last_start_max
1202 );
1203
2099df82 1204 for (i = 0; i < 2; i++) {
37b6262f
DM
1205 Perl_re_printf(aTHX_
1206 " %s%s: '%s' @ %" IVdf "/%" IVdf,
2099df82
DM
1207 data->cur_is_floating == i ? "*" : "",
1208 i ? "Float" : "Fixed",
37b6262f
DM
1209 SvPVX_const(data->substrs[i].str),
1210 (IV)data->substrs[i].min_offset,
1211 (IV)data->substrs[i].max_offset
1212 );
1213 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1214 }
11683ecb 1215 }
f5a36d78
DM
1216
1217 Perl_re_printf( aTHX_ "\n");
1218 });
1219}
1220
1221
1222static void
1223S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1224 regnode *scan, U32 depth, U32 flags)
1225{
1226 GET_RE_DEBUG_FLAGS_DECL;
1227
1228 DEBUG_OPTIMISE_r({
1229 regnode *Next;
1230
1231 if (!scan)
1232 return;
1233 Next = regnext(scan);
1234 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1235 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1236 depth,
1237 str,
1238 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1239 Next ? (REG_NODE_NUM(Next)) : 0 );
1240 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1241 Perl_re_printf( aTHX_ "\n");
1242 });
1243}
1244
1245
1246# define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1247 S_debug_studydata(aTHX_ where, data, depth, is_inf)
1248
1249# define DEBUG_PEEP(str, scan, depth, flags) \
1250 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1251
1252#else
1253# define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1254# define DEBUG_PEEP(str, scan, depth, flags) NOOP
1255#endif
1de06328 1256
cb41e5d6 1257
c6871b76
KW
1258/* =========================================================
1259 * BEGIN edit_distance stuff.
1260 *
1261 * This calculates how many single character changes of any type are needed to
1262 * transform a string into another one. It is taken from version 3.1 of
1263 *
1264 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1265 */
1266
1267/* Our unsorted dictionary linked list. */
1268/* Note we use UVs, not chars. */
1269
1270struct dictionary{
1271 UV key;
1272 UV value;
1273 struct dictionary* next;
1274};
1275typedef struct dictionary item;
1276
1277
1278PERL_STATIC_INLINE item*
88f063b4 1279push(UV key, item* curr)
c6871b76
KW
1280{
1281 item* head;
d09f14bf 1282 Newx(head, 1, item);
c6871b76
KW
1283 head->key = key;
1284 head->value = 0;
1285 head->next = curr;
1286 return head;
1287}
1288
1289
1290PERL_STATIC_INLINE item*
1291find(item* head, UV key)
1292{
1293 item* iterator = head;
1294 while (iterator){
1295 if (iterator->key == key){
1296 return iterator;
1297 }
1298 iterator = iterator->next;
1299 }
1300
1301 return NULL;
1302}
1303
1304PERL_STATIC_INLINE item*
88f063b4 1305uniquePush(item* head, UV key)
c6871b76
KW
1306{
1307 item* iterator = head;
1308
1309 while (iterator){
1310 if (iterator->key == key) {
1311 return head;
1312 }
1313 iterator = iterator->next;
1314 }
1315
88f063b4 1316 return push(key, head);
c6871b76
KW
1317}
1318
1319PERL_STATIC_INLINE void
1320dict_free(item* head)
1321{
1322 item* iterator = head;
1323
1324 while (iterator) {
1325 item* temp = iterator;
1326 iterator = iterator->next;
1327 Safefree(temp);
1328 }
1329
1330 head = NULL;
1331}
1332
1333/* End of Dictionary Stuff */
1334
1335/* All calculations/work are done here */
1336STATIC int
1337S_edit_distance(const UV* src,
1338 const UV* tgt,
1339 const STRLEN x, /* length of src[] */
1340 const STRLEN y, /* length of tgt[] */
1341 const SSize_t maxDistance
1342)
1343{
1344 item *head = NULL;
88f063b4 1345 UV swapCount, swapScore, targetCharCount, i, j;
c6871b76
KW
1346 UV *scores;
1347 UV score_ceil = x + y;
1348
1349 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1350
1351 /* intialize matrix start values */
d09f14bf 1352 Newx(scores, ( (x + 2) * (y + 2)), UV);
c6871b76
KW
1353 scores[0] = score_ceil;
1354 scores[1 * (y + 2) + 0] = score_ceil;
1355 scores[0 * (y + 2) + 1] = score_ceil;
1356 scores[1 * (y + 2) + 1] = 0;
88f063b4 1357 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
c6871b76
KW
1358
1359 /* work loops */
1360 /* i = src index */
1361 /* j = tgt index */
1362 for (i=1;i<=x;i++) {
1363 if (i < x)
88f063b4 1364 head = uniquePush(head, src[i]);
c6871b76
KW
1365 scores[(i+1) * (y + 2) + 1] = i;
1366 scores[(i+1) * (y + 2) + 0] = score_ceil;
1367 swapCount = 0;
1368
1369 for (j=1;j<=y;j++) {
1370 if (i == 1) {
1371 if(j < y)
88f063b4 1372 head = uniquePush(head, tgt[j]);
c6871b76
KW
1373 scores[1 * (y + 2) + (j + 1)] = j;
1374 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1375 }
1376
88f063b4 1377 targetCharCount = find(head, tgt[j-1])->value;
c6871b76
KW
1378 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1379
1380 if (src[i-1] != tgt[j-1]){
1381 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1382 }
1383 else {
1384 swapCount = j;
1385 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1386 }
1387 }
1388
88f063b4 1389 find(head, src[i-1])->value = i;
c6871b76
KW
1390 }
1391
1392 {
1393 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1394 dict_free(head);
1395 Safefree(scores);
1396 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1397 }
1398}
1399
1400/* END of edit_distance() stuff
1401 * ========================================================= */
1402
8e35b056
KW
1403/* is c a control character for which we have a mnemonic? */
1404#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1405
549b4e78
KW
1406STATIC const char *
1407S_cntrl_to_mnemonic(const U8 c)
1408{
1409 /* Returns the mnemonic string that represents character 'c', if one
1410 * exists; NULL otherwise. The only ones that exist for the purposes of
1411 * this routine are a few control characters */
1412
1413 switch (c) {
1414 case '\a': return "\\a";
1415 case '\b': return "\\b";
1416 case ESC_NATIVE: return "\\e";
1417 case '\f': return "\\f";
1418 case '\n': return "\\n";
1419 case '\r': return "\\r";
1420 case '\t': return "\\t";
1421 }
1422
1423 return NULL;
1424}
1425
653099ff 1426/* Mark that we cannot extend a found fixed substring at this point.
37b6262f 1427 Update the longest found anchored substring or the longest found
653099ff
GS
1428 floating substrings if needed. */
1429
4327152a 1430STATIC void
ea3daa5d
FC
1431S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1432 SSize_t *minlenp, int is_inf)
c277df42 1433{
e1ec3a88 1434 const STRLEN l = CHR_SVLEN(data->last_found);
2099df82 1435 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
a2ca2017 1436 const STRLEN old_l = CHR_SVLEN(longest_sv);
1de06328 1437 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1438
7918f24d
NC
1439 PERL_ARGS_ASSERT_SCAN_COMMIT;
1440
c277df42 1441 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
2099df82 1442 const U8 i = data->cur_is_floating;
a2ca2017 1443 SvSetMagicSV(longest_sv, data->last_found);
37b6262f
DM
1444 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1445
2099df82
DM
1446 if (!i) /* fixed */
1447 data->substrs[0].max_offset = data->substrs[0].min_offset;
1448 else { /* float */
1449 data->substrs[1].max_offset = (l
646e8787
DM
1450 ? data->last_start_max
1451 : (data->pos_delta > SSize_t_MAX - data->pos_min
ea3daa5d
FC
1452 ? SSize_t_MAX
1453 : data->pos_min + data->pos_delta));
1454 if (is_inf
2099df82
DM
1455 || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1456 data->substrs[1].max_offset = SSize_t_MAX;
37b6262f 1457 }
11683ecb 1458
37b6262f
DM
1459 if (data->flags & SF_BEFORE_EOL)
1460 data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
1461 else
1462 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1463 data->substrs[i].minlenp = minlenp;
1464 data->substrs[i].lookbehind = 0;
c277df42 1465 }
37b6262f 1466
c277df42 1467 SvCUR_set(data->last_found, 0);
0eda9292 1468 {
a28509cc 1469 SV * const sv = data->last_found;
097eb12c
AL
1470 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1471 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1472 if (mg)
1473 mg->mg_len = 0;
1474 }
0eda9292 1475 }
c277df42
IZ
1476 data->last_end = -1;
1477 data->flags &= ~SF_BEFORE_EOL;
f5a36d78 1478 DEBUG_STUDYDATA("commit", data, 0, is_inf);
c277df42
IZ
1479}
1480
cdd87c1d
KW
1481/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1482 * list that describes which code points it matches */
1483
653099ff 1484STATIC void
3420edd7 1485S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 1486{
cdd87c1d
KW
1487 /* Set the SSC 'ssc' to match an empty string or any code point */
1488
557bd3fb 1489 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 1490
71068078 1491 assert(is_ANYOF_SYNTHETIC(ssc));
3fffb88a 1492
0854ea0b
KW
1493 /* mortalize so won't leak */
1494 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
93e92956 1495 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
653099ff
GS
1496}
1497
653099ff 1498STATIC int
dc3bf405 1499S_ssc_is_anything(const regnode_ssc *ssc)
653099ff 1500{
c144baaa
KW
1501 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1502 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1503 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1504 * in any way, so there's no point in using it */
cdd87c1d
KW
1505
1506 UV start, end;
1507 bool ret;
653099ff 1508
557bd3fb 1509 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 1510
71068078 1511 assert(is_ANYOF_SYNTHETIC(ssc));
cdd87c1d 1512
93e92956 1513 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
cdd87c1d
KW
1514 return FALSE;
1515 }
1516
1517 /* See if the list consists solely of the range 0 - Infinity */
1518 invlist_iterinit(ssc->invlist);
1519 ret = invlist_iternext(ssc->invlist, &start, &end)
1520 && start == 0
1521 && end == UV_MAX;
1522
1523 invlist_iterfinish(ssc->invlist);
1524
1525 if (ret) {
1526 return TRUE;
1527 }
1528
1529 /* If e.g., both \w and \W are set, matches everything */
e0e1be5f 1530 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1531 int i;
1532 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1533 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1534 return TRUE;
1535 }
1536 }
1537 }
1538
1539 return FALSE;
653099ff
GS
1540}
1541
653099ff 1542STATIC void
cdd87c1d 1543S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 1544{
cdd87c1d
KW
1545 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1546 * string, any code point, or any posix class under locale */
1547
557bd3fb 1548 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 1549
557bd3fb 1550 Zero(ssc, 1, regnode_ssc);
71068078 1551 set_ANYOF_SYNTHETIC(ssc);
93e92956 1552 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
3420edd7 1553 ssc_anything(ssc);
cdd87c1d 1554
2f306ab9
KW
1555 /* If any portion of the regex is to operate under locale rules that aren't
1556 * fully known at compile time, initialization includes it. The reason
1557 * this isn't done for all regexes is that the optimizer was written under
1558 * the assumption that locale was all-or-nothing. Given the complexity and
1559 * lack of documentation in the optimizer, and that there are inadequate
1560 * test cases for locale, many parts of it may not work properly, it is
1561 * safest to avoid locale unless necessary. */
cdd87c1d
KW
1562 if (RExC_contains_locale) {
1563 ANYOF_POSIXL_SETALL(ssc);
cdd87c1d
KW
1564 }
1565 else {
1566 ANYOF_POSIXL_ZERO(ssc);
1567 }
653099ff
GS
1568}
1569
b423522f 1570STATIC int
dc3bf405
BF
1571S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1572 const regnode_ssc *ssc)
b423522f
KW
1573{
1574 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1575 * to the list of code points matched, and locale posix classes; hence does
1576 * not check its flags) */
1577
1578 UV start, end;
1579 bool ret;
1580
1581 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1582
71068078 1583 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1584
1585 invlist_iterinit(ssc->invlist);
1586 ret = invlist_iternext(ssc->invlist, &start, &end)
1587 && start == 0
1588 && end == UV_MAX;
1589
1590 invlist_iterfinish(ssc->invlist);
1591
1592 if (! ret) {
1593 return FALSE;
1594 }
1595
e0e1be5f 1596 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
31f05a37 1597 return FALSE;
b423522f
KW
1598 }
1599
1600 return TRUE;
1601}
1602
4ebed06a
KW
1603#define INVLIST_INDEX 0
1604#define ONLY_LOCALE_MATCHES_INDEX 1
1605#define DEFERRED_USER_DEFINED_INDEX 2
1606
b423522f
KW
1607STATIC SV*
1608S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
5c0f85ef 1609 const regnode_charclass* const node)
b423522f
KW
1610{
1611 /* Returns a mortal inversion list defining which code points are matched
1612 * by 'node', which is of type ANYOF. Handles complementing the result if
1613 * appropriate. If some code points aren't knowable at this time, the
31f05a37
KW
1614 * returned list must, and will, contain every code point that is a
1615 * possibility. */
b423522f 1616
1565c085 1617 dVAR;
e2506fa7 1618 SV* invlist = NULL;
1ee208c4 1619 SV* only_utf8_locale_invlist = NULL;
b423522f
KW
1620 unsigned int i;
1621 const U32 n = ARG(node);
31f05a37 1622 bool new_node_has_latin1 = FALSE;
2d5613be 1623 const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb))
29a889ef
KW
1624 ? 0
1625 : ANYOF_FLAGS(node);
b423522f
KW
1626
1627 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1628
1629 /* Look at the data structure created by S_set_ANYOF_arg() */
93e92956 1630 if (n != ANYOF_ONLY_HAS_BITMAP) {
b423522f
KW
1631 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1632 AV * const av = MUTABLE_AV(SvRV(rv));
1633 SV **const ary = AvARRAY(av);
1634 assert(RExC_rxi->data->what[n] == 's');
1635
4ebed06a 1636 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
b423522f 1637
6ed02bb6
KW
1638 /* Here there are things that won't be known until runtime -- we
1639 * have to assume it could be anything */
e2506fa7 1640 invlist = sv_2mortal(_new_invlist(1));
b423522f
KW
1641 return _add_range_to_invlist(invlist, 0, UV_MAX);
1642 }
4ebed06a 1643 else if (ary[INVLIST_INDEX]) {
b423522f 1644
6ed02bb6 1645 /* Use the node's inversion list */
4ebed06a 1646 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1ee208c4
KW
1647 }
1648
1649 /* Get the code points valid only under UTF-8 locales */
766d6d33 1650 if ( (flags & ANYOFL_FOLD)
4ebed06a 1651 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1ee208c4 1652 {
4ebed06a 1653 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
b423522f
KW
1654 }
1655 }
1656
e2506fa7
KW
1657 if (! invlist) {
1658 invlist = sv_2mortal(_new_invlist(0));
1659 }
1660
dcb20b36
KW
1661 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1662 * code points, and an inversion list for the others, but if there are code
1663 * points that should match only conditionally on the target string being
1664 * UTF-8, those are placed in the inversion list, and not the bitmap.
1665 * Since there are circumstances under which they could match, they are
1666 * included in the SSC. But if the ANYOF node is to be inverted, we have
1667 * to exclude them here, so that when we invert below, the end result
1668 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1669 * have to do this here before we add the unconditionally matched code
1670 * points */
766d6d33 1671 if (flags & ANYOF_INVERT) {
b423522f
KW
1672 _invlist_intersection_complement_2nd(invlist,
1673 PL_UpperLatin1,
1674 &invlist);
1675 }
1676
1677 /* Add in the points from the bit map */
2d5613be 1678 if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) {
90973738
KW
1679 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1680 if (ANYOF_BITMAP_TEST(node, i)) {
1681 unsigned int start = i++;
1682
1683 for (; i < NUM_ANYOF_CODE_POINTS
1684 && ANYOF_BITMAP_TEST(node, i); ++i)
1685 {
1686 /* empty */
1687 }
1688 invlist = _add_range_to_invlist(invlist, start, i-1);
1689 new_node_has_latin1 = TRUE;
6f8848d5 1690 }
b423522f
KW
1691 }
1692 }
1693
1694 /* If this can match all upper Latin1 code points, have to add them
ac33c516
KW
1695 * as well. But don't add them if inverting, as when that gets done below,
1696 * it would exclude all these characters, including the ones it shouldn't
1697 * that were added just above */
766d6d33
KW
1698 if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
1699 && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
f240c685 1700 {
b423522f
KW
1701 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1702 }
1703
1704 /* Similarly for these */
766d6d33 1705 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
e0a1ff7a 1706 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
b423522f
KW
1707 }
1708
766d6d33 1709 if (flags & ANYOF_INVERT) {
b423522f
KW
1710 _invlist_invert(invlist);
1711 }
766d6d33 1712 else if (flags & ANYOFL_FOLD) {
35b8412f 1713 if (new_node_has_latin1) {
31f05a37 1714
26be5fe6
KW
1715 /* Under /li, any 0-255 could fold to any other 0-255, depending on
1716 * the locale. We can skip this if there are no 0-255 at all. */
1717 _invlist_union(invlist, PL_Latin1, &invlist);
35b8412f
KW
1718
1719 invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1720 invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1721 }
1722 else {
1723 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1724 invlist = add_cp_to_invlist(invlist, 'I');
1725 }
1726 if (_invlist_contains_cp(invlist,
1727 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1728 {
1729 invlist = add_cp_to_invlist(invlist, 'i');
1730 }
1731 }
31f05a37
KW
1732 }
1733
1ee208c4
KW
1734 /* Similarly add the UTF-8 locale possible matches. These have to be
1735 * deferred until after the non-UTF-8 locale ones are taken care of just
1736 * above, or it leads to wrong results under ANYOF_INVERT */
1737 if (only_utf8_locale_invlist) {
31f05a37 1738 _invlist_union_maybe_complement_2nd(invlist,
1ee208c4 1739 only_utf8_locale_invlist,
766d6d33 1740 flags & ANYOF_INVERT,
31f05a37
KW
1741 &invlist);
1742 }
b423522f
KW
1743
1744 return invlist;
1745}
1746
1051e1c4 1747/* These two functions currently do the exact same thing */
557bd3fb 1748#define ssc_init_zero ssc_init
653099ff 1749
cdd87c1d
KW
1750#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1751#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1752
557bd3fb 1753/* 'AND' a given class with another one. Can create false positives. 'ssc'
93e92956
KW
1754 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1755 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1756
653099ff 1757STATIC void
b423522f 1758S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1759 const regnode_charclass *and_with)
653099ff 1760{
cdd87c1d
KW
1761 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1762 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1763
a0dd4231 1764 SV* anded_cp_list;
2d5613be 1765 U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb)
29a889ef
KW
1766 ? 0
1767 : ANYOF_FLAGS(and_with);
a0dd4231 1768 U8 anded_flags;
1e6ade67 1769
cdd87c1d 1770 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1771
71068078 1772 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1773
1774 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1775 * the code point inversion list and just the relevant flags */
71068078 1776 if (is_ANYOF_SYNTHETIC(and_with)) {
7dcac5f6 1777 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
765e6ecf 1778 anded_flags = and_with_flags;
e9b08962
KW
1779
1780 /* XXX This is a kludge around what appears to be deficiencies in the
1781 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1782 * there are paths through the optimizer where it doesn't get weeded
1783 * out when it should. And if we don't make some extra provision for
1784 * it like the code just below, it doesn't get added when it should.
1785 * This solution is to add it only when AND'ing, which is here, and
1786 * only when what is being AND'ed is the pristine, original node
1787 * matching anything. Thus it is like adding it to ssc_anything() but
1788 * only when the result is to be AND'ed. Probably the same solution
1789 * could be adopted for the same problem we have with /l matching,
1790 * which is solved differently in S_ssc_init(), and that would lead to
1791 * fewer false positives than that solution has. But if this solution
1792 * creates bugs, the consequences are only that a warning isn't raised
1793 * that should be; while the consequences for having /l bugs is
1794 * incorrect matches */
7dcac5f6 1795 if (ssc_is_anything((regnode_ssc *)and_with)) {
f240c685 1796 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
e9b08962 1797 }
a0dd4231
KW
1798 }
1799 else {
5c0f85ef 1800 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
f240c685 1801 if (OP(and_with) == ANYOFD) {
765e6ecf 1802 anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
f240c685
KW
1803 }
1804 else {
765e6ecf 1805 anded_flags = and_with_flags
f240c685 1806 &( ANYOF_COMMON_FLAGS
108316fb
KW
1807 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1808 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
765e6ecf 1809 if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
d1c40ef5
KW
1810 anded_flags &=
1811 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1812 }
f240c685 1813 }
a0dd4231
KW
1814 }
1815
1816 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1817
1818 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1819 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1820 * 'and_with' may be inverted. When not inverted, we have the situation of
1821 * computing:
1822 * (C1 | P1) & (C2 | P2)
1823 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1824 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1825 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1826 * <= ((C1 & C2) | P1 | P2)
1827 * Alternatively, the last few steps could be:
1828 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1829 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1830 * <= (C1 | C2 | (P1 & P2))
1831 * We favor the second approach if either P1 or P2 is non-empty. This is
1832 * because these components are a barrier to doing optimizations, as what
1833 * they match cannot be known until the moment of matching as they are
1834 * dependent on the current locale, 'AND"ing them likely will reduce or
1835 * eliminate them.
1836 * But we can do better if we know that C1,P1 are in their initial state (a
1837 * frequent occurrence), each matching everything:
1838 * (<everything>) & (C2 | P2) = C2 | P2
1839 * Similarly, if C2,P2 are in their initial state (again a frequent
1840 * occurrence), the result is a no-op
1841 * (C1 | P1) & (<everything>) = C1 | P1
1842 *
1843 * Inverted, we have
1844 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1845 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1846 * <= (C1 & ~C2) | (P1 & ~P2)
1847 * */
1aa99e6b 1848
765e6ecf 1849 if ((and_with_flags & ANYOF_INVERT)
71068078 1850 && ! is_ANYOF_SYNTHETIC(and_with))
a0dd4231 1851 {
cdd87c1d 1852 unsigned int i;
8951c461 1853
cdd87c1d
KW
1854 ssc_intersection(ssc,
1855 anded_cp_list,
1856 FALSE /* Has already been inverted */
1857 );
c6b76537 1858
cdd87c1d
KW
1859 /* If either P1 or P2 is empty, the intersection will be also; can skip
1860 * the loop */
765e6ecf 1861 if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
cdd87c1d
KW
1862 ANYOF_POSIXL_ZERO(ssc);
1863 }
e0e1be5f 1864 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1865
1866 /* Note that the Posix class component P from 'and_with' actually
1867 * looks like:
1868 * P = Pa | Pb | ... | Pn
1869 * where each component is one posix class, such as in [\w\s].
1870 * Thus
1871 * ~P = ~(Pa | Pb | ... | Pn)
1872 * = ~Pa & ~Pb & ... & ~Pn
1873 * <= ~Pa | ~Pb | ... | ~Pn
1874 * The last is something we can easily calculate, but unfortunately
1875 * is likely to have many false positives. We could do better
1876 * in some (but certainly not all) instances if two classes in
1877 * P have known relationships. For example
1878 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1879 * So
1880 * :lower: & :print: = :lower:
1881 * And similarly for classes that must be disjoint. For example,
1882 * since \s and \w can have no elements in common based on rules in
1883 * the POSIX standard,
1884 * \w & ^\S = nothing
1885 * Unfortunately, some vendor locales do not meet the Posix
1886 * standard, in particular almost everything by Microsoft.
1887 * The loop below just changes e.g., \w into \W and vice versa */
1888
1ee208c4 1889 regnode_charclass_posixl temp;
cdd87c1d
KW
1890 int add = 1; /* To calculate the index of the complement */
1891
b1234259 1892 Zero(&temp, 1, regnode_charclass_posixl);
cdd87c1d
KW
1893 ANYOF_POSIXL_ZERO(&temp);
1894 for (i = 0; i < ANYOF_MAX; i++) {
1895 assert(i % 2 != 0
7dcac5f6
KW
1896 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1897 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
cdd87c1d 1898
7dcac5f6 1899 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
cdd87c1d
KW
1900 ANYOF_POSIXL_SET(&temp, i + add);
1901 }
1902 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1903 }
1904 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1905
cdd87c1d
KW
1906 } /* else ssc already has no posixes */
1907 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1908 in its initial state */
71068078 1909 else if (! is_ANYOF_SYNTHETIC(and_with)
7dcac5f6 1910 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
cdd87c1d
KW
1911 {
1912 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1913 * copy it over 'ssc' */
1914 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
71068078 1915 if (is_ANYOF_SYNTHETIC(and_with)) {
cdd87c1d
KW
1916 StructCopy(and_with, ssc, regnode_ssc);
1917 }
1918 else {
1919 ssc->invlist = anded_cp_list;
1920 ANYOF_POSIXL_ZERO(ssc);
765e6ecf 1921 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1922 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
cdd87c1d
KW
1923 }
1924 }
1925 }
e0e1be5f 1926 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
765e6ecf 1927 || (and_with_flags & ANYOF_MATCHES_POSIXL))
cdd87c1d
KW
1928 {
1929 /* One or the other of P1, P2 is non-empty. */
765e6ecf 1930 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1ea8b7fe
KW
1931 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1932 }
cdd87c1d
KW
1933 ssc_union(ssc, anded_cp_list, FALSE);
1934 }
1935 else { /* P1 = P2 = empty */
1936 ssc_intersection(ssc, anded_cp_list, FALSE);
1937 }
137165a6 1938 }
653099ff
GS
1939}
1940
653099ff 1941STATIC void
cdd87c1d 1942S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1943 const regnode_charclass *or_with)
653099ff 1944{
cdd87c1d
KW
1945 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1946 * another SSC or a regular ANYOF class. Can create false positives if
1947 * 'or_with' is to be inverted. */
7918f24d 1948
a0dd4231
KW
1949 SV* ored_cp_list;
1950 U8 ored_flags;
2d5613be 1951 U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb)
29a889ef
KW
1952 ? 0
1953 : ANYOF_FLAGS(or_with);
c6b76537 1954
cdd87c1d 1955 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 1956
71068078 1957 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1958
1959 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1960 * the code point inversion list and just the relevant flags */
71068078 1961 if (is_ANYOF_SYNTHETIC(or_with)) {
7dcac5f6 1962 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
765e6ecf 1963 ored_flags = or_with_flags;
a0dd4231
KW
1964 }
1965 else {
5c0f85ef 1966 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
765e6ecf 1967 ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
f240c685
KW
1968 if (OP(or_with) != ANYOFD) {
1969 ored_flags
765e6ecf 1970 |= or_with_flags
108316fb
KW
1971 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1972 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
765e6ecf 1973 if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
d1c40ef5
KW
1974 ored_flags |=
1975 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1976 }
f240c685 1977 }
a0dd4231
KW
1978 }
1979
1980 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
1981
1982 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1983 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1984 * 'or_with' may be inverted. When not inverted, we have the simple
1985 * situation of computing:
1986 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1987 * If P1|P2 yields a situation with both a class and its complement are
1988 * set, like having both \w and \W, this matches all code points, and we
1989 * can delete these from the P component of the ssc going forward. XXX We
1990 * might be able to delete all the P components, but I (khw) am not certain
1991 * about this, and it is better to be safe.
1992 *
1993 * Inverted, we have
1994 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1995 * <= (C1 | P1) | ~C2
1996 * <= (C1 | ~C2) | P1
1997 * (which results in actually simpler code than the non-inverted case)
1998 * */
9826f543 1999
765e6ecf 2000 if ((or_with_flags & ANYOF_INVERT)
71068078 2001 && ! is_ANYOF_SYNTHETIC(or_with))
a0dd4231 2002 {
cdd87c1d 2003 /* We ignore P2, leaving P1 going forward */
1ea8b7fe 2004 } /* else Not inverted */
765e6ecf 2005 else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
7dcac5f6 2006 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
e0e1be5f 2007 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
2008 unsigned int i;
2009 for (i = 0; i < ANYOF_MAX; i += 2) {
2010 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
2011 {
2012 ssc_match_all_cp(ssc);
2013 ANYOF_POSIXL_CLEAR(ssc, i);
2014 ANYOF_POSIXL_CLEAR(ssc, i+1);
cdd87c1d
KW
2015 }
2016 }
2017 }
1aa99e6b 2018 }
cdd87c1d
KW
2019
2020 ssc_union(ssc,
2021 ored_cp_list,
2022 FALSE /* Already has been inverted */
2023 );
653099ff
GS
2024}
2025
b423522f
KW
2026PERL_STATIC_INLINE void
2027S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
2028{
2029 PERL_ARGS_ASSERT_SSC_UNION;
2030
71068078 2031 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2032
2033 _invlist_union_maybe_complement_2nd(ssc->invlist,
2034 invlist,
2035 invert2nd,
2036 &ssc->invlist);
2037}
2038
2039PERL_STATIC_INLINE void
2040S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2041 SV* const invlist,
2042 const bool invert2nd)
2043{
2044 PERL_ARGS_ASSERT_SSC_INTERSECTION;
2045
71068078 2046 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2047
2048 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2049 invlist,
2050 invert2nd,
2051 &ssc->invlist);
2052}
2053
2054PERL_STATIC_INLINE void
2055S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2056{
2057 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2058
71068078 2059 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2060
2061 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2062}
2063
2064PERL_STATIC_INLINE void
2065S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2066{
2067 /* AND just the single code point 'cp' into the SSC 'ssc' */
2068
2069 SV* cp_list = _new_invlist(2);
2070
2071 PERL_ARGS_ASSERT_SSC_CP_AND;
2072
71068078 2073 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2074
2075 cp_list = add_cp_to_invlist(cp_list, cp);
2076 ssc_intersection(ssc, cp_list,
2077 FALSE /* Not inverted */
2078 );
2079 SvREFCNT_dec_NN(cp_list);
2080}
2081
2082PERL_STATIC_INLINE void
dc3bf405 2083S_ssc_clear_locale(regnode_ssc *ssc)
b423522f
KW
2084{
2085 /* Set the SSC 'ssc' to not match any locale things */
b423522f
KW
2086 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2087
71068078 2088 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2089
2090 ANYOF_POSIXL_ZERO(ssc);
2091 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2092}
2093
b35552de
KW
2094#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2095
2096STATIC bool
2097S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2098{
2099 /* The synthetic start class is used to hopefully quickly winnow down
2100 * places where a pattern could start a match in the target string. If it
2101 * doesn't really narrow things down that much, there isn't much point to
2102 * having the overhead of using it. This function uses some very crude
2103 * heuristics to decide if to use the ssc or not.
2104 *
2105 * It returns TRUE if 'ssc' rules out more than half what it considers to
2106 * be the "likely" possible matches, but of course it doesn't know what the
2107 * actual things being matched are going to be; these are only guesses
2108 *
2109 * For /l matches, it assumes that the only likely matches are going to be
2110 * in the 0-255 range, uniformly distributed, so half of that is 127
2111 * For /a and /d matches, it assumes that the likely matches will be just
2112 * the ASCII range, so half of that is 63
2113 * For /u and there isn't anything matching above the Latin1 range, it
2114 * assumes that that is the only range likely to be matched, and uses
2115 * half that as the cut-off: 127. If anything matches above Latin1,
2116 * it assumes that all of Unicode could match (uniformly), except for
2117 * non-Unicode code points and things in the General Category "Other"
2118 * (unassigned, private use, surrogates, controls and formats). This
2119 * is a much large number. */
2120
b35552de
KW
2121 U32 count = 0; /* Running total of number of code points matched by
2122 'ssc' */
2123 UV start, end; /* Start and end points of current range in inversion
26be5fe6 2124 XXX outdated. UTF-8 locales are common, what about invert? list */
72400949
KW
2125 const U32 max_code_points = (LOC)
2126 ? 256
f6e8f31e 2127 : (( ! UNI_SEMANTICS
1a26cbcb 2128 || invlist_highest(ssc->invlist) < 256)
72400949
KW
2129 ? 128
2130 : NON_OTHER_COUNT);
2131 const U32 max_match = max_code_points / 2;
b35552de
KW
2132
2133 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2134
2135 invlist_iterinit(ssc->invlist);
2136 while (invlist_iternext(ssc->invlist, &start, &end)) {
72400949
KW
2137 if (start >= max_code_points) {
2138 break;
b35552de 2139 }
72400949 2140 end = MIN(end, max_code_points - 1);
b35552de 2141 count += end - start + 1;
72400949 2142 if (count >= max_match) {
b35552de
KW
2143 invlist_iterfinish(ssc->invlist);
2144 return FALSE;
2145 }
2146 }
2147
2148 return TRUE;
2149}
2150
2151
b423522f
KW
2152STATIC void
2153S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2154{
2155 /* The inversion list in the SSC is marked mortal; now we need a more
2156 * permanent copy, which is stored the same way that is done in a regular
dcb20b36
KW
2157 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2158 * map */
b423522f 2159
28118b9c 2160 SV* invlist = invlist_clone(ssc->invlist, NULL);
b423522f
KW
2161
2162 PERL_ARGS_ASSERT_SSC_FINALIZE;
2163
71068078 2164 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f 2165
a0dd4231 2166 /* The code in this file assumes that all but these flags aren't relevant
93e92956
KW
2167 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2168 * by the time we reach here */
f240c685
KW
2169 assert(! (ANYOF_FLAGS(ssc)
2170 & ~( ANYOF_COMMON_FLAGS
108316fb
KW
2171 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2172 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
a0dd4231 2173
b423522f
KW
2174 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2175
4c404f26 2176 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
21c3fd9d 2177 SvREFCNT_dec(invlist);
b423522f 2178
85c8e306
KW
2179 /* Make sure is clone-safe */
2180 ssc->invlist = NULL;
2181
e0e1be5f 2182 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
93e92956 2183 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
d156f5cb 2184 OP(ssc) = ANYOFPOSIXL;
e0e1be5f 2185 }
d156f5cb 2186 else if (RExC_contains_locale) {
b2e90ddf
KW
2187 OP(ssc) = ANYOFL;
2188 }
2189
1462525b 2190 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
b423522f
KW
2191}
2192
a3621e74
YO
2193#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2194#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2195#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
538e84ed
KW
2196#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2197 ? (TRIE_LIST_CUR( idx ) - 1) \
2198 : 0 )
a3621e74 2199
3dab1dad
YO
2200
2201#ifdef DEBUGGING
07be1b83 2202/*
2b8b4781
NC
2203 dump_trie(trie,widecharmap,revcharmap)
2204 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2205 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
2206
2207 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
2208 The _interim_ variants are used for debugging the interim
2209 tables that are used to generate the final compressed
2210 representation which is what dump_trie expects.
2211
486ec47a 2212 Part of the reason for their existence is to provide a form
3dab1dad 2213 of documentation as to how the different representations function.
07be1b83
YO
2214
2215*/
3dab1dad
YO
2216
2217/*
3dab1dad
YO
2218 Dumps the final compressed table form of the trie to Perl_debug_log.
2219 Used for debugging make_trie().
2220*/
b9a59e08 2221
3dab1dad 2222STATIC void
2b8b4781
NC
2223S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2224 AV *revcharmap, U32 depth)
3dab1dad
YO
2225{
2226 U32 state;
ab3bbdeb 2227 SV *sv=sv_newmortal();
55eed653 2228 int colwidth= widecharmap ? 6 : 4;
2e64971a 2229 U16 word;
3dab1dad
YO
2230 GET_RE_DEBUG_FLAGS_DECL;
2231
7918f24d 2232 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 2233
6ad9a8ab 2234 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
1e37780e 2235 depth+1, "Match","Base","Ofs" );
3dab1dad
YO
2236
2237 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 2238 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 2239 if ( tmp ) {
6ad9a8ab 2240 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2241 colwidth,
538e84ed 2242 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2243 PL_colors[0], PL_colors[1],
2244 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2245 PERL_PV_ESCAPE_FIRSTCHAR
2246 )
ab3bbdeb 2247 );
3dab1dad
YO
2248 }
2249 }
1e37780e
YO
2250 Perl_re_printf( aTHX_ "\n");
2251 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
3dab1dad
YO
2252
2253 for( state = 0 ; state < trie->uniquecharcount ; state++ )
6ad9a8ab
YO
2254 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2255 Perl_re_printf( aTHX_ "\n");
3dab1dad 2256
1e2e3d02 2257 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 2258 const U32 base = trie->states[ state ].trans.base;
3dab1dad 2259
147e3846 2260 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
3dab1dad
YO
2261
2262 if ( trie->states[ state ].wordnum ) {
1e37780e 2263 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
3dab1dad 2264 } else {
6ad9a8ab 2265 Perl_re_printf( aTHX_ "%6s", "" );
3dab1dad
YO
2266 }
2267
147e3846 2268 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
3dab1dad
YO
2269
2270 if ( base ) {
2271 U32 ofs = 0;
2272
2273 while( ( base + ofs < trie->uniquecharcount ) ||
2274 ( base + ofs - trie->uniquecharcount < trie->lasttrans
538e84ed
KW
2275 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2276 != state))
3dab1dad
YO
2277 ofs++;
2278
147e3846 2279 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
3dab1dad
YO
2280
2281 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
538e84ed
KW
2282 if ( ( base + ofs >= trie->uniquecharcount )
2283 && ( base + ofs - trie->uniquecharcount
2284 < trie->lasttrans )
2285 && trie->trans[ base + ofs
2286 - trie->uniquecharcount ].check == state )
3dab1dad 2287 {
147e3846 2288 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
1e37780e
YO
2289 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2290 );
3dab1dad 2291 } else {
88f063b4 2292 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
3dab1dad
YO
2293 }
2294 }
2295
6ad9a8ab 2296 Perl_re_printf( aTHX_ "]");
3dab1dad
YO
2297
2298 }
6ad9a8ab 2299 Perl_re_printf( aTHX_ "\n" );
3dab1dad 2300 }
6ad9a8ab 2301 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
cb41e5d6 2302 depth);
2e64971a 2303 for (word=1; word <= trie->wordcount; word++) {
6ad9a8ab 2304 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2e64971a
DM
2305 (int)word, (int)(trie->wordinfo[word].prev),
2306 (int)(trie->wordinfo[word].len));
2307 }
6ad9a8ab 2308 Perl_re_printf( aTHX_ "\n" );
538e84ed 2309}
3dab1dad 2310/*
3dab1dad 2311 Dumps a fully constructed but uncompressed trie in list form.
538e84ed 2312 List tries normally only are used for construction when the number of
3dab1dad
YO
2313 possible chars (trie->uniquecharcount) is very high.
2314 Used for debugging make_trie().
2315*/
2316STATIC void
55eed653 2317S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2318 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2319 U32 depth)
3dab1dad
YO
2320{
2321 U32 state;
ab3bbdeb 2322 SV *sv=sv_newmortal();
55eed653 2323 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2324 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2325
2326 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2327
3dab1dad 2328 /* print out the table precompression. */
6ad9a8ab 2329 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
cb41e5d6 2330 depth+1 );
6ad9a8ab 2331 Perl_re_indentf( aTHX_ "%s",
cb41e5d6 2332 depth+1, "------:-----+-----------------\n" );
538e84ed 2333
3dab1dad
YO
2334 for( state=1 ; state < next_alloc ; state ++ ) {
2335 U16 charid;
538e84ed 2336
147e3846 2337 Perl_re_indentf( aTHX_ " %4" UVXf " :",
cb41e5d6 2338 depth+1, (UV)state );
3dab1dad 2339 if ( ! trie->states[ state ].wordnum ) {
6ad9a8ab 2340 Perl_re_printf( aTHX_ "%5s| ","");
3dab1dad 2341 } else {
6ad9a8ab 2342 Perl_re_printf( aTHX_ "W%4x| ",
3dab1dad
YO
2343 trie->states[ state ].wordnum
2344 );
2345 }
2346 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
538e84ed 2347 SV ** const tmp = av_fetch( revcharmap,
88f063b4 2348 TRIE_LIST_ITEM(state, charid).forid, 0);
ab3bbdeb 2349 if ( tmp ) {
147e3846 2350 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
ab3bbdeb 2351 colwidth,
538e84ed
KW
2352 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2353 colwidth,
2354 PL_colors[0], PL_colors[1],
2355 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2356 | PERL_PV_ESCAPE_FIRSTCHAR
ab3bbdeb 2357 ) ,
88f063b4
KW
2358 TRIE_LIST_ITEM(state, charid).forid,
2359 (UV)TRIE_LIST_ITEM(state, charid).newstate
1e2e3d02 2360 );
538e84ed 2361 if (!(charid % 10))
6ad9a8ab 2362 Perl_re_printf( aTHX_ "\n%*s| ",
664e119d 2363 (int)((depth * 2) + 14), "");
1e2e3d02 2364 }
ab3bbdeb 2365 }
6ad9a8ab 2366 Perl_re_printf( aTHX_ "\n");
3dab1dad 2367 }
538e84ed 2368}
3dab1dad
YO
2369
2370/*
3dab1dad 2371 Dumps a fully constructed but uncompressed trie in table form.
538e84ed
KW
2372 This is the normal DFA style state transition table, with a few
2373 twists to facilitate compression later.
3dab1dad
YO
2374 Used for debugging make_trie().
2375*/
2376STATIC void
55eed653 2377S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2378 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2379 U32 depth)
3dab1dad
YO
2380{
2381 U32 state;
2382 U16 charid;
ab3bbdeb 2383 SV *sv=sv_newmortal();
55eed653 2384 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2385 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2386
2387 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
538e84ed 2388
3dab1dad
YO
2389 /*
2390 print out the table precompression so that we can do a visual check
2391 that they are identical.
2392 */
538e84ed 2393
6ad9a8ab 2394 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
3dab1dad
YO
2395
2396 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 2397 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 2398 if ( tmp ) {
6ad9a8ab 2399 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2400 colwidth,
538e84ed 2401 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2402 PL_colors[0], PL_colors[1],
2403 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2404 PERL_PV_ESCAPE_FIRSTCHAR
2405 )
ab3bbdeb 2406 );
3dab1dad
YO
2407 }
2408 }
2409
4aaafc03
YO
2410 Perl_re_printf( aTHX_ "\n");
2411 Perl_re_indentf( aTHX_ "State+-", depth+1 );
3dab1dad
YO
2412
2413 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
6ad9a8ab 2414 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
3dab1dad
YO
2415 }
2416
6ad9a8ab 2417 Perl_re_printf( aTHX_ "\n" );
3dab1dad
YO
2418
2419 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2420
147e3846 2421 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
cb41e5d6 2422 depth+1,
3dab1dad
YO
2423 (UV)TRIE_NODENUM( state ) );
2424
2425 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
2426 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2427 if (v)
147e3846 2428 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
ab3bbdeb 2429 else
6ad9a8ab 2430 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
3dab1dad
YO
2431 }
2432 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
147e3846 2433 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
538e84ed 2434 (UV)trie->trans[ state ].check );
3dab1dad 2435 } else {
147e3846 2436 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
538e84ed 2437 (UV)trie->trans[ state ].check,
3dab1dad
YO
2438 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2439 }
2440 }
07be1b83 2441}
3dab1dad
YO
2442
2443#endif
2444
2e64971a 2445
786e8c11
YO
2446/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2447 startbranch: the first branch in the whole branch sequence
2448 first : start branch of sequence of branch-exact nodes.
2449 May be the same as startbranch
2450 last : Thing following the last branch.
2451 May be the same as tail.
2452 tail : item following the branch sequence
2453 count : words in the sequence
a4525e78 2454 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
786e8c11 2455 depth : indent depth
3dab1dad 2456
786e8c11 2457Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 2458
786e8c11
YO
2459A trie is an N'ary tree where the branches are determined by digital
2460decomposition of the key. IE, at the root node you look up the 1st character and
2461follow that branch repeat until you find the end of the branches. Nodes can be
2462marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 2463
786e8c11 2464 /he|she|his|hers/
72f13be8 2465
786e8c11
YO
2466would convert into the following structure. Numbers represent states, letters
2467following numbers represent valid transitions on the letter from that state, if
2468the number is in square brackets it represents an accepting state, otherwise it
2469will be in parenthesis.
07be1b83 2470
786e8c11
YO
2471 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2472 | |
2473 | (2)
2474 | |
2475 (1) +-i->(6)-+-s->[7]
2476 |
2477 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 2478
786e8c11
YO
2479 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2480
2481This shows that when matching against the string 'hers' we will begin at state 1
2482read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2483then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2484is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2485single traverse. We store a mapping from accepting to state to which word was
2486matched, and then when we have multiple possibilities we try to complete the
b8fda935 2487rest of the regex in the order in which they occurred in the alternation.
786e8c11
YO
2488
2489The only prior NFA like behaviour that would be changed by the TRIE support is
2490the silent ignoring of duplicate alternations which are of the form:
2491
2492 / (DUPE|DUPE) X? (?{ ... }) Y /x
2493
4b714af6 2494Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 2495and without the optimisation. With the optimisations dupes will be silently
486ec47a 2496ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
2497the following demonstrates:
2498
2499 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2500
2501which prints out 'word' three times, but
2502
2503 'words'=~/(word|word|word)(?{ print $1 })S/
2504
2505which doesnt print it out at all. This is due to other optimisations kicking in.
2506
2507Example of what happens on a structural level:
2508
486ec47a 2509The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
2510
2511 1: CURLYM[1] {1,32767}(18)
2512 5: BRANCH(8)
2513 6: EXACT <ac>(16)
2514 8: BRANCH(11)
2515 9: EXACT <ad>(16)
2516 11: BRANCH(14)
2517 12: EXACT <ab>(16)
2518 16: SUCCEED(0)
2519 17: NOTHING(18)
2520 18: END(0)
2521
2522This would be optimizable with startbranch=5, first=5, last=16, tail=16
2523and should turn into:
2524
2525 1: CURLYM[1] {1,32767}(18)
2526 5: TRIE(16)
2527 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2528 <ac>
2529 <ad>
2530 <ab>
2531 16: SUCCEED(0)
2532 17: NOTHING(18)
2533 18: END(0)
2534
2535Cases where tail != last would be like /(?foo|bar)baz/:
2536
2537 1: BRANCH(4)
2538 2: EXACT <foo>(8)
2539 4: BRANCH(7)
2540 5: EXACT <bar>(8)
2541 7: TAIL(8)
2542 8: EXACT <baz>(10)
2543 10: END(0)
2544
2545which would be optimizable with startbranch=1, first=1, last=7, tail=8
2546and would end up looking like:
2547
2548 1: TRIE(8)
2549 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2550 <foo>
2551 <bar>
2552 7: TAIL(8)
2553 8: EXACT <baz>(10)
2554 10: END(0)
2555
c80e42f3 2556 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
2557
2558is the recommended Unicode-aware way of saying
2559
2560 *(d++) = uv;
2561*/
2562
fab2782b 2563#define TRIE_STORE_REVCHAR(val) \
786e8c11 2564 STMT_START { \
73031816 2565 if (UTF) { \
668fcfea 2566 SV *zlopp = newSV(UTF8_MAXBYTES); \
88c9ea1e 2567 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1d84a256
DM
2568 unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2569 *kapow = '\0'; \
73031816
NC
2570 SvCUR_set(zlopp, kapow - flrbbbbb); \
2571 SvPOK_on(zlopp); \
2572 SvUTF8_on(zlopp); \
2573 av_push(revcharmap, zlopp); \
2574 } else { \
fab2782b 2575 char ooooff = (char)val; \
73031816
NC
2576 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2577 } \
2578 } STMT_END
786e8c11 2579
914a25d5
KW
2580/* This gets the next character from the input, folding it if not already
2581 * folded. */
2582#define TRIE_READ_CHAR STMT_START { \
2583 wordlen++; \
2584 if ( UTF ) { \
2585 /* if it is UTF then it is either already folded, or does not need \
2586 * folding */ \
1c1d615a 2587 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
2588 } \
2589 else if (folder == PL_fold_latin1) { \
7d006b13
KW
2590 /* This folder implies Unicode rules, which in the range expressible \
2591 * by not UTF is the lower case, with the two exceptions, one of \
2592 * which should have been taken care of before calling this */ \
2593 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2594 uvc = toLOWER_L1(*uc); \
2595 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2596 len = 1; \
914a25d5
KW
2597 } else { \
2598 /* raw data, will be folded later if needed */ \
2599 uvc = (U32)*uc; \
2600 len = 1; \
2601 } \
786e8c11
YO
2602} STMT_END
2603
2604
2605
2606#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2607 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
00195859 2608 U32 ging = TRIE_LIST_LEN( state ) * 2; \
f9003953 2609 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
00195859 2610 TRIE_LIST_LEN( state ) = ging; \
786e8c11
YO
2611 } \
2612 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2613 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2614 TRIE_LIST_CUR( state )++; \
2615} STMT_END
07be1b83 2616
786e8c11 2617#define TRIE_LIST_NEW(state) STMT_START { \
d09f14bf 2618 Newx( trie->states[ state ].trans.list, \
786e8c11
YO
2619 4, reg_trie_trans_le ); \
2620 TRIE_LIST_CUR( state ) = 1; \
2621 TRIE_LIST_LEN( state ) = 4; \
2622} STMT_END
07be1b83 2623
786e8c11
YO
2624#define TRIE_HANDLE_WORD(state) STMT_START { \
2625 U16 dupe= trie->states[ state ].wordnum; \
2626 regnode * const noper_next = regnext( noper ); \
2627 \
786e8c11
YO
2628 DEBUG_r({ \
2629 /* store the word for dumping */ \
2630 SV* tmp; \
2631 if (OP(noper) != NOTHING) \
740cce10 2632 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 2633 else \
740cce10 2634 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 2635 av_push( trie_words, tmp ); \
786e8c11
YO
2636 }); \
2637 \
2638 curword++; \
2e64971a
DM
2639 trie->wordinfo[curword].prev = 0; \
2640 trie->wordinfo[curword].len = wordlen; \
2641 trie->wordinfo[curword].accept = state; \
786e8c11
YO
2642 \
2643 if ( noper_next < tail ) { \
2644 if (!trie->jump) \
538e84ed
KW
2645 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2646 sizeof(U16) ); \
7f69552c 2647 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
2648 if (!jumper) \
2649 jumper = noper_next; \
2650 if (!nextbranch) \
2651 nextbranch= regnext(cur); \
2652 } \
2653 \
2654 if ( dupe ) { \
2e64971a
DM
2655 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2656 /* chain, so that when the bits of chain are later */\
2657 /* linked together, the dups appear in the chain */\
2658 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2659 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
2660 } else { \
2661 /* we haven't inserted this word yet. */ \
2662 trie->states[ state ].wordnum = curword; \
2663 } \
2664} STMT_END
07be1b83 2665
3dab1dad 2666
786e8c11
YO
2667#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2668 ( ( base + charid >= ucharcount \
2669 && base + charid < ubound \
2670 && state == trie->trans[ base - ucharcount + charid ].check \
2671 && trie->trans[ base - ucharcount + charid ].next ) \
2672 ? trie->trans[ base - ucharcount + charid ].next \
2673 : ( state==1 ? special : 0 ) \
2674 )
3dab1dad 2675
8bcafbf4
YO
2676#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2677STMT_START { \
2678 TRIE_BITMAP_SET(trie, uvc); \
2679 /* store the folded codepoint */ \
2680 if ( folder ) \
2681 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2682 \
2683 if ( !UTF ) { \
2684 /* store first byte of utf8 representation of */ \
2685 /* variant codepoints */ \
2686 if (! UVCHR_IS_INVARIANT(uvc)) { \
2687 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2688 } \
2689 } \
2690} STMT_END
786e8c11
YO
2691#define MADE_TRIE 1
2692#define MADE_JUMP_TRIE 2
2693#define MADE_EXACT_TRIE 4
3dab1dad 2694
a3621e74 2695STATIC I32
538e84ed
KW
2696S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2697 regnode *first, regnode *last, regnode *tail,
2698 U32 word_count, U32 flags, U32 depth)
a3621e74
YO
2699{
2700 /* first pass, loop through and scan words */
2701 reg_trie_data *trie;
55eed653 2702 HV *widecharmap = NULL;
2b8b4781 2703 AV *revcharmap = newAV();
a3621e74 2704 regnode *cur;
a3621e74
YO
2705 STRLEN len = 0;
2706 UV uvc = 0;
2707 U16 curword = 0;
2708 U32 next_alloc = 0;
786e8c11
YO
2709 regnode *jumper = NULL;
2710 regnode *nextbranch = NULL;
7f69552c 2711 regnode *convert = NULL;
2e64971a 2712 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 2713 /* we just use folder as a flag in utf8 */
1e696034 2714 const U8 * folder = NULL;
a3621e74 2715
3a611511
YO
2716 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2717 * which stands for one trie structure, one hash, optionally followed
2718 * by two arrays */
2b8b4781 2719#ifdef DEBUGGING
3a611511 2720 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2b8b4781
NC
2721 AV *trie_words = NULL;
2722 /* along with revcharmap, this only used during construction but both are
2723 * useful during debugging so we store them in the struct when debugging.
8e11feef 2724 */
2b8b4781 2725#else
cf78de0b 2726 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 2727 STRLEN trie_charcount=0;
3dab1dad 2728#endif
2b8b4781 2729 SV *re_trie_maxbuff;
a3621e74 2730 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2731
2732 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
2733#ifndef DEBUGGING
2734 PERL_UNUSED_ARG(depth);
2735#endif
a3621e74 2736
1e696034 2737 switch (flags) {
3f2416ae 2738 case EXACT: case EXACT_REQ8: case EXACTL: break;
89829bb5 2739 case EXACTFAA:
0ea669f4 2740 case EXACTFUP:
a4525e78
KW
2741 case EXACTFU:
2742 case EXACTFLU8: folder = PL_fold_latin1; break;
1e696034 2743 case EXACTF: folder = PL_fold; break;
fab2782b 2744 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
2745 }
2746
c944940b 2747 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 2748 trie->refcount = 1;
3dab1dad 2749 trie->startstate = 1;
786e8c11 2750 trie->wordcount = word_count;
f8fc2ecf 2751 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 2752 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3f2416ae 2753 if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
c944940b 2754 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
2755 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2756 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2757
a3621e74 2758 DEBUG_r({
2b8b4781 2759 trie_words = newAV();
a3621e74 2760 });
a3621e74 2761
3b89859a 2762 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
316ebaf2 2763 assert(re_trie_maxbuff);
a3621e74 2764 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 2765 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 2766 }
df826430 2767 DEBUG_TRIE_COMPILE_r({
6ad9a8ab 2768 Perl_re_indentf( aTHX_
cb41e5d6
YO
2769 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2770 depth+1,
88f063b4 2771 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
538e84ed 2772 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
3dab1dad 2773 });
538e84ed 2774
7f69552c
YO
2775 /* Find the node we are going to overwrite */
2776 if ( first == startbranch && OP( last ) != BRANCH ) {
2777 /* whole branch chain */
2778 convert = first;
2779 } else {
2780 /* branch sub-chain */
2781 convert = NEXTOPER( first );
2782 }
538e84ed 2783
a3621e74
YO
2784 /* -- First loop and Setup --
2785
2786 We first traverse the branches and scan each word to determine if it
2787 contains widechars, and how many unique chars there are, this is
2788 important as we have to build a table with at least as many columns as we
2789 have unique chars.
2790
2791 We use an array of integers to represent the character codes 0..255
538e84ed
KW
2792 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2793 the native representation of the character value as the key and IV's for
2794 the coded index.
a3621e74
YO
2795
2796 *TODO* If we keep track of how many times each character is used we can
2797 remap the columns so that the table compression later on is more
3b753521 2798 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
2799 middle and the least common are on the outside. IMO this would be better
2800 than a most to least common mapping as theres a decent chance the most
2801 common letter will share a node with the least common, meaning the node
486ec47a 2802 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
2803 case is when we have the least common nodes twice.
2804
2805 */
2806
a3621e74 2807 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 2808 regnode *noper = NEXTOPER( cur );
944e05e3
YO
2809 const U8 *uc;
2810 const U8 *e;
bc031a7d 2811 int foldlen = 0;
07be1b83 2812 U32 wordlen = 0; /* required init */
bc031a7d
KW
2813 STRLEN minchars = 0;
2814 STRLEN maxchars = 0;
538e84ed
KW
2815 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2816 bitmap?*/
a3621e74 2817
3dab1dad 2818 if (OP(noper) == NOTHING) {
20ed8c88
YO
2819 /* skip past a NOTHING at the start of an alternation
2820 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
ca902fb8
YO
2821 *
2822 * If the next node is not something we are supposed to process
2823 * we will just ignore it due to the condition guarding the
2824 * next block.
20ed8c88 2825 */
ca902fb8 2826
df826430 2827 regnode *noper_next= regnext(noper);
944e05e3
YO
2828 if (noper_next < tail)
2829 noper= noper_next;
2830 }
2831
f6b4b99d
KW
2832 if ( noper < tail
2833 && ( OP(noper) == flags
3f2416ae
KW
2834 || (flags == EXACT && OP(noper) == EXACT_REQ8)
2835 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
0ea669f4 2836 || OP(noper) == EXACTFUP))))
f6b4b99d 2837 {
944e05e3
YO
2838 uc= (U8*)STRING(noper);
2839 e= uc + STR_LEN(noper);
2840 } else {
2841 trie->minlen= 0;
2842 continue;
3dab1dad 2843 }
df826430 2844
944e05e3 2845
fab2782b 2846 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2847 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2848 regardless of encoding */
0ea669f4 2849 if (OP( noper ) == EXACTFUP) {
fab2782b 2850 /* false positives are ok, so just set this */
0dc4a61d 2851 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2852 }
2853 }
dca5fc4c 2854
bc031a7d
KW
2855 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2856 branch */
3dab1dad 2857 TRIE_CHARCOUNT(trie)++;
a3621e74 2858 TRIE_READ_CHAR;
645de4ce 2859
bc031a7d
KW
2860 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2861 * is in effect. Under /i, this character can match itself, or
2862 * anything that folds to it. If not under /i, it can match just
2863 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2864 * all fold to k, and all are single characters. But some folds
2865 * expand to more than one character, so for example LATIN SMALL
2866 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2867 * the string beginning at 'uc' is 'ffi', it could be matched by
2868 * three characters, or just by the one ligature character. (It
2869 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2870 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2871 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2872 * match.) The trie needs to know the minimum and maximum number
2873 * of characters that could match so that it can use size alone to
2874 * quickly reject many match attempts. The max is simple: it is
2875 * the number of folded characters in this branch (since a fold is
2876 * never shorter than what folds to it. */
2877
2878 maxchars++;
2879
2880 /* And the min is equal to the max if not under /i (indicated by
2881 * 'folder' being NULL), or there are no multi-character folds. If
2882 * there is a multi-character fold, the min is incremented just
2883 * once, for the character that folds to the sequence. Each
2884 * character in the sequence needs to be added to the list below of
2885 * characters in the trie, but we count only the first towards the
2886 * min number of characters needed. This is done through the
2887 * variable 'foldlen', which is returned by the macros that look
2888 * for these sequences as the number of bytes the sequence
2889 * occupies. Each time through the loop, we decrement 'foldlen' by
2890 * how many bytes the current char occupies. Only when it reaches
2891 * 0 do we increment 'minchars' or look for another multi-character
2892 * sequence. */
2893 if (folder == NULL) {
2894 minchars++;
2895 }
2896 else if (foldlen > 0) {
2897 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
645de4ce
KW
2898 }
2899 else {
bc031a7d
KW
2900 minchars++;
2901
2902 /* See if *uc is the beginning of a multi-character fold. If
2903 * so, we decrement the length remaining to look at, to account
2904 * for the current character this iteration. (We can use 'uc'
2905 * instead of the fold returned by TRIE_READ_CHAR because for
2906 * non-UTF, the latin1_safe macro is smart enough to account
2907 * for all the unfolded characters, and because for UTF, the
2908 * string will already have been folded earlier in the
2909 * compilation process */
2910 if (UTF) {
2911 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2912 foldlen -= UTF8SKIP(uc);
645de4ce
KW
2913 }
2914 }
bc031a7d
KW
2915 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2916 foldlen--;
2917 }
645de4ce 2918 }
bc031a7d
KW
2919
2920 /* The current character (and any potential folds) should be added
2921 * to the possible matching characters for this position in this
2922 * branch */
a3621e74 2923 if ( uvc < 256 ) {
fab2782b
YO
2924 if ( folder ) {
2925 U8 folded= folder[ (U8) uvc ];
2926 if ( !trie->charmap[ folded ] ) {
2927 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2928 TRIE_STORE_REVCHAR( folded );
2929 }
2930 }
a3621e74
YO
2931 if ( !trie->charmap[ uvc ] ) {
2932 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2933 TRIE_STORE_REVCHAR( uvc );
a3621e74 2934 }
02daf0ab 2935 if ( set_bit ) {
62012aee
KW
2936 /* store the codepoint in the bitmap, and its folded
2937 * equivalent. */
8bcafbf4 2938 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
02daf0ab
YO
2939 set_bit = 0; /* We've done our bit :-) */
2940 }
a3621e74 2941 } else {
bc031a7d
KW
2942
2943 /* XXX We could come up with the list of code points that fold
2944 * to this using PL_utf8_foldclosures, except not for
2945 * multi-char folds, as there may be multiple combinations
2946 * there that could work, which needs to wait until runtime to
2947 * resolve (The comment about LIGATURE FFI above is such an
2948 * example */
2949
a3621e74 2950 SV** svpp;
55eed653
NC
2951 if ( !widecharmap )
2952 widecharmap = newHV();
a3621e74 2953
55eed653 2954 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
2955
2956 if ( !svpp )
147e3846 2957 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
a3621e74
YO
2958
2959 if ( !SvTRUE( *svpp ) ) {
2960 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 2961 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
2962 }
2963 }
bc031a7d
KW
2964 } /* end loop through characters in this branch of the trie */
2965
2966 /* We take the min and max for this branch and combine to find the min
2967 * and max for all branches processed so far */
3dab1dad 2968 if( cur == first ) {
bc031a7d
KW
2969 trie->minlen = minchars;
2970 trie->maxlen = maxchars;
2971 } else if (minchars < trie->minlen) {
2972 trie->minlen = minchars;
2973 } else if (maxchars > trie->maxlen) {
2974 trie->maxlen = maxchars;
fab2782b 2975 }
a3621e74
YO
2976 } /* end first pass */
2977 DEBUG_TRIE_COMPILE_r(
6ad9a8ab 2978 Perl_re_indentf( aTHX_
cb41e5d6
YO
2979 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2980 depth+1,
55eed653 2981 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
2982 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2983 (int)trie->minlen, (int)trie->maxlen )
a3621e74 2984 );
a3621e74
YO
2985
2986 /*
2987 We now know what we are dealing with in terms of unique chars and
2988 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
2989 representation using a flat table will take. If it's over a reasonable
2990 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
2991 conservative but potentially much slower representation using an array
2992 of lists.
2993
2994 At the end we convert both representations into the same compressed
2995 form that will be used in regexec.c for matching with. The latter
2996 is a form that cannot be used to construct with but has memory
2997 properties similar to the list form and access properties similar
2998 to the table form making it both suitable for fast searches and
2999 small enough that its feasable to store for the duration of a program.
3000
3001 See the comment in the code where the compressed table is produced
3002 inplace from the flat tabe representation for an explanation of how
3003 the compression works.
3004
3005 */
3006
3007
2e64971a
DM
3008 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
3009 prev_states[1] = 0;
3010
538e84ed
KW
3011 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
3012 > SvIV(re_trie_maxbuff) )
3013 {
a3621e74
YO
3014 /*
3015 Second Pass -- Array Of Lists Representation
3016
3017 Each state will be represented by a list of charid:state records
3018 (reg_trie_trans_le) the first such element holds the CUR and LEN
3019 points of the allocated array. (See defines above).
3020
3021 We build the initial structure using the lists, and then convert
3022 it into the compressed table form which allows faster lookups
3023 (but cant be modified once converted).
a3621e74
YO
3024 */
3025
a3621e74
YO
3026 STRLEN transcount = 1;
3027
6ad9a8ab 3028 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
cb41e5d6 3029 depth+1));
686b73d4 3030
c944940b
JH
3031 trie->states = (reg_trie_state *)
3032 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3033 sizeof(reg_trie_state) );
a3621e74
YO
3034 TRIE_LIST_NEW(1);
3035 next_alloc = 2;
3036
3037 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3038
df826430 3039 regnode *noper = NEXTOPER( cur );
c445ea15
AL
3040 U32 state = 1; /* required init */
3041 U16 charid = 0; /* sanity init */
07be1b83 3042 U32 wordlen = 0; /* required init */
c445ea15 3043
df826430
YO
3044 if (OP(noper) == NOTHING) {
3045 regnode *noper_next= regnext(noper);
944e05e3
YO
3046 if (noper_next < tail)
3047 noper= noper_next;
ca902fb8
YO
3048 /* we will undo this assignment if noper does not
3049 * point at a trieable type in the else clause of
3050 * the following statement. */
df826430
YO
3051 }
3052
f6b4b99d
KW
3053 if ( noper < tail
3054 && ( OP(noper) == flags
3f2416ae
KW
3055 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3056 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
0ea669f4 3057 || OP(noper) == EXACTFUP))))
f6b4b99d 3058 {
944e05e3
YO
3059 const U8 *uc= (U8*)STRING(noper);
3060 const U8 *e= uc + STR_LEN(noper);
3061
786e8c11 3062 for ( ; uc < e ; uc += len ) {
c445ea15 3063
786e8c11 3064 TRIE_READ_CHAR;
c445ea15 3065
786e8c11
YO
3066 if ( uvc < 256 ) {
3067 charid = trie->charmap[ uvc ];
c445ea15 3068 } else {
538e84ed
KW
3069 SV** const svpp = hv_fetch( widecharmap,
3070 (char*)&uvc,
3071 sizeof( UV ),
3072 0);
786e8c11
YO
3073 if ( !svpp ) {
3074 charid = 0;
3075 } else {
3076 charid=(U16)SvIV( *svpp );
3077 }
c445ea15 3078 }
538e84ed
KW
3079 /* charid is now 0 if we dont know the char read, or
3080 * nonzero if we do */
786e8c11 3081 if ( charid ) {
a3621e74 3082
786e8c11
YO
3083 U16 check;
3084 U32 newstate = 0;
a3621e74 3085
786e8c11
YO
3086 charid--;
3087 if ( !trie->states[ state ].trans.list ) {
3088 TRIE_LIST_NEW( state );
c445ea15 3089 }
538e84ed
KW
3090 for ( check = 1;
3091 check <= TRIE_LIST_USED( state );
3092 check++ )
3093 {
3094 if ( TRIE_LIST_ITEM( state, check ).forid
3095 == charid )
3096 {
786e8c11
YO
3097 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3098 break;
3099 }
3100 }
3101 if ( ! newstate ) {
3102 newstate = next_alloc++;
2e64971a 3103 prev_states[newstate] = state;
786e8c11
YO
3104 TRIE_LIST_PUSH( state, charid, newstate );
3105 transcount++;
3106 }
3107 state = newstate;
3108 } else {
147e3846 3109 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
c445ea15 3110 }
a28509cc 3111 }
ca902fb8
YO
3112 } else {
3113 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3114 * on a trieable type. So we need to reset noper back to point at the first regop
3115 * in the branch before we call TRIE_HANDLE_WORD()
3116 */
3117 noper= NEXTOPER(cur);
3118 }
3dab1dad 3119 TRIE_HANDLE_WORD(state);
a3621e74
YO
3120
3121 } /* end second pass */
3122
1e2e3d02 3123 /* next alloc is the NEXT state to be allocated */
538e84ed 3124 trie->statecount = next_alloc;
c944940b
JH
3125 trie->states = (reg_trie_state *)
3126 PerlMemShared_realloc( trie->states,
3127 next_alloc
3128 * sizeof(reg_trie_state) );
a3621e74 3129
3dab1dad 3130 /* and now dump it out before we compress it */
2b8b4781
NC
3131 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3132 revcharmap, next_alloc,
3133 depth+1)
1e2e3d02 3134 );
a3621e74 3135
c944940b
JH
3136 trie->trans = (reg_trie_trans *)
3137 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
3138 {
3139 U32 state;
a3621e74
YO
3140 U32 tp = 0;
3141 U32 zp = 0;
3142
3143
3144 for( state=1 ; state < next_alloc ; state ++ ) {
3145 U32 base=0;
3146
3147 /*
3148 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3149 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
a3621e74
YO
3150 );
3151 */
3152
3153 if (trie->states[state].trans.list) {
3154 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3155 U16 maxid=minid;
a28509cc 3156 U16 idx;
a3621e74
YO
3157
3158 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
3159 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3160 if ( forid < minid ) {
3161 minid=forid;
3162 } else if ( forid > maxid ) {
3163 maxid=forid;
3164 }
a3621e74
YO
3165 }
3166 if ( transcount < tp + maxid - minid + 1) {
3167 transcount *= 2;
c944940b
JH
3168 trie->trans = (reg_trie_trans *)
3169 PerlMemShared_realloc( trie->trans,
446bd890
NC
3170 transcount
3171 * sizeof(reg_trie_trans) );
538e84ed
KW
3172 Zero( trie->trans + (transcount / 2),
3173 transcount / 2,
3174 reg_trie_trans );
a3621e74
YO
3175 }
3176 base = trie->uniquecharcount + tp - minid;
3177 if ( maxid == minid ) {
3178 U32 set = 0;
3179 for ( ; zp < tp ; zp++ ) {
3180 if ( ! trie->trans[ zp ].next ) {
3181 base = trie->uniquecharcount + zp - minid;
538e84ed
KW
3182 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3183 1).newstate;
a3621e74
YO
3184 trie->trans[ zp ].check = state;
3185 set = 1;
3186 break;
3187 }
3188 }
3189 if ( !set ) {
538e84ed
KW
3190 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3191 1).newstate;
a3621e74
YO
3192 trie->trans[ tp ].check = state;
3193 tp++;
3194 zp = tp;
3195 }
3196 } else {
3197 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
538e84ed
KW
3198 const U32 tid = base
3199 - trie->uniquecharcount
3200 + TRIE_LIST_ITEM( state, idx ).forid;
3201 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3202 idx ).newstate;
a3621e74
YO
3203 trie->trans[ tid ].check = state;
3204 }
3205 tp += ( maxid - minid + 1 );
3206 }
3207 Safefree(trie->states[ state ].trans.list);
3208 }
3209 /*
3210 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3211 Perl_re_printf( aTHX_ " base: %d\n",base);
a3621e74
YO
3212 );
3213 */
3214 trie->states[ state ].trans.base=base;
3215 }
cc601c31 3216 trie->lasttrans = tp + 1;
a3621e74
YO
3217 }
3218 } else {
3219 /*
3220 Second Pass -- Flat Table Representation.
3221
b423522f
KW
3222 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3223 each. We know that we will need Charcount+1 trans at most to store
3224 the data (one row per char at worst case) So we preallocate both
3225 structures assuming worst case.
a3621e74
YO
3226
3227 We then construct the trie using only the .next slots of the entry
3228 structs.
3229
b423522f
KW
3230 We use the .check field of the first entry of the node temporarily
3231 to make compression both faster and easier by keeping track of how
3232 many non zero fields are in the node.
a3621e74
YO
3233
3234 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3235 transition.
3236
b423522f
KW
3237 There are two terms at use here: state as a TRIE_NODEIDX() which is
3238 a number representing the first entry of the node, and state as a
3239 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3240 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3241 if there are 2 entrys per node. eg:
a3621e74
YO
3242
3243 A B A B
3244 1. 2 4 1. 3 7
3245 2. 0 3 3. 0 5
3246 3. 0 0 5. 0 0
3247 4. 0 0 7. 0 0
3248
b423522f
KW
3249 The table is internally in the right hand, idx form. However as we
3250 also have to deal with the states array which is indexed by nodenum
3251 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
3252
3253 */
6ad9a8ab 3254 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
cb41e5d6 3255 depth+1));
3dab1dad 3256
c944940b
JH
3257 trie->trans = (reg_trie_trans *)
3258 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3259 * trie->uniquecharcount + 1,
3260 sizeof(reg_trie_trans) );
3261 trie->states = (reg_trie_state *)
3262 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3263 sizeof(reg_trie_state) );
a3621e74
YO
3264 next_alloc = trie->uniquecharcount + 1;
3265
3dab1dad 3266
a3621e74
YO
3267 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3268
df826430 3269 regnode *noper = NEXTOPER( cur );
a3621e74
YO
3270
3271 U32 state = 1; /* required init */
3272
3273 U16 charid = 0; /* sanity init */
3274 U32 accept_state = 0; /* sanity init */
a3621e74 3275
07be1b83 3276 U32 wordlen = 0; /* required init */
a3621e74 3277
df826430
YO
3278 if (OP(noper) == NOTHING) {
3279 regnode *noper_next= regnext(noper);
944e05e3
YO
3280 if (noper_next < tail)
3281 noper= noper_next;
ca902fb8
YO
3282 /* we will undo this assignment if noper does not
3283 * point at a trieable type in the else clause of
3284 * the following statement. */
df826430 3285 }
fab2782b 3286
f6b4b99d
KW
3287 if ( noper < tail
3288 && ( OP(noper) == flags
3f2416ae
KW
3289 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3290 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
0ea669f4 3291 || OP(noper) == EXACTFUP))))
f6b4b99d 3292 {
944e05e3
YO
3293 const U8 *uc= (U8*)STRING(noper);
3294 const U8 *e= uc + STR_LEN(noper);
3295
786e8c11 3296 for ( ; uc < e ; uc += len ) {
a3621e74 3297
786e8c11 3298 TRIE_READ_CHAR;
a3621e74 3299
786e8c11
YO
3300 if ( uvc < 256 ) {
3301 charid = trie->charmap[ uvc ];
3302 } else {
538e84ed
KW
3303 SV* const * const svpp = hv_fetch( widecharmap,
3304 (char*)&uvc,
3305 sizeof( UV ),
3306 0);
786e8c11 3307 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 3308 }
786e8c11
YO
3309 if ( charid ) {
3310 charid--;
3311 if ( !trie->trans[ state + charid ].next ) {
3312 trie->trans[ state + charid ].next = next_alloc;
3313 trie->trans[ state ].check++;
2e64971a
DM
3314 prev_states[TRIE_NODENUM(next_alloc)]
3315 = TRIE_NODENUM(state);
786e8c11
YO
3316 next_alloc += trie->uniquecharcount;
3317 }
3318 state = trie->trans[ state + charid ].next;
3319 } else {
147e3846 3320 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
786e8c11 3321 }
538e84ed
KW
3322 /* charid is now 0 if we dont know the char read, or
3323 * nonzero if we do */
a3621e74 3324 }
ca902fb8
YO
3325 } else {
3326 /* If we end up here it is because we skipped past a NOTHING, but did not end up
3327 * on a trieable type. So we need to reset noper back to point at the first regop
3328 * in the branch before we call TRIE_HANDLE_WORD().
3329 */
3330 noper= NEXTOPER(cur);
a3621e74 3331 }
3dab1dad
YO
3332 accept_state = TRIE_NODENUM( state );
3333 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
3334
3335 } /* end second pass */
3336
3dab1dad 3337 /* and now dump it out before we compress it */
2b8b4781
NC
3338 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3339 revcharmap,
3340 next_alloc, depth+1));
a3621e74 3341
a3621e74
YO
3342 {
3343 /*
3344 * Inplace compress the table.*
3345
3346 For sparse data sets the table constructed by the trie algorithm will
3347 be mostly 0/FAIL transitions or to put it another way mostly empty.
3348 (Note that leaf nodes will not contain any transitions.)
3349
3350 This algorithm compresses the tables by eliminating most such
3351 transitions, at the cost of a modest bit of extra work during lookup:
3352
3353 - Each states[] entry contains a .base field which indicates the
3354 index in the state[] array wheres its transition data is stored.
3355
3b753521 3356 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
3357
3358 - If .base is nonzero then charid is added to it to find an entry in
3359 the trans array.
3360
3361 -If trans[states[state].base+charid].check!=state then the
3362 transition is taken to be a 0/Fail transition. Thus if there are fail
3363 transitions at the front of the node then the .base offset will point
3364 somewhere inside the previous nodes data (or maybe even into a node
3365 even earlier), but the .check field determines if the transition is
3366 valid.
3367
786e8c11 3368 XXX - wrong maybe?
a3621e74 3369 The following process inplace converts the table to the compressed
3b753521 3370 table: We first do not compress the root node 1,and mark all its
a3621e74 3371 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
3372 allows us to do a DFA construction from the compressed table later,
3373 and ensures that any .base pointers we calculate later are greater
3374 than 0.
a3621e74
YO
3375
3376 - We set 'pos' to indicate the first entry of the second node.
3377
3378 - We then iterate over the columns of the node, finding the first and
3379 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3380 and set the .check pointers accordingly, and advance pos
3381 appropriately and repreat for the next node. Note that when we copy
3382 the next pointers we have to convert them from the original
3383 NODEIDX form to NODENUM form as the former is not valid post
3384 compression.
3385
3386 - If a node has no transitions used we mark its base as 0 and do not
3387 advance the pos pointer.
3388
3389 - If a node only has one transition we use a second pointer into the
3390 structure to fill in allocated fail transitions from other states.
3391 This pointer is independent of the main pointer and scans forward
3392 looking for null transitions that are allocated to a state. When it
3393 finds one it writes the single transition into the "hole". If the
786e8c11 3394 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
3395
3396 - Once compressed we can Renew/realloc the structures to release the
3397 excess space.
3398
3399 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3400 specifically Fig 3.47 and the associated pseudocode.
3401
3402 demq
3403 */
a3b680e6 3404 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 3405 U32 state, charid;
a3621e74 3406 U32 pos = 0, zp=0;
1e2e3d02 3407 trie->statecount = laststate;
a3621e74
YO
3408
3409 for ( state = 1 ; state < laststate ; state++ ) {
3410 U8 flag = 0;
a28509cc
AL
3411 const U32 stateidx = TRIE_NODEIDX( state );
3412 const U32 o_used = trie->trans[ stateidx ].check;
3413 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
3414 trie->trans[ stateidx ].check = 0;