This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Create fail-safe for reading out of bounds
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
c277df42 77#define REG_COMP_C
54df2634
NC
78#ifdef PERL_IN_XSUB_RE
79# include "re_comp.h"
afda64e8 80EXTERN_C const struct regexp_engine my_reg_engine;
54df2634
NC
81#else
82# include "regcomp.h"
83#endif
a687059c 84
f7e03a10 85#include "dquote_inline.h"
b992490d 86#include "invlist_inline.h"
1b0f46bf 87#include "unicode_constants.h"
04e98a4d 88
538e84ed
KW
89#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
90 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
f12c0118
KW
91#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
92 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 93#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 94#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 95
a687059c
LW
96#ifndef STATIC
97#define STATIC static
98#endif
99
3f910e62
YO
100/* this is a chain of data about sub patterns we are processing that
101 need to be handled separately/specially in study_chunk. Its so
102 we can simulate recursion without losing state. */
103struct scan_frame;
104typedef struct scan_frame {
105 regnode *last_regnode; /* last node to process in this frame */
106 regnode *next_regnode; /* next node to process when last is reached */
107 U32 prev_recursed_depth;
108 I32 stopparen; /* what stopparen do we use */
3f910e62
YO
109
110 struct scan_frame *this_prev_frame; /* this previous frame */
111 struct scan_frame *prev_frame; /* previous frame */
112 struct scan_frame *next_frame; /* next frame */
113} scan_frame;
114
f4ae5a27
KW
115/* Certain characters are output as a sequence with the first being a
116 * backslash. */
eb7486de 117#define isBACKSLASHED_PUNCT(c) strchr("-[]\\^", c)
f4ae5a27
KW
118
119
09b2b2e6 120struct RExC_state_t {
514a91f1
DM
121 U32 flags; /* RXf_* are we folding, multilining? */
122 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 123 char *precomp; /* uncompiled string. */
711b303b 124 char *precomp_end; /* pointer to end of uncompiled string. */
288b8c02 125 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf 126 regexp *rx; /* perl core regexp structure */
538e84ed
KW
127 regexp_internal *rxi; /* internal data for regexp object
128 pprivate field */
fac92740 129 char *start; /* Start of input for compile */
830247a4
IZ
130 char *end; /* End of input for compile */
131 char *parse; /* Input-scan pointer. */
51684e3c
KW
132 char *copy_start; /* start of copy of input within
133 constructed parse string */
cc16d262
KW
134 char *save_copy_start; /* Provides one level of saving
135 and restoring 'copy_start' */
51684e3c
KW
136 char *copy_start_in_input; /* Position in input string
137 corresponding to copy_start */
ea3daa5d 138 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 139 regnode *emit_start; /* Start of emitted-code area */
f55b7b26 140 regnode_offset emit; /* Code-emit pointer */
830247a4
IZ
141 I32 naughty; /* How bad is this pattern? */
142 I32 sawback; /* Did we see \1, ...? */
143 U32 seen;
88f063b4
KW
144 SSize_t size; /* Number of regnode equivalents in
145 pattern */
7c932d07
KW
146
147 /* position beyond 'precomp' of the warning message furthest away from
148 * 'precomp'. During the parse, no warnings are raised for any problems
149 * earlier in the parse than this position. This works if warnings are
150 * raised the first time a given spot is parsed, and if only one
151 * independent warning is raised for any given spot */
152 Size_t latest_warn_offset;
153
154 I32 npar; /* Capture buffer count so far in the
155 parse, (OPEN) plus one. ("par" 0 is
156 the whole pattern)*/
157 I32 total_par; /* During initial parse, is either 0,
158 or -1; the latter indicating a
159 reparse is needed. After that pass,
160 it is what 'npar' became after the
161 pass. Hence, it being > 0 indicates
162 we are in a reparse situation */
538e84ed
KW
163 I32 nestroot; /* root parens we are in - used by
164 accept */
830247a4 165 I32 seen_zerolen;
f55b7b26
KW
166 regnode_offset *open_parens; /* offsets to open parens */
167 regnode_offset *close_parens; /* offsets to close parens */
0275405c 168 I32 parens_buf_size; /* #slots malloced open/close_parens */
d5a00e4a 169 regnode *end_op; /* END node in program */
02daf0ab
YO
170 I32 utf8; /* whether the pattern is utf8 or not */
171 I32 orig_utf8; /* whether the pattern was originally in utf8 */
172 /* XXX use this for future optimisation of case
173 * where pattern must be upgraded to utf8. */
e40e74fe
KW
174 I32 uni_semantics; /* If a d charset modifier should use unicode
175 rules, even if the pattern is not in
176 utf8 */
81714fb9 177 HV *paren_names; /* Paren names */
538e84ed 178
40d049e4 179 regnode **recurse; /* Recurse regops */
88f063b4 180 I32 recurse_count; /* Number of recurse regops we have generated */
4286711a 181 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
538e84ed 182 through */
09a65838 183 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 184 I32 in_lookbehind;
105c827d 185 I32 in_lookahead;
4624b182 186 I32 contains_locale;
bb3f3ed2 187 I32 override_recoding;
4caef9c3 188 I32 recode_x_to_native;
9d53c457 189 I32 in_multi_char_class;
1acab4c5 190 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
68e2671b 191 within pattern */
b1603ef8 192 int code_index; /* next code_blocks[] slot */
ee273784 193 SSize_t maxlen; /* mininum possible number of chars in string to match */
3f910e62
YO
194 scan_frame *frame_head;
195 scan_frame *frame_last;
196 U32 frame_count;
7eec73eb 197 AV *warn_text;
945961fd 198 HV *unlexed_names;
dc6d7f5c 199#ifdef ADD_TO_REGEXEC
830247a4
IZ
200 char *starttry; /* -Dr: where regtry was called. */
201#define RExC_starttry (pRExC_state->starttry)
202#endif
d24ca0c5 203 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 204#ifdef DEBUGGING
be8e71aa 205 const char *lastparse;
3dab1dad 206 I32 lastnum;
1f1031fe 207 AV *paren_name_list; /* idx -> name */
d9a72fcc 208 U32 study_chunk_recursed_count;
c9f0d54c
YO
209 SV *mysv1;
210 SV *mysv2;
88f063b4 211
3dab1dad
YO
212#define RExC_lastparse (pRExC_state->lastparse)
213#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 214#define RExC_paren_name_list (pRExC_state->paren_name_list)
d9a72fcc 215#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
c9f0d54c
YO
216#define RExC_mysv (pRExC_state->mysv1)
217#define RExC_mysv1 (pRExC_state->mysv1)
218#define RExC_mysv2 (pRExC_state->mysv2)
219
3dab1dad 220#endif
bd0b62a6 221 bool seen_d_op;
911bd04e 222 bool strict;
da7cf1cc 223 bool study_started;
034602eb 224 bool in_script_run;
7c932d07 225 bool use_BRANCHJ;
09b2b2e6 226};
830247a4 227
e2509266 228#define RExC_flags (pRExC_state->flags)
514a91f1 229#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 230#define RExC_precomp (pRExC_state->precomp)
51684e3c
KW
231#define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
232#define RExC_copy_start_in_constructed (pRExC_state->copy_start)
cc16d262 233#define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start)
711b303b 234#define RExC_precomp_end (pRExC_state->precomp_end)
288b8c02 235#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 236#define RExC_rx (pRExC_state->rx)
f8fc2ecf 237#define RExC_rxi (pRExC_state->rxi)
fac92740 238#define RExC_start (pRExC_state->start)
830247a4
IZ
239#define RExC_end (pRExC_state->end)
240#define RExC_parse (pRExC_state->parse)
7c932d07 241#define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
830247a4 242#define RExC_whilem_seen (pRExC_state->whilem_seen)
bd0b62a6
KW
243#define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
244 under /d from /u ? */
512c0f5a 245
7122b237 246#ifdef RE_TRACK_PATTERN_OFFSETS
44f36cd3 247# define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
538e84ed 248 others */
7122b237 249#endif
830247a4 250#define RExC_emit (pRExC_state->emit)
fac92740 251#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
252#define RExC_sawback (pRExC_state->sawback)
253#define RExC_seen (pRExC_state->seen)
254#define RExC_size (pRExC_state->size)
ee273784 255#define RExC_maxlen (pRExC_state->maxlen)
830247a4 256#define RExC_npar (pRExC_state->npar)
1533d1d2 257#define RExC_total_parens (pRExC_state->total_par)
0275405c 258#define RExC_parens_buf_size (pRExC_state->parens_buf_size)
e2e6a0f1 259#define RExC_nestroot (pRExC_state->nestroot)
830247a4 260#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 261#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 262#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 263#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
264#define RExC_open_parens (pRExC_state->open_parens)
265#define RExC_close_parens (pRExC_state->close_parens)
5bd2d46e 266#define RExC_end_op (pRExC_state->end_op)
81714fb9 267#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
268#define RExC_recurse (pRExC_state->recurse)
269#define RExC_recurse_count (pRExC_state->recurse_count)
09a65838 270#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
538e84ed
KW
271#define RExC_study_chunk_recursed_bytes \
272 (pRExC_state->study_chunk_recursed_bytes)
b57e4118 273#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
105c827d 274#define RExC_in_lookahead (pRExC_state->in_lookahead)
4624b182 275#define RExC_contains_locale (pRExC_state->contains_locale)
4caef9c3
KW
276#define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
277
b6d67071 278#ifdef EBCDIC
4caef9c3
KW
279# define SET_recode_x_to_native(x) \
280 STMT_START { RExC_recode_x_to_native = (x); } STMT_END
281#else
282# define SET_recode_x_to_native(x) NOOP
b6d67071 283#endif
4caef9c3 284
9d53c457 285#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
3f910e62
YO
286#define RExC_frame_head (pRExC_state->frame_head)
287#define RExC_frame_last (pRExC_state->frame_last)
288#define RExC_frame_count (pRExC_state->frame_count)
67cdf558 289#define RExC_strict (pRExC_state->strict)
da7cf1cc 290#define RExC_study_started (pRExC_state->study_started)
7eec73eb 291#define RExC_warn_text (pRExC_state->warn_text)
034602eb 292#define RExC_in_script_run (pRExC_state->in_script_run)
7c932d07 293#define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
945961fd 294#define RExC_unlexed_names (pRExC_state->unlexed_names)
830247a4 295
99807a43
HS
296/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
297 * a flag to disable back-off on the fixed/floating substrings - if it's
298 * a high complexity pattern we assume the benefit of avoiding a full match
299 * is worth the cost of checking for the substrings even if they rarely help.
300 */
301#define RExC_naughty (pRExC_state->naughty)
302#define TOO_NAUGHTY (10)
303#define MARK_NAUGHTY(add) \
304 if (RExC_naughty < TOO_NAUGHTY) \
305 RExC_naughty += (add)
306#define MARK_NAUGHTY_EXP(exp, add) \
307 if (RExC_naughty < TOO_NAUGHTY) \
308 RExC_naughty += RExC_naughty / (exp) + (add)
cde0cee5 309
a687059c
LW
310#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
311#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
412f55bb 312 ((*s) == '{' && regcurly(s)))
a687059c
LW
313
314/*
315 * Flags to be passed up and down.
316 */
a687059c 317#define WORST 0 /* Worst case. */
c4ead770
KW
318#define HASWIDTH 0x01 /* Known to not match null strings, could match
319 non-null ones. */
fda99bee 320
e64f369d 321/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
322 * character. (There needs to be a case: in the switch statement in regexec.c
323 * for any node marked SIMPLE.) Note that this is not the same thing as
324 * REGNODE_SIMPLE */
fda99bee 325#define SIMPLE 0x02
e64f369d 326#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
327#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
328#define TRYAGAIN 0x10 /* Weeded out a declaration. */
c36361b2
KW
329#define RESTART_PARSE 0x20 /* Need to redo the parse */
330#define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
331 calcuate sizes as UTF-8 */
a687059c 332
3dab1dad
YO
333#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
334
07be1b83
YO
335/* whether trie related optimizations are enabled */
336#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
337#define TRIE_STUDY_OPT
786e8c11 338#define FULL_TRIE_STUDY
07be1b83
YO
339#define TRIE_STCLASS
340#endif
1de06328
YO
341
342
40d049e4
YO
343
344#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
345#define PBITVAL(paren) (1 << ((paren) & 7))
346#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
347#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
348#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
349
82a6ada4 350#define REQUIRE_UTF8(flagp) STMT_START { \
8d9c2815 351 if (!UTF) { \
c36361b2 352 *flagp = RESTART_PARSE|NEED_UTF8; \
f55b7b26 353 return 0; \
8d9c2815 354 } \
82a6ada4 355 } STMT_END
40d049e4 356
d63d75f5 357/* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
f6e8f31e
KW
358 * a flag that indicates we need to override /d with /u as a result of
359 * something in the pattern. It should only be used in regards to calling
360 * set_regex_charset() or get_regex_charse() */
512c0f5a
KW
361#define REQUIRE_UNI_RULES(flagp, restart_retval) \
362 STMT_START { \
363 if (DEPENDS_SEMANTICS) { \
512c0f5a
KW
364 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
365 RExC_uni_semantics = 1; \
b76e1c04 366 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \
bd0b62a6
KW
367 /* No need to restart the parse if we haven't seen \
368 * anything that differs between /u and /d, and no need \
369 * to restart immediately if we're going to reparse \
370 * anyway to count parens */ \
c36361b2 371 *flagp |= RESTART_PARSE; \
512c0f5a 372 return restart_retval; \
d1be211c 373 } \
512c0f5a
KW
374 } \
375 } STMT_END
376
7c932d07
KW
377#define REQUIRE_BRANCHJ(flagp, restart_retval) \
378 STMT_START { \
379 RExC_use_BRANCHJ = 1; \
439a3bfe
KW
380 *flagp |= RESTART_PARSE; \
381 return restart_retval; \
7c932d07
KW
382 } STMT_END
383
b76e1c04
KW
384/* Until we have completed the parse, we leave RExC_total_parens at 0 or
385 * less. After that, it must always be positive, because the whole re is
386 * considered to be surrounded by virtual parens. Setting it to negative
387 * indicates there is some construct that needs to know the actual number of
388 * parens to be properly handled. And that means an extra pass will be
389 * required after we've counted them all */
390#define ALL_PARENS_COUNTED (RExC_total_parens > 0)
7c932d07 391#define REQUIRE_PARENS_PASS \
b76e1c04
KW
392 STMT_START { /* No-op if have completed a pass */ \
393 if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \
7c932d07 394 } STMT_END
b76e1c04
KW
395#define IN_PARENS_PASS (RExC_total_parens < 0)
396
7c932d07 397
f358228e
KW
398/* This is used to return failure (zero) early from the calling function if
399 * various flags in 'flags' are set. Two flags always cause a return:
400 * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any
401 * additional flags that should cause a return; 0 if none. If the return will
402 * be done, '*flagp' is first set to be all of the flags that caused the
403 * return. */
404#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
fd7fc6ae 405 STMT_START { \
c36361b2
KW
406 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
407 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
f358228e 408 return 0; \
fd7fc6ae
YO
409 } \
410 } STMT_END
411
c36361b2 412#define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
fd7fc6ae 413
5d555bb8 414#define RETURN_FAIL_ON_RESTART(flags,flagp) \
f358228e 415 RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
5d555bb8 416#define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
f358228e 417 if (MUST_RESTART(*(flagp))) return 0
fd7fc6ae 418
f19b1a63
KW
419/* This converts the named class defined in regcomp.h to its equivalent class
420 * number defined in handy.h. */
421#define namedclass_to_classnum(class) ((int) ((class) / 2))
422#define classnum_to_namedclass(classnum) ((classnum) * 2)
423
de92f5e6
KW
424#define _invlist_union_complement_2nd(a, b, output) \
425 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
426#define _invlist_intersection_complement_2nd(a, b, output) \
427 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
428
1de06328
YO
429/* About scan_data_t.
430
431 During optimisation we recurse through the regexp program performing
432 various inplace (keyhole style) optimisations. In addition study_chunk
433 and scan_commit populate this data structure with information about
538e84ed 434 what strings MUST appear in the pattern. We look for the longest
3b753521 435 string that must appear at a fixed location, and we look for the
1de06328
YO
436 longest string that may appear at a floating location. So for instance
437 in the pattern:
538e84ed 438
1de06328 439 /FOO[xX]A.*B[xX]BAR/
538e84ed 440
1de06328
YO
441 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
442 strings (because they follow a .* construct). study_chunk will identify
443 both FOO and BAR as being the longest fixed and floating strings respectively.
538e84ed 444
1de06328 445 The strings can be composites, for instance
538e84ed 446
1de06328 447 /(f)(o)(o)/
538e84ed 448
1de06328 449 will result in a composite fixed substring 'foo'.
538e84ed 450
1de06328 451 For each string some basic information is maintained:
538e84ed 452
6ea77169 453 - min_offset
1de06328
YO
454 This is the position the string must appear at, or not before.
455 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
456 characters must match before the string we are searching for.
457 Likewise when combined with minlenp and the length of the string it
538e84ed 458 tells us how many characters must appear after the string we have
1de06328 459 found.
538e84ed 460
1de06328
YO
461 - max_offset
462 Only used for floating strings. This is the rightmost point that
ea3daa5d 463 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328 464 string can occur infinitely far to the right.
6e12f832 465 For fixed strings, it is equal to min_offset.
538e84ed 466
1de06328 467 - minlenp
2d608413
KW
468 A pointer to the minimum number of characters of the pattern that the
469 string was found inside. This is important as in the case of positive
538e84ed 470 lookahead or positive lookbehind we can have multiple patterns
1de06328 471 involved. Consider
538e84ed 472
1de06328 473 /(?=FOO).*F/
538e84ed 474
1de06328
YO
475 The minimum length of the pattern overall is 3, the minimum length
476 of the lookahead part is 3, but the minimum length of the part that
538e84ed 477 will actually match is 1. So 'FOO's minimum length is 3, but the
1de06328 478 minimum length for the F is 1. This is important as the minimum length
538e84ed 479 is used to determine offsets in front of and behind the string being
1de06328 480 looked for. Since strings can be composites this is the length of the
486ec47a 481 pattern at the time it was committed with a scan_commit. Note that
1de06328 482 the length is calculated by study_chunk, so that the minimum lengths
538e84ed 483 are not known until the full pattern has been compiled, thus the
1de06328 484 pointer to the value.
538e84ed 485
1de06328 486 - lookbehind
538e84ed 487
1de06328 488 In the case of lookbehind the string being searched for can be
538e84ed 489 offset past the start point of the final matching string.
1de06328
YO
490 If this value was just blithely removed from the min_offset it would
491 invalidate some of the calculations for how many chars must match
492 before or after (as they are derived from min_offset and minlen and
538e84ed 493 the length of the string being searched for).
1de06328
YO
494 When the final pattern is compiled and the data is moved from the
495 scan_data_t structure into the regexp structure the information
538e84ed
KW
496 about lookbehind is factored in, with the information that would
497 have been lost precalculated in the end_shift field for the
1de06328
YO
498 associated string.
499
500 The fields pos_min and pos_delta are used to store the minimum offset
538e84ed 501 and the delta to the maximum offset at the current point in the pattern.
1de06328
YO
502
503*/
2c2d71f5 504
cb6aef8b
DM
505struct scan_data_substrs {
506 SV *str; /* longest substring found in pattern */
507 SSize_t min_offset; /* earliest point in string it can appear */
508 SSize_t max_offset; /* latest point in string it can appear */
509 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
510 SSize_t lookbehind; /* is the pos of the string modified by LB */
11683ecb 511 I32 flags; /* per substring SF_* and SCF_* flags */
cb6aef8b
DM
512};
513
2c2d71f5 514typedef struct scan_data_t {
1de06328
YO
515 /*I32 len_min; unused */
516 /*I32 len_delta; unused */
49f55535 517 SSize_t pos_min;
ea3daa5d 518 SSize_t pos_delta;
2c2d71f5 519 SV *last_found;
ea3daa5d 520 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 521 SSize_t last_start_min;
ea3daa5d 522 SSize_t last_start_max;
2099df82
DM
523 U8 cur_is_floating; /* whether the last_* values should be set as
524 * the next fixed (0) or floating (1)
525 * substring */
6ea77169 526
2099df82
DM
527 /* [0] is longest fixed substring so far, [1] is longest float so far */
528 struct scan_data_substrs substrs[2];
6ea77169 529
11683ecb 530 I32 flags; /* common SF_* and SCF_* flags */
2c2d71f5 531 I32 whilem_c;
49f55535 532 SSize_t *last_closep;
b8f7bb16 533 regnode_ssc *start_class;
2c2d71f5
JH
534} scan_data_t;
535
a687059c 536/*
e50aee73 537 * Forward declarations for pregcomp()'s friends.
a687059c 538 */
a0d0e21e 539
6ea77169 540static const scan_data_t zero_scan_data = {
a2ca2017 541 0, 0, NULL, 0, 0, 0, 0,
6ea77169 542 {
11683ecb
DM
543 { NULL, 0, 0, 0, 0, 0 },
544 { NULL, 0, 0, 0, 0, 0 },
6ea77169
DM
545 },
546 0, 0, NULL, NULL
547};
c277df42 548
11683ecb
DM
549/* study flags */
550
07be1b83
YO
551#define SF_BEFORE_SEOL 0x0001
552#define SF_BEFORE_MEOL 0x0002
11683ecb 553#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
c277df42 554
07be1b83
YO
555#define SF_IS_INF 0x0040
556#define SF_HAS_PAR 0x0080
557#define SF_IN_PAR 0x0100
558#define SF_HAS_EVAL 0x0200
bc604ad8
KW
559
560
561/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
562 * longest substring in the pattern. When it is not set the optimiser keeps
563 * track of position, but does not keep track of the actual strings seen,
564 *
565 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
566 * /foo/i will not.
567 *
568 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
569 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
570 * turned off because of the alternation (BRANCH). */
07be1b83 571#define SCF_DO_SUBSTR 0x0400
bc604ad8 572
653099ff
GS
573#define SCF_DO_STCLASS_AND 0x0800
574#define SCF_DO_STCLASS_OR 0x1000
575#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 576#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 577
786e8c11 578#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
538e84ed 579#define SCF_SEEN_ACCEPT 0x8000
688e0391 580#define SCF_TRIE_DOING_RESTUDY 0x10000
4286711a
YO
581#define SCF_IN_DEFINE 0x20000
582
583
584
07be1b83 585
43fead97 586#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
587
588/* The enums for all these are ordered so things work out correctly */
a62b1201 589#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
538e84ed
KW
590#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
591 == REGEX_DEPENDS_CHARSET)
00b27cfc 592#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
538e84ed
KW
593#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
594 >= REGEX_UNICODE_CHARSET)
595#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
596 == REGEX_ASCII_RESTRICTED_CHARSET)
597#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
598 >= REGEX_ASCII_RESTRICTED_CHARSET)
599#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
600 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 601
43fead97 602#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 603
f2c2a6ab
KW
604/* For programs that want to be strictly Unicode compatible by dying if any
605 * attempt is made to match a non-Unicode code point against a Unicode
606 * property. */
607#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
608
93733859 609#define OOB_NAMEDCLASS -1
b8c5462f 610
8e661ac5
KW
611/* There is no code point that is out-of-bounds, so this is problematic. But
612 * its only current use is to initialize a variable that is always set before
613 * looked at. */
614#define OOB_UNICODE 0xDEADBEEF
615
a0ed51b3 616#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
a0ed51b3 617
8615cb43 618
b45f050a
JF
619/* length of regex to show in messages that don't mark a position within */
620#define RegexLengthToShowInErrorMessages 127
621
622/*
623 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
624 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
625 * op/pragma/warn/regcomp.
626 */
7253e4e3
RK
627#define MARKER1 "<-- HERE" /* marker as it appears in the description */
628#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 629
538e84ed 630#define REPORT_LOCATION " in regex; marked by " MARKER1 \
147e3846 631 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
b45f050a 632
285b5ca0
KW
633/* The code in this file in places uses one level of recursion with parsing
634 * rebased to an alternate string constructed by us in memory. This can take
635 * the form of something that is completely different from the input, or
636 * something that uses the input as part of the alternate. In the first case,
637 * there should be no possibility of an error, as we are in complete control of
51684e3c
KW
638 * the alternate string. But in the second case we don't completely control
639 * the input portion, so there may be errors in that. Here's an example:
285b5ca0
KW
640 * /[abc\x{DF}def]/ui
641 * is handled specially because \x{df} folds to a sequence of more than one
51684e3c 642 * character: 'ss'. What is done is to create and parse an alternate string,
285b5ca0
KW
643 * which looks like this:
644 * /(?:\x{DF}|[abc\x{DF}def])/ui
645 * where it uses the input unchanged in the middle of something it constructs,
646 * which is a branch for the DF outside the character class, and clustering
647 * parens around the whole thing. (It knows enough to skip the DF inside the
648 * class while in this substitute parse.) 'abc' and 'def' may have errors that
649 * need to be reported. The general situation looks like this:
650 *
51684e3c 651 * |<------- identical ------>|
285b5ca0 652 * sI tI xI eI
51684e3c 653 * Input: ---------------------------------------------------------------
285b5ca0
KW
654 * Constructed: ---------------------------------------------------
655 * sC tC xC eC EC
51684e3c 656 * |<------- identical ------>|
285b5ca0 657 *
51684e3c
KW
658 * sI..eI is the portion of the input pattern we are concerned with here.
659 * sC..EC is the constructed substitute parse string.
660 * sC..tC is constructed by us
661 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
662 * In the diagram, these are vertically aligned.
663 * eC..EC is also constructed by us.
664 * xC is the position in the substitute parse string where we found a
665 * problem.
666 * xI is the position in the original pattern corresponding to xC.
285b5ca0 667 *
51684e3c
KW
668 * We want to display a message showing the real input string. Thus we need to
669 * translate from xC to xI. We know that xC >= tC, since the portion of the
670 * string sC..tC has been constructed by us, and so shouldn't have errors. We
671 * get:
672 * xI = tI + (xC - tC)
285b5ca0 673 *
51684e3c
KW
674 * When the substitute parse is constructed, the code needs to set:
675 * RExC_start (sC)
676 * RExC_end (eC)
677 * RExC_copy_start_in_input (tI)
678 * RExC_copy_start_in_constructed (tC)
679 * and restore them when done.
285b5ca0 680 *
51684e3c
KW
681 * During normal processing of the input pattern, both
682 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
683 * sI, so that xC equals xI.
285b5ca0
KW
684 */
685
51684e3c
KW
686#define sI RExC_precomp
687#define eI RExC_precomp_end
688#define sC RExC_start
689#define eC RExC_end
690#define tI RExC_copy_start_in_input
691#define tC RExC_copy_start_in_constructed
692#define xI(xC) (tI + (xC - tC))
693#define xI_offset(xC) (xI(xC) - sI)
285b5ca0
KW
694
695#define REPORT_LOCATION_ARGS(xC) \
696 UTF8fARG(UTF, \
51684e3c 697 (xI(xC) > eI) /* Don't run off end */ \
232b691f 698 ? eI - sI /* Length before the <--HERE */ \
bc929b61
KW
699 : ((xI_offset(xC) >= 0) \
700 ? xI_offset(xC) \
701 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
702 IVdf " trying to output message for " \
703 " pattern %.*s", \
51684e3c 704 __FILE__, __LINE__, (IV) xI_offset(xC), \
bc929b61 705 ((int) (eC - sC)), sC), 0)), \
51684e3c 706 sI), /* The input pattern printed up to the <--HERE */ \
285b5ca0 707 UTF8fARG(UTF, \
51684e3c
KW
708 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \
709 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */
c1d900c3 710
8a6d8ec6
HS
711/* Used to point after bad bytes for an error message, but avoid skipping
712 * past a nul byte. */
e3304382 713#define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
8a6d8ec6 714
ed68fe11
KW
715/* Set up to clean up after our imminent demise */
716#define PREPARE_TO_DIE \
717 STMT_START { \
718 if (RExC_rx_sv) \
719 SAVEFREESV(RExC_rx_sv); \
7c932d07
KW
720 if (RExC_open_parens) \
721 SAVEFREEPV(RExC_open_parens); \
722 if (RExC_close_parens) \
723 SAVEFREEPV(RExC_close_parens); \
ed68fe11
KW
724 } STMT_END
725
b45f050a
JF
726/*
727 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
728 * arg. Show regex, up to a maximum length. If it's too long, chop and add
729 * "...".
730 */
58e23c8d 731#define _FAIL(code) STMT_START { \
bfed75c6 732 const char *ellipses = ""; \
88f063b4 733 IV len = RExC_precomp_end - RExC_precomp; \
ccb2c380 734 \
ed68fe11 735 PREPARE_TO_DIE; \
ccb2c380
MP
736 if (len > RegexLengthToShowInErrorMessages) { \
737 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
738 len = RegexLengthToShowInErrorMessages - 10; \
739 ellipses = "..."; \
740 } \
58e23c8d 741 code; \
ccb2c380 742} STMT_END
8615cb43 743
58e23c8d 744#define FAIL(msg) _FAIL( \
147e3846 745 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
c1d900c3 746 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
747
748#define FAIL2(msg,arg) _FAIL( \
147e3846 749 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
c1d900c3 750 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 751
4b7718d6
KW
752#define FAIL3(msg,arg1,arg2) _FAIL( \
753 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
754 arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
755
b45f050a 756/*
b45f050a
JF
757 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
758 */
ccb2c380 759#define Simple_vFAIL(m) STMT_START { \
ccb2c380 760 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
d528642a 761 m, REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 762} STMT_END
b45f050a
JF
763
764/*
765 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
766 */
ccb2c380 767#define vFAIL(m) STMT_START { \
ed68fe11 768 PREPARE_TO_DIE; \
ccb2c380
MP
769 Simple_vFAIL(m); \
770} STMT_END
b45f050a
JF
771
772/*
773 * Like Simple_vFAIL(), but accepts two arguments.
774 */
ccb2c380 775#define Simple_vFAIL2(m,a1) STMT_START { \
d528642a
KW
776 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
777 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 778} STMT_END
b45f050a
JF
779
780/*
781 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
782 */
ccb2c380 783#define vFAIL2(m,a1) STMT_START { \
ed68fe11 784 PREPARE_TO_DIE; \
ccb2c380
MP
785 Simple_vFAIL2(m, a1); \
786} STMT_END
b45f050a
JF
787
788
789/*
790 * Like Simple_vFAIL(), but accepts three arguments.
791 */
ccb2c380 792#define Simple_vFAIL3(m, a1, a2) STMT_START { \
c1d900c3 793 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 794 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 795} STMT_END
b45f050a
JF
796
797/*
798 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
799 */
ccb2c380 800#define vFAIL3(m,a1,a2) STMT_START { \
ed68fe11 801 PREPARE_TO_DIE; \
ccb2c380
MP
802 Simple_vFAIL3(m, a1, a2); \
803} STMT_END
b45f050a
JF
804
805/*
806 * Like Simple_vFAIL(), but accepts four arguments.
807 */
ccb2c380 808#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
d528642a
KW
809 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
810 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 811} STMT_END
b45f050a 812
95db3ffa 813#define vFAIL4(m,a1,a2,a3) STMT_START { \
ed68fe11 814 PREPARE_TO_DIE; \
95db3ffa
KW
815 Simple_vFAIL4(m, a1, a2, a3); \
816} STMT_END
817
946095af 818/* A specialized version of vFAIL2 that works with UTF8f */
d528642a 819#define vFAIL2utf8f(m, a1) STMT_START { \
ed68fe11 820 PREPARE_TO_DIE; \
d528642a
KW
821 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
822 REPORT_LOCATION_ARGS(RExC_parse)); \
946095af
BF
823} STMT_END
824
3ba22297 825#define vFAIL3utf8f(m, a1, a2) STMT_START { \
ed68fe11 826 PREPARE_TO_DIE; \
3ba22297 827 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 828 REPORT_LOCATION_ARGS(RExC_parse)); \
3ba22297
KW
829} STMT_END
830
d62e749e 831/* Setting this to NULL is a signal to not output warnings */
cc16d262
KW
832#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \
833 STMT_START { \
834 RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\
835 RExC_copy_start_in_constructed = NULL; \
836 } STMT_END
837#define RESTORE_WARNINGS \
838 RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
d62e749e 839
7c932d07
KW
840/* Since a warning can be generated multiple times as the input is reparsed, we
841 * output it the first time we come to that point in the parse, but suppress it
842 * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not
843 * generate any warnings */
55bcf857 844#define TO_OUTPUT_WARNINGS(loc) \
7c932d07
KW
845 ( RExC_copy_start_in_constructed \
846 && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
55bcf857 847
7c932d07
KW
848/* After we've emitted a warning, we save the position in the input so we don't
849 * output it again */
850#define UPDATE_WARNINGS_LOC(loc) \
851 STMT_START { \
852 if (TO_OUTPUT_WARNINGS(loc)) { \
5d894ca5
KW
853 RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \
854 - RExC_precomp; \
7c932d07
KW
855 } \
856 } STMT_END
77ccba3c 857
086f21bf 858/* 'warns' is the output of the packWARNx macro used in 'code' */
f8220fe2
KW
859#define _WARN_HELPER(loc, warns, code) \
860 STMT_START { \
d62e749e
KW
861 if (! RExC_copy_start_in_constructed) { \
862 Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \
863 " expected at '%s'", \
864 __FILE__, __LINE__, loc); \
865 } \
55bcf857 866 if (TO_OUTPUT_WARNINGS(loc)) { \
086f21bf
KW
867 if (ckDEAD(warns)) \
868 PREPARE_TO_DIE; \
338190a1 869 code; \
77ccba3c 870 UPDATE_WARNINGS_LOC(loc); \
338190a1 871 } \
f8220fe2 872 } STMT_END
946095af 873
5e0a247b 874/* m is not necessarily a "literal string", in this macro */
f8220fe2
KW
875#define reg_warn_non_literal_string(loc, m) \
876 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
877 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 878 "%s" REPORT_LOCATION, \
f8220fe2 879 m, REPORT_LOCATION_ARGS(loc)))
5e0a247b 880
f8220fe2
KW
881#define ckWARNreg(loc,m) \
882 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
883 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 884 m REPORT_LOCATION, \
f8220fe2 885 REPORT_LOCATION_ARGS(loc)))
ccb2c380 886
f8220fe2
KW
887#define vWARN(loc, m) \
888 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
889 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 890 m REPORT_LOCATION, \
f8220fe2 891 REPORT_LOCATION_ARGS(loc))) \
b927b7e9 892
f8220fe2
KW
893#define vWARN_dep(loc, m) \
894 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
895 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
d528642a 896 m REPORT_LOCATION, \
f8220fe2 897 REPORT_LOCATION_ARGS(loc)))
0d6106aa 898
f8220fe2
KW
899#define ckWARNdep(loc,m) \
900 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
901 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
d528642a 902 m REPORT_LOCATION, \
f8220fe2 903 REPORT_LOCATION_ARGS(loc)))
147508a2 904
f8220fe2
KW
905#define ckWARNregdep(loc,m) \
906 _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
907 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
d528642a
KW
908 WARN_REGEXP), \
909 m REPORT_LOCATION, \
f8220fe2 910 REPORT_LOCATION_ARGS(loc)))
ccb2c380 911
f8220fe2
KW
912#define ckWARN2reg_d(loc,m, a1) \
913 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
914 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
d528642a 915 m REPORT_LOCATION, \
f8220fe2 916 a1, REPORT_LOCATION_ARGS(loc)))
2335b3d3 917
f8220fe2
KW
918#define ckWARN2reg(loc, m, a1) \
919 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
920 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 921 m REPORT_LOCATION, \
f8220fe2 922 a1, REPORT_LOCATION_ARGS(loc)))
ccb2c380 923
f8220fe2
KW
924#define vWARN3(loc, m, a1, a2) \
925 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
926 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 927 m REPORT_LOCATION, \
f8220fe2 928 a1, a2, REPORT_LOCATION_ARGS(loc)))
ccb2c380 929
f8220fe2
KW
930#define ckWARN3reg(loc, m, a1, a2) \
931 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
932 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
933 m REPORT_LOCATION, \
934 a1, a2, \
f8220fe2 935 REPORT_LOCATION_ARGS(loc)))
668c081a 936
f8220fe2
KW
937#define vWARN4(loc, m, a1, a2, a3) \
938 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
939 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
940 m REPORT_LOCATION, \
941 a1, a2, a3, \
f8220fe2 942 REPORT_LOCATION_ARGS(loc)))
ccb2c380 943
f8220fe2
KW
944#define ckWARN4reg(loc, m, a1, a2, a3) \
945 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
946 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
947 m REPORT_LOCATION, \
948 a1, a2, a3, \
f8220fe2 949 REPORT_LOCATION_ARGS(loc)))
668c081a 950
f8220fe2
KW
951#define vWARN5(loc, m, a1, a2, a3, a4) \
952 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
953 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
954 m REPORT_LOCATION, \
955 a1, a2, a3, a4, \
f8220fe2 956 REPORT_LOCATION_ARGS(loc)))
9d1d55b5 957
aea5f181
KW
958#define ckWARNexperimental(loc, class, m) \
959 _WARN_HELPER(loc, packWARN(class), \
960 Perl_ck_warner_d(aTHX_ packWARN(class), \
961 m REPORT_LOCATION, \
962 REPORT_LOCATION_ARGS(loc)))
963
1a322bb4
KW
964/* Convert between a pointer to a node and its offset from the beginning of the
965 * program */
966#define REGNODE_p(offset) (RExC_emit_start + (offset))
967#define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
968
538e84ed 969/* Macros for recording node offsets. 20001227 mjd@plover.com
fac92740
MJD
970 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
971 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
972 * Element 0 holds the number n.
07be1b83 973 * Position is 1 indexed.
fac92740 974 */
7122b237 975#ifndef RE_TRACK_PATTERN_OFFSETS
d7c442ce 976#define Set_Node_Offset_To_R(offset,byte)
7122b237
YO
977#define Set_Node_Offset(node,byte)
978#define Set_Cur_Node_Offset
979#define Set_Node_Length_To_R(node,len)
980#define Set_Node_Length(node,len)
6a86c6ad 981#define Set_Node_Cur_Length(node,start)
538e84ed
KW
982#define Node_Offset(n)
983#define Node_Length(n)
7122b237
YO
984#define Set_Node_Offset_Length(node,offset,len)
985#define ProgLen(ri) ri->u.proglen
986#define SetProgLen(ri,x) ri->u.proglen = x
4cbeff97 987#define Track_Code(code)
7122b237
YO
988#else
989#define ProgLen(ri) ri->u.offsets[0]
990#define SetProgLen(ri,x) ri->u.offsets[0] = x
d7c442ce 991#define Set_Node_Offset_To_R(offset,byte) STMT_START { \
ccb2c380 992 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
d7c442ce
KW
993 __LINE__, (int)(offset), (int)(byte))); \
994 if((offset) < 0) { \
538e84ed 995 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
d7c442ce 996 (int)(offset)); \
ccb2c380 997 } else { \
d7c442ce 998 RExC_offsets[2*(offset)-1] = (byte); \
ccb2c380 999 } \
ccb2c380
MP
1000} STMT_END
1001
88f063b4 1002#define Set_Node_Offset(node,byte) \
1a322bb4 1003 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
ccb2c380
MP
1004#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
1005
1006#define Set_Node_Length_To_R(node,len) STMT_START { \
ccb2c380 1007 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 1008 __LINE__, (int)(node), (int)(len))); \
ccb2c380 1009 if((node) < 0) { \
538e84ed
KW
1010 Perl_croak(aTHX_ "value of node is %d in Length macro", \
1011 (int)(node)); \
ccb2c380
MP
1012 } else { \
1013 RExC_offsets[2*(node)] = (len); \
1014 } \
ccb2c380
MP
1015} STMT_END
1016
1017#define Set_Node_Length(node,len) \
1a322bb4 1018 Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
6a86c6ad
NC
1019#define Set_Node_Cur_Length(node, start) \
1020 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
1021
1022/* Get offsets and lengths */
1a322bb4
KW
1023#define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1024#define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
fac92740 1025
07be1b83 1026#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
1a322bb4
KW
1027 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \
1028 Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \
07be1b83 1029} STMT_END
4cbeff97
TC
1030
1031#define Track_Code(code) STMT_START { code } STMT_END
7122b237 1032#endif
07be1b83
YO
1033
1034#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1035#define EXPERIMENTAL_INPLACESCAN
f427392e 1036#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 1037
cb41e5d6
YO
1038#ifdef DEBUGGING
1039int
6ad9a8ab 1040Perl_re_printf(pTHX_ const char *fmt, ...)
cb41e5d6
YO
1041{
1042 va_list ap;
1043 int result;
1044 PerlIO *f= Perl_debug_log;
1045 PERL_ARGS_ASSERT_RE_PRINTF;
1046 va_start(ap, fmt);
1047 result = PerlIO_vprintf(f, fmt, ap);
1048 va_end(ap);
1049 return result;
1050}
1051
1052int
7b031478 1053Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
cb41e5d6
YO
1054{
1055 va_list ap;
1056 int result;
1057 PerlIO *f= Perl_debug_log;
1058 PERL_ARGS_ASSERT_RE_INDENTF;
1059 va_start(ap, depth);
daeb874b 1060 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
cb41e5d6
YO
1061 result = PerlIO_vprintf(f, fmt, ap);
1062 va_end(ap);
1063 return result;
1064}
1065#endif /* DEBUGGING */
1066
7b031478 1067#define DEBUG_RExC_seen() \
538e84ed 1068 DEBUG_OPTIMISE_MORE_r({ \
88f063b4 1069 Perl_re_printf( aTHX_ "RExC_seen: "); \
538e84ed 1070 \
e384d5c1 1071 if (RExC_seen & REG_ZERO_LEN_SEEN) \
88f063b4 1072 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
538e84ed 1073 \
e384d5c1 1074 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
88f063b4 1075 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
538e84ed 1076 \
e384d5c1 1077 if (RExC_seen & REG_GPOS_SEEN) \
88f063b4 1078 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
538e84ed 1079 \
e384d5c1 1080 if (RExC_seen & REG_RECURSE_SEEN) \
88f063b4 1081 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
538e84ed 1082 \
7b031478 1083 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
88f063b4 1084 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
538e84ed 1085 \
e384d5c1 1086 if (RExC_seen & REG_VERBARG_SEEN) \
88f063b4 1087 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
538e84ed 1088 \
e384d5c1 1089 if (RExC_seen & REG_CUTGROUP_SEEN) \
88f063b4 1090 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
538e84ed 1091 \
e384d5c1 1092 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
88f063b4 1093 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
538e84ed 1094 \
e384d5c1 1095 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
88f063b4 1096 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
538e84ed 1097 \
7b031478 1098 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
88f063b4 1099 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
ee273784 1100 \
88f063b4 1101 Perl_re_printf( aTHX_ "\n"); \
9e9ecfdf
YO
1102 });
1103
fdfb4f21 1104#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
6ad9a8ab 1105 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
fdfb4f21 1106
fdfb4f21 1107
f5a36d78
DM
1108#ifdef DEBUGGING
1109static void
1110S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1111 const char *close_str)
1112{
1113 if (!flags)
1114 return;
1115
1116 Perl_re_printf( aTHX_ "%s", open_str);
11683ecb
DM
1117 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1118 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
f5a36d78
DM
1119 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1120 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1121 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1122 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1123 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1124 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1125 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1126 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1127 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1128 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1129 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1130 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1131 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1132 Perl_re_printf( aTHX_ "%s", close_str);
1133}
1134
1135
1136static void
1137S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1138 U32 depth, int is_inf)
1139{
1140 GET_RE_DEBUG_FLAGS_DECL;
1141
1142 DEBUG_OPTIMISE_MORE_r({
1143 if (!data)
1144 return;
1145 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1146 depth,
1147 where,
1148 (IV)data->pos_min,
1149 (IV)data->pos_delta,
1150 (UV)data->flags
1151 );
1152
1153 S_debug_show_study_flags(aTHX_ data->flags," [","]");
fdfb4f21 1154
f5a36d78
DM
1155 Perl_re_printf( aTHX_
1156 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1157 (IV)data->whilem_c,
1158 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1159 is_inf ? "INF " : ""
1160 );
1161
11683ecb 1162 if (data->last_found) {
37b6262f 1163 int i;
11683ecb
DM
1164 Perl_re_printf(aTHX_
1165 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1166 SvPVX_const(data->last_found),
1167 (IV)data->last_end,
1168 (IV)data->last_start_min,
1169 (IV)data->last_start_max
1170 );
1171
2099df82 1172 for (i = 0; i < 2; i++) {
37b6262f
DM
1173 Perl_re_printf(aTHX_
1174 " %s%s: '%s' @ %" IVdf "/%" IVdf,
2099df82
DM
1175 data->cur_is_floating == i ? "*" : "",
1176 i ? "Float" : "Fixed",
37b6262f
DM
1177 SvPVX_const(data->substrs[i].str),
1178 (IV)data->substrs[i].min_offset,
1179 (IV)data->substrs[i].max_offset
1180 );
1181 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1182 }
11683ecb 1183 }
f5a36d78
DM
1184
1185 Perl_re_printf( aTHX_ "\n");
1186 });
1187}
1188
1189
1190static void
1191S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1192 regnode *scan, U32 depth, U32 flags)
1193{
1194 GET_RE_DEBUG_FLAGS_DECL;
1195
1196 DEBUG_OPTIMISE_r({
1197 regnode *Next;
1198
1199 if (!scan)
1200 return;
1201 Next = regnext(scan);
1202 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1203 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1204 depth,
1205 str,
1206 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1207 Next ? (REG_NODE_NUM(Next)) : 0 );
1208 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1209 Perl_re_printf( aTHX_ "\n");
1210 });
1211}
1212
1213
1214# define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1215 S_debug_studydata(aTHX_ where, data, depth, is_inf)
1216
1217# define DEBUG_PEEP(str, scan, depth, flags) \
1218 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1219
1220#else
1221# define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1222# define DEBUG_PEEP(str, scan, depth, flags) NOOP
1223#endif
1de06328 1224
cb41e5d6 1225
c6871b76
KW
1226/* =========================================================
1227 * BEGIN edit_distance stuff.
1228 *
1229 * This calculates how many single character changes of any type are needed to
1230 * transform a string into another one. It is taken from version 3.1 of
1231 *
1232 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1233 */
1234
1235/* Our unsorted dictionary linked list. */
1236/* Note we use UVs, not chars. */
1237
1238struct dictionary{
1239 UV key;
1240 UV value;
1241 struct dictionary* next;
1242};
1243typedef struct dictionary item;
1244
1245
1246PERL_STATIC_INLINE item*
88f063b4 1247push(UV key, item* curr)
c6871b76
KW
1248{
1249 item* head;
d09f14bf 1250 Newx(head, 1, item);
c6871b76
KW
1251 head->key = key;
1252 head->value = 0;
1253 head->next = curr;
1254 return head;
1255}
1256
1257
1258PERL_STATIC_INLINE item*
1259find(item* head, UV key)
1260{
1261 item* iterator = head;
1262 while (iterator){
1263 if (iterator->key == key){
1264 return iterator;
1265 }
1266 iterator = iterator->next;
1267 }
1268
1269 return NULL;
1270}
1271
1272PERL_STATIC_INLINE item*
88f063b4 1273uniquePush(item* head, UV key)
c6871b76
KW
1274{
1275 item* iterator = head;
1276
1277 while (iterator){
1278 if (iterator->key == key) {
1279 return head;
1280 }
1281 iterator = iterator->next;
1282 }
1283
88f063b4 1284 return push(key, head);
c6871b76
KW
1285}
1286
1287PERL_STATIC_INLINE void
1288dict_free(item* head)
1289{
1290 item* iterator = head;
1291
1292 while (iterator) {
1293 item* temp = iterator;
1294 iterator = iterator->next;
1295 Safefree(temp);
1296 }
1297
1298 head = NULL;
1299}
1300
1301/* End of Dictionary Stuff */
1302
1303/* All calculations/work are done here */
1304STATIC int
1305S_edit_distance(const UV* src,
1306 const UV* tgt,
1307 const STRLEN x, /* length of src[] */
1308 const STRLEN y, /* length of tgt[] */
1309 const SSize_t maxDistance
1310)
1311{
1312 item *head = NULL;
88f063b4 1313 UV swapCount, swapScore, targetCharCount, i, j;
c6871b76
KW
1314 UV *scores;
1315 UV score_ceil = x + y;
1316
1317 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1318
1319 /* intialize matrix start values */
d09f14bf 1320 Newx(scores, ( (x + 2) * (y + 2)), UV);
c6871b76
KW
1321 scores[0] = score_ceil;
1322 scores[1 * (y + 2) + 0] = score_ceil;
1323 scores[0 * (y + 2) + 1] = score_ceil;
1324 scores[1 * (y + 2) + 1] = 0;
88f063b4 1325 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
c6871b76
KW
1326
1327 /* work loops */
1328 /* i = src index */
1329 /* j = tgt index */
1330 for (i=1;i<=x;i++) {
1331 if (i < x)
88f063b4 1332 head = uniquePush(head, src[i]);
c6871b76
KW
1333 scores[(i+1) * (y + 2) + 1] = i;
1334 scores[(i+1) * (y + 2) + 0] = score_ceil;
1335 swapCount = 0;
1336
1337 for (j=1;j<=y;j++) {
1338 if (i == 1) {
1339 if(j < y)
88f063b4 1340 head = uniquePush(head, tgt[j]);
c6871b76
KW
1341 scores[1 * (y + 2) + (j + 1)] = j;
1342 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1343 }
1344
88f063b4 1345 targetCharCount = find(head, tgt[j-1])->value;
c6871b76
KW
1346 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1347
1348 if (src[i-1] != tgt[j-1]){
1349 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1350 }
1351 else {
1352 swapCount = j;
1353 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1354 }
1355 }
1356
88f063b4 1357 find(head, src[i-1])->value = i;
c6871b76
KW
1358 }
1359
1360 {
1361 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1362 dict_free(head);
1363 Safefree(scores);
1364 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1365 }
1366}
1367
1368/* END of edit_distance() stuff
1369 * ========================================================= */
1370
8e35b056
KW
1371/* is c a control character for which we have a mnemonic? */
1372#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1373
549b4e78
KW
1374STATIC const char *
1375S_cntrl_to_mnemonic(const U8 c)
1376{
1377 /* Returns the mnemonic string that represents character 'c', if one
1378 * exists; NULL otherwise. The only ones that exist for the purposes of
1379 * this routine are a few control characters */
1380
1381 switch (c) {
1382 case '\a': return "\\a";
1383 case '\b': return "\\b";
1384 case ESC_NATIVE: return "\\e";
1385 case '\f': return "\\f";
1386 case '\n': return "\\n";
1387 case '\r': return "\\r";
1388 case '\t': return "\\t";
1389 }
1390
1391 return NULL;
1392}
1393
653099ff 1394/* Mark that we cannot extend a found fixed substring at this point.
37b6262f 1395 Update the longest found anchored substring or the longest found
653099ff
GS
1396 floating substrings if needed. */
1397
4327152a 1398STATIC void
ea3daa5d
FC
1399S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1400 SSize_t *minlenp, int is_inf)
c277df42 1401{
e1ec3a88 1402 const STRLEN l = CHR_SVLEN(data->last_found);
2099df82 1403 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
a2ca2017 1404 const STRLEN old_l = CHR_SVLEN(longest_sv);
1de06328 1405 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1406
7918f24d
NC
1407 PERL_ARGS_ASSERT_SCAN_COMMIT;
1408
c277df42 1409 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
2099df82 1410 const U8 i = data->cur_is_floating;
a2ca2017 1411 SvSetMagicSV(longest_sv, data->last_found);
37b6262f
DM
1412 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1413
2099df82
DM
1414 if (!i) /* fixed */
1415 data->substrs[0].max_offset = data->substrs[0].min_offset;
1416 else { /* float */
1417 data->substrs[1].max_offset = (l
646e8787
DM
1418 ? data->last_start_max
1419 : (data->pos_delta > SSize_t_MAX - data->pos_min
ea3daa5d
FC
1420 ? SSize_t_MAX
1421 : data->pos_min + data->pos_delta));
1422 if (is_inf
2099df82
DM
1423 || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1424 data->substrs[1].max_offset = SSize_t_MAX;
37b6262f 1425 }
11683ecb 1426
37b6262f
DM
1427 if (data->flags & SF_BEFORE_EOL)
1428 data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
1429 else
1430 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1431 data->substrs[i].minlenp = minlenp;
1432 data->substrs[i].lookbehind = 0;
c277df42 1433 }
37b6262f 1434
c277df42 1435 SvCUR_set(data->last_found, 0);
0eda9292 1436 {
a28509cc 1437 SV * const sv = data->last_found;
097eb12c
AL
1438 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1439 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1440 if (mg)
1441 mg->mg_len = 0;
1442 }
0eda9292 1443 }
c277df42
IZ
1444 data->last_end = -1;
1445 data->flags &= ~SF_BEFORE_EOL;
f5a36d78 1446 DEBUG_STUDYDATA("commit", data, 0, is_inf);
c277df42
IZ
1447}
1448
cdd87c1d
KW
1449/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1450 * list that describes which code points it matches */
1451
653099ff 1452STATIC void
3420edd7 1453S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 1454{
cdd87c1d
KW
1455 /* Set the SSC 'ssc' to match an empty string or any code point */
1456
557bd3fb 1457 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 1458
71068078 1459 assert(is_ANYOF_SYNTHETIC(ssc));
3fffb88a 1460
0854ea0b
KW
1461 /* mortalize so won't leak */
1462 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
93e92956 1463 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
653099ff
GS
1464}
1465
653099ff 1466STATIC int
dc3bf405 1467S_ssc_is_anything(const regnode_ssc *ssc)
653099ff 1468{
c144baaa
KW
1469 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1470 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1471 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1472 * in any way, so there's no point in using it */
cdd87c1d
KW
1473
1474 UV start, end;
1475 bool ret;
653099ff 1476
557bd3fb 1477 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 1478
71068078 1479 assert(is_ANYOF_SYNTHETIC(ssc));
cdd87c1d 1480
93e92956 1481 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
cdd87c1d
KW
1482 return FALSE;
1483 }
1484
1485 /* See if the list consists solely of the range 0 - Infinity */
1486 invlist_iterinit(ssc->invlist);
1487 ret = invlist_iternext(ssc->invlist, &start, &end)
1488 && start == 0
1489 && end == UV_MAX;
1490
1491 invlist_iterfinish(ssc->invlist);
1492
1493 if (ret) {
1494 return TRUE;
1495 }
1496
1497 /* If e.g., both \w and \W are set, matches everything */
e0e1be5f 1498 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1499 int i;
1500 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1501 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1502 return TRUE;
1503 }
1504 }
1505 }
1506
1507 return FALSE;
653099ff
GS
1508}
1509
653099ff 1510STATIC void
cdd87c1d 1511S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 1512{
cdd87c1d
KW
1513 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1514 * string, any code point, or any posix class under locale */
1515
557bd3fb 1516 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 1517
557bd3fb 1518 Zero(ssc, 1, regnode_ssc);
71068078 1519 set_ANYOF_SYNTHETIC(ssc);
93e92956 1520 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
3420edd7 1521 ssc_anything(ssc);
cdd87c1d 1522
2f306ab9
KW
1523 /* If any portion of the regex is to operate under locale rules that aren't
1524 * fully known at compile time, initialization includes it. The reason
1525 * this isn't done for all regexes is that the optimizer was written under
1526 * the assumption that locale was all-or-nothing. Given the complexity and
1527 * lack of documentation in the optimizer, and that there are inadequate
1528 * test cases for locale, many parts of it may not work properly, it is
1529 * safest to avoid locale unless necessary. */
cdd87c1d
KW
1530 if (RExC_contains_locale) {
1531 ANYOF_POSIXL_SETALL(ssc);
cdd87c1d
KW
1532 }
1533 else {
1534 ANYOF_POSIXL_ZERO(ssc);
1535 }
653099ff
GS
1536}
1537
b423522f 1538STATIC int
dc3bf405
BF
1539S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1540 const regnode_ssc *ssc)
b423522f
KW
1541{
1542 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1543 * to the list of code points matched, and locale posix classes; hence does
1544 * not check its flags) */
1545
1546 UV start, end;
1547 bool ret;
1548
1549 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1550
71068078 1551 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1552
1553 invlist_iterinit(ssc->invlist);
1554 ret = invlist_iternext(ssc->invlist, &start, &end)
1555 && start == 0
1556 && end == UV_MAX;
1557
1558 invlist_iterfinish(ssc->invlist);
1559
1560 if (! ret) {
1561 return FALSE;
1562 }
1563
e0e1be5f 1564 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
31f05a37 1565 return FALSE;
b423522f
KW
1566 }
1567
1568 return TRUE;
1569}
1570
4ebed06a
KW
1571#define INVLIST_INDEX 0
1572#define ONLY_LOCALE_MATCHES_INDEX 1
1573#define DEFERRED_USER_DEFINED_INDEX 2
1574
b423522f
KW
1575STATIC SV*
1576S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
5c0f85ef 1577 const regnode_charclass* const node)
b423522f
KW
1578{
1579 /* Returns a mortal inversion list defining which code points are matched
1580 * by 'node', which is of type ANYOF. Handles complementing the result if
1581 * appropriate. If some code points aren't knowable at this time, the
31f05a37
KW
1582 * returned list must, and will, contain every code point that is a
1583 * possibility. */
b423522f 1584
1565c085 1585 dVAR;
e2506fa7 1586 SV* invlist = NULL;
1ee208c4 1587 SV* only_utf8_locale_invlist = NULL;
b423522f
KW
1588 unsigned int i;
1589 const U32 n = ARG(node);
31f05a37 1590 bool new_node_has_latin1 = FALSE;
3146c00a 1591 const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFHr))
29a889ef
KW
1592 ? 0
1593 : ANYOF_FLAGS(node);
b423522f
KW
1594
1595 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1596
1597 /* Look at the data structure created by S_set_ANYOF_arg() */
93e92956 1598 if (n != ANYOF_ONLY_HAS_BITMAP) {
b423522f
KW
1599 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1600 AV * const av = MUTABLE_AV(SvRV(rv));
1601 SV **const ary = AvARRAY(av);
1602 assert(RExC_rxi->data->what[n] == 's');
1603
4ebed06a 1604 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
b423522f 1605
6ed02bb6
KW
1606 /* Here there are things that won't be known until runtime -- we
1607 * have to assume it could be anything */
e2506fa7 1608 invlist = sv_2mortal(_new_invlist(1));
b423522f
KW
1609 return _add_range_to_invlist(invlist, 0, UV_MAX);
1610 }
4ebed06a 1611 else if (ary[INVLIST_INDEX]) {
b423522f 1612
6ed02bb6 1613 /* Use the node's inversion list */
4ebed06a 1614 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1ee208c4
KW
1615 }
1616
1617 /* Get the code points valid only under UTF-8 locales */
766d6d33 1618 if ( (flags & ANYOFL_FOLD)
4ebed06a 1619 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1ee208c4 1620 {
4ebed06a 1621 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
b423522f
KW
1622 }
1623 }
1624
e2506fa7
KW
1625 if (! invlist) {
1626 invlist = sv_2mortal(_new_invlist(0));
1627 }
1628
dcb20b36
KW
1629 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1630 * code points, and an inversion list for the others, but if there are code
1631 * points that should match only conditionally on the target string being
1632 * UTF-8, those are placed in the inversion list, and not the bitmap.
1633 * Since there are circumstances under which they could match, they are
1634 * included in the SSC. But if the ANYOF node is to be inverted, we have
1635 * to exclude them here, so that when we invert below, the end result
1636 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1637 * have to do this here before we add the unconditionally matched code
1638 * points */
766d6d33 1639 if (flags & ANYOF_INVERT) {
b423522f
KW
1640 _invlist_intersection_complement_2nd(invlist,
1641 PL_UpperLatin1,
1642 &invlist);
1643 }
1644
1645 /* Add in the points from the bit map */
3146c00a 1646 if (! inRANGE(OP(node), ANYOFH, ANYOFHr)) {
90973738
KW
1647 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1648 if (ANYOF_BITMAP_TEST(node, i)) {
1649 unsigned int start = i++;
1650
1651 for (; i < NUM_ANYOF_CODE_POINTS
1652 && ANYOF_BITMAP_TEST(node, i); ++i)
1653 {
1654 /* empty */
1655 }
1656 invlist = _add_range_to_invlist(invlist, start, i-1);
1657 new_node_has_latin1 = TRUE;
6f8848d5 1658 }
b423522f
KW
1659 }
1660 }
1661
1662 /* If this can match all upper Latin1 code points, have to add them
ac33c516
KW
1663 * as well. But don't add them if inverting, as when that gets done below,
1664 * it would exclude all these characters, including the ones it shouldn't
1665 * that were added just above */
766d6d33
KW
1666 if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
1667 && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
f240c685 1668 {
b423522f
KW
1669 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1670 }
1671
1672 /* Similarly for these */
766d6d33 1673 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
e0a1ff7a 1674 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
b423522f
KW
1675 }
1676
766d6d33 1677 if (flags & ANYOF_INVERT) {
b423522f
KW
1678 _invlist_invert(invlist);
1679 }
766d6d33 1680 else if (flags & ANYOFL_FOLD) {
35b8412f 1681 if (new_node_has_latin1) {
31f05a37 1682
26be5fe6
KW
1683 /* Under /li, any 0-255 could fold to any other 0-255, depending on
1684 * the locale. We can skip this if there are no 0-255 at all. */
1685 _invlist_union(invlist, PL_Latin1, &invlist);
35b8412f
KW
1686
1687 invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1688 invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1689 }
1690 else {
1691 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1692 invlist = add_cp_to_invlist(invlist, 'I');
1693 }
1694 if (_invlist_contains_cp(invlist,
1695 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1696 {
1697 invlist = add_cp_to_invlist(invlist, 'i');
1698 }
1699 }
31f05a37
KW
1700 }
1701
1ee208c4
KW
1702 /* Similarly add the UTF-8 locale possible matches. These have to be
1703 * deferred until after the non-UTF-8 locale ones are taken care of just
1704 * above, or it leads to wrong results under ANYOF_INVERT */
1705 if (only_utf8_locale_invlist) {
31f05a37 1706 _invlist_union_maybe_complement_2nd(invlist,
1ee208c4 1707 only_utf8_locale_invlist,
766d6d33 1708 flags & ANYOF_INVERT,
31f05a37
KW
1709 &invlist);
1710 }
b423522f
KW
1711
1712 return invlist;
1713}
1714
1051e1c4 1715/* These two functions currently do the exact same thing */
557bd3fb 1716#define ssc_init_zero ssc_init
653099ff 1717
cdd87c1d
KW
1718#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1719#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1720
557bd3fb 1721/* 'AND' a given class with another one. Can create false positives. 'ssc'
93e92956
KW
1722 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1723 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1724
653099ff 1725STATIC void
b423522f 1726S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1727 const regnode_charclass *and_with)
653099ff 1728{
cdd87c1d
KW
1729 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1730 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1731
a0dd4231 1732 SV* anded_cp_list;
3146c00a 1733 U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFHr)
29a889ef
KW
1734 ? 0
1735 : ANYOF_FLAGS(and_with);
a0dd4231 1736 U8 anded_flags;
1e6ade67 1737
cdd87c1d 1738 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1739
71068078 1740 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1741
1742 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1743 * the code point inversion list and just the relevant flags */
71068078 1744 if (is_ANYOF_SYNTHETIC(and_with)) {
7dcac5f6 1745 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
765e6ecf 1746 anded_flags = and_with_flags;
e9b08962
KW
1747
1748 /* XXX This is a kludge around what appears to be deficiencies in the
1749 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1750 * there are paths through the optimizer where it doesn't get weeded
1751 * out when it should. And if we don't make some extra provision for
1752 * it like the code just below, it doesn't get added when it should.
1753 * This solution is to add it only when AND'ing, which is here, and
1754 * only when what is being AND'ed is the pristine, original node
1755 * matching anything. Thus it is like adding it to ssc_anything() but
1756 * only when the result is to be AND'ed. Probably the same solution
1757 * could be adopted for the same problem we have with /l matching,
1758 * which is solved differently in S_ssc_init(), and that would lead to
1759 * fewer false positives than that solution has. But if this solution
1760 * creates bugs, the consequences are only that a warning isn't raised
1761 * that should be; while the consequences for having /l bugs is
1762 * incorrect matches */
7dcac5f6 1763 if (ssc_is_anything((regnode_ssc *)and_with)) {
f240c685 1764 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
e9b08962 1765 }
a0dd4231
KW
1766 }
1767 else {
5c0f85ef 1768 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
f240c685 1769 if (OP(and_with) == ANYOFD) {
765e6ecf 1770 anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
f240c685
KW
1771 }
1772 else {
765e6ecf 1773 anded_flags = and_with_flags
f240c685 1774 &( ANYOF_COMMON_FLAGS
108316fb
KW
1775 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1776 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
765e6ecf 1777 if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
d1c40ef5
KW
1778 anded_flags &=
1779 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1780 }
f240c685 1781 }
a0dd4231
KW
1782 }
1783
1784 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1785
1786 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1787 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1788 * 'and_with' may be inverted. When not inverted, we have the situation of
1789 * computing:
1790 * (C1 | P1) & (C2 | P2)
1791 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1792 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1793 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1794 * <= ((C1 & C2) | P1 | P2)
1795 * Alternatively, the last few steps could be:
1796 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1797 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1798 * <= (C1 | C2 | (P1 & P2))
1799 * We favor the second approach if either P1 or P2 is non-empty. This is
1800 * because these components are a barrier to doing optimizations, as what
1801 * they match cannot be known until the moment of matching as they are
1802 * dependent on the current locale, 'AND"ing them likely will reduce or
1803 * eliminate them.
1804 * But we can do better if we know that C1,P1 are in their initial state (a
1805 * frequent occurrence), each matching everything:
1806 * (<everything>) & (C2 | P2) = C2 | P2
1807 * Similarly, if C2,P2 are in their initial state (again a frequent
1808 * occurrence), the result is a no-op
1809 * (C1 | P1) & (<everything>) = C1 | P1
1810 *
1811 * Inverted, we have
1812 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1813 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1814 * <= (C1 & ~C2) | (P1 & ~P2)
1815 * */
1aa99e6b 1816
765e6ecf 1817 if ((and_with_flags & ANYOF_INVERT)
71068078 1818 && ! is_ANYOF_SYNTHETIC(and_with))
a0dd4231 1819 {
cdd87c1d 1820 unsigned int i;
8951c461 1821
cdd87c1d
KW
1822 ssc_intersection(ssc,
1823 anded_cp_list,
1824 FALSE /* Has already been inverted */
1825 );
c6b76537 1826
cdd87c1d
KW
1827 /* If either P1 or P2 is empty, the intersection will be also; can skip
1828 * the loop */
765e6ecf 1829 if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
cdd87c1d
KW
1830 ANYOF_POSIXL_ZERO(ssc);
1831 }
e0e1be5f 1832 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1833
1834 /* Note that the Posix class component P from 'and_with' actually
1835 * looks like:
1836 * P = Pa | Pb | ... | Pn
1837 * where each component is one posix class, such as in [\w\s].
1838 * Thus
1839 * ~P = ~(Pa | Pb | ... | Pn)
1840 * = ~Pa & ~Pb & ... & ~Pn
1841 * <= ~Pa | ~Pb | ... | ~Pn
1842 * The last is something we can easily calculate, but unfortunately
1843 * is likely to have many false positives. We could do better
1844 * in some (but certainly not all) instances if two classes in
1845 * P have known relationships. For example
1846 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1847 * So
1848 * :lower: & :print: = :lower:
1849 * And similarly for classes that must be disjoint. For example,
1850 * since \s and \w can have no elements in common based on rules in
1851 * the POSIX standard,
1852 * \w & ^\S = nothing
1853 * Unfortunately, some vendor locales do not meet the Posix
1854 * standard, in particular almost everything by Microsoft.
1855 * The loop below just changes e.g., \w into \W and vice versa */
1856
1ee208c4 1857 regnode_charclass_posixl temp;
cdd87c1d
KW
1858 int add = 1; /* To calculate the index of the complement */
1859
b1234259 1860 Zero(&temp, 1, regnode_charclass_posixl);
cdd87c1d
KW
1861 ANYOF_POSIXL_ZERO(&temp);
1862 for (i = 0; i < ANYOF_MAX; i++) {
1863 assert(i % 2 != 0
7dcac5f6
KW
1864 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1865 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
cdd87c1d 1866
7dcac5f6 1867 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
cdd87c1d
KW
1868 ANYOF_POSIXL_SET(&temp, i + add);
1869 }
1870 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1871 }
1872 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1873
cdd87c1d
KW
1874 } /* else ssc already has no posixes */
1875 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1876 in its initial state */
71068078 1877 else if (! is_ANYOF_SYNTHETIC(and_with)
7dcac5f6 1878 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
cdd87c1d
KW
1879 {
1880 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1881 * copy it over 'ssc' */
1882 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
71068078 1883 if (is_ANYOF_SYNTHETIC(and_with)) {
cdd87c1d
KW
1884 StructCopy(and_with, ssc, regnode_ssc);
1885 }
1886 else {
1887 ssc->invlist = anded_cp_list;
1888 ANYOF_POSIXL_ZERO(ssc);
765e6ecf 1889 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1890 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
cdd87c1d
KW
1891 }
1892 }
1893 }
e0e1be5f 1894 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
765e6ecf 1895 || (and_with_flags & ANYOF_MATCHES_POSIXL))
cdd87c1d
KW
1896 {
1897 /* One or the other of P1, P2 is non-empty. */
765e6ecf 1898 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1ea8b7fe
KW
1899 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1900 }
cdd87c1d
KW
1901 ssc_union(ssc, anded_cp_list, FALSE);
1902 }
1903 else { /* P1 = P2 = empty */
1904 ssc_intersection(ssc, anded_cp_list, FALSE);
1905 }
137165a6 1906 }
653099ff
GS
1907}
1908
653099ff 1909STATIC void
cdd87c1d 1910S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1911 const regnode_charclass *or_with)
653099ff 1912{
cdd87c1d
KW
1913 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1914 * another SSC or a regular ANYOF class. Can create false positives if
1915 * 'or_with' is to be inverted. */
7918f24d 1916
a0dd4231
KW
1917 SV* ored_cp_list;
1918 U8 ored_flags;
3146c00a 1919 U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFHr)
29a889ef
KW
1920 ? 0
1921 : ANYOF_FLAGS(or_with);
c6b76537 1922
cdd87c1d 1923 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 1924
71068078 1925 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1926
1927 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1928 * the code point inversion list and just the relevant flags */
71068078 1929 if (is_ANYOF_SYNTHETIC(or_with)) {
7dcac5f6 1930 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
765e6ecf 1931 ored_flags = or_with_flags;
a0dd4231
KW
1932 }
1933 else {
5c0f85ef 1934 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
765e6ecf 1935 ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
f240c685
KW
1936 if (OP(or_with) != ANYOFD) {
1937 ored_flags
765e6ecf 1938 |= or_with_flags
108316fb
KW
1939 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1940 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
765e6ecf 1941 if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
d1c40ef5
KW
1942 ored_flags |=
1943 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1944 }
f240c685 1945 }
a0dd4231
KW
1946 }
1947
1948 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
1949
1950 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1951 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1952 * 'or_with' may be inverted. When not inverted, we have the simple
1953 * situation of computing:
1954 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1955 * If P1|P2 yields a situation with both a class and its complement are
1956 * set, like having both \w and \W, this matches all code points, and we
1957 * can delete these from the P component of the ssc going forward. XXX We
1958 * might be able to delete all the P components, but I (khw) am not certain
1959 * about this, and it is better to be safe.
1960 *
1961 * Inverted, we have
1962 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1963 * <= (C1 | P1) | ~C2
1964 * <= (C1 | ~C2) | P1
1965 * (which results in actually simpler code than the non-inverted case)
1966 * */
9826f543 1967
765e6ecf 1968 if ((or_with_flags & ANYOF_INVERT)
71068078 1969 && ! is_ANYOF_SYNTHETIC(or_with))
a0dd4231 1970 {
cdd87c1d 1971 /* We ignore P2, leaving P1 going forward */
1ea8b7fe 1972 } /* else Not inverted */
765e6ecf 1973 else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1974 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
e0e1be5f 1975 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1976 unsigned int i;
1977 for (i = 0; i < ANYOF_MAX; i += 2) {
1978 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1979 {
1980 ssc_match_all_cp(ssc);
1981 ANYOF_POSIXL_CLEAR(ssc, i);
1982 ANYOF_POSIXL_CLEAR(ssc, i+1);
cdd87c1d
KW
1983 }
1984 }
1985 }
1aa99e6b 1986 }
cdd87c1d
KW
1987
1988 ssc_union(ssc,
1989 ored_cp_list,
1990 FALSE /* Already has been inverted */
1991 );
653099ff
GS
1992}
1993
b423522f
KW
1994PERL_STATIC_INLINE void
1995S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1996{
1997 PERL_ARGS_ASSERT_SSC_UNION;
1998
71068078 1999 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2000
2001 _invlist_union_maybe_complement_2nd(ssc->invlist,
2002 invlist,
2003 invert2nd,
2004 &ssc->invlist);
2005}
2006
2007PERL_STATIC_INLINE void
2008S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2009 SV* const invlist,
2010 const bool invert2nd)
2011{
2012 PERL_ARGS_ASSERT_SSC_INTERSECTION;
2013
71068078 2014 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2015
2016 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2017 invlist,
2018 invert2nd,
2019 &ssc->invlist);
2020}
2021
2022PERL_STATIC_INLINE void
2023S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2024{
2025 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2026
71068078 2027 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2028
2029 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2030}
2031
2032PERL_STATIC_INLINE void
2033S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2034{
2035 /* AND just the single code point 'cp' into the SSC 'ssc' */
2036
2037 SV* cp_list = _new_invlist(2);
2038
2039 PERL_ARGS_ASSERT_SSC_CP_AND;
2040
71068078 2041 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2042
2043 cp_list = add_cp_to_invlist(cp_list, cp);
2044 ssc_intersection(ssc, cp_list,
2045 FALSE /* Not inverted */
2046 );
2047 SvREFCNT_dec_NN(cp_list);
2048}
2049
2050PERL_STATIC_INLINE void
dc3bf405 2051S_ssc_clear_locale(regnode_ssc *ssc)
b423522f
KW
2052{
2053 /* Set the SSC 'ssc' to not match any locale things */
b423522f
KW
2054 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2055
71068078 2056 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2057
2058 ANYOF_POSIXL_ZERO(ssc);
2059 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2060}
2061
b35552de
KW
2062#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2063
2064STATIC bool
2065S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2066{
2067 /* The synthetic start class is used to hopefully quickly winnow down
2068 * places where a pattern could start a match in the target string. If it
2069 * doesn't really narrow things down that much, there isn't much point to
2070 * having the overhead of using it. This function uses some very crude
2071 * heuristics to decide if to use the ssc or not.
2072 *
2073 * It returns TRUE if 'ssc' rules out more than half what it considers to
2074 * be the "likely" possible matches, but of course it doesn't know what the
2075 * actual things being matched are going to be; these are only guesses
2076 *
2077 * For /l matches, it assumes that the only likely matches are going to be
2078 * in the 0-255 range, uniformly distributed, so half of that is 127
2079 * For /a and /d matches, it assumes that the likely matches will be just
2080 * the ASCII range, so half of that is 63
2081 * For /u and there isn't anything matching above the Latin1 range, it
2082 * assumes that that is the only range likely to be matched, and uses
2083 * half that as the cut-off: 127. If anything matches above Latin1,
2084 * it assumes that all of Unicode could match (uniformly), except for
2085 * non-Unicode code points and things in the General Category "Other"
2086 * (unassigned, private use, surrogates, controls and formats). This
2087 * is a much large number. */
2088
b35552de
KW
2089 U32 count = 0; /* Running total of number of code points matched by
2090 'ssc' */
2091 UV start, end; /* Start and end points of current range in inversion
26be5fe6 2092 XXX outdated. UTF-8 locales are common, what about invert? list */
72400949
KW
2093 const U32 max_code_points = (LOC)
2094 ? 256
f6e8f31e 2095 : (( ! UNI_SEMANTICS
1a26cbcb 2096 || invlist_highest(ssc->invlist) < 256)
72400949
KW
2097 ? 128
2098 : NON_OTHER_COUNT);
2099 const U32 max_match = max_code_points / 2;
b35552de
KW
2100
2101 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2102
2103 invlist_iterinit(ssc->invlist);
2104 while (invlist_iternext(ssc->invlist, &start, &end)) {
72400949
KW
2105 if (start >= max_code_points) {
2106 break;
b35552de 2107 }
72400949 2108 end = MIN(end, max_code_points - 1);
b35552de 2109 count += end - start + 1;
72400949 2110 if (count >= max_match) {
b35552de
KW
2111 invlist_iterfinish(ssc->invlist);
2112 return FALSE;
2113 }
2114 }
2115
2116 return TRUE;
2117}
2118
2119
b423522f
KW
2120STATIC void
2121S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2122{
2123 /* The inversion list in the SSC is marked mortal; now we need a more
2124 * permanent copy, which is stored the same way that is done in a regular
dcb20b36
KW
2125 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2126 * map */
b423522f 2127
28118b9c 2128 SV* invlist = invlist_clone(ssc->invlist, NULL);
b423522f
KW
2129
2130 PERL_ARGS_ASSERT_SSC_FINALIZE;
2131
71068078 2132 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f 2133
a0dd4231 2134 /* The code in this file assumes that all but these flags aren't relevant
93e92956
KW
2135 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2136 * by the time we reach here */
f240c685
KW
2137 assert(! (ANYOF_FLAGS(ssc)
2138 & ~( ANYOF_COMMON_FLAGS
108316fb
KW
2139 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2140 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
a0dd4231 2141
b423522f
KW
2142 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2143
4c404f26 2144 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
b423522f 2145
85c8e306
KW
2146 /* Make sure is clone-safe */
2147 ssc->invlist = NULL;
2148
e0e1be5f 2149 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
93e92956 2150 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
d156f5cb 2151 OP(ssc) = ANYOFPOSIXL;
e0e1be5f 2152 }
d156f5cb 2153 else if (RExC_contains_locale) {
b2e90ddf
KW
2154 OP(ssc) = ANYOFL;
2155 }
2156
1462525b 2157 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
b423522f
KW
2158}
2159
a3621e74
YO
2160#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2161#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2162#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
538e84ed
KW
2163#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2164 ? (TRIE_LIST_CUR( idx ) - 1) \
2165 : 0 )
a3621e74 2166
3dab1dad
YO
2167
2168#ifdef DEBUGGING
07be1b83 2169/*
2b8b4781
NC
2170 dump_trie(trie,widecharmap,revcharmap)
2171 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2172 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
2173
2174 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
2175 The _interim_ variants are used for debugging the interim
2176 tables that are used to generate the final compressed
2177 representation which is what dump_trie expects.
2178
486ec47a 2179 Part of the reason for their existence is to provide a form
3dab1dad 2180 of documentation as to how the different representations function.
07be1b83
YO
2181
2182*/
3dab1dad
YO
2183
2184/*
3dab1dad
YO
2185 Dumps the final compressed table form of the trie to Perl_debug_log.
2186 Used for debugging make_trie().
2187*/
b9a59e08 2188
3dab1dad 2189STATIC void
2b8b4781
NC
2190S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2191 AV *revcharmap, U32 depth)
3dab1dad
YO
2192{
2193 U32 state;
ab3bbdeb 2194 SV *sv=sv_newmortal();
55eed653 2195 int colwidth= widecharmap ? 6 : 4;
2e64971a 2196 U16 word;
3dab1dad
YO
2197 GET_RE_DEBUG_FLAGS_DECL;
2198
7918f24d 2199 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 2200
6ad9a8ab 2201 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
1e37780e 2202 depth+1, "Match","Base","Ofs" );
3dab1dad
YO
2203
2204 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 2205 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 2206 if ( tmp ) {
6ad9a8ab 2207 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2208 colwidth,
538e84ed 2209 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2210 PL_colors[0], PL_colors[1],
2211 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2212 PERL_PV_ESCAPE_FIRSTCHAR
2213 )
ab3bbdeb 2214 );
3dab1dad
YO
2215 }
2216 }
1e37780e
YO
2217 Perl_re_printf( aTHX_ "\n");
2218 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
3dab1dad
YO
2219
2220 for( state = 0 ; state < trie->uniquecharcount ; state++ )
6ad9a8ab
YO
2221 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2222 Perl_re_printf( aTHX_ "\n");
3dab1dad 2223
1e2e3d02 2224 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 2225 const U32 base = trie->states[ state ].trans.base;
3dab1dad 2226
147e3846 2227 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
3dab1dad
YO
2228
2229 if ( trie->states[ state ].wordnum ) {
1e37780e 2230 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
3dab1dad 2231 } else {
6ad9a8ab 2232 Perl_re_printf( aTHX_ "%6s", "" );
3dab1dad
YO
2233 }
2234
147e3846 2235 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
3dab1dad
YO
2236
2237 if ( base ) {
2238 U32 ofs = 0;
2239
2240 while( ( base + ofs < trie->uniquecharcount ) ||
2241 ( base + ofs - trie->uniquecharcount < trie->lasttrans
538e84ed
KW
2242 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2243 != state))
3dab1dad
YO
2244 ofs++;
2245
147e3846 2246 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
3dab1dad
YO
2247
2248 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
538e84ed
KW
2249 if ( ( base + ofs >= trie->uniquecharcount )
2250 && ( base + ofs - trie->uniquecharcount
2251 < trie->lasttrans )
2252 && trie->trans[ base + ofs
2253 - trie->uniquecharcount ].check == state )
3dab1dad 2254 {
147e3846 2255 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
1e37780e
YO
2256 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2257 );
3dab1dad 2258 } else {
88f063b4 2259 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
3dab1dad
YO
2260 }
2261 }
2262
6ad9a8ab 2263 Perl_re_printf( aTHX_ "]");
3dab1dad
YO
2264
2265 }
6ad9a8ab 2266 Perl_re_printf( aTHX_ "\n" );
3dab1dad 2267 }
6ad9a8ab 2268 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
cb41e5d6 2269 depth);
2e64971a 2270 for (word=1; word <= trie->wordcount; word++) {
6ad9a8ab 2271 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2e64971a
DM
2272 (int)word, (int)(trie->wordinfo[word].prev),
2273 (int)(trie->wordinfo[word].len));
2274 }
6ad9a8ab 2275 Perl_re_printf( aTHX_ "\n" );
538e84ed 2276}
3dab1dad 2277/*
3dab1dad 2278 Dumps a fully constructed but uncompressed trie in list form.
538e84ed 2279 List tries normally only are used for construction when the number of
3dab1dad
YO
2280 possible chars (trie->uniquecharcount) is very high.
2281 Used for debugging make_trie().
2282*/
2283STATIC void
55eed653 2284S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2285 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2286 U32 depth)
3dab1dad
YO
2287{
2288 U32 state;
ab3bbdeb 2289 SV *sv=sv_newmortal();
55eed653 2290 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2291 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2292
2293 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2294
3dab1dad 2295 /* print out the table precompression. */
6ad9a8ab 2296 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
cb41e5d6 2297 depth+1 );
6ad9a8ab 2298 Perl_re_indentf( aTHX_ "%s",
cb41e5d6 2299 depth+1, "------:-----+-----------------\n" );
538e84ed 2300
3dab1dad
YO
2301 for( state=1 ; state < next_alloc ; state ++ ) {
2302 U16 charid;
538e84ed 2303
147e3846 2304 Perl_re_indentf( aTHX_ " %4" UVXf " :",
cb41e5d6 2305 depth+1, (UV)state );
3dab1dad 2306 if ( ! trie->states[ state ].wordnum ) {
6ad9a8ab 2307 Perl_re_printf( aTHX_ "%5s| ","");
3dab1dad 2308 } else {
6ad9a8ab 2309 Perl_re_printf( aTHX_ "W%4x| ",
3dab1dad
YO
2310 trie->states[ state ].wordnum
2311 );
2312 }
2313 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
538e84ed 2314 SV ** const tmp = av_fetch( revcharmap,
88f063b4 2315 TRIE_LIST_ITEM(state, charid).forid, 0);
ab3bbdeb 2316 if ( tmp ) {
147e3846 2317 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
ab3bbdeb 2318 colwidth,
538e84ed
KW
2319 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2320 colwidth,
2321 PL_colors[0], PL_colors[1],
2322 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2323 | PERL_PV_ESCAPE_FIRSTCHAR
ab3bbdeb 2324 ) ,
88f063b4
KW
2325 TRIE_LIST_ITEM(state, charid).forid,
2326 (UV)TRIE_LIST_ITEM(state, charid).newstate
1e2e3d02 2327 );
538e84ed 2328 if (!(charid % 10))
6ad9a8ab 2329 Perl_re_printf( aTHX_ "\n%*s| ",
664e119d 2330 (int)((depth * 2) + 14), "");
1e2e3d02 2331 }
ab3bbdeb 2332 }
6ad9a8ab 2333 Perl_re_printf( aTHX_ "\n");
3dab1dad 2334 }
538e84ed 2335}
3dab1dad
YO
2336
2337/*
3dab1dad 2338 Dumps a fully constructed but uncompressed trie in table form.
538e84ed
KW
2339 This is the normal DFA style state transition table, with a few
2340 twists to facilitate compression later.
3dab1dad
YO
2341 Used for debugging make_trie().
2342*/
2343STATIC void
55eed653 2344S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2345 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2346 U32 depth)
3dab1dad
YO
2347{
2348 U32 state;
2349 U16 charid;
ab3bbdeb 2350 SV *sv=sv_newmortal();
55eed653 2351 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2352 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2353
2354 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
538e84ed 2355
3dab1dad
YO
2356 /*
2357 print out the table precompression so that we can do a visual check
2358 that they are identical.
2359 */
538e84ed 2360
6ad9a8ab 2361 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
3dab1dad
YO
2362
2363 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 2364 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 2365 if ( tmp ) {
6ad9a8ab 2366 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2367 colwidth,
538e84ed 2368 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2369 PL_colors[0], PL_colors[1],
2370 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2371 PERL_PV_ESCAPE_FIRSTCHAR
2372 )
ab3bbdeb 2373 );
3dab1dad
YO
2374 }
2375 }
2376
4aaafc03
YO
2377 Perl_re_printf( aTHX_ "\n");
2378 Perl_re_indentf( aTHX_ "State+-", depth+1 );
3dab1dad
YO
2379
2380 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
6ad9a8ab 2381 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
3dab1dad
YO
2382 }
2383
6ad9a8ab 2384 Perl_re_printf( aTHX_ "\n" );
3dab1dad
YO
2385
2386 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2387
147e3846 2388 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
cb41e5d6 2389 depth+1,
3dab1dad
YO
2390 (UV)TRIE_NODENUM( state ) );
2391
2392 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
2393 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2394 if (v)
147e3846 2395 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
ab3bbdeb 2396 else
6ad9a8ab 2397 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
3dab1dad
YO
2398 }
2399 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
147e3846 2400 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
538e84ed 2401 (UV)trie->trans[ state ].check );
3dab1dad 2402 } else {
147e3846 2403 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
538e84ed 2404 (UV)trie->trans[ state ].check,
3dab1dad
YO
2405 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2406 }
2407 }
07be1b83 2408}
3dab1dad
YO
2409
2410#endif
2411
2e64971a 2412
786e8c11
YO
2413/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2414 startbranch: the first branch in the whole branch sequence
2415 first : start branch of sequence of branch-exact nodes.
2416 May be the same as startbranch
2417 last : Thing following the last branch.
2418 May be the same as tail.
2419 tail : item following the branch sequence
2420 count : words in the sequence
a4525e78 2421 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
786e8c11 2422 depth : indent depth
3dab1dad 2423
786e8c11 2424Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 2425
786e8c11
YO
2426A trie is an N'ary tree where the branches are determined by digital
2427decomposition of the key. IE, at the root node you look up the 1st character and
2428follow that branch repeat until you find the end of the branches. Nodes can be
2429marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 2430
786e8c11 2431 /he|she|his|hers/
72f13be8 2432
786e8c11
YO
2433would convert into the following structure. Numbers represent states, letters
2434following numbers represent valid transitions on the letter from that state, if
2435the number is in square brackets it represents an accepting state, otherwise it
2436will be in parenthesis.
07be1b83 2437
786e8c11
YO
2438 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2439 | |
2440 | (2)
2441 | |
2442 (1) +-i->(6)-+-s->[7]
2443 |
2444 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 2445
786e8c11
YO
2446 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2447
2448This shows that when matching against the string 'hers' we will begin at state 1
2449read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2450then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2451is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2452single traverse. We store a mapping from accepting to state to which word was
2453matched, and then when we have multiple possibilities we try to complete the
b8fda935 2454rest of the regex in the order in which they occurred in the alternation.
786e8c11
YO
2455
2456The only prior NFA like behaviour that would be changed by the TRIE support is
2457the silent ignoring of duplicate alternations which are of the form:
2458
2459 / (DUPE|DUPE) X? (?{ ... }) Y /x
2460
4b714af6 2461Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 2462and without the optimisation. With the optimisations dupes will be silently
486ec47a 2463ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
2464the following demonstrates:
2465
2466 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2467
2468which prints out 'word' three times, but
2469
2470 'words'=~/(word|word|word)(?{ print $1 })S/
2471
2472which doesnt print it out at all. This is due to other optimisations kicking in.
2473
2474Example of what happens on a structural level:
2475
486ec47a 2476The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
2477
2478 1: CURLYM[1] {1,32767}(18)
2479 5: BRANCH(8)
2480 6: EXACT <ac>(16)
2481 8: BRANCH(11)
2482 9: EXACT <ad>(16)
2483 11: BRANCH(14)
2484 12: EXACT <ab>(16)
2485 16: SUCCEED(0)
2486 17: NOTHING(18)
2487 18: END(0)
2488
2489This would be optimizable with startbranch=5, first=5, last=16, tail=16
2490and should turn into:
2491
2492 1: CURLYM[1] {1,32767}(18)
2493 5: TRIE(16)
2494 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2495 <ac>
2496 <ad>
2497 <ab>
2498 16: SUCCEED(0)
2499 17: NOTHING(18)
2500 18: END(0)
2501
2502Cases where tail != last would be like /(?foo|bar)baz/:
2503
2504 1: BRANCH(4)
2505 2: EXACT <foo>(8)
2506 4: BRANCH(7)
2507 5: EXACT <bar>(8)
2508 7: TAIL(8)
2509 8: EXACT <baz>(10)
2510 10: END(0)
2511
2512which would be optimizable with startbranch=1, first=1, last=7, tail=8
2513and would end up looking like:
2514
2515 1: TRIE(8)
2516 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2517 <foo>
2518 <bar>
2519 7: TAIL(8)
2520 8: EXACT <baz>(10)
2521 10: END(0)
2522
c80e42f3 2523 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
2524
2525is the recommended Unicode-aware way of saying
2526
2527 *(d++) = uv;
2528*/
2529
fab2782b 2530#define TRIE_STORE_REVCHAR(val) \
786e8c11 2531 STMT_START { \
73031816 2532 if (UTF) { \
668fcfea 2533 SV *zlopp = newSV(UTF8_MAXBYTES); \
88c9ea1e 2534 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1d84a256
DM
2535 unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2536 *kapow = '\0'; \
73031816
NC
2537 SvCUR_set(zlopp, kapow - flrbbbbb); \
2538 SvPOK_on(zlopp); \
2539 SvUTF8_on(zlopp); \
2540 av_push(revcharmap, zlopp); \
2541 } else { \
fab2782b 2542 char ooooff = (char)val; \
73031816
NC
2543 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2544 } \
2545 } STMT_END
786e8c11 2546
914a25d5
KW
2547/* This gets the next character from the input, folding it if not already
2548 * folded. */
2549#define TRIE_READ_CHAR STMT_START { \
2550 wordlen++; \
2551 if ( UTF ) { \
2552 /* if it is UTF then it is either already folded, or does not need \
2553 * folding */ \
1c1d615a 2554 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
2555 } \
2556 else if (folder == PL_fold_latin1) { \
7d006b13
KW
2557 /* This folder implies Unicode rules, which in the range expressible \
2558 * by not UTF is the lower case, with the two exceptions, one of \
2559 * which should have been taken care of before calling this */ \
2560 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2561 uvc = toLOWER_L1(*uc); \
2562 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2563 len = 1; \
914a25d5
KW
2564 } else { \
2565 /* raw data, will be folded later if needed */ \
2566 uvc = (U32)*uc; \
2567 len = 1; \
2568 } \
786e8c11
YO
2569} STMT_END
2570
2571
2572
2573#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2574 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
00195859 2575 U32 ging = TRIE_LIST_LEN( state ) * 2; \
f9003953 2576 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
00195859 2577 TRIE_LIST_LEN( state ) = ging; \
786e8c11
YO
2578 } \
2579 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2580 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2581 TRIE_LIST_CUR( state )++; \
2582} STMT_END
07be1b83 2583
786e8c11 2584#define TRIE_LIST_NEW(state) STMT_START { \
d09f14bf 2585 Newx( trie->states[ state ].trans.list, \
786e8c11
YO
2586 4, reg_trie_trans_le ); \
2587 TRIE_LIST_CUR( state ) = 1; \
2588 TRIE_LIST_LEN( state ) = 4; \
2589} STMT_END
07be1b83 2590
786e8c11
YO
2591#define TRIE_HANDLE_WORD(state) STMT_START { \
2592 U16 dupe= trie->states[ state ].wordnum; \
2593 regnode * const noper_next = regnext( noper ); \
2594 \
786e8c11
YO
2595 DEBUG_r({ \
2596 /* store the word for dumping */ \
2597 SV* tmp; \
2598 if (OP(noper) != NOTHING) \
740cce10 2599 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 2600 else \
740cce10 2601 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 2602 av_push( trie_words, tmp ); \
786e8c11
YO
2603 }); \
2604 \
2605 curword++; \
2e64971a
DM
2606 trie->wordinfo[curword].prev = 0; \
2607 trie->wordinfo[curword].len = wordlen; \
2608 trie->wordinfo[curword].accept = state; \
786e8c11
YO
2609 \
2610 if ( noper_next < tail ) { \
2611 if (!trie->jump) \
538e84ed
KW
2612 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2613 sizeof(U16) ); \
7f69552c 2614 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
2615 if (!jumper) \
2616 jumper = noper_next; \
2617 if (!nextbranch) \
2618 nextbranch= regnext(cur); \
2619 } \
2620 \
2621 if ( dupe ) { \
2e64971a
DM
2622 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2623 /* chain, so that when the bits of chain are later */\
2624 /* linked together, the dups appear in the chain */\
2625 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2626 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
2627 } else { \
2628 /* we haven't inserted this word yet. */ \
2629 trie->states[ state ].wordnum = curword; \
2630 } \
2631} STMT_END
07be1b83 2632
3dab1dad 2633
786e8c11
YO
2634#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2635 ( ( base + charid >= ucharcount \
2636 && base + charid < ubound \
2637 && state == trie->trans[ base - ucharcount + charid ].check \
2638 && trie->trans[ base - ucharcount + charid ].next ) \
2639 ? trie->trans[ base - ucharcount + charid ].next \
2640 : ( state==1 ? special : 0 ) \
2641 )
3dab1dad 2642
8bcafbf4
YO
2643#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2644STMT_START { \
2645 TRIE_BITMAP_SET(trie, uvc); \
2646 /* store the folded codepoint */ \
2647 if ( folder ) \
2648 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2649 \
2650 if ( !UTF ) { \
2651 /* store first byte of utf8 representation of */ \
2652 /* variant codepoints */ \
2653 if (! UVCHR_IS_INVARIANT(uvc)) { \
2654 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2655 } \
2656 } \
2657} STMT_END
786e8c11
YO
2658#define MADE_TRIE 1
2659#define MADE_JUMP_TRIE 2
2660#define MADE_EXACT_TRIE 4
3dab1dad 2661
a3621e74 2662STATIC I32
538e84ed
KW
2663S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2664 regnode *first, regnode *last, regnode *tail,
2665 U32 word_count, U32 flags, U32 depth)
a3621e74
YO
2666{
2667 /* first pass, loop through and scan words */
2668 reg_trie_data *trie;
55eed653 2669 HV *widecharmap = NULL;
2b8b4781 2670 AV *revcharmap = newAV();
a3621e74 2671 regnode *cur;
a3621e74
YO
2672 STRLEN len = 0;
2673 UV uvc = 0;
2674 U16 curword = 0;
2675 U32 next_alloc = 0;
786e8c11
YO
2676 regnode *jumper = NULL;
2677 regnode *nextbranch = NULL;
7f69552c 2678 regnode *convert = NULL;
2e64971a 2679 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 2680 /* we just use folder as a flag in utf8 */
1e696034 2681 const U8 * folder = NULL;
a3621e74 2682
3a611511
YO
2683 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2684 * which stands for one trie structure, one hash, optionally followed
2685 * by two arrays */
2b8b4781 2686#ifdef DEBUGGING
3a611511 2687 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2b8b4781
NC
2688 AV *trie_words = NULL;
2689 /* along with revcharmap, this only used during construction but both are
2690 * useful during debugging so we store them in the struct when debugging.
8e11feef 2691 */
2b8b4781 2692#else
cf78de0b 2693 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 2694 STRLEN trie_charcount=0;
3dab1dad 2695#endif
2b8b4781 2696 SV *re_trie_maxbuff;
a3621e74 2697 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2698
2699 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
2700#ifndef DEBUGGING
2701 PERL_UNUSED_ARG(depth);
2702#endif
a3621e74 2703
1e696034 2704 switch (flags) {
f6b4b99d 2705 case EXACT: case EXACT_ONLY8: case EXACTL: break;
89829bb5 2706 case EXACTFAA:
0ea669f4 2707 case EXACTFUP:
a4525e78
KW
2708 case EXACTFU:
2709 case EXACTFLU8: folder = PL_fold_latin1; break;
1e696034 2710 case EXACTF: folder = PL_fold; break;
fab2782b 2711 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
2712 }
2713
c944940b 2714 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 2715 trie->refcount = 1;
3dab1dad 2716 trie->startstate = 1;
786e8c11 2717 trie->wordcount = word_count;
f8fc2ecf 2718 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 2719 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
f6b4b99d 2720 if (flags == EXACT || flags == EXACT_ONLY8 || flags == EXACTL)
c944940b 2721 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
2722 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2723 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2724
a3621e74 2725 DEBUG_r({
2b8b4781 2726 trie_words = newAV();
a3621e74 2727 });
a3621e74 2728
3b89859a 2729 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
316ebaf2 2730 assert(re_trie_maxbuff);
a3621e74 2731 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 2732 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 2733 }
df826430 2734 DEBUG_TRIE_COMPILE_r({
6ad9a8ab 2735 Perl_re_indentf( aTHX_
cb41e5d6
YO
2736 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2737 depth+1,
88f063b4 2738 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
538e84ed 2739 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
3dab1dad 2740 });
538e84ed 2741
7f69552c
YO
2742 /* Find the node we are going to overwrite */
2743 if ( first == startbranch && OP( last ) != BRANCH ) {
2744 /* whole branch chain */
2745 convert = first;
2746 } else {
2747 /* branch sub-chain */
2748 convert = NEXTOPER( first );
2749 }
538e84ed 2750
a3621e74
YO
2751 /* -- First loop and Setup --
2752
2753 We first traverse the branches and scan each word to determine if it
2754 contains widechars, and how many unique chars there are, this is
2755 important as we have to build a table with at least as many columns as we
2756 have unique chars.
2757
2758 We use an array of integers to represent the character codes 0..255
538e84ed
KW
2759 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2760 the native representation of the character value as the key and IV's for
2761 the coded index.
a3621e74
YO
2762
2763 *TODO* If we keep track of how many times each character is used we can
2764 remap the columns so that the table compression later on is more
3b753521 2765 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
2766 middle and the least common are on the outside. IMO this would be better
2767 than a most to least common mapping as theres a decent chance the most
2768 common letter will share a node with the least common, meaning the node
486ec47a 2769 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
2770 case is when we have the least common nodes twice.
2771
2772 */
2773
a3621e74 2774 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 2775 regnode *noper = NEXTOPER( cur );
944e05e3
YO
2776 const U8 *uc;
2777 const U8 *e;
bc031a7d 2778 int foldlen = 0;
07be1b83 2779 U32 wordlen = 0; /* required init */
bc031a7d
KW
2780 STRLEN minchars = 0;
2781 STRLEN maxchars = 0;
538e84ed
KW
2782 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2783 bitmap?*/
a3621e74 2784
3dab1dad 2785 if (OP(noper) == NOTHING) {
20ed8c88
YO
2786 /* skip past a NOTHING at the start of an alternation
2787 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2788 */
df826430 2789 regnode *noper_next= regnext(noper);
944e05e3
YO
2790 if (noper_next < tail)
2791 noper= noper_next;
2792 }
2793
f6b4b99d
KW
2794 if ( noper < tail
2795 && ( OP(noper) == flags
2796 || (flags == EXACT && OP(noper) == EXACT_ONLY8)
a9f8c7ac 2797 || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
0ea669f4 2798 || OP(noper) == EXACTFUP))))
f6b4b99d 2799 {
944e05e3
YO
2800 uc= (U8*)STRING(noper);
2801 e= uc + STR_LEN(noper);
2802 } else {
2803 trie->minlen= 0;
2804 continue;
3dab1dad 2805 }
df826430 2806
944e05e3 2807
fab2782b 2808 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2809 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2810 regardless of encoding */
0ea669f4 2811 if (OP( noper ) == EXACTFUP) {
fab2782b 2812 /* false positives are ok, so just set this */
0dc4a61d 2813 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2814 }
2815 }
dca5fc4c 2816
bc031a7d
KW
2817 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2818 branch */
3dab1dad 2819 TRIE_CHARCOUNT(trie)++;
a3621e74 2820 TRIE_READ_CHAR;
645de4ce 2821
bc031a7d
KW
2822 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2823 * is in effect. Under /i, this character can match itself, or
2824 * anything that folds to it. If not under /i, it can match just
2825 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2826 * all fold to k, and all are single characters. But some folds
2827 * expand to more than one character, so for example LATIN SMALL
2828 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2829 * the string beginning at 'uc' is 'ffi', it could be matched by
2830 * three characters, or just by the one ligature character. (It
2831 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2832 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2833 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2834 * match.) The trie needs to know the minimum and maximum number
2835 * of characters that could match so that it can use size alone to
2836 * quickly reject many match attempts. The max is simple: it is
2837 * the number of folded characters in this branch (since a fold is
2838 * never shorter than what folds to it. */
2839
2840 maxchars++;
2841
2842 /* And the min is equal to the max if not under /i (indicated by
2843 * 'folder' being NULL), or there are no multi-character folds. If
2844 * there is a multi-character fold, the min is incremented just
2845 * once, for the character that folds to the sequence. Each
2846 * character in the sequence needs to be added to the list below of
2847 * characters in the trie, but we count only the first towards the
2848 * min number of characters needed. This is done through the
2849 * variable 'foldlen', which is returned by the macros that look
2850 * for these sequences as the number of bytes the sequence
2851 * occupies. Each time through the loop, we decrement 'foldlen' by
2852 * how many bytes the current char occupies. Only when it reaches
2853 * 0 do we increment 'minchars' or look for another multi-character
2854 * sequence. */
2855 if (folder == NULL) {
2856 minchars++;
2857 }
2858 else if (foldlen > 0) {
2859 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
645de4ce
KW
2860 }
2861 else {
bc031a7d
KW
2862 minchars++;
2863
2864 /* See if *uc is the beginning of a multi-character fold. If
2865 * so, we decrement the length remaining to look at, to account
2866 * for the current character this iteration. (We can use 'uc'
2867 * instead of the fold returned by TRIE_READ_CHAR because for
2868 * non-UTF, the latin1_safe macro is smart enough to account
2869 * for all the unfolded characters, and because for UTF, the
2870 * string will already have been folded earlier in the
2871 * compilation process */
2872 if (UTF) {
2873 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2874 foldlen -= UTF8SKIP(uc);
645de4ce
KW
2875 }
2876 }
bc031a7d
KW
2877 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2878 foldlen--;
2879 }
645de4ce 2880 }
bc031a7d
KW
2881
2882 /* The current character (and any potential folds) should be added
2883 * to the possible matching characters for this position in this
2884 * branch */
a3621e74 2885 if ( uvc < 256 ) {
fab2782b
YO
2886 if ( folder ) {
2887 U8 folded= folder[ (U8) uvc ];
2888 if ( !trie->charmap[ folded ] ) {
2889 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2890 TRIE_STORE_REVCHAR( folded );
2891 }
2892 }
a3621e74
YO
2893 if ( !trie->charmap[ uvc ] ) {
2894 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2895 TRIE_STORE_REVCHAR( uvc );
a3621e74 2896 }
02daf0ab 2897 if ( set_bit ) {
62012aee
KW
2898 /* store the codepoint in the bitmap, and its folded
2899 * equivalent. */
8bcafbf4 2900 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
02daf0ab
YO
2901 set_bit = 0; /* We've done our bit :-) */
2902 }
a3621e74 2903 } else {
bc031a7d
KW
2904
2905 /* XXX We could come up with the list of code points that fold
2906 * to this using PL_utf8_foldclosures, except not for
2907 * multi-char folds, as there may be multiple combinations
2908 * there that could work, which needs to wait until runtime to
2909 * resolve (The comment about LIGATURE FFI above is such an
2910 * example */
2911
a3621e74 2912 SV** svpp;
55eed653
NC
2913 if ( !widecharmap )
2914 widecharmap = newHV();
a3621e74 2915
55eed653 2916 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
2917
2918 if ( !svpp )
147e3846 2919 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
a3621e74
YO
2920
2921 if ( !SvTRUE( *svpp ) ) {
2922 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 2923 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
2924 }
2925 }
bc031a7d
KW
2926 } /* end loop through characters in this branch of the trie */
2927
2928 /* We take the min and max for this branch and combine to find the min
2929 * and max for all branches processed so far */
3dab1dad 2930 if( cur == first ) {
bc031a7d
KW
2931 trie->minlen = minchars;
2932 trie->maxlen = maxchars;
2933 } else if (minchars < trie->minlen) {
2934 trie->minlen = minchars;
2935 } else if (maxchars > trie->maxlen) {
2936 trie->maxlen = maxchars;
fab2782b 2937 }
a3621e74
YO
2938 } /* end first pass */
2939 DEBUG_TRIE_COMPILE_r(
6ad9a8ab 2940 Perl_re_indentf( aTHX_
cb41e5d6
YO
2941 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2942 depth+1,
55eed653 2943 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
2944 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2945 (int)trie->minlen, (int)trie->maxlen )
a3621e74 2946 );
a3621e74
YO
2947
2948 /*
2949 We now know what we are dealing with in terms of unique chars and
2950 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
2951 representation using a flat table will take. If it's over a reasonable
2952 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
2953 conservative but potentially much slower representation using an array
2954 of lists.
2955
2956 At the end we convert both representations into the same compressed
2957 form that will be used in regexec.c for matching with. The latter
2958 is a form that cannot be used to construct with but has memory
2959 properties similar to the list form and access properties similar
2960 to the table form making it both suitable for fast searches and
2961 small enough that its feasable to store for the duration of a program.
2962
2963 See the comment in the code where the compressed table is produced
2964 inplace from the flat tabe representation for an explanation of how
2965 the compression works.
2966
2967 */
2968
2969
2e64971a
DM
2970 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2971 prev_states[1] = 0;
2972
538e84ed
KW
2973 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2974 > SvIV(re_trie_maxbuff) )
2975 {
a3621e74
YO
2976 /*
2977 Second Pass -- Array Of Lists Representation
2978
2979 Each state will be represented by a list of charid:state records
2980 (reg_trie_trans_le) the first such element holds the CUR and LEN
2981 points of the allocated array. (See defines above).
2982
2983 We build the initial structure using the lists, and then convert
2984 it into the compressed table form which allows faster lookups
2985 (but cant be modified once converted).
a3621e74
YO
2986 */
2987
a3621e74
YO
2988 STRLEN transcount = 1;
2989
6ad9a8ab 2990 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
cb41e5d6 2991 depth+1));
686b73d4 2992
c944940b
JH
2993 trie->states = (reg_trie_state *)
2994 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2995 sizeof(reg_trie_state) );
a3621e74
YO
2996 TRIE_LIST_NEW(1);
2997 next_alloc = 2;
2998
2999 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3000
df826430 3001 regnode *noper = NEXTOPER( cur );
c445ea15
AL
3002 U32 state = 1; /* required init */
3003 U16 charid = 0; /* sanity init */
07be1b83 3004 U32 wordlen = 0; /* required init */
c445ea15 3005
df826430
YO
3006 if (OP(noper) == NOTHING) {
3007 regnode *noper_next= regnext(noper);
944e05e3
YO
3008 if (noper_next < tail)
3009 noper= noper_next;
df826430
YO
3010 }
3011
f6b4b99d
KW
3012 if ( noper < tail
3013 && ( OP(noper) == flags
3014 || (flags == EXACT && OP(noper) == EXACT_ONLY8)
a9f8c7ac 3015 || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
0ea669f4 3016 || OP(noper) == EXACTFUP))))
f6b4b99d 3017 {
944e05e3
YO
3018 const U8 *uc= (U8*)STRING(noper);
3019 const U8 *e= uc + STR_LEN(noper);
3020
786e8c11 3021 for ( ; uc < e ; uc += len ) {
c445ea15 3022
786e8c11 3023 TRIE_READ_CHAR;
c445ea15 3024
786e8c11
YO
3025 if ( uvc < 256 ) {
3026 charid = trie->charmap[ uvc ];
c445ea15 3027 } else {
538e84ed
KW
3028 SV** const svpp = hv_fetch( widecharmap,
3029 (char*)&uvc,
3030 sizeof( UV ),
3031 0);
786e8c11
YO
3032 if ( !svpp ) {
3033 charid = 0;
3034 } else {
3035 charid=(U16)SvIV( *svpp );
3036 }
c445ea15 3037 }
538e84ed
KW
3038 /* charid is now 0 if we dont know the char read, or
3039 * nonzero if we do */
786e8c11 3040 if ( charid ) {
a3621e74 3041
786e8c11
YO
3042 U16 check;
3043 U32 newstate = 0;
a3621e74 3044
786e8c11
YO
3045 charid--;
3046 if ( !trie->states[ state ].trans.list ) {
3047 TRIE_LIST_NEW( state );
c445ea15 3048 }
538e84ed
KW
3049 for ( check = 1;
3050 check <= TRIE_LIST_USED( state );
3051 check++ )
3052 {
3053 if ( TRIE_LIST_ITEM( state, check ).forid
3054 == charid )
3055 {
786e8c11
YO
3056 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3057 break;
3058 }
3059 }
3060 if ( ! newstate ) {
3061 newstate = next_alloc++;
2e64971a 3062 prev_states[newstate] = state;
786e8c11
YO
3063 TRIE_LIST_PUSH( state, charid, newstate );
3064 transcount++;
3065 }
3066 state = newstate;
3067 } else {
147e3846 3068 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
c445ea15 3069 }
a28509cc 3070 }
c445ea15 3071 }
3dab1dad 3072 TRIE_HANDLE_WORD(state);
a3621e74
YO
3073
3074 } /* end second pass */
3075
1e2e3d02 3076 /* next alloc is the NEXT state to be allocated */
538e84ed 3077 trie->statecount = next_alloc;
c944940b
JH
3078 trie->states = (reg_trie_state *)
3079 PerlMemShared_realloc( trie->states,
3080 next_alloc
3081 * sizeof(reg_trie_state) );
a3621e74 3082
3dab1dad 3083 /* and now dump it out before we compress it */
2b8b4781
NC
3084 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3085 revcharmap, next_alloc,
3086 depth+1)
1e2e3d02 3087 );
a3621e74 3088
c944940b
JH
3089 trie->trans = (reg_trie_trans *)
3090 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
3091 {
3092 U32 state;
a3621e74
YO
3093 U32 tp = 0;
3094 U32 zp = 0;
3095
3096
3097 for( state=1 ; state < next_alloc ; state ++ ) {
3098 U32 base=0;
3099
3100 /*
3101 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3102 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
a3621e74
YO
3103 );
3104 */
3105
3106 if (trie->states[state].trans.list) {
3107 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3108 U16 maxid=minid;
a28509cc 3109 U16 idx;
a3621e74
YO
3110
3111 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
3112 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3113 if ( forid < minid ) {
3114 minid=forid;
3115 } else if ( forid > maxid ) {
3116 maxid=forid;
3117 }
a3621e74
YO
3118 }
3119 if ( transcount < tp + maxid - minid + 1) {
3120 transcount *= 2;
c944940b
JH
3121 trie->trans = (reg_trie_trans *)
3122 PerlMemShared_realloc( trie->trans,
446bd890
NC
3123 transcount
3124 * sizeof(reg_trie_trans) );
538e84ed
KW
3125 Zero( trie->trans + (transcount / 2),
3126 transcount / 2,
3127 reg_trie_trans );
a3621e74
YO
3128 }
3129 base = trie->uniquecharcount + tp - minid;
3130 if ( maxid == minid ) {
3131 U32 set = 0;
3132 for ( ; zp < tp ; zp++ ) {
3133 if ( ! trie->trans[ zp ].next ) {
3134 base = trie->uniquecharcount + zp - minid;
538e84ed
KW
3135 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3136 1).newstate;
a3621e74
YO
3137 trie->trans[ zp ].check = state;
3138 set = 1;
3139 break;
3140 }
3141 }
3142 if ( !set ) {
538e84ed
KW
3143 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3144 1).newstate;
a3621e74
YO
3145 trie->trans[ tp ].check = state;
3146 tp++;
3147 zp = tp;
3148 }
3149 } else {
3150 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
538e84ed
KW
3151 const U32 tid = base
3152 - trie->uniquecharcount
3153 + TRIE_LIST_ITEM( state, idx ).forid;
3154 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3155 idx ).newstate;
a3621e74
YO
3156 trie->trans[ tid ].check = state;
3157 }
3158 tp += ( maxid - minid + 1 );
3159 }
3160 Safefree(trie->states[ state ].trans.list);
3161 }
3162 /*
3163 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3164 Perl_re_printf( aTHX_ " base: %d\n",base);
a3621e74
YO
3165 );
3166 */
3167 trie->states[ state ].trans.base=base;
3168 }
cc601c31 3169 trie->lasttrans = tp + 1;
a3621e74
YO
3170 }
3171 } else {
3172 /*
3173 Second Pass -- Flat Table Representation.
3174
b423522f
KW
3175 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3176 each. We know that we will need Charcount+1 trans at most to store
3177 the data (one row per char at worst case) So we preallocate both
3178 structures assuming worst case.
a3621e74
YO
3179
3180 We then construct the trie using only the .next slots of the entry
3181 structs.
3182
b423522f
KW
3183 We use the .check field of the first entry of the node temporarily
3184 to make compression both faster and easier by keeping track of how
3185 many non zero fields are in the node.
a3621e74
YO
3186
3187 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3188 transition.
3189
b423522f
KW
3190 There are two terms at use here: state as a TRIE_NODEIDX() which is
3191 a number representing the first entry of the node, and state as a
3192 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3193 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3194 if there are 2 entrys per node. eg:
a3621e74
YO
3195
3196 A B A B
3197 1. 2 4 1. 3 7
3198 2. 0 3 3. 0 5
3199 3. 0 0 5. 0 0
3200 4. 0 0 7. 0 0
3201
b423522f
KW
3202 The table is internally in the right hand, idx form. However as we
3203 also have to deal with the states array which is indexed by nodenum
3204 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
3205
3206 */
6ad9a8ab 3207 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
cb41e5d6 3208 depth+1));
3dab1dad 3209
c944940b
JH
3210 trie->trans = (reg_trie_trans *)
3211 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3212 * trie->uniquecharcount + 1,
3213 sizeof(reg_trie_trans) );
3214 trie->states = (reg_trie_state *)
3215 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3216 sizeof(reg_trie_state) );
a3621e74
YO
3217 next_alloc = trie->uniquecharcount + 1;
3218
3dab1dad 3219
a3621e74
YO
3220 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3221
df826430 3222 regnode *noper = NEXTOPER( cur );
a3621e74
YO
3223
3224 U32 state = 1; /* required init */
3225
3226 U16 charid = 0; /* sanity init */
3227 U32 accept_state = 0; /* sanity init */
a3621e74 3228
07be1b83 3229 U32 wordlen = 0; /* required init */
a3621e74 3230
df826430
YO
3231 if (OP(noper) == NOTHING) {
3232 regnode *noper_next= regnext(noper);
944e05e3
YO
3233 if (noper_next < tail)
3234 noper= noper_next;
df826430 3235 }
fab2782b 3236
f6b4b99d
KW
3237 if ( noper < tail
3238 && ( OP(noper) == flags
3239 || (flags == EXACT && OP(noper) == EXACT_ONLY8)
a9f8c7ac 3240 || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
0ea669f4 3241 || OP(noper) == EXACTFUP))))
f6b4b99d 3242 {
944e05e3
YO
3243 const U8 *uc= (U8*)STRING(noper);
3244 const U8 *e= uc + STR_LEN(noper);
3245
786e8c11 3246 for ( ; uc < e ; uc += len ) {
a3621e74 3247
786e8c11 3248 TRIE_READ_CHAR;
a3621e74 3249
786e8c11
YO
3250 if ( uvc < 256 ) {
3251 charid = trie->charmap[ uvc ];
3252 } else {
538e84ed
KW
3253 SV* const * const svpp = hv_fetch( widecharmap,
3254 (char*)&uvc,
3255 sizeof( UV ),
3256 0);
786e8c11 3257 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 3258 }
786e8c11
YO
3259 if ( charid ) {
3260 charid--;
3261 if ( !trie->trans[ state + charid ].next ) {
3262 trie->trans[ state + charid ].next = next_alloc;
3263 trie->trans[ state ].check++;
2e64971a
DM
3264 prev_states[TRIE_NODENUM(next_alloc)]
3265 = TRIE_NODENUM(state);
786e8c11
YO
3266 next_alloc += trie->uniquecharcount;
3267 }
3268 state = trie->trans[ state + charid ].next;
3269 } else {
147e3846 3270 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
786e8c11 3271 }
538e84ed
KW
3272 /* charid is now 0 if we dont know the char read, or
3273 * nonzero if we do */
a3621e74 3274 }
a3621e74 3275 }
3dab1dad
YO
3276 accept_state = TRIE_NODENUM( state );
3277 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
3278
3279 } /* end second pass */
3280
3dab1dad 3281 /* and now dump it out before we compress it */
2b8b4781
NC
3282 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3283 revcharmap,
3284 next_alloc, depth+1));
a3621e74 3285
a3621e74
YO
3286 {
3287 /*
3288 * Inplace compress the table.*
3289
3290 For sparse data sets the table constructed by the trie algorithm will
3291 be mostly 0/FAIL transitions or to put it another way mostly empty.
3292 (Note that leaf nodes will not contain any transitions.)
3293
3294 This algorithm compresses the tables by eliminating most such
3295 transitions, at the cost of a modest bit of extra work during lookup:
3296
3297 - Each states[] entry contains a .base field which indicates the
3298 index in the state[] array wheres its transition data is stored.
3299
3b753521 3300 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
3301
3302 - If .base is nonzero then charid is added to it to find an entry in
3303 the trans array.
3304
3305 -If trans[states[state].base+charid].check!=state then the
3306 transition is taken to be a 0/Fail transition. Thus if there are fail
3307 transitions at the front of the node then the .base offset will point
3308 somewhere inside the previous nodes data (or maybe even into a node
3309 even earlier), but the .check field determines if the transition is
3310 valid.
3311
786e8c11 3312 XXX - wrong maybe?
a3621e74 3313 The following process inplace converts the table to the compressed
3b753521 3314 table: We first do not compress the root node 1,and mark all its
a3621e74 3315 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
3316 allows us to do a DFA construction from the compressed table later,
3317 and ensures that any .base pointers we calculate later are greater
3318 than 0.
a3621e74
YO
3319
3320 - We set 'pos' to indicate the first entry of the second node.
3321
3322 - We then iterate over the columns of the node, finding the first and
3323 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3324 and set the .check pointers accordingly, and advance pos
3325 appropriately and repreat for the next node. Note that when we copy
3326 the next pointers we have to convert them from the original
3327 NODEIDX form to NODENUM form as the former is not valid post
3328 compression.
3329
3330 - If a node has no transitions used we mark its base as 0 and do not
3331 advance the pos pointer.
3332
3333 - If a node only has one transition we use a second pointer into the
3334 structure to fill in allocated fail transitions from other states.
3335 This pointer is independent of the main pointer and scans forward
3336 looking for null transitions that are allocated to a state. When it
3337 finds one it writes the single transition into the "hole". If the
786e8c11 3338 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
3339
3340 - Once compressed we can Renew/realloc the structures to release the
3341 excess space.
3342
3343 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3344 specifically Fig 3.47 and the associated pseudocode.
3345
3346 demq
3347 */
a3b680e6 3348 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 3349 U32 state, charid;
a3621e74 3350 U32 pos = 0, zp=0;
1e2e3d02 3351 trie->statecount = laststate;
a3621e74
YO
3352
3353 for ( state = 1 ; state < laststate ; state++ ) {
3354 U8 flag = 0;
a28509cc
AL
3355 const U32 stateidx = TRIE_NODEIDX( state );
3356 const U32 o_used = trie->trans[ stateidx ].check;
3357 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
3358 trie->trans[ stateidx ].check = 0;
3359
538e84ed
KW
3360 for ( charid = 0;
3361 used && charid < trie->uniquecharcount;
3362 charid++ )
3363 {
a3621e74
YO
3364 if ( flag || trie->trans[ stateidx + charid ].next ) {
3365 if ( trie->trans[ stateidx + charid ].next ) {
3366 if (o_used == 1) {
3367 for ( ; zp < pos ; zp++ ) {
3368 if ( ! trie->trans[ zp ].next ) {
3369 break;
3370 }
3371 }
538e84ed
KW
3372 trie->states[ state ].trans.base
3373 = zp
3374 + trie->uniquecharcount
3375 - charid ;
3376 trie->trans[ zp ].next
3377 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3378 + charid ].next );
a3621e74
YO
3379 trie->trans[ zp ].check = state;
3380 if ( ++zp > pos ) pos = zp;
3381 break;
3382 }
3383 used--;
3384 }
3385 if ( !flag ) {
3386 flag = 1;
538e84ed
KW
3387 trie->states[ state ].trans.base
3388 = pos + trie->uniquecharcount - charid ;
a3621e74 3389 }
538e84ed
KW
3390 trie->trans[ pos ].next
3391 = SAFE_TRIE_NODENUM(
3392 trie->trans[ stateidx + charid ].next );
a3621e74
YO
3393 trie->trans[ pos ].check = state;
3394 pos++;
3395 }
3396 }
3397 }
cc601c31 3398 trie->lasttrans = pos + 1;
c944940b
JH
3399 trie->states = (reg_trie_state *)
3400 PerlMemShared_realloc( trie->states, laststate
3401 * sizeof(reg_trie_state) );
a3621e74 3402 DEBUG_TRIE_COMPILE_MORE_r(
147e3846 3403 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
cb41e5d6 3404 depth+1,
538e84ed
KW
3405 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3406 + 1 ),
3407 (IV)next_alloc,
3408 (IV)pos,
3409 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
a3621e74
YO
3410 );
3411
3412 } /* end table compress */
3413 }
1e2e3d02 3414 DEBUG_TRIE_COMPILE_MORE_r(
147e3846 3415 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",