This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Move regex global variables to interpreter level
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
c277df42 77#define REG_COMP_C
54df2634
NC
78#ifdef PERL_IN_XSUB_RE
79# include "re_comp.h"
afda64e8 80EXTERN_C const struct regexp_engine my_reg_engine;
54df2634
NC
81#else
82# include "regcomp.h"
83#endif
a687059c 84
f7e03a10 85#include "dquote_inline.h"
b992490d 86#include "invlist_inline.h"
1b0f46bf 87#include "unicode_constants.h"
04e98a4d 88
538e84ed
KW
89#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
90 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
f12c0118
KW
91#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
92 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 93#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 94#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 95
a687059c
LW
96#ifndef STATIC
97#define STATIC static
98#endif
99
3f910e62
YO
100/* this is a chain of data about sub patterns we are processing that
101 need to be handled separately/specially in study_chunk. Its so
102 we can simulate recursion without losing state. */
103struct scan_frame;
104typedef struct scan_frame {
105 regnode *last_regnode; /* last node to process in this frame */
106 regnode *next_regnode; /* next node to process when last is reached */
107 U32 prev_recursed_depth;
108 I32 stopparen; /* what stopparen do we use */
3f910e62
YO
109
110 struct scan_frame *this_prev_frame; /* this previous frame */
111 struct scan_frame *prev_frame; /* previous frame */
112 struct scan_frame *next_frame; /* next frame */
113} scan_frame;
114
f4ae5a27
KW
115/* Certain characters are output as a sequence with the first being a
116 * backslash. */
eb7486de 117#define isBACKSLASHED_PUNCT(c) strchr("-[]\\^", c)
f4ae5a27
KW
118
119
09b2b2e6 120struct RExC_state_t {
514a91f1
DM
121 U32 flags; /* RXf_* are we folding, multilining? */
122 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 123 char *precomp; /* uncompiled string. */
711b303b 124 char *precomp_end; /* pointer to end of uncompiled string. */
288b8c02 125 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf 126 regexp *rx; /* perl core regexp structure */
538e84ed
KW
127 regexp_internal *rxi; /* internal data for regexp object
128 pprivate field */
fac92740 129 char *start; /* Start of input for compile */
830247a4
IZ
130 char *end; /* End of input for compile */
131 char *parse; /* Input-scan pointer. */
51684e3c
KW
132 char *copy_start; /* start of copy of input within
133 constructed parse string */
cc16d262
KW
134 char *save_copy_start; /* Provides one level of saving
135 and restoring 'copy_start' */
51684e3c
KW
136 char *copy_start_in_input; /* Position in input string
137 corresponding to copy_start */
ea3daa5d 138 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 139 regnode *emit_start; /* Start of emitted-code area */
f55b7b26 140 regnode_offset emit; /* Code-emit pointer */
830247a4
IZ
141 I32 naughty; /* How bad is this pattern? */
142 I32 sawback; /* Did we see \1, ...? */
143 U32 seen;
88f063b4
KW
144 SSize_t size; /* Number of regnode equivalents in
145 pattern */
7c932d07
KW
146
147 /* position beyond 'precomp' of the warning message furthest away from
148 * 'precomp'. During the parse, no warnings are raised for any problems
149 * earlier in the parse than this position. This works if warnings are
150 * raised the first time a given spot is parsed, and if only one
151 * independent warning is raised for any given spot */
152 Size_t latest_warn_offset;
153
154 I32 npar; /* Capture buffer count so far in the
155 parse, (OPEN) plus one. ("par" 0 is
156 the whole pattern)*/
157 I32 total_par; /* During initial parse, is either 0,
158 or -1; the latter indicating a
159 reparse is needed. After that pass,
160 it is what 'npar' became after the
161 pass. Hence, it being > 0 indicates
162 we are in a reparse situation */
538e84ed
KW
163 I32 nestroot; /* root parens we are in - used by
164 accept */
830247a4 165 I32 seen_zerolen;
f55b7b26
KW
166 regnode_offset *open_parens; /* offsets to open parens */
167 regnode_offset *close_parens; /* offsets to close parens */
0275405c 168 I32 parens_buf_size; /* #slots malloced open/close_parens */
d5a00e4a 169 regnode *end_op; /* END node in program */
02daf0ab
YO
170 I32 utf8; /* whether the pattern is utf8 or not */
171 I32 orig_utf8; /* whether the pattern was originally in utf8 */
172 /* XXX use this for future optimisation of case
173 * where pattern must be upgraded to utf8. */
e40e74fe
KW
174 I32 uni_semantics; /* If a d charset modifier should use unicode
175 rules, even if the pattern is not in
176 utf8 */
81714fb9 177 HV *paren_names; /* Paren names */
538e84ed 178
40d049e4 179 regnode **recurse; /* Recurse regops */
88f063b4 180 I32 recurse_count; /* Number of recurse regops we have generated */
4286711a 181 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
538e84ed 182 through */
09a65838 183 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 184 I32 in_lookbehind;
105c827d 185 I32 in_lookahead;
4624b182 186 I32 contains_locale;
bb3f3ed2 187 I32 override_recoding;
4caef9c3 188 I32 recode_x_to_native;
9d53c457 189 I32 in_multi_char_class;
1acab4c5 190 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
68e2671b 191 within pattern */
b1603ef8 192 int code_index; /* next code_blocks[] slot */
ee273784 193 SSize_t maxlen; /* mininum possible number of chars in string to match */
3f910e62
YO
194 scan_frame *frame_head;
195 scan_frame *frame_last;
196 U32 frame_count;
7eec73eb 197 AV *warn_text;
945961fd 198 HV *unlexed_names;
dc6d7f5c 199#ifdef ADD_TO_REGEXEC
830247a4
IZ
200 char *starttry; /* -Dr: where regtry was called. */
201#define RExC_starttry (pRExC_state->starttry)
202#endif
d24ca0c5 203 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 204#ifdef DEBUGGING
be8e71aa 205 const char *lastparse;
3dab1dad 206 I32 lastnum;
1f1031fe 207 AV *paren_name_list; /* idx -> name */
d9a72fcc 208 U32 study_chunk_recursed_count;
c9f0d54c
YO
209 SV *mysv1;
210 SV *mysv2;
88f063b4 211
3dab1dad
YO
212#define RExC_lastparse (pRExC_state->lastparse)
213#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 214#define RExC_paren_name_list (pRExC_state->paren_name_list)
d9a72fcc 215#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
c9f0d54c
YO
216#define RExC_mysv (pRExC_state->mysv1)
217#define RExC_mysv1 (pRExC_state->mysv1)
218#define RExC_mysv2 (pRExC_state->mysv2)
219
3dab1dad 220#endif
bd0b62a6 221 bool seen_d_op;
911bd04e 222 bool strict;
da7cf1cc 223 bool study_started;
034602eb 224 bool in_script_run;
7c932d07 225 bool use_BRANCHJ;
09b2b2e6 226};
830247a4 227
e2509266 228#define RExC_flags (pRExC_state->flags)
514a91f1 229#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 230#define RExC_precomp (pRExC_state->precomp)
51684e3c
KW
231#define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
232#define RExC_copy_start_in_constructed (pRExC_state->copy_start)
cc16d262 233#define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start)
711b303b 234#define RExC_precomp_end (pRExC_state->precomp_end)
288b8c02 235#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 236#define RExC_rx (pRExC_state->rx)
f8fc2ecf 237#define RExC_rxi (pRExC_state->rxi)
fac92740 238#define RExC_start (pRExC_state->start)
830247a4
IZ
239#define RExC_end (pRExC_state->end)
240#define RExC_parse (pRExC_state->parse)
7c932d07 241#define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
830247a4 242#define RExC_whilem_seen (pRExC_state->whilem_seen)
bd0b62a6
KW
243#define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
244 under /d from /u ? */
512c0f5a 245
7122b237 246#ifdef RE_TRACK_PATTERN_OFFSETS
44f36cd3 247# define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
538e84ed 248 others */
7122b237 249#endif
830247a4 250#define RExC_emit (pRExC_state->emit)
fac92740 251#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
252#define RExC_sawback (pRExC_state->sawback)
253#define RExC_seen (pRExC_state->seen)
254#define RExC_size (pRExC_state->size)
ee273784 255#define RExC_maxlen (pRExC_state->maxlen)
830247a4 256#define RExC_npar (pRExC_state->npar)
1533d1d2 257#define RExC_total_parens (pRExC_state->total_par)
0275405c 258#define RExC_parens_buf_size (pRExC_state->parens_buf_size)
e2e6a0f1 259#define RExC_nestroot (pRExC_state->nestroot)
830247a4 260#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 261#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 262#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 263#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
264#define RExC_open_parens (pRExC_state->open_parens)
265#define RExC_close_parens (pRExC_state->close_parens)
5bd2d46e 266#define RExC_end_op (pRExC_state->end_op)
81714fb9 267#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
268#define RExC_recurse (pRExC_state->recurse)
269#define RExC_recurse_count (pRExC_state->recurse_count)
09a65838 270#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
538e84ed
KW
271#define RExC_study_chunk_recursed_bytes \
272 (pRExC_state->study_chunk_recursed_bytes)
b57e4118 273#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
105c827d 274#define RExC_in_lookahead (pRExC_state->in_lookahead)
4624b182 275#define RExC_contains_locale (pRExC_state->contains_locale)
4caef9c3
KW
276#define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
277
b6d67071 278#ifdef EBCDIC
4caef9c3
KW
279# define SET_recode_x_to_native(x) \
280 STMT_START { RExC_recode_x_to_native = (x); } STMT_END
281#else
282# define SET_recode_x_to_native(x) NOOP
b6d67071 283#endif
4caef9c3 284
9d53c457 285#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
3f910e62
YO
286#define RExC_frame_head (pRExC_state->frame_head)
287#define RExC_frame_last (pRExC_state->frame_last)
288#define RExC_frame_count (pRExC_state->frame_count)
67cdf558 289#define RExC_strict (pRExC_state->strict)
da7cf1cc 290#define RExC_study_started (pRExC_state->study_started)
7eec73eb 291#define RExC_warn_text (pRExC_state->warn_text)
034602eb 292#define RExC_in_script_run (pRExC_state->in_script_run)
7c932d07 293#define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
945961fd 294#define RExC_unlexed_names (pRExC_state->unlexed_names)
830247a4 295
99807a43
HS
296/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
297 * a flag to disable back-off on the fixed/floating substrings - if it's
298 * a high complexity pattern we assume the benefit of avoiding a full match
299 * is worth the cost of checking for the substrings even if they rarely help.
300 */
301#define RExC_naughty (pRExC_state->naughty)
302#define TOO_NAUGHTY (10)
303#define MARK_NAUGHTY(add) \
304 if (RExC_naughty < TOO_NAUGHTY) \
305 RExC_naughty += (add)
306#define MARK_NAUGHTY_EXP(exp, add) \
307 if (RExC_naughty < TOO_NAUGHTY) \
308 RExC_naughty += RExC_naughty / (exp) + (add)
cde0cee5 309
a687059c
LW
310#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
311#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
412f55bb 312 ((*s) == '{' && regcurly(s)))
a687059c
LW
313
314/*
315 * Flags to be passed up and down.
316 */
a687059c 317#define WORST 0 /* Worst case. */
c4ead770
KW
318#define HASWIDTH 0x01 /* Known to not match null strings, could match
319 non-null ones. */
fda99bee 320
e64f369d 321/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
322 * character. (There needs to be a case: in the switch statement in regexec.c
323 * for any node marked SIMPLE.) Note that this is not the same thing as
324 * REGNODE_SIMPLE */
fda99bee 325#define SIMPLE 0x02
e64f369d 326#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
327#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
328#define TRYAGAIN 0x10 /* Weeded out a declaration. */
c36361b2
KW
329#define RESTART_PARSE 0x20 /* Need to redo the parse */
330#define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
331 calcuate sizes as UTF-8 */
a687059c 332
3dab1dad
YO
333#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
334
07be1b83
YO
335/* whether trie related optimizations are enabled */
336#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
337#define TRIE_STUDY_OPT
786e8c11 338#define FULL_TRIE_STUDY
07be1b83
YO
339#define TRIE_STCLASS
340#endif
1de06328
YO
341
342
40d049e4
YO
343
344#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
345#define PBITVAL(paren) (1 << ((paren) & 7))
346#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
347#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
348#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
349
82a6ada4 350#define REQUIRE_UTF8(flagp) STMT_START { \
8d9c2815 351 if (!UTF) { \
c36361b2 352 *flagp = RESTART_PARSE|NEED_UTF8; \
f55b7b26 353 return 0; \
8d9c2815 354 } \
82a6ada4 355 } STMT_END
40d049e4 356
d63d75f5 357/* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
f6e8f31e
KW
358 * a flag that indicates we need to override /d with /u as a result of
359 * something in the pattern. It should only be used in regards to calling
b2090782 360 * set_regex_charset() or get_regex_charset() */
512c0f5a
KW
361#define REQUIRE_UNI_RULES(flagp, restart_retval) \
362 STMT_START { \
363 if (DEPENDS_SEMANTICS) { \
512c0f5a
KW
364 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
365 RExC_uni_semantics = 1; \
b76e1c04 366 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \
bd0b62a6
KW
367 /* No need to restart the parse if we haven't seen \
368 * anything that differs between /u and /d, and no need \
369 * to restart immediately if we're going to reparse \
370 * anyway to count parens */ \
c36361b2 371 *flagp |= RESTART_PARSE; \
512c0f5a 372 return restart_retval; \
d1be211c 373 } \
512c0f5a
KW
374 } \
375 } STMT_END
376
7c932d07
KW
377#define REQUIRE_BRANCHJ(flagp, restart_retval) \
378 STMT_START { \
379 RExC_use_BRANCHJ = 1; \
439a3bfe
KW
380 *flagp |= RESTART_PARSE; \
381 return restart_retval; \
7c932d07
KW
382 } STMT_END
383
b76e1c04
KW
384/* Until we have completed the parse, we leave RExC_total_parens at 0 or
385 * less. After that, it must always be positive, because the whole re is
386 * considered to be surrounded by virtual parens. Setting it to negative
387 * indicates there is some construct that needs to know the actual number of
388 * parens to be properly handled. And that means an extra pass will be
389 * required after we've counted them all */
390#define ALL_PARENS_COUNTED (RExC_total_parens > 0)
7c932d07 391#define REQUIRE_PARENS_PASS \
b76e1c04
KW
392 STMT_START { /* No-op if have completed a pass */ \
393 if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \
7c932d07 394 } STMT_END
b76e1c04
KW
395#define IN_PARENS_PASS (RExC_total_parens < 0)
396
7c932d07 397
f358228e
KW
398/* This is used to return failure (zero) early from the calling function if
399 * various flags in 'flags' are set. Two flags always cause a return:
400 * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any
401 * additional flags that should cause a return; 0 if none. If the return will
402 * be done, '*flagp' is first set to be all of the flags that caused the
403 * return. */
404#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
fd7fc6ae 405 STMT_START { \
c36361b2
KW
406 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
407 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
f358228e 408 return 0; \
fd7fc6ae
YO
409 } \
410 } STMT_END
411
c36361b2 412#define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
fd7fc6ae 413
5d555bb8 414#define RETURN_FAIL_ON_RESTART(flags,flagp) \
f358228e 415 RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
5d555bb8 416#define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
f358228e 417 if (MUST_RESTART(*(flagp))) return 0
fd7fc6ae 418
f19b1a63
KW
419/* This converts the named class defined in regcomp.h to its equivalent class
420 * number defined in handy.h. */
421#define namedclass_to_classnum(class) ((int) ((class) / 2))
422#define classnum_to_namedclass(classnum) ((classnum) * 2)
423
de92f5e6
KW
424#define _invlist_union_complement_2nd(a, b, output) \
425 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
426#define _invlist_intersection_complement_2nd(a, b, output) \
427 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
428
1de06328
YO
429/* About scan_data_t.
430
431 During optimisation we recurse through the regexp program performing
432 various inplace (keyhole style) optimisations. In addition study_chunk
433 and scan_commit populate this data structure with information about
538e84ed 434 what strings MUST appear in the pattern. We look for the longest
3b753521 435 string that must appear at a fixed location, and we look for the
1de06328
YO
436 longest string that may appear at a floating location. So for instance
437 in the pattern:
538e84ed 438
1de06328 439 /FOO[xX]A.*B[xX]BAR/
538e84ed 440
1de06328
YO
441 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
442 strings (because they follow a .* construct). study_chunk will identify
443 both FOO and BAR as being the longest fixed and floating strings respectively.
538e84ed 444
1de06328 445 The strings can be composites, for instance
538e84ed 446
1de06328 447 /(f)(o)(o)/
538e84ed 448
1de06328 449 will result in a composite fixed substring 'foo'.
538e84ed 450
1de06328 451 For each string some basic information is maintained:
538e84ed 452
6ea77169 453 - min_offset
1de06328
YO
454 This is the position the string must appear at, or not before.
455 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
456 characters must match before the string we are searching for.
457 Likewise when combined with minlenp and the length of the string it
538e84ed 458 tells us how many characters must appear after the string we have
1de06328 459 found.
538e84ed 460
1de06328
YO
461 - max_offset
462 Only used for floating strings. This is the rightmost point that
ea3daa5d 463 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328 464 string can occur infinitely far to the right.
6e12f832 465 For fixed strings, it is equal to min_offset.
538e84ed 466
1de06328 467 - minlenp
2d608413
KW
468 A pointer to the minimum number of characters of the pattern that the
469 string was found inside. This is important as in the case of positive
538e84ed 470 lookahead or positive lookbehind we can have multiple patterns
1de06328 471 involved. Consider
538e84ed 472
1de06328 473 /(?=FOO).*F/
538e84ed 474
1de06328
YO
475 The minimum length of the pattern overall is 3, the minimum length
476 of the lookahead part is 3, but the minimum length of the part that
538e84ed 477 will actually match is 1. So 'FOO's minimum length is 3, but the
1de06328 478 minimum length for the F is 1. This is important as the minimum length
538e84ed 479 is used to determine offsets in front of and behind the string being
1de06328 480 looked for. Since strings can be composites this is the length of the
486ec47a 481 pattern at the time it was committed with a scan_commit. Note that
1de06328 482 the length is calculated by study_chunk, so that the minimum lengths
538e84ed 483 are not known until the full pattern has been compiled, thus the
1de06328 484 pointer to the value.
538e84ed 485
1de06328 486 - lookbehind
538e84ed 487
1de06328 488 In the case of lookbehind the string being searched for can be
538e84ed 489 offset past the start point of the final matching string.
1de06328
YO
490 If this value was just blithely removed from the min_offset it would
491 invalidate some of the calculations for how many chars must match
492 before or after (as they are derived from min_offset and minlen and
538e84ed 493 the length of the string being searched for).
1de06328
YO
494 When the final pattern is compiled and the data is moved from the
495 scan_data_t structure into the regexp structure the information
538e84ed
KW
496 about lookbehind is factored in, with the information that would
497 have been lost precalculated in the end_shift field for the
1de06328
YO
498 associated string.
499
500 The fields pos_min and pos_delta are used to store the minimum offset
538e84ed 501 and the delta to the maximum offset at the current point in the pattern.
1de06328
YO
502
503*/
2c2d71f5 504
cb6aef8b
DM
505struct scan_data_substrs {
506 SV *str; /* longest substring found in pattern */
507 SSize_t min_offset; /* earliest point in string it can appear */
508 SSize_t max_offset; /* latest point in string it can appear */
509 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
510 SSize_t lookbehind; /* is the pos of the string modified by LB */
11683ecb 511 I32 flags; /* per substring SF_* and SCF_* flags */
cb6aef8b
DM
512};
513
2c2d71f5 514typedef struct scan_data_t {
1de06328
YO
515 /*I32 len_min; unused */
516 /*I32 len_delta; unused */
49f55535 517 SSize_t pos_min;
ea3daa5d 518 SSize_t pos_delta;
2c2d71f5 519 SV *last_found;
ea3daa5d 520 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 521 SSize_t last_start_min;
ea3daa5d 522 SSize_t last_start_max;
2099df82
DM
523 U8 cur_is_floating; /* whether the last_* values should be set as
524 * the next fixed (0) or floating (1)
525 * substring */
6ea77169 526
2099df82
DM
527 /* [0] is longest fixed substring so far, [1] is longest float so far */
528 struct scan_data_substrs substrs[2];
6ea77169 529
11683ecb 530 I32 flags; /* common SF_* and SCF_* flags */
2c2d71f5 531 I32 whilem_c;
49f55535 532 SSize_t *last_closep;
b8f7bb16 533 regnode_ssc *start_class;
2c2d71f5
JH
534} scan_data_t;
535
a687059c 536/*
e50aee73 537 * Forward declarations for pregcomp()'s friends.
a687059c 538 */
a0d0e21e 539
6ea77169 540static const scan_data_t zero_scan_data = {
a2ca2017 541 0, 0, NULL, 0, 0, 0, 0,
6ea77169 542 {
11683ecb
DM
543 { NULL, 0, 0, 0, 0, 0 },
544 { NULL, 0, 0, 0, 0, 0 },
6ea77169
DM
545 },
546 0, 0, NULL, NULL
547};
c277df42 548
11683ecb
DM
549/* study flags */
550
07be1b83
YO
551#define SF_BEFORE_SEOL 0x0001
552#define SF_BEFORE_MEOL 0x0002
11683ecb 553#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
c277df42 554
07be1b83
YO
555#define SF_IS_INF 0x0040
556#define SF_HAS_PAR 0x0080
557#define SF_IN_PAR 0x0100
558#define SF_HAS_EVAL 0x0200
bc604ad8
KW
559
560
561/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
562 * longest substring in the pattern. When it is not set the optimiser keeps
563 * track of position, but does not keep track of the actual strings seen,
564 *
565 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
566 * /foo/i will not.
567 *
568 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
569 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
570 * turned off because of the alternation (BRANCH). */
07be1b83 571#define SCF_DO_SUBSTR 0x0400
bc604ad8 572
653099ff
GS
573#define SCF_DO_STCLASS_AND 0x0800
574#define SCF_DO_STCLASS_OR 0x1000
575#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 576#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 577
786e8c11 578#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
538e84ed 579#define SCF_SEEN_ACCEPT 0x8000
688e0391 580#define SCF_TRIE_DOING_RESTUDY 0x10000
4286711a
YO
581#define SCF_IN_DEFINE 0x20000
582
583
584
07be1b83 585
43fead97 586#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
587
588/* The enums for all these are ordered so things work out correctly */
a62b1201 589#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
538e84ed
KW
590#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
591 == REGEX_DEPENDS_CHARSET)
00b27cfc 592#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
538e84ed
KW
593#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
594 >= REGEX_UNICODE_CHARSET)
595#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
596 == REGEX_ASCII_RESTRICTED_CHARSET)
597#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
598 >= REGEX_ASCII_RESTRICTED_CHARSET)
599#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
600 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 601
43fead97 602#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 603
f2c2a6ab
KW
604/* For programs that want to be strictly Unicode compatible by dying if any
605 * attempt is made to match a non-Unicode code point against a Unicode
606 * property. */
607#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
608
93733859 609#define OOB_NAMEDCLASS -1
b8c5462f 610
8e661ac5
KW
611/* There is no code point that is out-of-bounds, so this is problematic. But
612 * its only current use is to initialize a variable that is always set before
613 * looked at. */
614#define OOB_UNICODE 0xDEADBEEF
615
a0ed51b3 616#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
a0ed51b3 617
8615cb43 618
b45f050a
JF
619/* length of regex to show in messages that don't mark a position within */
620#define RegexLengthToShowInErrorMessages 127
621
622/*
623 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
624 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
625 * op/pragma/warn/regcomp.
626 */
7253e4e3
RK
627#define MARKER1 "<-- HERE" /* marker as it appears in the description */
628#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 629
538e84ed 630#define REPORT_LOCATION " in regex; marked by " MARKER1 \
147e3846 631 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
b45f050a 632
285b5ca0
KW
633/* The code in this file in places uses one level of recursion with parsing
634 * rebased to an alternate string constructed by us in memory. This can take
635 * the form of something that is completely different from the input, or
636 * something that uses the input as part of the alternate. In the first case,
637 * there should be no possibility of an error, as we are in complete control of
51684e3c
KW
638 * the alternate string. But in the second case we don't completely control
639 * the input portion, so there may be errors in that. Here's an example:
285b5ca0
KW
640 * /[abc\x{DF}def]/ui
641 * is handled specially because \x{df} folds to a sequence of more than one
51684e3c 642 * character: 'ss'. What is done is to create and parse an alternate string,
285b5ca0
KW
643 * which looks like this:
644 * /(?:\x{DF}|[abc\x{DF}def])/ui
645 * where it uses the input unchanged in the middle of something it constructs,
646 * which is a branch for the DF outside the character class, and clustering
647 * parens around the whole thing. (It knows enough to skip the DF inside the
648 * class while in this substitute parse.) 'abc' and 'def' may have errors that
649 * need to be reported. The general situation looks like this:
650 *
51684e3c 651 * |<------- identical ------>|
285b5ca0 652 * sI tI xI eI
51684e3c 653 * Input: ---------------------------------------------------------------
285b5ca0
KW
654 * Constructed: ---------------------------------------------------
655 * sC tC xC eC EC
51684e3c 656 * |<------- identical ------>|
285b5ca0 657 *
51684e3c
KW
658 * sI..eI is the portion of the input pattern we are concerned with here.
659 * sC..EC is the constructed substitute parse string.
660 * sC..tC is constructed by us
661 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
662 * In the diagram, these are vertically aligned.
663 * eC..EC is also constructed by us.
664 * xC is the position in the substitute parse string where we found a
665 * problem.
666 * xI is the position in the original pattern corresponding to xC.
285b5ca0 667 *
51684e3c
KW
668 * We want to display a message showing the real input string. Thus we need to
669 * translate from xC to xI. We know that xC >= tC, since the portion of the
670 * string sC..tC has been constructed by us, and so shouldn't have errors. We
671 * get:
672 * xI = tI + (xC - tC)
285b5ca0 673 *
51684e3c
KW
674 * When the substitute parse is constructed, the code needs to set:
675 * RExC_start (sC)
676 * RExC_end (eC)
677 * RExC_copy_start_in_input (tI)
678 * RExC_copy_start_in_constructed (tC)
679 * and restore them when done.
285b5ca0 680 *
51684e3c
KW
681 * During normal processing of the input pattern, both
682 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
683 * sI, so that xC equals xI.
285b5ca0
KW
684 */
685
51684e3c
KW
686#define sI RExC_precomp
687#define eI RExC_precomp_end
688#define sC RExC_start
689#define eC RExC_end
690#define tI RExC_copy_start_in_input
691#define tC RExC_copy_start_in_constructed
692#define xI(xC) (tI + (xC - tC))
693#define xI_offset(xC) (xI(xC) - sI)
285b5ca0
KW
694
695#define REPORT_LOCATION_ARGS(xC) \
696 UTF8fARG(UTF, \
51684e3c 697 (xI(xC) > eI) /* Don't run off end */ \
232b691f 698 ? eI - sI /* Length before the <--HERE */ \
bc929b61
KW
699 : ((xI_offset(xC) >= 0) \
700 ? xI_offset(xC) \
701 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
702 IVdf " trying to output message for " \
703 " pattern %.*s", \
51684e3c 704 __FILE__, __LINE__, (IV) xI_offset(xC), \
bc929b61 705 ((int) (eC - sC)), sC), 0)), \
51684e3c 706 sI), /* The input pattern printed up to the <--HERE */ \
285b5ca0 707 UTF8fARG(UTF, \
51684e3c
KW
708 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \
709 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */
c1d900c3 710
8a6d8ec6
HS
711/* Used to point after bad bytes for an error message, but avoid skipping
712 * past a nul byte. */
e3304382 713#define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
8a6d8ec6 714
ed68fe11
KW
715/* Set up to clean up after our imminent demise */
716#define PREPARE_TO_DIE \
717 STMT_START { \
718 if (RExC_rx_sv) \
719 SAVEFREESV(RExC_rx_sv); \
7c932d07
KW
720 if (RExC_open_parens) \
721 SAVEFREEPV(RExC_open_parens); \
722 if (RExC_close_parens) \
723 SAVEFREEPV(RExC_close_parens); \
ed68fe11
KW
724 } STMT_END
725
b45f050a
JF
726/*
727 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
728 * arg. Show regex, up to a maximum length. If it's too long, chop and add
729 * "...".
730 */
58e23c8d 731#define _FAIL(code) STMT_START { \
bfed75c6 732 const char *ellipses = ""; \
88f063b4 733 IV len = RExC_precomp_end - RExC_precomp; \
ccb2c380 734 \
ed68fe11 735 PREPARE_TO_DIE; \
ccb2c380
MP
736 if (len > RegexLengthToShowInErrorMessages) { \
737 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
738 len = RegexLengthToShowInErrorMessages - 10; \
739 ellipses = "..."; \
740 } \
58e23c8d 741 code; \
ccb2c380 742} STMT_END
8615cb43 743
58e23c8d 744#define FAIL(msg) _FAIL( \
147e3846 745 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
c1d900c3 746 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
747
748#define FAIL2(msg,arg) _FAIL( \
147e3846 749 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
c1d900c3 750 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 751
4b7718d6
KW
752#define FAIL3(msg,arg1,arg2) _FAIL( \
753 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
754 arg1, arg2, UTF8fARG(UTF, len, RExC_precomp), ellipses))
755
b45f050a 756/*
b45f050a
JF
757 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
758 */
ccb2c380 759#define Simple_vFAIL(m) STMT_START { \
ccb2c380 760 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
d528642a 761 m, REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 762} STMT_END
b45f050a
JF
763
764/*
765 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
766 */
ccb2c380 767#define vFAIL(m) STMT_START { \
ed68fe11 768 PREPARE_TO_DIE; \
ccb2c380
MP
769 Simple_vFAIL(m); \
770} STMT_END
b45f050a
JF
771
772/*
773 * Like Simple_vFAIL(), but accepts two arguments.
774 */
ccb2c380 775#define Simple_vFAIL2(m,a1) STMT_START { \
d528642a
KW
776 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
777 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 778} STMT_END
b45f050a
JF
779
780/*
781 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
782 */
ccb2c380 783#define vFAIL2(m,a1) STMT_START { \
ed68fe11 784 PREPARE_TO_DIE; \
ccb2c380
MP
785 Simple_vFAIL2(m, a1); \
786} STMT_END
b45f050a
JF
787
788
789/*
790 * Like Simple_vFAIL(), but accepts three arguments.
791 */
ccb2c380 792#define Simple_vFAIL3(m, a1, a2) STMT_START { \
c1d900c3 793 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 794 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 795} STMT_END
b45f050a
JF
796
797/*
798 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
799 */
ccb2c380 800#define vFAIL3(m,a1,a2) STMT_START { \
ed68fe11 801 PREPARE_TO_DIE; \
ccb2c380
MP
802 Simple_vFAIL3(m, a1, a2); \
803} STMT_END
b45f050a
JF
804
805/*
806 * Like Simple_vFAIL(), but accepts four arguments.
807 */
ccb2c380 808#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
d528642a
KW
809 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
810 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 811} STMT_END
b45f050a 812
95db3ffa 813#define vFAIL4(m,a1,a2,a3) STMT_START { \
ed68fe11 814 PREPARE_TO_DIE; \
95db3ffa
KW
815 Simple_vFAIL4(m, a1, a2, a3); \
816} STMT_END
817
946095af 818/* A specialized version of vFAIL2 that works with UTF8f */
d528642a 819#define vFAIL2utf8f(m, a1) STMT_START { \
ed68fe11 820 PREPARE_TO_DIE; \
d528642a
KW
821 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
822 REPORT_LOCATION_ARGS(RExC_parse)); \
946095af
BF
823} STMT_END
824
3ba22297 825#define vFAIL3utf8f(m, a1, a2) STMT_START { \
ed68fe11 826 PREPARE_TO_DIE; \
3ba22297 827 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 828 REPORT_LOCATION_ARGS(RExC_parse)); \
3ba22297
KW
829} STMT_END
830
d62e749e 831/* Setting this to NULL is a signal to not output warnings */
cc16d262
KW
832#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \
833 STMT_START { \
834 RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\
835 RExC_copy_start_in_constructed = NULL; \
836 } STMT_END
837#define RESTORE_WARNINGS \
838 RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
d62e749e 839
7c932d07
KW
840/* Since a warning can be generated multiple times as the input is reparsed, we
841 * output it the first time we come to that point in the parse, but suppress it
842 * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not
843 * generate any warnings */
55bcf857 844#define TO_OUTPUT_WARNINGS(loc) \
7c932d07
KW
845 ( RExC_copy_start_in_constructed \
846 && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
55bcf857 847
7c932d07
KW
848/* After we've emitted a warning, we save the position in the input so we don't
849 * output it again */
850#define UPDATE_WARNINGS_LOC(loc) \
851 STMT_START { \
852 if (TO_OUTPUT_WARNINGS(loc)) { \
5d894ca5
KW
853 RExC_latest_warn_offset = MAX(sI, MIN(eI, xI(loc))) \
854 - RExC_precomp; \
7c932d07
KW
855 } \
856 } STMT_END
77ccba3c 857
086f21bf 858/* 'warns' is the output of the packWARNx macro used in 'code' */
f8220fe2
KW
859#define _WARN_HELPER(loc, warns, code) \
860 STMT_START { \
d62e749e
KW
861 if (! RExC_copy_start_in_constructed) { \
862 Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \
863 " expected at '%s'", \
864 __FILE__, __LINE__, loc); \
865 } \
55bcf857 866 if (TO_OUTPUT_WARNINGS(loc)) { \
086f21bf
KW
867 if (ckDEAD(warns)) \
868 PREPARE_TO_DIE; \
338190a1 869 code; \
77ccba3c 870 UPDATE_WARNINGS_LOC(loc); \
338190a1 871 } \
f8220fe2 872 } STMT_END
946095af 873
5e0a247b 874/* m is not necessarily a "literal string", in this macro */
f8220fe2
KW
875#define reg_warn_non_literal_string(loc, m) \
876 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
877 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 878 "%s" REPORT_LOCATION, \
f8220fe2 879 m, REPORT_LOCATION_ARGS(loc)))
5e0a247b 880
f8220fe2
KW
881#define ckWARNreg(loc,m) \
882 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
883 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 884 m REPORT_LOCATION, \
f8220fe2 885 REPORT_LOCATION_ARGS(loc)))
ccb2c380 886
f8220fe2
KW
887#define vWARN(loc, m) \
888 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
889 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 890 m REPORT_LOCATION, \
f8220fe2 891 REPORT_LOCATION_ARGS(loc))) \
b927b7e9 892
f8220fe2
KW
893#define vWARN_dep(loc, m) \
894 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
895 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
d528642a 896 m REPORT_LOCATION, \
f8220fe2 897 REPORT_LOCATION_ARGS(loc)))
0d6106aa 898
f8220fe2
KW
899#define ckWARNdep(loc,m) \
900 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
901 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
d528642a 902 m REPORT_LOCATION, \
f8220fe2 903 REPORT_LOCATION_ARGS(loc)))
147508a2 904
f8220fe2
KW
905#define ckWARNregdep(loc,m) \
906 _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
907 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
d528642a
KW
908 WARN_REGEXP), \
909 m REPORT_LOCATION, \
f8220fe2 910 REPORT_LOCATION_ARGS(loc)))
ccb2c380 911
f8220fe2
KW
912#define ckWARN2reg_d(loc,m, a1) \
913 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
914 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
d528642a 915 m REPORT_LOCATION, \
f8220fe2 916 a1, REPORT_LOCATION_ARGS(loc)))
2335b3d3 917
f8220fe2
KW
918#define ckWARN2reg(loc, m, a1) \
919 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
920 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 921 m REPORT_LOCATION, \
f8220fe2 922 a1, REPORT_LOCATION_ARGS(loc)))
ccb2c380 923
f8220fe2
KW
924#define vWARN3(loc, m, a1, a2) \
925 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
926 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 927 m REPORT_LOCATION, \
f8220fe2 928 a1, a2, REPORT_LOCATION_ARGS(loc)))
ccb2c380 929
f8220fe2
KW
930#define ckWARN3reg(loc, m, a1, a2) \
931 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
932 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
933 m REPORT_LOCATION, \
934 a1, a2, \
f8220fe2 935 REPORT_LOCATION_ARGS(loc)))
668c081a 936
f8220fe2
KW
937#define vWARN4(loc, m, a1, a2, a3) \
938 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
939 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
940 m REPORT_LOCATION, \
941 a1, a2, a3, \
f8220fe2 942 REPORT_LOCATION_ARGS(loc)))
ccb2c380 943
f8220fe2
KW
944#define ckWARN4reg(loc, m, a1, a2, a3) \
945 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
946 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
947 m REPORT_LOCATION, \
948 a1, a2, a3, \
f8220fe2 949 REPORT_LOCATION_ARGS(loc)))
668c081a 950
f8220fe2
KW
951#define vWARN5(loc, m, a1, a2, a3, a4) \
952 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
953 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
954 m REPORT_LOCATION, \
955 a1, a2, a3, a4, \
f8220fe2 956 REPORT_LOCATION_ARGS(loc)))
9d1d55b5 957
aea5f181
KW
958#define ckWARNexperimental(loc, class, m) \
959 _WARN_HELPER(loc, packWARN(class), \
960 Perl_ck_warner_d(aTHX_ packWARN(class), \
961 m REPORT_LOCATION, \
962 REPORT_LOCATION_ARGS(loc)))
963
1a322bb4
KW
964/* Convert between a pointer to a node and its offset from the beginning of the
965 * program */
966#define REGNODE_p(offset) (RExC_emit_start + (offset))
967#define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
968
538e84ed 969/* Macros for recording node offsets. 20001227 mjd@plover.com
fac92740
MJD
970 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
971 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
972 * Element 0 holds the number n.
07be1b83 973 * Position is 1 indexed.
fac92740 974 */
7122b237 975#ifndef RE_TRACK_PATTERN_OFFSETS
d7c442ce 976#define Set_Node_Offset_To_R(offset,byte)
7122b237
YO
977#define Set_Node_Offset(node,byte)
978#define Set_Cur_Node_Offset
979#define Set_Node_Length_To_R(node,len)
980#define Set_Node_Length(node,len)
6a86c6ad 981#define Set_Node_Cur_Length(node,start)
538e84ed
KW
982#define Node_Offset(n)
983#define Node_Length(n)
7122b237
YO
984#define Set_Node_Offset_Length(node,offset,len)
985#define ProgLen(ri) ri->u.proglen
986#define SetProgLen(ri,x) ri->u.proglen = x
4cbeff97 987#define Track_Code(code)
7122b237
YO
988#else
989#define ProgLen(ri) ri->u.offsets[0]
990#define SetProgLen(ri,x) ri->u.offsets[0] = x
d7c442ce 991#define Set_Node_Offset_To_R(offset,byte) STMT_START { \
ccb2c380 992 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
d7c442ce
KW
993 __LINE__, (int)(offset), (int)(byte))); \
994 if((offset) < 0) { \
538e84ed 995 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
d7c442ce 996 (int)(offset)); \
ccb2c380 997 } else { \
d7c442ce 998 RExC_offsets[2*(offset)-1] = (byte); \
ccb2c380 999 } \
ccb2c380
MP
1000} STMT_END
1001
88f063b4 1002#define Set_Node_Offset(node,byte) \
1a322bb4 1003 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
ccb2c380
MP
1004#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
1005
1006#define Set_Node_Length_To_R(node,len) STMT_START { \
ccb2c380 1007 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 1008 __LINE__, (int)(node), (int)(len))); \
ccb2c380 1009 if((node) < 0) { \
538e84ed
KW
1010 Perl_croak(aTHX_ "value of node is %d in Length macro", \
1011 (int)(node)); \
ccb2c380
MP
1012 } else { \
1013 RExC_offsets[2*(node)] = (len); \
1014 } \
ccb2c380
MP
1015} STMT_END
1016
1017#define Set_Node_Length(node,len) \
1a322bb4 1018 Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
6a86c6ad
NC
1019#define Set_Node_Cur_Length(node, start) \
1020 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
1021
1022/* Get offsets and lengths */
1a322bb4
KW
1023#define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1024#define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
fac92740 1025
07be1b83 1026#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
1a322bb4
KW
1027 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \
1028 Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \
07be1b83 1029} STMT_END
4cbeff97
TC
1030
1031#define Track_Code(code) STMT_START { code } STMT_END
7122b237 1032#endif
07be1b83
YO
1033
1034#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1035#define EXPERIMENTAL_INPLACESCAN
f427392e 1036#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 1037
cb41e5d6
YO
1038#ifdef DEBUGGING
1039int
6ad9a8ab 1040Perl_re_printf(pTHX_ const char *fmt, ...)
cb41e5d6
YO
1041{
1042 va_list ap;
1043 int result;
1044 PerlIO *f= Perl_debug_log;
1045 PERL_ARGS_ASSERT_RE_PRINTF;
1046 va_start(ap, fmt);
1047 result = PerlIO_vprintf(f, fmt, ap);
1048 va_end(ap);
1049 return result;
1050}
1051
1052int
7b031478 1053Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
cb41e5d6
YO
1054{
1055 va_list ap;
1056 int result;
1057 PerlIO *f= Perl_debug_log;
1058 PERL_ARGS_ASSERT_RE_INDENTF;
1059 va_start(ap, depth);
daeb874b 1060 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
cb41e5d6
YO
1061 result = PerlIO_vprintf(f, fmt, ap);
1062 va_end(ap);
1063 return result;
1064}
1065#endif /* DEBUGGING */
1066
7b031478 1067#define DEBUG_RExC_seen() \
538e84ed 1068 DEBUG_OPTIMISE_MORE_r({ \
88f063b4 1069 Perl_re_printf( aTHX_ "RExC_seen: "); \
538e84ed 1070 \
e384d5c1 1071 if (RExC_seen & REG_ZERO_LEN_SEEN) \
88f063b4 1072 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
538e84ed 1073 \
e384d5c1 1074 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
88f063b4 1075 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
538e84ed 1076 \
e384d5c1 1077 if (RExC_seen & REG_GPOS_SEEN) \
88f063b4 1078 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
538e84ed 1079 \
e384d5c1 1080 if (RExC_seen & REG_RECURSE_SEEN) \
88f063b4 1081 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
538e84ed 1082 \
7b031478 1083 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
88f063b4 1084 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
538e84ed 1085 \
e384d5c1 1086 if (RExC_seen & REG_VERBARG_SEEN) \
88f063b4 1087 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
538e84ed 1088 \
e384d5c1 1089 if (RExC_seen & REG_CUTGROUP_SEEN) \
88f063b4 1090 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
538e84ed 1091 \
e384d5c1 1092 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
88f063b4 1093 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
538e84ed 1094 \
e384d5c1 1095 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
88f063b4 1096 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
538e84ed 1097 \
7b031478 1098 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
88f063b4 1099 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
ee273784 1100 \
88f063b4 1101 Perl_re_printf( aTHX_ "\n"); \
9e9ecfdf
YO
1102 });
1103
fdfb4f21 1104#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
6ad9a8ab 1105 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
fdfb4f21 1106
fdfb4f21 1107
f5a36d78
DM
1108#ifdef DEBUGGING
1109static void
1110S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1111 const char *close_str)
1112{
1113 if (!flags)
1114 return;
1115
1116 Perl_re_printf( aTHX_ "%s", open_str);
11683ecb
DM
1117 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1118 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
f5a36d78
DM
1119 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1120 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1121 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1122 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1123 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1124 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1125 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1126 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1127 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1128 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1129 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1130 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1131 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1132 Perl_re_printf( aTHX_ "%s", close_str);
1133}
1134
1135
1136static void
1137S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1138 U32 depth, int is_inf)
1139{
1140 GET_RE_DEBUG_FLAGS_DECL;
1141
1142 DEBUG_OPTIMISE_MORE_r({
1143 if (!data)
1144 return;
1145 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1146 depth,
1147 where,
1148 (IV)data->pos_min,
1149 (IV)data->pos_delta,
1150 (UV)data->flags
1151 );
1152
1153 S_debug_show_study_flags(aTHX_ data->flags," [","]");
fdfb4f21 1154
f5a36d78
DM
1155 Perl_re_printf( aTHX_
1156 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1157 (IV)data->whilem_c,
1158 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1159 is_inf ? "INF " : ""
1160 );
1161
11683ecb 1162 if (data->last_found) {
37b6262f 1163 int i;
11683ecb
DM
1164 Perl_re_printf(aTHX_
1165 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1166 SvPVX_const(data->last_found),
1167 (IV)data->last_end,
1168 (IV)data->last_start_min,
1169 (IV)data->last_start_max
1170 );
1171
2099df82 1172 for (i = 0; i < 2; i++) {
37b6262f
DM
1173 Perl_re_printf(aTHX_
1174 " %s%s: '%s' @ %" IVdf "/%" IVdf,
2099df82
DM
1175 data->cur_is_floating == i ? "*" : "",
1176 i ? "Float" : "Fixed",
37b6262f
DM
1177 SvPVX_const(data->substrs[i].str),
1178 (IV)data->substrs[i].min_offset,
1179 (IV)data->substrs[i].max_offset
1180 );
1181 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1182 }
11683ecb 1183 }
f5a36d78
DM
1184
1185 Perl_re_printf( aTHX_ "\n");
1186 });
1187}
1188
1189
1190static void
1191S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1192 regnode *scan, U32 depth, U32 flags)
1193{
1194 GET_RE_DEBUG_FLAGS_DECL;
1195
1196 DEBUG_OPTIMISE_r({
1197 regnode *Next;
1198
1199 if (!scan)
1200 return;
1201 Next = regnext(scan);
1202 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1203 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1204 depth,
1205 str,
1206 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1207 Next ? (REG_NODE_NUM(Next)) : 0 );
1208 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1209 Perl_re_printf( aTHX_ "\n");
1210 });
1211}
1212
1213
1214# define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1215 S_debug_studydata(aTHX_ where, data, depth, is_inf)
1216
1217# define DEBUG_PEEP(str, scan, depth, flags) \
1218 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1219
1220#else
1221# define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1222# define DEBUG_PEEP(str, scan, depth, flags) NOOP
1223#endif
1de06328 1224
cb41e5d6 1225
c6871b76
KW
1226/* =========================================================
1227 * BEGIN edit_distance stuff.
1228 *
1229 * This calculates how many single character changes of any type are needed to
1230 * transform a string into another one. It is taken from version 3.1 of
1231 *
1232 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1233 */
1234
1235/* Our unsorted dictionary linked list. */
1236/* Note we use UVs, not chars. */
1237
1238struct dictionary{
1239 UV key;
1240 UV value;
1241 struct dictionary* next;
1242};
1243typedef struct dictionary item;
1244
1245
1246PERL_STATIC_INLINE item*
88f063b4 1247push(UV key, item* curr)
c6871b76
KW
1248{
1249 item* head;
d09f14bf 1250 Newx(head, 1, item);
c6871b76
KW
1251 head->key = key;
1252 head->value = 0;
1253 head->next = curr;
1254 return head;
1255}
1256
1257
1258PERL_STATIC_INLINE item*
1259find(item* head, UV key)
1260{
1261 item* iterator = head;
1262 while (iterator){
1263 if (iterator->key == key){
1264 return iterator;
1265 }
1266 iterator = iterator->next;
1267 }
1268
1269 return NULL;
1270}
1271
1272PERL_STATIC_INLINE item*
88f063b4 1273uniquePush(item* head, UV key)
c6871b76
KW
1274{
1275 item* iterator = head;
1276
1277 while (iterator){
1278 if (iterator->key == key) {
1279 return head;
1280 }
1281 iterator = iterator->next;
1282 }
1283
88f063b4 1284 return push(key, head);
c6871b76
KW
1285}
1286
1287PERL_STATIC_INLINE void
1288dict_free(item* head)
1289{
1290 item* iterator = head;
1291
1292 while (iterator) {
1293 item* temp = iterator;
1294 iterator = iterator->next;
1295 Safefree(temp);
1296 }
1297
1298 head = NULL;
1299}
1300
1301/* End of Dictionary Stuff */
1302
1303/* All calculations/work are done here */
1304STATIC int
1305S_edit_distance(const UV* src,
1306 const UV* tgt,
1307 const STRLEN x, /* length of src[] */
1308 const STRLEN y, /* length of tgt[] */
1309 const SSize_t maxDistance
1310)
1311{
1312 item *head = NULL;
88f063b4 1313 UV swapCount, swapScore, targetCharCount, i, j;
c6871b76
KW
1314 UV *scores;
1315 UV score_ceil = x + y;
1316
1317 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1318
1319 /* intialize matrix start values */
d09f14bf 1320 Newx(scores, ( (x + 2) * (y + 2)), UV);
c6871b76
KW
1321 scores[0] = score_ceil;
1322 scores[1 * (y + 2) + 0] = score_ceil;
1323 scores[0 * (y + 2) + 1] = score_ceil;
1324 scores[1 * (y + 2) + 1] = 0;
88f063b4 1325 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
c6871b76
KW
1326
1327 /* work loops */
1328 /* i = src index */
1329 /* j = tgt index */
1330 for (i=1;i<=x;i++) {
1331 if (i < x)
88f063b4 1332 head = uniquePush(head, src[i]);
c6871b76
KW
1333 scores[(i+1) * (y + 2) + 1] = i;
1334 scores[(i+1) * (y + 2) + 0] = score_ceil;
1335 swapCount = 0;
1336
1337 for (j=1;j<=y;j++) {
1338 if (i == 1) {
1339 if(j < y)
88f063b4 1340 head = uniquePush(head, tgt[j]);
c6871b76
KW
1341 scores[1 * (y + 2) + (j + 1)] = j;
1342 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1343 }
1344
88f063b4 1345 targetCharCount = find(head, tgt[j-1])->value;
c6871b76
KW
1346 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1347
1348 if (src[i-1] != tgt[j-1]){
1349 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1350 }
1351 else {
1352 swapCount = j;
1353 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1354 }
1355 }
1356
88f063b4 1357 find(head, src[i-1])->value = i;
c6871b76
KW
1358 }
1359
1360 {
1361 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1362 dict_free(head);
1363 Safefree(scores);
1364 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1365 }
1366}
1367
1368/* END of edit_distance() stuff
1369 * ========================================================= */
1370
8e35b056
KW
1371/* is c a control character for which we have a mnemonic? */
1372#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1373
549b4e78
KW
1374STATIC const char *
1375S_cntrl_to_mnemonic(const U8 c)
1376{
1377 /* Returns the mnemonic string that represents character 'c', if one
1378 * exists; NULL otherwise. The only ones that exist for the purposes of
1379 * this routine are a few control characters */
1380
1381 switch (c) {
1382 case '\a': return "\\a";
1383 case '\b': return "\\b";
1384 case ESC_NATIVE: return "\\e";
1385 case '\f': return "\\f";
1386 case '\n': return "\\n";
1387 case '\r': return "\\r";
1388 case '\t': return "\\t";
1389 }
1390
1391 return NULL;
1392}
1393
653099ff 1394/* Mark that we cannot extend a found fixed substring at this point.
37b6262f 1395 Update the longest found anchored substring or the longest found
653099ff
GS
1396 floating substrings if needed. */
1397
4327152a 1398STATIC void
ea3daa5d
FC
1399S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1400 SSize_t *minlenp, int is_inf)
c277df42 1401{
e1ec3a88 1402 const STRLEN l = CHR_SVLEN(data->last_found);
2099df82 1403 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
a2ca2017 1404 const STRLEN old_l = CHR_SVLEN(longest_sv);
1de06328 1405 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1406
7918f24d
NC
1407 PERL_ARGS_ASSERT_SCAN_COMMIT;
1408
c277df42 1409 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
2099df82 1410 const U8 i = data->cur_is_floating;
a2ca2017 1411 SvSetMagicSV(longest_sv, data->last_found);
37b6262f
DM
1412 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1413
2099df82
DM
1414 if (!i) /* fixed */
1415 data->substrs[0].max_offset = data->substrs[0].min_offset;
1416 else { /* float */
1417 data->substrs[1].max_offset = (l
646e8787
DM
1418 ? data->last_start_max
1419 : (data->pos_delta > SSize_t_MAX - data->pos_min
ea3daa5d
FC
1420 ? SSize_t_MAX
1421 : data->pos_min + data->pos_delta));
1422 if (is_inf
2099df82
DM
1423 || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1424 data->substrs[1].max_offset = SSize_t_MAX;
37b6262f 1425 }
11683ecb 1426
37b6262f
DM
1427 if (data->flags & SF_BEFORE_EOL)
1428 data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
1429 else
1430 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1431 data->substrs[i].minlenp = minlenp;
1432 data->substrs[i].lookbehind = 0;
c277df42 1433 }
37b6262f 1434
c277df42 1435 SvCUR_set(data->last_found, 0);
0eda9292 1436 {
a28509cc 1437 SV * const sv = data->last_found;
097eb12c
AL
1438 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1439 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1440 if (mg)
1441 mg->mg_len = 0;
1442 }
0eda9292 1443 }
c277df42
IZ
1444 data->last_end = -1;
1445 data->flags &= ~SF_BEFORE_EOL;
f5a36d78 1446 DEBUG_STUDYDATA("commit", data, 0, is_inf);
c277df42
IZ
1447}
1448
cdd87c1d
KW
1449/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1450 * list that describes which code points it matches */
1451
653099ff 1452STATIC void
3420edd7 1453S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 1454{
cdd87c1d
KW
1455 /* Set the SSC 'ssc' to match an empty string or any code point */
1456
557bd3fb 1457 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 1458
71068078 1459 assert(is_ANYOF_SYNTHETIC(ssc));
3fffb88a 1460
0854ea0b
KW
1461 /* mortalize so won't leak */
1462 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
93e92956 1463 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
653099ff
GS
1464}
1465
653099ff 1466STATIC int
dc3bf405 1467S_ssc_is_anything(const regnode_ssc *ssc)
653099ff 1468{
c144baaa
KW
1469 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1470 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1471 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1472 * in any way, so there's no point in using it */
cdd87c1d
KW
1473
1474 UV start, end;
1475 bool ret;
653099ff 1476
557bd3fb 1477 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 1478
71068078 1479 assert(is_ANYOF_SYNTHETIC(ssc));
cdd87c1d 1480
93e92956 1481 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
cdd87c1d
KW
1482 return FALSE;
1483 }
1484
1485 /* See if the list consists solely of the range 0 - Infinity */
1486 invlist_iterinit(ssc->invlist);
1487 ret = invlist_iternext(ssc->invlist, &start, &end)
1488 && start == 0
1489 && end == UV_MAX;
1490
1491 invlist_iterfinish(ssc->invlist);
1492
1493 if (ret) {
1494 return TRUE;
1495 }
1496
1497 /* If e.g., both \w and \W are set, matches everything */
e0e1be5f 1498 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1499 int i;
1500 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1501 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1502 return TRUE;
1503 }
1504 }
1505 }
1506
1507 return FALSE;
653099ff
GS
1508}
1509
653099ff 1510STATIC void
cdd87c1d 1511S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 1512{
cdd87c1d
KW
1513 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1514 * string, any code point, or any posix class under locale */
1515
557bd3fb 1516 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 1517
557bd3fb 1518 Zero(ssc, 1, regnode_ssc);
71068078 1519 set_ANYOF_SYNTHETIC(ssc);
93e92956 1520 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
3420edd7 1521 ssc_anything(ssc);
cdd87c1d 1522
2f306ab9
KW
1523 /* If any portion of the regex is to operate under locale rules that aren't
1524 * fully known at compile time, initialization includes it. The reason
1525 * this isn't done for all regexes is that the optimizer was written under
1526 * the assumption that locale was all-or-nothing. Given the complexity and
1527 * lack of documentation in the optimizer, and that there are inadequate
1528 * test cases for locale, many parts of it may not work properly, it is
1529 * safest to avoid locale unless necessary. */
cdd87c1d
KW
1530 if (RExC_contains_locale) {
1531 ANYOF_POSIXL_SETALL(ssc);
cdd87c1d
KW
1532 }
1533 else {
1534 ANYOF_POSIXL_ZERO(ssc);
1535 }
653099ff
GS
1536}
1537
b423522f 1538STATIC int
dc3bf405
BF
1539S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1540 const regnode_ssc *ssc)
b423522f
KW
1541{
1542 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1543 * to the list of code points matched, and locale posix classes; hence does
1544 * not check its flags) */
1545
1546 UV start, end;
1547 bool ret;
1548
1549 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1550
71068078 1551 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1552
1553 invlist_iterinit(ssc->invlist);
1554 ret = invlist_iternext(ssc->invlist, &start, &end)
1555 && start == 0
1556 && end == UV_MAX;
1557
1558 invlist_iterfinish(ssc->invlist);
1559
1560 if (! ret) {
1561 return FALSE;
1562 }
1563
e0e1be5f 1564 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
31f05a37 1565 return FALSE;
b423522f
KW
1566 }
1567
1568 return TRUE;
1569}
1570
4ebed06a
KW
1571#define INVLIST_INDEX 0
1572#define ONLY_LOCALE_MATCHES_INDEX 1
1573#define DEFERRED_USER_DEFINED_INDEX 2
1574
b423522f
KW
1575STATIC SV*
1576S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
5c0f85ef 1577 const regnode_charclass* const node)
b423522f
KW
1578{
1579 /* Returns a mortal inversion list defining which code points are matched
1580 * by 'node', which is of type ANYOF. Handles complementing the result if
1581 * appropriate. If some code points aren't knowable at this time, the
31f05a37
KW
1582 * returned list must, and will, contain every code point that is a
1583 * possibility. */
b423522f 1584
1565c085 1585 dVAR;
e2506fa7 1586 SV* invlist = NULL;
1ee208c4 1587 SV* only_utf8_locale_invlist = NULL;
b423522f
KW
1588 unsigned int i;
1589 const U32 n = ARG(node);
31f05a37 1590 bool new_node_has_latin1 = FALSE;
2d5613be 1591 const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFRb))
29a889ef
KW
1592 ? 0
1593 : ANYOF_FLAGS(node);
b423522f
KW
1594
1595 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1596
1597 /* Look at the data structure created by S_set_ANYOF_arg() */
93e92956 1598 if (n != ANYOF_ONLY_HAS_BITMAP) {
b423522f
KW
1599 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1600 AV * const av = MUTABLE_AV(SvRV(rv));
1601 SV **const ary = AvARRAY(av);
1602 assert(RExC_rxi->data->what[n] == 's');
1603
4ebed06a 1604 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
b423522f 1605
6ed02bb6
KW
1606 /* Here there are things that won't be known until runtime -- we
1607 * have to assume it could be anything */
e2506fa7 1608 invlist = sv_2mortal(_new_invlist(1));
b423522f
KW
1609 return _add_range_to_invlist(invlist, 0, UV_MAX);
1610 }
4ebed06a 1611 else if (ary[INVLIST_INDEX]) {
b423522f 1612
6ed02bb6 1613 /* Use the node's inversion list */
4ebed06a 1614 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1ee208c4
KW
1615 }
1616
1617 /* Get the code points valid only under UTF-8 locales */
766d6d33 1618 if ( (flags & ANYOFL_FOLD)
4ebed06a 1619 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1ee208c4 1620 {
4ebed06a 1621 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
b423522f
KW
1622 }
1623 }
1624
e2506fa7
KW
1625 if (! invlist) {
1626 invlist = sv_2mortal(_new_invlist(0));
1627 }
1628
dcb20b36
KW
1629 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1630 * code points, and an inversion list for the others, but if there are code
1631 * points that should match only conditionally on the target string being
1632 * UTF-8, those are placed in the inversion list, and not the bitmap.
1633 * Since there are circumstances under which they could match, they are
1634 * included in the SSC. But if the ANYOF node is to be inverted, we have
1635 * to exclude them here, so that when we invert below, the end result
1636 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1637 * have to do this here before we add the unconditionally matched code
1638 * points */
766d6d33 1639 if (flags & ANYOF_INVERT) {
b423522f
KW
1640 _invlist_intersection_complement_2nd(invlist,
1641 PL_UpperLatin1,
1642 &invlist);
1643 }
1644
1645 /* Add in the points from the bit map */
2d5613be 1646 if (! inRANGE(OP(node), ANYOFH, ANYOFRb)) {
90973738
KW
1647 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1648 if (ANYOF_BITMAP_TEST(node, i)) {
1649 unsigned int start = i++;
1650
1651 for (; i < NUM_ANYOF_CODE_POINTS
1652 && ANYOF_BITMAP_TEST(node, i); ++i)
1653 {
1654 /* empty */
1655 }
1656 invlist = _add_range_to_invlist(invlist, start, i-1);
1657 new_node_has_latin1 = TRUE;
6f8848d5 1658 }
b423522f
KW
1659 }
1660 }
1661
1662 /* If this can match all upper Latin1 code points, have to add them
ac33c516
KW
1663 * as well. But don't add them if inverting, as when that gets done below,
1664 * it would exclude all these characters, including the ones it shouldn't
1665 * that were added just above */
766d6d33
KW
1666 if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
1667 && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
f240c685 1668 {
b423522f
KW
1669 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1670 }
1671
1672 /* Similarly for these */
766d6d33 1673 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
e0a1ff7a 1674 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
b423522f
KW
1675 }
1676
766d6d33 1677 if (flags & ANYOF_INVERT) {
b423522f
KW
1678 _invlist_invert(invlist);
1679 }
766d6d33 1680 else if (flags & ANYOFL_FOLD) {
35b8412f 1681 if (new_node_has_latin1) {
31f05a37 1682
26be5fe6
KW
1683 /* Under /li, any 0-255 could fold to any other 0-255, depending on
1684 * the locale. We can skip this if there are no 0-255 at all. */
1685 _invlist_union(invlist, PL_Latin1, &invlist);
35b8412f
KW
1686
1687 invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1688 invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1689 }
1690 else {
1691 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1692 invlist = add_cp_to_invlist(invlist, 'I');
1693 }
1694 if (_invlist_contains_cp(invlist,
1695 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1696 {
1697 invlist = add_cp_to_invlist(invlist, 'i');
1698 }
1699 }
31f05a37
KW
1700 }
1701
1ee208c4
KW
1702 /* Similarly add the UTF-8 locale possible matches. These have to be
1703 * deferred until after the non-UTF-8 locale ones are taken care of just
1704 * above, or it leads to wrong results under ANYOF_INVERT */
1705 if (only_utf8_locale_invlist) {
31f05a37 1706 _invlist_union_maybe_complement_2nd(invlist,
1ee208c4 1707 only_utf8_locale_invlist,
766d6d33 1708 flags & ANYOF_INVERT,
31f05a37
KW
1709 &invlist);
1710 }
b423522f
KW
1711
1712 return invlist;
1713}
1714
1051e1c4 1715/* These two functions currently do the exact same thing */
557bd3fb 1716#define ssc_init_zero ssc_init
653099ff 1717
cdd87c1d
KW
1718#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1719#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1720
557bd3fb 1721/* 'AND' a given class with another one. Can create false positives. 'ssc'
93e92956
KW
1722 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1723 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1724
653099ff 1725STATIC void
b423522f 1726S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1727 const regnode_charclass *and_with)
653099ff 1728{
cdd87c1d
KW
1729 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1730 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1731
a0dd4231 1732 SV* anded_cp_list;
2d5613be 1733 U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFRb)
29a889ef
KW
1734 ? 0
1735 : ANYOF_FLAGS(and_with);
a0dd4231 1736 U8 anded_flags;
1e6ade67 1737
cdd87c1d 1738 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1739
71068078 1740 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1741
1742 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1743 * the code point inversion list and just the relevant flags */
71068078 1744 if (is_ANYOF_SYNTHETIC(and_with)) {
7dcac5f6 1745 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
765e6ecf 1746 anded_flags = and_with_flags;
e9b08962
KW
1747
1748 /* XXX This is a kludge around what appears to be deficiencies in the
1749 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1750 * there are paths through the optimizer where it doesn't get weeded
1751 * out when it should. And if we don't make some extra provision for
1752 * it like the code just below, it doesn't get added when it should.
1753 * This solution is to add it only when AND'ing, which is here, and
1754 * only when what is being AND'ed is the pristine, original node
1755 * matching anything. Thus it is like adding it to ssc_anything() but
1756 * only when the result is to be AND'ed. Probably the same solution
1757 * could be adopted for the same problem we have with /l matching,
1758 * which is solved differently in S_ssc_init(), and that would lead to
1759 * fewer false positives than that solution has. But if this solution
1760 * creates bugs, the consequences are only that a warning isn't raised
1761 * that should be; while the consequences for having /l bugs is
1762 * incorrect matches */
7dcac5f6 1763 if (ssc_is_anything((regnode_ssc *)and_with)) {
f240c685 1764 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
e9b08962 1765 }
a0dd4231
KW
1766 }
1767 else {
5c0f85ef 1768 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
f240c685 1769 if (OP(and_with) == ANYOFD) {
765e6ecf 1770 anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
f240c685
KW
1771 }
1772 else {
765e6ecf 1773 anded_flags = and_with_flags
f240c685 1774 &( ANYOF_COMMON_FLAGS
108316fb
KW
1775 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1776 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
765e6ecf 1777 if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
d1c40ef5
KW
1778 anded_flags &=
1779 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1780 }
f240c685 1781 }
a0dd4231
KW
1782 }
1783
1784 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1785
1786 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1787 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1788 * 'and_with' may be inverted. When not inverted, we have the situation of
1789 * computing:
1790 * (C1 | P1) & (C2 | P2)
1791 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1792 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1793 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1794 * <= ((C1 & C2) | P1 | P2)
1795 * Alternatively, the last few steps could be:
1796 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1797 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1798 * <= (C1 | C2 | (P1 & P2))
1799 * We favor the second approach if either P1 or P2 is non-empty. This is
1800 * because these components are a barrier to doing optimizations, as what
1801 * they match cannot be known until the moment of matching as they are
1802 * dependent on the current locale, 'AND"ing them likely will reduce or
1803 * eliminate them.
1804 * But we can do better if we know that C1,P1 are in their initial state (a
1805 * frequent occurrence), each matching everything:
1806 * (<everything>) & (C2 | P2) = C2 | P2
1807 * Similarly, if C2,P2 are in their initial state (again a frequent
1808 * occurrence), the result is a no-op
1809 * (C1 | P1) & (<everything>) = C1 | P1
1810 *
1811 * Inverted, we have
1812 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1813 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1814 * <= (C1 & ~C2) | (P1 & ~P2)
1815 * */
1aa99e6b 1816
765e6ecf 1817 if ((and_with_flags & ANYOF_INVERT)
71068078 1818 && ! is_ANYOF_SYNTHETIC(and_with))
a0dd4231 1819 {
cdd87c1d 1820 unsigned int i;
8951c461 1821
cdd87c1d
KW
1822 ssc_intersection(ssc,
1823 anded_cp_list,
1824 FALSE /* Has already been inverted */
1825 );
c6b76537 1826
cdd87c1d
KW
1827 /* If either P1 or P2 is empty, the intersection will be also; can skip
1828 * the loop */
765e6ecf 1829 if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
cdd87c1d
KW
1830 ANYOF_POSIXL_ZERO(ssc);
1831 }
e0e1be5f 1832 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1833
1834 /* Note that the Posix class component P from 'and_with' actually
1835 * looks like:
1836 * P = Pa | Pb | ... | Pn
1837 * where each component is one posix class, such as in [\w\s].
1838 * Thus
1839 * ~P = ~(Pa | Pb | ... | Pn)
1840 * = ~Pa & ~Pb & ... & ~Pn
1841 * <= ~Pa | ~Pb | ... | ~Pn
1842 * The last is something we can easily calculate, but unfortunately
1843 * is likely to have many false positives. We could do better
1844 * in some (but certainly not all) instances if two classes in
1845 * P have known relationships. For example
1846 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1847 * So
1848 * :lower: & :print: = :lower:
1849 * And similarly for classes that must be disjoint. For example,
1850 * since \s and \w can have no elements in common based on rules in
1851 * the POSIX standard,
1852 * \w & ^\S = nothing
1853 * Unfortunately, some vendor locales do not meet the Posix
1854 * standard, in particular almost everything by Microsoft.
1855 * The loop below just changes e.g., \w into \W and vice versa */
1856
1ee208c4 1857 regnode_charclass_posixl temp;
cdd87c1d
KW
1858 int add = 1; /* To calculate the index of the complement */
1859
b1234259 1860 Zero(&temp, 1, regnode_charclass_posixl);
cdd87c1d
KW
1861 ANYOF_POSIXL_ZERO(&temp);
1862 for (i = 0; i < ANYOF_MAX; i++) {
1863 assert(i % 2 != 0
7dcac5f6
KW
1864 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1865 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
cdd87c1d 1866
7dcac5f6 1867 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
cdd87c1d
KW
1868 ANYOF_POSIXL_SET(&temp, i + add);
1869 }
1870 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1871 }
1872 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1873
cdd87c1d
KW
1874 } /* else ssc already has no posixes */
1875 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1876 in its initial state */
71068078 1877 else if (! is_ANYOF_SYNTHETIC(and_with)
7dcac5f6 1878 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
cdd87c1d
KW
1879 {
1880 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1881 * copy it over 'ssc' */
1882 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
71068078 1883 if (is_ANYOF_SYNTHETIC(and_with)) {
cdd87c1d
KW
1884 StructCopy(and_with, ssc, regnode_ssc);
1885 }
1886 else {
1887 ssc->invlist = anded_cp_list;
1888 ANYOF_POSIXL_ZERO(ssc);
765e6ecf 1889 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1890 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
cdd87c1d
KW
1891 }
1892 }
1893 }
e0e1be5f 1894 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
765e6ecf 1895 || (and_with_flags & ANYOF_MATCHES_POSIXL))
cdd87c1d
KW
1896 {
1897 /* One or the other of P1, P2 is non-empty. */
765e6ecf 1898 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1ea8b7fe
KW
1899 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1900 }
cdd87c1d
KW
1901 ssc_union(ssc, anded_cp_list, FALSE);
1902 }
1903 else { /* P1 = P2 = empty */
1904 ssc_intersection(ssc, anded_cp_list, FALSE);
1905 }
137165a6 1906 }
653099ff
GS
1907}
1908
653099ff 1909STATIC void
cdd87c1d 1910S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1911 const regnode_charclass *or_with)
653099ff 1912{
cdd87c1d
KW
1913 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1914 * another SSC or a regular ANYOF class. Can create false positives if
1915 * 'or_with' is to be inverted. */
7918f24d 1916
a0dd4231
KW
1917 SV* ored_cp_list;
1918 U8 ored_flags;
2d5613be 1919 U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFRb)
29a889ef
KW
1920 ? 0
1921 : ANYOF_FLAGS(or_with);
c6b76537 1922
cdd87c1d 1923 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 1924
71068078 1925 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1926
1927 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1928 * the code point inversion list and just the relevant flags */
71068078 1929 if (is_ANYOF_SYNTHETIC(or_with)) {
7dcac5f6 1930 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
765e6ecf 1931 ored_flags = or_with_flags;
a0dd4231
KW
1932 }
1933 else {
5c0f85ef 1934 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
765e6ecf 1935 ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
f240c685
KW
1936 if (OP(or_with) != ANYOFD) {
1937 ored_flags
765e6ecf 1938 |= or_with_flags
108316fb
KW
1939 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1940 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
765e6ecf 1941 if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
d1c40ef5
KW
1942 ored_flags |=
1943 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1944 }
f240c685 1945 }
a0dd4231
KW
1946 }
1947
1948 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
1949
1950 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1951 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1952 * 'or_with' may be inverted. When not inverted, we have the simple
1953 * situation of computing:
1954 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1955 * If P1|P2 yields a situation with both a class and its complement are
1956 * set, like having both \w and \W, this matches all code points, and we
1957 * can delete these from the P component of the ssc going forward. XXX We
1958 * might be able to delete all the P components, but I (khw) am not certain
1959 * about this, and it is better to be safe.
1960 *
1961 * Inverted, we have
1962 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1963 * <= (C1 | P1) | ~C2
1964 * <= (C1 | ~C2) | P1
1965 * (which results in actually simpler code than the non-inverted case)
1966 * */
9826f543 1967
765e6ecf 1968 if ((or_with_flags & ANYOF_INVERT)
71068078 1969 && ! is_ANYOF_SYNTHETIC(or_with))
a0dd4231 1970 {
cdd87c1d 1971 /* We ignore P2, leaving P1 going forward */
1ea8b7fe 1972 } /* else Not inverted */
765e6ecf 1973 else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1974 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
e0e1be5f 1975 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1976 unsigned int i;
1977 for (i = 0; i < ANYOF_MAX; i += 2) {
1978 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1979 {
1980 ssc_match_all_cp(ssc);
1981 ANYOF_POSIXL_CLEAR(ssc, i);
1982 ANYOF_POSIXL_CLEAR(ssc, i+1);
cdd87c1d
KW
1983 }
1984 }
1985 }
1aa99e6b 1986 }
cdd87c1d
KW
1987
1988 ssc_union(ssc,
1989 ored_cp_list,
1990 FALSE /* Already has been inverted */
1991 );
653099ff
GS
1992}
1993
b423522f
KW
1994PERL_STATIC_INLINE void
1995S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1996{
1997 PERL_ARGS_ASSERT_SSC_UNION;
1998
71068078 1999 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2000
2001 _invlist_union_maybe_complement_2nd(ssc->invlist,
2002 invlist,
2003 invert2nd,
2004 &ssc->invlist);
2005}
2006
2007PERL_STATIC_INLINE void
2008S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2009 SV* const invlist,
2010 const bool invert2nd)
2011{
2012 PERL_ARGS_ASSERT_SSC_INTERSECTION;
2013
71068078 2014 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2015
2016 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2017 invlist,
2018 invert2nd,
2019 &ssc->invlist);
2020}
2021
2022PERL_STATIC_INLINE void
2023S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2024{
2025 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2026
71068078 2027 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2028
2029 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2030}
2031
2032PERL_STATIC_INLINE void
2033S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2034{
2035 /* AND just the single code point 'cp' into the SSC 'ssc' */
2036
2037 SV* cp_list = _new_invlist(2);
2038
2039 PERL_ARGS_ASSERT_SSC_CP_AND;
2040
71068078 2041 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2042
2043 cp_list = add_cp_to_invlist(cp_list, cp);
2044 ssc_intersection(ssc, cp_list,
2045 FALSE /* Not inverted */
2046 );
2047 SvREFCNT_dec_NN(cp_list);
2048}
2049
2050PERL_STATIC_INLINE void
dc3bf405 2051S_ssc_clear_locale(regnode_ssc *ssc)
b423522f
KW
2052{
2053 /* Set the SSC 'ssc' to not match any locale things */
b423522f
KW
2054 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2055
71068078 2056 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2057
2058 ANYOF_POSIXL_ZERO(ssc);
2059 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2060}
2061
b35552de
KW
2062#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2063
2064STATIC bool
2065S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2066{
2067 /* The synthetic start class is used to hopefully quickly winnow down
2068 * places where a pattern could start a match in the target string. If it
2069 * doesn't really narrow things down that much, there isn't much point to
2070 * having the overhead of using it. This function uses some very crude
2071 * heuristics to decide if to use the ssc or not.
2072 *
2073 * It returns TRUE if 'ssc' rules out more than half what it considers to
2074 * be the "likely" possible matches, but of course it doesn't know what the
2075 * actual things being matched are going to be; these are only guesses
2076 *
2077 * For /l matches, it assumes that the only likely matches are going to be
2078 * in the 0-255 range, uniformly distributed, so half of that is 127
2079 * For /a and /d matches, it assumes that the likely matches will be just
2080 * the ASCII range, so half of that is 63
2081 * For /u and there isn't anything matching above the Latin1 range, it
2082 * assumes that that is the only range likely to be matched, and uses
2083 * half that as the cut-off: 127. If anything matches above Latin1,
2084 * it assumes that all of Unicode could match (uniformly), except for
2085 * non-Unicode code points and things in the General Category "Other"
2086 * (unassigned, private use, surrogates, controls and formats). This
2087 * is a much large number. */
2088
b35552de
KW
2089 U32 count = 0; /* Running total of number of code points matched by
2090 'ssc' */
2091 UV start, end; /* Start and end points of current range in inversion
26be5fe6 2092 XXX outdated. UTF-8 locales are common, what about invert? list */
72400949
KW
2093 const U32 max_code_points = (LOC)
2094 ? 256
f6e8f31e 2095 : (( ! UNI_SEMANTICS
1a26cbcb 2096 || invlist_highest(ssc->invlist) < 256)
72400949
KW
2097 ? 128
2098 : NON_OTHER_COUNT);
2099 const U32 max_match = max_code_points / 2;
b35552de
KW
2100
2101 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2102
2103 invlist_iterinit(ssc->invlist);
2104 while (invlist_iternext(ssc->invlist, &start, &end)) {
72400949
KW
2105 if (start >= max_code_points) {
2106 break;
b35552de 2107 }
72400949 2108 end = MIN(end, max_code_points - 1);
b35552de 2109 count += end - start + 1;
72400949 2110 if (count >= max_match) {
b35552de
KW
2111 invlist_iterfinish(ssc->invlist);
2112 return FALSE;
2113 }
2114 }
2115
2116 return TRUE;
2117}
2118
2119
b423522f
KW
2120STATIC void
2121S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2122{
2123 /* The inversion list in the SSC is marked mortal; now we need a more
2124 * permanent copy, which is stored the same way that is done in a regular
dcb20b36
KW
2125 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2126 * map */
b423522f 2127
28118b9c 2128 SV* invlist = invlist_clone(ssc->invlist, NULL);
b423522f
KW
2129
2130 PERL_ARGS_ASSERT_SSC_FINALIZE;
2131
71068078 2132 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f 2133
a0dd4231 2134 /* The code in this file assumes that all but these flags aren't relevant
93e92956
KW
2135 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2136 * by the time we reach here */
f240c685
KW
2137 assert(! (ANYOF_FLAGS(ssc)
2138 & ~( ANYOF_COMMON_FLAGS
108316fb
KW
2139 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2140 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
a0dd4231 2141
b423522f
KW
2142 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2143
4c404f26 2144 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
21c3fd9d 2145 SvREFCNT_dec(invlist);
b423522f 2146
85c8e306
KW
2147 /* Make sure is clone-safe */
2148 ssc->invlist = NULL;
2149
e0e1be5f 2150 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
93e92956 2151 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
d156f5cb 2152 OP(ssc) = ANYOFPOSIXL;
e0e1be5f 2153 }
d156f5cb 2154 else if (RExC_contains_locale) {
b2e90ddf
KW
2155 OP(ssc) = ANYOFL;
2156 }
2157
1462525b 2158 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
b423522f
KW
2159}
2160
a3621e74
YO
2161#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2162#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2163#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
538e84ed
KW
2164#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2165 ? (TRIE_LIST_CUR( idx ) - 1) \
2166 : 0 )
a3621e74 2167
3dab1dad
YO
2168
2169#ifdef DEBUGGING
07be1b83 2170/*
2b8b4781
NC
2171 dump_trie(trie,widecharmap,revcharmap)
2172 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2173 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
2174
2175 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
2176 The _interim_ variants are used for debugging the interim
2177 tables that are used to generate the final compressed
2178 representation which is what dump_trie expects.
2179
486ec47a 2180 Part of the reason for their existence is to provide a form
3dab1dad 2181 of documentation as to how the different representations function.
07be1b83
YO
2182
2183*/
3dab1dad
YO
2184
2185/*
3dab1dad
YO
2186 Dumps the final compressed table form of the trie to Perl_debug_log.
2187 Used for debugging make_trie().
2188*/
b9a59e08 2189
3dab1dad 2190STATIC void
2b8b4781
NC
2191S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2192 AV *revcharmap, U32 depth)
3dab1dad
YO
2193{
2194 U32 state;
ab3bbdeb 2195 SV *sv=sv_newmortal();
55eed653 2196 int colwidth= widecharmap ? 6 : 4;
2e64971a 2197 U16 word;
3dab1dad
YO
2198 GET_RE_DEBUG_FLAGS_DECL;
2199
7918f24d 2200 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 2201
6ad9a8ab 2202 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
1e37780e 2203 depth+1, "Match","Base","Ofs" );
3dab1dad
YO
2204
2205 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 2206 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 2207 if ( tmp ) {
6ad9a8ab 2208 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2209 colwidth,
538e84ed 2210 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2211 PL_colors[0], PL_colors[1],
2212 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2213 PERL_PV_ESCAPE_FIRSTCHAR
2214 )
ab3bbdeb 2215 );
3dab1dad
YO
2216 }
2217 }
1e37780e
YO
2218 Perl_re_printf( aTHX_ "\n");
2219 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
3dab1dad
YO
2220
2221 for( state = 0 ; state < trie->uniquecharcount ; state++ )
6ad9a8ab
YO
2222 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2223 Perl_re_printf( aTHX_ "\n");
3dab1dad 2224
1e2e3d02 2225 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 2226 const U32 base = trie->states[ state ].trans.base;
3dab1dad 2227
147e3846 2228 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
3dab1dad
YO
2229
2230 if ( trie->states[ state ].wordnum ) {
1e37780e 2231 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
3dab1dad 2232 } else {
6ad9a8ab 2233 Perl_re_printf( aTHX_ "%6s", "" );
3dab1dad
YO
2234 }
2235
147e3846 2236 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
3dab1dad
YO
2237
2238 if ( base ) {
2239 U32 ofs = 0;
2240
2241 while( ( base + ofs < trie->uniquecharcount ) ||
2242 ( base + ofs - trie->uniquecharcount < trie->lasttrans
538e84ed
KW
2243 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2244 != state))
3dab1dad
YO
2245 ofs++;
2246
147e3846 2247 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
3dab1dad
YO
2248
2249 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
538e84ed
KW
2250 if ( ( base + ofs >= trie->uniquecharcount )
2251 && ( base + ofs - trie->uniquecharcount
2252 < trie->lasttrans )
2253 && trie->trans[ base + ofs
2254 - trie->uniquecharcount ].check == state )
3dab1dad 2255 {
147e3846 2256 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
1e37780e
YO
2257 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2258 );
3dab1dad 2259 } else {
88f063b4 2260 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
3dab1dad
YO
2261 }
2262 }
2263
6ad9a8ab 2264 Perl_re_printf( aTHX_ "]");
3dab1dad
YO
2265
2266 }
6ad9a8ab 2267 Perl_re_printf( aTHX_ "\n" );
3dab1dad 2268 }
6ad9a8ab 2269 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
cb41e5d6 2270 depth);
2e64971a 2271 for (word=1; word <= trie->wordcount; word++) {
6ad9a8ab 2272 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2e64971a
DM
2273 (int)word, (int)(trie->wordinfo[word].prev),
2274 (int)(trie->wordinfo[word].len));
2275 }
6ad9a8ab 2276 Perl_re_printf( aTHX_ "\n" );
538e84ed 2277}
3dab1dad 2278/*
3dab1dad 2279 Dumps a fully constructed but uncompressed trie in list form.
538e84ed 2280 List tries normally only are used for construction when the number of
3dab1dad
YO
2281 possible chars (trie->uniquecharcount) is very high.
2282 Used for debugging make_trie().
2283*/
2284STATIC void
55eed653 2285S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2286 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2287 U32 depth)
3dab1dad
YO
2288{
2289 U32 state;
ab3bbdeb 2290 SV *sv=sv_newmortal();
55eed653 2291 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2292 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2293
2294 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2295
3dab1dad 2296 /* print out the table precompression. */
6ad9a8ab 2297 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
cb41e5d6 2298 depth+1 );
6ad9a8ab 2299 Perl_re_indentf( aTHX_ "%s",
cb41e5d6 2300 depth+1, "------:-----+-----------------\n" );
538e84ed 2301
3dab1dad
YO
2302 for( state=1 ; state < next_alloc ; state ++ ) {
2303 U16 charid;
538e84ed 2304
147e3846 2305 Perl_re_indentf( aTHX_ " %4" UVXf " :",
cb41e5d6 2306 depth+1, (UV)state );
3dab1dad 2307 if ( ! trie->states[ state ].wordnum ) {
6ad9a8ab 2308 Perl_re_printf( aTHX_ "%5s| ","");
3dab1dad 2309 } else {
6ad9a8ab 2310 Perl_re_printf( aTHX_ "W%4x| ",
3dab1dad
YO
2311 trie->states[ state ].wordnum
2312 );
2313 }
2314 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
538e84ed 2315 SV ** const tmp = av_fetch( revcharmap,
88f063b4 2316 TRIE_LIST_ITEM(state, charid).forid, 0);
ab3bbdeb 2317 if ( tmp ) {
147e3846 2318 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
ab3bbdeb 2319 colwidth,
538e84ed
KW
2320 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2321 colwidth,
2322 PL_colors[0], PL_colors[1],
2323 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2324 | PERL_PV_ESCAPE_FIRSTCHAR
ab3bbdeb 2325 ) ,
88f063b4
KW
2326 TRIE_LIST_ITEM(state, charid).forid,
2327 (UV)TRIE_LIST_ITEM(state, charid).newstate
1e2e3d02 2328 );
538e84ed 2329 if (!(charid % 10))
6ad9a8ab 2330 Perl_re_printf( aTHX_ "\n%*s| ",
664e119d 2331 (int)((depth * 2) + 14), "");
1e2e3d02 2332 }
ab3bbdeb 2333 }
6ad9a8ab 2334 Perl_re_printf( aTHX_ "\n");
3dab1dad 2335 }
538e84ed 2336}
3dab1dad
YO
2337
2338/*
3dab1dad 2339 Dumps a fully constructed but uncompressed trie in table form.
538e84ed
KW
2340 This is the normal DFA style state transition table, with a few
2341 twists to facilitate compression later.
3dab1dad
YO
2342 Used for debugging make_trie().
2343*/
2344STATIC void
55eed653 2345S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2346 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2347 U32 depth)
3dab1dad
YO
2348{
2349 U32 state;
2350 U16 charid;
ab3bbdeb 2351 SV *sv=sv_newmortal();
55eed653 2352 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2353 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2354
2355 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
538e84ed 2356
3dab1dad
YO
2357 /*
2358 print out the table precompression so that we can do a visual check
2359 that they are identical.
2360 */
538e84ed 2361
6ad9a8ab 2362 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
3dab1dad
YO
2363
2364 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 2365 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 2366 if ( tmp ) {
6ad9a8ab 2367 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2368 colwidth,
538e84ed 2369 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2370 PL_colors[0], PL_colors[1],
2371 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2372 PERL_PV_ESCAPE_FIRSTCHAR
2373 )
ab3bbdeb 2374 );
3dab1dad
YO
2375 }
2376 }
2377
4aaafc03
YO
2378 Perl_re_printf( aTHX_ "\n");
2379 Perl_re_indentf( aTHX_ "State+-", depth+1 );
3dab1dad
YO
2380
2381 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
6ad9a8ab 2382 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
3dab1dad
YO
2383 }
2384
6ad9a8ab 2385 Perl_re_printf( aTHX_ "\n" );
3dab1dad
YO
2386
2387 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2388
147e3846 2389 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
cb41e5d6 2390 depth+1,
3dab1dad
YO
2391 (UV)TRIE_NODENUM( state ) );
2392
2393 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
2394 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2395 if (v)
147e3846 2396 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
ab3bbdeb 2397 else
6ad9a8ab 2398 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
3dab1dad
YO
2399 }
2400 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
147e3846 2401 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
538e84ed 2402 (UV)trie->trans[ state ].check );
3dab1dad 2403 } else {
147e3846 2404 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
538e84ed 2405 (UV)trie->trans[ state ].check,
3dab1dad
YO
2406 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2407 }
2408 }
07be1b83 2409}
3dab1dad
YO
2410
2411#endif
2412
2e64971a 2413
786e8c11
YO
2414/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2415 startbranch: the first branch in the whole branch sequence
2416 first : start branch of sequence of branch-exact nodes.
2417 May be the same as startbranch
2418 last : Thing following the last branch.
2419 May be the same as tail.
2420 tail : item following the branch sequence
2421 count : words in the sequence
a4525e78 2422 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
786e8c11 2423 depth : indent depth
3dab1dad 2424
786e8c11 2425Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 2426
786e8c11
YO
2427A trie is an N'ary tree where the branches are determined by digital
2428decomposition of the key. IE, at the root node you look up the 1st character and
2429follow that branch repeat until you find the end of the branches. Nodes can be
2430marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 2431
786e8c11 2432 /he|she|his|hers/
72f13be8 2433
786e8c11
YO
2434would convert into the following structure. Numbers represent states, letters
2435following numbers represent valid transitions on the letter from that state, if
2436the number is in square brackets it represents an accepting state, otherwise it
2437will be in parenthesis.
07be1b83 2438
786e8c11
YO
2439 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2440 | |
2441 | (2)
2442 | |
2443 (1) +-i->(6)-+-s->[7]
2444 |
2445 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 2446
786e8c11
YO
2447 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2448
2449This shows that when matching against the string 'hers' we will begin at state 1
2450read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2451then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2452is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2453single traverse. We store a mapping from accepting to state to which word was
2454matched, and then when we have multiple possibilities we try to complete the
b8fda935 2455rest of the regex in the order in which they occurred in the alternation.
786e8c11
YO
2456
2457The only prior NFA like behaviour that would be changed by the TRIE support is
2458the silent ignoring of duplicate alternations which are of the form:
2459
2460 / (DUPE|DUPE) X? (?{ ... }) Y /x
2461
4b714af6 2462Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 2463and without the optimisation. With the optimisations dupes will be silently
486ec47a 2464ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
2465the following demonstrates:
2466
2467 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2468
2469which prints out 'word' three times, but
2470
2471 'words'=~/(word|word|word)(?{ print $1 })S/
2472
2473which doesnt print it out at all. This is due to other optimisations kicking in.
2474
2475Example of what happens on a structural level:
2476
486ec47a 2477The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
2478
2479 1: CURLYM[1] {1,32767}(18)
2480 5: BRANCH(8)
2481 6: EXACT <ac>(16)
2482 8: BRANCH(11)
2483 9: EXACT <ad>(16)
2484 11: BRANCH(14)
2485 12: EXACT <ab>(16)
2486 16: SUCCEED(0)
2487 17: NOTHING(18)
2488 18: END(0)
2489
2490This would be optimizable with startbranch=5, first=5, last=16, tail=16
2491and should turn into:
2492
2493 1: CURLYM[1] {1,32767}(18)
2494 5: TRIE(16)
2495 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2496 <ac>
2497 <ad>
2498 <ab>
2499 16: SUCCEED(0)
2500 17: NOTHING(18)
2501 18: END(0)
2502
2503Cases where tail != last would be like /(?foo|bar)baz/:
2504
2505 1: BRANCH(4)
2506 2: EXACT <foo>(8)
2507 4: BRANCH(7)
2508 5: EXACT <bar>(8)
2509 7: TAIL(8)
2510 8: EXACT <baz>(10)
2511 10: END(0)
2512
2513which would be optimizable with startbranch=1, first=1, last=7, tail=8
2514and would end up looking like:
2515
2516 1: TRIE(8)
2517 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2518 <foo>
2519 <bar>
2520 7: TAIL(8)
2521 8: EXACT <baz>(10)
2522 10: END(0)
2523
c80e42f3 2524 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
2525
2526is the recommended Unicode-aware way of saying
2527
2528 *(d++) = uv;
2529*/
2530
fab2782b 2531#define TRIE_STORE_REVCHAR(val) \
786e8c11 2532 STMT_START { \
73031816 2533 if (UTF) { \
668fcfea 2534 SV *zlopp = newSV(UTF8_MAXBYTES); \
88c9ea1e 2535 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1d84a256
DM
2536 unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2537 *kapow = '\0'; \
73031816
NC
2538 SvCUR_set(zlopp, kapow - flrbbbbb); \
2539 SvPOK_on(zlopp); \
2540 SvUTF8_on(zlopp); \
2541 av_push(revcharmap, zlopp); \
2542 } else { \
fab2782b 2543 char ooooff = (char)val; \
73031816
NC
2544 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2545 } \
2546 } STMT_END
786e8c11 2547
914a25d5
KW
2548/* This gets the next character from the input, folding it if not already
2549 * folded. */
2550#define TRIE_READ_CHAR STMT_START { \
2551 wordlen++; \
2552 if ( UTF ) { \
2553 /* if it is UTF then it is either already folded, or does not need \
2554 * folding */ \
1c1d615a 2555 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
2556 } \
2557 else if (folder == PL_fold_latin1) { \
7d006b13
KW
2558 /* This folder implies Unicode rules, which in the range expressible \
2559 * by not UTF is the lower case, with the two exceptions, one of \
2560 * which should have been taken care of before calling this */ \
2561 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2562 uvc = toLOWER_L1(*uc); \
2563 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2564 len = 1; \
914a25d5
KW
2565 } else { \
2566 /* raw data, will be folded later if needed */ \
2567 uvc = (U32)*uc; \
2568 len = 1; \
2569 } \
786e8c11
YO
2570} STMT_END
2571
2572
2573
2574#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2575 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
00195859 2576 U32 ging = TRIE_LIST_LEN( state ) * 2; \
f9003953 2577 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
00195859 2578 TRIE_LIST_LEN( state ) = ging; \
786e8c11
YO
2579 } \
2580 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2581 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2582 TRIE_LIST_CUR( state )++; \
2583} STMT_END
07be1b83 2584
786e8c11 2585#define TRIE_LIST_NEW(state) STMT_START { \
d09f14bf 2586 Newx( trie->states[ state ].trans.list, \
786e8c11
YO
2587 4, reg_trie_trans_le ); \
2588 TRIE_LIST_CUR( state ) = 1; \
2589 TRIE_LIST_LEN( state ) = 4; \
2590} STMT_END
07be1b83 2591
786e8c11
YO
2592#define TRIE_HANDLE_WORD(state) STMT_START { \
2593 U16 dupe= trie->states[ state ].wordnum; \
2594 regnode * const noper_next = regnext( noper ); \
2595 \
786e8c11
YO
2596 DEBUG_r({ \
2597 /* store the word for dumping */ \
2598 SV* tmp; \
2599 if (OP(noper) != NOTHING) \
740cce10 2600 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 2601 else \
740cce10 2602 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 2603 av_push( trie_words, tmp ); \
786e8c11
YO
2604 }); \
2605 \
2606 curword++; \
2e64971a
DM
2607 trie->wordinfo[curword].prev = 0; \
2608 trie->wordinfo[curword].len = wordlen; \
2609 trie->wordinfo[curword].accept = state; \
786e8c11
YO
2610 \
2611 if ( noper_next < tail ) { \
2612 if (!trie->jump) \
538e84ed
KW
2613 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2614 sizeof(U16) ); \
7f69552c 2615 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
2616 if (!jumper) \
2617 jumper = noper_next; \
2618 if (!nextbranch) \
2619 nextbranch= regnext(cur); \
2620 } \
2621 \
2622 if ( dupe ) { \
2e64971a
DM
2623 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2624 /* chain, so that when the bits of chain are later */\
2625 /* linked together, the dups appear in the chain */\
2626 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2627 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
2628 } else { \
2629 /* we haven't inserted this word yet. */ \
2630 trie->states[ state ].wordnum = curword; \
2631 } \
2632} STMT_END
07be1b83 2633
3dab1dad 2634
786e8c11
YO
2635#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2636 ( ( base + charid >= ucharcount \
2637 && base + charid < ubound \
2638 && state == trie->trans[ base - ucharcount + charid ].check \
2639 && trie->trans[ base - ucharcount + charid ].next ) \
2640 ? trie->trans[ base - ucharcount + charid ].next \
2641 : ( state==1 ? special : 0 ) \
2642 )
3dab1dad 2643
8bcafbf4
YO
2644#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2645STMT_START { \
2646 TRIE_BITMAP_SET(trie, uvc); \
2647 /* store the folded codepoint */ \
2648 if ( folder ) \
2649 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2650 \
2651 if ( !UTF ) { \
2652 /* store first byte of utf8 representation of */ \
2653 /* variant codepoints */ \
2654 if (! UVCHR_IS_INVARIANT(uvc)) { \
2655 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2656 } \
2657 } \
2658} STMT_END
786e8c11
YO
2659#define MADE_TRIE 1
2660#define MADE_JUMP_TRIE 2
2661#define MADE_EXACT_TRIE 4
3dab1dad 2662
a3621e74 2663STATIC I32
538e84ed
KW
2664S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2665 regnode *first, regnode *last, regnode *tail,
2666 U32 word_count, U32 flags, U32 depth)
a3621e74
YO
2667{
2668 /* first pass, loop through and scan words */
2669 reg_trie_data *trie;
55eed653 2670 HV *widecharmap = NULL;
2b8b4781 2671 AV *revcharmap = newAV();
a3621e74 2672 regnode *cur;
a3621e74
YO
2673 STRLEN len = 0;
2674 UV uvc = 0;
2675 U16 curword = 0;
2676 U32 next_alloc = 0;
786e8c11
YO
2677 regnode *jumper = NULL;
2678 regnode *nextbranch = NULL;
7f69552c 2679 regnode *convert = NULL;
2e64971a 2680 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 2681 /* we just use folder as a flag in utf8 */
1e696034 2682 const U8 * folder = NULL;
a3621e74 2683
3a611511
YO
2684 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2685 * which stands for one trie structure, one hash, optionally followed
2686 * by two arrays */
2b8b4781 2687#ifdef DEBUGGING
3a611511 2688 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2b8b4781
NC
2689 AV *trie_words = NULL;
2690 /* along with revcharmap, this only used during construction but both are
2691 * useful during debugging so we store them in the struct when debugging.
8e11feef 2692 */
2b8b4781 2693#else
cf78de0b 2694 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 2695 STRLEN trie_charcount=0;
3dab1dad 2696#endif
2b8b4781 2697 SV *re_trie_maxbuff;
a3621e74 2698 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2699
2700 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
2701#ifndef DEBUGGING
2702 PERL_UNUSED_ARG(depth);
2703#endif
a3621e74 2704
1e696034 2705 switch (flags) {
3f2416ae 2706 case EXACT: case EXACT_REQ8: case EXACTL: break;
89829bb5 2707 case EXACTFAA:
0ea669f4 2708 case EXACTFUP:
a4525e78
KW
2709 case EXACTFU:
2710 case EXACTFLU8: folder = PL_fold_latin1; break;
1e696034 2711 case EXACTF: folder = PL_fold; break;
fab2782b 2712 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
2713 }
2714
c944940b 2715 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 2716 trie->refcount = 1;
3dab1dad 2717 trie->startstate = 1;
786e8c11 2718 trie->wordcount = word_count;
f8fc2ecf 2719 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 2720 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
3f2416ae 2721 if (flags == EXACT || flags == EXACT_REQ8 || flags == EXACTL)
c944940b 2722 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
2723 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2724 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2725
a3621e74 2726 DEBUG_r({
2b8b4781 2727 trie_words = newAV();
a3621e74 2728 });
a3621e74 2729
3b89859a 2730 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
316ebaf2 2731 assert(re_trie_maxbuff);
a3621e74 2732 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 2733 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 2734 }
df826430 2735 DEBUG_TRIE_COMPILE_r({
6ad9a8ab 2736 Perl_re_indentf( aTHX_
cb41e5d6
YO
2737 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2738 depth+1,
88f063b4 2739 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
538e84ed 2740 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
3dab1dad 2741 });
538e84ed 2742
7f69552c
YO
2743 /* Find the node we are going to overwrite */
2744 if ( first == startbranch && OP( last ) != BRANCH ) {
2745 /* whole branch chain */
2746 convert = first;
2747 } else {
2748 /* branch sub-chain */
2749 convert = NEXTOPER( first );
2750 }
538e84ed 2751
a3621e74
YO
2752 /* -- First loop and Setup --
2753
2754 We first traverse the branches and scan each word to determine if it
2755 contains widechars, and how many unique chars there are, this is
2756 important as we have to build a table with at least as many columns as we
2757 have unique chars.
2758
2759 We use an array of integers to represent the character codes 0..255
538e84ed
KW
2760 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2761 the native representation of the character value as the key and IV's for
2762 the coded index.
a3621e74
YO
2763
2764 *TODO* If we keep track of how many times each character is used we can
2765 remap the columns so that the table compression later on is more
3b753521 2766 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
2767 middle and the least common are on the outside. IMO this would be better
2768 than a most to least common mapping as theres a decent chance the most
2769 common letter will share a node with the least common, meaning the node
486ec47a 2770 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
2771 case is when we have the least common nodes twice.
2772
2773 */
2774
a3621e74 2775 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 2776 regnode *noper = NEXTOPER( cur );
944e05e3
YO
2777 const U8 *uc;
2778 const U8 *e;
bc031a7d 2779 int foldlen = 0;
07be1b83 2780 U32 wordlen = 0; /* required init */
bc031a7d
KW
2781 STRLEN minchars = 0;
2782 STRLEN maxchars = 0;
538e84ed
KW
2783 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2784 bitmap?*/
a3621e74 2785
3dab1dad 2786 if (OP(noper) == NOTHING) {
20ed8c88
YO
2787 /* skip past a NOTHING at the start of an alternation
2788 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2789 */
df826430 2790 regnode *noper_next= regnext(noper);
944e05e3
YO
2791 if (noper_next < tail)
2792 noper= noper_next;
2793 }
2794
f6b4b99d
KW
2795 if ( noper < tail
2796 && ( OP(noper) == flags
3f2416ae
KW
2797 || (flags == EXACT && OP(noper) == EXACT_REQ8)
2798 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
0ea669f4 2799 || OP(noper) == EXACTFUP))))
f6b4b99d 2800 {
944e05e3
YO
2801 uc= (U8*)STRING(noper);
2802 e= uc + STR_LEN(noper);
2803 } else {
2804 trie->minlen= 0;
2805 continue;
3dab1dad 2806 }
df826430 2807
944e05e3 2808
fab2782b 2809 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2810 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2811 regardless of encoding */
0ea669f4 2812 if (OP( noper ) == EXACTFUP) {
fab2782b 2813 /* false positives are ok, so just set this */
0dc4a61d 2814 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2815 }
2816 }
dca5fc4c 2817
bc031a7d
KW
2818 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2819 branch */
3dab1dad 2820 TRIE_CHARCOUNT(trie)++;
a3621e74 2821 TRIE_READ_CHAR;
645de4ce 2822
bc031a7d
KW
2823 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2824 * is in effect. Under /i, this character can match itself, or
2825 * anything that folds to it. If not under /i, it can match just
2826 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2827 * all fold to k, and all are single characters. But some folds
2828 * expand to more than one character, so for example LATIN SMALL
2829 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2830 * the string beginning at 'uc' is 'ffi', it could be matched by
2831 * three characters, or just by the one ligature character. (It
2832 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2833 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2834 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2835 * match.) The trie needs to know the minimum and maximum number
2836 * of characters that could match so that it can use size alone to
2837 * quickly reject many match attempts. The max is simple: it is
2838 * the number of folded characters in this branch (since a fold is
2839 * never shorter than what folds to it. */
2840
2841 maxchars++;
2842
2843 /* And the min is equal to the max if not under /i (indicated by
2844 * 'folder' being NULL), or there are no multi-character folds. If
2845 * there is a multi-character fold, the min is incremented just
2846 * once, for the character that folds to the sequence. Each
2847 * character in the sequence needs to be added to the list below of
2848 * characters in the trie, but we count only the first towards the
2849 * min number of characters needed. This is done through the
2850 * variable 'foldlen', which is returned by the macros that look
2851 * for these sequences as the number of bytes the sequence
2852 * occupies. Each time through the loop, we decrement 'foldlen' by
2853 * how many bytes the current char occupies. Only when it reaches
2854 * 0 do we increment 'minchars' or look for another multi-character
2855 * sequence. */
2856 if (folder == NULL) {
2857 minchars++;
2858 }
2859 else if (foldlen > 0) {
2860 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
645de4ce
KW
2861 }
2862 else {
bc031a7d
KW
2863 minchars++;
2864
2865 /* See if *uc is the beginning of a multi-character fold. If
2866 * so, we decrement the length remaining to look at, to account
2867 * for the current character this iteration. (We can use 'uc'
2868 * instead of the fold returned by TRIE_READ_CHAR because for
2869 * non-UTF, the latin1_safe macro is smart enough to account
2870 * for all the unfolded characters, and because for UTF, the
2871 * string will already have been folded earlier in the
2872 * compilation process */
2873 if (UTF) {
2874 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2875 foldlen -= UTF8SKIP(uc);
645de4ce
KW
2876 }
2877 }
bc031a7d
KW
2878 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2879 foldlen--;
2880 }
645de4ce 2881 }
bc031a7d
KW
2882
2883 /* The current character (and any potential folds) should be added
2884 * to the possible matching characters for this position in this
2885 * branch */
a3621e74 2886 if ( uvc < 256 ) {
fab2782b
YO
2887 if ( folder ) {
2888 U8 folded= folder[ (U8) uvc ];
2889 if ( !trie->charmap[ folded ] ) {
2890 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2891 TRIE_STORE_REVCHAR( folded );
2892 }
2893 }
a3621e74
YO
2894 if ( !trie->charmap[ uvc ] ) {
2895 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2896 TRIE_STORE_REVCHAR( uvc );
a3621e74 2897 }
02daf0ab 2898 if ( set_bit ) {
62012aee
KW
2899 /* store the codepoint in the bitmap, and its folded
2900 * equivalent. */
8bcafbf4 2901 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
02daf0ab
YO
2902 set_bit = 0; /* We've done our bit :-) */
2903 }
a3621e74 2904 } else {
bc031a7d
KW
2905
2906 /* XXX We could come up with the list of code points that fold
2907 * to this using PL_utf8_foldclosures, except not for
2908 * multi-char folds, as there may be multiple combinations
2909 * there that could work, which needs to wait until runtime to
2910 * resolve (The comment about LIGATURE FFI above is such an
2911 * example */
2912
a3621e74 2913 SV** svpp;
55eed653
NC
2914 if ( !widecharmap )
2915 widecharmap = newHV();
a3621e74 2916
55eed653 2917 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
2918
2919 if ( !svpp )
147e3846 2920 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
a3621e74
YO
2921
2922 if ( !SvTRUE( *svpp ) ) {
2923 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 2924 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
2925 }
2926 }
bc031a7d
KW
2927 } /* end loop through characters in this branch of the trie */
2928
2929 /* We take the min and max for this branch and combine to find the min
2930 * and max for all branches processed so far */
3dab1dad 2931 if( cur == first ) {
bc031a7d
KW
2932 trie->minlen = minchars;
2933 trie->maxlen = maxchars;
2934 } else if (minchars < trie->minlen) {
2935 trie->minlen = minchars;
2936 } else if (maxchars > trie->maxlen) {
2937 trie->maxlen = maxchars;
fab2782b 2938 }
a3621e74
YO
2939 } /* end first pass */
2940 DEBUG_TRIE_COMPILE_r(
6ad9a8ab 2941 Perl_re_indentf( aTHX_
cb41e5d6
YO
2942 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2943 depth+1,
55eed653 2944 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
2945 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2946 (int)trie->minlen, (int)trie->maxlen )
a3621e74 2947 );
a3621e74
YO
2948
2949 /*
2950 We now know what we are dealing with in terms of unique chars and
2951 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
2952 representation using a flat table will take. If it's over a reasonable
2953 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
2954 conservative but potentially much slower representation using an array
2955 of lists.
2956
2957 At the end we convert both representations into the same compressed
2958 form that will be used in regexec.c for matching with. The latter
2959 is a form that cannot be used to construct with but has memory
2960 properties similar to the list form and access properties similar
2961 to the table form making it both suitable for fast searches and
2962 small enough that its feasable to store for the duration of a program.
2963
2964 See the comment in the code where the compressed table is produced
2965 inplace from the flat tabe representation for an explanation of how
2966 the compression works.
2967
2968 */
2969
2970
2e64971a
DM
2971 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2972 prev_states[1] = 0;
2973
538e84ed
KW
2974 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2975 > SvIV(re_trie_maxbuff) )
2976 {
a3621e74
YO
2977 /*
2978 Second Pass -- Array Of Lists Representation
2979
2980 Each state will be represented by a list of charid:state records
2981 (reg_trie_trans_le) the first such element holds the CUR and LEN
2982 points of the allocated array. (See defines above).
2983
2984 We build the initial structure using the lists, and then convert
2985 it into the compressed table form which allows faster lookups
2986 (but cant be modified once converted).
a3621e74
YO
2987 */
2988
a3621e74
YO
2989 STRLEN transcount = 1;
2990
6ad9a8ab 2991 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
cb41e5d6 2992 depth+1));
686b73d4 2993
c944940b
JH
2994 trie->states = (reg_trie_state *)
2995 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2996 sizeof(reg_trie_state) );
a3621e74
YO
2997 TRIE_LIST_NEW(1);
2998 next_alloc = 2;
2999
3000 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3001
df826430 3002 regnode *noper = NEXTOPER( cur );
c445ea15
AL
3003 U32 state = 1; /* required init */
3004 U16 charid = 0; /* sanity init */
07be1b83 3005 U32 wordlen = 0; /* required init */
c445ea15 3006
df826430
YO
3007 if (OP(noper) == NOTHING) {
3008 regnode *noper_next= regnext(noper);
944e05e3
YO
3009 if (noper_next < tail)
3010 noper= noper_next;
df826430
YO
3011 }
3012
f6b4b99d
KW
3013 if ( noper < tail
3014 && ( OP(noper) == flags
3f2416ae
KW
3015 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3016 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
0ea669f4 3017 || OP(noper) == EXACTFUP))))
f6b4b99d 3018 {
944e05e3
YO
3019 const U8 *uc= (U8*)STRING(noper);
3020 const U8 *e= uc + STR_LEN(noper);
3021
786e8c11 3022 for ( ; uc < e ; uc += len ) {
c445ea15 3023
786e8c11 3024 TRIE_READ_CHAR;
c445ea15 3025
786e8c11
YO
3026 if ( uvc < 256 ) {
3027 charid = trie->charmap[ uvc ];
c445ea15 3028 } else {
538e84ed
KW
3029 SV** const svpp = hv_fetch( widecharmap,
3030 (char*)&uvc,
3031 sizeof( UV ),
3032 0);
786e8c11
YO
3033 if ( !svpp ) {
3034 charid = 0;
3035 } else {
3036 charid=(U16)SvIV( *svpp );
3037 }
c445ea15 3038 }
538e84ed
KW
3039 /* charid is now 0 if we dont know the char read, or
3040 * nonzero if we do */
786e8c11 3041 if ( charid ) {
a3621e74 3042
786e8c11
YO
3043 U16 check;
3044 U32 newstate = 0;
a3621e74 3045
786e8c11
YO
3046 charid--;
3047 if ( !trie->states[ state ].trans.list ) {
3048 TRIE_LIST_NEW( state );
c445ea15 3049 }
538e84ed
KW
3050 for ( check = 1;
3051 check <= TRIE_LIST_USED( state );
3052 check++ )
3053 {
3054 if ( TRIE_LIST_ITEM( state, check ).forid
3055 == charid )
3056 {
786e8c11
YO
3057 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3058 break;
3059 }
3060 }
3061 if ( ! newstate ) {
3062 newstate = next_alloc++;
2e64971a 3063 prev_states[newstate] = state;
786e8c11
YO
3064 TRIE_LIST_PUSH( state, charid, newstate );
3065 transcount++;
3066 }
3067 state = newstate;
3068 } else {
147e3846 3069 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
c445ea15 3070 }
a28509cc 3071 }
c445ea15 3072 }
3dab1dad 3073 TRIE_HANDLE_WORD(state);
a3621e74
YO
3074
3075 } /* end second pass */
3076
1e2e3d02 3077 /* next alloc is the NEXT state to be allocated */
538e84ed 3078 trie->statecount = next_alloc;
c944940b
JH
3079 trie->states = (reg_trie_state *)
3080 PerlMemShared_realloc( trie->states,
3081 next_alloc
3082 * sizeof(reg_trie_state) );
a3621e74 3083
3dab1dad 3084 /* and now dump it out before we compress it */
2b8b4781
NC
3085 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3086 revcharmap, next_alloc,
3087 depth+1)
1e2e3d02 3088 );
a3621e74 3089
c944940b
JH
3090 trie->trans = (reg_trie_trans *)
3091 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
3092 {
3093 U32 state;
a3621e74
YO
3094 U32 tp = 0;
3095 U32 zp = 0;
3096
3097
3098 for( state=1 ; state < next_alloc ; state ++ ) {
3099 U32 base=0;
3100
3101 /*
3102 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3103 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
a3621e74
YO
3104 );
3105 */
3106
3107 if (trie->states[state].trans.list) {
3108 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3109 U16 maxid=minid;
a28509cc 3110 U16 idx;
a3621e74
YO
3111
3112 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
3113 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3114 if ( forid < minid ) {
3115 minid=forid;
3116 } else if ( forid > maxid ) {
3117 maxid=forid;
3118 }
a3621e74
YO
3119 }
3120 if ( transcount < tp + maxid - minid + 1) {
3121 transcount *= 2;
c944940b
JH
3122 trie->trans = (reg_trie_trans *)
3123 PerlMemShared_realloc( trie->trans,
446bd890
NC
3124 transcount
3125 * sizeof(reg_trie_trans) );
538e84ed
KW
3126 Zero( trie->trans + (transcount / 2),
3127 transcount / 2,
3128 reg_trie_trans );
a3621e74
YO
3129 }
3130 base = trie->uniquecharcount + tp - minid;
3131 if ( maxid == minid ) {
3132 U32 set = 0;
3133 for ( ; zp < tp ; zp++ ) {
3134 if ( ! trie->trans[ zp ].next ) {
3135 base = trie->uniquecharcount + zp - minid;
538e84ed
KW
3136 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3137 1).newstate;
a3621e74
YO
3138 trie->trans[ zp ].check = state;
3139 set = 1;
3140 break;
3141 }
3142 }
3143 if ( !set ) {
538e84ed
KW
3144 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3145 1).newstate;
a3621e74
YO
3146 trie->trans[ tp ].check = state;
3147 tp++;
3148 zp = tp;
3149 }
3150 } else {
3151 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
538e84ed
KW
3152 const U32 tid = base
3153 - trie->uniquecharcount
3154 + TRIE_LIST_ITEM( state, idx ).forid;
3155 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3156 idx ).newstate;
a3621e74
YO
3157 trie->trans[ tid ].check = state;
3158 }
3159 tp += ( maxid - minid + 1 );
3160 }
3161 Safefree(trie->states[ state ].trans.list);
3162 }
3163 /*
3164 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3165 Perl_re_printf( aTHX_ " base: %d\n",base);
a3621e74
YO
3166 );
3167 */
3168 trie->states[ state ].trans.base=base;
3169 }
cc601c31 3170 trie->lasttrans = tp + 1;
a3621e74
YO
3171 }
3172 } else {
3173 /*
3174 Second Pass -- Flat Table Representation.
3175
b423522f
KW
3176 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3177 each. We know that we will need Charcount+1 trans at most to store
3178 the data (one row per char at worst case) So we preallocate both
3179 structures assuming worst case.
a3621e74
YO
3180
3181 We then construct the trie using only the .next slots of the entry
3182 structs.
3183
b423522f
KW
3184 We use the .check field of the first entry of the node temporarily
3185 to make compression both faster and easier by keeping track of how
3186 many non zero fields are in the node.
a3621e74
YO
3187
3188 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3189 transition.
3190
b423522f
KW
3191 There are two terms at use here: state as a TRIE_NODEIDX() which is
3192 a number representing the first entry of the node, and state as a
3193 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3194 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3195 if there are 2 entrys per node. eg:
a3621e74
YO
3196
3197 A B A B
3198 1. 2 4 1. 3 7
3199 2. 0 3 3. 0 5
3200 3. 0 0 5. 0 0
3201 4. 0 0 7. 0 0
3202
b423522f
KW
3203 The table is internally in the right hand, idx form. However as we
3204 also have to deal with the states array which is indexed by nodenum
3205 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
3206
3207 */
6ad9a8ab 3208 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
cb41e5d6 3209 depth+1));
3dab1dad 3210
c944940b
JH
3211 trie->trans = (reg_trie_trans *)
3212 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3213 * trie->uniquecharcount + 1,
3214 sizeof(reg_trie_trans) );
3215 trie->states = (reg_trie_state *)
3216 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3217 sizeof(reg_trie_state) );
a3621e74
YO
3218 next_alloc = trie->uniquecharcount + 1;
3219
3dab1dad 3220
a3621e74
YO
3221 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3222
df826430 3223 regnode *noper = NEXTOPER( cur );
a3621e74
YO
3224
3225 U32 state = 1; /* required init */
3226
3227 U16 charid = 0; /* sanity init */
3228 U32 accept_state = 0; /* sanity init */
a3621e74 3229
07be1b83 3230 U32 wordlen = 0; /* required init */
a3621e74 3231
df826430
YO
3232 if (OP(noper) == NOTHING) {
3233 regnode *noper_next= regnext(noper);
944e05e3
YO
3234 if (noper_next < tail)
3235 noper= noper_next;
df826430 3236 }
fab2782b 3237
f6b4b99d
KW
3238 if ( noper < tail
3239 && ( OP(noper) == flags
3f2416ae
KW
3240 || (flags == EXACT && OP(noper) == EXACT_REQ8)
3241 || (flags == EXACTFU && ( OP(noper) == EXACTFU_REQ8
0ea669f4 3242 || OP(noper) == EXACTFUP))))
f6b4b99d 3243 {
944e05e3
YO
3244 const U8 *uc= (U8*)STRING(noper);
3245 const U8 *e= uc + STR_LEN(noper);
3246
786e8c11 3247 for ( ; uc < e ; uc += len ) {
a3621e74 3248
786e8c11 3249 TRIE_READ_CHAR;
a3621e74 3250
786e8c11
YO
3251 if ( uvc < 256 ) {
3252 charid = trie->charmap[ uvc ];
3253 } else {
538e84ed
KW
3254 SV* const * const svpp = hv_fetch( widecharmap,
3255 (char*)&uvc,
3256 sizeof( UV ),
3257 0);
786e8c11 3258 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 3259 }
786e8c11
YO
3260 if ( charid ) {
3261 charid--;
3262 if ( !trie->trans[ state + charid ].next ) {
3263 trie->trans[ state + charid ].next = next_alloc;
3264 trie->trans[ state ].check++;
2e64971a
DM
3265 prev_states[TRIE_NODENUM(next_alloc)]
3266 = TRIE_NODENUM(state);
786e8c11
YO
3267 next_alloc += trie->uniquecharcount;
3268 }
3269 state = trie->trans[ state + charid ].next;
3270 } else {
147e3846 3271 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
786e8c11 3272 }
538e84ed
KW
3273 /* charid is now 0 if we dont know the char read, or
3274 * nonzero if we do */
a3621e74 3275 }
a3621e74 3276 }
3dab1dad
YO
3277 accept_state = TRIE_NODENUM( state );
3278 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
3279
3280 } /* end second pass */
3281
3dab1dad 3282 /* and now dump it out before we compress it */
2b8b4781
NC
3283 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3284 revcharmap,
3285 next_alloc, depth+1));
a3621e74 3286
a3621e74
YO
3287 {
3288 /*
3289 * Inplace compress the table.*
3290
3291 For sparse data sets the table constructed by the trie algorithm will
3292 be mostly 0/FAIL transitions or to put it another way mostly empty.
3293 (Note that leaf nodes will not contain any transitions.)
3294
3295 This algorithm compresses the tables by eliminating most such
3296 transitions, at the cost of a modest bit of extra work during lookup:
3297
3298 - Each states[] entry contains a .base field which indicates the
3299 index in the state[] array wheres its transition data is stored.
3300
3b753521 3301 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
3302
3303 - If .base is nonzero then charid is added to it to find an entry in
3304 the trans array.
3305
3306 -If trans[states[state].base+charid].check!=state then the
3307 transition is taken to be a 0/Fail transition. Thus if there are fail
3308 transitions at the front of the node then the .base offset will point
3309 somewhere inside the previous nodes data (or maybe even into a node
3310 even earlier), but the .check field determines if the transition is
3311 valid.
3312
786e8c11 3313 XXX - wrong maybe?
a3621e74 3314 The following process inplace converts the table to the compressed
3b753521 3315 table: We first do not compress the root node 1,and mark all its
a3621e74 3316 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
3317 allows us to do a DFA construction from the compressed table later,
3318 and ensures that any .base pointers we calculate later are greater
3319 than 0.
a3621e74
YO
3320
3321 - We set 'pos' to indicate the first entry of the second node.
3322
3323 - We then iterate over the columns of the node, finding the first and
3324 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3325 and set the .check pointers accordingly, and advance pos
3326 appropriately and repreat for the next node. Note that when we copy
3327 the next pointers we have to convert them from the original
3328 NODEIDX form to NODENUM form as the former is not valid post
3329 compression.
3330
3331 - If a node has no transitions used we mark its base as 0 and do not
3332 advance the pos pointer.
3333
3334 - If a node only has one transition we use a second pointer into the
3335 structure to fill in allocated fail transitions from other states.
3336 This pointer is independent of the main pointer and scans forward
3337 looking for null transitions that are allocated to a state. When it
3338 finds one it writes the single transition into the "hole". If the
786e8c11 3339 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
3340
3341 - Once compressed we can Renew/realloc the structures to release the
3342 excess space.
3343
3344 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3345 specifically Fig 3.47 and the associated pseudocode.
3346
3347 demq
3348 */
a3b680e6 3349 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 3350 U32 state, charid;
a3621e74 3351 U32 pos = 0, zp=0;
1e2e3d02 3352 trie->statecount = laststate;
a3621e74
YO
3353
3354 for ( state = 1 ; state < laststate ; state++ ) {
3355 U8 flag = 0;
a28509cc
AL
3356 const U32 stateidx = TRIE_NODEIDX( state );
3357 const U32 o_used = trie->trans[ stateidx ].check;
3358 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
3359 trie->trans[ stateidx ].check = 0;
3360
538e84ed
KW
3361 for ( charid = 0;
3362 used && charid < trie->uniquecharcount;
3363 charid++ )
3364 {
a3621e74
YO
3365 if ( flag || trie->trans[ stateidx + charid ].next ) {
3366 if ( trie->trans[ stateidx + charid ].next ) {
3367 if (o_used == 1) {
3368 for ( ; zp < pos ; zp++ ) {
3369 if ( ! trie->trans[ zp ].next ) {
3370 break;
3371 }
3372 }
538e84ed
KW
3373 trie->states[ state ].trans.base
3374 = zp
3375 + trie->uniquecharcount
3376 - charid ;
3377 trie->trans[ zp ].next
3378 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3379 + charid ].next );
a3621e74
YO
3380 trie->trans[ zp ].check = state;
3381 if ( ++zp > pos ) pos = zp;
3382 break;
3383 }
3384 used--;
3385 }
3386 if ( !flag ) {
3387 flag = 1;
538e84ed
KW
3388 trie->states[ state ].trans.base
3389 = pos + trie->uniquecharcount - charid ;
a3621e74 3390 }
538e84ed
KW
3391 trie->trans[ pos ].next
3392 = SAFE_TRIE_NODENUM(
3393 trie->trans[ stateidx + charid ].next );
a3621e74
YO
3394 trie->trans[ pos ].check = state;
3395 pos++;
3396 }
3397 }
3398 }
cc601c31 3399 trie->lasttrans = pos + 1;
c944940b
JH
3400 trie->states = (reg_trie_state *)
3401 PerlMemShared_realloc( trie->states, laststate
3402 * sizeof(reg_trie_state) );
a3621e74 3403 DEBUG_TRIE_COMPILE_MORE_r(
147e3846 3404 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
cb41e5d6 3405 depth+1,
538e84ed
KW
3406 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3407 + 1 ),
3408 (IV)next_alloc,
3409 (IV)pos,
3410 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
a3621e74
YO
3411 );
3412
3413 } /* end table compress */
3414 }
1e2e3d02