This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
include a trailing \0 in SVs holding trie info
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
c277df42 77#define REG_COMP_C
54df2634
NC
78#ifdef PERL_IN_XSUB_RE
79# include "re_comp.h"
afda64e8 80EXTERN_C const struct regexp_engine my_reg_engine;
54df2634
NC
81#else
82# include "regcomp.h"
83#endif
a687059c 84
f7e03a10 85#include "dquote_inline.h"
b992490d 86#include "invlist_inline.h"
1b0f46bf 87#include "unicode_constants.h"
04e98a4d 88
538e84ed
KW
89#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
90 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
f12c0118
KW
91#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
92 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 93#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 94#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 95
a687059c
LW
96#ifndef STATIC
97#define STATIC static
98#endif
99
3f910e62
YO
100/* this is a chain of data about sub patterns we are processing that
101 need to be handled separately/specially in study_chunk. Its so
102 we can simulate recursion without losing state. */
103struct scan_frame;
104typedef struct scan_frame {
105 regnode *last_regnode; /* last node to process in this frame */
106 regnode *next_regnode; /* next node to process when last is reached */
107 U32 prev_recursed_depth;
108 I32 stopparen; /* what stopparen do we use */
3f910e62
YO
109
110 struct scan_frame *this_prev_frame; /* this previous frame */
111 struct scan_frame *prev_frame; /* previous frame */
112 struct scan_frame *next_frame; /* next frame */
113} scan_frame;
114
f4ae5a27
KW
115/* Certain characters are output as a sequence with the first being a
116 * backslash. */
eb7486de 117#define isBACKSLASHED_PUNCT(c) strchr("-[]\\^", c)
f4ae5a27
KW
118
119
09b2b2e6 120struct RExC_state_t {
514a91f1
DM
121 U32 flags; /* RXf_* are we folding, multilining? */
122 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 123 char *precomp; /* uncompiled string. */
711b303b 124 char *precomp_end; /* pointer to end of uncompiled string. */
288b8c02 125 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf 126 regexp *rx; /* perl core regexp structure */
538e84ed
KW
127 regexp_internal *rxi; /* internal data for regexp object
128 pprivate field */
fac92740 129 char *start; /* Start of input for compile */
830247a4
IZ
130 char *end; /* End of input for compile */
131 char *parse; /* Input-scan pointer. */
51684e3c
KW
132 char *copy_start; /* start of copy of input within
133 constructed parse string */
cc16d262
KW
134 char *save_copy_start; /* Provides one level of saving
135 and restoring 'copy_start' */
51684e3c
KW
136 char *copy_start_in_input; /* Position in input string
137 corresponding to copy_start */
ea3daa5d 138 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 139 regnode *emit_start; /* Start of emitted-code area */
f55b7b26 140 regnode_offset emit; /* Code-emit pointer */
830247a4
IZ
141 I32 naughty; /* How bad is this pattern? */
142 I32 sawback; /* Did we see \1, ...? */
143 U32 seen;
88f063b4
KW
144 SSize_t size; /* Number of regnode equivalents in
145 pattern */
7c932d07
KW
146
147 /* position beyond 'precomp' of the warning message furthest away from
148 * 'precomp'. During the parse, no warnings are raised for any problems
149 * earlier in the parse than this position. This works if warnings are
150 * raised the first time a given spot is parsed, and if only one
151 * independent warning is raised for any given spot */
152 Size_t latest_warn_offset;
153
154 I32 npar; /* Capture buffer count so far in the
155 parse, (OPEN) plus one. ("par" 0 is
156 the whole pattern)*/
157 I32 total_par; /* During initial parse, is either 0,
158 or -1; the latter indicating a
159 reparse is needed. After that pass,
160 it is what 'npar' became after the
161 pass. Hence, it being > 0 indicates
162 we are in a reparse situation */
538e84ed
KW
163 I32 nestroot; /* root parens we are in - used by
164 accept */
830247a4 165 I32 seen_zerolen;
f55b7b26
KW
166 regnode_offset *open_parens; /* offsets to open parens */
167 regnode_offset *close_parens; /* offsets to close parens */
0275405c 168 I32 parens_buf_size; /* #slots malloced open/close_parens */
d5a00e4a 169 regnode *end_op; /* END node in program */
02daf0ab
YO
170 I32 utf8; /* whether the pattern is utf8 or not */
171 I32 orig_utf8; /* whether the pattern was originally in utf8 */
172 /* XXX use this for future optimisation of case
173 * where pattern must be upgraded to utf8. */
e40e74fe
KW
174 I32 uni_semantics; /* If a d charset modifier should use unicode
175 rules, even if the pattern is not in
176 utf8 */
81714fb9 177 HV *paren_names; /* Paren names */
538e84ed 178
40d049e4 179 regnode **recurse; /* Recurse regops */
88f063b4 180 I32 recurse_count; /* Number of recurse regops we have generated */
4286711a 181 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
538e84ed 182 through */
09a65838 183 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 184 I32 in_lookbehind;
4624b182 185 I32 contains_locale;
bb3f3ed2 186 I32 override_recoding;
b6d67071
KW
187#ifdef EBCDIC
188 I32 recode_x_to_native;
189#endif
9d53c457 190 I32 in_multi_char_class;
1acab4c5 191 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
68e2671b 192 within pattern */
b1603ef8 193 int code_index; /* next code_blocks[] slot */
ee273784 194 SSize_t maxlen; /* mininum possible number of chars in string to match */
3f910e62
YO
195 scan_frame *frame_head;
196 scan_frame *frame_last;
197 U32 frame_count;
7eec73eb 198 AV *warn_text;
945961fd 199 HV *unlexed_names;
dc6d7f5c 200#ifdef ADD_TO_REGEXEC
830247a4
IZ
201 char *starttry; /* -Dr: where regtry was called. */
202#define RExC_starttry (pRExC_state->starttry)
203#endif
d24ca0c5 204 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 205#ifdef DEBUGGING
be8e71aa 206 const char *lastparse;
3dab1dad 207 I32 lastnum;
1f1031fe 208 AV *paren_name_list; /* idx -> name */
d9a72fcc 209 U32 study_chunk_recursed_count;
c9f0d54c
YO
210 SV *mysv1;
211 SV *mysv2;
88f063b4 212
3dab1dad
YO
213#define RExC_lastparse (pRExC_state->lastparse)
214#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 215#define RExC_paren_name_list (pRExC_state->paren_name_list)
d9a72fcc 216#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
c9f0d54c
YO
217#define RExC_mysv (pRExC_state->mysv1)
218#define RExC_mysv1 (pRExC_state->mysv1)
219#define RExC_mysv2 (pRExC_state->mysv2)
220
3dab1dad 221#endif
bd0b62a6 222 bool seen_d_op;
911bd04e 223 bool strict;
da7cf1cc 224 bool study_started;
034602eb 225 bool in_script_run;
7c932d07 226 bool use_BRANCHJ;
09b2b2e6 227};
830247a4 228
e2509266 229#define RExC_flags (pRExC_state->flags)
514a91f1 230#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 231#define RExC_precomp (pRExC_state->precomp)
51684e3c
KW
232#define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
233#define RExC_copy_start_in_constructed (pRExC_state->copy_start)
cc16d262 234#define RExC_save_copy_start_in_constructed (pRExC_state->save_copy_start)
711b303b 235#define RExC_precomp_end (pRExC_state->precomp_end)
288b8c02 236#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 237#define RExC_rx (pRExC_state->rx)
f8fc2ecf 238#define RExC_rxi (pRExC_state->rxi)
fac92740 239#define RExC_start (pRExC_state->start)
830247a4
IZ
240#define RExC_end (pRExC_state->end)
241#define RExC_parse (pRExC_state->parse)
7c932d07 242#define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
830247a4 243#define RExC_whilem_seen (pRExC_state->whilem_seen)
bd0b62a6
KW
244#define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
245 under /d from /u ? */
512c0f5a 246
512c0f5a 247
7122b237 248#ifdef RE_TRACK_PATTERN_OFFSETS
44f36cd3 249# define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
538e84ed 250 others */
7122b237 251#endif
830247a4 252#define RExC_emit (pRExC_state->emit)
fac92740 253#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
254#define RExC_sawback (pRExC_state->sawback)
255#define RExC_seen (pRExC_state->seen)
256#define RExC_size (pRExC_state->size)
ee273784 257#define RExC_maxlen (pRExC_state->maxlen)
830247a4 258#define RExC_npar (pRExC_state->npar)
1533d1d2 259#define RExC_total_parens (pRExC_state->total_par)
0275405c 260#define RExC_parens_buf_size (pRExC_state->parens_buf_size)
e2e6a0f1 261#define RExC_nestroot (pRExC_state->nestroot)
830247a4 262#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 263#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 264#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 265#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
266#define RExC_open_parens (pRExC_state->open_parens)
267#define RExC_close_parens (pRExC_state->close_parens)
5bd2d46e 268#define RExC_end_op (pRExC_state->end_op)
81714fb9 269#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
270#define RExC_recurse (pRExC_state->recurse)
271#define RExC_recurse_count (pRExC_state->recurse_count)
09a65838 272#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
538e84ed
KW
273#define RExC_study_chunk_recursed_bytes \
274 (pRExC_state->study_chunk_recursed_bytes)
b57e4118 275#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 276#define RExC_contains_locale (pRExC_state->contains_locale)
b6d67071
KW
277#ifdef EBCDIC
278# define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
279#endif
9d53c457 280#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
3f910e62
YO
281#define RExC_frame_head (pRExC_state->frame_head)
282#define RExC_frame_last (pRExC_state->frame_last)
283#define RExC_frame_count (pRExC_state->frame_count)
67cdf558 284#define RExC_strict (pRExC_state->strict)
da7cf1cc 285#define RExC_study_started (pRExC_state->study_started)
7eec73eb 286#define RExC_warn_text (pRExC_state->warn_text)
034602eb 287#define RExC_in_script_run (pRExC_state->in_script_run)
7c932d07 288#define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
945961fd 289#define RExC_unlexed_names (pRExC_state->unlexed_names)
830247a4 290
99807a43
HS
291/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
292 * a flag to disable back-off on the fixed/floating substrings - if it's
293 * a high complexity pattern we assume the benefit of avoiding a full match
294 * is worth the cost of checking for the substrings even if they rarely help.
295 */
296#define RExC_naughty (pRExC_state->naughty)
297#define TOO_NAUGHTY (10)
298#define MARK_NAUGHTY(add) \
299 if (RExC_naughty < TOO_NAUGHTY) \
300 RExC_naughty += (add)
301#define MARK_NAUGHTY_EXP(exp, add) \
302 if (RExC_naughty < TOO_NAUGHTY) \
303 RExC_naughty += RExC_naughty / (exp) + (add)
cde0cee5 304
a687059c
LW
305#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
306#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
412f55bb 307 ((*s) == '{' && regcurly(s)))
a687059c
LW
308
309/*
310 * Flags to be passed up and down.
311 */
a687059c 312#define WORST 0 /* Worst case. */
c4ead770
KW
313#define HASWIDTH 0x01 /* Known to not match null strings, could match
314 non-null ones. */
fda99bee 315
e64f369d 316/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
317 * character. (There needs to be a case: in the switch statement in regexec.c
318 * for any node marked SIMPLE.) Note that this is not the same thing as
319 * REGNODE_SIMPLE */
fda99bee 320#define SIMPLE 0x02
e64f369d 321#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
322#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
323#define TRYAGAIN 0x10 /* Weeded out a declaration. */
c36361b2
KW
324#define RESTART_PARSE 0x20 /* Need to redo the parse */
325#define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
326 calcuate sizes as UTF-8 */
a687059c 327
3dab1dad
YO
328#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
329
07be1b83
YO
330/* whether trie related optimizations are enabled */
331#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
332#define TRIE_STUDY_OPT
786e8c11 333#define FULL_TRIE_STUDY
07be1b83
YO
334#define TRIE_STCLASS
335#endif
1de06328
YO
336
337
40d049e4
YO
338
339#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
340#define PBITVAL(paren) (1 << ((paren) & 7))
341#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
342#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
343#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
344
82a6ada4 345#define REQUIRE_UTF8(flagp) STMT_START { \
8d9c2815 346 if (!UTF) { \
c36361b2 347 *flagp = RESTART_PARSE|NEED_UTF8; \
f55b7b26 348 return 0; \
8d9c2815 349 } \
82a6ada4 350 } STMT_END
40d049e4 351
d63d75f5 352/* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
f6e8f31e
KW
353 * a flag that indicates we need to override /d with /u as a result of
354 * something in the pattern. It should only be used in regards to calling
355 * set_regex_charset() or get_regex_charse() */
512c0f5a
KW
356#define REQUIRE_UNI_RULES(flagp, restart_retval) \
357 STMT_START { \
358 if (DEPENDS_SEMANTICS) { \
512c0f5a
KW
359 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
360 RExC_uni_semantics = 1; \
b76e1c04 361 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \
bd0b62a6
KW
362 /* No need to restart the parse if we haven't seen \
363 * anything that differs between /u and /d, and no need \
364 * to restart immediately if we're going to reparse \
365 * anyway to count parens */ \
c36361b2 366 *flagp |= RESTART_PARSE; \
512c0f5a 367 return restart_retval; \
d1be211c 368 } \
512c0f5a
KW
369 } \
370 } STMT_END
371
7c932d07
KW
372#define REQUIRE_BRANCHJ(flagp, restart_retval) \
373 STMT_START { \
374 RExC_use_BRANCHJ = 1; \
b76e1c04 375 if (LIKELY(! IN_PARENS_PASS)) { \
523a2924
KW
376 /* No need to restart the parse immediately if we're \
377 * going to reparse anyway to count parens */ \
378 *flagp |= RESTART_PARSE; \
379 return restart_retval; \
380 } \
7c932d07
KW
381 } STMT_END
382
b76e1c04
KW
383/* Until we have completed the parse, we leave RExC_total_parens at 0 or
384 * less. After that, it must always be positive, because the whole re is
385 * considered to be surrounded by virtual parens. Setting it to negative
386 * indicates there is some construct that needs to know the actual number of
387 * parens to be properly handled. And that means an extra pass will be
388 * required after we've counted them all */
389#define ALL_PARENS_COUNTED (RExC_total_parens > 0)
7c932d07 390#define REQUIRE_PARENS_PASS \
b76e1c04
KW
391 STMT_START { /* No-op if have completed a pass */ \
392 if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \
7c932d07 393 } STMT_END
b76e1c04
KW
394#define IN_PARENS_PASS (RExC_total_parens < 0)
395
7c932d07 396
f358228e
KW
397/* This is used to return failure (zero) early from the calling function if
398 * various flags in 'flags' are set. Two flags always cause a return:
399 * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any
400 * additional flags that should cause a return; 0 if none. If the return will
401 * be done, '*flagp' is first set to be all of the flags that caused the
402 * return. */
403#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
fd7fc6ae 404 STMT_START { \
c36361b2
KW
405 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
406 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
f358228e 407 return 0; \
fd7fc6ae
YO
408 } \
409 } STMT_END
410
c36361b2 411#define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
fd7fc6ae 412
5d555bb8 413#define RETURN_FAIL_ON_RESTART(flags,flagp) \
f358228e 414 RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
5d555bb8 415#define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
f358228e 416 if (MUST_RESTART(*(flagp))) return 0
fd7fc6ae 417
f19b1a63
KW
418/* This converts the named class defined in regcomp.h to its equivalent class
419 * number defined in handy.h. */
420#define namedclass_to_classnum(class) ((int) ((class) / 2))
421#define classnum_to_namedclass(classnum) ((classnum) * 2)
422
de92f5e6
KW
423#define _invlist_union_complement_2nd(a, b, output) \
424 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
425#define _invlist_intersection_complement_2nd(a, b, output) \
426 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
427
1de06328
YO
428/* About scan_data_t.
429
430 During optimisation we recurse through the regexp program performing
431 various inplace (keyhole style) optimisations. In addition study_chunk
432 and scan_commit populate this data structure with information about
538e84ed 433 what strings MUST appear in the pattern. We look for the longest
3b753521 434 string that must appear at a fixed location, and we look for the
1de06328
YO
435 longest string that may appear at a floating location. So for instance
436 in the pattern:
538e84ed 437
1de06328 438 /FOO[xX]A.*B[xX]BAR/
538e84ed 439
1de06328
YO
440 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
441 strings (because they follow a .* construct). study_chunk will identify
442 both FOO and BAR as being the longest fixed and floating strings respectively.
538e84ed 443
1de06328 444 The strings can be composites, for instance
538e84ed 445
1de06328 446 /(f)(o)(o)/
538e84ed 447
1de06328 448 will result in a composite fixed substring 'foo'.
538e84ed 449
1de06328 450 For each string some basic information is maintained:
538e84ed 451
6ea77169 452 - min_offset
1de06328
YO
453 This is the position the string must appear at, or not before.
454 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
455 characters must match before the string we are searching for.
456 Likewise when combined with minlenp and the length of the string it
538e84ed 457 tells us how many characters must appear after the string we have
1de06328 458 found.
538e84ed 459
1de06328
YO
460 - max_offset
461 Only used for floating strings. This is the rightmost point that
ea3daa5d 462 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328 463 string can occur infinitely far to the right.
6e12f832 464 For fixed strings, it is equal to min_offset.
538e84ed 465
1de06328 466 - minlenp
2d608413
KW
467 A pointer to the minimum number of characters of the pattern that the
468 string was found inside. This is important as in the case of positive
538e84ed 469 lookahead or positive lookbehind we can have multiple patterns
1de06328 470 involved. Consider
538e84ed 471
1de06328 472 /(?=FOO).*F/
538e84ed 473
1de06328
YO
474 The minimum length of the pattern overall is 3, the minimum length
475 of the lookahead part is 3, but the minimum length of the part that
538e84ed 476 will actually match is 1. So 'FOO's minimum length is 3, but the
1de06328 477 minimum length for the F is 1. This is important as the minimum length
538e84ed 478 is used to determine offsets in front of and behind the string being
1de06328 479 looked for. Since strings can be composites this is the length of the
486ec47a 480 pattern at the time it was committed with a scan_commit. Note that
1de06328 481 the length is calculated by study_chunk, so that the minimum lengths
538e84ed 482 are not known until the full pattern has been compiled, thus the
1de06328 483 pointer to the value.
538e84ed 484
1de06328 485 - lookbehind
538e84ed 486
1de06328 487 In the case of lookbehind the string being searched for can be
538e84ed 488 offset past the start point of the final matching string.
1de06328
YO
489 If this value was just blithely removed from the min_offset it would
490 invalidate some of the calculations for how many chars must match
491 before or after (as they are derived from min_offset and minlen and
538e84ed 492 the length of the string being searched for).
1de06328
YO
493 When the final pattern is compiled and the data is moved from the
494 scan_data_t structure into the regexp structure the information
538e84ed
KW
495 about lookbehind is factored in, with the information that would
496 have been lost precalculated in the end_shift field for the
1de06328
YO
497 associated string.
498
499 The fields pos_min and pos_delta are used to store the minimum offset
538e84ed 500 and the delta to the maximum offset at the current point in the pattern.
1de06328
YO
501
502*/
2c2d71f5 503
cb6aef8b
DM
504struct scan_data_substrs {
505 SV *str; /* longest substring found in pattern */
506 SSize_t min_offset; /* earliest point in string it can appear */
507 SSize_t max_offset; /* latest point in string it can appear */
508 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
509 SSize_t lookbehind; /* is the pos of the string modified by LB */
11683ecb 510 I32 flags; /* per substring SF_* and SCF_* flags */
cb6aef8b
DM
511};
512
2c2d71f5 513typedef struct scan_data_t {
1de06328
YO
514 /*I32 len_min; unused */
515 /*I32 len_delta; unused */
49f55535 516 SSize_t pos_min;
ea3daa5d 517 SSize_t pos_delta;
2c2d71f5 518 SV *last_found;
ea3daa5d 519 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 520 SSize_t last_start_min;
ea3daa5d 521 SSize_t last_start_max;
2099df82
DM
522 U8 cur_is_floating; /* whether the last_* values should be set as
523 * the next fixed (0) or floating (1)
524 * substring */
6ea77169 525
2099df82
DM
526 /* [0] is longest fixed substring so far, [1] is longest float so far */
527 struct scan_data_substrs substrs[2];
6ea77169 528
11683ecb 529 I32 flags; /* common SF_* and SCF_* flags */
2c2d71f5 530 I32 whilem_c;
49f55535 531 SSize_t *last_closep;
b8f7bb16 532 regnode_ssc *start_class;
2c2d71f5
JH
533} scan_data_t;
534
a687059c 535/*
e50aee73 536 * Forward declarations for pregcomp()'s friends.
a687059c 537 */
a0d0e21e 538
6ea77169 539static const scan_data_t zero_scan_data = {
a2ca2017 540 0, 0, NULL, 0, 0, 0, 0,
6ea77169 541 {
11683ecb
DM
542 { NULL, 0, 0, 0, 0, 0 },
543 { NULL, 0, 0, 0, 0, 0 },
6ea77169
DM
544 },
545 0, 0, NULL, NULL
546};
c277df42 547
11683ecb
DM
548/* study flags */
549
07be1b83
YO
550#define SF_BEFORE_SEOL 0x0001
551#define SF_BEFORE_MEOL 0x0002
11683ecb 552#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
c277df42 553
07be1b83
YO
554#define SF_IS_INF 0x0040
555#define SF_HAS_PAR 0x0080
556#define SF_IN_PAR 0x0100
557#define SF_HAS_EVAL 0x0200
bc604ad8
KW
558
559
560/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
561 * longest substring in the pattern. When it is not set the optimiser keeps
562 * track of position, but does not keep track of the actual strings seen,
563 *
564 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
565 * /foo/i will not.
566 *
567 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
568 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
569 * turned off because of the alternation (BRANCH). */
07be1b83 570#define SCF_DO_SUBSTR 0x0400
bc604ad8 571
653099ff
GS
572#define SCF_DO_STCLASS_AND 0x0800
573#define SCF_DO_STCLASS_OR 0x1000
574#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 575#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 576
786e8c11 577#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
538e84ed 578#define SCF_SEEN_ACCEPT 0x8000
688e0391 579#define SCF_TRIE_DOING_RESTUDY 0x10000
4286711a
YO
580#define SCF_IN_DEFINE 0x20000
581
582
583
07be1b83 584
43fead97 585#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
586
587/* The enums for all these are ordered so things work out correctly */
a62b1201 588#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
538e84ed
KW
589#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
590 == REGEX_DEPENDS_CHARSET)
00b27cfc 591#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
538e84ed
KW
592#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
593 >= REGEX_UNICODE_CHARSET)
594#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
595 == REGEX_ASCII_RESTRICTED_CHARSET)
596#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
597 >= REGEX_ASCII_RESTRICTED_CHARSET)
598#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
599 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 600
43fead97 601#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 602
f2c2a6ab
KW
603/* For programs that want to be strictly Unicode compatible by dying if any
604 * attempt is made to match a non-Unicode code point against a Unicode
605 * property. */
606#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
607
93733859 608#define OOB_NAMEDCLASS -1
b8c5462f 609
8e661ac5
KW
610/* There is no code point that is out-of-bounds, so this is problematic. But
611 * its only current use is to initialize a variable that is always set before
612 * looked at. */
613#define OOB_UNICODE 0xDEADBEEF
614
a0ed51b3 615#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
a0ed51b3 616
8615cb43 617
b45f050a
JF
618/* length of regex to show in messages that don't mark a position within */
619#define RegexLengthToShowInErrorMessages 127
620
621/*
622 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
623 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
624 * op/pragma/warn/regcomp.
625 */
7253e4e3
RK
626#define MARKER1 "<-- HERE" /* marker as it appears in the description */
627#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 628
538e84ed 629#define REPORT_LOCATION " in regex; marked by " MARKER1 \
147e3846 630 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
b45f050a 631
285b5ca0
KW
632/* The code in this file in places uses one level of recursion with parsing
633 * rebased to an alternate string constructed by us in memory. This can take
634 * the form of something that is completely different from the input, or
635 * something that uses the input as part of the alternate. In the first case,
636 * there should be no possibility of an error, as we are in complete control of
51684e3c
KW
637 * the alternate string. But in the second case we don't completely control
638 * the input portion, so there may be errors in that. Here's an example:
285b5ca0
KW
639 * /[abc\x{DF}def]/ui
640 * is handled specially because \x{df} folds to a sequence of more than one
51684e3c 641 * character: 'ss'. What is done is to create and parse an alternate string,
285b5ca0
KW
642 * which looks like this:
643 * /(?:\x{DF}|[abc\x{DF}def])/ui
644 * where it uses the input unchanged in the middle of something it constructs,
645 * which is a branch for the DF outside the character class, and clustering
646 * parens around the whole thing. (It knows enough to skip the DF inside the
647 * class while in this substitute parse.) 'abc' and 'def' may have errors that
648 * need to be reported. The general situation looks like this:
649 *
51684e3c 650 * |<------- identical ------>|
285b5ca0 651 * sI tI xI eI
51684e3c 652 * Input: ---------------------------------------------------------------
285b5ca0
KW
653 * Constructed: ---------------------------------------------------
654 * sC tC xC eC EC
51684e3c 655 * |<------- identical ------>|
285b5ca0 656 *
51684e3c
KW
657 * sI..eI is the portion of the input pattern we are concerned with here.
658 * sC..EC is the constructed substitute parse string.
659 * sC..tC is constructed by us
660 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
661 * In the diagram, these are vertically aligned.
662 * eC..EC is also constructed by us.
663 * xC is the position in the substitute parse string where we found a
664 * problem.
665 * xI is the position in the original pattern corresponding to xC.
285b5ca0 666 *
51684e3c
KW
667 * We want to display a message showing the real input string. Thus we need to
668 * translate from xC to xI. We know that xC >= tC, since the portion of the
669 * string sC..tC has been constructed by us, and so shouldn't have errors. We
670 * get:
671 * xI = tI + (xC - tC)
285b5ca0 672 *
51684e3c
KW
673 * When the substitute parse is constructed, the code needs to set:
674 * RExC_start (sC)
675 * RExC_end (eC)
676 * RExC_copy_start_in_input (tI)
677 * RExC_copy_start_in_constructed (tC)
678 * and restore them when done.
285b5ca0 679 *
51684e3c
KW
680 * During normal processing of the input pattern, both
681 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
682 * sI, so that xC equals xI.
285b5ca0
KW
683 */
684
51684e3c
KW
685#define sI RExC_precomp
686#define eI RExC_precomp_end
687#define sC RExC_start
688#define eC RExC_end
689#define tI RExC_copy_start_in_input
690#define tC RExC_copy_start_in_constructed
691#define xI(xC) (tI + (xC - tC))
692#define xI_offset(xC) (xI(xC) - sI)
285b5ca0
KW
693
694#define REPORT_LOCATION_ARGS(xC) \
695 UTF8fARG(UTF, \
51684e3c 696 (xI(xC) > eI) /* Don't run off end */ \
232b691f 697 ? eI - sI /* Length before the <--HERE */ \
bc929b61
KW
698 : ((xI_offset(xC) >= 0) \
699 ? xI_offset(xC) \
700 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
701 IVdf " trying to output message for " \
702 " pattern %.*s", \
51684e3c 703 __FILE__, __LINE__, (IV) xI_offset(xC), \
bc929b61 704 ((int) (eC - sC)), sC), 0)), \
51684e3c 705 sI), /* The input pattern printed up to the <--HERE */ \
285b5ca0 706 UTF8fARG(UTF, \
51684e3c
KW
707 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \
708 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */
c1d900c3 709
8a6d8ec6
HS
710/* Used to point after bad bytes for an error message, but avoid skipping
711 * past a nul byte. */
e3304382 712#define SKIP_IF_CHAR(s, e) (!*(s) ? 0 : UTF ? UTF8_SAFE_SKIP(s, e) : 1)
8a6d8ec6 713
ed68fe11
KW
714/* Set up to clean up after our imminent demise */
715#define PREPARE_TO_DIE \
716 STMT_START { \
717 if (RExC_rx_sv) \
718 SAVEFREESV(RExC_rx_sv); \
7c932d07
KW
719 if (RExC_open_parens) \
720 SAVEFREEPV(RExC_open_parens); \
721 if (RExC_close_parens) \
722 SAVEFREEPV(RExC_close_parens); \
ed68fe11
KW
723 } STMT_END
724
b45f050a
JF
725/*
726 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
727 * arg. Show regex, up to a maximum length. If it's too long, chop and add
728 * "...".
729 */
58e23c8d 730#define _FAIL(code) STMT_START { \
bfed75c6 731 const char *ellipses = ""; \
88f063b4 732 IV len = RExC_precomp_end - RExC_precomp; \
ccb2c380 733 \
ed68fe11 734 PREPARE_TO_DIE; \
ccb2c380
MP
735 if (len > RegexLengthToShowInErrorMessages) { \
736 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
737 len = RegexLengthToShowInErrorMessages - 10; \
738 ellipses = "..."; \
739 } \
58e23c8d 740 code; \
ccb2c380 741} STMT_END
8615cb43 742
58e23c8d 743#define FAIL(msg) _FAIL( \
147e3846 744 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
c1d900c3 745 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
746
747#define FAIL2(msg,arg) _FAIL( \
147e3846 748 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
c1d900c3 749 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 750
b45f050a 751/*
b45f050a
JF
752 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
753 */
ccb2c380 754#define Simple_vFAIL(m) STMT_START { \
ccb2c380 755 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
d528642a 756 m, REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 757} STMT_END
b45f050a
JF
758
759/*
760 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
761 */
ccb2c380 762#define vFAIL(m) STMT_START { \
ed68fe11 763 PREPARE_TO_DIE; \
ccb2c380
MP
764 Simple_vFAIL(m); \
765} STMT_END
b45f050a
JF
766
767/*
768 * Like Simple_vFAIL(), but accepts two arguments.
769 */
ccb2c380 770#define Simple_vFAIL2(m,a1) STMT_START { \
d528642a
KW
771 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
772 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 773} STMT_END
b45f050a
JF
774
775/*
776 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
777 */
ccb2c380 778#define vFAIL2(m,a1) STMT_START { \
ed68fe11 779 PREPARE_TO_DIE; \
ccb2c380
MP
780 Simple_vFAIL2(m, a1); \
781} STMT_END
b45f050a
JF
782
783
784/*
785 * Like Simple_vFAIL(), but accepts three arguments.
786 */
ccb2c380 787#define Simple_vFAIL3(m, a1, a2) STMT_START { \
c1d900c3 788 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 789 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 790} STMT_END
b45f050a
JF
791
792/*
793 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
794 */
ccb2c380 795#define vFAIL3(m,a1,a2) STMT_START { \
ed68fe11 796 PREPARE_TO_DIE; \
ccb2c380
MP
797 Simple_vFAIL3(m, a1, a2); \
798} STMT_END
b45f050a
JF
799
800/*
801 * Like Simple_vFAIL(), but accepts four arguments.
802 */
ccb2c380 803#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
d528642a
KW
804 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
805 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 806} STMT_END
b45f050a 807
95db3ffa 808#define vFAIL4(m,a1,a2,a3) STMT_START { \
ed68fe11 809 PREPARE_TO_DIE; \
95db3ffa
KW
810 Simple_vFAIL4(m, a1, a2, a3); \
811} STMT_END
812
946095af 813/* A specialized version of vFAIL2 that works with UTF8f */
d528642a 814#define vFAIL2utf8f(m, a1) STMT_START { \
ed68fe11 815 PREPARE_TO_DIE; \
d528642a
KW
816 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
817 REPORT_LOCATION_ARGS(RExC_parse)); \
946095af
BF
818} STMT_END
819
3ba22297 820#define vFAIL3utf8f(m, a1, a2) STMT_START { \
ed68fe11 821 PREPARE_TO_DIE; \
3ba22297 822 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 823 REPORT_LOCATION_ARGS(RExC_parse)); \
3ba22297
KW
824} STMT_END
825
d62e749e 826/* Setting this to NULL is a signal to not output warnings */
cc16d262
KW
827#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE \
828 STMT_START { \
829 RExC_save_copy_start_in_constructed = RExC_copy_start_in_constructed;\
830 RExC_copy_start_in_constructed = NULL; \
831 } STMT_END
832#define RESTORE_WARNINGS \
833 RExC_copy_start_in_constructed = RExC_save_copy_start_in_constructed
d62e749e 834
7c932d07
KW
835/* Since a warning can be generated multiple times as the input is reparsed, we
836 * output it the first time we come to that point in the parse, but suppress it
837 * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not
838 * generate any warnings */
55bcf857 839#define TO_OUTPUT_WARNINGS(loc) \
7c932d07
KW
840 ( RExC_copy_start_in_constructed \
841 && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
55bcf857 842
7c932d07
KW
843/* After we've emitted a warning, we save the position in the input so we don't
844 * output it again */
845#define UPDATE_WARNINGS_LOC(loc) \
846 STMT_START { \
847 if (TO_OUTPUT_WARNINGS(loc)) { \
848 RExC_latest_warn_offset = (xI(loc)) - RExC_precomp; \
849 } \
850 } STMT_END
77ccba3c 851
086f21bf 852/* 'warns' is the output of the packWARNx macro used in 'code' */
f8220fe2
KW
853#define _WARN_HELPER(loc, warns, code) \
854 STMT_START { \
d62e749e
KW
855 if (! RExC_copy_start_in_constructed) { \
856 Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \
857 " expected at '%s'", \
858 __FILE__, __LINE__, loc); \
859 } \
55bcf857 860 if (TO_OUTPUT_WARNINGS(loc)) { \
086f21bf
KW
861 if (ckDEAD(warns)) \
862 PREPARE_TO_DIE; \
338190a1 863 code; \
77ccba3c 864 UPDATE_WARNINGS_LOC(loc); \
338190a1 865 } \
f8220fe2 866 } STMT_END
946095af 867
5e0a247b 868/* m is not necessarily a "literal string", in this macro */
f8220fe2
KW
869#define reg_warn_non_literal_string(loc, m) \
870 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
871 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 872 "%s" REPORT_LOCATION, \
f8220fe2 873 m, REPORT_LOCATION_ARGS(loc)))
5e0a247b 874
f8220fe2
KW
875#define ckWARNreg(loc,m) \
876 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
877 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 878 m REPORT_LOCATION, \
f8220fe2 879 REPORT_LOCATION_ARGS(loc)))
ccb2c380 880
f8220fe2
KW
881#define vWARN(loc, m) \
882 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
883 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 884 m REPORT_LOCATION, \
f8220fe2 885 REPORT_LOCATION_ARGS(loc))) \
b927b7e9 886
f8220fe2
KW
887#define vWARN_dep(loc, m) \
888 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
889 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
d528642a 890 m REPORT_LOCATION, \
f8220fe2 891 REPORT_LOCATION_ARGS(loc)))
0d6106aa 892
f8220fe2
KW
893#define ckWARNdep(loc,m) \
894 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
895 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
d528642a 896 m REPORT_LOCATION, \
f8220fe2 897 REPORT_LOCATION_ARGS(loc)))
147508a2 898
f8220fe2
KW
899#define ckWARNregdep(loc,m) \
900 _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
901 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
d528642a
KW
902 WARN_REGEXP), \
903 m REPORT_LOCATION, \
f8220fe2 904 REPORT_LOCATION_ARGS(loc)))
ccb2c380 905
f8220fe2
KW
906#define ckWARN2reg_d(loc,m, a1) \
907 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
908 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
d528642a 909 m REPORT_LOCATION, \
f8220fe2 910 a1, REPORT_LOCATION_ARGS(loc)))
2335b3d3 911
f8220fe2
KW
912#define ckWARN2reg(loc, m, a1) \
913 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
914 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 915 m REPORT_LOCATION, \
f8220fe2 916 a1, REPORT_LOCATION_ARGS(loc)))
ccb2c380 917
f8220fe2
KW
918#define vWARN3(loc, m, a1, a2) \
919 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
920 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 921 m REPORT_LOCATION, \
f8220fe2 922 a1, a2, REPORT_LOCATION_ARGS(loc)))
ccb2c380 923
f8220fe2
KW
924#define ckWARN3reg(loc, m, a1, a2) \
925 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
926 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
927 m REPORT_LOCATION, \
928 a1, a2, \
f8220fe2 929 REPORT_LOCATION_ARGS(loc)))
668c081a 930
f8220fe2
KW
931#define vWARN4(loc, m, a1, a2, a3) \
932 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
933 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
934 m REPORT_LOCATION, \
935 a1, a2, a3, \
f8220fe2 936 REPORT_LOCATION_ARGS(loc)))
ccb2c380 937
f8220fe2
KW
938#define ckWARN4reg(loc, m, a1, a2, a3) \
939 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
940 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
941 m REPORT_LOCATION, \
942 a1, a2, a3, \
f8220fe2 943 REPORT_LOCATION_ARGS(loc)))
668c081a 944
f8220fe2
KW
945#define vWARN5(loc, m, a1, a2, a3, a4) \
946 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
947 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
948 m REPORT_LOCATION, \
949 a1, a2, a3, a4, \
f8220fe2 950 REPORT_LOCATION_ARGS(loc)))
9d1d55b5 951
aea5f181
KW
952#define ckWARNexperimental(loc, class, m) \
953 _WARN_HELPER(loc, packWARN(class), \
954 Perl_ck_warner_d(aTHX_ packWARN(class), \
955 m REPORT_LOCATION, \
956 REPORT_LOCATION_ARGS(loc)))
957
1a322bb4
KW
958/* Convert between a pointer to a node and its offset from the beginning of the
959 * program */
960#define REGNODE_p(offset) (RExC_emit_start + (offset))
961#define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
962
538e84ed 963/* Macros for recording node offsets. 20001227 mjd@plover.com
fac92740
MJD
964 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
965 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
966 * Element 0 holds the number n.
07be1b83 967 * Position is 1 indexed.
fac92740 968 */
7122b237 969#ifndef RE_TRACK_PATTERN_OFFSETS
d7c442ce 970#define Set_Node_Offset_To_R(offset,byte)
7122b237
YO
971#define Set_Node_Offset(node,byte)
972#define Set_Cur_Node_Offset
973#define Set_Node_Length_To_R(node,len)
974#define Set_Node_Length(node,len)
6a86c6ad 975#define Set_Node_Cur_Length(node,start)
538e84ed
KW
976#define Node_Offset(n)
977#define Node_Length(n)
7122b237
YO
978#define Set_Node_Offset_Length(node,offset,len)
979#define ProgLen(ri) ri->u.proglen
980#define SetProgLen(ri,x) ri->u.proglen = x
4cbeff97 981#define Track_Code(code)
7122b237
YO
982#else
983#define ProgLen(ri) ri->u.offsets[0]
984#define SetProgLen(ri,x) ri->u.offsets[0] = x
d7c442ce 985#define Set_Node_Offset_To_R(offset,byte) STMT_START { \
ccb2c380 986 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
d7c442ce
KW
987 __LINE__, (int)(offset), (int)(byte))); \
988 if((offset) < 0) { \
538e84ed 989 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
d7c442ce 990 (int)(offset)); \
ccb2c380 991 } else { \
d7c442ce 992 RExC_offsets[2*(offset)-1] = (byte); \
ccb2c380 993 } \
ccb2c380
MP
994} STMT_END
995
88f063b4 996#define Set_Node_Offset(node,byte) \
1a322bb4 997 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
ccb2c380
MP
998#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
999
1000#define Set_Node_Length_To_R(node,len) STMT_START { \
ccb2c380 1001 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 1002 __LINE__, (int)(node), (int)(len))); \
ccb2c380 1003 if((node) < 0) { \
538e84ed
KW
1004 Perl_croak(aTHX_ "value of node is %d in Length macro", \
1005 (int)(node)); \
ccb2c380
MP
1006 } else { \
1007 RExC_offsets[2*(node)] = (len); \
1008 } \
ccb2c380
MP
1009} STMT_END
1010
1011#define Set_Node_Length(node,len) \
1a322bb4 1012 Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
6a86c6ad
NC
1013#define Set_Node_Cur_Length(node, start) \
1014 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
1015
1016/* Get offsets and lengths */
1a322bb4
KW
1017#define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1018#define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
fac92740 1019
07be1b83 1020#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
1a322bb4
KW
1021 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \
1022 Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \
07be1b83 1023} STMT_END
4cbeff97
TC
1024
1025#define Track_Code(code) STMT_START { code } STMT_END
7122b237 1026#endif
07be1b83
YO
1027
1028#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1029#define EXPERIMENTAL_INPLACESCAN
f427392e 1030#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 1031
cb41e5d6
YO
1032#ifdef DEBUGGING
1033int
6ad9a8ab 1034Perl_re_printf(pTHX_ const char *fmt, ...)
cb41e5d6
YO
1035{
1036 va_list ap;
1037 int result;
1038 PerlIO *f= Perl_debug_log;
1039 PERL_ARGS_ASSERT_RE_PRINTF;
1040 va_start(ap, fmt);
1041 result = PerlIO_vprintf(f, fmt, ap);
1042 va_end(ap);
1043 return result;
1044}
1045
1046int
7b031478 1047Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
cb41e5d6
YO
1048{
1049 va_list ap;
1050 int result;
1051 PerlIO *f= Perl_debug_log;
1052 PERL_ARGS_ASSERT_RE_INDENTF;
1053 va_start(ap, depth);
daeb874b 1054 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
cb41e5d6
YO
1055 result = PerlIO_vprintf(f, fmt, ap);
1056 va_end(ap);
1057 return result;
1058}
1059#endif /* DEBUGGING */
1060
7b031478 1061#define DEBUG_RExC_seen() \
538e84ed 1062 DEBUG_OPTIMISE_MORE_r({ \
88f063b4 1063 Perl_re_printf( aTHX_ "RExC_seen: "); \
538e84ed 1064 \
e384d5c1 1065 if (RExC_seen & REG_ZERO_LEN_SEEN) \
88f063b4 1066 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
538e84ed 1067 \
e384d5c1 1068 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
88f063b4 1069 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
538e84ed 1070 \
e384d5c1 1071 if (RExC_seen & REG_GPOS_SEEN) \
88f063b4 1072 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
538e84ed 1073 \
e384d5c1 1074 if (RExC_seen & REG_RECURSE_SEEN) \
88f063b4 1075 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
538e84ed 1076 \
7b031478 1077 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
88f063b4 1078 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
538e84ed 1079 \
e384d5c1 1080 if (RExC_seen & REG_VERBARG_SEEN) \
88f063b4 1081 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
538e84ed 1082 \
e384d5c1 1083 if (RExC_seen & REG_CUTGROUP_SEEN) \
88f063b4 1084 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
538e84ed 1085 \
e384d5c1 1086 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
88f063b4 1087 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
538e84ed 1088 \
e384d5c1 1089 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
88f063b4 1090 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
538e84ed 1091 \
7b031478 1092 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
88f063b4 1093 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
ee273784 1094 \
88f063b4 1095 Perl_re_printf( aTHX_ "\n"); \
9e9ecfdf
YO
1096 });
1097
fdfb4f21 1098#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
6ad9a8ab 1099 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
fdfb4f21 1100
fdfb4f21 1101
f5a36d78
DM
1102#ifdef DEBUGGING
1103static void
1104S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1105 const char *close_str)
1106{
1107 if (!flags)
1108 return;
1109
1110 Perl_re_printf( aTHX_ "%s", open_str);
11683ecb
DM
1111 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1112 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
f5a36d78
DM
1113 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1114 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1115 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1116 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1117 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1118 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1119 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1120 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1121 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1122 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1123 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1124 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1125 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1126 Perl_re_printf( aTHX_ "%s", close_str);
1127}
1128
1129
1130static void
1131S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1132 U32 depth, int is_inf)
1133{
1134 GET_RE_DEBUG_FLAGS_DECL;
1135
1136 DEBUG_OPTIMISE_MORE_r({
1137 if (!data)
1138 return;
1139 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1140 depth,
1141 where,
1142 (IV)data->pos_min,
1143 (IV)data->pos_delta,
1144 (UV)data->flags
1145 );
1146
1147 S_debug_show_study_flags(aTHX_ data->flags," [","]");
fdfb4f21 1148
f5a36d78
DM
1149 Perl_re_printf( aTHX_
1150 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1151 (IV)data->whilem_c,
1152 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1153 is_inf ? "INF " : ""
1154 );
1155
11683ecb 1156 if (data->last_found) {
37b6262f 1157 int i;
11683ecb
DM
1158 Perl_re_printf(aTHX_
1159 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1160 SvPVX_const(data->last_found),
1161 (IV)data->last_end,
1162 (IV)data->last_start_min,
1163 (IV)data->last_start_max
1164 );
1165
2099df82 1166 for (i = 0; i < 2; i++) {
37b6262f
DM
1167 Perl_re_printf(aTHX_
1168 " %s%s: '%s' @ %" IVdf "/%" IVdf,
2099df82
DM
1169 data->cur_is_floating == i ? "*" : "",
1170 i ? "Float" : "Fixed",
37b6262f
DM
1171 SvPVX_const(data->substrs[i].str),
1172 (IV)data->substrs[i].min_offset,
1173 (IV)data->substrs[i].max_offset
1174 );
1175 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1176 }
11683ecb 1177 }
f5a36d78
DM
1178
1179 Perl_re_printf( aTHX_ "\n");
1180 });
1181}
1182
1183
1184static void
1185S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1186 regnode *scan, U32 depth, U32 flags)
1187{
1188 GET_RE_DEBUG_FLAGS_DECL;
1189
1190 DEBUG_OPTIMISE_r({
1191 regnode *Next;
1192
1193 if (!scan)
1194 return;
1195 Next = regnext(scan);
1196 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1197 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1198 depth,
1199 str,
1200 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1201 Next ? (REG_NODE_NUM(Next)) : 0 );
1202 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1203 Perl_re_printf( aTHX_ "\n");
1204 });
1205}
1206
1207
1208# define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1209 S_debug_studydata(aTHX_ where, data, depth, is_inf)
1210
1211# define DEBUG_PEEP(str, scan, depth, flags) \
1212 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1213
1214#else
1215# define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1216# define DEBUG_PEEP(str, scan, depth, flags) NOOP
1217#endif
1de06328 1218
cb41e5d6 1219
c6871b76
KW
1220/* =========================================================
1221 * BEGIN edit_distance stuff.
1222 *
1223 * This calculates how many single character changes of any type are needed to
1224 * transform a string into another one. It is taken from version 3.1 of
1225 *
1226 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1227 */
1228
1229/* Our unsorted dictionary linked list. */
1230/* Note we use UVs, not chars. */
1231
1232struct dictionary{
1233 UV key;
1234 UV value;
1235 struct dictionary* next;
1236};
1237typedef struct dictionary item;
1238
1239
1240PERL_STATIC_INLINE item*
88f063b4 1241push(UV key, item* curr)
c6871b76
KW
1242{
1243 item* head;
d09f14bf 1244 Newx(head, 1, item);
c6871b76
KW
1245 head->key = key;
1246 head->value = 0;
1247 head->next = curr;
1248 return head;
1249}
1250
1251
1252PERL_STATIC_INLINE item*
1253find(item* head, UV key)
1254{
1255 item* iterator = head;
1256 while (iterator){
1257 if (iterator->key == key){
1258 return iterator;
1259 }
1260 iterator = iterator->next;
1261 }
1262
1263 return NULL;
1264}
1265
1266PERL_STATIC_INLINE item*
88f063b4 1267uniquePush(item* head, UV key)
c6871b76
KW
1268{
1269 item* iterator = head;
1270
1271 while (iterator){
1272 if (iterator->key == key) {
1273 return head;
1274 }
1275 iterator = iterator->next;
1276 }
1277
88f063b4 1278 return push(key, head);
c6871b76
KW
1279}
1280
1281PERL_STATIC_INLINE void
1282dict_free(item* head)
1283{
1284 item* iterator = head;
1285
1286 while (iterator) {
1287 item* temp = iterator;
1288 iterator = iterator->next;
1289 Safefree(temp);
1290 }
1291
1292 head = NULL;
1293}
1294
1295/* End of Dictionary Stuff */
1296
1297/* All calculations/work are done here */
1298STATIC int
1299S_edit_distance(const UV* src,
1300 const UV* tgt,
1301 const STRLEN x, /* length of src[] */
1302 const STRLEN y, /* length of tgt[] */
1303 const SSize_t maxDistance
1304)
1305{
1306 item *head = NULL;
88f063b4 1307 UV swapCount, swapScore, targetCharCount, i, j;
c6871b76
KW
1308 UV *scores;
1309 UV score_ceil = x + y;
1310
1311 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1312
1313 /* intialize matrix start values */
d09f14bf 1314 Newx(scores, ( (x + 2) * (y + 2)), UV);
c6871b76
KW
1315 scores[0] = score_ceil;
1316 scores[1 * (y + 2) + 0] = score_ceil;
1317 scores[0 * (y + 2) + 1] = score_ceil;
1318 scores[1 * (y + 2) + 1] = 0;
88f063b4 1319 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
c6871b76
KW
1320
1321 /* work loops */
1322 /* i = src index */
1323 /* j = tgt index */
1324 for (i=1;i<=x;i++) {
1325 if (i < x)
88f063b4 1326 head = uniquePush(head, src[i]);
c6871b76
KW
1327 scores[(i+1) * (y + 2) + 1] = i;
1328 scores[(i+1) * (y + 2) + 0] = score_ceil;
1329 swapCount = 0;
1330
1331 for (j=1;j<=y;j++) {
1332 if (i == 1) {
1333 if(j < y)
88f063b4 1334 head = uniquePush(head, tgt[j]);
c6871b76
KW
1335 scores[1 * (y + 2) + (j + 1)] = j;
1336 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1337 }
1338
88f063b4 1339 targetCharCount = find(head, tgt[j-1])->value;
c6871b76
KW
1340 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1341
1342 if (src[i-1] != tgt[j-1]){
1343 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1344 }
1345 else {
1346 swapCount = j;
1347 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1348 }
1349 }
1350
88f063b4 1351 find(head, src[i-1])->value = i;
c6871b76
KW
1352 }
1353
1354 {
1355 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1356 dict_free(head);
1357 Safefree(scores);
1358 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1359 }
1360}
1361
1362/* END of edit_distance() stuff
1363 * ========================================================= */
1364
8e35b056
KW
1365/* is c a control character for which we have a mnemonic? */
1366#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1367
549b4e78
KW
1368STATIC const char *
1369S_cntrl_to_mnemonic(const U8 c)
1370{
1371 /* Returns the mnemonic string that represents character 'c', if one
1372 * exists; NULL otherwise. The only ones that exist for the purposes of
1373 * this routine are a few control characters */
1374
1375 switch (c) {
1376 case '\a': return "\\a";
1377 case '\b': return "\\b";
1378 case ESC_NATIVE: return "\\e";
1379 case '\f': return "\\f";
1380 case '\n': return "\\n";
1381 case '\r': return "\\r";
1382 case '\t': return "\\t";
1383 }
1384
1385 return NULL;
1386}
1387
653099ff 1388/* Mark that we cannot extend a found fixed substring at this point.
37b6262f 1389 Update the longest found anchored substring or the longest found
653099ff
GS
1390 floating substrings if needed. */
1391
4327152a 1392STATIC void
ea3daa5d
FC
1393S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1394 SSize_t *minlenp, int is_inf)
c277df42 1395{
e1ec3a88 1396 const STRLEN l = CHR_SVLEN(data->last_found);
2099df82 1397 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
a2ca2017 1398 const STRLEN old_l = CHR_SVLEN(longest_sv);
1de06328 1399 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1400
7918f24d
NC
1401 PERL_ARGS_ASSERT_SCAN_COMMIT;
1402
c277df42 1403 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
2099df82 1404 const U8 i = data->cur_is_floating;
a2ca2017 1405 SvSetMagicSV(longest_sv, data->last_found);
37b6262f
DM
1406 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1407
2099df82
DM
1408 if (!i) /* fixed */
1409 data->substrs[0].max_offset = data->substrs[0].min_offset;
1410 else { /* float */
1411 data->substrs[1].max_offset = (l
646e8787
DM
1412 ? data->last_start_max
1413 : (data->pos_delta > SSize_t_MAX - data->pos_min
ea3daa5d
FC
1414 ? SSize_t_MAX
1415 : data->pos_min + data->pos_delta));
1416 if (is_inf
2099df82
DM
1417 || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1418 data->substrs[1].max_offset = SSize_t_MAX;
37b6262f 1419 }
11683ecb 1420
37b6262f
DM
1421 if (data->flags & SF_BEFORE_EOL)
1422 data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
1423 else
1424 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1425 data->substrs[i].minlenp = minlenp;
1426 data->substrs[i].lookbehind = 0;
c277df42 1427 }
37b6262f 1428
c277df42 1429 SvCUR_set(data->last_found, 0);
0eda9292 1430 {
a28509cc 1431 SV * const sv = data->last_found;
097eb12c
AL
1432 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1433 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1434 if (mg)
1435 mg->mg_len = 0;
1436 }
0eda9292 1437 }
c277df42
IZ
1438 data->last_end = -1;
1439 data->flags &= ~SF_BEFORE_EOL;
f5a36d78 1440 DEBUG_STUDYDATA("commit", data, 0, is_inf);
c277df42
IZ
1441}
1442
cdd87c1d
KW
1443/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1444 * list that describes which code points it matches */
1445
653099ff 1446STATIC void
3420edd7 1447S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 1448{
cdd87c1d
KW
1449 /* Set the SSC 'ssc' to match an empty string or any code point */
1450
557bd3fb 1451 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 1452
71068078 1453 assert(is_ANYOF_SYNTHETIC(ssc));
3fffb88a 1454
0854ea0b
KW
1455 /* mortalize so won't leak */
1456 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
93e92956 1457 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
653099ff
GS
1458}
1459
653099ff 1460STATIC int
dc3bf405 1461S_ssc_is_anything(const regnode_ssc *ssc)
653099ff 1462{
c144baaa
KW
1463 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1464 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1465 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1466 * in any way, so there's no point in using it */
cdd87c1d
KW
1467
1468 UV start, end;
1469 bool ret;
653099ff 1470
557bd3fb 1471 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 1472
71068078 1473 assert(is_ANYOF_SYNTHETIC(ssc));
cdd87c1d 1474
93e92956 1475 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
cdd87c1d
KW
1476 return FALSE;
1477 }
1478
1479 /* See if the list consists solely of the range 0 - Infinity */
1480 invlist_iterinit(ssc->invlist);
1481 ret = invlist_iternext(ssc->invlist, &start, &end)
1482 && start == 0
1483 && end == UV_MAX;
1484
1485 invlist_iterfinish(ssc->invlist);
1486
1487 if (ret) {
1488 return TRUE;
1489 }
1490
1491 /* If e.g., both \w and \W are set, matches everything */
e0e1be5f 1492 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1493 int i;
1494 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1495 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1496 return TRUE;
1497 }
1498 }
1499 }
1500
1501 return FALSE;
653099ff
GS
1502}
1503
653099ff 1504STATIC void
cdd87c1d 1505S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 1506{
cdd87c1d
KW
1507 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1508 * string, any code point, or any posix class under locale */
1509
557bd3fb 1510 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 1511
557bd3fb 1512 Zero(ssc, 1, regnode_ssc);
71068078 1513 set_ANYOF_SYNTHETIC(ssc);
93e92956 1514 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
3420edd7 1515 ssc_anything(ssc);
cdd87c1d 1516
2f306ab9
KW
1517 /* If any portion of the regex is to operate under locale rules that aren't
1518 * fully known at compile time, initialization includes it. The reason
1519 * this isn't done for all regexes is that the optimizer was written under
1520 * the assumption that locale was all-or-nothing. Given the complexity and
1521 * lack of documentation in the optimizer, and that there are inadequate
1522 * test cases for locale, many parts of it may not work properly, it is
1523 * safest to avoid locale unless necessary. */
cdd87c1d
KW
1524 if (RExC_contains_locale) {
1525 ANYOF_POSIXL_SETALL(ssc);
cdd87c1d
KW
1526 }
1527 else {
1528 ANYOF_POSIXL_ZERO(ssc);
1529 }
653099ff
GS
1530}
1531
b423522f 1532STATIC int
dc3bf405
BF
1533S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1534 const regnode_ssc *ssc)
b423522f
KW
1535{
1536 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1537 * to the list of code points matched, and locale posix classes; hence does
1538 * not check its flags) */
1539
1540 UV start, end;
1541 bool ret;
1542
1543 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1544
71068078 1545 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1546
1547 invlist_iterinit(ssc->invlist);
1548 ret = invlist_iternext(ssc->invlist, &start, &end)
1549 && start == 0
1550 && end == UV_MAX;
1551
1552 invlist_iterfinish(ssc->invlist);
1553
1554 if (! ret) {
1555 return FALSE;
1556 }
1557
e0e1be5f 1558 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
31f05a37 1559 return FALSE;
b423522f
KW
1560 }
1561
1562 return TRUE;
1563}
1564
4ebed06a
KW
1565#define INVLIST_INDEX 0
1566#define ONLY_LOCALE_MATCHES_INDEX 1
1567#define DEFERRED_USER_DEFINED_INDEX 2
1568
b423522f
KW
1569STATIC SV*
1570S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
5c0f85ef 1571 const regnode_charclass* const node)
b423522f
KW
1572{
1573 /* Returns a mortal inversion list defining which code points are matched
1574 * by 'node', which is of type ANYOF. Handles complementing the result if
1575 * appropriate. If some code points aren't knowable at this time, the
31f05a37
KW
1576 * returned list must, and will, contain every code point that is a
1577 * possibility. */
b423522f 1578
1565c085 1579 dVAR;
e2506fa7 1580 SV* invlist = NULL;
1ee208c4 1581 SV* only_utf8_locale_invlist = NULL;
b423522f
KW
1582 unsigned int i;
1583 const U32 n = ARG(node);
31f05a37 1584 bool new_node_has_latin1 = FALSE;
3146c00a 1585 const U8 flags = (inRANGE(OP(node), ANYOFH, ANYOFHr))
29a889ef
KW
1586 ? 0
1587 : ANYOF_FLAGS(node);
b423522f
KW
1588
1589 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1590
1591 /* Look at the data structure created by S_set_ANYOF_arg() */
93e92956 1592 if (n != ANYOF_ONLY_HAS_BITMAP) {
b423522f
KW
1593 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1594 AV * const av = MUTABLE_AV(SvRV(rv));
1595 SV **const ary = AvARRAY(av);
1596 assert(RExC_rxi->data->what[n] == 's');
1597
4ebed06a 1598 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
b423522f 1599
6ed02bb6
KW
1600 /* Here there are things that won't be known until runtime -- we
1601 * have to assume it could be anything */
e2506fa7 1602 invlist = sv_2mortal(_new_invlist(1));
b423522f
KW
1603 return _add_range_to_invlist(invlist, 0, UV_MAX);
1604 }
4ebed06a 1605 else if (ary[INVLIST_INDEX]) {
b423522f 1606
6ed02bb6 1607 /* Use the node's inversion list */
4ebed06a 1608 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1ee208c4
KW
1609 }
1610
1611 /* Get the code points valid only under UTF-8 locales */
766d6d33 1612 if ( (flags & ANYOFL_FOLD)
4ebed06a 1613 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1ee208c4 1614 {
4ebed06a 1615 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
b423522f
KW
1616 }
1617 }
1618
e2506fa7
KW
1619 if (! invlist) {
1620 invlist = sv_2mortal(_new_invlist(0));
1621 }
1622
dcb20b36
KW
1623 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1624 * code points, and an inversion list for the others, but if there are code
1625 * points that should match only conditionally on the target string being
1626 * UTF-8, those are placed in the inversion list, and not the bitmap.
1627 * Since there are circumstances under which they could match, they are
1628 * included in the SSC. But if the ANYOF node is to be inverted, we have
1629 * to exclude them here, so that when we invert below, the end result
1630 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1631 * have to do this here before we add the unconditionally matched code
1632 * points */
766d6d33 1633 if (flags & ANYOF_INVERT) {
b423522f
KW
1634 _invlist_intersection_complement_2nd(invlist,
1635 PL_UpperLatin1,
1636 &invlist);
1637 }
1638
1639 /* Add in the points from the bit map */
3146c00a 1640 if (! inRANGE(OP(node), ANYOFH, ANYOFHr)) {
90973738
KW
1641 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1642 if (ANYOF_BITMAP_TEST(node, i)) {
1643 unsigned int start = i++;
1644
1645 for (; i < NUM_ANYOF_CODE_POINTS
1646 && ANYOF_BITMAP_TEST(node, i); ++i)
1647 {
1648 /* empty */
1649 }
1650 invlist = _add_range_to_invlist(invlist, start, i-1);
1651 new_node_has_latin1 = TRUE;
6f8848d5 1652 }
b423522f
KW
1653 }
1654 }
1655
1656 /* If this can match all upper Latin1 code points, have to add them
ac33c516
KW
1657 * as well. But don't add them if inverting, as when that gets done below,
1658 * it would exclude all these characters, including the ones it shouldn't
1659 * that were added just above */
766d6d33
KW
1660 if (! (flags & ANYOF_INVERT) && OP(node) == ANYOFD
1661 && (flags & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
f240c685 1662 {
b423522f
KW
1663 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1664 }
1665
1666 /* Similarly for these */
766d6d33 1667 if (flags & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
e0a1ff7a 1668 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
b423522f
KW
1669 }
1670
766d6d33 1671 if (flags & ANYOF_INVERT) {
b423522f
KW
1672 _invlist_invert(invlist);
1673 }
766d6d33 1674 else if (flags & ANYOFL_FOLD) {
35b8412f 1675 if (new_node_has_latin1) {
31f05a37 1676
26be5fe6
KW
1677 /* Under /li, any 0-255 could fold to any other 0-255, depending on
1678 * the locale. We can skip this if there are no 0-255 at all. */
1679 _invlist_union(invlist, PL_Latin1, &invlist);
35b8412f
KW
1680
1681 invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1682 invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1683 }
1684 else {
1685 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1686 invlist = add_cp_to_invlist(invlist, 'I');
1687 }
1688 if (_invlist_contains_cp(invlist,
1689 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1690 {
1691 invlist = add_cp_to_invlist(invlist, 'i');
1692 }
1693 }
31f05a37
KW
1694 }
1695
1ee208c4
KW
1696 /* Similarly add the UTF-8 locale possible matches. These have to be
1697 * deferred until after the non-UTF-8 locale ones are taken care of just
1698 * above, or it leads to wrong results under ANYOF_INVERT */
1699 if (only_utf8_locale_invlist) {
31f05a37 1700 _invlist_union_maybe_complement_2nd(invlist,
1ee208c4 1701 only_utf8_locale_invlist,
766d6d33 1702 flags & ANYOF_INVERT,
31f05a37
KW
1703 &invlist);
1704 }
b423522f
KW
1705
1706 return invlist;
1707}
1708
1051e1c4 1709/* These two functions currently do the exact same thing */
557bd3fb 1710#define ssc_init_zero ssc_init
653099ff 1711
cdd87c1d
KW
1712#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1713#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1714
557bd3fb 1715/* 'AND' a given class with another one. Can create false positives. 'ssc'
93e92956
KW
1716 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1717 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1718
653099ff 1719STATIC void
b423522f 1720S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1721 const regnode_charclass *and_with)
653099ff 1722{
cdd87c1d
KW
1723 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1724 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1725
a0dd4231 1726 SV* anded_cp_list;
3146c00a 1727 U8 and_with_flags = inRANGE(OP(and_with), ANYOFH, ANYOFHr)
29a889ef
KW
1728 ? 0
1729 : ANYOF_FLAGS(and_with);
a0dd4231 1730 U8 anded_flags;
1e6ade67 1731
cdd87c1d 1732 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1733
71068078 1734 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1735
1736 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1737 * the code point inversion list and just the relevant flags */
71068078 1738 if (is_ANYOF_SYNTHETIC(and_with)) {
7dcac5f6 1739 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
765e6ecf 1740 anded_flags = and_with_flags;
e9b08962
KW
1741
1742 /* XXX This is a kludge around what appears to be deficiencies in the
1743 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1744 * there are paths through the optimizer where it doesn't get weeded
1745 * out when it should. And if we don't make some extra provision for
1746 * it like the code just below, it doesn't get added when it should.
1747 * This solution is to add it only when AND'ing, which is here, and
1748 * only when what is being AND'ed is the pristine, original node
1749 * matching anything. Thus it is like adding it to ssc_anything() but
1750 * only when the result is to be AND'ed. Probably the same solution
1751 * could be adopted for the same problem we have with /l matching,
1752 * which is solved differently in S_ssc_init(), and that would lead to
1753 * fewer false positives than that solution has. But if this solution
1754 * creates bugs, the consequences are only that a warning isn't raised
1755 * that should be; while the consequences for having /l bugs is
1756 * incorrect matches */
7dcac5f6 1757 if (ssc_is_anything((regnode_ssc *)and_with)) {
f240c685 1758 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
e9b08962 1759 }
a0dd4231
KW
1760 }
1761 else {
5c0f85ef 1762 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
f240c685 1763 if (OP(and_with) == ANYOFD) {
765e6ecf 1764 anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
f240c685
KW
1765 }
1766 else {
765e6ecf 1767 anded_flags = and_with_flags
f240c685 1768 &( ANYOF_COMMON_FLAGS
108316fb
KW
1769 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1770 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
765e6ecf 1771 if (ANYOFL_UTF8_LOCALE_REQD(and_with_flags)) {
d1c40ef5
KW
1772 anded_flags &=
1773 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1774 }
f240c685 1775 }
a0dd4231
KW
1776 }
1777
1778 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1779
1780 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1781 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1782 * 'and_with' may be inverted. When not inverted, we have the situation of
1783 * computing:
1784 * (C1 | P1) & (C2 | P2)
1785 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1786 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1787 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1788 * <= ((C1 & C2) | P1 | P2)
1789 * Alternatively, the last few steps could be:
1790 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1791 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1792 * <= (C1 | C2 | (P1 & P2))
1793 * We favor the second approach if either P1 or P2 is non-empty. This is
1794 * because these components are a barrier to doing optimizations, as what
1795 * they match cannot be known until the moment of matching as they are
1796 * dependent on the current locale, 'AND"ing them likely will reduce or
1797 * eliminate them.
1798 * But we can do better if we know that C1,P1 are in their initial state (a
1799 * frequent occurrence), each matching everything:
1800 * (<everything>) & (C2 | P2) = C2 | P2
1801 * Similarly, if C2,P2 are in their initial state (again a frequent
1802 * occurrence), the result is a no-op
1803 * (C1 | P1) & (<everything>) = C1 | P1
1804 *
1805 * Inverted, we have
1806 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1807 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1808 * <= (C1 & ~C2) | (P1 & ~P2)
1809 * */
1aa99e6b 1810
765e6ecf 1811 if ((and_with_flags & ANYOF_INVERT)
71068078 1812 && ! is_ANYOF_SYNTHETIC(and_with))
a0dd4231 1813 {
cdd87c1d 1814 unsigned int i;
8951c461 1815
cdd87c1d
KW
1816 ssc_intersection(ssc,
1817 anded_cp_list,
1818 FALSE /* Has already been inverted */
1819 );
c6b76537 1820
cdd87c1d
KW
1821 /* If either P1 or P2 is empty, the intersection will be also; can skip
1822 * the loop */
765e6ecf 1823 if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
cdd87c1d
KW
1824 ANYOF_POSIXL_ZERO(ssc);
1825 }
e0e1be5f 1826 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1827
1828 /* Note that the Posix class component P from 'and_with' actually
1829 * looks like:
1830 * P = Pa | Pb | ... | Pn
1831 * where each component is one posix class, such as in [\w\s].
1832 * Thus
1833 * ~P = ~(Pa | Pb | ... | Pn)
1834 * = ~Pa & ~Pb & ... & ~Pn
1835 * <= ~Pa | ~Pb | ... | ~Pn
1836 * The last is something we can easily calculate, but unfortunately
1837 * is likely to have many false positives. We could do better
1838 * in some (but certainly not all) instances if two classes in
1839 * P have known relationships. For example
1840 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1841 * So
1842 * :lower: & :print: = :lower:
1843 * And similarly for classes that must be disjoint. For example,
1844 * since \s and \w can have no elements in common based on rules in
1845 * the POSIX standard,
1846 * \w & ^\S = nothing
1847 * Unfortunately, some vendor locales do not meet the Posix
1848 * standard, in particular almost everything by Microsoft.
1849 * The loop below just changes e.g., \w into \W and vice versa */
1850
1ee208c4 1851 regnode_charclass_posixl temp;
cdd87c1d
KW
1852 int add = 1; /* To calculate the index of the complement */
1853
b1234259 1854 Zero(&temp, 1, regnode_charclass_posixl);
cdd87c1d
KW
1855 ANYOF_POSIXL_ZERO(&temp);
1856 for (i = 0; i < ANYOF_MAX; i++) {
1857 assert(i % 2 != 0
7dcac5f6
KW
1858 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1859 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
cdd87c1d 1860
7dcac5f6 1861 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
cdd87c1d
KW
1862 ANYOF_POSIXL_SET(&temp, i + add);
1863 }
1864 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1865 }
1866 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1867
cdd87c1d
KW
1868 } /* else ssc already has no posixes */
1869 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1870 in its initial state */
71068078 1871 else if (! is_ANYOF_SYNTHETIC(and_with)
7dcac5f6 1872 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
cdd87c1d
KW
1873 {
1874 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1875 * copy it over 'ssc' */
1876 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
71068078 1877 if (is_ANYOF_SYNTHETIC(and_with)) {
cdd87c1d
KW
1878 StructCopy(and_with, ssc, regnode_ssc);
1879 }
1880 else {
1881 ssc->invlist = anded_cp_list;
1882 ANYOF_POSIXL_ZERO(ssc);
765e6ecf 1883 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1884 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
cdd87c1d
KW
1885 }
1886 }
1887 }
e0e1be5f 1888 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
765e6ecf 1889 || (and_with_flags & ANYOF_MATCHES_POSIXL))
cdd87c1d
KW
1890 {
1891 /* One or the other of P1, P2 is non-empty. */
765e6ecf 1892 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
1ea8b7fe
KW
1893 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1894 }
cdd87c1d
KW
1895 ssc_union(ssc, anded_cp_list, FALSE);
1896 }
1897 else { /* P1 = P2 = empty */
1898 ssc_intersection(ssc, anded_cp_list, FALSE);
1899 }
137165a6 1900 }
653099ff
GS
1901}
1902
653099ff 1903STATIC void
cdd87c1d 1904S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1905 const regnode_charclass *or_with)
653099ff 1906{
cdd87c1d
KW
1907 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1908 * another SSC or a regular ANYOF class. Can create false positives if
1909 * 'or_with' is to be inverted. */
7918f24d 1910
a0dd4231
KW
1911 SV* ored_cp_list;
1912 U8 ored_flags;
3146c00a 1913 U8 or_with_flags = inRANGE(OP(or_with), ANYOFH, ANYOFHr)
29a889ef
KW
1914 ? 0
1915 : ANYOF_FLAGS(or_with);
c6b76537 1916
cdd87c1d 1917 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 1918
71068078 1919 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1920
1921 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1922 * the code point inversion list and just the relevant flags */
71068078 1923 if (is_ANYOF_SYNTHETIC(or_with)) {
7dcac5f6 1924 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
765e6ecf 1925 ored_flags = or_with_flags;
a0dd4231
KW
1926 }
1927 else {
5c0f85ef 1928 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
765e6ecf 1929 ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
f240c685
KW
1930 if (OP(or_with) != ANYOFD) {
1931 ored_flags
765e6ecf 1932 |= or_with_flags
108316fb
KW
1933 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1934 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
765e6ecf 1935 if (ANYOFL_UTF8_LOCALE_REQD(or_with_flags)) {
d1c40ef5
KW
1936 ored_flags |=
1937 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1938 }
f240c685 1939 }
a0dd4231
KW
1940 }
1941
1942 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
1943
1944 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1945 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1946 * 'or_with' may be inverted. When not inverted, we have the simple
1947 * situation of computing:
1948 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1949 * If P1|P2 yields a situation with both a class and its complement are
1950 * set, like having both \w and \W, this matches all code points, and we
1951 * can delete these from the P component of the ssc going forward. XXX We
1952 * might be able to delete all the P components, but I (khw) am not certain
1953 * about this, and it is better to be safe.
1954 *
1955 * Inverted, we have
1956 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1957 * <= (C1 | P1) | ~C2
1958 * <= (C1 | ~C2) | P1
1959 * (which results in actually simpler code than the non-inverted case)
1960 * */
9826f543 1961
765e6ecf 1962 if ((or_with_flags & ANYOF_INVERT)
71068078 1963 && ! is_ANYOF_SYNTHETIC(or_with))
a0dd4231 1964 {
cdd87c1d 1965 /* We ignore P2, leaving P1 going forward */
1ea8b7fe 1966 } /* else Not inverted */
765e6ecf 1967 else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1968 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
e0e1be5f 1969 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1970 unsigned int i;
1971 for (i = 0; i < ANYOF_MAX; i += 2) {
1972 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1973 {
1974 ssc_match_all_cp(ssc);
1975 ANYOF_POSIXL_CLEAR(ssc, i);
1976 ANYOF_POSIXL_CLEAR(ssc, i+1);
cdd87c1d
KW
1977 }
1978 }
1979 }
1aa99e6b 1980 }
cdd87c1d
KW
1981
1982 ssc_union(ssc,
1983 ored_cp_list,
1984 FALSE /* Already has been inverted */
1985 );
653099ff
GS
1986}
1987
b423522f
KW
1988PERL_STATIC_INLINE void
1989S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1990{
1991 PERL_ARGS_ASSERT_SSC_UNION;
1992
71068078 1993 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1994
1995 _invlist_union_maybe_complement_2nd(ssc->invlist,
1996 invlist,
1997 invert2nd,
1998 &ssc->invlist);
1999}
2000
2001PERL_STATIC_INLINE void
2002S_ssc_intersection(pTHX_ regnode_ssc *ssc,
2003 SV* const invlist,
2004 const bool invert2nd)
2005{
2006 PERL_ARGS_ASSERT_SSC_INTERSECTION;
2007
71068078 2008 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2009
2010 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
2011 invlist,
2012 invert2nd,
2013 &ssc->invlist);
2014}
2015
2016PERL_STATIC_INLINE void
2017S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
2018{
2019 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2020
71068078 2021 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2022
2023 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2024}
2025
2026PERL_STATIC_INLINE void
2027S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2028{
2029 /* AND just the single code point 'cp' into the SSC 'ssc' */
2030
2031 SV* cp_list = _new_invlist(2);
2032
2033 PERL_ARGS_ASSERT_SSC_CP_AND;
2034
71068078 2035 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2036
2037 cp_list = add_cp_to_invlist(cp_list, cp);
2038 ssc_intersection(ssc, cp_list,
2039 FALSE /* Not inverted */
2040 );
2041 SvREFCNT_dec_NN(cp_list);
2042}
2043
2044PERL_STATIC_INLINE void
dc3bf405 2045S_ssc_clear_locale(regnode_ssc *ssc)
b423522f
KW
2046{
2047 /* Set the SSC 'ssc' to not match any locale things */
b423522f
KW
2048 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2049
71068078 2050 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2051
2052 ANYOF_POSIXL_ZERO(ssc);
2053 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2054}
2055
b35552de
KW
2056#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2057
2058STATIC bool
2059S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2060{
2061 /* The synthetic start class is used to hopefully quickly winnow down
2062 * places where a pattern could start a match in the target string. If it
2063 * doesn't really narrow things down that much, there isn't much point to
2064 * having the overhead of using it. This function uses some very crude
2065 * heuristics to decide if to use the ssc or not.
2066 *
2067 * It returns TRUE if 'ssc' rules out more than half what it considers to
2068 * be the "likely" possible matches, but of course it doesn't know what the
2069 * actual things being matched are going to be; these are only guesses
2070 *
2071 * For /l matches, it assumes that the only likely matches are going to be
2072 * in the 0-255 range, uniformly distributed, so half of that is 127
2073 * For /a and /d matches, it assumes that the likely matches will be just
2074 * the ASCII range, so half of that is 63
2075 * For /u and there isn't anything matching above the Latin1 range, it
2076 * assumes that that is the only range likely to be matched, and uses
2077 * half that as the cut-off: 127. If anything matches above Latin1,
2078 * it assumes that all of Unicode could match (uniformly), except for
2079 * non-Unicode code points and things in the General Category "Other"
2080 * (unassigned, private use, surrogates, controls and formats). This
2081 * is a much large number. */
2082
b35552de
KW
2083 U32 count = 0; /* Running total of number of code points matched by
2084 'ssc' */
2085 UV start, end; /* Start and end points of current range in inversion
26be5fe6 2086 XXX outdated. UTF-8 locales are common, what about invert? list */
72400949
KW
2087 const U32 max_code_points = (LOC)
2088 ? 256
f6e8f31e 2089 : (( ! UNI_SEMANTICS
1a26cbcb 2090 || invlist_highest(ssc->invlist) < 256)
72400949
KW
2091 ? 128
2092 : NON_OTHER_COUNT);
2093 const U32 max_match = max_code_points / 2;
b35552de
KW
2094
2095 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2096
2097 invlist_iterinit(ssc->invlist);
2098 while (invlist_iternext(ssc->invlist, &start, &end)) {
72400949
KW
2099 if (start >= max_code_points) {
2100 break;
b35552de 2101 }
72400949 2102 end = MIN(end, max_code_points - 1);
b35552de 2103 count += end - start + 1;
72400949 2104 if (count >= max_match) {
b35552de
KW
2105 invlist_iterfinish(ssc->invlist);
2106 return FALSE;
2107 }
2108 }
2109
2110 return TRUE;
2111}
2112
2113
b423522f
KW
2114STATIC void
2115S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2116{
2117 /* The inversion list in the SSC is marked mortal; now we need a more
2118 * permanent copy, which is stored the same way that is done in a regular
dcb20b36
KW
2119 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2120 * map */
b423522f 2121
28118b9c 2122 SV* invlist = invlist_clone(ssc->invlist, NULL);
b423522f
KW
2123
2124 PERL_ARGS_ASSERT_SSC_FINALIZE;
2125
71068078 2126 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f 2127
a0dd4231 2128 /* The code in this file assumes that all but these flags aren't relevant
93e92956
KW
2129 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2130 * by the time we reach here */
f240c685
KW
2131 assert(! (ANYOF_FLAGS(ssc)
2132 & ~( ANYOF_COMMON_FLAGS
108316fb
KW
2133 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2134 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
a0dd4231 2135
b423522f
KW
2136 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2137
4c404f26 2138 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
b423522f 2139
85c8e306
KW
2140 /* Make sure is clone-safe */
2141 ssc->invlist = NULL;
2142
e0e1be5f 2143 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
93e92956 2144 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
d156f5cb 2145 OP(ssc) = ANYOFPOSIXL;
e0e1be5f 2146 }
d156f5cb 2147 else if (RExC_contains_locale) {
b2e90ddf
KW
2148 OP(ssc) = ANYOFL;
2149 }
2150
1462525b 2151 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
b423522f
KW
2152}
2153
a3621e74
YO
2154#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2155#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2156#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
538e84ed
KW
2157#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2158 ? (TRIE_LIST_CUR( idx ) - 1) \
2159 : 0 )
a3621e74 2160
3dab1dad
YO
2161
2162#ifdef DEBUGGING
07be1b83 2163/*
2b8b4781
NC
2164 dump_trie(trie,widecharmap,revcharmap)
2165 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2166 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
2167
2168 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
2169 The _interim_ variants are used for debugging the interim
2170 tables that are used to generate the final compressed
2171 representation which is what dump_trie expects.
2172
486ec47a 2173 Part of the reason for their existence is to provide a form
3dab1dad 2174 of documentation as to how the different representations function.
07be1b83
YO
2175
2176*/
3dab1dad
YO
2177
2178/*
3dab1dad
YO
2179 Dumps the final compressed table form of the trie to Perl_debug_log.
2180 Used for debugging make_trie().
2181*/
b9a59e08 2182
3dab1dad 2183STATIC void
2b8b4781
NC
2184S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2185 AV *revcharmap, U32 depth)
3dab1dad
YO
2186{
2187 U32 state;
ab3bbdeb 2188 SV *sv=sv_newmortal();
55eed653 2189 int colwidth= widecharmap ? 6 : 4;
2e64971a 2190 U16 word;
3dab1dad
YO
2191 GET_RE_DEBUG_FLAGS_DECL;
2192
7918f24d 2193 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 2194
6ad9a8ab 2195 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
1e37780e 2196 depth+1, "Match","Base","Ofs" );
3dab1dad
YO
2197
2198 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 2199 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 2200 if ( tmp ) {
6ad9a8ab 2201 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2202 colwidth,
538e84ed 2203 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2204 PL_colors[0], PL_colors[1],
2205 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2206 PERL_PV_ESCAPE_FIRSTCHAR
2207 )
ab3bbdeb 2208 );
3dab1dad
YO
2209 }
2210 }
1e37780e
YO
2211 Perl_re_printf( aTHX_ "\n");
2212 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
3dab1dad
YO
2213
2214 for( state = 0 ; state < trie->uniquecharcount ; state++ )
6ad9a8ab
YO
2215 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2216 Perl_re_printf( aTHX_ "\n");
3dab1dad 2217
1e2e3d02 2218 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 2219 const U32 base = trie->states[ state ].trans.base;
3dab1dad 2220
147e3846 2221 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
3dab1dad
YO
2222
2223 if ( trie->states[ state ].wordnum ) {
1e37780e 2224 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
3dab1dad 2225 } else {
6ad9a8ab 2226 Perl_re_printf( aTHX_ "%6s", "" );
3dab1dad
YO
2227 }
2228
147e3846 2229 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
3dab1dad
YO
2230
2231 if ( base ) {
2232 U32 ofs = 0;
2233
2234 while( ( base + ofs < trie->uniquecharcount ) ||
2235 ( base + ofs - trie->uniquecharcount < trie->lasttrans
538e84ed
KW
2236 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2237 != state))
3dab1dad
YO
2238 ofs++;
2239
147e3846 2240 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
3dab1dad
YO
2241
2242 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
538e84ed
KW
2243 if ( ( base + ofs >= trie->uniquecharcount )
2244 && ( base + ofs - trie->uniquecharcount
2245 < trie->lasttrans )
2246 && trie->trans[ base + ofs
2247 - trie->uniquecharcount ].check == state )
3dab1dad 2248 {
147e3846 2249 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
1e37780e
YO
2250 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2251 );
3dab1dad 2252 } else {
88f063b4 2253 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
3dab1dad
YO
2254 }
2255 }
2256
6ad9a8ab 2257 Perl_re_printf( aTHX_ "]");
3dab1dad
YO
2258
2259 }
6ad9a8ab 2260 Perl_re_printf( aTHX_ "\n" );
3dab1dad 2261 }
6ad9a8ab 2262 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
cb41e5d6 2263 depth);
2e64971a 2264 for (word=1; word <= trie->wordcount; word++) {
6ad9a8ab 2265 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2e64971a
DM
2266 (int)word, (int)(trie->wordinfo[word].prev),
2267 (int)(trie->wordinfo[word].len));
2268 }
6ad9a8ab 2269 Perl_re_printf( aTHX_ "\n" );
538e84ed 2270}
3dab1dad 2271/*
3dab1dad 2272 Dumps a fully constructed but uncompressed trie in list form.
538e84ed 2273 List tries normally only are used for construction when the number of
3dab1dad
YO
2274 possible chars (trie->uniquecharcount) is very high.
2275 Used for debugging make_trie().
2276*/
2277STATIC void
55eed653 2278S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2279 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2280 U32 depth)
3dab1dad
YO
2281{
2282 U32 state;
ab3bbdeb 2283 SV *sv=sv_newmortal();
55eed653 2284 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2285 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2286
2287 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2288
3dab1dad 2289 /* print out the table precompression. */
6ad9a8ab 2290 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
cb41e5d6 2291 depth+1 );
6ad9a8ab 2292 Perl_re_indentf( aTHX_ "%s",
cb41e5d6 2293 depth+1, "------:-----+-----------------\n" );
538e84ed 2294
3dab1dad
YO
2295 for( state=1 ; state < next_alloc ; state ++ ) {
2296 U16 charid;
538e84ed 2297
147e3846 2298 Perl_re_indentf( aTHX_ " %4" UVXf " :",
cb41e5d6 2299 depth+1, (UV)state );
3dab1dad 2300 if ( ! trie->states[ state ].wordnum ) {
6ad9a8ab 2301 Perl_re_printf( aTHX_ "%5s| ","");
3dab1dad 2302 } else {
6ad9a8ab 2303 Perl_re_printf( aTHX_ "W%4x| ",
3dab1dad
YO
2304 trie->states[ state ].wordnum
2305 );
2306 }
2307 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
538e84ed 2308 SV ** const tmp = av_fetch( revcharmap,
88f063b4 2309 TRIE_LIST_ITEM(state, charid).forid, 0);
ab3bbdeb 2310 if ( tmp ) {
147e3846 2311 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
ab3bbdeb 2312 colwidth,
538e84ed
KW
2313 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2314 colwidth,
2315 PL_colors[0], PL_colors[1],
2316 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2317 | PERL_PV_ESCAPE_FIRSTCHAR
ab3bbdeb 2318 ) ,
88f063b4
KW
2319 TRIE_LIST_ITEM(state, charid).forid,
2320 (UV)TRIE_LIST_ITEM(state, charid).newstate
1e2e3d02 2321 );
538e84ed 2322 if (!(charid % 10))
6ad9a8ab 2323 Perl_re_printf( aTHX_ "\n%*s| ",
664e119d 2324 (int)((depth * 2) + 14), "");
1e2e3d02 2325 }
ab3bbdeb 2326 }
6ad9a8ab 2327 Perl_re_printf( aTHX_ "\n");
3dab1dad 2328 }
538e84ed 2329}
3dab1dad
YO
2330
2331/*
3dab1dad 2332 Dumps a fully constructed but uncompressed trie in table form.
538e84ed
KW
2333 This is the normal DFA style state transition table, with a few
2334 twists to facilitate compression later.
3dab1dad
YO
2335 Used for debugging make_trie().
2336*/
2337STATIC void
55eed653 2338S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2339 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2340 U32 depth)
3dab1dad
YO
2341{
2342 U32 state;
2343 U16 charid;
ab3bbdeb 2344 SV *sv=sv_newmortal();
55eed653 2345 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2346 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2347
2348 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
538e84ed 2349
3dab1dad
YO
2350 /*
2351 print out the table precompression so that we can do a visual check
2352 that they are identical.
2353 */
538e84ed 2354
6ad9a8ab 2355 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
3dab1dad
YO
2356
2357 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 2358 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 2359 if ( tmp ) {
6ad9a8ab 2360 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2361 colwidth,
538e84ed 2362 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2363 PL_colors[0], PL_colors[1],
2364 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2365 PERL_PV_ESCAPE_FIRSTCHAR
2366 )
ab3bbdeb 2367 );
3dab1dad
YO
2368 }
2369 }
2370
4aaafc03
YO
2371 Perl_re_printf( aTHX_ "\n");
2372 Perl_re_indentf( aTHX_ "State+-", depth+1 );
3dab1dad
YO
2373
2374 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
6ad9a8ab 2375 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
3dab1dad
YO
2376 }
2377
6ad9a8ab 2378 Perl_re_printf( aTHX_ "\n" );
3dab1dad
YO
2379
2380 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2381
147e3846 2382 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
cb41e5d6 2383 depth+1,
3dab1dad
YO
2384 (UV)TRIE_NODENUM( state ) );
2385
2386 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
2387 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2388 if (v)
147e3846 2389 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
ab3bbdeb 2390 else
6ad9a8ab 2391 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
3dab1dad
YO
2392 }
2393 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
147e3846 2394 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
538e84ed 2395 (UV)trie->trans[ state ].check );
3dab1dad 2396 } else {
147e3846 2397 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
538e84ed 2398 (UV)trie->trans[ state ].check,
3dab1dad
YO
2399 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2400 }
2401 }
07be1b83 2402}
3dab1dad
YO
2403
2404#endif
2405
2e64971a 2406
786e8c11
YO
2407/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2408 startbranch: the first branch in the whole branch sequence
2409 first : start branch of sequence of branch-exact nodes.
2410 May be the same as startbranch
2411 last : Thing following the last branch.
2412 May be the same as tail.
2413 tail : item following the branch sequence
2414 count : words in the sequence
a4525e78 2415 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
786e8c11 2416 depth : indent depth
3dab1dad 2417
786e8c11 2418Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 2419
786e8c11
YO
2420A trie is an N'ary tree where the branches are determined by digital
2421decomposition of the key. IE, at the root node you look up the 1st character and
2422follow that branch repeat until you find the end of the branches. Nodes can be
2423marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 2424
786e8c11 2425 /he|she|his|hers/
72f13be8 2426
786e8c11
YO
2427would convert into the following structure. Numbers represent states, letters
2428following numbers represent valid transitions on the letter from that state, if
2429the number is in square brackets it represents an accepting state, otherwise it
2430will be in parenthesis.
07be1b83 2431
786e8c11
YO
2432 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2433 | |
2434 | (2)
2435 | |
2436 (1) +-i->(6)-+-s->[7]
2437 |
2438 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 2439
786e8c11
YO
2440 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2441
2442This shows that when matching against the string 'hers' we will begin at state 1
2443read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2444then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2445is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2446single traverse. We store a mapping from accepting to state to which word was
2447matched, and then when we have multiple possibilities we try to complete the
b8fda935 2448rest of the regex in the order in which they occurred in the alternation.
786e8c11
YO
2449
2450The only prior NFA like behaviour that would be changed by the TRIE support is
2451the silent ignoring of duplicate alternations which are of the form:
2452
2453 / (DUPE|DUPE) X? (?{ ... }) Y /x
2454
4b714af6 2455Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 2456and without the optimisation. With the optimisations dupes will be silently
486ec47a 2457ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
2458the following demonstrates:
2459
2460 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2461
2462which prints out 'word' three times, but
2463
2464 'words'=~/(word|word|word)(?{ print $1 })S/
2465
2466which doesnt print it out at all. This is due to other optimisations kicking in.
2467
2468Example of what happens on a structural level:
2469
486ec47a 2470The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
2471
2472 1: CURLYM[1] {1,32767}(18)
2473 5: BRANCH(8)
2474 6: EXACT <ac>(16)
2475 8: BRANCH(11)
2476 9: EXACT <ad>(16)
2477 11: BRANCH(14)
2478 12: EXACT <ab>(16)
2479 16: SUCCEED(0)
2480 17: NOTHING(18)
2481 18: END(0)
2482
2483This would be optimizable with startbranch=5, first=5, last=16, tail=16
2484and should turn into:
2485
2486 1: CURLYM[1] {1,32767}(18)
2487 5: TRIE(16)
2488 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2489 <ac>
2490 <ad>
2491 <ab>
2492 16: SUCCEED(0)
2493 17: NOTHING(18)
2494 18: END(0)
2495
2496Cases where tail != last would be like /(?foo|bar)baz/:
2497
2498 1: BRANCH(4)
2499 2: EXACT <foo>(8)
2500 4: BRANCH(7)
2501 5: EXACT <bar>(8)
2502 7: TAIL(8)
2503 8: EXACT <baz>(10)
2504 10: END(0)
2505
2506which would be optimizable with startbranch=1, first=1, last=7, tail=8
2507and would end up looking like:
2508
2509 1: TRIE(8)
2510 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2511 <foo>
2512 <bar>
2513 7: TAIL(8)
2514 8: EXACT <baz>(10)
2515 10: END(0)
2516
c80e42f3 2517 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
2518
2519is the recommended Unicode-aware way of saying
2520
2521 *(d++) = uv;
2522*/
2523
fab2782b 2524#define TRIE_STORE_REVCHAR(val) \
786e8c11 2525 STMT_START { \
73031816 2526 if (UTF) { \
668fcfea 2527 SV *zlopp = newSV(UTF8_MAXBYTES); \
88c9ea1e 2528 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
1d84a256
DM
2529 unsigned char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
2530 *kapow = '\0'; \
73031816
NC
2531 SvCUR_set(zlopp, kapow - flrbbbbb); \
2532 SvPOK_on(zlopp); \
2533 SvUTF8_on(zlopp); \
2534 av_push(revcharmap, zlopp); \
2535 } else { \
fab2782b 2536 char ooooff = (char)val; \
73031816
NC
2537 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2538 } \
2539 } STMT_END
786e8c11 2540
914a25d5
KW
2541/* This gets the next character from the input, folding it if not already
2542 * folded. */
2543#define TRIE_READ_CHAR STMT_START { \
2544 wordlen++; \
2545 if ( UTF ) { \
2546 /* if it is UTF then it is either already folded, or does not need \
2547 * folding */ \
1c1d615a 2548 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
2549 } \
2550 else if (folder == PL_fold_latin1) { \
7d006b13
KW
2551 /* This folder implies Unicode rules, which in the range expressible \
2552 * by not UTF is the lower case, with the two exceptions, one of \
2553 * which should have been taken care of before calling this */ \
2554 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2555 uvc = toLOWER_L1(*uc); \
2556 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2557 len = 1; \
914a25d5
KW
2558 } else { \
2559 /* raw data, will be folded later if needed */ \
2560 uvc = (U32)*uc; \
2561 len = 1; \
2562 } \
786e8c11
YO
2563} STMT_END
2564
2565
2566
2567#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2568 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
00195859 2569 U32 ging = TRIE_LIST_LEN( state ) * 2; \
f9003953 2570 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
00195859 2571 TRIE_LIST_LEN( state ) = ging; \
786e8c11
YO
2572 } \
2573 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2574 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2575 TRIE_LIST_CUR( state )++; \
2576} STMT_END
07be1b83 2577
786e8c11 2578#define TRIE_LIST_NEW(state) STMT_START { \
d09f14bf 2579 Newx( trie->states[ state ].trans.list, \
786e8c11
YO
2580 4, reg_trie_trans_le ); \
2581 TRIE_LIST_CUR( state ) = 1; \
2582 TRIE_LIST_LEN( state ) = 4; \
2583} STMT_END
07be1b83 2584
786e8c11
YO
2585#define TRIE_HANDLE_WORD(state) STMT_START { \
2586 U16 dupe= trie->states[ state ].wordnum; \
2587 regnode * const noper_next = regnext( noper ); \
2588 \
786e8c11
YO
2589 DEBUG_r({ \
2590 /* store the word for dumping */ \
2591 SV* tmp; \
2592 if (OP(noper) != NOTHING) \
740cce10 2593 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 2594 else \
740cce10 2595 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 2596 av_push( trie_words, tmp ); \
786e8c11
YO
2597 }); \
2598 \
2599 curword++; \
2e64971a
DM
2600 trie->wordinfo[curword].prev = 0; \
2601 trie->wordinfo[curword].len = wordlen; \
2602 trie->wordinfo[curword].accept = state; \
786e8c11
YO
2603 \
2604 if ( noper_next < tail ) { \
2605 if (!trie->jump) \
538e84ed
KW
2606 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2607 sizeof(U16) ); \
7f69552c 2608 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
2609 if (!jumper) \
2610 jumper = noper_next; \
2611 if (!nextbranch) \
2612 nextbranch= regnext(cur); \
2613 } \
2614 \
2615 if ( dupe ) { \
2e64971a
DM
2616 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2617 /* chain, so that when the bits of chain are later */\
2618 /* linked together, the dups appear in the chain */\
2619 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2620 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
2621 } else { \
2622 /* we haven't inserted this word yet. */ \
2623 trie->states[ state ].wordnum = curword; \
2624 } \
2625} STMT_END
07be1b83 2626
3dab1dad 2627
786e8c11
YO
2628#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2629 ( ( base + charid >= ucharcount \
2630 && base + charid < ubound \
2631 && state == trie->trans[ base - ucharcount + charid ].check \
2632 && trie->trans[ base - ucharcount + charid ].next ) \
2633 ? trie->trans[ base - ucharcount + charid ].next \
2634 : ( state==1 ? special : 0 ) \
2635 )
3dab1dad 2636
8bcafbf4
YO
2637#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2638STMT_START { \
2639 TRIE_BITMAP_SET(trie, uvc); \
2640 /* store the folded codepoint */ \
2641 if ( folder ) \
2642 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2643 \
2644 if ( !UTF ) { \
2645 /* store first byte of utf8 representation of */ \
2646 /* variant codepoints */ \
2647 if (! UVCHR_IS_INVARIANT(uvc)) { \
2648 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2649 } \
2650 } \
2651} STMT_END
786e8c11
YO
2652#define MADE_TRIE 1
2653#define MADE_JUMP_TRIE 2
2654#define MADE_EXACT_TRIE 4
3dab1dad 2655
a3621e74 2656STATIC I32
538e84ed
KW
2657S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2658 regnode *first, regnode *last, regnode *tail,
2659 U32 word_count, U32 flags, U32 depth)
a3621e74
YO
2660{
2661 /* first pass, loop through and scan words */
2662 reg_trie_data *trie;
55eed653 2663 HV *widecharmap = NULL;
2b8b4781 2664 AV *revcharmap = newAV();
a3621e74 2665 regnode *cur;
a3621e74
YO
2666 STRLEN len = 0;
2667 UV uvc = 0;
2668 U16 curword = 0;
2669 U32 next_alloc = 0;
786e8c11
YO
2670 regnode *jumper = NULL;
2671 regnode *nextbranch = NULL;
7f69552c 2672 regnode *convert = NULL;
2e64971a 2673 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 2674 /* we just use folder as a flag in utf8 */
1e696034 2675 const U8 * folder = NULL;
a3621e74 2676
3a611511
YO
2677 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2678 * which stands for one trie structure, one hash, optionally followed
2679 * by two arrays */
2b8b4781 2680#ifdef DEBUGGING
3a611511 2681 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2b8b4781
NC
2682 AV *trie_words = NULL;
2683 /* along with revcharmap, this only used during construction but both are
2684 * useful during debugging so we store them in the struct when debugging.
8e11feef 2685 */
2b8b4781 2686#else
cf78de0b 2687 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 2688 STRLEN trie_charcount=0;
3dab1dad 2689#endif
2b8b4781 2690 SV *re_trie_maxbuff;
a3621e74 2691 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2692
2693 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
2694#ifndef DEBUGGING
2695 PERL_UNUSED_ARG(depth);
2696#endif
a3621e74 2697
1e696034 2698 switch (flags) {
f6b4b99d 2699 case EXACT: case EXACT_ONLY8: case EXACTL: break;
89829bb5 2700 case EXACTFAA:
0ea669f4 2701 case EXACTFUP:
a4525e78
KW
2702 case EXACTFU:
2703 case EXACTFLU8: folder = PL_fold_latin1; break;
1e696034 2704 case EXACTF: folder = PL_fold; break;
fab2782b 2705 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
2706 }
2707
c944940b 2708 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 2709 trie->refcount = 1;
3dab1dad 2710 trie->startstate = 1;
786e8c11 2711 trie->wordcount = word_count;
f8fc2ecf 2712 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 2713 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
f6b4b99d 2714 if (flags == EXACT || flags == EXACT_ONLY8 || flags == EXACTL)
c944940b 2715 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
2716 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2717 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2718
a3621e74 2719 DEBUG_r({
2b8b4781 2720 trie_words = newAV();
a3621e74 2721 });
a3621e74 2722
3b89859a 2723 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, GV_ADD);
316ebaf2 2724 assert(re_trie_maxbuff);
a3621e74 2725 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 2726 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 2727 }
df826430 2728 DEBUG_TRIE_COMPILE_r({
6ad9a8ab 2729 Perl_re_indentf( aTHX_
cb41e5d6
YO
2730 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2731 depth+1,
88f063b4 2732 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
538e84ed 2733 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
3dab1dad 2734 });
538e84ed 2735
7f69552c
YO
2736 /* Find the node we are going to overwrite */
2737 if ( first == startbranch && OP( last ) != BRANCH ) {
2738 /* whole branch chain */
2739 convert = first;
2740 } else {
2741 /* branch sub-chain */
2742 convert = NEXTOPER( first );
2743 }
538e84ed 2744
a3621e74
YO
2745 /* -- First loop and Setup --
2746
2747 We first traverse the branches and scan each word to determine if it
2748 contains widechars, and how many unique chars there are, this is
2749 important as we have to build a table with at least as many columns as we
2750 have unique chars.
2751
2752 We use an array of integers to represent the character codes 0..255
538e84ed
KW
2753 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2754 the native representation of the character value as the key and IV's for
2755 the coded index.
a3621e74
YO
2756
2757 *TODO* If we keep track of how many times each character is used we can
2758 remap the columns so that the table compression later on is more
3b753521 2759 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
2760 middle and the least common are on the outside. IMO this would be better
2761 than a most to least common mapping as theres a decent chance the most
2762 common letter will share a node with the least common, meaning the node
486ec47a 2763 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
2764 case is when we have the least common nodes twice.
2765
2766 */
2767
a3621e74 2768 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 2769 regnode *noper = NEXTOPER( cur );
944e05e3
YO
2770 const U8 *uc;
2771 const U8 *e;
bc031a7d 2772 int foldlen = 0;
07be1b83 2773 U32 wordlen = 0; /* required init */
bc031a7d
KW
2774 STRLEN minchars = 0;
2775 STRLEN maxchars = 0;
538e84ed
KW
2776 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2777 bitmap?*/
a3621e74 2778
3dab1dad 2779 if (OP(noper) == NOTHING) {
20ed8c88
YO
2780 /* skip past a NOTHING at the start of an alternation
2781 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2782 */
df826430 2783 regnode *noper_next= regnext(noper);
944e05e3
YO
2784 if (noper_next < tail)
2785 noper= noper_next;
2786 }
2787
f6b4b99d
KW
2788 if ( noper < tail
2789 && ( OP(noper) == flags
2790 || (flags == EXACT && OP(noper) == EXACT_ONLY8)
a9f8c7ac 2791 || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
0ea669f4 2792 || OP(noper) == EXACTFUP))))
f6b4b99d 2793 {
944e05e3
YO
2794 uc= (U8*)STRING(noper);
2795 e= uc + STR_LEN(noper);
2796 } else {
2797 trie->minlen= 0;
2798 continue;
3dab1dad 2799 }
df826430 2800
944e05e3 2801
fab2782b 2802 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2803 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2804 regardless of encoding */
0ea669f4 2805 if (OP( noper ) == EXACTFUP) {
fab2782b 2806 /* false positives are ok, so just set this */
0dc4a61d 2807 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2808 }
2809 }
dca5fc4c 2810
bc031a7d
KW
2811 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2812 branch */
3dab1dad 2813 TRIE_CHARCOUNT(trie)++;
a3621e74 2814 TRIE_READ_CHAR;
645de4ce 2815
bc031a7d
KW
2816 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2817 * is in effect. Under /i, this character can match itself, or
2818 * anything that folds to it. If not under /i, it can match just
2819 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2820 * all fold to k, and all are single characters. But some folds
2821 * expand to more than one character, so for example LATIN SMALL
2822 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2823 * the string beginning at 'uc' is 'ffi', it could be matched by
2824 * three characters, or just by the one ligature character. (It
2825 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2826 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2827 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2828 * match.) The trie needs to know the minimum and maximum number
2829 * of characters that could match so that it can use size alone to
2830 * quickly reject many match attempts. The max is simple: it is
2831 * the number of folded characters in this branch (since a fold is
2832 * never shorter than what folds to it. */
2833
2834 maxchars++;
2835
2836 /* And the min is equal to the max if not under /i (indicated by
2837 * 'folder' being NULL), or there are no multi-character folds. If
2838 * there is a multi-character fold, the min is incremented just
2839 * once, for the character that folds to the sequence. Each
2840 * character in the sequence needs to be added to the list below of
2841 * characters in the trie, but we count only the first towards the
2842 * min number of characters needed. This is done through the
2843 * variable 'foldlen', which is returned by the macros that look
2844 * for these sequences as the number of bytes the sequence
2845 * occupies. Each time through the loop, we decrement 'foldlen' by
2846 * how many bytes the current char occupies. Only when it reaches
2847 * 0 do we increment 'minchars' or look for another multi-character
2848 * sequence. */
2849 if (folder == NULL) {
2850 minchars++;
2851 }
2852 else if (foldlen > 0) {
2853 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
645de4ce
KW
2854 }
2855 else {
bc031a7d
KW
2856 minchars++;
2857
2858 /* See if *uc is the beginning of a multi-character fold. If
2859 * so, we decrement the length remaining to look at, to account
2860 * for the current character this iteration. (We can use 'uc'
2861 * instead of the fold returned by TRIE_READ_CHAR because for
2862 * non-UTF, the latin1_safe macro is smart enough to account
2863 * for all the unfolded characters, and because for UTF, the
2864 * string will already have been folded earlier in the
2865 * compilation process */
2866 if (UTF) {
2867 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2868 foldlen -= UTF8SKIP(uc);
645de4ce
KW
2869 }
2870 }
bc031a7d
KW
2871 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2872 foldlen--;
2873 }
645de4ce 2874 }
bc031a7d
KW
2875
2876 /* The current character (and any potential folds) should be added
2877 * to the possible matching characters for this position in this
2878 * branch */
a3621e74 2879 if ( uvc < 256 ) {
fab2782b
YO
2880 if ( folder ) {
2881 U8 folded= folder[ (U8) uvc ];
2882 if ( !trie->charmap[ folded ] ) {
2883 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2884 TRIE_STORE_REVCHAR( folded );
2885 }
2886 }
a3621e74
YO
2887 if ( !trie->charmap[ uvc ] ) {
2888 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2889 TRIE_STORE_REVCHAR( uvc );
a3621e74 2890 }
02daf0ab 2891 if ( set_bit ) {
62012aee
KW
2892 /* store the codepoint in the bitmap, and its folded
2893 * equivalent. */
8bcafbf4 2894 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
02daf0ab
YO
2895 set_bit = 0; /* We've done our bit :-) */
2896 }
a3621e74 2897 } else {
bc031a7d
KW
2898
2899 /* XXX We could come up with the list of code points that fold
2900 * to this using PL_utf8_foldclosures, except not for
2901 * multi-char folds, as there may be multiple combinations
2902 * there that could work, which needs to wait until runtime to
2903 * resolve (The comment about LIGATURE FFI above is such an
2904 * example */
2905
a3621e74 2906 SV** svpp;
55eed653
NC
2907 if ( !widecharmap )
2908 widecharmap = newHV();
a3621e74 2909
55eed653 2910 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
2911
2912 if ( !svpp )
147e3846 2913 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
a3621e74
YO
2914
2915 if ( !SvTRUE( *svpp ) ) {
2916 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 2917 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
2918 }
2919 }
bc031a7d
KW
2920 } /* end loop through characters in this branch of the trie */
2921
2922 /* We take the min and max for this branch and combine to find the min
2923 * and max for all branches processed so far */
3dab1dad 2924 if( cur == first ) {
bc031a7d
KW
2925 trie->minlen = minchars;
2926 trie->maxlen = maxchars;
2927 } else if (minchars < trie->minlen) {
2928 trie->minlen = minchars;
2929 } else if (maxchars > trie->maxlen) {
2930 trie->maxlen = maxchars;
fab2782b 2931 }
a3621e74
YO
2932 } /* end first pass */
2933 DEBUG_TRIE_COMPILE_r(
6ad9a8ab 2934 Perl_re_indentf( aTHX_
cb41e5d6
YO
2935 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2936 depth+1,
55eed653 2937 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
2938 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2939 (int)trie->minlen, (int)trie->maxlen )
a3621e74 2940 );
a3621e74
YO
2941
2942 /*
2943 We now know what we are dealing with in terms of unique chars and
2944 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
2945 representation using a flat table will take. If it's over a reasonable
2946 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
2947 conservative but potentially much slower representation using an array
2948 of lists.
2949
2950 At the end we convert both representations into the same compressed
2951 form that will be used in regexec.c for matching with. The latter
2952 is a form that cannot be used to construct with but has memory
2953 properties similar to the list form and access properties similar
2954 to the table form making it both suitable for fast searches and
2955 small enough that its feasable to store for the duration of a program.
2956
2957 See the comment in the code where the compressed table is produced
2958 inplace from the flat tabe representation for an explanation of how
2959 the compression works.
2960
2961 */
2962
2963
2e64971a
DM
2964 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2965 prev_states[1] = 0;
2966
538e84ed
KW
2967 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2968 > SvIV(re_trie_maxbuff) )
2969 {
a3621e74
YO
2970 /*
2971 Second Pass -- Array Of Lists Representation
2972
2973 Each state will be represented by a list of charid:state records
2974 (reg_trie_trans_le) the first such element holds the CUR and LEN
2975 points of the allocated array. (See defines above).
2976
2977 We build the initial structure using the lists, and then convert
2978 it into the compressed table form which allows faster lookups
2979 (but cant be modified once converted).
a3621e74
YO
2980 */
2981
a3621e74
YO
2982 STRLEN transcount = 1;
2983
6ad9a8ab 2984 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
cb41e5d6 2985 depth+1));
686b73d4 2986
c944940b
JH
2987 trie->states = (reg_trie_state *)
2988 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2989 sizeof(reg_trie_state) );
a3621e74
YO
2990 TRIE_LIST_NEW(1);
2991 next_alloc = 2;
2992
2993 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2994
df826430 2995 regnode *noper = NEXTOPER( cur );
c445ea15
AL
2996 U32 state = 1; /* required init */
2997 U16 charid = 0; /* sanity init */
07be1b83 2998 U32 wordlen = 0; /* required init */
c445ea15 2999
df826430
YO
3000 if (OP(noper) == NOTHING) {
3001 regnode *noper_next= regnext(noper);
944e05e3
YO
3002 if (noper_next < tail)
3003 noper= noper_next;
df826430
YO
3004 }
3005
f6b4b99d
KW
3006 if ( noper < tail
3007 && ( OP(noper) == flags
3008 || (flags == EXACT && OP(noper) == EXACT_ONLY8)
a9f8c7ac 3009 || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
0ea669f4 3010 || OP(noper) == EXACTFUP))))
f6b4b99d 3011 {
944e05e3
YO
3012 const U8 *uc= (U8*)STRING(noper);
3013 const U8 *e= uc + STR_LEN(noper);
3014
786e8c11 3015 for ( ; uc < e ; uc += len ) {
c445ea15 3016
786e8c11 3017 TRIE_READ_CHAR;
c445ea15 3018
786e8c11
YO
3019 if ( uvc < 256 ) {
3020 charid = trie->charmap[ uvc ];
c445ea15 3021 } else {
538e84ed
KW
3022 SV** const svpp = hv_fetch( widecharmap,
3023 (char*)&uvc,
3024 sizeof( UV ),
3025 0);
786e8c11
YO
3026 if ( !svpp ) {
3027 charid = 0;
3028 } else {
3029 charid=(U16)SvIV( *svpp );
3030 }
c445ea15 3031 }
538e84ed
KW
3032 /* charid is now 0 if we dont know the char read, or
3033 * nonzero if we do */
786e8c11 3034 if ( charid ) {
a3621e74 3035
786e8c11
YO
3036 U16 check;
3037 U32 newstate = 0;
a3621e74 3038
786e8c11
YO
3039 charid--;
3040 if ( !trie->states[ state ].trans.list ) {
3041 TRIE_LIST_NEW( state );
c445ea15 3042 }
538e84ed
KW
3043 for ( check = 1;
3044 check <= TRIE_LIST_USED( state );
3045 check++ )
3046 {
3047 if ( TRIE_LIST_ITEM( state, check ).forid
3048 == charid )
3049 {
786e8c11
YO
3050 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3051 break;
3052 }
3053 }
3054 if ( ! newstate ) {
3055 newstate = next_alloc++;
2e64971a 3056 prev_states[newstate] = state;
786e8c11
YO
3057 TRIE_LIST_PUSH( state, charid, newstate );
3058 transcount++;
3059 }
3060 state = newstate;
3061 } else {
147e3846 3062 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
c445ea15 3063 }
a28509cc 3064 }
c445ea15 3065 }
3dab1dad 3066 TRIE_HANDLE_WORD(state);
a3621e74
YO
3067
3068 } /* end second pass */
3069
1e2e3d02 3070 /* next alloc is the NEXT state to be allocated */
538e84ed 3071 trie->statecount = next_alloc;
c944940b
JH
3072 trie->states = (reg_trie_state *)
3073 PerlMemShared_realloc( trie->states,
3074 next_alloc
3075 * sizeof(reg_trie_state) );
a3621e74 3076
3dab1dad 3077 /* and now dump it out before we compress it */
2b8b4781
NC
3078 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3079 revcharmap, next_alloc,
3080 depth+1)
1e2e3d02 3081 );
a3621e74 3082
c944940b
JH
3083 trie->trans = (reg_trie_trans *)
3084 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
3085 {
3086 U32 state;
a3621e74
YO
3087 U32 tp = 0;
3088 U32 zp = 0;
3089
3090
3091 for( state=1 ; state < next_alloc ; state ++ ) {
3092 U32 base=0;
3093
3094 /*
3095 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3096 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
a3621e74
YO
3097 );
3098 */
3099
3100 if (trie->states[state].trans.list) {
3101 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3102 U16 maxid=minid;
a28509cc 3103 U16 idx;
a3621e74
YO
3104
3105 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
3106 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3107 if ( forid < minid ) {
3108 minid=forid;
3109 } else if ( forid > maxid ) {
3110 maxid=forid;
3111 }
a3621e74
YO
3112 }
3113 if ( transcount < tp + maxid - minid + 1) {
3114 transcount *= 2;
c944940b
JH
3115 trie->trans = (reg_trie_trans *)
3116 PerlMemShared_realloc( trie->trans,
446bd890
NC
3117 transcount
3118 * sizeof(reg_trie_trans) );
538e84ed
KW
3119 Zero( trie->trans + (transcount / 2),
3120 transcount / 2,
3121 reg_trie_trans );
a3621e74
YO
3122 }
3123 base = trie->uniquecharcount + tp - minid;
3124 if ( maxid == minid ) {
3125 U32 set = 0;
3126 for ( ; zp < tp ; zp++ ) {
3127 if ( ! trie->trans[ zp ].next ) {
3128 base = trie->uniquecharcount + zp - minid;
538e84ed
KW
3129 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3130 1).newstate;
a3621e74
YO
3131 trie->trans[ zp ].check = state;
3132 set = 1;
3133 break;
3134 }
3135 }
3136 if ( !set ) {
538e84ed
KW
3137 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3138 1).newstate;
a3621e74
YO
3139 trie->trans[ tp ].check = state;
3140 tp++;
3141 zp = tp;
3142 }
3143 } else {
3144 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
538e84ed
KW
3145 const U32 tid = base
3146 - trie->uniquecharcount
3147 + TRIE_LIST_ITEM( state, idx ).forid;
3148 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3149 idx ).newstate;
a3621e74
YO
3150 trie->trans[ tid ].check = state;
3151 }
3152 tp += ( maxid - minid + 1 );
3153 }
3154 Safefree(trie->states[ state ].trans.list);
3155 }
3156 /*
3157 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3158 Perl_re_printf( aTHX_ " base: %d\n",base);
a3621e74
YO
3159 );
3160 */
3161 trie->states[ state ].trans.base=base;
3162 }
cc601c31 3163 trie->lasttrans = tp + 1;
a3621e74
YO
3164 }
3165 } else {
3166 /*
3167 Second Pass -- Flat Table Representation.
3168
b423522f
KW
3169 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3170 each. We know that we will need Charcount+1 trans at most to store
3171 the data (one row per char at worst case) So we preallocate both
3172 structures assuming worst case.
a3621e74
YO
3173
3174 We then construct the trie using only the .next slots of the entry
3175 structs.
3176
b423522f
KW
3177 We use the .check field of the first entry of the node temporarily
3178 to make compression both faster and easier by keeping track of how
3179 many non zero fields are in the node.
a3621e74
YO
3180
3181 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3182 transition.
3183
b423522f
KW
3184 There are two terms at use here: state as a TRIE_NODEIDX() which is
3185 a number representing the first entry of the node, and state as a
3186 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3187 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3188 if there are 2 entrys per node. eg:
a3621e74
YO
3189
3190 A B A B
3191 1. 2 4 1. 3 7
3192 2. 0 3 3. 0 5
3193 3. 0 0 5. 0 0
3194 4. 0 0 7. 0 0
3195
b423522f
KW
3196 The table is internally in the right hand, idx form. However as we
3197 also have to deal with the states array which is indexed by nodenum
3198 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
3199
3200 */
6ad9a8ab 3201 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
cb41e5d6 3202 depth+1));
3dab1dad 3203
c944940b
JH
3204 trie->trans = (reg_trie_trans *)
3205 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3206 * trie->uniquecharcount + 1,
3207 sizeof(reg_trie_trans) );
3208 trie->states = (reg_trie_state *)
3209 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3210 sizeof(reg_trie_state) );
a3621e74
YO
3211 next_alloc = trie->uniquecharcount + 1;
3212
3dab1dad 3213
a3621e74
YO
3214 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3215
df826430 3216 regnode *noper = NEXTOPER( cur );
a3621e74
YO
3217
3218 U32 state = 1; /* required init */
3219
3220 U16 charid = 0; /* sanity init */
3221 U32 accept_state = 0; /* sanity init */
a3621e74 3222
07be1b83 3223 U32 wordlen = 0; /* required init */
a3621e74 3224
df826430
YO
3225 if (OP(noper) == NOTHING) {
3226 regnode *noper_next= regnext(noper);
944e05e3
YO
3227 if (noper_next < tail)
3228 noper= noper_next;
df826430 3229 }
fab2782b 3230
f6b4b99d
KW
3231 if ( noper < tail
3232 && ( OP(noper) == flags
3233 || (flags == EXACT && OP(noper) == EXACT_ONLY8)
a9f8c7ac 3234 || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
0ea669f4 3235 || OP(noper) == EXACTFUP))))
f6b4b99d 3236 {
944e05e3
YO
3237 const U8 *uc= (U8*)STRING(noper);
3238 const U8 *e= uc + STR_LEN(noper);
3239
786e8c11 3240 for ( ; uc < e ; uc += len ) {
a3621e74 3241
786e8c11 3242 TRIE_READ_CHAR;
a3621e74 3243
786e8c11
YO
3244 if ( uvc < 256 ) {
3245 charid = trie->charmap[ uvc ];
3246 } else {
538e84ed
KW
3247 SV* const * const svpp = hv_fetch( widecharmap,
3248 (char*)&uvc,
3249 sizeof( UV ),
3250 0);
786e8c11 3251 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 3252 }
786e8c11
YO
3253 if ( charid ) {
3254 charid--;
3255 if ( !trie->trans[ state + charid ].next ) {
3256 trie->trans[ state + charid ].next = next_alloc;
3257 trie->trans[ state ].check++;
2e64971a
DM
3258 prev_states[TRIE_NODENUM(next_alloc)]
3259 = TRIE_NODENUM(state);
786e8c11
YO
3260 next_alloc += trie->uniquecharcount;
3261 }
3262 state = trie->trans[ state + charid ].next;
3263 } else {
147e3846 3264 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
786e8c11 3265 }
538e84ed
KW
3266 /* charid is now 0 if we dont know the char read, or
3267 * nonzero if we do */
a3621e74 3268 }
a3621e74 3269 }
3dab1dad
YO
3270 accept_state = TRIE_NODENUM( state );
3271 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
3272
3273 } /* end second pass */
3274
3dab1dad 3275 /* and now dump it out before we compress it */
2b8b4781
NC
3276 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3277 revcharmap,
3278 next_alloc, depth+1));
a3621e74 3279
a3621e74
YO
3280 {
3281 /*
3282 * Inplace compress the table.*
3283
3284 For sparse data sets the table constructed by the trie algorithm will
3285 be mostly 0/FAIL transitions or to put it another way mostly empty.
3286 (Note that leaf nodes will not contain any transitions.)
3287
3288 This algorithm compresses the tables by eliminating most such
3289 transitions, at the cost of a modest bit of extra work during lookup:
3290
3291 - Each states[] entry contains a .base field which indicates the
3292 index in the state[] array wheres its transition data is stored.
3293
3b753521 3294 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
3295
3296 - If .base is nonzero then charid is added to it to find an entry in
3297 the trans array.
3298
3299 -If trans[states[state].base+charid].check!=state then the
3300 transition is taken to be a 0/Fail transition. Thus if there are fail
3301 transitions at the front of the node then the .base offset will point
3302 somewhere inside the previous nodes data (or maybe even into a node
3303 even earlier), but the .check field determines if the transition is
3304 valid.
3305
786e8c11 3306 XXX - wrong maybe?
a3621e74 3307 The following process inplace converts the table to the compressed
3b753521 3308 table: We first do not compress the root node 1,and mark all its
a3621e74 3309 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
3310 allows us to do a DFA construction from the compressed table later,
3311 and ensures that any .base pointers we calculate later are greater
3312 than 0.
a3621e74
YO
3313
3314 - We set 'pos' to indicate the first entry of the second node.
3315
3316 - We then iterate over the columns of the node, finding the first and
3317 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3318 and set the .check pointers accordingly, and advance pos
3319 appropriately and repreat for the next node. Note that when we copy
3320 the next pointers we have to convert them from the original
3321 NODEIDX form to NODENUM form as the former is not valid post
3322 compression.
3323
3324 - If a node has no transitions used we mark its base as 0 and do not
3325 advance the pos pointer.
3326
3327 - If a node only has one transition we use a second pointer into the
3328 structure to fill in allocated fail transitions from other states.
3329 This pointer is independent of the main pointer and scans forward
3330 looking for null transitions that are allocated to a state. When it
3331 finds one it writes the single transition into the "hole". If the
786e8c11 3332 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
3333
3334 - Once compressed we can Renew/realloc the structures to release the
3335 excess space.
3336
3337 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3338 specifically Fig 3.47 and the associated pseudocode.
3339
3340 demq
3341 */
a3b680e6 3342 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 3343 U32 state, charid;
a3621e74 3344 U32 pos = 0, zp=0;
1e2e3d02 3345 trie->statecount = laststate;
a3621e74
YO
3346
3347 for ( state = 1 ; state < laststate ; state++ ) {
3348 U8 flag = 0;
a28509cc
AL
3349 const U32 stateidx = TRIE_NODEIDX( state );
3350 const U32 o_used = trie->trans[ stateidx ].check;
3351 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
3352 trie->trans[ stateidx ].check = 0;
3353
538e84ed
KW
3354 for ( charid = 0;
3355 used && charid < trie->uniquecharcount;
3356 charid++ )
3357 {
a3621e74
YO
3358 if ( flag || trie->trans[ stateidx + charid ].next ) {
3359 if ( trie->trans[ stateidx + charid ].next ) {
3360 if (o_used == 1) {
3361 for ( ; zp < pos ; zp++ ) {
3362 if ( ! trie->trans[ zp ].next ) {
3363 break;
3364 }
3365 }
538e84ed
KW
3366 trie->states[ state ].trans.base
3367 = zp
3368 + trie->uniquecharcount
3369 - charid ;
3370 trie->trans[ zp ].next
3371 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3372 + charid ].next );
a3621e74
YO
3373 trie->trans[ zp ].check = state;
3374 if ( ++zp > pos ) pos = zp;
3375 break;
3376 }
3377 used--;
3378 }
3379 if ( !flag ) {
3380 flag = 1;
538e84ed
KW
3381 trie->states[ state ].trans.base
3382 = pos + trie->uniquecharcount - charid ;
a3621e74 3383 }
538e84ed
KW
3384 trie->trans[ pos ].next
3385 = SAFE_TRIE_NODENUM(
3386 trie->trans[ stateidx + charid ].next );
a3621e74
YO
3387 trie->trans[ pos ].check = state;
3388 pos++;
3389 }
3390 }
3391 }
cc601c31 3392 trie->lasttrans = pos + 1;
c944940b
JH
3393 trie->states = (reg_trie_state *)
3394 PerlMemShared_realloc( trie->states, laststate
3395 * sizeof(reg_trie_state) );
a3621e74 3396 DEBUG_TRIE_COMPILE_MORE_r(
147e3846 3397 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
cb41e5d6 3398 depth+1,
538e84ed
KW
3399 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3400 + 1 ),
3401 (IV)next_alloc,
3402 (IV)pos,
3403 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
a3621e74
YO
3404 );
3405
3406 } /* end table compress */
3407 }
1e2e3d02 3408 DEBUG_TRIE_COMPILE_MORE_r(
147e3846 3409 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
cb41e5d6 3410 depth+1,
1e2e3d02
YO
3411 (UV)trie->statecount,
3412 (UV)trie->lasttrans)
3413 );
cc601c31 3414 /* resize the trans array to remove unused space */
c944940b
JH
3415 trie->trans = (reg_trie_trans *)
3416 PerlMemShared_realloc( trie->trans, trie->lasttrans
3417 * sizeof(reg_trie_trans) );