This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add more checking for regnode offset overflowing
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
c277df42 77#define REG_COMP_C
54df2634
NC
78#ifdef PERL_IN_XSUB_RE
79# include "re_comp.h"
afda64e8 80EXTERN_C const struct regexp_engine my_reg_engine;
54df2634
NC
81#else
82# include "regcomp.h"
83#endif
a687059c 84
f7e03a10 85#include "dquote_inline.h"
b992490d 86#include "invlist_inline.h"
1b0f46bf 87#include "unicode_constants.h"
04e98a4d 88
538e84ed
KW
89#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
90 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
f12c0118
KW
91#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
92 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 93#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 94#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 95
a687059c
LW
96#ifndef STATIC
97#define STATIC static
98#endif
99
3f910e62
YO
100/* this is a chain of data about sub patterns we are processing that
101 need to be handled separately/specially in study_chunk. Its so
102 we can simulate recursion without losing state. */
103struct scan_frame;
104typedef struct scan_frame {
105 regnode *last_regnode; /* last node to process in this frame */
106 regnode *next_regnode; /* next node to process when last is reached */
107 U32 prev_recursed_depth;
108 I32 stopparen; /* what stopparen do we use */
3f910e62
YO
109
110 struct scan_frame *this_prev_frame; /* this previous frame */
111 struct scan_frame *prev_frame; /* previous frame */
112 struct scan_frame *next_frame; /* next frame */
113} scan_frame;
114
f4ae5a27
KW
115/* Certain characters are output as a sequence with the first being a
116 * backslash. */
eb7486de 117#define isBACKSLASHED_PUNCT(c) strchr("-[]\\^", c)
f4ae5a27
KW
118
119
09b2b2e6 120struct RExC_state_t {
514a91f1
DM
121 U32 flags; /* RXf_* are we folding, multilining? */
122 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 123 char *precomp; /* uncompiled string. */
711b303b 124 char *precomp_end; /* pointer to end of uncompiled string. */
288b8c02 125 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf 126 regexp *rx; /* perl core regexp structure */
538e84ed
KW
127 regexp_internal *rxi; /* internal data for regexp object
128 pprivate field */
fac92740 129 char *start; /* Start of input for compile */
830247a4
IZ
130 char *end; /* End of input for compile */
131 char *parse; /* Input-scan pointer. */
51684e3c
KW
132 char *copy_start; /* start of copy of input within
133 constructed parse string */
134 char *copy_start_in_input; /* Position in input string
135 corresponding to copy_start */
ea3daa5d 136 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 137 regnode *emit_start; /* Start of emitted-code area */
f55b7b26 138 regnode_offset emit; /* Code-emit pointer */
830247a4
IZ
139 I32 naughty; /* How bad is this pattern? */
140 I32 sawback; /* Did we see \1, ...? */
141 U32 seen;
88f063b4
KW
142 SSize_t size; /* Number of regnode equivalents in
143 pattern */
7c932d07
KW
144
145 /* position beyond 'precomp' of the warning message furthest away from
146 * 'precomp'. During the parse, no warnings are raised for any problems
147 * earlier in the parse than this position. This works if warnings are
148 * raised the first time a given spot is parsed, and if only one
149 * independent warning is raised for any given spot */
150 Size_t latest_warn_offset;
151
152 I32 npar; /* Capture buffer count so far in the
153 parse, (OPEN) plus one. ("par" 0 is
154 the whole pattern)*/
155 I32 total_par; /* During initial parse, is either 0,
156 or -1; the latter indicating a
157 reparse is needed. After that pass,
158 it is what 'npar' became after the
159 pass. Hence, it being > 0 indicates
160 we are in a reparse situation */
538e84ed
KW
161 I32 nestroot; /* root parens we are in - used by
162 accept */
830247a4 163 I32 seen_zerolen;
f55b7b26
KW
164 regnode_offset *open_parens; /* offsets to open parens */
165 regnode_offset *close_parens; /* offsets to close parens */
d5a00e4a 166 regnode *end_op; /* END node in program */
02daf0ab
YO
167 I32 utf8; /* whether the pattern is utf8 or not */
168 I32 orig_utf8; /* whether the pattern was originally in utf8 */
169 /* XXX use this for future optimisation of case
170 * where pattern must be upgraded to utf8. */
e40e74fe
KW
171 I32 uni_semantics; /* If a d charset modifier should use unicode
172 rules, even if the pattern is not in
173 utf8 */
81714fb9 174 HV *paren_names; /* Paren names */
538e84ed 175
40d049e4 176 regnode **recurse; /* Recurse regops */
88f063b4 177 I32 recurse_count; /* Number of recurse regops we have generated */
4286711a 178 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
538e84ed 179 through */
09a65838 180 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 181 I32 in_lookbehind;
4624b182 182 I32 contains_locale;
bb3f3ed2 183 I32 override_recoding;
b6d67071
KW
184#ifdef EBCDIC
185 I32 recode_x_to_native;
186#endif
9d53c457 187 I32 in_multi_char_class;
1acab4c5 188 struct reg_code_blocks *code_blocks;/* positions of literal (?{})
68e2671b 189 within pattern */
b1603ef8 190 int code_index; /* next code_blocks[] slot */
ee273784 191 SSize_t maxlen; /* mininum possible number of chars in string to match */
3f910e62
YO
192 scan_frame *frame_head;
193 scan_frame *frame_last;
194 U32 frame_count;
7eec73eb 195 AV *warn_text;
945961fd 196 HV *unlexed_names;
dc6d7f5c 197#ifdef ADD_TO_REGEXEC
830247a4
IZ
198 char *starttry; /* -Dr: where regtry was called. */
199#define RExC_starttry (pRExC_state->starttry)
200#endif
d24ca0c5 201 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 202#ifdef DEBUGGING
be8e71aa 203 const char *lastparse;
3dab1dad 204 I32 lastnum;
1f1031fe 205 AV *paren_name_list; /* idx -> name */
d9a72fcc 206 U32 study_chunk_recursed_count;
c9f0d54c
YO
207 SV *mysv1;
208 SV *mysv2;
88f063b4 209
3dab1dad
YO
210#define RExC_lastparse (pRExC_state->lastparse)
211#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 212#define RExC_paren_name_list (pRExC_state->paren_name_list)
d9a72fcc 213#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
c9f0d54c
YO
214#define RExC_mysv (pRExC_state->mysv1)
215#define RExC_mysv1 (pRExC_state->mysv1)
216#define RExC_mysv2 (pRExC_state->mysv2)
217
3dab1dad 218#endif
bd0b62a6 219 bool seen_d_op;
911bd04e 220 bool strict;
da7cf1cc 221 bool study_started;
034602eb 222 bool in_script_run;
7c932d07 223 bool use_BRANCHJ;
09b2b2e6 224};
830247a4 225
e2509266 226#define RExC_flags (pRExC_state->flags)
514a91f1 227#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 228#define RExC_precomp (pRExC_state->precomp)
51684e3c
KW
229#define RExC_copy_start_in_input (pRExC_state->copy_start_in_input)
230#define RExC_copy_start_in_constructed (pRExC_state->copy_start)
711b303b 231#define RExC_precomp_end (pRExC_state->precomp_end)
288b8c02 232#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 233#define RExC_rx (pRExC_state->rx)
f8fc2ecf 234#define RExC_rxi (pRExC_state->rxi)
fac92740 235#define RExC_start (pRExC_state->start)
830247a4
IZ
236#define RExC_end (pRExC_state->end)
237#define RExC_parse (pRExC_state->parse)
7c932d07 238#define RExC_latest_warn_offset (pRExC_state->latest_warn_offset )
830247a4 239#define RExC_whilem_seen (pRExC_state->whilem_seen)
bd0b62a6
KW
240#define RExC_seen_d_op (pRExC_state->seen_d_op) /* Seen something that differs
241 under /d from /u ? */
512c0f5a 242
512c0f5a 243
7122b237 244#ifdef RE_TRACK_PATTERN_OFFSETS
44f36cd3 245# define RExC_offsets (RExC_rxi->u.offsets) /* I am not like the
538e84ed 246 others */
7122b237 247#endif
830247a4 248#define RExC_emit (pRExC_state->emit)
fac92740 249#define RExC_emit_start (pRExC_state->emit_start)
830247a4
IZ
250#define RExC_sawback (pRExC_state->sawback)
251#define RExC_seen (pRExC_state->seen)
252#define RExC_size (pRExC_state->size)
ee273784 253#define RExC_maxlen (pRExC_state->maxlen)
830247a4 254#define RExC_npar (pRExC_state->npar)
1533d1d2 255#define RExC_total_parens (pRExC_state->total_par)
e2e6a0f1 256#define RExC_nestroot (pRExC_state->nestroot)
830247a4 257#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 258#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 259#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 260#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
261#define RExC_open_parens (pRExC_state->open_parens)
262#define RExC_close_parens (pRExC_state->close_parens)
5bd2d46e 263#define RExC_end_op (pRExC_state->end_op)
81714fb9 264#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
265#define RExC_recurse (pRExC_state->recurse)
266#define RExC_recurse_count (pRExC_state->recurse_count)
09a65838 267#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
538e84ed
KW
268#define RExC_study_chunk_recursed_bytes \
269 (pRExC_state->study_chunk_recursed_bytes)
b57e4118 270#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 271#define RExC_contains_locale (pRExC_state->contains_locale)
b6d67071
KW
272#ifdef EBCDIC
273# define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
274#endif
9d53c457 275#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
3f910e62
YO
276#define RExC_frame_head (pRExC_state->frame_head)
277#define RExC_frame_last (pRExC_state->frame_last)
278#define RExC_frame_count (pRExC_state->frame_count)
67cdf558 279#define RExC_strict (pRExC_state->strict)
da7cf1cc 280#define RExC_study_started (pRExC_state->study_started)
7eec73eb 281#define RExC_warn_text (pRExC_state->warn_text)
034602eb 282#define RExC_in_script_run (pRExC_state->in_script_run)
7c932d07 283#define RExC_use_BRANCHJ (pRExC_state->use_BRANCHJ)
945961fd 284#define RExC_unlexed_names (pRExC_state->unlexed_names)
830247a4 285
99807a43
HS
286/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
287 * a flag to disable back-off on the fixed/floating substrings - if it's
288 * a high complexity pattern we assume the benefit of avoiding a full match
289 * is worth the cost of checking for the substrings even if they rarely help.
290 */
291#define RExC_naughty (pRExC_state->naughty)
292#define TOO_NAUGHTY (10)
293#define MARK_NAUGHTY(add) \
294 if (RExC_naughty < TOO_NAUGHTY) \
295 RExC_naughty += (add)
296#define MARK_NAUGHTY_EXP(exp, add) \
297 if (RExC_naughty < TOO_NAUGHTY) \
298 RExC_naughty += RExC_naughty / (exp) + (add)
cde0cee5 299
a687059c
LW
300#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
301#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
412f55bb 302 ((*s) == '{' && regcurly(s)))
a687059c
LW
303
304/*
305 * Flags to be passed up and down.
306 */
a687059c 307#define WORST 0 /* Worst case. */
c4ead770
KW
308#define HASWIDTH 0x01 /* Known to not match null strings, could match
309 non-null ones. */
fda99bee 310
e64f369d 311/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
312 * character. (There needs to be a case: in the switch statement in regexec.c
313 * for any node marked SIMPLE.) Note that this is not the same thing as
314 * REGNODE_SIMPLE */
fda99bee 315#define SIMPLE 0x02
e64f369d 316#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
317#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
318#define TRYAGAIN 0x10 /* Weeded out a declaration. */
c36361b2
KW
319#define RESTART_PARSE 0x20 /* Need to redo the parse */
320#define NEED_UTF8 0x40 /* In conjunction with RESTART_PARSE, need to
321 calcuate sizes as UTF-8 */
a687059c 322
3dab1dad
YO
323#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
324
07be1b83
YO
325/* whether trie related optimizations are enabled */
326#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
327#define TRIE_STUDY_OPT
786e8c11 328#define FULL_TRIE_STUDY
07be1b83
YO
329#define TRIE_STCLASS
330#endif
1de06328
YO
331
332
40d049e4
YO
333
334#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
335#define PBITVAL(paren) (1 << ((paren) & 7))
336#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
337#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
338#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
339
82a6ada4 340#define REQUIRE_UTF8(flagp) STMT_START { \
8d9c2815 341 if (!UTF) { \
c36361b2 342 *flagp = RESTART_PARSE|NEED_UTF8; \
f55b7b26 343 return 0; \
8d9c2815 344 } \
82a6ada4 345 } STMT_END
40d049e4 346
d63d75f5 347/* Change from /d into /u rules, and restart the parse. RExC_uni_semantics is
f6e8f31e
KW
348 * a flag that indicates we need to override /d with /u as a result of
349 * something in the pattern. It should only be used in regards to calling
350 * set_regex_charset() or get_regex_charse() */
512c0f5a
KW
351#define REQUIRE_UNI_RULES(flagp, restart_retval) \
352 STMT_START { \
353 if (DEPENDS_SEMANTICS) { \
512c0f5a
KW
354 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
355 RExC_uni_semantics = 1; \
b76e1c04 356 if (RExC_seen_d_op && LIKELY(! IN_PARENS_PASS)) { \
bd0b62a6
KW
357 /* No need to restart the parse if we haven't seen \
358 * anything that differs between /u and /d, and no need \
359 * to restart immediately if we're going to reparse \
360 * anyway to count parens */ \
c36361b2 361 *flagp |= RESTART_PARSE; \
512c0f5a 362 return restart_retval; \
d1be211c 363 } \
512c0f5a
KW
364 } \
365 } STMT_END
366
7c932d07
KW
367#define REQUIRE_BRANCHJ(flagp, restart_retval) \
368 STMT_START { \
369 RExC_use_BRANCHJ = 1; \
b76e1c04 370 if (LIKELY(! IN_PARENS_PASS)) { \
523a2924
KW
371 /* No need to restart the parse immediately if we're \
372 * going to reparse anyway to count parens */ \
373 *flagp |= RESTART_PARSE; \
374 return restart_retval; \
375 } \
7c932d07
KW
376 } STMT_END
377
b76e1c04
KW
378/* Until we have completed the parse, we leave RExC_total_parens at 0 or
379 * less. After that, it must always be positive, because the whole re is
380 * considered to be surrounded by virtual parens. Setting it to negative
381 * indicates there is some construct that needs to know the actual number of
382 * parens to be properly handled. And that means an extra pass will be
383 * required after we've counted them all */
384#define ALL_PARENS_COUNTED (RExC_total_parens > 0)
7c932d07 385#define REQUIRE_PARENS_PASS \
b76e1c04
KW
386 STMT_START { /* No-op if have completed a pass */ \
387 if (! ALL_PARENS_COUNTED) RExC_total_parens = -1; \
7c932d07 388 } STMT_END
b76e1c04
KW
389#define IN_PARENS_PASS (RExC_total_parens < 0)
390
7c932d07 391
f358228e
KW
392/* This is used to return failure (zero) early from the calling function if
393 * various flags in 'flags' are set. Two flags always cause a return:
394 * 'RESTART_PARSE' and 'NEED_UTF8'. 'extra' can be used to specify any
395 * additional flags that should cause a return; 0 if none. If the return will
396 * be done, '*flagp' is first set to be all of the flags that caused the
397 * return. */
398#define RETURN_FAIL_ON_RESTART_OR_FLAGS(flags,flagp,extra) \
fd7fc6ae 399 STMT_START { \
c36361b2
KW
400 if ((flags) & (RESTART_PARSE|NEED_UTF8|(extra))) { \
401 *(flagp) = (flags) & (RESTART_PARSE|NEED_UTF8|(extra)); \
f358228e 402 return 0; \
fd7fc6ae
YO
403 } \
404 } STMT_END
405
c36361b2 406#define MUST_RESTART(flags) ((flags) & (RESTART_PARSE))
fd7fc6ae 407
5d555bb8 408#define RETURN_FAIL_ON_RESTART(flags,flagp) \
f358228e 409 RETURN_FAIL_ON_RESTART_OR_FLAGS( flags, flagp, 0)
5d555bb8 410#define RETURN_FAIL_ON_RESTART_FLAGP(flagp) \
f358228e 411 if (MUST_RESTART(*(flagp))) return 0
fd7fc6ae 412
f19b1a63
KW
413/* This converts the named class defined in regcomp.h to its equivalent class
414 * number defined in handy.h. */
415#define namedclass_to_classnum(class) ((int) ((class) / 2))
416#define classnum_to_namedclass(classnum) ((classnum) * 2)
417
de92f5e6
KW
418#define _invlist_union_complement_2nd(a, b, output) \
419 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
420#define _invlist_intersection_complement_2nd(a, b, output) \
421 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
422
1de06328
YO
423/* About scan_data_t.
424
425 During optimisation we recurse through the regexp program performing
426 various inplace (keyhole style) optimisations. In addition study_chunk
427 and scan_commit populate this data structure with information about
538e84ed 428 what strings MUST appear in the pattern. We look for the longest
3b753521 429 string that must appear at a fixed location, and we look for the
1de06328
YO
430 longest string that may appear at a floating location. So for instance
431 in the pattern:
538e84ed 432
1de06328 433 /FOO[xX]A.*B[xX]BAR/
538e84ed 434
1de06328
YO
435 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
436 strings (because they follow a .* construct). study_chunk will identify
437 both FOO and BAR as being the longest fixed and floating strings respectively.
538e84ed 438
1de06328 439 The strings can be composites, for instance
538e84ed 440
1de06328 441 /(f)(o)(o)/
538e84ed 442
1de06328 443 will result in a composite fixed substring 'foo'.
538e84ed 444
1de06328 445 For each string some basic information is maintained:
538e84ed 446
6ea77169 447 - min_offset
1de06328
YO
448 This is the position the string must appear at, or not before.
449 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
450 characters must match before the string we are searching for.
451 Likewise when combined with minlenp and the length of the string it
538e84ed 452 tells us how many characters must appear after the string we have
1de06328 453 found.
538e84ed 454
1de06328
YO
455 - max_offset
456 Only used for floating strings. This is the rightmost point that
ea3daa5d 457 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328 458 string can occur infinitely far to the right.
6e12f832 459 For fixed strings, it is equal to min_offset.
538e84ed 460
1de06328 461 - minlenp
2d608413
KW
462 A pointer to the minimum number of characters of the pattern that the
463 string was found inside. This is important as in the case of positive
538e84ed 464 lookahead or positive lookbehind we can have multiple patterns
1de06328 465 involved. Consider
538e84ed 466
1de06328 467 /(?=FOO).*F/
538e84ed 468
1de06328
YO
469 The minimum length of the pattern overall is 3, the minimum length
470 of the lookahead part is 3, but the minimum length of the part that
538e84ed 471 will actually match is 1. So 'FOO's minimum length is 3, but the
1de06328 472 minimum length for the F is 1. This is important as the minimum length
538e84ed 473 is used to determine offsets in front of and behind the string being
1de06328 474 looked for. Since strings can be composites this is the length of the
486ec47a 475 pattern at the time it was committed with a scan_commit. Note that
1de06328 476 the length is calculated by study_chunk, so that the minimum lengths
538e84ed 477 are not known until the full pattern has been compiled, thus the
1de06328 478 pointer to the value.
538e84ed 479
1de06328 480 - lookbehind
538e84ed 481
1de06328 482 In the case of lookbehind the string being searched for can be
538e84ed 483 offset past the start point of the final matching string.
1de06328
YO
484 If this value was just blithely removed from the min_offset it would
485 invalidate some of the calculations for how many chars must match
486 before or after (as they are derived from min_offset and minlen and
538e84ed 487 the length of the string being searched for).
1de06328
YO
488 When the final pattern is compiled and the data is moved from the
489 scan_data_t structure into the regexp structure the information
538e84ed
KW
490 about lookbehind is factored in, with the information that would
491 have been lost precalculated in the end_shift field for the
1de06328
YO
492 associated string.
493
494 The fields pos_min and pos_delta are used to store the minimum offset
538e84ed 495 and the delta to the maximum offset at the current point in the pattern.
1de06328
YO
496
497*/
2c2d71f5 498
cb6aef8b
DM
499struct scan_data_substrs {
500 SV *str; /* longest substring found in pattern */
501 SSize_t min_offset; /* earliest point in string it can appear */
502 SSize_t max_offset; /* latest point in string it can appear */
503 SSize_t *minlenp; /* pointer to the minlen relevant to the string */
504 SSize_t lookbehind; /* is the pos of the string modified by LB */
11683ecb 505 I32 flags; /* per substring SF_* and SCF_* flags */
cb6aef8b
DM
506};
507
2c2d71f5 508typedef struct scan_data_t {
1de06328
YO
509 /*I32 len_min; unused */
510 /*I32 len_delta; unused */
49f55535 511 SSize_t pos_min;
ea3daa5d 512 SSize_t pos_delta;
2c2d71f5 513 SV *last_found;
ea3daa5d 514 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 515 SSize_t last_start_min;
ea3daa5d 516 SSize_t last_start_max;
2099df82
DM
517 U8 cur_is_floating; /* whether the last_* values should be set as
518 * the next fixed (0) or floating (1)
519 * substring */
6ea77169 520
2099df82
DM
521 /* [0] is longest fixed substring so far, [1] is longest float so far */
522 struct scan_data_substrs substrs[2];
6ea77169 523
11683ecb 524 I32 flags; /* common SF_* and SCF_* flags */
2c2d71f5 525 I32 whilem_c;
49f55535 526 SSize_t *last_closep;
b8f7bb16 527 regnode_ssc *start_class;
2c2d71f5
JH
528} scan_data_t;
529
a687059c 530/*
e50aee73 531 * Forward declarations for pregcomp()'s friends.
a687059c 532 */
a0d0e21e 533
6ea77169 534static const scan_data_t zero_scan_data = {
a2ca2017 535 0, 0, NULL, 0, 0, 0, 0,
6ea77169 536 {
11683ecb
DM
537 { NULL, 0, 0, 0, 0, 0 },
538 { NULL, 0, 0, 0, 0, 0 },
6ea77169
DM
539 },
540 0, 0, NULL, NULL
541};
c277df42 542
11683ecb
DM
543/* study flags */
544
07be1b83
YO
545#define SF_BEFORE_SEOL 0x0001
546#define SF_BEFORE_MEOL 0x0002
11683ecb 547#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
c277df42 548
07be1b83
YO
549#define SF_IS_INF 0x0040
550#define SF_HAS_PAR 0x0080
551#define SF_IN_PAR 0x0100
552#define SF_HAS_EVAL 0x0200
bc604ad8
KW
553
554
555/* SCF_DO_SUBSTR is the flag that tells the regexp analyzer to track the
556 * longest substring in the pattern. When it is not set the optimiser keeps
557 * track of position, but does not keep track of the actual strings seen,
558 *
559 * So for instance /foo/ will be parsed with SCF_DO_SUBSTR being true, but
560 * /foo/i will not.
561 *
562 * Similarly, /foo.*(blah|erm|huh).*fnorble/ will have "foo" and "fnorble"
563 * parsed with SCF_DO_SUBSTR on, but while processing the (...) it will be
564 * turned off because of the alternation (BRANCH). */
07be1b83 565#define SCF_DO_SUBSTR 0x0400
bc604ad8 566
653099ff
GS
567#define SCF_DO_STCLASS_AND 0x0800
568#define SCF_DO_STCLASS_OR 0x1000
569#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 570#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 571
786e8c11 572#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
538e84ed 573#define SCF_SEEN_ACCEPT 0x8000
688e0391 574#define SCF_TRIE_DOING_RESTUDY 0x10000
4286711a
YO
575#define SCF_IN_DEFINE 0x20000
576
577
578
07be1b83 579
43fead97 580#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
581
582/* The enums for all these are ordered so things work out correctly */
a62b1201 583#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
538e84ed
KW
584#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
585 == REGEX_DEPENDS_CHARSET)
00b27cfc 586#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
538e84ed
KW
587#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
588 >= REGEX_UNICODE_CHARSET)
589#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
590 == REGEX_ASCII_RESTRICTED_CHARSET)
591#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
592 >= REGEX_ASCII_RESTRICTED_CHARSET)
593#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
594 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 595
43fead97 596#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 597
f2c2a6ab
KW
598/* For programs that want to be strictly Unicode compatible by dying if any
599 * attempt is made to match a non-Unicode code point against a Unicode
600 * property. */
601#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
602
93733859 603#define OOB_NAMEDCLASS -1
b8c5462f 604
8e661ac5
KW
605/* There is no code point that is out-of-bounds, so this is problematic. But
606 * its only current use is to initialize a variable that is always set before
607 * looked at. */
608#define OOB_UNICODE 0xDEADBEEF
609
a0ed51b3 610#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
a0ed51b3 611
8615cb43 612
b45f050a
JF
613/* length of regex to show in messages that don't mark a position within */
614#define RegexLengthToShowInErrorMessages 127
615
616/*
617 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
618 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
619 * op/pragma/warn/regcomp.
620 */
7253e4e3
RK
621#define MARKER1 "<-- HERE" /* marker as it appears in the description */
622#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 623
538e84ed 624#define REPORT_LOCATION " in regex; marked by " MARKER1 \
147e3846 625 " in m/%" UTF8f MARKER2 "%" UTF8f "/"
b45f050a 626
285b5ca0
KW
627/* The code in this file in places uses one level of recursion with parsing
628 * rebased to an alternate string constructed by us in memory. This can take
629 * the form of something that is completely different from the input, or
630 * something that uses the input as part of the alternate. In the first case,
631 * there should be no possibility of an error, as we are in complete control of
51684e3c
KW
632 * the alternate string. But in the second case we don't completely control
633 * the input portion, so there may be errors in that. Here's an example:
285b5ca0
KW
634 * /[abc\x{DF}def]/ui
635 * is handled specially because \x{df} folds to a sequence of more than one
51684e3c 636 * character: 'ss'. What is done is to create and parse an alternate string,
285b5ca0
KW
637 * which looks like this:
638 * /(?:\x{DF}|[abc\x{DF}def])/ui
639 * where it uses the input unchanged in the middle of something it constructs,
640 * which is a branch for the DF outside the character class, and clustering
641 * parens around the whole thing. (It knows enough to skip the DF inside the
642 * class while in this substitute parse.) 'abc' and 'def' may have errors that
643 * need to be reported. The general situation looks like this:
644 *
51684e3c 645 * |<------- identical ------>|
285b5ca0 646 * sI tI xI eI
51684e3c 647 * Input: ---------------------------------------------------------------
285b5ca0
KW
648 * Constructed: ---------------------------------------------------
649 * sC tC xC eC EC
51684e3c 650 * |<------- identical ------>|
285b5ca0 651 *
51684e3c
KW
652 * sI..eI is the portion of the input pattern we are concerned with here.
653 * sC..EC is the constructed substitute parse string.
654 * sC..tC is constructed by us
655 * tC..eC is an exact duplicate of the portion of the input pattern tI..eI.
656 * In the diagram, these are vertically aligned.
657 * eC..EC is also constructed by us.
658 * xC is the position in the substitute parse string where we found a
659 * problem.
660 * xI is the position in the original pattern corresponding to xC.
285b5ca0 661 *
51684e3c
KW
662 * We want to display a message showing the real input string. Thus we need to
663 * translate from xC to xI. We know that xC >= tC, since the portion of the
664 * string sC..tC has been constructed by us, and so shouldn't have errors. We
665 * get:
666 * xI = tI + (xC - tC)
285b5ca0 667 *
51684e3c
KW
668 * When the substitute parse is constructed, the code needs to set:
669 * RExC_start (sC)
670 * RExC_end (eC)
671 * RExC_copy_start_in_input (tI)
672 * RExC_copy_start_in_constructed (tC)
673 * and restore them when done.
285b5ca0 674 *
51684e3c
KW
675 * During normal processing of the input pattern, both
676 * 'RExC_copy_start_in_input' and 'RExC_copy_start_in_constructed' are set to
677 * sI, so that xC equals xI.
285b5ca0
KW
678 */
679
51684e3c
KW
680#define sI RExC_precomp
681#define eI RExC_precomp_end
682#define sC RExC_start
683#define eC RExC_end
684#define tI RExC_copy_start_in_input
685#define tC RExC_copy_start_in_constructed
686#define xI(xC) (tI + (xC - tC))
687#define xI_offset(xC) (xI(xC) - sI)
285b5ca0
KW
688
689#define REPORT_LOCATION_ARGS(xC) \
690 UTF8fARG(UTF, \
51684e3c 691 (xI(xC) > eI) /* Don't run off end */ \
232b691f 692 ? eI - sI /* Length before the <--HERE */ \
bc929b61
KW
693 : ((xI_offset(xC) >= 0) \
694 ? xI_offset(xC) \
695 : (Perl_croak(aTHX_ "panic: %s: %d: negative offset: %" \
696 IVdf " trying to output message for " \
697 " pattern %.*s", \
51684e3c 698 __FILE__, __LINE__, (IV) xI_offset(xC), \
bc929b61 699 ((int) (eC - sC)), sC), 0)), \
51684e3c 700 sI), /* The input pattern printed up to the <--HERE */ \
285b5ca0 701 UTF8fARG(UTF, \
51684e3c
KW
702 (xI(xC) > eI) ? 0 : eI - xI(xC), /* Length after <--HERE */ \
703 (xI(xC) > eI) ? eI : xI(xC)) /* pattern after <--HERE */
c1d900c3 704
8a6d8ec6
HS
705/* Used to point after bad bytes for an error message, but avoid skipping
706 * past a nul byte. */
707#define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
708
ed68fe11
KW
709/* Set up to clean up after our imminent demise */
710#define PREPARE_TO_DIE \
711 STMT_START { \
712 if (RExC_rx_sv) \
713 SAVEFREESV(RExC_rx_sv); \
7c932d07
KW
714 if (RExC_open_parens) \
715 SAVEFREEPV(RExC_open_parens); \
716 if (RExC_close_parens) \
717 SAVEFREEPV(RExC_close_parens); \
ed68fe11
KW
718 } STMT_END
719
b45f050a
JF
720/*
721 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
722 * arg. Show regex, up to a maximum length. If it's too long, chop and add
723 * "...".
724 */
58e23c8d 725#define _FAIL(code) STMT_START { \
bfed75c6 726 const char *ellipses = ""; \
88f063b4 727 IV len = RExC_precomp_end - RExC_precomp; \
ccb2c380 728 \
ed68fe11 729 PREPARE_TO_DIE; \
ccb2c380
MP
730 if (len > RegexLengthToShowInErrorMessages) { \
731 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
732 len = RegexLengthToShowInErrorMessages - 10; \
733 ellipses = "..."; \
734 } \
58e23c8d 735 code; \
ccb2c380 736} STMT_END
8615cb43 737
58e23c8d 738#define FAIL(msg) _FAIL( \
147e3846 739 Perl_croak(aTHX_ "%s in regex m/%" UTF8f "%s/", \
c1d900c3 740 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
741
742#define FAIL2(msg,arg) _FAIL( \
147e3846 743 Perl_croak(aTHX_ msg " in regex m/%" UTF8f "%s/", \
c1d900c3 744 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 745
b45f050a 746/*
b45f050a
JF
747 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
748 */
ccb2c380 749#define Simple_vFAIL(m) STMT_START { \
ccb2c380 750 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
d528642a 751 m, REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 752} STMT_END
b45f050a
JF
753
754/*
755 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
756 */
ccb2c380 757#define vFAIL(m) STMT_START { \
ed68fe11 758 PREPARE_TO_DIE; \
ccb2c380
MP
759 Simple_vFAIL(m); \
760} STMT_END
b45f050a
JF
761
762/*
763 * Like Simple_vFAIL(), but accepts two arguments.
764 */
ccb2c380 765#define Simple_vFAIL2(m,a1) STMT_START { \
d528642a
KW
766 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
767 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 768} STMT_END
b45f050a
JF
769
770/*
771 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
772 */
ccb2c380 773#define vFAIL2(m,a1) STMT_START { \
ed68fe11 774 PREPARE_TO_DIE; \
ccb2c380
MP
775 Simple_vFAIL2(m, a1); \
776} STMT_END
b45f050a
JF
777
778
779/*
780 * Like Simple_vFAIL(), but accepts three arguments.
781 */
ccb2c380 782#define Simple_vFAIL3(m, a1, a2) STMT_START { \
c1d900c3 783 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 784 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 785} STMT_END
b45f050a
JF
786
787/*
788 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
789 */
ccb2c380 790#define vFAIL3(m,a1,a2) STMT_START { \
ed68fe11 791 PREPARE_TO_DIE; \
ccb2c380
MP
792 Simple_vFAIL3(m, a1, a2); \
793} STMT_END
b45f050a
JF
794
795/*
796 * Like Simple_vFAIL(), but accepts four arguments.
797 */
ccb2c380 798#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
d528642a
KW
799 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
800 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 801} STMT_END
b45f050a 802
95db3ffa 803#define vFAIL4(m,a1,a2,a3) STMT_START { \
ed68fe11 804 PREPARE_TO_DIE; \
95db3ffa
KW
805 Simple_vFAIL4(m, a1, a2, a3); \
806} STMT_END
807
946095af 808/* A specialized version of vFAIL2 that works with UTF8f */
d528642a 809#define vFAIL2utf8f(m, a1) STMT_START { \
ed68fe11 810 PREPARE_TO_DIE; \
d528642a
KW
811 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
812 REPORT_LOCATION_ARGS(RExC_parse)); \
946095af
BF
813} STMT_END
814
3ba22297 815#define vFAIL3utf8f(m, a1, a2) STMT_START { \
ed68fe11 816 PREPARE_TO_DIE; \
3ba22297 817 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 818 REPORT_LOCATION_ARGS(RExC_parse)); \
3ba22297
KW
819} STMT_END
820
d62e749e
KW
821/* Setting this to NULL is a signal to not output warnings */
822#define TURN_OFF_WARNINGS_IN_SUBSTITUTE_PARSE RExC_copy_start_in_constructed = NULL
823#define RESTORE_WARNINGS RExC_copy_start_in_constructed = RExC_precomp
824
7c932d07
KW
825/* Since a warning can be generated multiple times as the input is reparsed, we
826 * output it the first time we come to that point in the parse, but suppress it
827 * otherwise. 'RExC_copy_start_in_constructed' being NULL is a flag to not
828 * generate any warnings */
55bcf857 829#define TO_OUTPUT_WARNINGS(loc) \
7c932d07
KW
830 ( RExC_copy_start_in_constructed \
831 && ((xI(loc)) - RExC_precomp) > (Ptrdiff_t) RExC_latest_warn_offset)
55bcf857 832
7c932d07
KW
833/* After we've emitted a warning, we save the position in the input so we don't
834 * output it again */
835#define UPDATE_WARNINGS_LOC(loc) \
836 STMT_START { \
837 if (TO_OUTPUT_WARNINGS(loc)) { \
838 RExC_latest_warn_offset = (xI(loc)) - RExC_precomp; \
839 } \
840 } STMT_END
77ccba3c 841
086f21bf 842/* 'warns' is the output of the packWARNx macro used in 'code' */
f8220fe2
KW
843#define _WARN_HELPER(loc, warns, code) \
844 STMT_START { \
d62e749e
KW
845 if (! RExC_copy_start_in_constructed) { \
846 Perl_croak( aTHX_ "panic! %s: %d: Tried to warn when none" \
847 " expected at '%s'", \
848 __FILE__, __LINE__, loc); \
849 } \
55bcf857 850 if (TO_OUTPUT_WARNINGS(loc)) { \
086f21bf
KW
851 if (ckDEAD(warns)) \
852 PREPARE_TO_DIE; \
338190a1 853 code; \
77ccba3c 854 UPDATE_WARNINGS_LOC(loc); \
338190a1 855 } \
f8220fe2 856 } STMT_END
946095af 857
5e0a247b 858/* m is not necessarily a "literal string", in this macro */
f8220fe2
KW
859#define reg_warn_non_literal_string(loc, m) \
860 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
861 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 862 "%s" REPORT_LOCATION, \
f8220fe2 863 m, REPORT_LOCATION_ARGS(loc)))
5e0a247b 864
f8220fe2
KW
865#define ckWARNreg(loc,m) \
866 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
867 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 868 m REPORT_LOCATION, \
f8220fe2 869 REPORT_LOCATION_ARGS(loc)))
ccb2c380 870
f8220fe2
KW
871#define vWARN(loc, m) \
872 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
873 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 874 m REPORT_LOCATION, \
f8220fe2 875 REPORT_LOCATION_ARGS(loc))) \
b927b7e9 876
f8220fe2
KW
877#define vWARN_dep(loc, m) \
878 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
879 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
d528642a 880 m REPORT_LOCATION, \
f8220fe2 881 REPORT_LOCATION_ARGS(loc)))
0d6106aa 882
f8220fe2
KW
883#define ckWARNdep(loc,m) \
884 _WARN_HELPER(loc, packWARN(WARN_DEPRECATED), \
885 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
d528642a 886 m REPORT_LOCATION, \
f8220fe2 887 REPORT_LOCATION_ARGS(loc)))
147508a2 888
f8220fe2
KW
889#define ckWARNregdep(loc,m) \
890 _WARN_HELPER(loc, packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
891 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
d528642a
KW
892 WARN_REGEXP), \
893 m REPORT_LOCATION, \
f8220fe2 894 REPORT_LOCATION_ARGS(loc)))
ccb2c380 895
f8220fe2
KW
896#define ckWARN2reg_d(loc,m, a1) \
897 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
898 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
d528642a 899 m REPORT_LOCATION, \
f8220fe2 900 a1, REPORT_LOCATION_ARGS(loc)))
2335b3d3 901
f8220fe2
KW
902#define ckWARN2reg(loc, m, a1) \
903 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
904 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 905 m REPORT_LOCATION, \
f8220fe2 906 a1, REPORT_LOCATION_ARGS(loc)))
ccb2c380 907
f8220fe2
KW
908#define vWARN3(loc, m, a1, a2) \
909 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
910 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a 911 m REPORT_LOCATION, \
f8220fe2 912 a1, a2, REPORT_LOCATION_ARGS(loc)))
ccb2c380 913
f8220fe2
KW
914#define ckWARN3reg(loc, m, a1, a2) \
915 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
916 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
917 m REPORT_LOCATION, \
918 a1, a2, \
f8220fe2 919 REPORT_LOCATION_ARGS(loc)))
668c081a 920
f8220fe2
KW
921#define vWARN4(loc, m, a1, a2, a3) \
922 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
923 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
924 m REPORT_LOCATION, \
925 a1, a2, a3, \
f8220fe2 926 REPORT_LOCATION_ARGS(loc)))
ccb2c380 927
f8220fe2
KW
928#define ckWARN4reg(loc, m, a1, a2, a3) \
929 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
930 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
931 m REPORT_LOCATION, \
932 a1, a2, a3, \
f8220fe2 933 REPORT_LOCATION_ARGS(loc)))
668c081a 934
f8220fe2
KW
935#define vWARN5(loc, m, a1, a2, a3, a4) \
936 _WARN_HELPER(loc, packWARN(WARN_REGEXP), \
937 Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
d528642a
KW
938 m REPORT_LOCATION, \
939 a1, a2, a3, a4, \
f8220fe2 940 REPORT_LOCATION_ARGS(loc)))
9d1d55b5 941
aea5f181
KW
942#define ckWARNexperimental(loc, class, m) \
943 _WARN_HELPER(loc, packWARN(class), \
944 Perl_ck_warner_d(aTHX_ packWARN(class), \
945 m REPORT_LOCATION, \
946 REPORT_LOCATION_ARGS(loc)))
947
1a322bb4
KW
948/* Convert between a pointer to a node and its offset from the beginning of the
949 * program */
950#define REGNODE_p(offset) (RExC_emit_start + (offset))
951#define REGNODE_OFFSET(node) ((node) - RExC_emit_start)
952
538e84ed 953/* Macros for recording node offsets. 20001227 mjd@plover.com
fac92740
MJD
954 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
955 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
956 * Element 0 holds the number n.
07be1b83 957 * Position is 1 indexed.
fac92740 958 */
7122b237 959#ifndef RE_TRACK_PATTERN_OFFSETS
d7c442ce 960#define Set_Node_Offset_To_R(offset,byte)
7122b237
YO
961#define Set_Node_Offset(node,byte)
962#define Set_Cur_Node_Offset
963#define Set_Node_Length_To_R(node,len)
964#define Set_Node_Length(node,len)
6a86c6ad 965#define Set_Node_Cur_Length(node,start)
538e84ed
KW
966#define Node_Offset(n)
967#define Node_Length(n)
7122b237
YO
968#define Set_Node_Offset_Length(node,offset,len)
969#define ProgLen(ri) ri->u.proglen
970#define SetProgLen(ri,x) ri->u.proglen = x
4cbeff97 971#define Track_Code(code)
7122b237
YO
972#else
973#define ProgLen(ri) ri->u.offsets[0]
974#define SetProgLen(ri,x) ri->u.offsets[0] = x
d7c442ce 975#define Set_Node_Offset_To_R(offset,byte) STMT_START { \
ccb2c380 976 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
d7c442ce
KW
977 __LINE__, (int)(offset), (int)(byte))); \
978 if((offset) < 0) { \
538e84ed 979 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
d7c442ce 980 (int)(offset)); \
ccb2c380 981 } else { \
d7c442ce 982 RExC_offsets[2*(offset)-1] = (byte); \
ccb2c380 983 } \
ccb2c380
MP
984} STMT_END
985
88f063b4 986#define Set_Node_Offset(node,byte) \
1a322bb4 987 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (byte)-RExC_start)
ccb2c380
MP
988#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
989
990#define Set_Node_Length_To_R(node,len) STMT_START { \
ccb2c380 991 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 992 __LINE__, (int)(node), (int)(len))); \
ccb2c380 993 if((node) < 0) { \
538e84ed
KW
994 Perl_croak(aTHX_ "value of node is %d in Length macro", \
995 (int)(node)); \
ccb2c380
MP
996 } else { \
997 RExC_offsets[2*(node)] = (len); \
998 } \
ccb2c380
MP
999} STMT_END
1000
1001#define Set_Node_Length(node,len) \
1a322bb4 1002 Set_Node_Length_To_R(REGNODE_OFFSET(node), len)
6a86c6ad
NC
1003#define Set_Node_Cur_Length(node, start) \
1004 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
1005
1006/* Get offsets and lengths */
1a322bb4
KW
1007#define Node_Offset(n) (RExC_offsets[2*(REGNODE_OFFSET(n))-1])
1008#define Node_Length(n) (RExC_offsets[2*(REGNODE_OFFSET(n))])
fac92740 1009
07be1b83 1010#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
1a322bb4
KW
1011 Set_Node_Offset_To_R(REGNODE_OFFSET(node), (offset)); \
1012 Set_Node_Length_To_R(REGNODE_OFFSET(node), (len)); \
07be1b83 1013} STMT_END
4cbeff97
TC
1014
1015#define Track_Code(code) STMT_START { code } STMT_END
7122b237 1016#endif
07be1b83
YO
1017
1018#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
1019#define EXPERIMENTAL_INPLACESCAN
f427392e 1020#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 1021
cb41e5d6
YO
1022#ifdef DEBUGGING
1023int
6ad9a8ab 1024Perl_re_printf(pTHX_ const char *fmt, ...)
cb41e5d6
YO
1025{
1026 va_list ap;
1027 int result;
1028 PerlIO *f= Perl_debug_log;
1029 PERL_ARGS_ASSERT_RE_PRINTF;
1030 va_start(ap, fmt);
1031 result = PerlIO_vprintf(f, fmt, ap);
1032 va_end(ap);
1033 return result;
1034}
1035
1036int
7b031478 1037Perl_re_indentf(pTHX_ const char *fmt, U32 depth, ...)
cb41e5d6
YO
1038{
1039 va_list ap;
1040 int result;
1041 PerlIO *f= Perl_debug_log;
1042 PERL_ARGS_ASSERT_RE_INDENTF;
1043 va_start(ap, depth);
daeb874b 1044 PerlIO_printf(f, "%*s", ( (int)depth % 20 ) * 2, "");
cb41e5d6
YO
1045 result = PerlIO_vprintf(f, fmt, ap);
1046 va_end(ap);
1047 return result;
1048}
1049#endif /* DEBUGGING */
1050
7b031478 1051#define DEBUG_RExC_seen() \
538e84ed 1052 DEBUG_OPTIMISE_MORE_r({ \
88f063b4 1053 Perl_re_printf( aTHX_ "RExC_seen: "); \
538e84ed 1054 \
e384d5c1 1055 if (RExC_seen & REG_ZERO_LEN_SEEN) \
88f063b4 1056 Perl_re_printf( aTHX_ "REG_ZERO_LEN_SEEN "); \
538e84ed 1057 \
e384d5c1 1058 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
88f063b4 1059 Perl_re_printf( aTHX_ "REG_LOOKBEHIND_SEEN "); \
538e84ed 1060 \
e384d5c1 1061 if (RExC_seen & REG_GPOS_SEEN) \
88f063b4 1062 Perl_re_printf( aTHX_ "REG_GPOS_SEEN "); \
538e84ed 1063 \
e384d5c1 1064 if (RExC_seen & REG_RECURSE_SEEN) \
88f063b4 1065 Perl_re_printf( aTHX_ "REG_RECURSE_SEEN "); \
538e84ed 1066 \
7b031478 1067 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
88f063b4 1068 Perl_re_printf( aTHX_ "REG_TOP_LEVEL_BRANCHES_SEEN "); \
538e84ed 1069 \
e384d5c1 1070 if (RExC_seen & REG_VERBARG_SEEN) \
88f063b4 1071 Perl_re_printf( aTHX_ "REG_VERBARG_SEEN "); \
538e84ed 1072 \
e384d5c1 1073 if (RExC_seen & REG_CUTGROUP_SEEN) \
88f063b4 1074 Perl_re_printf( aTHX_ "REG_CUTGROUP_SEEN "); \
538e84ed 1075 \
e384d5c1 1076 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
88f063b4 1077 Perl_re_printf( aTHX_ "REG_RUN_ON_COMMENT_SEEN "); \
538e84ed 1078 \
e384d5c1 1079 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
88f063b4 1080 Perl_re_printf( aTHX_ "REG_UNFOLDED_MULTI_SEEN "); \
538e84ed 1081 \
7b031478 1082 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
88f063b4 1083 Perl_re_printf( aTHX_ "REG_UNBOUNDED_QUANTIFIER_SEEN "); \
ee273784 1084 \
88f063b4 1085 Perl_re_printf( aTHX_ "\n"); \
9e9ecfdf
YO
1086 });
1087
fdfb4f21 1088#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
6ad9a8ab 1089 if ((flags) & flag) Perl_re_printf( aTHX_ "%s ", #flag)
fdfb4f21 1090
fdfb4f21 1091
f5a36d78
DM
1092#ifdef DEBUGGING
1093static void
1094S_debug_show_study_flags(pTHX_ U32 flags, const char *open_str,
1095 const char *close_str)
1096{
1097 if (!flags)
1098 return;
1099
1100 Perl_re_printf( aTHX_ "%s", open_str);
11683ecb
DM
1101 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_SEOL);
1102 DEBUG_SHOW_STUDY_FLAG(flags, SF_BEFORE_MEOL);
f5a36d78
DM
1103 DEBUG_SHOW_STUDY_FLAG(flags, SF_IS_INF);
1104 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_PAR);
1105 DEBUG_SHOW_STUDY_FLAG(flags, SF_IN_PAR);
1106 DEBUG_SHOW_STUDY_FLAG(flags, SF_HAS_EVAL);
1107 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_SUBSTR);
1108 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_AND);
1109 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS_OR);
1110 DEBUG_SHOW_STUDY_FLAG(flags, SCF_DO_STCLASS);
1111 DEBUG_SHOW_STUDY_FLAG(flags, SCF_WHILEM_VISITED_POS);
1112 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_RESTUDY);
1113 DEBUG_SHOW_STUDY_FLAG(flags, SCF_SEEN_ACCEPT);
1114 DEBUG_SHOW_STUDY_FLAG(flags, SCF_TRIE_DOING_RESTUDY);
1115 DEBUG_SHOW_STUDY_FLAG(flags, SCF_IN_DEFINE);
1116 Perl_re_printf( aTHX_ "%s", close_str);
1117}
1118
1119
1120static void
1121S_debug_studydata(pTHX_ const char *where, scan_data_t *data,
1122 U32 depth, int is_inf)
1123{
1124 GET_RE_DEBUG_FLAGS_DECL;
1125
1126 DEBUG_OPTIMISE_MORE_r({
1127 if (!data)
1128 return;
1129 Perl_re_indentf(aTHX_ "%s: Pos:%" IVdf "/%" IVdf " Flags: 0x%" UVXf,
1130 depth,
1131 where,
1132 (IV)data->pos_min,
1133 (IV)data->pos_delta,
1134 (UV)data->flags
1135 );
1136
1137 S_debug_show_study_flags(aTHX_ data->flags," [","]");
fdfb4f21 1138
f5a36d78
DM
1139 Perl_re_printf( aTHX_
1140 " Whilem_c: %" IVdf " Lcp: %" IVdf " %s",
1141 (IV)data->whilem_c,
1142 (IV)(data->last_closep ? *((data)->last_closep) : -1),
1143 is_inf ? "INF " : ""
1144 );
1145
11683ecb 1146 if (data->last_found) {
37b6262f 1147 int i;
11683ecb
DM
1148 Perl_re_printf(aTHX_
1149 "Last:'%s' %" IVdf ":%" IVdf "/%" IVdf,
1150 SvPVX_const(data->last_found),
1151 (IV)data->last_end,
1152 (IV)data->last_start_min,
1153 (IV)data->last_start_max
1154 );
1155
2099df82 1156 for (i = 0; i < 2; i++) {
37b6262f
DM
1157 Perl_re_printf(aTHX_
1158 " %s%s: '%s' @ %" IVdf "/%" IVdf,
2099df82
DM
1159 data->cur_is_floating == i ? "*" : "",
1160 i ? "Float" : "Fixed",
37b6262f
DM
1161 SvPVX_const(data->substrs[i].str),
1162 (IV)data->substrs[i].min_offset,
1163 (IV)data->substrs[i].max_offset
1164 );
1165 S_debug_show_study_flags(aTHX_ data->substrs[i].flags," [","]");
1166 }
11683ecb 1167 }
f5a36d78
DM
1168
1169 Perl_re_printf( aTHX_ "\n");
1170 });
1171}
1172
1173
1174static void
1175S_debug_peep(pTHX_ const char *str, const RExC_state_t *pRExC_state,
1176 regnode *scan, U32 depth, U32 flags)
1177{
1178 GET_RE_DEBUG_FLAGS_DECL;
1179
1180 DEBUG_OPTIMISE_r({
1181 regnode *Next;
1182
1183 if (!scan)
1184 return;
1185 Next = regnext(scan);
1186 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state);
1187 Perl_re_indentf( aTHX_ "%s>%3d: %s (%d)",
1188 depth,
1189 str,
1190 REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),
1191 Next ? (REG_NODE_NUM(Next)) : 0 );
1192 S_debug_show_study_flags(aTHX_ flags," [ ","]");
1193 Perl_re_printf( aTHX_ "\n");
1194 });
1195}
1196
1197
1198# define DEBUG_STUDYDATA(where, data, depth, is_inf) \
1199 S_debug_studydata(aTHX_ where, data, depth, is_inf)
1200
1201# define DEBUG_PEEP(str, scan, depth, flags) \
1202 S_debug_peep(aTHX_ str, pRExC_state, scan, depth, flags)
1203
1204#else
1205# define DEBUG_STUDYDATA(where, data, depth, is_inf) NOOP
1206# define DEBUG_PEEP(str, scan, depth, flags) NOOP
1207#endif
1de06328 1208
cb41e5d6 1209
c6871b76
KW
1210/* =========================================================
1211 * BEGIN edit_distance stuff.
1212 *
1213 * This calculates how many single character changes of any type are needed to
1214 * transform a string into another one. It is taken from version 3.1 of
1215 *
1216 * https://metacpan.org/pod/Text::Levenshtein::Damerau::XS
1217 */
1218
1219/* Our unsorted dictionary linked list. */
1220/* Note we use UVs, not chars. */
1221
1222struct dictionary{
1223 UV key;
1224 UV value;
1225 struct dictionary* next;
1226};
1227typedef struct dictionary item;
1228
1229
1230PERL_STATIC_INLINE item*
88f063b4 1231push(UV key, item* curr)
c6871b76
KW
1232{
1233 item* head;
d09f14bf 1234 Newx(head, 1, item);
c6871b76
KW
1235 head->key = key;
1236 head->value = 0;
1237 head->next = curr;
1238 return head;
1239}
1240
1241
1242PERL_STATIC_INLINE item*
1243find(item* head, UV key)
1244{
1245 item* iterator = head;
1246 while (iterator){
1247 if (iterator->key == key){
1248 return iterator;
1249 }
1250 iterator = iterator->next;
1251 }
1252
1253 return NULL;
1254}
1255
1256PERL_STATIC_INLINE item*
88f063b4 1257uniquePush(item* head, UV key)
c6871b76
KW
1258{
1259 item* iterator = head;
1260
1261 while (iterator){
1262 if (iterator->key == key) {
1263 return head;
1264 }
1265 iterator = iterator->next;
1266 }
1267
88f063b4 1268 return push(key, head);
c6871b76
KW
1269}
1270
1271PERL_STATIC_INLINE void
1272dict_free(item* head)
1273{
1274 item* iterator = head;
1275
1276 while (iterator) {
1277 item* temp = iterator;
1278 iterator = iterator->next;
1279 Safefree(temp);
1280 }
1281
1282 head = NULL;
1283}
1284
1285/* End of Dictionary Stuff */
1286
1287/* All calculations/work are done here */
1288STATIC int
1289S_edit_distance(const UV* src,
1290 const UV* tgt,
1291 const STRLEN x, /* length of src[] */
1292 const STRLEN y, /* length of tgt[] */
1293 const SSize_t maxDistance
1294)
1295{
1296 item *head = NULL;
88f063b4 1297 UV swapCount, swapScore, targetCharCount, i, j;
c6871b76
KW
1298 UV *scores;
1299 UV score_ceil = x + y;
1300
1301 PERL_ARGS_ASSERT_EDIT_DISTANCE;
1302
1303 /* intialize matrix start values */
d09f14bf 1304 Newx(scores, ( (x + 2) * (y + 2)), UV);
c6871b76
KW
1305 scores[0] = score_ceil;
1306 scores[1 * (y + 2) + 0] = score_ceil;
1307 scores[0 * (y + 2) + 1] = score_ceil;
1308 scores[1 * (y + 2) + 1] = 0;
88f063b4 1309 head = uniquePush(uniquePush(head, src[0]), tgt[0]);
c6871b76
KW
1310
1311 /* work loops */
1312 /* i = src index */
1313 /* j = tgt index */
1314 for (i=1;i<=x;i++) {
1315 if (i < x)
88f063b4 1316 head = uniquePush(head, src[i]);
c6871b76
KW
1317 scores[(i+1) * (y + 2) + 1] = i;
1318 scores[(i+1) * (y + 2) + 0] = score_ceil;
1319 swapCount = 0;
1320
1321 for (j=1;j<=y;j++) {
1322 if (i == 1) {
1323 if(j < y)
88f063b4 1324 head = uniquePush(head, tgt[j]);
c6871b76
KW
1325 scores[1 * (y + 2) + (j + 1)] = j;
1326 scores[0 * (y + 2) + (j + 1)] = score_ceil;
1327 }
1328
88f063b4 1329 targetCharCount = find(head, tgt[j-1])->value;
c6871b76
KW
1330 swapScore = scores[targetCharCount * (y + 2) + swapCount] + i - targetCharCount - 1 + j - swapCount;
1331
1332 if (src[i-1] != tgt[j-1]){
1333 scores[(i+1) * (y + 2) + (j + 1)] = MIN(swapScore,(MIN(scores[i * (y + 2) + j], MIN(scores[(i+1) * (y + 2) + j], scores[i * (y + 2) + (j + 1)])) + 1));
1334 }
1335 else {
1336 swapCount = j;
1337 scores[(i+1) * (y + 2) + (j + 1)] = MIN(scores[i * (y + 2) + j], swapScore);
1338 }
1339 }
1340
88f063b4 1341 find(head, src[i-1])->value = i;
c6871b76
KW
1342 }
1343
1344 {
1345 IV score = scores[(x+1) * (y + 2) + (y + 1)];
1346 dict_free(head);
1347 Safefree(scores);
1348 return (maxDistance != 0 && maxDistance < score)?(-1):score;
1349 }
1350}
1351
1352/* END of edit_distance() stuff
1353 * ========================================================= */
1354
8e35b056
KW
1355/* is c a control character for which we have a mnemonic? */
1356#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
1357
549b4e78
KW
1358STATIC const char *
1359S_cntrl_to_mnemonic(const U8 c)
1360{
1361 /* Returns the mnemonic string that represents character 'c', if one
1362 * exists; NULL otherwise. The only ones that exist for the purposes of
1363 * this routine are a few control characters */
1364
1365 switch (c) {
1366 case '\a': return "\\a";
1367 case '\b': return "\\b";
1368 case ESC_NATIVE: return "\\e";
1369 case '\f': return "\\f";
1370 case '\n': return "\\n";
1371 case '\r': return "\\r";
1372 case '\t': return "\\t";
1373 }
1374
1375 return NULL;
1376}
1377
653099ff 1378/* Mark that we cannot extend a found fixed substring at this point.
37b6262f 1379 Update the longest found anchored substring or the longest found
653099ff
GS
1380 floating substrings if needed. */
1381
4327152a 1382STATIC void
ea3daa5d
FC
1383S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
1384 SSize_t *minlenp, int is_inf)
c277df42 1385{
e1ec3a88 1386 const STRLEN l = CHR_SVLEN(data->last_found);
2099df82 1387 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
a2ca2017 1388 const STRLEN old_l = CHR_SVLEN(longest_sv);
1de06328 1389 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 1390
7918f24d
NC
1391 PERL_ARGS_ASSERT_SCAN_COMMIT;
1392
c277df42 1393 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
2099df82 1394 const U8 i = data->cur_is_floating;
a2ca2017 1395 SvSetMagicSV(longest_sv, data->last_found);
37b6262f
DM
1396 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
1397
2099df82
DM
1398 if (!i) /* fixed */
1399 data->substrs[0].max_offset = data->substrs[0].min_offset;
1400 else { /* float */
1401 data->substrs[1].max_offset = (l
646e8787
DM
1402 ? data->last_start_max
1403 : (data->pos_delta > SSize_t_MAX - data->pos_min
ea3daa5d
FC
1404 ? SSize_t_MAX
1405 : data->pos_min + data->pos_delta));
1406 if (is_inf
2099df82
DM
1407 || (STRLEN)data->substrs[1].max_offset > (STRLEN)SSize_t_MAX)
1408 data->substrs[1].max_offset = SSize_t_MAX;
37b6262f 1409 }
11683ecb 1410
37b6262f
DM
1411 if (data->flags & SF_BEFORE_EOL)
1412 data->substrs[i].flags |= (data->flags & SF_BEFORE_EOL);
1413 else
1414 data->substrs[i].flags &= ~SF_BEFORE_EOL;
1415 data->substrs[i].minlenp = minlenp;
1416 data->substrs[i].lookbehind = 0;
c277df42 1417 }
37b6262f 1418
c277df42 1419 SvCUR_set(data->last_found, 0);
0eda9292 1420 {
a28509cc 1421 SV * const sv = data->last_found;
097eb12c
AL
1422 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1423 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1424 if (mg)
1425 mg->mg_len = 0;
1426 }
0eda9292 1427 }
c277df42
IZ
1428 data->last_end = -1;
1429 data->flags &= ~SF_BEFORE_EOL;
f5a36d78 1430 DEBUG_STUDYDATA("commit", data, 0, is_inf);
c277df42
IZ
1431}
1432
cdd87c1d
KW
1433/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1434 * list that describes which code points it matches */
1435
653099ff 1436STATIC void
3420edd7 1437S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 1438{
cdd87c1d
KW
1439 /* Set the SSC 'ssc' to match an empty string or any code point */
1440
557bd3fb 1441 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 1442
71068078 1443 assert(is_ANYOF_SYNTHETIC(ssc));
3fffb88a 1444
0854ea0b
KW
1445 /* mortalize so won't leak */
1446 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
93e92956 1447 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
653099ff
GS
1448}
1449
653099ff 1450STATIC int
dc3bf405 1451S_ssc_is_anything(const regnode_ssc *ssc)
653099ff 1452{
c144baaa
KW
1453 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1454 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1455 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1456 * in any way, so there's no point in using it */
cdd87c1d
KW
1457
1458 UV start, end;
1459 bool ret;
653099ff 1460
557bd3fb 1461 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 1462
71068078 1463 assert(is_ANYOF_SYNTHETIC(ssc));
cdd87c1d 1464
93e92956 1465 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
cdd87c1d
KW
1466 return FALSE;
1467 }
1468
1469 /* See if the list consists solely of the range 0 - Infinity */
1470 invlist_iterinit(ssc->invlist);
1471 ret = invlist_iternext(ssc->invlist, &start, &end)
1472 && start == 0
1473 && end == UV_MAX;
1474
1475 invlist_iterfinish(ssc->invlist);
1476
1477 if (ret) {
1478 return TRUE;
1479 }
1480
1481 /* If e.g., both \w and \W are set, matches everything */
e0e1be5f 1482 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1483 int i;
1484 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1485 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1486 return TRUE;
1487 }
1488 }
1489 }
1490
1491 return FALSE;
653099ff
GS
1492}
1493
653099ff 1494STATIC void
cdd87c1d 1495S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 1496{
cdd87c1d
KW
1497 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1498 * string, any code point, or any posix class under locale */
1499
557bd3fb 1500 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 1501
557bd3fb 1502 Zero(ssc, 1, regnode_ssc);
71068078 1503 set_ANYOF_SYNTHETIC(ssc);
93e92956 1504 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
3420edd7 1505 ssc_anything(ssc);
cdd87c1d 1506
2f306ab9
KW
1507 /* If any portion of the regex is to operate under locale rules that aren't
1508 * fully known at compile time, initialization includes it. The reason
1509 * this isn't done for all regexes is that the optimizer was written under
1510 * the assumption that locale was all-or-nothing. Given the complexity and
1511 * lack of documentation in the optimizer, and that there are inadequate
1512 * test cases for locale, many parts of it may not work properly, it is
1513 * safest to avoid locale unless necessary. */
cdd87c1d
KW
1514 if (RExC_contains_locale) {
1515 ANYOF_POSIXL_SETALL(ssc);
cdd87c1d
KW
1516 }
1517 else {
1518 ANYOF_POSIXL_ZERO(ssc);
1519 }
653099ff
GS
1520}
1521
b423522f 1522STATIC int
dc3bf405
BF
1523S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1524 const regnode_ssc *ssc)
b423522f
KW
1525{
1526 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1527 * to the list of code points matched, and locale posix classes; hence does
1528 * not check its flags) */
1529
1530 UV start, end;
1531 bool ret;
1532
1533 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1534
71068078 1535 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1536
1537 invlist_iterinit(ssc->invlist);
1538 ret = invlist_iternext(ssc->invlist, &start, &end)
1539 && start == 0
1540 && end == UV_MAX;
1541
1542 invlist_iterfinish(ssc->invlist);
1543
1544 if (! ret) {
1545 return FALSE;
1546 }
1547
e0e1be5f 1548 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
31f05a37 1549 return FALSE;
b423522f
KW
1550 }
1551
1552 return TRUE;
1553}
1554
4ebed06a
KW
1555#define INVLIST_INDEX 0
1556#define ONLY_LOCALE_MATCHES_INDEX 1
1557#define DEFERRED_USER_DEFINED_INDEX 2
1558
b423522f
KW
1559STATIC SV*
1560S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
5c0f85ef 1561 const regnode_charclass* const node)
b423522f
KW
1562{
1563 /* Returns a mortal inversion list defining which code points are matched
1564 * by 'node', which is of type ANYOF. Handles complementing the result if
1565 * appropriate. If some code points aren't knowable at this time, the
31f05a37
KW
1566 * returned list must, and will, contain every code point that is a
1567 * possibility. */
b423522f 1568
1565c085 1569 dVAR;
e2506fa7 1570 SV* invlist = NULL;
1ee208c4 1571 SV* only_utf8_locale_invlist = NULL;
b423522f
KW
1572 unsigned int i;
1573 const U32 n = ARG(node);
31f05a37 1574 bool new_node_has_latin1 = FALSE;
b423522f
KW
1575
1576 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1577
1578 /* Look at the data structure created by S_set_ANYOF_arg() */
93e92956 1579 if (n != ANYOF_ONLY_HAS_BITMAP) {
b423522f
KW
1580 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1581 AV * const av = MUTABLE_AV(SvRV(rv));
1582 SV **const ary = AvARRAY(av);
1583 assert(RExC_rxi->data->what[n] == 's');
1584
4ebed06a 1585 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
b423522f 1586
6ed02bb6
KW
1587 /* Here there are things that won't be known until runtime -- we
1588 * have to assume it could be anything */
e2506fa7 1589 invlist = sv_2mortal(_new_invlist(1));
b423522f
KW
1590 return _add_range_to_invlist(invlist, 0, UV_MAX);
1591 }
4ebed06a 1592 else if (ary[INVLIST_INDEX]) {
b423522f 1593
6ed02bb6 1594 /* Use the node's inversion list */
4ebed06a 1595 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
1ee208c4
KW
1596 }
1597
1598 /* Get the code points valid only under UTF-8 locales */
6ed02bb6 1599 if ( (ANYOF_FLAGS(node) & ANYOFL_FOLD)
4ebed06a 1600 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
1ee208c4 1601 {
4ebed06a 1602 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
b423522f
KW
1603 }
1604 }
1605
e2506fa7
KW
1606 if (! invlist) {
1607 invlist = sv_2mortal(_new_invlist(0));
1608 }
1609
dcb20b36
KW
1610 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1611 * code points, and an inversion list for the others, but if there are code
1612 * points that should match only conditionally on the target string being
1613 * UTF-8, those are placed in the inversion list, and not the bitmap.
1614 * Since there are circumstances under which they could match, they are
1615 * included in the SSC. But if the ANYOF node is to be inverted, we have
1616 * to exclude them here, so that when we invert below, the end result
1617 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1618 * have to do this here before we add the unconditionally matched code
1619 * points */
b423522f
KW
1620 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1621 _invlist_intersection_complement_2nd(invlist,
1622 PL_UpperLatin1,
1623 &invlist);
1624 }
1625
1626 /* Add in the points from the bit map */
5aa750d1 1627 if (OP(node) != ANYOFH) {
90973738
KW
1628 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
1629 if (ANYOF_BITMAP_TEST(node, i)) {
1630 unsigned int start = i++;
1631
1632 for (; i < NUM_ANYOF_CODE_POINTS
1633 && ANYOF_BITMAP_TEST(node, i); ++i)
1634 {
1635 /* empty */
1636 }
1637 invlist = _add_range_to_invlist(invlist, start, i-1);
1638 new_node_has_latin1 = TRUE;
6f8848d5 1639 }
b423522f
KW
1640 }
1641 }
1642
1643 /* If this can match all upper Latin1 code points, have to add them
ac33c516
KW
1644 * as well. But don't add them if inverting, as when that gets done below,
1645 * it would exclude all these characters, including the ones it shouldn't
1646 * that were added just above */
1647 if (! (ANYOF_FLAGS(node) & ANYOF_INVERT) && OP(node) == ANYOFD
f240c685
KW
1648 && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1649 {
b423522f
KW
1650 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1651 }
1652
1653 /* Similarly for these */
93e92956 1654 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
e0a1ff7a 1655 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
b423522f
KW
1656 }
1657
1658 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1659 _invlist_invert(invlist);
1660 }
35b8412f
KW
1661 else if (ANYOF_FLAGS(node) & ANYOFL_FOLD) {
1662 if (new_node_has_latin1) {
31f05a37 1663
26be5fe6
KW
1664 /* Under /li, any 0-255 could fold to any other 0-255, depending on
1665 * the locale. We can skip this if there are no 0-255 at all. */
1666 _invlist_union(invlist, PL_Latin1, &invlist);
35b8412f
KW
1667
1668 invlist = add_cp_to_invlist(invlist, LATIN_SMALL_LETTER_DOTLESS_I);
1669 invlist = add_cp_to_invlist(invlist, LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
1670 }
1671 else {
1672 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
1673 invlist = add_cp_to_invlist(invlist, 'I');
1674 }
1675 if (_invlist_contains_cp(invlist,
1676 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
1677 {
1678 invlist = add_cp_to_invlist(invlist, 'i');
1679 }
1680 }
31f05a37
KW
1681 }
1682
1ee208c4
KW
1683 /* Similarly add the UTF-8 locale possible matches. These have to be
1684 * deferred until after the non-UTF-8 locale ones are taken care of just
1685 * above, or it leads to wrong results under ANYOF_INVERT */
1686 if (only_utf8_locale_invlist) {
31f05a37 1687 _invlist_union_maybe_complement_2nd(invlist,
1ee208c4 1688 only_utf8_locale_invlist,
31f05a37
KW
1689 ANYOF_FLAGS(node) & ANYOF_INVERT,
1690 &invlist);
1691 }
b423522f
KW
1692
1693 return invlist;
1694}
1695
1051e1c4 1696/* These two functions currently do the exact same thing */
557bd3fb 1697#define ssc_init_zero ssc_init
653099ff 1698
cdd87c1d
KW
1699#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1700#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1701
557bd3fb 1702/* 'AND' a given class with another one. Can create false positives. 'ssc'
93e92956
KW
1703 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1704 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1705
653099ff 1706STATIC void
b423522f 1707S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1708 const regnode_charclass *and_with)
653099ff 1709{
cdd87c1d
KW
1710 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1711 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1712
a0dd4231
KW
1713 SV* anded_cp_list;
1714 U8 anded_flags;
1e6ade67 1715
cdd87c1d 1716 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1717
71068078 1718 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1719
1720 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1721 * the code point inversion list and just the relevant flags */
71068078 1722 if (is_ANYOF_SYNTHETIC(and_with)) {
7dcac5f6 1723 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
a0dd4231 1724 anded_flags = ANYOF_FLAGS(and_with);
e9b08962
KW
1725
1726 /* XXX This is a kludge around what appears to be deficiencies in the
1727 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1728 * there are paths through the optimizer where it doesn't get weeded
1729 * out when it should. And if we don't make some extra provision for
1730 * it like the code just below, it doesn't get added when it should.
1731 * This solution is to add it only when AND'ing, which is here, and
1732 * only when what is being AND'ed is the pristine, original node
1733 * matching anything. Thus it is like adding it to ssc_anything() but
1734 * only when the result is to be AND'ed. Probably the same solution
1735 * could be adopted for the same problem we have with /l matching,
1736 * which is solved differently in S_ssc_init(), and that would lead to
1737 * fewer false positives than that solution has. But if this solution
1738 * creates bugs, the consequences are only that a warning isn't raised
1739 * that should be; while the consequences for having /l bugs is
1740 * incorrect matches */
7dcac5f6 1741 if (ssc_is_anything((regnode_ssc *)and_with)) {
f240c685 1742 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
e9b08962 1743 }
a0dd4231
KW
1744 }
1745 else {
5c0f85ef 1746 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
f240c685
KW
1747 if (OP(and_with) == ANYOFD) {
1748 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1749 }
1750 else {
1751 anded_flags = ANYOF_FLAGS(and_with)
1752 &( ANYOF_COMMON_FLAGS
108316fb
KW
1753 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1754 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
d1c40ef5
KW
1755 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(and_with))) {
1756 anded_flags &=
1757 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1758 }
f240c685 1759 }
a0dd4231
KW
1760 }
1761
1762 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1763
1764 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1765 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1766 * 'and_with' may be inverted. When not inverted, we have the situation of
1767 * computing:
1768 * (C1 | P1) & (C2 | P2)
1769 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1770 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1771 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1772 * <= ((C1 & C2) | P1 | P2)
1773 * Alternatively, the last few steps could be:
1774 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1775 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1776 * <= (C1 | C2 | (P1 & P2))
1777 * We favor the second approach if either P1 or P2 is non-empty. This is
1778 * because these components are a barrier to doing optimizations, as what
1779 * they match cannot be known until the moment of matching as they are
1780 * dependent on the current locale, 'AND"ing them likely will reduce or
1781 * eliminate them.
1782 * But we can do better if we know that C1,P1 are in their initial state (a
1783 * frequent occurrence), each matching everything:
1784 * (<everything>) & (C2 | P2) = C2 | P2
1785 * Similarly, if C2,P2 are in their initial state (again a frequent
1786 * occurrence), the result is a no-op
1787 * (C1 | P1) & (<everything>) = C1 | P1
1788 *
1789 * Inverted, we have
1790 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1791 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1792 * <= (C1 & ~C2) | (P1 & ~P2)
1793 * */
1aa99e6b 1794
a0dd4231 1795 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
71068078 1796 && ! is_ANYOF_SYNTHETIC(and_with))
a0dd4231 1797 {
cdd87c1d 1798 unsigned int i;
8951c461 1799
cdd87c1d
KW
1800 ssc_intersection(ssc,
1801 anded_cp_list,
1802 FALSE /* Has already been inverted */
1803 );
c6b76537 1804
cdd87c1d
KW
1805 /* If either P1 or P2 is empty, the intersection will be also; can skip
1806 * the loop */
93e92956 1807 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
cdd87c1d
KW
1808 ANYOF_POSIXL_ZERO(ssc);
1809 }
e0e1be5f 1810 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1811
1812 /* Note that the Posix class component P from 'and_with' actually
1813 * looks like:
1814 * P = Pa | Pb | ... | Pn
1815 * where each component is one posix class, such as in [\w\s].
1816 * Thus
1817 * ~P = ~(Pa | Pb | ... | Pn)
1818 * = ~Pa & ~Pb & ... & ~Pn
1819 * <= ~Pa | ~Pb | ... | ~Pn
1820 * The last is something we can easily calculate, but unfortunately
1821 * is likely to have many false positives. We could do better
1822 * in some (but certainly not all) instances if two classes in
1823 * P have known relationships. For example
1824 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1825 * So
1826 * :lower: & :print: = :lower:
1827 * And similarly for classes that must be disjoint. For example,
1828 * since \s and \w can have no elements in common based on rules in
1829 * the POSIX standard,
1830 * \w & ^\S = nothing
1831 * Unfortunately, some vendor locales do not meet the Posix
1832 * standard, in particular almost everything by Microsoft.
1833 * The loop below just changes e.g., \w into \W and vice versa */
1834
1ee208c4 1835 regnode_charclass_posixl temp;
cdd87c1d
KW
1836 int add = 1; /* To calculate the index of the complement */
1837
b1234259 1838 Zero(&temp, 1, regnode_charclass_posixl);
cdd87c1d
KW
1839 ANYOF_POSIXL_ZERO(&temp);
1840 for (i = 0; i < ANYOF_MAX; i++) {
1841 assert(i % 2 != 0
7dcac5f6
KW
1842 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1843 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
cdd87c1d 1844
7dcac5f6 1845 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
cdd87c1d
KW
1846 ANYOF_POSIXL_SET(&temp, i + add);
1847 }
1848 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1849 }
1850 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1851
cdd87c1d
KW
1852 } /* else ssc already has no posixes */
1853 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1854 in its initial state */
71068078 1855 else if (! is_ANYOF_SYNTHETIC(and_with)
7dcac5f6 1856 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
cdd87c1d
KW
1857 {
1858 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1859 * copy it over 'ssc' */
1860 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
71068078 1861 if (is_ANYOF_SYNTHETIC(and_with)) {
cdd87c1d
KW
1862 StructCopy(and_with, ssc, regnode_ssc);
1863 }
1864 else {
1865 ssc->invlist = anded_cp_list;
1866 ANYOF_POSIXL_ZERO(ssc);
93e92956 1867 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1868 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
cdd87c1d
KW
1869 }
1870 }
1871 }
e0e1be5f 1872 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
93e92956 1873 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
cdd87c1d
KW
1874 {
1875 /* One or the other of P1, P2 is non-empty. */
93e92956 1876 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1ea8b7fe
KW
1877 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1878 }
cdd87c1d
KW
1879 ssc_union(ssc, anded_cp_list, FALSE);
1880 }
1881 else { /* P1 = P2 = empty */
1882 ssc_intersection(ssc, anded_cp_list, FALSE);
1883 }
137165a6 1884 }
653099ff
GS
1885}
1886
653099ff 1887STATIC void
cdd87c1d 1888S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1889 const regnode_charclass *or_with)
653099ff 1890{
cdd87c1d
KW
1891 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1892 * another SSC or a regular ANYOF class. Can create false positives if
1893 * 'or_with' is to be inverted. */
7918f24d 1894
a0dd4231
KW
1895 SV* ored_cp_list;
1896 U8 ored_flags;
c6b76537 1897
cdd87c1d 1898 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 1899
71068078 1900 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1901
1902 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1903 * the code point inversion list and just the relevant flags */
71068078 1904 if (is_ANYOF_SYNTHETIC(or_with)) {
7dcac5f6 1905 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
a0dd4231
KW
1906 ored_flags = ANYOF_FLAGS(or_with);
1907 }
1908 else {
5c0f85ef 1909 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
eff8b7dc 1910 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
f240c685
KW
1911 if (OP(or_with) != ANYOFD) {
1912 ored_flags
1913 |= ANYOF_FLAGS(or_with)
108316fb
KW
1914 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1915 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
d1c40ef5
KW
1916 if (ANYOFL_UTF8_LOCALE_REQD(ANYOF_FLAGS(or_with))) {
1917 ored_flags |=
1918 ANYOFL_SHARED_UTF8_LOCALE_fold_HAS_MATCHES_nonfold_REQD;
1919 }
f240c685 1920 }
a0dd4231
KW
1921 }
1922
1923 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
1924
1925 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1926 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1927 * 'or_with' may be inverted. When not inverted, we have the simple
1928 * situation of computing:
1929 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1930 * If P1|P2 yields a situation with both a class and its complement are
1931 * set, like having both \w and \W, this matches all code points, and we
1932 * can delete these from the P component of the ssc going forward. XXX We
1933 * might be able to delete all the P components, but I (khw) am not certain
1934 * about this, and it is better to be safe.
1935 *
1936 * Inverted, we have
1937 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1938 * <= (C1 | P1) | ~C2
1939 * <= (C1 | ~C2) | P1
1940 * (which results in actually simpler code than the non-inverted case)
1941 * */
9826f543 1942
a0dd4231 1943 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
71068078 1944 && ! is_ANYOF_SYNTHETIC(or_with))
a0dd4231 1945 {
cdd87c1d 1946 /* We ignore P2, leaving P1 going forward */
1ea8b7fe 1947 } /* else Not inverted */
93e92956 1948 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1949 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
e0e1be5f 1950 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1951 unsigned int i;
1952 for (i = 0; i < ANYOF_MAX; i += 2) {
1953 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1954 {
1955 ssc_match_all_cp(ssc);
1956 ANYOF_POSIXL_CLEAR(ssc, i);
1957 ANYOF_POSIXL_CLEAR(ssc, i+1);
cdd87c1d
KW
1958 }
1959 }
1960 }
1aa99e6b 1961 }
cdd87c1d
KW
1962
1963 ssc_union(ssc,
1964 ored_cp_list,
1965 FALSE /* Already has been inverted */
1966 );
653099ff
GS
1967}
1968
b423522f
KW
1969PERL_STATIC_INLINE void
1970S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1971{
1972 PERL_ARGS_ASSERT_SSC_UNION;
1973
71068078 1974 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1975
1976 _invlist_union_maybe_complement_2nd(ssc->invlist,
1977 invlist,
1978 invert2nd,
1979 &ssc->invlist);
1980}
1981
1982PERL_STATIC_INLINE void
1983S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1984 SV* const invlist,
1985 const bool invert2nd)
1986{
1987 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1988
71068078 1989 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1990
1991 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1992 invlist,
1993 invert2nd,
1994 &ssc->invlist);
1995}
1996
1997PERL_STATIC_INLINE void
1998S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1999{
2000 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
2001
71068078 2002 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2003
2004 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
2005}
2006
2007PERL_STATIC_INLINE void
2008S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
2009{
2010 /* AND just the single code point 'cp' into the SSC 'ssc' */
2011
2012 SV* cp_list = _new_invlist(2);
2013
2014 PERL_ARGS_ASSERT_SSC_CP_AND;
2015
71068078 2016 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2017
2018 cp_list = add_cp_to_invlist(cp_list, cp);
2019 ssc_intersection(ssc, cp_list,
2020 FALSE /* Not inverted */
2021 );
2022 SvREFCNT_dec_NN(cp_list);
2023}
2024
2025PERL_STATIC_INLINE void
dc3bf405 2026S_ssc_clear_locale(regnode_ssc *ssc)
b423522f
KW
2027{
2028 /* Set the SSC 'ssc' to not match any locale things */
b423522f
KW
2029 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
2030
71068078 2031 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
2032
2033 ANYOF_POSIXL_ZERO(ssc);
2034 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
2035}
2036
b35552de
KW
2037#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
2038
2039STATIC bool
2040S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
2041{
2042 /* The synthetic start class is used to hopefully quickly winnow down
2043 * places where a pattern could start a match in the target string. If it
2044 * doesn't really narrow things down that much, there isn't much point to
2045 * having the overhead of using it. This function uses some very crude
2046 * heuristics to decide if to use the ssc or not.
2047 *
2048 * It returns TRUE if 'ssc' rules out more than half what it considers to
2049 * be the "likely" possible matches, but of course it doesn't know what the
2050 * actual things being matched are going to be; these are only guesses
2051 *
2052 * For /l matches, it assumes that the only likely matches are going to be
2053 * in the 0-255 range, uniformly distributed, so half of that is 127
2054 * For /a and /d matches, it assumes that the likely matches will be just
2055 * the ASCII range, so half of that is 63
2056 * For /u and there isn't anything matching above the Latin1 range, it
2057 * assumes that that is the only range likely to be matched, and uses
2058 * half that as the cut-off: 127. If anything matches above Latin1,
2059 * it assumes that all of Unicode could match (uniformly), except for
2060 * non-Unicode code points and things in the General Category "Other"
2061 * (unassigned, private use, surrogates, controls and formats). This
2062 * is a much large number. */
2063
b35552de
KW
2064 U32 count = 0; /* Running total of number of code points matched by
2065 'ssc' */
2066 UV start, end; /* Start and end points of current range in inversion
26be5fe6 2067 XXX outdated. UTF-8 locales are common, what about invert? list */
72400949
KW
2068 const U32 max_code_points = (LOC)
2069 ? 256
f6e8f31e 2070 : (( ! UNI_SEMANTICS
1a26cbcb 2071 || invlist_highest(ssc->invlist) < 256)
72400949
KW
2072 ? 128
2073 : NON_OTHER_COUNT);
2074 const U32 max_match = max_code_points / 2;
b35552de
KW
2075
2076 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
2077
2078 invlist_iterinit(ssc->invlist);
2079 while (invlist_iternext(ssc->invlist, &start, &end)) {
72400949
KW
2080 if (start >= max_code_points) {
2081 break;
b35552de 2082 }
72400949 2083 end = MIN(end, max_code_points - 1);
b35552de 2084 count += end - start + 1;
72400949 2085 if (count >= max_match) {
b35552de
KW
2086 invlist_iterfinish(ssc->invlist);
2087 return FALSE;
2088 }
2089 }
2090
2091 return TRUE;
2092}
2093
2094
b423522f
KW
2095STATIC void
2096S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
2097{
2098 /* The inversion list in the SSC is marked mortal; now we need a more
2099 * permanent copy, which is stored the same way that is done in a regular
dcb20b36
KW
2100 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
2101 * map */
b423522f 2102
28118b9c 2103 SV* invlist = invlist_clone(ssc->invlist, NULL);
b423522f
KW
2104
2105 PERL_ARGS_ASSERT_SSC_FINALIZE;
2106
71068078 2107 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f 2108
a0dd4231 2109 /* The code in this file assumes that all but these flags aren't relevant
93e92956
KW
2110 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
2111 * by the time we reach here */
f240c685
KW
2112 assert(! (ANYOF_FLAGS(ssc)
2113 & ~( ANYOF_COMMON_FLAGS
108316fb
KW
2114 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
2115 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
a0dd4231 2116
b423522f
KW
2117 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
2118
4c404f26 2119 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
b423522f 2120
85c8e306
KW
2121 /* Make sure is clone-safe */
2122 ssc->invlist = NULL;
2123
e0e1be5f 2124 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
93e92956 2125 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
d156f5cb 2126 OP(ssc) = ANYOFPOSIXL;
e0e1be5f 2127 }
d156f5cb 2128 else if (RExC_contains_locale) {
b2e90ddf
KW
2129 OP(ssc) = ANYOFL;
2130 }
2131
1462525b 2132 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
b423522f
KW
2133}
2134
a3621e74
YO
2135#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
2136#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
2137#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
538e84ed
KW
2138#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
2139 ? (TRIE_LIST_CUR( idx ) - 1) \
2140 : 0 )
a3621e74 2141
3dab1dad
YO
2142
2143#ifdef DEBUGGING
07be1b83 2144/*
2b8b4781
NC
2145 dump_trie(trie,widecharmap,revcharmap)
2146 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
2147 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
2148
2149 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
2150 The _interim_ variants are used for debugging the interim
2151 tables that are used to generate the final compressed
2152 representation which is what dump_trie expects.
2153
486ec47a 2154 Part of the reason for their existence is to provide a form
3dab1dad 2155 of documentation as to how the different representations function.
07be1b83
YO
2156
2157*/
3dab1dad
YO
2158
2159/*
3dab1dad
YO
2160 Dumps the final compressed table form of the trie to Perl_debug_log.
2161 Used for debugging make_trie().
2162*/
b9a59e08 2163
3dab1dad 2164STATIC void
2b8b4781
NC
2165S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
2166 AV *revcharmap, U32 depth)
3dab1dad
YO
2167{
2168 U32 state;
ab3bbdeb 2169 SV *sv=sv_newmortal();
55eed653 2170 int colwidth= widecharmap ? 6 : 4;
2e64971a 2171 U16 word;
3dab1dad
YO
2172 GET_RE_DEBUG_FLAGS_DECL;
2173
7918f24d 2174 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 2175
6ad9a8ab 2176 Perl_re_indentf( aTHX_ "Char : %-6s%-6s%-4s ",
1e37780e 2177 depth+1, "Match","Base","Ofs" );
3dab1dad
YO
2178
2179 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 2180 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 2181 if ( tmp ) {
6ad9a8ab 2182 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2183 colwidth,
538e84ed 2184 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2185 PL_colors[0], PL_colors[1],
2186 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2187 PERL_PV_ESCAPE_FIRSTCHAR
2188 )
ab3bbdeb 2189 );
3dab1dad
YO
2190 }
2191 }
1e37780e
YO
2192 Perl_re_printf( aTHX_ "\n");
2193 Perl_re_indentf( aTHX_ "State|-----------------------", depth+1);
3dab1dad
YO
2194
2195 for( state = 0 ; state < trie->uniquecharcount ; state++ )
6ad9a8ab
YO
2196 Perl_re_printf( aTHX_ "%.*s", colwidth, "--------");
2197 Perl_re_printf( aTHX_ "\n");
3dab1dad 2198
1e2e3d02 2199 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 2200 const U32 base = trie->states[ state ].trans.base;
3dab1dad 2201
147e3846 2202 Perl_re_indentf( aTHX_ "#%4" UVXf "|", depth+1, (UV)state);
3dab1dad
YO
2203
2204 if ( trie->states[ state ].wordnum ) {
1e37780e 2205 Perl_re_printf( aTHX_ " W%4X", trie->states[ state ].wordnum );
3dab1dad 2206 } else {
6ad9a8ab 2207 Perl_re_printf( aTHX_ "%6s", "" );
3dab1dad
YO
2208 }
2209
147e3846 2210 Perl_re_printf( aTHX_ " @%4" UVXf " ", (UV)base );
3dab1dad
YO
2211
2212 if ( base ) {
2213 U32 ofs = 0;
2214
2215 while( ( base + ofs < trie->uniquecharcount ) ||
2216 ( base + ofs - trie->uniquecharcount < trie->lasttrans
538e84ed
KW
2217 && trie->trans[ base + ofs - trie->uniquecharcount ].check
2218 != state))
3dab1dad
YO
2219 ofs++;
2220
147e3846 2221 Perl_re_printf( aTHX_ "+%2" UVXf "[ ", (UV)ofs);
3dab1dad
YO
2222
2223 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
538e84ed
KW
2224 if ( ( base + ofs >= trie->uniquecharcount )
2225 && ( base + ofs - trie->uniquecharcount
2226 < trie->lasttrans )
2227 && trie->trans[ base + ofs
2228 - trie->uniquecharcount ].check == state )
3dab1dad 2229 {
147e3846 2230 Perl_re_printf( aTHX_ "%*" UVXf, colwidth,
1e37780e
YO
2231 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next
2232 );
3dab1dad 2233 } else {
88f063b4 2234 Perl_re_printf( aTHX_ "%*s", colwidth," ." );
3dab1dad
YO
2235 }
2236 }
2237
6ad9a8ab 2238 Perl_re_printf( aTHX_ "]");
3dab1dad
YO
2239
2240 }
6ad9a8ab 2241 Perl_re_printf( aTHX_ "\n" );
3dab1dad 2242 }
6ad9a8ab 2243 Perl_re_indentf( aTHX_ "word_info N:(prev,len)=",
cb41e5d6 2244 depth);
2e64971a 2245 for (word=1; word <= trie->wordcount; word++) {
6ad9a8ab 2246 Perl_re_printf( aTHX_ " %d:(%d,%d)",
2e64971a
DM
2247 (int)word, (int)(trie->wordinfo[word].prev),
2248 (int)(trie->wordinfo[word].len));
2249 }
6ad9a8ab 2250 Perl_re_printf( aTHX_ "\n" );
538e84ed 2251}
3dab1dad 2252/*
3dab1dad 2253 Dumps a fully constructed but uncompressed trie in list form.
538e84ed 2254 List tries normally only are used for construction when the number of
3dab1dad
YO
2255 possible chars (trie->uniquecharcount) is very high.
2256 Used for debugging make_trie().
2257*/
2258STATIC void
55eed653 2259S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2260 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2261 U32 depth)
3dab1dad
YO
2262{
2263 U32 state;
ab3bbdeb 2264 SV *sv=sv_newmortal();
55eed653 2265 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2266 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2267
2268 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
2269
3dab1dad 2270 /* print out the table precompression. */
6ad9a8ab 2271 Perl_re_indentf( aTHX_ "State :Word | Transition Data\n",
cb41e5d6 2272 depth+1 );
6ad9a8ab 2273 Perl_re_indentf( aTHX_ "%s",
cb41e5d6 2274 depth+1, "------:-----+-----------------\n" );
538e84ed 2275
3dab1dad
YO
2276 for( state=1 ; state < next_alloc ; state ++ ) {
2277 U16 charid;
538e84ed 2278
147e3846 2279 Perl_re_indentf( aTHX_ " %4" UVXf " :",
cb41e5d6 2280 depth+1, (UV)state );
3dab1dad 2281 if ( ! trie->states[ state ].wordnum ) {
6ad9a8ab 2282 Perl_re_printf( aTHX_ "%5s| ","");
3dab1dad 2283 } else {
6ad9a8ab 2284 Perl_re_printf( aTHX_ "W%4x| ",
3dab1dad
YO
2285 trie->states[ state ].wordnum
2286 );
2287 }
2288 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
538e84ed 2289 SV ** const tmp = av_fetch( revcharmap,
88f063b4 2290 TRIE_LIST_ITEM(state, charid).forid, 0);
ab3bbdeb 2291 if ( tmp ) {
147e3846 2292 Perl_re_printf( aTHX_ "%*s:%3X=%4" UVXf " | ",
ab3bbdeb 2293 colwidth,
538e84ed
KW
2294 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
2295 colwidth,
2296 PL_colors[0], PL_colors[1],
2297 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
2298 | PERL_PV_ESCAPE_FIRSTCHAR
ab3bbdeb 2299 ) ,
88f063b4
KW
2300 TRIE_LIST_ITEM(state, charid).forid,
2301 (UV)TRIE_LIST_ITEM(state, charid).newstate
1e2e3d02 2302 );
538e84ed 2303 if (!(charid % 10))
6ad9a8ab 2304 Perl_re_printf( aTHX_ "\n%*s| ",
664e119d 2305 (int)((depth * 2) + 14), "");
1e2e3d02 2306 }
ab3bbdeb 2307 }
6ad9a8ab 2308 Perl_re_printf( aTHX_ "\n");
3dab1dad 2309 }
538e84ed 2310}
3dab1dad
YO
2311
2312/*
3dab1dad 2313 Dumps a fully constructed but uncompressed trie in table form.
538e84ed
KW
2314 This is the normal DFA style state transition table, with a few
2315 twists to facilitate compression later.
3dab1dad
YO
2316 Used for debugging make_trie().
2317*/
2318STATIC void
55eed653 2319S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
2320 HV *widecharmap, AV *revcharmap, U32 next_alloc,
2321 U32 depth)
3dab1dad
YO
2322{
2323 U32 state;
2324 U16 charid;
ab3bbdeb 2325 SV *sv=sv_newmortal();
55eed653 2326 int colwidth= widecharmap ? 6 : 4;
3dab1dad 2327 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2328
2329 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
538e84ed 2330
3dab1dad
YO
2331 /*
2332 print out the table precompression so that we can do a visual check
2333 that they are identical.
2334 */
538e84ed 2335
6ad9a8ab 2336 Perl_re_indentf( aTHX_ "Char : ", depth+1 );
3dab1dad
YO
2337
2338 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 2339 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 2340 if ( tmp ) {
6ad9a8ab 2341 Perl_re_printf( aTHX_ "%*s",
ab3bbdeb 2342 colwidth,
538e84ed 2343 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
2344 PL_colors[0], PL_colors[1],
2345 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
2346 PERL_PV_ESCAPE_FIRSTCHAR
2347 )
ab3bbdeb 2348 );
3dab1dad
YO
2349 }
2350 }
2351
4aaafc03
YO
2352 Perl_re_printf( aTHX_ "\n");
2353 Perl_re_indentf( aTHX_ "State+-", depth+1 );
3dab1dad
YO
2354
2355 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
6ad9a8ab 2356 Perl_re_printf( aTHX_ "%.*s", colwidth,"--------");
3dab1dad
YO
2357 }
2358
6ad9a8ab 2359 Perl_re_printf( aTHX_ "\n" );
3dab1dad
YO
2360
2361 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
2362
147e3846 2363 Perl_re_indentf( aTHX_ "%4" UVXf " : ",
cb41e5d6 2364 depth+1,
3dab1dad
YO
2365 (UV)TRIE_NODENUM( state ) );
2366
2367 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
2368 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
2369 if (v)
147e3846 2370 Perl_re_printf( aTHX_ "%*" UVXf, colwidth, v );
ab3bbdeb 2371 else
6ad9a8ab 2372 Perl_re_printf( aTHX_ "%*s", colwidth, "." );
3dab1dad
YO
2373 }
2374 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
147e3846 2375 Perl_re_printf( aTHX_ " (%4" UVXf ")\n",
538e84ed 2376 (UV)trie->trans[ state ].check );
3dab1dad 2377 } else {
147e3846 2378 Perl_re_printf( aTHX_ " (%4" UVXf ") W%4X\n",
538e84ed 2379 (UV)trie->trans[ state ].check,
3dab1dad
YO
2380 trie->states[ TRIE_NODENUM( state ) ].wordnum );
2381 }
2382 }
07be1b83 2383}
3dab1dad
YO
2384
2385#endif
2386
2e64971a 2387
786e8c11
YO
2388/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
2389 startbranch: the first branch in the whole branch sequence
2390 first : start branch of sequence of branch-exact nodes.
2391 May be the same as startbranch
2392 last : Thing following the last branch.
2393 May be the same as tail.
2394 tail : item following the branch sequence
2395 count : words in the sequence
a4525e78 2396 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
786e8c11 2397 depth : indent depth
3dab1dad 2398
786e8c11 2399Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 2400
786e8c11
YO
2401A trie is an N'ary tree where the branches are determined by digital
2402decomposition of the key. IE, at the root node you look up the 1st character and
2403follow that branch repeat until you find the end of the branches. Nodes can be
2404marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 2405
786e8c11 2406 /he|she|his|hers/
72f13be8 2407
786e8c11
YO
2408would convert into the following structure. Numbers represent states, letters
2409following numbers represent valid transitions on the letter from that state, if
2410the number is in square brackets it represents an accepting state, otherwise it
2411will be in parenthesis.
07be1b83 2412
786e8c11
YO
2413 +-h->+-e->[3]-+-r->(8)-+-s->[9]
2414 | |
2415 | (2)
2416 | |
2417 (1) +-i->(6)-+-s->[7]
2418 |
2419 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 2420
786e8c11
YO
2421 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
2422
2423This shows that when matching against the string 'hers' we will begin at state 1
2424read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
2425then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
2426is also accepting. Thus we know that we can match both 'he' and 'hers' with a
2427single traverse. We store a mapping from accepting to state to which word was
2428matched, and then when we have multiple possibilities we try to complete the
b8fda935 2429rest of the regex in the order in which they occurred in the alternation.
786e8c11
YO
2430
2431The only prior NFA like behaviour that would be changed by the TRIE support is
2432the silent ignoring of duplicate alternations which are of the form:
2433
2434 / (DUPE|DUPE) X? (?{ ... }) Y /x
2435
4b714af6 2436Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 2437and without the optimisation. With the optimisations dupes will be silently
486ec47a 2438ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
2439the following demonstrates:
2440
2441 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2442
2443which prints out 'word' three times, but
2444
2445 'words'=~/(word|word|word)(?{ print $1 })S/
2446
2447which doesnt print it out at all. This is due to other optimisations kicking in.
2448
2449Example of what happens on a structural level:
2450
486ec47a 2451The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
2452
2453 1: CURLYM[1] {1,32767}(18)
2454 5: BRANCH(8)
2455 6: EXACT <ac>(16)
2456 8: BRANCH(11)
2457 9: EXACT <ad>(16)
2458 11: BRANCH(14)
2459 12: EXACT <ab>(16)
2460 16: SUCCEED(0)
2461 17: NOTHING(18)
2462 18: END(0)
2463
2464This would be optimizable with startbranch=5, first=5, last=16, tail=16
2465and should turn into:
2466
2467 1: CURLYM[1] {1,32767}(18)
2468 5: TRIE(16)
2469 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2470 <ac>
2471 <ad>
2472 <ab>
2473 16: SUCCEED(0)
2474 17: NOTHING(18)
2475 18: END(0)
2476
2477Cases where tail != last would be like /(?foo|bar)baz/:
2478
2479 1: BRANCH(4)
2480 2: EXACT <foo>(8)
2481 4: BRANCH(7)
2482 5: EXACT <bar>(8)
2483 7: TAIL(8)
2484 8: EXACT <baz>(10)
2485 10: END(0)
2486
2487which would be optimizable with startbranch=1, first=1, last=7, tail=8
2488and would end up looking like:
2489
2490 1: TRIE(8)
2491 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2492 <foo>
2493 <bar>
2494 7: TAIL(8)
2495 8: EXACT <baz>(10)
2496 10: END(0)
2497
c80e42f3 2498 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
2499
2500is the recommended Unicode-aware way of saying
2501
2502 *(d++) = uv;
2503*/
2504
fab2782b 2505#define TRIE_STORE_REVCHAR(val) \
786e8c11 2506 STMT_START { \
73031816 2507 if (UTF) { \
668fcfea 2508 SV *zlopp = newSV(UTF8_MAXBYTES); \
88c9ea1e 2509 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
c80e42f3 2510 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
73031816
NC
2511 SvCUR_set(zlopp, kapow - flrbbbbb); \
2512 SvPOK_on(zlopp); \
2513 SvUTF8_on(zlopp); \
2514 av_push(revcharmap, zlopp); \
2515 } else { \
fab2782b 2516 char ooooff = (char)val; \
73031816
NC
2517 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2518 } \
2519 } STMT_END
786e8c11 2520
914a25d5
KW
2521/* This gets the next character from the input, folding it if not already
2522 * folded. */
2523#define TRIE_READ_CHAR STMT_START { \
2524 wordlen++; \
2525 if ( UTF ) { \
2526 /* if it is UTF then it is either already folded, or does not need \
2527 * folding */ \
1c1d615a 2528 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
2529 } \
2530 else if (folder == PL_fold_latin1) { \
7d006b13
KW
2531 /* This folder implies Unicode rules, which in the range expressible \
2532 * by not UTF is the lower case, with the two exceptions, one of \
2533 * which should have been taken care of before calling this */ \
2534 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2535 uvc = toLOWER_L1(*uc); \
2536 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2537 len = 1; \
914a25d5
KW
2538 } else { \
2539 /* raw data, will be folded later if needed */ \
2540 uvc = (U32)*uc; \
2541 len = 1; \
2542 } \
786e8c11
YO
2543} STMT_END
2544
2545
2546
2547#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2548 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
00195859 2549 U32 ging = TRIE_LIST_LEN( state ) * 2; \
f9003953 2550 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
00195859 2551 TRIE_LIST_LEN( state ) = ging; \
786e8c11
YO
2552 } \
2553 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2554 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2555 TRIE_LIST_CUR( state )++; \
2556} STMT_END
07be1b83 2557
786e8c11 2558#define TRIE_LIST_NEW(state) STMT_START { \
d09f14bf 2559 Newx( trie->states[ state ].trans.list, \
786e8c11
YO
2560 4, reg_trie_trans_le ); \
2561 TRIE_LIST_CUR( state ) = 1; \
2562 TRIE_LIST_LEN( state ) = 4; \
2563} STMT_END
07be1b83 2564
786e8c11
YO
2565#define TRIE_HANDLE_WORD(state) STMT_START { \
2566 U16 dupe= trie->states[ state ].wordnum; \
2567 regnode * const noper_next = regnext( noper ); \
2568 \
786e8c11
YO
2569 DEBUG_r({ \
2570 /* store the word for dumping */ \
2571 SV* tmp; \
2572 if (OP(noper) != NOTHING) \
740cce10 2573 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 2574 else \
740cce10 2575 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 2576 av_push( trie_words, tmp ); \
786e8c11
YO
2577 }); \
2578 \
2579 curword++; \
2e64971a
DM
2580 trie->wordinfo[curword].prev = 0; \
2581 trie->wordinfo[curword].len = wordlen; \
2582 trie->wordinfo[curword].accept = state; \
786e8c11
YO
2583 \
2584 if ( noper_next < tail ) { \
2585 if (!trie->jump) \
538e84ed
KW
2586 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2587 sizeof(U16) ); \
7f69552c 2588 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
2589 if (!jumper) \
2590 jumper = noper_next; \
2591 if (!nextbranch) \
2592 nextbranch= regnext(cur); \
2593 } \
2594 \
2595 if ( dupe ) { \
2e64971a
DM
2596 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2597 /* chain, so that when the bits of chain are later */\
2598 /* linked together, the dups appear in the chain */\
2599 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2600 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
2601 } else { \
2602 /* we haven't inserted this word yet. */ \
2603 trie->states[ state ].wordnum = curword; \
2604 } \
2605} STMT_END
07be1b83 2606
3dab1dad 2607
786e8c11
YO
2608#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2609 ( ( base + charid >= ucharcount \
2610 && base + charid < ubound \
2611 && state == trie->trans[ base - ucharcount + charid ].check \
2612 && trie->trans[ base - ucharcount + charid ].next ) \
2613 ? trie->trans[ base - ucharcount + charid ].next \
2614 : ( state==1 ? special : 0 ) \
2615 )
3dab1dad 2616
8bcafbf4
YO
2617#define TRIE_BITMAP_SET_FOLDED(trie, uvc, folder) \
2618STMT_START { \
2619 TRIE_BITMAP_SET(trie, uvc); \
2620 /* store the folded codepoint */ \
2621 if ( folder ) \
2622 TRIE_BITMAP_SET(trie, folder[(U8) uvc ]); \
2623 \
2624 if ( !UTF ) { \
2625 /* store first byte of utf8 representation of */ \
2626 /* variant codepoints */ \
2627 if (! UVCHR_IS_INVARIANT(uvc)) { \
2628 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc)); \
2629 } \
2630 } \
2631} STMT_END
786e8c11
YO
2632#define MADE_TRIE 1
2633#define MADE_JUMP_TRIE 2
2634#define MADE_EXACT_TRIE 4
3dab1dad 2635
a3621e74 2636STATIC I32
538e84ed
KW
2637S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2638 regnode *first, regnode *last, regnode *tail,
2639 U32 word_count, U32 flags, U32 depth)
a3621e74
YO
2640{
2641 /* first pass, loop through and scan words */
2642 reg_trie_data *trie;
55eed653 2643 HV *widecharmap = NULL;
2b8b4781 2644 AV *revcharmap = newAV();
a3621e74 2645 regnode *cur;
a3621e74
YO
2646 STRLEN len = 0;
2647 UV uvc = 0;
2648 U16 curword = 0;
2649 U32 next_alloc = 0;
786e8c11
YO
2650 regnode *jumper = NULL;
2651 regnode *nextbranch = NULL;
7f69552c 2652 regnode *convert = NULL;
2e64971a 2653 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 2654 /* we just use folder as a flag in utf8 */
1e696034 2655 const U8 * folder = NULL;
a3621e74 2656
3a611511
YO
2657 /* in the below add_data call we are storing either 'tu' or 'tuaa'
2658 * which stands for one trie structure, one hash, optionally followed
2659 * by two arrays */
2b8b4781 2660#ifdef DEBUGGING
3a611511 2661 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuaa"));
2b8b4781
NC
2662 AV *trie_words = NULL;
2663 /* along with revcharmap, this only used during construction but both are
2664 * useful during debugging so we store them in the struct when debugging.
8e11feef 2665 */
2b8b4781 2666#else
cf78de0b 2667 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 2668 STRLEN trie_charcount=0;
3dab1dad 2669#endif
2b8b4781 2670 SV *re_trie_maxbuff;
a3621e74 2671 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2672
2673 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
2674#ifndef DEBUGGING
2675 PERL_UNUSED_ARG(depth);
2676#endif
a3621e74 2677
1e696034 2678 switch (flags) {
f6b4b99d 2679 case EXACT: case EXACT_ONLY8: case EXACTL: break;
89829bb5 2680 case EXACTFAA:
0ea669f4 2681 case EXACTFUP:
a4525e78
KW
2682 case EXACTFU:
2683 case EXACTFLU8: folder = PL_fold_latin1; break;
1e696034 2684 case EXACTF: folder = PL_fold; break;
fab2782b 2685 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
2686 }
2687
c944940b 2688 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 2689 trie->refcount = 1;
3dab1dad 2690 trie->startstate = 1;
786e8c11 2691 trie->wordcount = word_count;
f8fc2ecf 2692 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 2693 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
f6b4b99d 2694 if (flags == EXACT || flags == EXACT_ONLY8 || flags == EXACTL)
c944940b 2695 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
2696 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2697 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2698
a3621e74 2699 DEBUG_r({
2b8b4781 2700 trie_words = newAV();
a3621e74 2701 });
a3621e74 2702
0111c4fd 2703 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
316ebaf2 2704 assert(re_trie_maxbuff);
a3621e74 2705 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 2706 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 2707 }
df826430 2708 DEBUG_TRIE_COMPILE_r({
6ad9a8ab 2709 Perl_re_indentf( aTHX_
cb41e5d6
YO
2710 "make_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2711 depth+1,
88f063b4 2712 REG_NODE_NUM(startbranch), REG_NODE_NUM(first),
538e84ed 2713 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
3dab1dad 2714 });
538e84ed 2715
7f69552c
YO
2716 /* Find the node we are going to overwrite */
2717 if ( first == startbranch && OP( last ) != BRANCH ) {
2718 /* whole branch chain */
2719 convert = first;
2720 } else {
2721 /* branch sub-chain */
2722 convert = NEXTOPER( first );
2723 }
538e84ed 2724
a3621e74
YO
2725 /* -- First loop and Setup --
2726
2727 We first traverse the branches and scan each word to determine if it
2728 contains widechars, and how many unique chars there are, this is
2729 important as we have to build a table with at least as many columns as we
2730 have unique chars.
2731
2732 We use an array of integers to represent the character codes 0..255
538e84ed
KW
2733 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2734 the native representation of the character value as the key and IV's for
2735 the coded index.
a3621e74
YO
2736
2737 *TODO* If we keep track of how many times each character is used we can
2738 remap the columns so that the table compression later on is more
3b753521 2739 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
2740 middle and the least common are on the outside. IMO this would be better
2741 than a most to least common mapping as theres a decent chance the most
2742 common letter will share a node with the least common, meaning the node
486ec47a 2743 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
2744 case is when we have the least common nodes twice.
2745
2746 */
2747
a3621e74 2748 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 2749 regnode *noper = NEXTOPER( cur );
944e05e3
YO
2750 const U8 *uc;
2751 const U8 *e;
bc031a7d 2752 int foldlen = 0;
07be1b83 2753 U32 wordlen = 0; /* required init */
bc031a7d
KW
2754 STRLEN minchars = 0;
2755 STRLEN maxchars = 0;
538e84ed
KW
2756 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2757 bitmap?*/
a3621e74 2758
3dab1dad 2759 if (OP(noper) == NOTHING) {
20ed8c88
YO
2760 /* skip past a NOTHING at the start of an alternation
2761 * eg, /(?:)a|(?:b)/ should be the same as /a|b/
2762 */
df826430 2763 regnode *noper_next= regnext(noper);
944e05e3
YO
2764 if (noper_next < tail)
2765 noper= noper_next;
2766 }
2767
f6b4b99d
KW
2768 if ( noper < tail
2769 && ( OP(noper) == flags
2770 || (flags == EXACT && OP(noper) == EXACT_ONLY8)
a9f8c7ac 2771 || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
0ea669f4 2772 || OP(noper) == EXACTFUP))))
f6b4b99d 2773 {
944e05e3
YO
2774 uc= (U8*)STRING(noper);
2775 e= uc + STR_LEN(noper);
2776 } else {
2777 trie->minlen= 0;
2778 continue;
3dab1dad 2779 }
df826430 2780
944e05e3 2781
fab2782b 2782 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2783 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2784 regardless of encoding */
0ea669f4 2785 if (OP( noper ) == EXACTFUP) {
fab2782b 2786 /* false positives are ok, so just set this */
0dc4a61d 2787 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2788 }
2789 }
dca5fc4c 2790
bc031a7d
KW
2791 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2792 branch */
3dab1dad 2793 TRIE_CHARCOUNT(trie)++;
a3621e74 2794 TRIE_READ_CHAR;
645de4ce 2795
bc031a7d
KW
2796 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2797 * is in effect. Under /i, this character can match itself, or
2798 * anything that folds to it. If not under /i, it can match just
2799 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2800 * all fold to k, and all are single characters. But some folds
2801 * expand to more than one character, so for example LATIN SMALL
2802 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2803 * the string beginning at 'uc' is 'ffi', it could be matched by
2804 * three characters, or just by the one ligature character. (It
2805 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2806 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2807 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2808 * match.) The trie needs to know the minimum and maximum number
2809 * of characters that could match so that it can use size alone to
2810 * quickly reject many match attempts. The max is simple: it is
2811 * the number of folded characters in this branch (since a fold is
2812 * never shorter than what folds to it. */
2813
2814 maxchars++;
2815
2816 /* And the min is equal to the max if not under /i (indicated by
2817 * 'folder' being NULL), or there are no multi-character folds. If
2818 * there is a multi-character fold, the min is incremented just
2819 * once, for the character that folds to the sequence. Each
2820 * character in the sequence needs to be added to the list below of
2821 * characters in the trie, but we count only the first towards the
2822 * min number of characters needed. This is done through the
2823 * variable 'foldlen', which is returned by the macros that look
2824 * for these sequences as the number of bytes the sequence
2825 * occupies. Each time through the loop, we decrement 'foldlen' by
2826 * how many bytes the current char occupies. Only when it reaches
2827 * 0 do we increment 'minchars' or look for another multi-character
2828 * sequence. */
2829 if (folder == NULL) {
2830 minchars++;
2831 }
2832 else if (foldlen > 0) {
2833 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
645de4ce
KW
2834 }
2835 else {
bc031a7d
KW
2836 minchars++;
2837
2838 /* See if *uc is the beginning of a multi-character fold. If
2839 * so, we decrement the length remaining to look at, to account
2840 * for the current character this iteration. (We can use 'uc'
2841 * instead of the fold returned by TRIE_READ_CHAR because for
2842 * non-UTF, the latin1_safe macro is smart enough to account
2843 * for all the unfolded characters, and because for UTF, the
2844 * string will already have been folded earlier in the
2845 * compilation process */
2846 if (UTF) {
2847 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2848 foldlen -= UTF8SKIP(uc);
645de4ce
KW
2849 }
2850 }
bc031a7d
KW
2851 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2852 foldlen--;
2853 }
645de4ce 2854 }
bc031a7d
KW
2855
2856 /* The current character (and any potential folds) should be added
2857 * to the possible matching characters for this position in this
2858 * branch */
a3621e74 2859 if ( uvc < 256 ) {
fab2782b
YO
2860 if ( folder ) {
2861 U8 folded= folder[ (U8) uvc ];
2862 if ( !trie->charmap[ folded ] ) {
2863 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2864 TRIE_STORE_REVCHAR( folded );
2865 }
2866 }
a3621e74
YO
2867 if ( !trie->charmap[ uvc ] ) {
2868 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2869 TRIE_STORE_REVCHAR( uvc );
a3621e74 2870 }
02daf0ab 2871 if ( set_bit ) {
62012aee
KW
2872 /* store the codepoint in the bitmap, and its folded
2873 * equivalent. */
8bcafbf4 2874 TRIE_BITMAP_SET_FOLDED(trie, uvc, folder);
02daf0ab
YO
2875 set_bit = 0; /* We've done our bit :-) */
2876 }
a3621e74 2877 } else {
bc031a7d
KW
2878
2879 /* XXX We could come up with the list of code points that fold
2880 * to this using PL_utf8_foldclosures, except not for
2881 * multi-char folds, as there may be multiple combinations
2882 * there that could work, which needs to wait until runtime to
2883 * resolve (The comment about LIGATURE FFI above is such an
2884 * example */
2885
a3621e74 2886 SV** svpp;
55eed653
NC
2887 if ( !widecharmap )
2888 widecharmap = newHV();
a3621e74 2889
55eed653 2890 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
2891
2892 if ( !svpp )
147e3846 2893 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%" UVXf, uvc );
a3621e74
YO
2894
2895 if ( !SvTRUE( *svpp ) ) {
2896 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 2897 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
2898 }
2899 }
bc031a7d
KW
2900 } /* end loop through characters in this branch of the trie */
2901
2902 /* We take the min and max for this branch and combine to find the min
2903 * and max for all branches processed so far */
3dab1dad 2904 if( cur == first ) {
bc031a7d
KW
2905 trie->minlen = minchars;
2906 trie->maxlen = maxchars;
2907 } else if (minchars < trie->minlen) {
2908 trie->minlen = minchars;
2909 } else if (maxchars > trie->maxlen) {
2910 trie->maxlen = maxchars;
fab2782b 2911 }
a3621e74
YO
2912 } /* end first pass */
2913 DEBUG_TRIE_COMPILE_r(
6ad9a8ab 2914 Perl_re_indentf( aTHX_
cb41e5d6
YO
2915 "TRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2916 depth+1,
55eed653 2917 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
2918 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2919 (int)trie->minlen, (int)trie->maxlen )
a3621e74 2920 );
a3621e74
YO
2921
2922 /*
2923 We now know what we are dealing with in terms of unique chars and
2924 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
2925 representation using a flat table will take. If it's over a reasonable
2926 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
2927 conservative but potentially much slower representation using an array
2928 of lists.
2929
2930 At the end we convert both representations into the same compressed
2931 form that will be used in regexec.c for matching with. The latter
2932 is a form that cannot be used to construct with but has memory
2933 properties similar to the list form and access properties similar
2934 to the table form making it both suitable for fast searches and
2935 small enough that its feasable to store for the duration of a program.
2936
2937 See the comment in the code where the compressed table is produced
2938 inplace from the flat tabe representation for an explanation of how
2939 the compression works.
2940
2941 */
2942
2943
2e64971a
DM
2944 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2945 prev_states[1] = 0;
2946
538e84ed
KW
2947 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2948 > SvIV(re_trie_maxbuff) )
2949 {
a3621e74
YO
2950 /*
2951 Second Pass -- Array Of Lists Representation
2952
2953 Each state will be represented by a list of charid:state records
2954 (reg_trie_trans_le) the first such element holds the CUR and LEN
2955 points of the allocated array. (See defines above).
2956
2957 We build the initial structure using the lists, and then convert
2958 it into the compressed table form which allows faster lookups
2959 (but cant be modified once converted).
a3621e74
YO
2960 */
2961
a3621e74
YO
2962 STRLEN transcount = 1;
2963
6ad9a8ab 2964 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using list compiler\n",
cb41e5d6 2965 depth+1));
686b73d4 2966
c944940b
JH
2967 trie->states = (reg_trie_state *)
2968 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2969 sizeof(reg_trie_state) );
a3621e74
YO
2970 TRIE_LIST_NEW(1);
2971 next_alloc = 2;
2972
2973 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2974
df826430 2975 regnode *noper = NEXTOPER( cur );
c445ea15
AL
2976 U32 state = 1; /* required init */
2977 U16 charid = 0; /* sanity init */
07be1b83 2978 U32 wordlen = 0; /* required init */
c445ea15 2979
df826430
YO
2980 if (OP(noper) == NOTHING) {
2981 regnode *noper_next= regnext(noper);
944e05e3
YO
2982 if (noper_next < tail)
2983 noper= noper_next;
df826430
YO
2984 }
2985
f6b4b99d
KW
2986 if ( noper < tail
2987 && ( OP(noper) == flags
2988 || (flags == EXACT && OP(noper) == EXACT_ONLY8)
a9f8c7ac 2989 || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
0ea669f4 2990 || OP(noper) == EXACTFUP))))
f6b4b99d 2991 {
944e05e3
YO
2992 const U8 *uc= (U8*)STRING(noper);
2993 const U8 *e= uc + STR_LEN(noper);
2994
786e8c11 2995 for ( ; uc < e ; uc += len ) {
c445ea15 2996
786e8c11 2997 TRIE_READ_CHAR;
c445ea15 2998
786e8c11
YO
2999 if ( uvc < 256 ) {
3000 charid = trie->charmap[ uvc ];
c445ea15 3001 } else {
538e84ed
KW
3002 SV** const svpp = hv_fetch( widecharmap,
3003 (char*)&uvc,
3004 sizeof( UV ),
3005 0);
786e8c11
YO
3006 if ( !svpp ) {
3007 charid = 0;
3008 } else {
3009 charid=(U16)SvIV( *svpp );
3010 }
c445ea15 3011 }
538e84ed
KW
3012 /* charid is now 0 if we dont know the char read, or
3013 * nonzero if we do */
786e8c11 3014 if ( charid ) {
a3621e74 3015
786e8c11
YO
3016 U16 check;
3017 U32 newstate = 0;
a3621e74 3018
786e8c11
YO
3019 charid--;
3020 if ( !trie->states[ state ].trans.list ) {
3021 TRIE_LIST_NEW( state );
c445ea15 3022 }
538e84ed
KW
3023 for ( check = 1;
3024 check <= TRIE_LIST_USED( state );
3025 check++ )
3026 {
3027 if ( TRIE_LIST_ITEM( state, check ).forid
3028 == charid )
3029 {
786e8c11
YO
3030 newstate = TRIE_LIST_ITEM( state, check ).newstate;
3031 break;
3032 }
3033 }
3034 if ( ! newstate ) {
3035 newstate = next_alloc++;
2e64971a 3036 prev_states[newstate] = state;
786e8c11
YO
3037 TRIE_LIST_PUSH( state, charid, newstate );
3038 transcount++;
3039 }
3040 state = newstate;
3041 } else {
147e3846 3042 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
c445ea15 3043 }
a28509cc 3044 }
c445ea15 3045 }
3dab1dad 3046 TRIE_HANDLE_WORD(state);
a3621e74
YO
3047
3048 } /* end second pass */
3049
1e2e3d02 3050 /* next alloc is the NEXT state to be allocated */
538e84ed 3051 trie->statecount = next_alloc;
c944940b
JH
3052 trie->states = (reg_trie_state *)
3053 PerlMemShared_realloc( trie->states,
3054 next_alloc
3055 * sizeof(reg_trie_state) );
a3621e74 3056
3dab1dad 3057 /* and now dump it out before we compress it */
2b8b4781
NC
3058 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
3059 revcharmap, next_alloc,
3060 depth+1)
1e2e3d02 3061 );
a3621e74 3062
c944940b
JH
3063 trie->trans = (reg_trie_trans *)
3064 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
3065 {
3066 U32 state;
a3621e74
YO
3067 U32 tp = 0;
3068 U32 zp = 0;
3069
3070
3071 for( state=1 ; state < next_alloc ; state ++ ) {
3072 U32 base=0;
3073
3074 /*
3075 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3076 Perl_re_printf( aTHX_ "tp: %d zp: %d ",tp,zp)
a3621e74
YO
3077 );
3078 */
3079
3080 if (trie->states[state].trans.list) {
3081 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
3082 U16 maxid=minid;
a28509cc 3083 U16 idx;
a3621e74
YO
3084
3085 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
3086 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
3087 if ( forid < minid ) {
3088 minid=forid;
3089 } else if ( forid > maxid ) {
3090 maxid=forid;
3091 }
a3621e74
YO
3092 }
3093 if ( transcount < tp + maxid - minid + 1) {
3094 transcount *= 2;
c944940b
JH
3095 trie->trans = (reg_trie_trans *)
3096 PerlMemShared_realloc( trie->trans,
446bd890
NC
3097 transcount
3098 * sizeof(reg_trie_trans) );
538e84ed
KW
3099 Zero( trie->trans + (transcount / 2),
3100 transcount / 2,
3101 reg_trie_trans );
a3621e74
YO
3102 }
3103 base = trie->uniquecharcount + tp - minid;
3104 if ( maxid == minid ) {
3105 U32 set = 0;
3106 for ( ; zp < tp ; zp++ ) {
3107 if ( ! trie->trans[ zp ].next ) {
3108 base = trie->uniquecharcount + zp - minid;
538e84ed
KW
3109 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
3110 1).newstate;
a3621e74
YO
3111 trie->trans[ zp ].check = state;
3112 set = 1;
3113 break;
3114 }
3115 }
3116 if ( !set ) {
538e84ed
KW
3117 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
3118 1).newstate;
a3621e74
YO
3119 trie->trans[ tp ].check = state;
3120 tp++;
3121 zp = tp;
3122 }
3123 } else {
3124 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
538e84ed
KW
3125 const U32 tid = base
3126 - trie->uniquecharcount
3127 + TRIE_LIST_ITEM( state, idx ).forid;
3128 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
3129 idx ).newstate;
a3621e74
YO
3130 trie->trans[ tid ].check = state;
3131 }
3132 tp += ( maxid - minid + 1 );
3133 }
3134 Safefree(trie->states[ state ].trans.list);
3135 }
3136 /*
3137 DEBUG_TRIE_COMPILE_MORE_r(
6ad9a8ab 3138 Perl_re_printf( aTHX_ " base: %d\n",base);
a3621e74
YO
3139 );
3140 */
3141 trie->states[ state ].trans.base=base;
3142 }
cc601c31 3143 trie->lasttrans = tp + 1;
a3621e74
YO
3144 }
3145 } else {
3146 /*
3147 Second Pass -- Flat Table Representation.
3148
b423522f
KW
3149 we dont use the 0 slot of either trans[] or states[] so we add 1 to
3150 each. We know that we will need Charcount+1 trans at most to store
3151 the data (one row per char at worst case) So we preallocate both
3152 structures assuming worst case.
a3621e74
YO
3153
3154 We then construct the trie using only the .next slots of the entry
3155 structs.
3156
b423522f
KW
3157 We use the .check field of the first entry of the node temporarily
3158 to make compression both faster and easier by keeping track of how
3159 many non zero fields are in the node.
a3621e74
YO
3160
3161 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
3162 transition.
3163
b423522f
KW
3164 There are two terms at use here: state as a TRIE_NODEIDX() which is
3165 a number representing the first entry of the node, and state as a
3166 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
3167 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
3168 if there are 2 entrys per node. eg:
a3621e74
YO
3169
3170 A B A B
3171 1. 2 4 1. 3 7
3172 2. 0 3 3. 0 5
3173 3. 0 0 5. 0 0
3174 4. 0 0 7. 0 0
3175
b423522f
KW
3176 The table is internally in the right hand, idx form. However as we
3177 also have to deal with the states array which is indexed by nodenum
3178 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
3179
3180 */
6ad9a8ab 3181 DEBUG_TRIE_COMPILE_MORE_r( Perl_re_indentf( aTHX_ "Compiling trie using table compiler\n",
cb41e5d6 3182 depth+1));
3dab1dad 3183
c944940b
JH
3184 trie->trans = (reg_trie_trans *)
3185 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
3186 * trie->uniquecharcount + 1,
3187 sizeof(reg_trie_trans) );
3188 trie->states = (reg_trie_state *)
3189 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
3190 sizeof(reg_trie_state) );
a3621e74
YO
3191 next_alloc = trie->uniquecharcount + 1;
3192
3dab1dad 3193
a3621e74
YO
3194 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
3195
df826430 3196 regnode *noper = NEXTOPER( cur );
a3621e74
YO
3197
3198 U32 state = 1; /* required init */
3199
3200 U16 charid = 0; /* sanity init */
3201 U32 accept_state = 0; /* sanity init */
a3621e74 3202
07be1b83 3203 U32 wordlen = 0; /* required init */
a3621e74 3204
df826430
YO
3205 if (OP(noper) == NOTHING) {
3206 regnode *noper_next= regnext(noper);
944e05e3
YO
3207 if (noper_next < tail)
3208 noper= noper_next;
df826430 3209 }
fab2782b 3210
f6b4b99d
KW
3211 if ( noper < tail
3212 && ( OP(noper) == flags
3213 || (flags == EXACT && OP(noper) == EXACT_ONLY8)
a9f8c7ac 3214 || (flags == EXACTFU && ( OP(noper) == EXACTFU_ONLY8
0ea669f4 3215 || OP(noper) == EXACTFUP))))
f6b4b99d 3216 {
944e05e3
YO
3217 const U8 *uc= (U8*)STRING(noper);
3218 const U8 *e= uc + STR_LEN(noper);
3219
786e8c11 3220 for ( ; uc < e ; uc += len ) {
a3621e74 3221
786e8c11 3222 TRIE_READ_CHAR;
a3621e74 3223
786e8c11
YO
3224 if ( uvc < 256 ) {
3225 charid = trie->charmap[ uvc ];
3226 } else {
538e84ed
KW
3227 SV* const * const svpp = hv_fetch( widecharmap,
3228 (char*)&uvc,
3229 sizeof( UV ),
3230 0);
786e8c11 3231 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 3232 }
786e8c11
YO
3233 if ( charid ) {
3234 charid--;
3235 if ( !trie->trans[ state + charid ].next ) {
3236 trie->trans[ state + charid ].next = next_alloc;
3237 trie->trans[ state ].check++;
2e64971a
DM
3238 prev_states[TRIE_NODENUM(next_alloc)]
3239 = TRIE_NODENUM(state);
786e8c11
YO
3240 next_alloc += trie->uniquecharcount;
3241 }
3242 state = trie->trans[ state + charid ].next;
3243 } else {
147e3846 3244 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %" IVdf, uvc );
786e8c11 3245 }
538e84ed
KW
3246 /* charid is now 0 if we dont know the char read, or
3247 * nonzero if we do */
a3621e74 3248 }
a3621e74 3249 }
3dab1dad
YO
3250 accept_state = TRIE_NODENUM( state );
3251 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
3252
3253 } /* end second pass */
3254
3dab1dad 3255 /* and now dump it out before we compress it */
2b8b4781
NC
3256 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
3257 revcharmap,
3258 next_alloc, depth+1));
a3621e74 3259
a3621e74
YO
3260 {
3261 /*
3262 * Inplace compress the table.*
3263
3264 For sparse data sets the table constructed by the trie algorithm will
3265 be mostly 0/FAIL transitions or to put it another way mostly empty.
3266 (Note that leaf nodes will not contain any transitions.)
3267
3268 This algorithm compresses the tables by eliminating most such
3269 transitions, at the cost of a modest bit of extra work during lookup:
3270
3271 - Each states[] entry contains a .base field which indicates the
3272 index in the state[] array wheres its transition data is stored.
3273
3b753521 3274 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
3275
3276 - If .base is nonzero then charid is added to it to find an entry in
3277 the trans array.
3278
3279 -If trans[states[state].base+charid].check!=state then the
3280 transition is taken to be a 0/Fail transition. Thus if there are fail
3281 transitions at the front of the node then the .base offset will point
3282 somewhere inside the previous nodes data (or maybe even into a node
3283 even earlier), but the .check field determines if the transition is
3284 valid.
3285
786e8c11 3286 XXX - wrong maybe?
a3621e74 3287 The following process inplace converts the table to the compressed
3b753521 3288 table: We first do not compress the root node 1,and mark all its
a3621e74 3289 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
3290 allows us to do a DFA construction from the compressed table later,
3291 and ensures that any .base pointers we calculate later are greater
3292 than 0.
a3621e74
YO
3293
3294 - We set 'pos' to indicate the first entry of the second node.
3295
3296 - We then iterate over the columns of the node, finding the first and
3297 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
3298 and set the .check pointers accordingly, and advance pos
3299 appropriately and repreat for the next node. Note that when we copy
3300 the next pointers we have to convert them from the original
3301 NODEIDX form to NODENUM form as the former is not valid post
3302 compression.
3303
3304 - If a node has no transitions used we mark its base as 0 and do not
3305 advance the pos pointer.
3306
3307 - If a node only has one transition we use a second pointer into the
3308 structure to fill in allocated fail transitions from other states.
3309 This pointer is independent of the main pointer and scans forward
3310 looking for null transitions that are allocated to a state. When it
3311 finds one it writes the single transition into the "hole". If the
786e8c11 3312 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
3313
3314 - Once compressed we can Renew/realloc the structures to release the
3315 excess space.
3316
3317 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
3318 specifically Fig 3.47 and the associated pseudocode.
3319
3320 demq
3321 */
a3b680e6 3322 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 3323 U32 state, charid;
a3621e74 3324 U32 pos = 0, zp=0;
1e2e3d02 3325 trie->statecount = laststate;
a3621e74
YO
3326
3327 for ( state = 1 ; state < laststate ; state++ ) {
3328 U8 flag = 0;
a28509cc
AL
3329 const U32 stateidx = TRIE_NODEIDX( state );
3330 const U32 o_used = trie->trans[ stateidx ].check;
3331 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
3332 trie->trans[ stateidx ].check = 0;
3333
538e84ed
KW
3334 for ( charid = 0;
3335 used && charid < trie->uniquecharcount;
3336 charid++ )
3337 {
a3621e74
YO
3338 if ( flag || trie->trans[ stateidx + charid ].next ) {
3339 if ( trie->trans[ stateidx + charid ].next ) {
3340 if (o_used == 1) {
3341 for ( ; zp < pos ; zp++ ) {
3342 if ( ! trie->trans[ zp ].next ) {
3343 break;
3344 }
3345 }
538e84ed
KW
3346 trie->states[ state ].trans.base
3347 = zp
3348 + trie->uniquecharcount
3349 - charid ;
3350 trie->trans[ zp ].next
3351 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
3352 + charid ].next );
a3621e74
YO
3353 trie->trans[ zp ].check = state;
3354 if ( ++zp > pos ) pos = zp;
3355 break;
3356 }
3357 used--;
3358 }
3359 if ( !flag ) {
3360 flag = 1;
538e84ed
KW
3361 trie->states[ state ].trans.base
3362 = pos + trie->uniquecharcount - charid ;
a3621e74 3363 }
538e84ed
KW
3364 trie->trans[ pos ].next
3365 = SAFE_TRIE_NODENUM(
3366 trie->trans[ stateidx + charid ].next );
a3621e74
YO
3367 trie->trans[ pos ].check = state;
3368 pos++;
3369 }
3370 }
3371 }
cc601c31 3372 trie->lasttrans = pos + 1;
c944940b
JH
3373 trie->states = (reg_trie_state *)
3374 PerlMemShared_realloc( trie->states, laststate
3375 * sizeof(reg_trie_state) );
a3621e74 3376 DEBUG_TRIE_COMPILE_MORE_r(
147e3846 3377 Perl_re_indentf( aTHX_ "Alloc: %d Orig: %" IVdf " elements, Final:%" IVdf ". Savings of %%%5.2f\n",
cb41e5d6 3378 depth+1,
538e84ed
KW
3379 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
3380 + 1 ),
3381 (IV)next_alloc,
3382 (IV)pos,
3383 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
a3621e74
YO
3384 );
3385
3386 } /* end table compress */
3387 }
1e2e3d02 3388 DEBUG_TRIE_COMPILE_MORE_r(
147e3846 3389 Perl_re_indentf( aTHX_ "Statecount:%" UVxf " Lasttrans:%" UVxf "\n",
cb41e5d6 3390 depth+1,
1e2e3d02
YO
3391 (UV)trie->statecount,
3392 (UV)trie->lasttrans)
3393 );
cc601c31 3394 /* resize the trans array to remove unused space */
c944940b
JH
3395 trie->trans = (reg_trie_trans *)
3396 PerlMemShared_realloc( trie->trans, trie->lasttrans
3397 * sizeof(reg_trie_trans) );
a3621e74 3398
538e84ed 3399 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
3400 U8 nodetype =(U8)(flags & 0xFF);
3401 char *str=NULL;
538e84ed 3402
07be1b83 3403#ifdef DEBUGGING
e62cc96a 3404 regnode *optimize = NULL;
7122b237
YO
3405#ifdef RE_TRACK_PATTERN_OFFSETS
3406
b57a0404
JH
3407 U32 mjd_offset = 0;
3408 U32 mjd_nodelen = 0;
7122b237
YO
3409#endif /* RE_TRACK_PATTERN_OFFSETS */
3410#endif /* DEBUGGING */
a3621e74 3411 /*
3dab1dad
YO
3412 This means we convert either the first branch or the first Exact,
3413 depending on whether the thing following (in 'last') is a branch
3414 or not and whther first is the startbranch (ie is it a sub part of
3415 the alternation or is it the whole thing.)
3b753521 3416 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 3417 the whole branch sequence, including the first.
a3621e74 3418 */
3dab1dad 3419 /* Find the node we are going to overwrite */
7f69552c 3420 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 3421 /* branch sub-chain */
3dab1dad 3422 NEXT_OFF( first ) = (U16)(last - first);
7122b237 3423#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
3424 DEBUG_r({
3425 mjd_offset= Node_Offset((convert));
3426 mjd_nodelen=