This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Simplify expression
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
afda64e8 84EXTERN_C const struct regexp_engine my_reg_engine;
54df2634
NC
85#else
86# include "regcomp.h"
87#endif
a687059c 88
04e98a4d 89#include "dquote_static.c"
26faadbd 90#include "charclass_invlists.h"
81e983c1 91#include "inline_invlist.c"
1b0f46bf 92#include "unicode_constants.h"
04e98a4d 93
538e84ed
KW
94#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
95 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 96#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 97#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 98
a687059c
LW
99#ifndef STATIC
100#define STATIC static
101#endif
102
b1603ef8 103
09b2b2e6 104struct RExC_state_t {
514a91f1
DM
105 U32 flags; /* RXf_* are we folding, multilining? */
106 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 107 char *precomp; /* uncompiled string. */
288b8c02 108 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf 109 regexp *rx; /* perl core regexp structure */
538e84ed
KW
110 regexp_internal *rxi; /* internal data for regexp object
111 pprivate field */
fac92740 112 char *start; /* Start of input for compile */
830247a4
IZ
113 char *end; /* End of input for compile */
114 char *parse; /* Input-scan pointer. */
ea3daa5d 115 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 116 regnode *emit_start; /* Start of emitted-code area */
538e84ed
KW
117 regnode *emit_bound; /* First regnode outside of the
118 allocated space */
f7c7e32a
DM
119 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
120 implies compiling, so don't emit */
9a81a976
KW
121 regnode_ssc emit_dummy; /* placeholder for emit to point to;
122 large enough for the largest
123 non-EXACTish node, so can use it as
124 scratch in pass1 */
830247a4
IZ
125 I32 naughty; /* How bad is this pattern? */
126 I32 sawback; /* Did we see \1, ...? */
127 U32 seen;
ea3daa5d 128 SSize_t size; /* Code size. */
538e84ed
KW
129 I32 npar; /* Capture buffer count, (OPEN) plus
130 one. ("par" 0 is the whole
131 pattern)*/
132 I32 nestroot; /* root parens we are in - used by
133 accept */
830247a4
IZ
134 I32 extralen;
135 I32 seen_zerolen;
40d049e4
YO
136 regnode **open_parens; /* pointers to open parens */
137 regnode **close_parens; /* pointers to close parens */
138 regnode *opend; /* END node in program */
02daf0ab
YO
139 I32 utf8; /* whether the pattern is utf8 or not */
140 I32 orig_utf8; /* whether the pattern was originally in utf8 */
141 /* XXX use this for future optimisation of case
142 * where pattern must be upgraded to utf8. */
e40e74fe
KW
143 I32 uni_semantics; /* If a d charset modifier should use unicode
144 rules, even if the pattern is not in
145 utf8 */
81714fb9 146 HV *paren_names; /* Paren names */
538e84ed 147
40d049e4
YO
148 regnode **recurse; /* Recurse regops */
149 I32 recurse_count; /* Number of recurse regops */
538e84ed
KW
150 U8 *study_chunk_recursed; /* bitmap of which parens we have moved
151 through */
09a65838 152 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 153 I32 in_lookbehind;
4624b182 154 I32 contains_locale;
cfafade5 155 I32 contains_i;
bb3f3ed2 156 I32 override_recoding;
9d53c457 157 I32 in_multi_char_class;
3d2bd50a 158 struct reg_code_block *code_blocks; /* positions of literal (?{})
68e2671b 159 within pattern */
b1603ef8
DM
160 int num_code_blocks; /* size of code_blocks[] */
161 int code_index; /* next code_blocks[] slot */
830247a4
IZ
162#if ADD_TO_REGEXEC
163 char *starttry; /* -Dr: where regtry was called. */
164#define RExC_starttry (pRExC_state->starttry)
165#endif
d24ca0c5 166 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 167#ifdef DEBUGGING
be8e71aa 168 const char *lastparse;
3dab1dad 169 I32 lastnum;
1f1031fe 170 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
171#define RExC_lastparse (pRExC_state->lastparse)
172#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 173#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 174#endif
09b2b2e6 175};
830247a4 176
e2509266 177#define RExC_flags (pRExC_state->flags)
514a91f1 178#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 179#define RExC_precomp (pRExC_state->precomp)
288b8c02 180#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 181#define RExC_rx (pRExC_state->rx)
f8fc2ecf 182#define RExC_rxi (pRExC_state->rxi)
fac92740 183#define RExC_start (pRExC_state->start)
830247a4
IZ
184#define RExC_end (pRExC_state->end)
185#define RExC_parse (pRExC_state->parse)
186#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237 187#ifdef RE_TRACK_PATTERN_OFFSETS
538e84ed
KW
188#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
189 others */
7122b237 190#endif
830247a4 191#define RExC_emit (pRExC_state->emit)
f7c7e32a 192#define RExC_emit_dummy (pRExC_state->emit_dummy)
fac92740 193#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 194#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
195#define RExC_naughty (pRExC_state->naughty)
196#define RExC_sawback (pRExC_state->sawback)
197#define RExC_seen (pRExC_state->seen)
198#define RExC_size (pRExC_state->size)
199#define RExC_npar (pRExC_state->npar)
e2e6a0f1 200#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
201#define RExC_extralen (pRExC_state->extralen)
202#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 203#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 204#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 205#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
206#define RExC_open_parens (pRExC_state->open_parens)
207#define RExC_close_parens (pRExC_state->close_parens)
208#define RExC_opend (pRExC_state->opend)
81714fb9 209#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
210#define RExC_recurse (pRExC_state->recurse)
211#define RExC_recurse_count (pRExC_state->recurse_count)
09a65838 212#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
538e84ed
KW
213#define RExC_study_chunk_recursed_bytes \
214 (pRExC_state->study_chunk_recursed_bytes)
b57e4118 215#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 216#define RExC_contains_locale (pRExC_state->contains_locale)
cfafade5 217#define RExC_contains_i (pRExC_state->contains_i)
9d53c457
KW
218#define RExC_override_recoding (pRExC_state->override_recoding)
219#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
830247a4 220
cde0cee5 221
a687059c
LW
222#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
223#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
4d68ffa0 224 ((*s) == '{' && regcurly(s, FALSE)))
a687059c
LW
225
226/*
227 * Flags to be passed up and down.
228 */
a687059c 229#define WORST 0 /* Worst case. */
a3b492c3 230#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee 231
e64f369d 232/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
233 * character. (There needs to be a case: in the switch statement in regexec.c
234 * for any node marked SIMPLE.) Note that this is not the same thing as
235 * REGNODE_SIMPLE */
fda99bee 236#define SIMPLE 0x02
e64f369d 237#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
238#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
239#define TRYAGAIN 0x10 /* Weeded out a declaration. */
240#define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
a687059c 241
3dab1dad
YO
242#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
243
07be1b83
YO
244/* whether trie related optimizations are enabled */
245#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
246#define TRIE_STUDY_OPT
786e8c11 247#define FULL_TRIE_STUDY
07be1b83
YO
248#define TRIE_STCLASS
249#endif
1de06328
YO
250
251
40d049e4
YO
252
253#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
254#define PBITVAL(paren) (1 << ((paren) & 7))
255#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
256#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
257#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
258
bbd61b5f 259#define REQUIRE_UTF8 STMT_START { \
8d9c2815
NC
260 if (!UTF) { \
261 *flagp = RESTART_UTF8; \
262 return NULL; \
263 } \
bbd61b5f 264 } STMT_END
40d049e4 265
f19b1a63
KW
266/* This converts the named class defined in regcomp.h to its equivalent class
267 * number defined in handy.h. */
268#define namedclass_to_classnum(class) ((int) ((class) / 2))
269#define classnum_to_namedclass(classnum) ((classnum) * 2)
270
de92f5e6
KW
271#define _invlist_union_complement_2nd(a, b, output) \
272 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
273#define _invlist_intersection_complement_2nd(a, b, output) \
274 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
275
1de06328
YO
276/* About scan_data_t.
277
278 During optimisation we recurse through the regexp program performing
279 various inplace (keyhole style) optimisations. In addition study_chunk
280 and scan_commit populate this data structure with information about
538e84ed 281 what strings MUST appear in the pattern. We look for the longest
3b753521 282 string that must appear at a fixed location, and we look for the
1de06328
YO
283 longest string that may appear at a floating location. So for instance
284 in the pattern:
538e84ed 285
1de06328 286 /FOO[xX]A.*B[xX]BAR/
538e84ed 287
1de06328
YO
288 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
289 strings (because they follow a .* construct). study_chunk will identify
290 both FOO and BAR as being the longest fixed and floating strings respectively.
538e84ed 291
1de06328 292 The strings can be composites, for instance
538e84ed 293
1de06328 294 /(f)(o)(o)/
538e84ed 295
1de06328 296 will result in a composite fixed substring 'foo'.
538e84ed 297
1de06328 298 For each string some basic information is maintained:
538e84ed 299
1de06328
YO
300 - offset or min_offset
301 This is the position the string must appear at, or not before.
302 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
303 characters must match before the string we are searching for.
304 Likewise when combined with minlenp and the length of the string it
538e84ed 305 tells us how many characters must appear after the string we have
1de06328 306 found.
538e84ed 307
1de06328
YO
308 - max_offset
309 Only used for floating strings. This is the rightmost point that
ea3daa5d 310 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328 311 string can occur infinitely far to the right.
538e84ed 312
1de06328 313 - minlenp
2d608413
KW
314 A pointer to the minimum number of characters of the pattern that the
315 string was found inside. This is important as in the case of positive
538e84ed 316 lookahead or positive lookbehind we can have multiple patterns
1de06328 317 involved. Consider
538e84ed 318
1de06328 319 /(?=FOO).*F/
538e84ed 320
1de06328
YO
321 The minimum length of the pattern overall is 3, the minimum length
322 of the lookahead part is 3, but the minimum length of the part that
538e84ed 323 will actually match is 1. So 'FOO's minimum length is 3, but the
1de06328 324 minimum length for the F is 1. This is important as the minimum length
538e84ed 325 is used to determine offsets in front of and behind the string being
1de06328 326 looked for. Since strings can be composites this is the length of the
486ec47a 327 pattern at the time it was committed with a scan_commit. Note that
1de06328 328 the length is calculated by study_chunk, so that the minimum lengths
538e84ed 329 are not known until the full pattern has been compiled, thus the
1de06328 330 pointer to the value.
538e84ed 331
1de06328 332 - lookbehind
538e84ed 333
1de06328 334 In the case of lookbehind the string being searched for can be
538e84ed 335 offset past the start point of the final matching string.
1de06328
YO
336 If this value was just blithely removed from the min_offset it would
337 invalidate some of the calculations for how many chars must match
338 before or after (as they are derived from min_offset and minlen and
538e84ed 339 the length of the string being searched for).
1de06328
YO
340 When the final pattern is compiled and the data is moved from the
341 scan_data_t structure into the regexp structure the information
538e84ed
KW
342 about lookbehind is factored in, with the information that would
343 have been lost precalculated in the end_shift field for the
1de06328
YO
344 associated string.
345
346 The fields pos_min and pos_delta are used to store the minimum offset
538e84ed 347 and the delta to the maximum offset at the current point in the pattern.
1de06328
YO
348
349*/
2c2d71f5
JH
350
351typedef struct scan_data_t {
1de06328
YO
352 /*I32 len_min; unused */
353 /*I32 len_delta; unused */
49f55535 354 SSize_t pos_min;
ea3daa5d 355 SSize_t pos_delta;
2c2d71f5 356 SV *last_found;
ea3daa5d 357 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 358 SSize_t last_start_min;
ea3daa5d 359 SSize_t last_start_max;
1de06328
YO
360 SV **longest; /* Either &l_fixed, or &l_float. */
361 SV *longest_fixed; /* longest fixed string found in pattern */
49f55535 362 SSize_t offset_fixed; /* offset where it starts */
ea3daa5d 363 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
364 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
365 SV *longest_float; /* longest floating string found in pattern */
49f55535 366 SSize_t offset_float_min; /* earliest point in string it can appear */
ea3daa5d
FC
367 SSize_t offset_float_max; /* latest point in string it can appear */
368 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
49f55535 369 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
2c2d71f5
JH
370 I32 flags;
371 I32 whilem_c;
49f55535 372 SSize_t *last_closep;
b8f7bb16 373 regnode_ssc *start_class;
2c2d71f5
JH
374} scan_data_t;
375
c02c3054
KW
376/* The below is perhaps overboard, but this allows us to save a test at the
377 * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
378 * and 'a' differ by a single bit; the same with the upper and lower case of
379 * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
380 * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
381 * then inverts it to form a mask, with just a single 0, in the bit position
382 * where the upper- and lowercase differ. XXX There are about 40 other
383 * instances in the Perl core where this micro-optimization could be used.
384 * Should decide if maintenance cost is worse, before changing those
385 *
386 * Returns a boolean as to whether or not 'v' is either a lowercase or
387 * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
388 * compile-time constant, the generated code is better than some optimizing
389 * compilers figure out, amounting to a mask and test. The results are
390 * meaningless if 'c' is not one of [A-Za-z] */
391#define isARG2_lower_or_UPPER_ARG1(c, v) \
392 (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
393
a687059c 394/*
e50aee73 395 * Forward declarations for pregcomp()'s friends.
a687059c 396 */
a0d0e21e 397
27da23d5 398static const scan_data_t zero_scan_data =
1de06328 399 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
400
401#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
402#define SF_BEFORE_SEOL 0x0001
403#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
404#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
405#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
406
44e3dfd2
BF
407#define SF_FIX_SHIFT_EOL (+2)
408#define SF_FL_SHIFT_EOL (+4)
c277df42
IZ
409
410#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
411#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
412
413#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
414#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
415#define SF_IS_INF 0x0040
416#define SF_HAS_PAR 0x0080
417#define SF_IN_PAR 0x0100
418#define SF_HAS_EVAL 0x0200
419#define SCF_DO_SUBSTR 0x0400
653099ff
GS
420#define SCF_DO_STCLASS_AND 0x0800
421#define SCF_DO_STCLASS_OR 0x1000
422#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 423#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 424
786e8c11 425#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
538e84ed 426#define SCF_SEEN_ACCEPT 0x8000
688e0391 427#define SCF_TRIE_DOING_RESTUDY 0x10000
07be1b83 428
43fead97 429#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
430
431/* The enums for all these are ordered so things work out correctly */
a62b1201 432#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
538e84ed
KW
433#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
434 == REGEX_DEPENDS_CHARSET)
00b27cfc 435#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
538e84ed
KW
436#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
437 >= REGEX_UNICODE_CHARSET)
438#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
439 == REGEX_ASCII_RESTRICTED_CHARSET)
440#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
441 >= REGEX_ASCII_RESTRICTED_CHARSET)
442#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
443 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 444
43fead97 445#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 446
f2c2a6ab
KW
447/* For programs that want to be strictly Unicode compatible by dying if any
448 * attempt is made to match a non-Unicode code point against a Unicode
449 * property. */
450#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
451
93733859 452#define OOB_NAMEDCLASS -1
b8c5462f 453
8e661ac5
KW
454/* There is no code point that is out-of-bounds, so this is problematic. But
455 * its only current use is to initialize a variable that is always set before
456 * looked at. */
457#define OOB_UNICODE 0xDEADBEEF
458
a0ed51b3
LW
459#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
460#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
461
8615cb43 462
b45f050a
JF
463/* length of regex to show in messages that don't mark a position within */
464#define RegexLengthToShowInErrorMessages 127
465
466/*
467 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
468 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
469 * op/pragma/warn/regcomp.
470 */
7253e4e3
RK
471#define MARKER1 "<-- HERE" /* marker as it appears in the description */
472#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 473
538e84ed
KW
474#define REPORT_LOCATION " in regex; marked by " MARKER1 \
475 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
b45f050a 476
c1d900c3
BF
477#define REPORT_LOCATION_ARGS(offset) \
478 UTF8fARG(UTF, offset, RExC_precomp), \
479 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
480
b45f050a
JF
481/*
482 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
483 * arg. Show regex, up to a maximum length. If it's too long, chop and add
484 * "...".
485 */
58e23c8d 486#define _FAIL(code) STMT_START { \
bfed75c6 487 const char *ellipses = ""; \
ccb2c380
MP
488 IV len = RExC_end - RExC_precomp; \
489 \
490 if (!SIZE_ONLY) \
a5e7bc51 491 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
492 if (len > RegexLengthToShowInErrorMessages) { \
493 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
494 len = RegexLengthToShowInErrorMessages - 10; \
495 ellipses = "..."; \
496 } \
58e23c8d 497 code; \
ccb2c380 498} STMT_END
8615cb43 499
58e23c8d 500#define FAIL(msg) _FAIL( \
c1d900c3
BF
501 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
502 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
503
504#define FAIL2(msg,arg) _FAIL( \
c1d900c3
BF
505 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
506 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 507
b45f050a 508/*
b45f050a
JF
509 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
510 */
ccb2c380 511#define Simple_vFAIL(m) STMT_START { \
a28509cc 512 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380 513 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
c1d900c3 514 m, REPORT_LOCATION_ARGS(offset)); \
ccb2c380 515} STMT_END
b45f050a
JF
516
517/*
518 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
519 */
ccb2c380
MP
520#define vFAIL(m) STMT_START { \
521 if (!SIZE_ONLY) \
a5e7bc51 522 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
523 Simple_vFAIL(m); \
524} STMT_END
b45f050a
JF
525
526/*
527 * Like Simple_vFAIL(), but accepts two arguments.
528 */
ccb2c380 529#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 530 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
531 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
532 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 533} STMT_END
b45f050a
JF
534
535/*
536 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
537 */
ccb2c380
MP
538#define vFAIL2(m,a1) STMT_START { \
539 if (!SIZE_ONLY) \
a5e7bc51 540 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
541 Simple_vFAIL2(m, a1); \
542} STMT_END
b45f050a
JF
543
544
545/*
546 * Like Simple_vFAIL(), but accepts three arguments.
547 */
ccb2c380 548#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 549 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
550 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
551 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 552} STMT_END
b45f050a
JF
553
554/*
555 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
556 */
ccb2c380
MP
557#define vFAIL3(m,a1,a2) STMT_START { \
558 if (!SIZE_ONLY) \
a5e7bc51 559 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
560 Simple_vFAIL3(m, a1, a2); \
561} STMT_END
b45f050a
JF
562
563/*
564 * Like Simple_vFAIL(), but accepts four arguments.
565 */
ccb2c380 566#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 567 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
568 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
569 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 570} STMT_END
b45f050a 571
95db3ffa
KW
572#define vFAIL4(m,a1,a2,a3) STMT_START { \
573 if (!SIZE_ONLY) \
574 SAVEFREESV(RExC_rx_sv); \
575 Simple_vFAIL4(m, a1, a2, a3); \
576} STMT_END
577
946095af
BF
578/* A specialized version of vFAIL2 that works with UTF8f */
579#define vFAIL2utf8f(m, a1) STMT_START { \
ef3f731d
BF
580 const IV offset = RExC_parse - RExC_precomp; \
581 if (!SIZE_ONLY) \
582 SAVEFREESV(RExC_rx_sv); \
583 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
584 REPORT_LOCATION_ARGS(offset)); \
946095af
BF
585} STMT_END
586
587
5e0a247b
KW
588/* m is not necessarily a "literal string", in this macro */
589#define reg_warn_non_literal_string(loc, m) STMT_START { \
590 const IV offset = loc - RExC_precomp; \
591 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
c1d900c3 592 m, REPORT_LOCATION_ARGS(offset)); \
5e0a247b
KW
593} STMT_END
594
668c081a 595#define ckWARNreg(loc,m) STMT_START { \
a28509cc 596 const IV offset = loc - RExC_precomp; \
f10f4c18 597 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 598 REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
599} STMT_END
600
0d6106aa
KW
601#define vWARN_dep(loc, m) STMT_START { \
602 const IV offset = loc - RExC_precomp; \
603 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
c1d900c3 604 REPORT_LOCATION_ARGS(offset)); \
0d6106aa
KW
605} STMT_END
606
147508a2
KW
607#define ckWARNdep(loc,m) STMT_START { \
608 const IV offset = loc - RExC_precomp; \
609 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
610 m REPORT_LOCATION, \
c1d900c3 611 REPORT_LOCATION_ARGS(offset)); \
147508a2
KW
612} STMT_END
613
668c081a 614#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 615 const IV offset = loc - RExC_precomp; \
d1d15184 616 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18 617 m REPORT_LOCATION, \
c1d900c3 618 REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
619} STMT_END
620
b23eb183 621#define ckWARN2reg_d(loc,m, a1) STMT_START { \
2335b3d3 622 const IV offset = loc - RExC_precomp; \
b23eb183 623 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
2335b3d3 624 m REPORT_LOCATION, \
c1d900c3 625 a1, REPORT_LOCATION_ARGS(offset)); \
2335b3d3
KW
626} STMT_END
627
668c081a 628#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 629 const IV offset = loc - RExC_precomp; \
668c081a 630 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 631 a1, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
632} STMT_END
633
634#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 635 const IV offset = loc - RExC_precomp; \
ccb2c380 636 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 637 a1, a2, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
638} STMT_END
639
668c081a
NC
640#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
641 const IV offset = loc - RExC_precomp; \
642 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 643 a1, a2, REPORT_LOCATION_ARGS(offset)); \
668c081a
NC
644} STMT_END
645
ccb2c380 646#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 647 const IV offset = loc - RExC_precomp; \
ccb2c380 648 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 649 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
650} STMT_END
651
668c081a
NC
652#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
653 const IV offset = loc - RExC_precomp; \
654 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 655 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
668c081a
NC
656} STMT_END
657
ccb2c380 658#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 659 const IV offset = loc - RExC_precomp; \
ccb2c380 660 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 661 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
ccb2c380 662} STMT_END
9d1d55b5 663
8615cb43 664
cd439c50 665/* Allow for side effects in s */
ccb2c380
MP
666#define REGC(c,s) STMT_START { \
667 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
668} STMT_END
cd439c50 669
538e84ed 670/* Macros for recording node offsets. 20001227 mjd@plover.com
fac92740
MJD
671 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
672 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
673 * Element 0 holds the number n.
07be1b83 674 * Position is 1 indexed.
fac92740 675 */
7122b237
YO
676#ifndef RE_TRACK_PATTERN_OFFSETS
677#define Set_Node_Offset_To_R(node,byte)
678#define Set_Node_Offset(node,byte)
679#define Set_Cur_Node_Offset
680#define Set_Node_Length_To_R(node,len)
681#define Set_Node_Length(node,len)
6a86c6ad 682#define Set_Node_Cur_Length(node,start)
538e84ed
KW
683#define Node_Offset(n)
684#define Node_Length(n)
7122b237
YO
685#define Set_Node_Offset_Length(node,offset,len)
686#define ProgLen(ri) ri->u.proglen
687#define SetProgLen(ri,x) ri->u.proglen = x
688#else
689#define ProgLen(ri) ri->u.offsets[0]
690#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
691#define Set_Node_Offset_To_R(node,byte) STMT_START { \
692 if (! SIZE_ONLY) { \
693 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 694 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 695 if((node) < 0) { \
538e84ed
KW
696 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
697 (int)(node)); \
ccb2c380
MP
698 } else { \
699 RExC_offsets[2*(node)-1] = (byte); \
700 } \
701 } \
702} STMT_END
703
704#define Set_Node_Offset(node,byte) \
705 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
706#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
707
708#define Set_Node_Length_To_R(node,len) STMT_START { \
709 if (! SIZE_ONLY) { \
710 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 711 __LINE__, (int)(node), (int)(len))); \
ccb2c380 712 if((node) < 0) { \
538e84ed
KW
713 Perl_croak(aTHX_ "value of node is %d in Length macro", \
714 (int)(node)); \
ccb2c380
MP
715 } else { \
716 RExC_offsets[2*(node)] = (len); \
717 } \
718 } \
719} STMT_END
720
721#define Set_Node_Length(node,len) \
722 Set_Node_Length_To_R((node)-RExC_emit_start, len)
6a86c6ad
NC
723#define Set_Node_Cur_Length(node, start) \
724 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
725
726/* Get offsets and lengths */
727#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
728#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
729
07be1b83
YO
730#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
731 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
732 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
733} STMT_END
7122b237 734#endif
07be1b83
YO
735
736#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
737#define EXPERIMENTAL_INPLACESCAN
f427392e 738#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 739
9e9ecfdf 740#define DEBUG_RExC_seen() \
538e84ed
KW
741 DEBUG_OPTIMISE_MORE_r({ \
742 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
743 \
744 if (RExC_seen & REG_SEEN_ZERO_LEN) \
745 PerlIO_printf(Perl_debug_log,"REG_SEEN_ZERO_LEN "); \
746 \
747 if (RExC_seen & REG_SEEN_LOOKBEHIND) \
748 PerlIO_printf(Perl_debug_log,"REG_SEEN_LOOKBEHIND "); \
749 \
750 if (RExC_seen & REG_SEEN_GPOS) \
751 PerlIO_printf(Perl_debug_log,"REG_SEEN_GPOS "); \
752 \
753 if (RExC_seen & REG_SEEN_CANY) \
754 PerlIO_printf(Perl_debug_log,"REG_SEEN_CANY "); \
755 \
756 if (RExC_seen & REG_SEEN_RECURSE) \
757 PerlIO_printf(Perl_debug_log,"REG_SEEN_RECURSE "); \
758 \
759 if (RExC_seen & REG_TOP_LEVEL_BRANCHES) \
760 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES "); \
761 \
762 if (RExC_seen & REG_SEEN_VERBARG) \
763 PerlIO_printf(Perl_debug_log,"REG_SEEN_VERBARG "); \
764 \
765 if (RExC_seen & REG_SEEN_CUTGROUP) \
766 PerlIO_printf(Perl_debug_log,"REG_SEEN_CUTGROUP "); \
767 \
768 if (RExC_seen & REG_SEEN_RUN_ON_COMMENT) \
769 PerlIO_printf(Perl_debug_log,"REG_SEEN_RUN_ON_COMMENT "); \
770 \
771 if (RExC_seen & REG_SEEN_EXACTF_SHARP_S) \
772 PerlIO_printf(Perl_debug_log,"REG_SEEN_EXACTF_SHARP_S "); \
773 \
774 if (RExC_seen & REG_SEEN_GOSTART) \
775 PerlIO_printf(Perl_debug_log,"REG_SEEN_GOSTART "); \
776 \
777 PerlIO_printf(Perl_debug_log,"\n"); \
9e9ecfdf
YO
778 });
779
304ee84b
YO
780#define DEBUG_STUDYDATA(str,data,depth) \
781DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 782 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
783 "%*s" str "Pos:%"IVdf"/%"IVdf \
784 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
785 (int)(depth)*2, "", \
786 (IV)((data)->pos_min), \
787 (IV)((data)->pos_delta), \
304ee84b 788 (UV)((data)->flags), \
1de06328 789 (IV)((data)->whilem_c), \
304ee84b
YO
790 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
791 is_inf ? "INF " : "" \
1de06328
YO
792 ); \
793 if ((data)->last_found) \
794 PerlIO_printf(Perl_debug_log, \
795 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
796 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
797 SvPVX_const((data)->last_found), \
798 (IV)((data)->last_end), \
799 (IV)((data)->last_start_min), \
800 (IV)((data)->last_start_max), \
801 ((data)->longest && \
802 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
803 SvPVX_const((data)->longest_fixed), \
804 (IV)((data)->offset_fixed), \
805 ((data)->longest && \
806 (data)->longest==&((data)->longest_float)) ? "*" : "", \
807 SvPVX_const((data)->longest_float), \
808 (IV)((data)->offset_float_min), \
809 (IV)((data)->offset_float_max) \
810 ); \
811 PerlIO_printf(Perl_debug_log,"\n"); \
812});
813
653099ff 814/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 815 Update the longest found anchored substring and the longest found
653099ff
GS
816 floating substrings if needed. */
817
4327152a 818STATIC void
ea3daa5d
FC
819S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
820 SSize_t *minlenp, int is_inf)
c277df42 821{
e1ec3a88
AL
822 const STRLEN l = CHR_SVLEN(data->last_found);
823 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 824 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 825
7918f24d
NC
826 PERL_ARGS_ASSERT_SCAN_COMMIT;
827
c277df42 828 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 829 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
830 if (*data->longest == data->longest_fixed) {
831 data->offset_fixed = l ? data->last_start_min : data->pos_min;
832 if (data->flags & SF_BEFORE_EOL)
b81d288d 833 data->flags
c277df42
IZ
834 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
835 else
836 data->flags &= ~SF_FIX_BEFORE_EOL;
686b73d4 837 data->minlen_fixed=minlenp;
1de06328 838 data->lookbehind_fixed=0;
a0ed51b3 839 }
304ee84b 840 else { /* *data->longest == data->longest_float */
c277df42 841 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
842 data->offset_float_max = (l
843 ? data->last_start_max
ea3daa5d
FC
844 : (data->pos_delta == SSize_t_MAX
845 ? SSize_t_MAX
846 : data->pos_min + data->pos_delta));
847 if (is_inf
848 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
849 data->offset_float_max = SSize_t_MAX;
c277df42 850 if (data->flags & SF_BEFORE_EOL)
b81d288d 851 data->flags
c277df42
IZ
852 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
853 else
854 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
855 data->minlen_float=minlenp;
856 data->lookbehind_float=0;
c277df42
IZ
857 }
858 }
859 SvCUR_set(data->last_found, 0);
0eda9292 860 {
a28509cc 861 SV * const sv = data->last_found;
097eb12c
AL
862 if (SvUTF8(sv) && SvMAGICAL(sv)) {
863 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
864 if (mg)
865 mg->mg_len = 0;
866 }
0eda9292 867 }
c277df42
IZ
868 data->last_end = -1;
869 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 870 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
871}
872
cdd87c1d
KW
873/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
874 * list that describes which code points it matches */
875
653099ff 876STATIC void
3420edd7 877S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 878{
cdd87c1d
KW
879 /* Set the SSC 'ssc' to match an empty string or any code point */
880
557bd3fb 881 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 882
71068078 883 assert(is_ANYOF_SYNTHETIC(ssc));
3fffb88a 884
cdd87c1d
KW
885 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
886 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
a0dd4231 887 ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */
653099ff
GS
888}
889
653099ff 890STATIC int
cdd87c1d 891S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
653099ff 892{
c144baaa
KW
893 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
894 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
895 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
896 * in any way, so there's no point in using it */
cdd87c1d
KW
897
898 UV start, end;
899 bool ret;
653099ff 900
557bd3fb 901 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 902
71068078 903 assert(is_ANYOF_SYNTHETIC(ssc));
cdd87c1d 904
77ebeeba 905 if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
cdd87c1d
KW
906 return FALSE;
907 }
908
909 /* See if the list consists solely of the range 0 - Infinity */
910 invlist_iterinit(ssc->invlist);
911 ret = invlist_iternext(ssc->invlist, &start, &end)
912 && start == 0
913 && end == UV_MAX;
914
915 invlist_iterfinish(ssc->invlist);
916
917 if (ret) {
918 return TRUE;
919 }
920
921 /* If e.g., both \w and \W are set, matches everything */
922 if (ANYOF_FLAGS(ssc) & ANYOF_POSIXL) {
923 int i;
924 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
925 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
926 return TRUE;
927 }
928 }
929 }
930
931 return FALSE;
653099ff
GS
932}
933
653099ff 934STATIC void
cdd87c1d 935S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 936{
cdd87c1d
KW
937 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
938 * string, any code point, or any posix class under locale */
939
557bd3fb 940 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 941
557bd3fb 942 Zero(ssc, 1, regnode_ssc);
71068078 943 set_ANYOF_SYNTHETIC(ssc);
cdd87c1d 944 ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
3420edd7 945 ssc_anything(ssc);
cdd87c1d
KW
946
947 /* If any portion of the regex is to operate under locale rules,
948 * initialization includes it. The reason this isn't done for all regexes
949 * is that the optimizer was written under the assumption that locale was
950 * all-or-nothing. Given the complexity and lack of documentation in the
951 * optimizer, and that there are inadequate test cases for locale, many
952 * parts of it may not work properly, it is safest to avoid locale unless
953 * necessary. */
954 if (RExC_contains_locale) {
955 ANYOF_POSIXL_SETALL(ssc);
cfafade5
KW
956 ANYOF_FLAGS(ssc) |= ANYOF_LOCALE|ANYOF_POSIXL;
957 if (RExC_contains_i) {
958 ANYOF_FLAGS(ssc) |= ANYOF_LOC_FOLD;
959 }
cdd87c1d
KW
960 }
961 else {
962 ANYOF_POSIXL_ZERO(ssc);
963 }
653099ff
GS
964}
965
b423522f
KW
966STATIC int
967S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
968 const regnode_ssc *ssc)
969{
970 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
971 * to the list of code points matched, and locale posix classes; hence does
972 * not check its flags) */
973
974 UV start, end;
975 bool ret;
976
977 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
978
71068078 979 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
980
981 invlist_iterinit(ssc->invlist);
982 ret = invlist_iternext(ssc->invlist, &start, &end)
983 && start == 0
984 && end == UV_MAX;
985
986 invlist_iterfinish(ssc->invlist);
987
988 if (! ret) {
989 return FALSE;
990 }
991
992 if (RExC_contains_locale) {
993 if (! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE)
994 || ! (ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
995 || ! ANYOF_POSIXL_TEST_ALL_SET(ssc))
996 {
997 return FALSE;
998 }
999 if (RExC_contains_i && ! (ANYOF_FLAGS(ssc) & ANYOF_LOC_FOLD)) {
1000 return FALSE;
1001 }
1002 }
1003
1004 return TRUE;
1005}
1006
1007STATIC SV*
1008S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
1009 const regnode_charclass_posixl* const node)
1010{
1011 /* Returns a mortal inversion list defining which code points are matched
1012 * by 'node', which is of type ANYOF. Handles complementing the result if
1013 * appropriate. If some code points aren't knowable at this time, the
1014 * returned list must, and will, contain every possible code point. */
1015
1016 SV* invlist = sv_2mortal(_new_invlist(0));
1017 unsigned int i;
1018 const U32 n = ARG(node);
1019
1020 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1021
1022 /* Look at the data structure created by S_set_ANYOF_arg() */
1023 if (n != ANYOF_NONBITMAP_EMPTY) {
1024 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1025 AV * const av = MUTABLE_AV(SvRV(rv));
1026 SV **const ary = AvARRAY(av);
1027 assert(RExC_rxi->data->what[n] == 's');
1028
1029 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1030 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1031 }
1032 else if (ary[0] && ary[0] != &PL_sv_undef) {
1033
1034 /* Here, no compile-time swash, and there are things that won't be
1035 * known until runtime -- we have to assume it could be anything */
1036 return _add_range_to_invlist(invlist, 0, UV_MAX);
1037 }
1038 else {
1039
1040 /* Here no compile-time swash, and no run-time only data. Use the
1041 * node's inversion list */
1042 invlist = sv_2mortal(invlist_clone(ary[2]));
1043 }
1044 }
1045
1046 /* An ANYOF node contains a bitmap for the first 256 code points, and an
1047 * inversion list for the others, but if there are code points that should
1048 * match only conditionally on the target string being UTF-8, those are
1049 * placed in the inversion list, and not the bitmap. Since there are
1050 * circumstances under which they could match, they are included in the
1051 * SSC. But if the ANYOF node is to be inverted, we have to exclude them
1052 * here, so that when we invert below, the end result actually does include
1053 * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here
1054 * before we add the unconditionally matched code points */
1055 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1056 _invlist_intersection_complement_2nd(invlist,
1057 PL_UpperLatin1,
1058 &invlist);
1059 }
1060
1061 /* Add in the points from the bit map */
1062 for (i = 0; i < 256; i++) {
1063 if (ANYOF_BITMAP_TEST(node, i)) {
1064 invlist = add_cp_to_invlist(invlist, i);
1065 }
1066 }
1067
1068 /* If this can match all upper Latin1 code points, have to add them
1069 * as well */
51ca120f 1070 if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
b423522f
KW
1071 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1072 }
1073
1074 /* Similarly for these */
1075 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1076 invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1077 }
1078
1079 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1080 _invlist_invert(invlist);
1081 }
1082
1083 return invlist;
1084}
1085
1051e1c4 1086/* These two functions currently do the exact same thing */
557bd3fb 1087#define ssc_init_zero ssc_init
653099ff 1088
cdd87c1d
KW
1089#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1090#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1091
b423522f
KW
1092STATIC void
1093S_ssc_flags_and(regnode_ssc *ssc, const U8 and_with)
1094{
1095 /* Take the flags 'and_with' and accumulate them anded into the flags for
a0dd4231
KW
1096 * the SSC 'ssc'. The non-SSC related flags in 'and_with' are ignored.
1097 * The flags 'and_with' should not come from another SSC (otherwise the
1098 * EMPTY_STRING flag won't work) */
b423522f 1099
eff8b7dc 1100 const U8 ssc_only_flags = ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS;
b423522f
KW
1101
1102 PERL_ARGS_ASSERT_SSC_FLAGS_AND;
1103
1104 /* Use just the SSC-related flags from 'and_with' */
eff8b7dc 1105 ANYOF_FLAGS(ssc) &= (and_with & ANYOF_COMMON_FLAGS);
b423522f
KW
1106 ANYOF_FLAGS(ssc) |= ssc_only_flags;
1107}
1108
557bd3fb 1109/* 'AND' a given class with another one. Can create false positives. 'ssc'
8efd3f97 1110 * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if
b8f7bb16 1111 * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1112
653099ff 1113STATIC void
b423522f
KW
1114S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1115 const regnode_ssc *and_with)
653099ff 1116{
cdd87c1d
KW
1117 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1118 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1119
a0dd4231
KW
1120 SV* anded_cp_list;
1121 U8 anded_flags;
1e6ade67 1122
cdd87c1d 1123 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1124
71068078 1125 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1126
1127 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1128 * the code point inversion list and just the relevant flags */
71068078 1129 if (is_ANYOF_SYNTHETIC(and_with)) {
a0dd4231
KW
1130 anded_cp_list = and_with->invlist;
1131 anded_flags = ANYOF_FLAGS(and_with);
e9b08962
KW
1132
1133 /* XXX This is a kludge around what appears to be deficiencies in the
1134 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1135 * there are paths through the optimizer where it doesn't get weeded
1136 * out when it should. And if we don't make some extra provision for
1137 * it like the code just below, it doesn't get added when it should.
1138 * This solution is to add it only when AND'ing, which is here, and
1139 * only when what is being AND'ed is the pristine, original node
1140 * matching anything. Thus it is like adding it to ssc_anything() but
1141 * only when the result is to be AND'ed. Probably the same solution
1142 * could be adopted for the same problem we have with /l matching,
1143 * which is solved differently in S_ssc_init(), and that would lead to
1144 * fewer false positives than that solution has. But if this solution
1145 * creates bugs, the consequences are only that a warning isn't raised
1146 * that should be; while the consequences for having /l bugs is
1147 * incorrect matches */
1148 if (ssc_is_anything(and_with)) {
1149 anded_flags |= ANYOF_WARN_SUPER;
1150 }
a0dd4231
KW
1151 }
1152 else {
1153 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1154 (regnode_charclass_posixl*) and_with);
eff8b7dc 1155 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
a0dd4231
KW
1156 }
1157
1158 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1159
1160 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1161 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1162 * 'and_with' may be inverted. When not inverted, we have the situation of
1163 * computing:
1164 * (C1 | P1) & (C2 | P2)
1165 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1166 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1167 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1168 * <= ((C1 & C2) | P1 | P2)
1169 * Alternatively, the last few steps could be:
1170 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1171 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1172 * <= (C1 | C2 | (P1 & P2))
1173 * We favor the second approach if either P1 or P2 is non-empty. This is
1174 * because these components are a barrier to doing optimizations, as what
1175 * they match cannot be known until the moment of matching as they are
1176 * dependent on the current locale, 'AND"ing them likely will reduce or
1177 * eliminate them.
1178 * But we can do better if we know that C1,P1 are in their initial state (a
1179 * frequent occurrence), each matching everything:
1180 * (<everything>) & (C2 | P2) = C2 | P2
1181 * Similarly, if C2,P2 are in their initial state (again a frequent
1182 * occurrence), the result is a no-op
1183 * (C1 | P1) & (<everything>) = C1 | P1
1184 *
1185 * Inverted, we have
1186 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1187 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1188 * <= (C1 & ~C2) | (P1 & ~P2)
1189 * */
1aa99e6b 1190
a0dd4231 1191 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
71068078 1192 && ! is_ANYOF_SYNTHETIC(and_with))
a0dd4231 1193 {
cdd87c1d 1194 unsigned int i;
8951c461 1195
cdd87c1d
KW
1196 ssc_intersection(ssc,
1197 anded_cp_list,
1198 FALSE /* Has already been inverted */
1199 );
c6b76537 1200
cdd87c1d
KW
1201 /* If either P1 or P2 is empty, the intersection will be also; can skip
1202 * the loop */
1203 if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1204 ANYOF_POSIXL_ZERO(ssc);
1205 }
1206 else if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1207
1208 /* Note that the Posix class component P from 'and_with' actually
1209 * looks like:
1210 * P = Pa | Pb | ... | Pn
1211 * where each component is one posix class, such as in [\w\s].
1212 * Thus
1213 * ~P = ~(Pa | Pb | ... | Pn)
1214 * = ~Pa & ~Pb & ... & ~Pn
1215 * <= ~Pa | ~Pb | ... | ~Pn
1216 * The last is something we can easily calculate, but unfortunately
1217 * is likely to have many false positives. We could do better
1218 * in some (but certainly not all) instances if two classes in
1219 * P have known relationships. For example
1220 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1221 * So
1222 * :lower: & :print: = :lower:
1223 * And similarly for classes that must be disjoint. For example,
1224 * since \s and \w can have no elements in common based on rules in
1225 * the POSIX standard,
1226 * \w & ^\S = nothing
1227 * Unfortunately, some vendor locales do not meet the Posix
1228 * standard, in particular almost everything by Microsoft.
1229 * The loop below just changes e.g., \w into \W and vice versa */
1230
1231 regnode_charclass_posixl temp;
1232 int add = 1; /* To calculate the index of the complement */
1233
1234 ANYOF_POSIXL_ZERO(&temp);
1235 for (i = 0; i < ANYOF_MAX; i++) {
1236 assert(i % 2 != 0
1237 || ! ANYOF_POSIXL_TEST(and_with, i)
1238 || ! ANYOF_POSIXL_TEST(and_with, i + 1));
1239
1240 if (ANYOF_POSIXL_TEST(and_with, i)) {
1241 ANYOF_POSIXL_SET(&temp, i + add);
1242 }
1243 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1244 }
1245 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1246
cdd87c1d
KW
1247 } /* else ssc already has no posixes */
1248 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1249 in its initial state */
71068078 1250 else if (! is_ANYOF_SYNTHETIC(and_with)
cdd87c1d
KW
1251 || ! ssc_is_cp_posixl_init(pRExC_state, and_with))
1252 {
1253 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1254 * copy it over 'ssc' */
1255 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
71068078 1256 if (is_ANYOF_SYNTHETIC(and_with)) {
cdd87c1d
KW
1257 StructCopy(and_with, ssc, regnode_ssc);
1258 }
1259 else {
1260 ssc->invlist = anded_cp_list;
1261 ANYOF_POSIXL_ZERO(ssc);
1262 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1263 ANYOF_POSIXL_OR(and_with, ssc);
1264 }
1265 }
1266 }
1267 else if ((ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
1268 || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1269 {
1270 /* One or the other of P1, P2 is non-empty. */
1271 ANYOF_POSIXL_AND(and_with, ssc);
1272 ssc_union(ssc, anded_cp_list, FALSE);
1273 }
1274 else { /* P1 = P2 = empty */
1275 ssc_intersection(ssc, anded_cp_list, FALSE);
1276 }
137165a6 1277 }
653099ff
GS
1278}
1279
653099ff 1280STATIC void
cdd87c1d
KW
1281S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1282 const regnode_ssc *or_with)
653099ff 1283{
cdd87c1d
KW
1284 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1285 * another SSC or a regular ANYOF class. Can create false positives if
1286 * 'or_with' is to be inverted. */
7918f24d 1287
a0dd4231
KW
1288 SV* ored_cp_list;
1289 U8 ored_flags;
c6b76537 1290
cdd87c1d 1291 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 1292
71068078 1293 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1294
1295 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1296 * the code point inversion list and just the relevant flags */
71068078 1297 if (is_ANYOF_SYNTHETIC(or_with)) {
a0dd4231
KW
1298 ored_cp_list = or_with->invlist;
1299 ored_flags = ANYOF_FLAGS(or_with);
1300 }
1301 else {
1302 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1303 (regnode_charclass_posixl*) or_with);
eff8b7dc 1304 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
a0dd4231
KW
1305 }
1306
1307 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
1308
1309 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1310 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1311 * 'or_with' may be inverted. When not inverted, we have the simple
1312 * situation of computing:
1313 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1314 * If P1|P2 yields a situation with both a class and its complement are
1315 * set, like having both \w and \W, this matches all code points, and we
1316 * can delete these from the P component of the ssc going forward. XXX We
1317 * might be able to delete all the P components, but I (khw) am not certain
1318 * about this, and it is better to be safe.
1319 *
1320 * Inverted, we have
1321 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1322 * <= (C1 | P1) | ~C2
1323 * <= (C1 | ~C2) | P1
1324 * (which results in actually simpler code than the non-inverted case)
1325 * */
9826f543 1326
a0dd4231 1327 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
71068078 1328 && ! is_ANYOF_SYNTHETIC(or_with))
a0dd4231 1329 {
cdd87c1d
KW
1330 /* We ignore P2, leaving P1 going forward */
1331 }
1332 else { /* Not inverted */
1333 ANYOF_POSIXL_OR(or_with, ssc);
1334 if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1335 unsigned int i;
1336 for (i = 0; i < ANYOF_MAX; i += 2) {
1337 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1338 {
1339 ssc_match_all_cp(ssc);
1340 ANYOF_POSIXL_CLEAR(ssc, i);
1341 ANYOF_POSIXL_CLEAR(ssc, i+1);
1342 if (! ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1343 ANYOF_FLAGS(ssc) &= ~ANYOF_POSIXL;
1344 }
1345 }
1346 }
1347 }
1aa99e6b 1348 }
cdd87c1d
KW
1349
1350 ssc_union(ssc,
1351 ored_cp_list,
1352 FALSE /* Already has been inverted */
1353 );
653099ff
GS
1354}
1355
b423522f
KW
1356PERL_STATIC_INLINE void
1357S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1358{
1359 PERL_ARGS_ASSERT_SSC_UNION;
1360
71068078 1361 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1362
1363 _invlist_union_maybe_complement_2nd(ssc->invlist,
1364 invlist,
1365 invert2nd,
1366 &ssc->invlist);
1367}
1368
1369PERL_STATIC_INLINE void
1370S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1371 SV* const invlist,
1372 const bool invert2nd)
1373{
1374 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1375
71068078 1376 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1377
1378 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1379 invlist,
1380 invert2nd,
1381 &ssc->invlist);
1382}
1383
1384PERL_STATIC_INLINE void
1385S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1386{
1387 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1388
71068078 1389 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1390
1391 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1392}
1393
1394PERL_STATIC_INLINE void
1395S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1396{
1397 /* AND just the single code point 'cp' into the SSC 'ssc' */
1398
1399 SV* cp_list = _new_invlist(2);
1400
1401 PERL_ARGS_ASSERT_SSC_CP_AND;
1402
71068078 1403 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1404
1405 cp_list = add_cp_to_invlist(cp_list, cp);
1406 ssc_intersection(ssc, cp_list,
1407 FALSE /* Not inverted */
1408 );
1409 SvREFCNT_dec_NN(cp_list);
1410}
1411
1412PERL_STATIC_INLINE void
1413S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1414{
1415 /* Set the SSC 'ssc' to not match any locale things */
1416
1417 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1418
71068078 1419 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1420
1421 ANYOF_POSIXL_ZERO(ssc);
1422 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1423}
1424
1425STATIC void
1426S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1427{
1428 /* The inversion list in the SSC is marked mortal; now we need a more
1429 * permanent copy, which is stored the same way that is done in a regular
1430 * ANYOF node, with the first 256 code points in a bit map */
1431
1432 SV* invlist = invlist_clone(ssc->invlist);
1433
1434 PERL_ARGS_ASSERT_SSC_FINALIZE;
1435
71068078 1436 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f 1437
a0dd4231
KW
1438 /* The code in this file assumes that all but these flags aren't relevant
1439 * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1440 * time we reach here */
eff8b7dc 1441 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
a0dd4231 1442
b423522f
KW
1443 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1444
1445 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL, FALSE);
1446
1447 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE) || RExC_contains_locale);
1448}
1449
a3621e74
YO
1450#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1451#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1452#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
538e84ed
KW
1453#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1454 ? (TRIE_LIST_CUR( idx ) - 1) \
1455 : 0 )
a3621e74 1456
3dab1dad
YO
1457
1458#ifdef DEBUGGING
07be1b83 1459/*
2b8b4781
NC
1460 dump_trie(trie,widecharmap,revcharmap)
1461 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1462 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
1463
1464 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
1465 The _interim_ variants are used for debugging the interim
1466 tables that are used to generate the final compressed
1467 representation which is what dump_trie expects.
1468
486ec47a 1469 Part of the reason for their existence is to provide a form
3dab1dad 1470 of documentation as to how the different representations function.
07be1b83
YO
1471
1472*/
3dab1dad
YO
1473
1474/*
3dab1dad
YO
1475 Dumps the final compressed table form of the trie to Perl_debug_log.
1476 Used for debugging make_trie().
1477*/
b9a59e08 1478
3dab1dad 1479STATIC void
2b8b4781
NC
1480S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1481 AV *revcharmap, U32 depth)
3dab1dad
YO
1482{
1483 U32 state;
ab3bbdeb 1484 SV *sv=sv_newmortal();
55eed653 1485 int colwidth= widecharmap ? 6 : 4;
2e64971a 1486 U16 word;
3dab1dad
YO
1487 GET_RE_DEBUG_FLAGS_DECL;
1488
7918f24d 1489 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 1490
3dab1dad
YO
1491 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1492 (int)depth * 2 + 2,"",
1493 "Match","Base","Ofs" );
1494
1495 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 1496 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 1497 if ( tmp ) {
538e84ed 1498 PerlIO_printf( Perl_debug_log, "%*s",
ab3bbdeb 1499 colwidth,
538e84ed 1500 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1501 PL_colors[0], PL_colors[1],
1502 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
1503 PERL_PV_ESCAPE_FIRSTCHAR
1504 )
ab3bbdeb 1505 );
3dab1dad
YO
1506 }
1507 }
1508 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1509 (int)depth * 2 + 2,"");
1510
1511 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 1512 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
1513 PerlIO_printf( Perl_debug_log, "\n");
1514
1e2e3d02 1515 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 1516 const U32 base = trie->states[ state ].trans.base;
3dab1dad 1517
538e84ed
KW
1518 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1519 (int)depth * 2 + 2,"", (UV)state);
3dab1dad
YO
1520
1521 if ( trie->states[ state ].wordnum ) {
538e84ed
KW
1522 PerlIO_printf( Perl_debug_log, " W%4X",
1523 trie->states[ state ].wordnum );
3dab1dad
YO
1524 } else {
1525 PerlIO_printf( Perl_debug_log, "%6s", "" );
1526 }
1527
1528 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1529
1530 if ( base ) {
1531 U32 ofs = 0;
1532
1533 while( ( base + ofs < trie->uniquecharcount ) ||
1534 ( base + ofs - trie->uniquecharcount < trie->lasttrans
538e84ed
KW
1535 && trie->trans[ base + ofs - trie->uniquecharcount ].check
1536 != state))
3dab1dad
YO
1537 ofs++;
1538
1539 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1540
1541 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
538e84ed
KW
1542 if ( ( base + ofs >= trie->uniquecharcount )
1543 && ( base + ofs - trie->uniquecharcount
1544 < trie->lasttrans )
1545 && trie->trans[ base + ofs
1546 - trie->uniquecharcount ].check == state )
3dab1dad 1547 {
ab3bbdeb
YO
1548 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1549 colwidth,
538e84ed
KW
1550 (UV)trie->trans[ base + ofs
1551 - trie->uniquecharcount ].next );
3dab1dad 1552 } else {
ab3bbdeb 1553 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
1554 }
1555 }
1556
1557 PerlIO_printf( Perl_debug_log, "]");
1558
1559 }
1560 PerlIO_printf( Perl_debug_log, "\n" );
1561 }
538e84ed
KW
1562 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1563 (int)depth*2, "");
2e64971a
DM
1564 for (word=1; word <= trie->wordcount; word++) {
1565 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1566 (int)word, (int)(trie->wordinfo[word].prev),
1567 (int)(trie->wordinfo[word].len));
1568 }
1569 PerlIO_printf(Perl_debug_log, "\n" );
538e84ed 1570}
3dab1dad 1571/*
3dab1dad 1572 Dumps a fully constructed but uncompressed trie in list form.
538e84ed 1573 List tries normally only are used for construction when the number of
3dab1dad
YO
1574 possible chars (trie->uniquecharcount) is very high.
1575 Used for debugging make_trie().
1576*/
1577STATIC void
55eed653 1578S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1579 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1580 U32 depth)
3dab1dad
YO
1581{
1582 U32 state;
ab3bbdeb 1583 SV *sv=sv_newmortal();
55eed653 1584 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1585 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1586
1587 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1588
3dab1dad 1589 /* print out the table precompression. */
ab3bbdeb
YO
1590 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1591 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1592 "------:-----+-----------------\n" );
538e84ed 1593
3dab1dad
YO
1594 for( state=1 ; state < next_alloc ; state ++ ) {
1595 U16 charid;
538e84ed 1596
ab3bbdeb 1597 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1598 (int)depth * 2 + 2,"", (UV)state );
1599 if ( ! trie->states[ state ].wordnum ) {
1600 PerlIO_printf( Perl_debug_log, "%5s| ","");
1601 } else {
1602 PerlIO_printf( Perl_debug_log, "W%4x| ",
1603 trie->states[ state ].wordnum
1604 );
1605 }
1606 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
538e84ed
KW
1607 SV ** const tmp = av_fetch( revcharmap,
1608 TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1609 if ( tmp ) {
1610 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1611 colwidth,
538e84ed
KW
1612 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1613 colwidth,
1614 PL_colors[0], PL_colors[1],
1615 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1616 | PERL_PV_ESCAPE_FIRSTCHAR
ab3bbdeb 1617 ) ,
1e2e3d02
YO
1618 TRIE_LIST_ITEM(state,charid).forid,
1619 (UV)TRIE_LIST_ITEM(state,charid).newstate
1620 );
538e84ed 1621 if (!(charid % 10))
664e119d
RGS
1622 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1623 (int)((depth * 2) + 14), "");
1e2e3d02 1624 }
ab3bbdeb
YO
1625 }
1626 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad 1627 }
538e84ed 1628}
3dab1dad
YO
1629
1630/*
3dab1dad 1631 Dumps a fully constructed but uncompressed trie in table form.
538e84ed
KW
1632 This is the normal DFA style state transition table, with a few
1633 twists to facilitate compression later.
3dab1dad
YO
1634 Used for debugging make_trie().
1635*/
1636STATIC void
55eed653 1637S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1638 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1639 U32 depth)
3dab1dad
YO
1640{
1641 U32 state;
1642 U16 charid;
ab3bbdeb 1643 SV *sv=sv_newmortal();
55eed653 1644 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1645 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1646
1647 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
538e84ed 1648
3dab1dad
YO
1649 /*
1650 print out the table precompression so that we can do a visual check
1651 that they are identical.
1652 */
538e84ed 1653
3dab1dad
YO
1654 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1655
1656 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1657 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1658 if ( tmp ) {
538e84ed 1659 PerlIO_printf( Perl_debug_log, "%*s",
ab3bbdeb 1660 colwidth,
538e84ed 1661 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1662 PL_colors[0], PL_colors[1],
1663 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
1664 PERL_PV_ESCAPE_FIRSTCHAR
1665 )
ab3bbdeb 1666 );
3dab1dad
YO
1667 }
1668 }
1669
1670 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1671
1672 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1673 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1674 }
1675
1676 PerlIO_printf( Perl_debug_log, "\n" );
1677
1678 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1679
538e84ed 1680 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
3dab1dad
YO
1681 (int)depth * 2 + 2,"",
1682 (UV)TRIE_NODENUM( state ) );
1683
1684 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1685 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1686 if (v)
1687 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1688 else
1689 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1690 }
1691 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
538e84ed
KW
1692 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1693 (UV)trie->trans[ state ].check );
3dab1dad 1694 } else {
538e84ed
KW
1695 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1696 (UV)trie->trans[ state ].check,
3dab1dad
YO
1697 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1698 }
1699 }
07be1b83 1700}
3dab1dad
YO
1701
1702#endif
1703
2e64971a 1704
786e8c11
YO
1705/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1706 startbranch: the first branch in the whole branch sequence
1707 first : start branch of sequence of branch-exact nodes.
1708 May be the same as startbranch
1709 last : Thing following the last branch.
1710 May be the same as tail.
1711 tail : item following the branch sequence
1712 count : words in the sequence
1713 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1714 depth : indent depth
3dab1dad 1715
786e8c11 1716Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1717
786e8c11
YO
1718A trie is an N'ary tree where the branches are determined by digital
1719decomposition of the key. IE, at the root node you look up the 1st character and
1720follow that branch repeat until you find the end of the branches. Nodes can be
1721marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1722
786e8c11 1723 /he|she|his|hers/
72f13be8 1724
786e8c11
YO
1725would convert into the following structure. Numbers represent states, letters
1726following numbers represent valid transitions on the letter from that state, if
1727the number is in square brackets it represents an accepting state, otherwise it
1728will be in parenthesis.
07be1b83 1729
786e8c11
YO
1730 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1731 | |
1732 | (2)
1733 | |
1734 (1) +-i->(6)-+-s->[7]
1735 |
1736 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1737
786e8c11
YO
1738 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1739
1740This shows that when matching against the string 'hers' we will begin at state 1
1741read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1742then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1743is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1744single traverse. We store a mapping from accepting to state to which word was
1745matched, and then when we have multiple possibilities we try to complete the
1746rest of the regex in the order in which they occured in the alternation.
1747
1748The only prior NFA like behaviour that would be changed by the TRIE support is
1749the silent ignoring of duplicate alternations which are of the form:
1750
1751 / (DUPE|DUPE) X? (?{ ... }) Y /x
1752
4b714af6 1753Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 1754and without the optimisation. With the optimisations dupes will be silently
486ec47a 1755ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
1756the following demonstrates:
1757
1758 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1759
1760which prints out 'word' three times, but
1761
1762 'words'=~/(word|word|word)(?{ print $1 })S/
1763
1764which doesnt print it out at all. This is due to other optimisations kicking in.
1765
1766Example of what happens on a structural level:
1767
486ec47a 1768The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
1769
1770 1: CURLYM[1] {1,32767}(18)
1771 5: BRANCH(8)
1772 6: EXACT <ac>(16)
1773 8: BRANCH(11)
1774 9: EXACT <ad>(16)
1775 11: BRANCH(14)
1776 12: EXACT <ab>(16)
1777 16: SUCCEED(0)
1778 17: NOTHING(18)
1779 18: END(0)
1780
1781This would be optimizable with startbranch=5, first=5, last=16, tail=16
1782and should turn into:
1783
1784 1: CURLYM[1] {1,32767}(18)
1785 5: TRIE(16)
1786 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1787 <ac>
1788 <ad>
1789 <ab>
1790 16: SUCCEED(0)
1791 17: NOTHING(18)
1792 18: END(0)
1793
1794Cases where tail != last would be like /(?foo|bar)baz/:
1795
1796 1: BRANCH(4)
1797 2: EXACT <foo>(8)
1798 4: BRANCH(7)
1799 5: EXACT <bar>(8)
1800 7: TAIL(8)
1801 8: EXACT <baz>(10)
1802 10: END(0)
1803
1804which would be optimizable with startbranch=1, first=1, last=7, tail=8
1805and would end up looking like:
1806
1807 1: TRIE(8)
1808 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1809 <foo>
1810 <bar>
1811 7: TAIL(8)
1812 8: EXACT <baz>(10)
1813 10: END(0)
1814
c80e42f3 1815 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
1816
1817is the recommended Unicode-aware way of saying
1818
1819 *(d++) = uv;
1820*/
1821
fab2782b 1822#define TRIE_STORE_REVCHAR(val) \
786e8c11 1823 STMT_START { \
73031816 1824 if (UTF) { \
fab2782b 1825 SV *zlopp = newSV(7); /* XXX: optimize me */ \
88c9ea1e 1826 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
c80e42f3 1827 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
73031816
NC
1828 SvCUR_set(zlopp, kapow - flrbbbbb); \
1829 SvPOK_on(zlopp); \
1830 SvUTF8_on(zlopp); \
1831 av_push(revcharmap, zlopp); \
1832 } else { \
fab2782b 1833 char ooooff = (char)val; \
73031816
NC
1834 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1835 } \
1836 } STMT_END
786e8c11 1837
914a25d5
KW
1838/* This gets the next character from the input, folding it if not already
1839 * folded. */
1840#define TRIE_READ_CHAR STMT_START { \
1841 wordlen++; \
1842 if ( UTF ) { \
1843 /* if it is UTF then it is either already folded, or does not need \
1844 * folding */ \
1c1d615a 1845 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
1846 } \
1847 else if (folder == PL_fold_latin1) { \
7d006b13
KW
1848 /* This folder implies Unicode rules, which in the range expressible \
1849 * by not UTF is the lower case, with the two exceptions, one of \
1850 * which should have been taken care of before calling this */ \
1851 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1852 uvc = toLOWER_L1(*uc); \
1853 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1854 len = 1; \
914a25d5
KW
1855 } else { \
1856 /* raw data, will be folded later if needed */ \
1857 uvc = (U32)*uc; \
1858 len = 1; \
1859 } \
786e8c11
YO
1860} STMT_END
1861
1862
1863
1864#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1865 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1866 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1867 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1868 } \
1869 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1870 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1871 TRIE_LIST_CUR( state )++; \
1872} STMT_END
07be1b83 1873
786e8c11
YO
1874#define TRIE_LIST_NEW(state) STMT_START { \
1875 Newxz( trie->states[ state ].trans.list, \
1876 4, reg_trie_trans_le ); \
1877 TRIE_LIST_CUR( state ) = 1; \
1878 TRIE_LIST_LEN( state ) = 4; \
1879} STMT_END
07be1b83 1880
786e8c11
YO
1881#define TRIE_HANDLE_WORD(state) STMT_START { \
1882 U16 dupe= trie->states[ state ].wordnum; \
1883 regnode * const noper_next = regnext( noper ); \
1884 \
786e8c11
YO
1885 DEBUG_r({ \
1886 /* store the word for dumping */ \
1887 SV* tmp; \
1888 if (OP(noper) != NOTHING) \
740cce10 1889 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1890 else \
740cce10 1891 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1892 av_push( trie_words, tmp ); \
786e8c11
YO
1893 }); \
1894 \
1895 curword++; \
2e64971a
DM
1896 trie->wordinfo[curword].prev = 0; \
1897 trie->wordinfo[curword].len = wordlen; \
1898 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1899 \
1900 if ( noper_next < tail ) { \
1901 if (!trie->jump) \
538e84ed
KW
1902 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1903 sizeof(U16) ); \
7f69552c 1904 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1905 if (!jumper) \
1906 jumper = noper_next; \
1907 if (!nextbranch) \
1908 nextbranch= regnext(cur); \
1909 } \
1910 \
1911 if ( dupe ) { \
2e64971a
DM
1912 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1913 /* chain, so that when the bits of chain are later */\
1914 /* linked together, the dups appear in the chain */\
1915 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1916 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1917 } else { \
1918 /* we haven't inserted this word yet. */ \
1919 trie->states[ state ].wordnum = curword; \
1920 } \
1921} STMT_END
07be1b83 1922
3dab1dad 1923
786e8c11
YO
1924#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1925 ( ( base + charid >= ucharcount \
1926 && base + charid < ubound \
1927 && state == trie->trans[ base - ucharcount + charid ].check \
1928 && trie->trans[ base - ucharcount + charid ].next ) \
1929 ? trie->trans[ base - ucharcount + charid ].next \
1930 : ( state==1 ? special : 0 ) \
1931 )
3dab1dad 1932
786e8c11
YO
1933#define MADE_TRIE 1
1934#define MADE_JUMP_TRIE 2
1935#define MADE_EXACT_TRIE 4
3dab1dad 1936
a3621e74 1937STATIC I32
538e84ed
KW
1938S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1939 regnode *first, regnode *last, regnode *tail,
1940 U32 word_count, U32 flags, U32 depth)
a3621e74 1941{
27da23d5 1942 dVAR;
a3621e74
YO
1943 /* first pass, loop through and scan words */
1944 reg_trie_data *trie;
55eed653 1945 HV *widecharmap = NULL;
2b8b4781 1946 AV *revcharmap = newAV();
a3621e74 1947 regnode *cur;
a3621e74
YO
1948 STRLEN len = 0;
1949 UV uvc = 0;
1950 U16 curword = 0;
1951 U32 next_alloc = 0;
786e8c11
YO
1952 regnode *jumper = NULL;
1953 regnode *nextbranch = NULL;
7f69552c 1954 regnode *convert = NULL;
2e64971a 1955 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1956 /* we just use folder as a flag in utf8 */
1e696034 1957 const U8 * folder = NULL;
a3621e74 1958
2b8b4781 1959#ifdef DEBUGGING
cf78de0b 1960 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2b8b4781
NC
1961 AV *trie_words = NULL;
1962 /* along with revcharmap, this only used during construction but both are
1963 * useful during debugging so we store them in the struct when debugging.
8e11feef 1964 */
2b8b4781 1965#else
cf78de0b 1966 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 1967 STRLEN trie_charcount=0;
3dab1dad 1968#endif
2b8b4781 1969 SV *re_trie_maxbuff;
a3621e74 1970 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1971
1972 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1973#ifndef DEBUGGING
1974 PERL_UNUSED_ARG(depth);
1975#endif
a3621e74 1976
1e696034 1977 switch (flags) {
79a81a6e 1978 case EXACT: break;
2f7f8cb1 1979 case EXACTFA:
fab2782b 1980 case EXACTFU_SS:
1e696034
KW
1981 case EXACTFU: folder = PL_fold_latin1; break;
1982 case EXACTF: folder = PL_fold; break;
fab2782b 1983 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
1984 }
1985
c944940b 1986 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1987 trie->refcount = 1;
3dab1dad 1988 trie->startstate = 1;
786e8c11 1989 trie->wordcount = word_count;
f8fc2ecf 1990 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1991 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
fab2782b 1992 if (flags == EXACT)
c944940b 1993 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1994 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1995 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1996
a3621e74 1997 DEBUG_r({
2b8b4781 1998 trie_words = newAV();
a3621e74 1999 });
a3621e74 2000
0111c4fd 2001 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 2002 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 2003 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 2004 }
df826430 2005 DEBUG_TRIE_COMPILE_r({
538e84ed
KW
2006 PerlIO_printf( Perl_debug_log,
2007 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2008 (int)depth * 2 + 2, "",
2009 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2010 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
3dab1dad 2011 });
538e84ed 2012
7f69552c
YO
2013 /* Find the node we are going to overwrite */
2014 if ( first == startbranch && OP( last ) != BRANCH ) {
2015 /* whole branch chain */
2016 convert = first;
2017 } else {
2018 /* branch sub-chain */
2019 convert = NEXTOPER( first );
2020 }
538e84ed 2021
a3621e74
YO
2022 /* -- First loop and Setup --
2023
2024 We first traverse the branches and scan each word to determine if it
2025 contains widechars, and how many unique chars there are, this is
2026 important as we have to build a table with at least as many columns as we
2027 have unique chars.
2028
2029 We use an array of integers to represent the character codes 0..255
538e84ed
KW
2030 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2031 the native representation of the character value as the key and IV's for
2032 the coded index.
a3621e74
YO
2033
2034 *TODO* If we keep track of how many times each character is used we can
2035 remap the columns so that the table compression later on is more
3b753521 2036 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
2037 middle and the least common are on the outside. IMO this would be better
2038 than a most to least common mapping as theres a decent chance the most
2039 common letter will share a node with the least common, meaning the node
486ec47a 2040 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
2041 case is when we have the least common nodes twice.
2042
2043 */
2044
a3621e74 2045 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 2046 regnode *noper = NEXTOPER( cur );
e1ec3a88 2047 const U8 *uc = (U8*)STRING( noper );
df826430 2048 const U8 *e = uc + STR_LEN( noper );
a3621e74 2049 STRLEN foldlen = 0;
07be1b83 2050 U32 wordlen = 0; /* required init */
afa96d92
KW
2051 STRLEN minbytes = 0;
2052 STRLEN maxbytes = 0;
538e84ed
KW
2053 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2054 bitmap?*/
a3621e74 2055
3dab1dad 2056 if (OP(noper) == NOTHING) {
df826430
YO
2057 regnode *noper_next= regnext(noper);
2058 if (noper_next != tail && OP(noper_next) == flags) {
2059 noper = noper_next;
2060 uc= (U8*)STRING(noper);
2061 e= uc + STR_LEN(noper);
2062 trie->minlen= STR_LEN(noper);
2063 } else {
2064 trie->minlen= 0;
2065 continue;
2066 }
3dab1dad 2067 }
df826430 2068
fab2782b 2069 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2070 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2071 regardless of encoding */
fab2782b
YO
2072 if (OP( noper ) == EXACTFU_SS) {
2073 /* false positives are ok, so just set this */
0dc4a61d 2074 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2075 }
2076 }
a3621e74 2077 for ( ; uc < e ; uc += len ) {
3dab1dad 2078 TRIE_CHARCOUNT(trie)++;
a3621e74 2079 TRIE_READ_CHAR;
645de4ce
KW
2080
2081 /* Acummulate to the current values, the range in the number of
2082 * bytes that this character could match. The max is presumed to
2083 * be the same as the folded input (which TRIE_READ_CHAR returns),
2084 * except that when this is not in UTF-8, it could be matched
2085 * against a string which is UTF-8, and the variant characters
2086 * could be 2 bytes instead of the 1 here. Likewise, for the
2087 * minimum number of bytes when not folded. When folding, the min
2088 * is assumed to be 1 byte could fold to match the single character
2089 * here, or in the case of a multi-char fold, 1 byte can fold to
2090 * the whole sequence. 'foldlen' is used to denote whether we are
2091 * in such a sequence, skipping the min setting if so. XXX TODO
2092 * Use the exact list of what folds to each character, from
2093 * PL_utf8_foldclosures */
2094 if (UTF) {
2095 maxbytes += UTF8SKIP(uc);
2096 if (! folder) {
2097 /* A non-UTF-8 string could be 1 byte to match our 2 */
2098 minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
2099 ? 1
2100 : UTF8SKIP(uc);
2101 }
2102 else {
2103 if (foldlen) {
2104 foldlen -= UTF8SKIP(uc);
2105 }
2106 else {
2107 foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
2108 minbytes++;
2109 }
2110 }
2111 }
2112 else {
2113 maxbytes += (UNI_IS_INVARIANT(*uc))
2114 ? 1
2115 : 2;
2116 if (! folder) {
2117 minbytes++;
2118 }
2119 else {
2120 if (foldlen) {
2121 foldlen--;
2122 }
2123 else {
2124 foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
2125 minbytes++;
2126 }
2127 }
2128 }
a3621e74 2129 if ( uvc < 256 ) {
fab2782b
YO
2130 if ( folder ) {
2131 U8 folded= folder[ (U8) uvc ];
2132 if ( !trie->charmap[ folded ] ) {
2133 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2134 TRIE_STORE_REVCHAR( folded );
2135 }
2136 }
a3621e74
YO
2137 if ( !trie->charmap[ uvc ] ) {
2138 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2139 TRIE_STORE_REVCHAR( uvc );
a3621e74 2140 }
02daf0ab 2141 if ( set_bit ) {
62012aee
KW
2142 /* store the codepoint in the bitmap, and its folded
2143 * equivalent. */
fab2782b 2144 TRIE_BITMAP_SET(trie, uvc);
0921ee73
T
2145
2146 /* store the folded codepoint */
fab2782b 2147 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
0921ee73
T
2148
2149 if ( !UTF ) {
2150 /* store first byte of utf8 representation of
acdf4139 2151 variant codepoints */
6f2d5cbc 2152 if (! UVCHR_IS_INVARIANT(uvc)) {
acdf4139 2153 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
2154 }
2155 }
02daf0ab
YO
2156 set_bit = 0; /* We've done our bit :-) */
2157 }
a3621e74
YO
2158 } else {
2159 SV** svpp;
55eed653
NC
2160 if ( !widecharmap )
2161 widecharmap = newHV();
a3621e74 2162
55eed653 2163 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
2164
2165 if ( !svpp )
e4584336 2166 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
2167
2168 if ( !SvTRUE( *svpp ) ) {
2169 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 2170 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
2171 }
2172 }
2173 }
3dab1dad 2174 if( cur == first ) {
afa96d92
KW
2175 trie->minlen = minbytes;
2176 trie->maxlen = maxbytes;
2177 } else if (minbytes < trie->minlen) {
2178 trie->minlen = minbytes;
2179 } else if (maxbytes > trie->maxlen) {
2180 trie->maxlen = maxbytes;
fab2782b 2181 }
a3621e74
YO
2182 } /* end first pass */
2183 DEBUG_TRIE_COMPILE_r(
538e84ed
KW
2184 PerlIO_printf( Perl_debug_log,
2185 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
3dab1dad 2186 (int)depth * 2 + 2,"",
55eed653 2187 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
2188 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2189 (int)trie->minlen, (int)trie->maxlen )
a3621e74 2190 );
a3621e74
YO
2191
2192 /*
2193 We now know what we are dealing with in terms of unique chars and
2194 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
2195 representation using a flat table will take. If it's over a reasonable
2196 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
2197 conservative but potentially much slower representation using an array
2198 of lists.
2199
2200 At the end we convert both representations into the same compressed
2201 form that will be used in regexec.c for matching with. The latter
2202 is a form that cannot be used to construct with but has memory
2203 properties similar to the list form and access properties similar
2204 to the table form making it both suitable for fast searches and
2205 small enough that its feasable to store for the duration of a program.
2206
2207 See the comment in the code where the compressed table is produced
2208 inplace from the flat tabe representation for an explanation of how
2209 the compression works.
2210
2211 */
2212
2213
2e64971a
DM
2214 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2215 prev_states[1] = 0;
2216
538e84ed
KW
2217 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2218 > SvIV(re_trie_maxbuff) )
2219 {
a3621e74
YO
2220 /*
2221 Second Pass -- Array Of Lists Representation
2222
2223 Each state will be represented by a list of charid:state records
2224 (reg_trie_trans_le) the first such element holds the CUR and LEN
2225 points of the allocated array. (See defines above).
2226
2227 We build the initial structure using the lists, and then convert
2228 it into the compressed table form which allows faster lookups
2229 (but cant be modified once converted).
a3621e74
YO
2230 */
2231
a3621e74
YO
2232 STRLEN transcount = 1;
2233
538e84ed 2234 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1e2e3d02
YO
2235 "%*sCompiling trie using list compiler\n",
2236 (int)depth * 2 + 2, ""));
686b73d4 2237
c944940b
JH
2238 trie->states = (reg_trie_state *)
2239 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2240 sizeof(reg_trie_state) );
a3621e74
YO
2241 TRIE_LIST_NEW(1);
2242 next_alloc = 2;
2243
2244 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2245
df826430 2246 regnode *noper = NEXTOPER( cur );
c445ea15 2247 U8 *uc = (U8*)STRING( noper );
df826430 2248 const U8 *e = uc + STR_LEN( noper );
c445ea15
AL
2249 U32 state = 1; /* required init */
2250 U16 charid = 0; /* sanity init */
07be1b83 2251 U32 wordlen = 0; /* required init */
c445ea15 2252
df826430
YO
2253 if (OP(noper) == NOTHING) {
2254 regnode *noper_next= regnext(noper);
2255 if (noper_next != tail && OP(noper_next) == flags) {
2256 noper = noper_next;
2257 uc= (U8*)STRING(noper);
2258 e= uc + STR_LEN(noper);
2259 }
2260 }
2261
3dab1dad 2262 if (OP(noper) != NOTHING) {
786e8c11 2263 for ( ; uc < e ; uc += len ) {
c445ea15 2264
786e8c11 2265 TRIE_READ_CHAR;
c445ea15 2266
786e8c11
YO
2267 if ( uvc < 256 ) {
2268 charid = trie->charmap[ uvc ];
c445ea15 2269 } else {
538e84ed
KW
2270 SV** const svpp = hv_fetch( widecharmap,
2271 (char*)&uvc,
2272 sizeof( UV ),
2273 0);
786e8c11
YO
2274 if ( !svpp ) {
2275 charid = 0;
2276 } else {
2277 charid=(U16)SvIV( *svpp );
2278 }
c445ea15 2279 }
538e84ed
KW
2280 /* charid is now 0 if we dont know the char read, or
2281 * nonzero if we do */
786e8c11 2282 if ( charid ) {
a3621e74 2283
786e8c11
YO
2284 U16 check;
2285 U32 newstate = 0;
a3621e74 2286
786e8c11
YO
2287 charid--;
2288 if ( !trie->states[ state ].trans.list ) {
2289 TRIE_LIST_NEW( state );
c445ea15 2290 }
538e84ed
KW
2291 for ( check = 1;
2292 check <= TRIE_LIST_USED( state );
2293 check++ )
2294 {
2295 if ( TRIE_LIST_ITEM( state, check ).forid
2296 == charid )
2297 {
786e8c11
YO
2298 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2299 break;
2300 }
2301 }
2302 if ( ! newstate ) {
2303 newstate = next_alloc++;
2e64971a 2304 prev_states[newstate] = state;
786e8c11
YO
2305 TRIE_LIST_PUSH( state, charid, newstate );
2306 transcount++;
2307 }
2308 state = newstate;
2309 } else {
2310 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 2311 }
a28509cc 2312 }
c445ea15 2313 }
3dab1dad 2314 TRIE_HANDLE_WORD(state);
a3621e74
YO
2315
2316 } /* end second pass */
2317
1e2e3d02 2318 /* next alloc is the NEXT state to be allocated */
538e84ed 2319 trie->statecount = next_alloc;
c944940b
JH
2320 trie->states = (reg_trie_state *)
2321 PerlMemShared_realloc( trie->states,
2322 next_alloc
2323 * sizeof(reg_trie_state) );
a3621e74 2324
3dab1dad 2325 /* and now dump it out before we compress it */
2b8b4781
NC
2326 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2327 revcharmap, next_alloc,
2328 depth+1)
1e2e3d02 2329 );
a3621e74 2330
c944940b
JH
2331 trie->trans = (reg_trie_trans *)
2332 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
2333 {
2334 U32 state;
a3621e74
YO
2335 U32 tp = 0;
2336 U32 zp = 0;
2337
2338
2339 for( state=1 ; state < next_alloc ; state ++ ) {
2340 U32 base=0;
2341
2342 /*
2343 DEBUG_TRIE_COMPILE_MORE_r(
2344 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2345 );
2346 */
2347
2348 if (trie->states[state].trans.list) {
2349 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2350 U16 maxid=minid;
a28509cc 2351 U16 idx;
a3621e74
YO
2352
2353 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
2354 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2355 if ( forid < minid ) {
2356 minid=forid;
2357 } else if ( forid > maxid ) {
2358 maxid=forid;
2359 }
a3621e74
YO
2360 }
2361 if ( transcount < tp + maxid - minid + 1) {
2362 transcount *= 2;
c944940b
JH
2363 trie->trans = (reg_trie_trans *)
2364 PerlMemShared_realloc( trie->trans,
446bd890
NC
2365 transcount
2366 * sizeof(reg_trie_trans) );
538e84ed
KW
2367 Zero( trie->trans + (transcount / 2),
2368 transcount / 2,
2369 reg_trie_trans );
a3621e74
YO
2370 }
2371 base = trie->uniquecharcount + tp - minid;
2372 if ( maxid == minid ) {
2373 U32 set = 0;
2374 for ( ; zp < tp ; zp++ ) {
2375 if ( ! trie->trans[ zp ].next ) {
2376 base = trie->uniquecharcount + zp - minid;
538e84ed
KW
2377 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2378 1).newstate;
a3621e74
YO
2379 trie->trans[ zp ].check = state;
2380 set = 1;
2381 break;
2382 }
2383 }
2384 if ( !set ) {
538e84ed
KW
2385 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2386 1).newstate;
a3621e74
YO
2387 trie->trans[ tp ].check = state;
2388 tp++;
2389 zp = tp;
2390 }
2391 } else {
2392 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
538e84ed
KW
2393 const U32 tid = base
2394 - trie->uniquecharcount
2395 + TRIE_LIST_ITEM( state, idx ).forid;
2396 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2397 idx ).newstate;
a3621e74
YO
2398 trie->trans[ tid ].check = state;
2399 }
2400 tp += ( maxid - minid + 1 );
2401 }
2402 Safefree(trie->states[ state ].trans.list);
2403 }
2404 /*
2405 DEBUG_TRIE_COMPILE_MORE_r(
2406 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2407 );
2408 */
2409 trie->states[ state ].trans.base=base;
2410 }
cc601c31 2411 trie->lasttrans = tp + 1;
a3621e74
YO
2412 }
2413 } else {
2414 /*
2415 Second Pass -- Flat Table Representation.
2416
b423522f
KW
2417 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2418 each. We know that we will need Charcount+1 trans at most to store
2419 the data (one row per char at worst case) So we preallocate both
2420 structures assuming worst case.
a3621e74
YO
2421
2422 We then construct the trie using only the .next slots of the entry
2423 structs.
2424
b423522f
KW
2425 We use the .check field of the first entry of the node temporarily
2426 to make compression both faster and easier by keeping track of how
2427 many non zero fields are in the node.
a3621e74
YO
2428
2429 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2430 transition.
2431
b423522f
KW
2432 There are two terms at use here: state as a TRIE_NODEIDX() which is
2433 a number representing the first entry of the node, and state as a
2434 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2435 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2436 if there are 2 entrys per node. eg:
a3621e74
YO
2437
2438 A B A B
2439 1. 2 4 1. 3 7
2440 2. 0 3 3. 0 5
2441 3. 0 0 5. 0 0
2442 4. 0 0 7. 0 0
2443
b423522f
KW
2444 The table is internally in the right hand, idx form. However as we
2445 also have to deal with the states array which is indexed by nodenum
2446 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
2447
2448 */
538e84ed 2449 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1e2e3d02
YO
2450 "%*sCompiling trie using table compiler\n",
2451 (int)depth * 2 + 2, ""));
3dab1dad 2452
c944940b
JH
2453 trie->trans = (reg_trie_trans *)
2454 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2455 * trie->uniquecharcount + 1,
2456 sizeof(reg_trie_trans) );
2457 trie->states = (reg_trie_state *)
2458 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2459 sizeof(reg_trie_state) );
a3621e74
YO
2460 next_alloc = trie->uniquecharcount + 1;
2461
3dab1dad 2462
a3621e74
YO
2463 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2464
df826430 2465 regnode *noper = NEXTOPER( cur );
a28509cc 2466 const U8 *uc = (U8*)STRING( noper );
df826430 2467 const U8 *e = uc + STR_LEN( noper );
a3621e74
YO
2468
2469 U32 state = 1; /* required init */
2470
2471 U16 charid = 0; /* sanity init */
2472 U32 accept_state = 0; /* sanity init */
a3621e74 2473
07be1b83 2474 U32 wordlen = 0; /* required init */
a3621e74 2475
df826430
YO
2476 if (OP(noper) == NOTHING) {
2477 regnode *noper_next= regnext(noper);
2478 if (noper_next != tail && OP(noper_next) == flags) {
2479 noper = noper_next;
2480 uc= (U8*)STRING(noper);
2481 e= uc + STR_LEN(noper);
2482 }
2483 }
fab2782b 2484
3dab1dad 2485 if ( OP(noper) != NOTHING ) {
786e8c11 2486 for ( ; uc < e ; uc += len ) {
a3621e74 2487
786e8c11 2488 TRIE_READ_CHAR;
a3621e74 2489
786e8c11
YO
2490 if ( uvc < 256 ) {
2491 charid = trie->charmap[ uvc ];
2492 } else {
538e84ed
KW
2493 SV* const * const svpp = hv_fetch( widecharmap,
2494 (char*)&uvc,
2495 sizeof( UV ),
2496 0);
786e8c11 2497 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 2498 }
786e8c11
YO
2499 if ( charid ) {
2500 charid--;
2501 if ( !trie->trans[ state + charid ].next ) {
2502 trie->trans[ state + charid ].next = next_alloc;
2503 trie->trans[ state ].check++;
2e64971a
DM
2504 prev_states[TRIE_NODENUM(next_alloc)]
2505 = TRIE_NODENUM(state);
786e8c11
YO
2506 next_alloc += trie->uniquecharcount;
2507 }
2508 state = trie->trans[ state + charid ].next;
2509 } else {
2510 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2511 }
538e84ed
KW
2512 /* charid is now 0 if we dont know the char read, or
2513 * nonzero if we do */
a3621e74 2514 }
a3621e74 2515 }
3dab1dad
YO
2516 accept_state = TRIE_NODENUM( state );
2517 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
2518
2519 } /* end second pass */
2520
3dab1dad 2521 /* and now dump it out before we compress it */
2b8b4781
NC
2522 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2523 revcharmap,
2524 next_alloc, depth+1));
a3621e74 2525
a3621e74
YO
2526 {
2527 /*
2528 * Inplace compress the table.*
2529
2530 For sparse data sets the table constructed by the trie algorithm will
2531 be mostly 0/FAIL transitions or to put it another way mostly empty.
2532 (Note that leaf nodes will not contain any transitions.)
2533
2534 This algorithm compresses the tables by eliminating most such
2535 transitions, at the cost of a modest bit of extra work during lookup:
2536
2537 - Each states[] entry contains a .base field which indicates the
2538 index in the state[] array wheres its transition data is stored.
2539
3b753521 2540 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
2541
2542 - If .base is nonzero then charid is added to it to find an entry in
2543 the trans array.
2544
2545 -If trans[states[state].base+charid].check!=state then the
2546 transition is taken to be a 0/Fail transition. Thus if there are fail
2547 transitions at the front of the node then the .base offset will point
2548 somewhere inside the previous nodes data (or maybe even into a node
2549 even earlier), but the .check field determines if the transition is
2550 valid.
2551
786e8c11 2552 XXX - wrong maybe?
a3621e74 2553 The following process inplace converts the table to the compressed
3b753521 2554 table: We first do not compress the root node 1,and mark all its
a3621e74 2555 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
2556 allows us to do a DFA construction from the compressed table later,
2557 and ensures that any .base pointers we calculate later are greater
2558 than 0.
a3621e74
YO
2559
2560 - We set 'pos' to indicate the first entry of the second node.
2561
2562 - We then iterate over the columns of the node, finding the first and
2563 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2564 and set the .check pointers accordingly, and advance pos
2565 appropriately and repreat for the next node. Note that when we copy
2566 the next pointers we have to convert them from the original
2567 NODEIDX form to NODENUM form as the former is not valid post
2568 compression.
2569
2570 - If a node has no transitions used we mark its base as 0 and do not
2571 advance the pos pointer.
2572
2573 - If a node only has one transition we use a second pointer into the
2574 structure to fill in allocated fail transitions from other states.
2575 This pointer is independent of the main pointer and scans forward
2576 looking for null transitions that are allocated to a state. When it
2577 finds one it writes the single transition into the "hole". If the
786e8c11 2578 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
2579
2580 - Once compressed we can Renew/realloc the structures to release the
2581 excess space.
2582
2583 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2584 specifically Fig 3.47 and the associated pseudocode.
2585
2586 demq
2587 */
a3b680e6 2588 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 2589 U32 state, charid;
a3621e74 2590 U32 pos = 0, zp=0;
1e2e3d02 2591 trie->statecount = laststate;
a3621e74
YO
2592
2593 for ( state = 1 ; state < laststate ; state++ ) {
2594 U8 flag = 0;
a28509cc
AL
2595 const U32 stateidx = TRIE_NODEIDX( state );
2596 const U32 o_used = trie->trans[ stateidx ].check;
2597 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
2598 trie->trans[ stateidx ].check = 0;
2599
538e84ed
KW
2600 for ( charid = 0;
2601 used && charid < trie->uniquecharcount;
2602 charid++ )
2603 {
a3621e74
YO
2604 if ( flag || trie->trans[ stateidx + charid ].next ) {
2605 if ( trie->trans[ stateidx + charid ].next ) {
2606 if (o_used == 1) {
2607 for ( ; zp < pos ; zp++ ) {
2608 if ( ! trie->trans[ zp ].next ) {
2609 break;
2610 }
2611 }
538e84ed
KW
2612 trie->states[ state ].trans.base
2613 = zp
2614 + trie->uniquecharcount
2615 - charid ;
2616 trie->trans[ zp ].next
2617 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2618 + charid ].next );
a3621e74
YO
2619 trie->trans[ zp ].check = state;
2620 if ( ++zp > pos ) pos = zp;
2621 break;
2622 }
2623 used--;
2624 }
2625 if ( !flag ) {
2626 flag = 1;
538e84ed
KW
2627 trie->states[ state ].trans.base
2628 = pos + trie->uniquecharcount - charid ;
a3621e74 2629 }
538e84ed
KW
2630 trie->trans[ pos ].next
2631 = SAFE_TRIE_NODENUM(
2632 trie->trans[ stateidx + charid ].next );
a3621e74
YO
2633 trie->trans[ pos ].check = state;
2634 pos++;
2635 }
2636 }
2637 }
cc601c31 2638 trie->lasttrans = pos + 1;
c944940b
JH
2639 trie->states = (reg_trie_state *)
2640 PerlMemShared_realloc( trie->states, laststate
2641 * sizeof(reg_trie_state) );
a3621e74 2642 DEBUG_TRIE_COMPILE_MORE_r(
538e84ed
KW
2643 PerlIO_printf( Perl_debug_log,
2644 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2645 (int)depth * 2 + 2,"",
2646 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2647 + 1 ),
2648 (IV)next_alloc,
2649 (IV)pos,
2650 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
a3621e74
YO
2651 );
2652
2653 } /* end table compress */
2654 }
1e2e3d02 2655 DEBUG_TRIE_COMPILE_MORE_r(
538e84ed
KW
2656 PerlIO_printf(Perl_debug_log,
2657 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1e2e3d02
YO
2658 (int)depth * 2 + 2, "",
2659 (UV)trie->statecount,
2660 (UV)trie->lasttrans)
2661 );
cc601c31 2662 /* resize the trans array to remove unused space */
c944940b
JH
2663 trie->trans = (reg_trie_trans *)
2664 PerlMemShared_realloc( trie->trans, trie->lasttrans
2665 * sizeof(reg_trie_trans) );
a3621e74 2666
538e84ed 2667 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
2668 U8 nodetype =(U8)(flags & 0xFF);
2669 char *str=NULL;
538e84ed 2670
07be1b83 2671#ifdef DEBUGGING
e62cc96a 2672 regnode *optimize = NULL;
7122b237
YO
2673#ifdef RE_TRACK_PATTERN_OFFSETS
2674
b57a0404
JH
2675 U32 mjd_offset = 0;
2676 U32 mjd_nodelen = 0;
7122b237
YO
2677#endif /* RE_TRACK_PATTERN_OFFSETS */
2678#endif /* DEBUGGING */
a3621e74 2679 /*
3dab1dad
YO
2680 This means we convert either the first branch or the first Exact,
2681 depending on whether the thing following (in 'last') is a branch
2682 or not and whther first is the startbranch (ie is it a sub part of
2683 the alternation or is it the whole thing.)
3b753521 2684 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 2685 the whole branch sequence, including the first.
a3621e74 2686 */
3dab1dad 2687 /* Find the node we are going to overwrite */
7f69552c 2688 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 2689 /* branch sub-chain */
3dab1dad 2690 NEXT_OFF( first ) = (U16)(last - first);
7122b237 2691#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
2692 DEBUG_r({
2693 mjd_offset= Node_Offset((convert));
2694 mjd_nodelen= Node_Length((convert));
2695 });
7122b237 2696#endif
7f69552c 2697 /* whole branch chain */
7122b237
YO
2698 }
2699#ifdef RE_TRACK_PATTERN_OFFSETS
2700 else {
7f69552c
YO
2701 DEBUG_r({
2702 const regnode *nop = NEXTOPER( convert );
2703 mjd_offset= Node_Offset((nop));
2704 mjd_nodelen= Node_Length((nop));
2705 });
07be1b83
YO
2706 }
2707 DEBUG_OPTIMISE_r(
538e84ed
KW
2708 PerlIO_printf(Perl_debug_log,
2709 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
07be1b83 2710 (int)depth * 2 + 2, "",
786e8c11 2711 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2712 );
7122b237 2713#endif
538e84ed 2714 /* But first we check to see if there is a common prefix we can
3dab1dad
YO
2715 split out as an EXACT and put in front of the TRIE node. */
2716 trie->startstate= 1;
55eed653 2717 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2718 U32 state;
1e2e3d02 2719 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2720 U32 ofs = 0;
8e11feef
RGS
2721 I32 idx = -1;
2722 U32 count = 0;
2723 const U32 base = trie->states[ state ].trans.base;
a3621e74 2724
3dab1dad 2725 if ( trie->states[state].wordnum )
8e11feef 2726 count = 1;
a3621e74 2727
8e11feef 2728 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2729 if ( ( base + ofs >= trie->uniquecharcount ) &&
2730 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2731 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2732 {
3dab1dad 2733 if ( ++count > 1 ) {
2b8b4781 2734 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2735 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2736 if ( state == 1 ) break;
3dab1dad
YO
2737 if ( count == 2 ) {
2738 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2739 DEBUG_OPTIMISE_r(
8e11feef
RGS
2740 PerlIO_printf(Perl_debug_log,
2741 "%*sNew Start State=%"UVuf" Class: [",
2742 (int)depth * 2 + 2, "",
786e8c11 2743 (UV)state));
be8e71aa 2744 if (idx >= 0) {
2b8b4781 2745 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2746 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2747
3dab1dad 2748 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2749 if ( folder )
2750 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2751 DEBUG_OPTIMISE_r(
f1f66076 2752 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2753 );
8e11feef
RGS
2754 }
2755 }
2756 TRIE_BITMAP_SET(trie,*ch);
2757 if ( folder )
2758 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2759 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2760 }
2761 idx = ofs;
2762 }
3dab1dad
YO
2763 }
2764 if ( count == 1 ) {
2b8b4781 2765 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2766 STRLEN len;
2767 char *ch = SvPV( *tmp, len );
de734bd5
A
2768 DEBUG_OPTIMISE_r({
2769 SV *sv=sv_newmortal();
8e11feef
RGS
2770 PerlIO_printf( Perl_debug_log,
2771 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2772 (int)depth * 2 + 2, "",
538e84ed
KW
2773 (UV)state, (UV)idx,
2774 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
de734bd5
A
2775 PL_colors[0], PL_colors[1],
2776 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed 2777 PERL_PV_ESCAPE_FIRSTCHAR
de734bd5
A
2778 )
2779 );
2780 });
3dab1dad
YO
2781 if ( state==1 ) {
2782 OP( convert ) = nodetype;
2783 str=STRING(convert);
2784 STR_LEN(convert)=0;
2785 }
c490c714
YO
2786 STR_LEN(convert) += len;
2787 while (len--)
de734bd5 2788 *str++ = *ch++;
8e11feef 2789 } else {
538e84ed 2790#ifdef DEBUGGING
8e11feef
RGS
2791 if (state>1)
2792 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2793#endif
8e11feef
RGS
2794 break;
2795 }
2796 }
2e64971a 2797 trie->prefixlen = (state-1);
3dab1dad 2798 if (str) {
8e11feef 2799 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2800 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2801 trie->startstate = state;
07be1b83
YO
2802 trie->minlen -= (state - 1);
2803 trie->maxlen -= (state - 1);
33809eae
JH
2804#ifdef DEBUGGING
2805 /* At least the UNICOS C compiler choked on this
2806 * being argument to DEBUG_r(), so let's just have
2807 * it right here. */
2808 if (
2809#ifdef PERL_EXT_RE_BUILD
2810 1
2811#else
2812 DEBUG_r_TEST
2813#endif
2814 ) {
2815 regnode *fix = convert;
2816 U32 word = trie->wordcount;
2817 mjd_nodelen++;
2818 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2819 while( ++fix < n ) {
2820 Set_Node_Offset_Length(fix, 0, 0);
2821 }
2822 while (word--) {
2823 SV ** const tmp = av_fetch( trie_words, word, 0 );
2824 if (tmp) {
2825 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2826 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2827 else
2828 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2829 }
2830 }
2831 }
2832#endif
8e11feef
RGS
2833 if (trie->maxlen) {
2834 convert = n;
2835 } else {
3dab1dad 2836 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2837 DEBUG_r(optimize= n);
3dab1dad
YO
2838 }
2839 }
2840 }
538e84ed
KW
2841 if (!jumper)
2842 jumper = last;
3dab1dad 2843 if ( trie->maxlen ) {
8e11feef
RGS
2844 NEXT_OFF( convert ) = (U16)(tail - convert);
2845 ARG_SET( convert, data_slot );
538e84ed
KW
2846 /* Store the offset to the first unabsorbed branch in
2847 jump[0], which is otherwise unused by the jump logic.
786e8c11 2848 We use this when dumping a trie and during optimisation. */
538e84ed 2849 if (trie->jump)
7f69552c 2850 trie->jump[0] = (U16)(nextbranch - convert);
538e84ed 2851
6c48061a
YO
2852 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2853 * and there is a bitmap
2854 * and the first "jump target" node we found leaves enough room
2855 * then convert the TRIE node into a TRIEC node, with the bitmap
2856 * embedded inline in the opcode - this is hypothetically faster.
2857 */
2858 if ( !trie->states[trie->startstate].wordnum
2859 && trie->bitmap
2860 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2861 {
2862 OP( convert ) = TRIEC;
2863 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2864 PerlMemShared_free(trie->bitmap);
786e8c11 2865 trie->bitmap= NULL;
538e84ed 2866 } else
786e8c11 2867 OP( convert ) = TRIE;
a3621e74 2868
3dab1dad
YO
2869 /* store the type in the flags */
2870 convert->flags = nodetype;
a5ca303d 2871 DEBUG_r({
538e84ed
KW
2872 optimize = convert
2873 + NODE_STEP_REGNODE
a5ca303d
YO
2874 + regarglen[ OP( convert ) ];
2875 });
538e84ed 2876 /* XXX We really should free up the resource in trie now,
a5ca303d 2877 as we won't use them - (which resources?) dmq */
3dab1dad 2878 }
a3621e74 2879 /* needed for dumping*/
e62cc96a 2880 DEBUG_r(if (optimize) {
07be1b83 2881 regnode *opt = convert;
bcdf7404 2882
e62cc96a 2883 while ( ++opt < optimize) {
07be1b83
YO
2884 Set_Node_Offset_Length(opt,0,0);
2885 }
538e84ed
KW
2886 /*
2887 Try to clean up some of the debris left after the
786e8c11 2888 optimisation.
a3621e74 2889 */
786e8c11 2890 while( optimize < jumper ) {
07be1b83 2891 mjd_nodelen += Node_Length((optimize));
a3621e74 2892 OP( optimize ) = OPTIMIZED;
07be1b83 2893 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2894 optimize++;
2895 }
07be1b83 2896 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2897 });
2898 } /* end node insert */
2e64971a
DM
2899
2900 /* Finish populating the prev field of the wordinfo array. Walk back
2901 * from each accept state until we find another accept state, and if
2902 * so, point the first word's .prev field at the second word. If the
2903 * second already has a .prev field set, stop now. This will be the
2904 * case either if we've already processed that word's accept state,
3b753521
FN
2905 * or that state had multiple words, and the overspill words were
2906 * already linked up earlier.
2e64971a
DM
2907 */
2908 {
2909 U16 word;
2910 U32 state;
2911 U16 prev;
2912
2913 for (word=1; word <= trie->wordcount; word++) {
2914 prev = 0;
2915 if (trie->wordinfo[word].prev)
2916 continue;
2917 state = trie->wordinfo[word].accept;
2918 while (state) {
2919 state = prev_states[state];
2920 if (!state)
2921 break;
2922 prev = trie->states[state].wordnum;
2923 if (prev)
2924 break;
2925 }
2926 trie->wordinfo[word].prev = prev;
2927 }
2928 Safefree(prev_states);
2929 }
2930
2931
2932 /* and now dump out the compressed format */
2933 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2934
55eed653 2935 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2936#ifdef DEBUGGING
2937 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2938 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2939#else
03e70be4 2940 SvREFCNT_dec_NN(revcharmap);
07be1b83 2941#endif
538e84ed
KW
2942 return trie->jump
2943 ? MADE_JUMP_TRIE
2944 : trie->startstate>1
2945 ? MADE_EXACT_TRIE
786e8c11
YO
2946 : MADE_TRIE;
2947}
2948
2949STATIC void
2950S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2951{
b423522f
KW
2952/* The Trie is constructed and compressed now so we can build a fail array if
2953 * it's needed
786e8c11 2954
b423522f
KW
2955 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2956 3.32 in the
2957 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2958 Ullman 1985/88
786e8c11
YO
2959 ISBN 0-201-10088-6
2960
b423522f
KW
2961 We find the fail state for each state in the trie, this state is the longest
2962 proper suffix of the current state's 'word' that is also a proper prefix of
2963 another word in our trie. State 1 represents the word '' and is thus the
2964 default fail state. This allows the DFA not to have to restart after its
2965 tried and failed a word at a given point, it simply continues as though it
2966 had been matching the other word in the first place.
786e8c11
YO
2967 Consider
2968 'abcdgu'=~/abcdefg|cdgu/
b423522f
KW
2969 When we get to 'd' we are still matching the first word, we would encounter
2970 'g' which would fail, which would bring us to the state representing 'd' in
2971 the second word where we would try 'g' and succeed, proceeding to match
2972 'cdgu'.
786e8c11
YO
2973 */
2974 /* add a fail transition */
3251b653
NC
2975 const U32 trie_offset = ARG(source);
2976 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2977 U32 *q;
2978 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2979 const U32 numstates = trie->statecount;
786e8c11
YO
2980 const U32 ubound = trie->lasttrans + ucharcount;
2981 U32 q_read = 0;
2982 U32 q_write = 0;
2983 U32 charid;
2984 U32 base = trie->states[ 1 ].trans.base;
2985 U32 *fail;
2986 reg_ac_data *aho;
cf78de0b 2987 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
786e8c11 2988 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2989
2990 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2991#ifndef DEBUGGING
2992 PERL_UNUSED_ARG(depth);
2993#endif
2994
2995
2996 ARG_SET( stclass, data_slot );
c944940b 2997 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2998 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2999 aho->trie=trie_offset;
446bd890
NC
3000 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3001 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 3002 Newxz( q, numstates, U32);
c944940b 3003 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
3004 aho->refcount = 1;
3005 fail = aho->fail;
3006 /* initialize fail[0..1] to be 1 so that we always have
3007 a valid final fail state */
3008 fail[ 0 ] = fail[ 1 ] = 1;
3009
3010 for ( charid = 0; charid < ucharcount ; charid++ ) {
3011 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3012 if ( newstate ) {
3013 q[ q_write ] = newstate;
3014 /* set to point at the root */
3015 fail[ q[ q_write++ ] ]=1;
3016 }
3017 }
3018 while ( q_read < q_write) {
3019 const U32 cur = q[ q_read++ % numstates ];
3020 base = trie->states[ cur ].trans.base;
3021
3022 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3023 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3024 if (ch_state) {
3025 U32 fail_state = cur;
3026 U32 fail_base;
3027 do {
3028 fail_state = fail[ fail_state ];
3029 fail_base = aho->states[ fail_state ].trans.base;
3030 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3031
3032 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3033 fail[ ch_state ] = fail_state;
3034 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3035 {
3036 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3037 }
3038 q[ q_write++ % numstates] = ch_state;
3039 }
3040 }
3041 }
3042 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3043 when we fail in state 1, this allows us to use the
3044 charclass scan to find a valid start char. This is based on the principle
3045 that theres a good chance the string being searched contains lots of stuff
3046 that cant be a start char.
3047 */
3048 fail[ 0 ] = fail[ 1 ] = 0;
3049 DEBUG_TRIE_COMPILE_r({
6d99fb9b 3050 PerlIO_printf(Perl_debug_log,
538e84ed 3051 "%*sStclass Failtable (%"UVuf" states): 0",
6d99fb9b 3052 (int)(depth * 2), "", (UV)numstates
1e2e3d02 3053 );
786e8c11
YO
3054 for( q_read=1; q_read<numstates; q_read++ ) {
3055 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3056 }
3057 PerlIO_printf(Perl_debug_log, "\n");
3058 });
3059 Safefree(q);
3060 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
3061}
3062
786e8c11 3063
07be1b83 3064#define DEBUG_PEEP(str,scan,depth) \
b515a41d 3065 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
3066 SV * const mysv=sv_newmortal(); \
3067 regnode *Next = regnext(scan); \
3068 regprop(RExC_rx, mysv, scan); \
7f69552c 3069 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
3070 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3071 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 3072 }});
07be1b83 3073
1de06328 3074
bb914485 3075/* The below joins as many adjacent EXACTish nodes as possible into a single
0a982f06
KW
3076 * one. The regop may be changed if the node(s) contain certain sequences that
3077 * require special handling. The joining is only done if:
bb914485
KW
3078 * 1) there is room in the current conglomerated node to entirely contain the
3079 * next one.
3080 * 2) they are the exact same node type
3081 *
87b8b349 3082 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
bb914485
KW
3083 * these get optimized out
3084 *
0a982f06
KW
3085 * If a node is to match under /i (folded), the number of characters it matches
3086 * can be different than its character length if it contains a multi-character
3087 * fold. *min_subtract is set to the total delta of the input nodes.
bb914485 3088 *
a0c4c608
KW
3089 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
3090 * and contains LATIN SMALL LETTER SHARP S
f758bddf 3091 *
bb914485 3092 * This is as good a place as any to discuss the design of handling these
0a982f06
KW
3093 * multi-character fold sequences. It's been wrong in Perl for a very long
3094 * time. There are three code points in Unicode whose multi-character folds
3095 * were long ago discovered to mess things up. The previous designs for
3096 * dealing with these involved assigning a special node for them. This
538e84ed 3097 * approach doesn't always work, as evidenced by this example:
a0c4c608 3098 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
538e84ed 3099 * Both sides fold to "sss", but if the pattern is parsed to create a node that
0a982f06 3100 * would match just the \xDF, it won't be able to handle the case where a
bb914485
KW
3101 * successful match would have to cross the node's boundary. The new approach
3102 * that hopefully generally solves the problem generates an EXACTFU_SS node
538e84ed 3103 * that is "sss" in this case.
bb914485 3104 *
0a982f06 3105 * It turns out that there are problems with all multi-character folds, and not
cb117658
KW
3106 * just these three. Now the code is general, for all such cases. The
3107 * approach taken is:
0a982f06 3108 * 1) This routine examines each EXACTFish node that could contain multi-
538e84ed
KW
3109 * character folded sequences. Since a single character can fold into
3110 * such a sequence, the minimum match length for this node is less than
3111 * the number of characters in the node. This routine returns in
3112 * *min_subtract how much to subtract from the the actual length of the
3113 * string to get a real minimum match length; it is 0 if there are no
3114 * multi-char foldeds. This delta is used by the caller to adjust the min
3115 * length of the match, and the delta between min and max, so that the
3116 * optimizer doesn't reject these possibilities based on size constraints.
cb117658 3117 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
0a982f06
KW
3118 * is used for an EXACTFU node that contains at least one "ss" sequence in
3119 * it. For non-UTF-8 patterns and strings, this is the only case where
3120 * there is a possible fold length change. That means that a regular
3121 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3122 * with length changes, and so can be processed faster. regexec.c takes
3123 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3124 * pre-folded by regcomp.c. This saves effort in regex matching.
87b8b349 3125 * However, the pre-folding isn't done for non-UTF8 patterns because the
0a982f06
KW
3126 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
3127 * down by forcing the pattern into UTF8 unless necessary. Also what
3128 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
3129 * possibilities for the non-UTF8 patterns are quite simple, except for
3130 * the sharp s. All the ones that don't involve a UTF-8 target string are
3131 * members of a fold-pair, and arrays are set up for all of them so that
3132 * the other member of the pair can be found quickly. Code elsewhere in
3133 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3134 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3135 * described in the next item.
098b07d5 3136 * 3) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
1ca267a5
KW
3137 * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
3138 * UTF-8 pattern.) An assumption that the optimizer part of regexec.c
3139 * (probably unwittingly, in Perl_regexec_flags()) makes is that a
3140 * character in the pattern corresponds to at most a single character in
3141 * the target string. (And I do mean character, and not byte here, unlike
3142 * other parts of the documentation that have never been updated to
3143 * account for multibyte Unicode.) sharp s in EXACTF nodes can match the
3144 * two character string 'ss'; in EXACTFA nodes it can match
3145 * "\x{17F}\x{17F}". These violate the assumption, and they are the only
3146 * instances where it is violated. I'm reluctant to try to change the
3147 * assumption, as the code involved is impenetrable to me (khw), so
3148 * instead the code here punts. This routine examines (when the pattern
3149 * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
3150 * boolean indicating whether or not the node contains a sharp s. When it
3151 * is true, the caller sets a flag that later causes the optimizer in this
3152 * file to not set values for the floating and fixed string lengths, and
3153 * thus avoids the optimizer code in regexec.c that makes the invalid
3154 * assumption. Thus, there is no optimization based on string lengths for
3155 * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
3156 * (The reason the assumption is wrong only in these two cases is that all
3157 * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
3158 * other folds to their expanded versions. We can't prefold sharp s to
3159 * 'ss' in EXACTF nodes because we don't know at compile time if it
3160 * actually matches 'ss' or not. It will match iff the target string is
3161 * in UTF-8, unlike the EXACTFU nodes, where it always matches; and
3162 * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8
3163 * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
3164 * but in a non-UTF8 pattern, folding it to that above-Latin1 string would
3165 * require the pattern to be forced into UTF-8, the overhead of which we
3166 * want to avoid.)
098b07d5
KW
3167 *
3168 * Similarly, the code that generates tries doesn't currently handle
3169 * not-already-folded multi-char folds, and it looks like a pain to change
3170 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3171 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3172 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3173 * using /iaa matching will be doing so almost entirely with ASCII
3174 * strings, so this should rarely be encountered in practice */
1de06328 3175
9d071ca8 3176#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
07be1b83 3177 if (PL_regkind[OP(scan)] == EXACT) \
9d071ca8 3178 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
07be1b83 3179
be8e71aa 3180STATIC U32
538e84ed
KW
3181S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
3182 UV *min_subtract, bool *has_exactf_sharp_s,
3183 U32 flags,regnode *val, U32 depth)
3184{
07be1b83
YO
3185 /* Merge several consecutive EXACTish nodes into one. */
3186 regnode *n = regnext(scan);
3187 U32 stringok = 1;
3188 regnode *next = scan + NODE_SZ_STR(scan);
3189 U32 merged = 0;
3190 U32 stopnow = 0;
3191#ifdef DEBUGGING
3192 regnode *stop = scan;
72f13be8 3193 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 3194#else
d47053eb
RGS
3195 PERL_UNUSED_ARG(depth);
3196#endif
7918f24d
NC
3197
3198 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 3199#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
3200 PERL_UNUSED_ARG(flags);
3201 PERL_UNUSED_ARG(val);
07be1b83 3202#endif
07be1b83 3203 DEBUG_PEEP("join",scan,depth);
bb914485 3204
3f410cf6
KW
3205 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3206 * EXACT ones that are mergeable to the current one. */
3207 while (n
3208 && (PL_regkind[OP(n)] == NOTHING
3209 || (stringok && OP(n) == OP(scan)))
07be1b83 3210 && NEXT_OFF(n)
3f410cf6
KW
3211 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3212 {
538e84ed 3213
07be1b83
YO
3214 if (OP(n) == TAIL || n > next)
3215 stringok = 0;
3216 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
3217 DEBUG_PEEP("skip:",n,depth);
3218 NEXT_OFF(scan) += NEXT_OFF(n);
3219 next = n + NODE_STEP_REGNODE;
3220#ifdef DEBUGGING
3221 if (stringok)
3222 stop = n;
3223#endif
3224 n = regnext(n);
3225 }
3226 else if (stringok) {
786e8c11 3227 const unsigned int oldl = STR_LEN(scan);
07be1b83 3228 regnode * const nnext = regnext(n);
b2230d39 3229
87b8b349
KW
3230 /* XXX I (khw) kind of doubt that this works on platforms where
3231 * U8_MAX is above 255 because of lots of other assumptions */
79a81a6e 3232 /* Don't join if the sum can't fit into a single node */
b2230d39
KW
3233 if (oldl + STR_LEN(n) > U8_MAX)
3234 break;
538e84ed 3235
07be1b83 3236 DEBUG_PEEP("merg",n,depth);
07be1b83 3237 merged++;
b2230d39 3238
07be1b83
YO
3239 NEXT_OFF(scan) += NEXT_OFF(n);
3240 STR_LEN(scan) += STR_LEN(n);
3241 next = n + NODE_SZ_STR(n);
3242 /* Now we can overwrite *n : */
3243 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3244#ifdef DEBUGGING
3245 stop = next - 1;
3246#endif
3247 n = nnext;
3248 if (stopnow) break;
3249 }
3250
d47053eb
RGS
3251#ifdef EXPERIMENTAL_INPLACESCAN
3252 if (flags && !NEXT_OFF(n)) {
3253 DEBUG_PEEP("atch", val, depth);
3254 if (reg_off_by_arg[OP(n)]) {
3255 ARG_SET(n, val - n);
3256 }
3257 else {
3258 NEXT_OFF(n) = val - n;
3259 }
3260 stopnow = 1;
3261 }
07be1b83
YO
3262#endif
3263 }
2c2b7f86 3264
9d071ca8 3265 *min_subtract = 0;
f758bddf 3266 *has_exactf_sharp_s = FALSE;
f646642f 3267
3f410cf6
KW
3268 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3269 * can now analyze for sequences of problematic code points. (Prior to
3270 * this final joining, sequences could have been split over boundaries, and
a0c4c608
KW
3271 * hence missed). The sequences only happen in folding, hence for any
3272 * non-EXACT EXACTish node */
86d6fcad 3273 if (OP(scan) != EXACT) {
0a982f06
KW
3274 const U8 * const s0 = (U8*) STRING(scan);
3275 const U8 * s = s0;
3276 const U8 * const s_end = s0 + STR_LEN(scan);
f758bddf
KW
3277
3278 /* One pass is made over the node's string looking for all the
3279 * possibilities. to avoid some tests in the loop, there are two main
3280 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3281 * non-UTF-8 */
3282 if (UTF) {
86d6fcad 3283
0a982f06
KW
3284 /* Examine the string for a multi-character fold sequence. UTF-8
3285 * patterns have all characters pre-folded by the time this code is
3286 * executed */
3287 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3288 length sequence we are looking for is 2 */
86d6fcad 3289 {
538e84ed 3290 int count = 0; /* How many characters in a multi-char fold */
0a982f06
KW
3291 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3292 if (! len) { /* Not a multi-char fold: get next char */
3293 s += UTF8SKIP(s);
3294 continue;
3295 }
bb914485 3296
0a982f06 3297 /* Nodes with 'ss' require special handling, except for EXACTFL
098b07d5
KW
3298 * and EXACTFA-ish for which there is no multi-char fold to
3299 * this */
0a982f06 3300 if (len == 2 && *s == 's' && *(s+1) == 's'
098b07d5
KW
3301 && OP(scan) != EXACTFL
3302 && OP(scan) != EXACTFA
3303 && OP(scan) != EXACTFA_NO_TRIE)
0a982f06
KW
3304 {
3305 count = 2;
3306 OP(scan) = EXACTFU_SS;
3307 s += 2;
3308 }
0a982f06
KW
3309 else { /* Here is a generic multi-char fold. */
3310 const U8* multi_end = s + len;
3311
3312 /* Count how many characters in it. In the case of /l and
3313 * /aa, no folds which contain ASCII code points are
3314 * allowed, so check for those, and skip if found. (In
3315 * EXACTFL, no folds are allowed to any Latin1 code point,
3316 * not just ASCII. But there aren't any of these
3317 * currently, nor ever likely, so don't take the time to
3318 * test for them. The code that generates the
3319 * is_MULTI_foo() macros croaks should one actually get put
3320 * into Unicode .) */
098b07d5
KW
3321 if (OP(scan) != EXACTFL
3322 && OP(scan) != EXACTFA
3323 && OP(scan) != EXACTFA_NO_TRIE)
3324 {
0a982f06
KW
3325 count = utf8_length(s, multi_end);
3326 s = multi_end;
3327 }
3328 else {
3329 while (s < multi_end) {
3330 if (isASCII(*s)) {
3331 s++;
3332 goto next_iteration;
3333 }
3334 else {
3335 s += UTF8SKIP(s);
3336 }
3337 count++;
3338 }
3339 }
3340 }
f758bddf 3341
0a982f06
KW
3342 /* The delta is how long the sequence is minus 1 (1 is how long
3343 * the character that folds to the sequence is) */
3344 *min_subtract += count - 1;
3345 next_iteration: ;
bb914485
KW
3346 }
3347 }
1ca267a5
KW
3348 else if (OP(scan) == EXACTFA) {
3349
3350 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3351 * fold to the ASCII range (and there are no existing ones in the
3352 * upper latin1 range). But, as outlined in the comments preceding
098b07d5
KW
3353 * this function, we need to flag any occurrences of the sharp s.
3354 * This character forbids trie formation (because of added
3355 * complexity) */
1ca267a5
KW
3356 while (s < s_end) {
3357 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
098b07d5 3358 OP(scan) = EXACTFA_NO_TRIE;
1ca267a5
KW
3359 *has_exactf_sharp_s = TRUE;
3360 break;
3361 }
3362 s++;
3363 continue;
3364 }
3365 }
3366 else if (OP(scan) != EXACTFL) {
3367
3368 /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the
3369 * multi-char folds that are all Latin1. (This code knows that
3370 * there are no current multi-char folds possible with EXACTFL,
3371 * relying on fold_grind.t to catch any errors if the very unlikely
3372 * event happens that some get added in future Unicode versions.)
3373 * As explained in the comments preceding this function, we look
3374 * also for the sharp s in EXACTF nodes; it can be in the final
0a982f06
KW
3375 * position. Otherwise we can stop looking 1 byte earlier because
3376 * have to find at least two characters for a multi-fold */
f758bddf
KW
3377 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
3378
0a982f06 3379 while (s < upper) {
40b1ba4f 3380 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
0a982f06
KW
3381 if (! len) { /* Not a multi-char fold. */
3382 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
3383 {
3384 *has_exactf_sharp_s = TRUE;
3385 }
3386 s++;
3387 continue;
3388 }
3389
3390 if (len == 2
c02c3054
KW
3391 && isARG2_lower_or_UPPER_ARG1('s', *s)
3392 && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
0a982f06
KW
3393 {
3394
3395 /* EXACTF nodes need to know that the minimum length
3396 * changed so that a sharp s in the string can match this
3397 * ss in the pattern, but they remain EXACTF nodes, as they
3398 * won't match this unless the target string is is UTF-8,
3399 * which we don't know until runtime */
3400 if (OP(scan) != EXACTF) {
3401 OP(scan) = EXACTFU_SS;
3402 }
86d6fcad 3403 }
0a982f06
KW
3404
3405 *min_subtract += len - 1;
3406 s += len;
86d6fcad
KW
3407 }
3408 }
07be1b83 3409 }
3f410cf6 3410
07be1b83 3411#ifdef DEBUGGING
bb789b09
DM
3412 /* Allow dumping but overwriting the collection of skipped
3413 * ops and/or strings with fake optimized ops */
07be1b83
YO
3414 n = scan + NODE_SZ_STR(scan);
3415 while (n <= stop) {
bb789b09
DM
3416 OP(n) = OPTIMIZED;
3417 FLAGS(n) = 0;
3418 NEXT_OFF(n) = 0;
07be1b83
YO
3419 n++;
3420 }
3421#endif
3422 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3423 return stopnow;
3424}
3425
486ec47a 3426/* REx optimizer. Converts nodes into quicker variants "in place".
653099ff
GS
3427 Finds fixed substrings. */
3428
a0288114 3429/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
3430 to the position after last scanned or to NULL. */
3431
40d049e4
YO
3432#define INIT_AND_WITHP \
3433 assert(!and_withp); \
b8f7bb16 3434 Newx(and_withp,1, regnode_ssc); \
40d049e4 3435 SAVEFREEPV(and_withp)
07be1b83 3436
b515a41d 3437/* this is a chain of data about sub patterns we are processing that
486ec47a 3438 need to be handled separately/specially in study_chunk. Its so
b515a41d
YO
3439 we can simulate recursion without losing state. */
3440struct scan_frame;
3441typedef struct scan_frame {
3442 regnode *last; /* last node to process in this frame */
3443 regnode *next; /* next node to process when last is reached */
3444 struct scan_frame *prev; /*previous frame*/
09a65838 3445 U32 prev_recursed_depth;
b515a41d
YO
3446 I32 stop; /* what stopparen do we use */
3447} scan_frame;
3448
304ee84b
YO
3449
3450#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3451
49f55535 3452STATIC SSize_t
40d049e4 3453S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
ea3daa5d 3454 SSize_t *minlenp, SSize_t *deltap,
40d049e4
YO
3455 regnode *last,
3456 scan_data_t *data,
3457 I32 stopparen,
09a65838 3458 U32 recursed_depth,
b8f7bb16 3459 regnode_ssc *and_withp,
40d049e4 3460 U32 flags, U32 depth)
c277df42
IZ
3461 /* scanp: Start here (read-write). */
3462 /* deltap: Write maxlen-minlen here. */
3463 /* last: Stop before this one. */
40d049e4
YO
3464 /* data: string data about the pattern */
3465 /* stopparen: treat close N as END */
3466 /* recursed: which subroutines have we recursed into */
3467 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 3468{
97aff369 3469 dVAR;
ea3daa5d
FC
3470 /* There must be at least this number of characters to match */
3471 SSize_t min = 0;
2d608413 3472 I32 pars = 0, code;
c277df42 3473 regnode *scan = *scanp, *next;
ea3daa5d 3474 SSize_t delta = 0;
c277df42 3475 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 3476 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
3477 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3478 scan_data_t data_fake;
a3621e74 3479 SV *re_trie_maxbuff = NULL;
786e8c11 3480 regnode *first_non_open = scan;
ea3daa5d 3481 SSize_t stopmin = SSize_t_MAX;
8aa23a47 3482 scan_frame *frame = NULL;
a3621e74 3483 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 3484
7918f24d
NC
3485 PERL_ARGS_ASSERT_STUDY_CHUNK;
3486
13a24bad 3487#ifdef DEBUGGING
40d049e4 3488 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 3489#endif
786e8c11 3490 if ( depth == 0 ) {
40d049e4 3491 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
3492 first_non_open=regnext(first_non_open);
3493 }
3494
b81d288d 3495
8aa23a47
YO
3496 fake_study_recurse:
3497 while ( scan && OP(scan) != END && scan < last ){
2d608413
KW
3498 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3499 node length to get a real minimum (because
3500 the folded version may be shorter) */
f758bddf 3501 bool has_exactf_sharp_s = FALSE;
8aa23a47 3502 /* Peephole optimizer: */
09a65838
YO
3503 DEBUG_OPTIMISE_MORE_r(
3504 {
538e84ed
KW
3505 PerlIO_printf(Perl_debug_log,
3506 "%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
3507 ((int) depth*2), "", (long)stopparen,
3508 (unsigned long)depth, (unsigned long)recursed_depth);
09a65838
YO
3509 if (recursed_depth) {
3510 U32 i;
3511 U32 j;
3512 for ( j = 0 ; j < recursed_depth ; j++ ) {
3513 PerlIO_printf(Perl_debug_log,"[");
9e9ecfdf 3514 for ( i = 0 ; i < (U32)RExC_npar ; i++ )
09a65838
YO
3515 PerlIO_printf(Perl_debug_log,"%d",
3516 PAREN_TEST(RExC_study_chunk_recursed +
3517 (j * RExC_study_chunk_recursed_bytes), i)
3518 ? 1 : 0
3519 );
3520 PerlIO_printf(Perl_debug_log,"]");
3521 }
0ac75573 3522 }
09a65838
YO
3523 PerlIO_printf(Perl_debug_log,"\n");
3524