This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
make_ext.pl: Typo in error message
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
afda64e8 84EXTERN_C const struct regexp_engine my_reg_engine;
54df2634
NC
85#else
86# include "regcomp.h"
87#endif
a687059c 88
04e98a4d 89#include "dquote_static.c"
26faadbd 90#include "charclass_invlists.h"
81e983c1 91#include "inline_invlist.c"
1b0f46bf 92#include "unicode_constants.h"
04e98a4d 93
538e84ed
KW
94#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
95 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
f12c0118
KW
96#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
97 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 98#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 99#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 100
a687059c
LW
101#ifndef STATIC
102#define STATIC static
103#endif
104
b1603ef8 105
09b2b2e6 106struct RExC_state_t {
514a91f1
DM
107 U32 flags; /* RXf_* are we folding, multilining? */
108 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 109 char *precomp; /* uncompiled string. */
288b8c02 110 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf 111 regexp *rx; /* perl core regexp structure */
538e84ed
KW
112 regexp_internal *rxi; /* internal data for regexp object
113 pprivate field */
fac92740 114 char *start; /* Start of input for compile */
830247a4
IZ
115 char *end; /* End of input for compile */
116 char *parse; /* Input-scan pointer. */
ea3daa5d 117 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 118 regnode *emit_start; /* Start of emitted-code area */
538e84ed
KW
119 regnode *emit_bound; /* First regnode outside of the
120 allocated space */
f7c7e32a
DM
121 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
122 implies compiling, so don't emit */
9a81a976
KW
123 regnode_ssc emit_dummy; /* placeholder for emit to point to;
124 large enough for the largest
125 non-EXACTish node, so can use it as
126 scratch in pass1 */
830247a4
IZ
127 I32 naughty; /* How bad is this pattern? */
128 I32 sawback; /* Did we see \1, ...? */
129 U32 seen;
ea3daa5d 130 SSize_t size; /* Code size. */
538e84ed
KW
131 I32 npar; /* Capture buffer count, (OPEN) plus
132 one. ("par" 0 is the whole
133 pattern)*/
134 I32 nestroot; /* root parens we are in - used by
135 accept */
830247a4
IZ
136 I32 extralen;
137 I32 seen_zerolen;
40d049e4
YO
138 regnode **open_parens; /* pointers to open parens */
139 regnode **close_parens; /* pointers to close parens */
140 regnode *opend; /* END node in program */
02daf0ab
YO
141 I32 utf8; /* whether the pattern is utf8 or not */
142 I32 orig_utf8; /* whether the pattern was originally in utf8 */
143 /* XXX use this for future optimisation of case
144 * where pattern must be upgraded to utf8. */
e40e74fe
KW
145 I32 uni_semantics; /* If a d charset modifier should use unicode
146 rules, even if the pattern is not in
147 utf8 */
81714fb9 148 HV *paren_names; /* Paren names */
538e84ed 149
40d049e4
YO
150 regnode **recurse; /* Recurse regops */
151 I32 recurse_count; /* Number of recurse regops */
538e84ed
KW
152 U8 *study_chunk_recursed; /* bitmap of which parens we have moved
153 through */
09a65838 154 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 155 I32 in_lookbehind;
4624b182 156 I32 contains_locale;
cfafade5 157 I32 contains_i;
bb3f3ed2 158 I32 override_recoding;
9d53c457 159 I32 in_multi_char_class;
3d2bd50a 160 struct reg_code_block *code_blocks; /* positions of literal (?{})
68e2671b 161 within pattern */
b1603ef8
DM
162 int num_code_blocks; /* size of code_blocks[] */
163 int code_index; /* next code_blocks[] slot */
ee273784 164 SSize_t maxlen; /* mininum possible number of chars in string to match */
dc6d7f5c 165#ifdef ADD_TO_REGEXEC
830247a4
IZ
166 char *starttry; /* -Dr: where regtry was called. */
167#define RExC_starttry (pRExC_state->starttry)
168#endif
d24ca0c5 169 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 170#ifdef DEBUGGING
be8e71aa 171 const char *lastparse;
3dab1dad 172 I32 lastnum;
1f1031fe 173 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
174#define RExC_lastparse (pRExC_state->lastparse)
175#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 176#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 177#endif
09b2b2e6 178};
830247a4 179
e2509266 180#define RExC_flags (pRExC_state->flags)
514a91f1 181#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 182#define RExC_precomp (pRExC_state->precomp)
288b8c02 183#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 184#define RExC_rx (pRExC_state->rx)
f8fc2ecf 185#define RExC_rxi (pRExC_state->rxi)
fac92740 186#define RExC_start (pRExC_state->start)
830247a4
IZ
187#define RExC_end (pRExC_state->end)
188#define RExC_parse (pRExC_state->parse)
189#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237 190#ifdef RE_TRACK_PATTERN_OFFSETS
538e84ed
KW
191#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
192 others */
7122b237 193#endif
830247a4 194#define RExC_emit (pRExC_state->emit)
f7c7e32a 195#define RExC_emit_dummy (pRExC_state->emit_dummy)
fac92740 196#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 197#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
198#define RExC_naughty (pRExC_state->naughty)
199#define RExC_sawback (pRExC_state->sawback)
200#define RExC_seen (pRExC_state->seen)
201#define RExC_size (pRExC_state->size)
ee273784 202#define RExC_maxlen (pRExC_state->maxlen)
830247a4 203#define RExC_npar (pRExC_state->npar)
e2e6a0f1 204#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
205#define RExC_extralen (pRExC_state->extralen)
206#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 207#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 208#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 209#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
210#define RExC_open_parens (pRExC_state->open_parens)
211#define RExC_close_parens (pRExC_state->close_parens)
212#define RExC_opend (pRExC_state->opend)
81714fb9 213#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
214#define RExC_recurse (pRExC_state->recurse)
215#define RExC_recurse_count (pRExC_state->recurse_count)
09a65838 216#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
538e84ed
KW
217#define RExC_study_chunk_recursed_bytes \
218 (pRExC_state->study_chunk_recursed_bytes)
b57e4118 219#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 220#define RExC_contains_locale (pRExC_state->contains_locale)
cfafade5 221#define RExC_contains_i (pRExC_state->contains_i)
9d53c457
KW
222#define RExC_override_recoding (pRExC_state->override_recoding)
223#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
830247a4 224
cde0cee5 225
a687059c
LW
226#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
227#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
412f55bb 228 ((*s) == '{' && regcurly(s)))
a687059c
LW
229
230/*
231 * Flags to be passed up and down.
232 */
a687059c 233#define WORST 0 /* Worst case. */
a3b492c3 234#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee 235
e64f369d 236/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
237 * character. (There needs to be a case: in the switch statement in regexec.c
238 * for any node marked SIMPLE.) Note that this is not the same thing as
239 * REGNODE_SIMPLE */
fda99bee 240#define SIMPLE 0x02
e64f369d 241#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
242#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
243#define TRYAGAIN 0x10 /* Weeded out a declaration. */
244#define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
a687059c 245
3dab1dad
YO
246#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
247
07be1b83
YO
248/* whether trie related optimizations are enabled */
249#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
250#define TRIE_STUDY_OPT
786e8c11 251#define FULL_TRIE_STUDY
07be1b83
YO
252#define TRIE_STCLASS
253#endif
1de06328
YO
254
255
40d049e4
YO
256
257#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
258#define PBITVAL(paren) (1 << ((paren) & 7))
259#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
260#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
261#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
262
bbd61b5f 263#define REQUIRE_UTF8 STMT_START { \
8d9c2815
NC
264 if (!UTF) { \
265 *flagp = RESTART_UTF8; \
266 return NULL; \
267 } \
bbd61b5f 268 } STMT_END
40d049e4 269
f19b1a63
KW
270/* This converts the named class defined in regcomp.h to its equivalent class
271 * number defined in handy.h. */
272#define namedclass_to_classnum(class) ((int) ((class) / 2))
273#define classnum_to_namedclass(classnum) ((classnum) * 2)
274
de92f5e6
KW
275#define _invlist_union_complement_2nd(a, b, output) \
276 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
277#define _invlist_intersection_complement_2nd(a, b, output) \
278 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
279
1de06328
YO
280/* About scan_data_t.
281
282 During optimisation we recurse through the regexp program performing
283 various inplace (keyhole style) optimisations. In addition study_chunk
284 and scan_commit populate this data structure with information about
538e84ed 285 what strings MUST appear in the pattern. We look for the longest
3b753521 286 string that must appear at a fixed location, and we look for the
1de06328
YO
287 longest string that may appear at a floating location. So for instance
288 in the pattern:
538e84ed 289
1de06328 290 /FOO[xX]A.*B[xX]BAR/
538e84ed 291
1de06328
YO
292 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
293 strings (because they follow a .* construct). study_chunk will identify
294 both FOO and BAR as being the longest fixed and floating strings respectively.
538e84ed 295
1de06328 296 The strings can be composites, for instance
538e84ed 297
1de06328 298 /(f)(o)(o)/
538e84ed 299
1de06328 300 will result in a composite fixed substring 'foo'.
538e84ed 301
1de06328 302 For each string some basic information is maintained:
538e84ed 303
1de06328
YO
304 - offset or min_offset
305 This is the position the string must appear at, or not before.
306 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
307 characters must match before the string we are searching for.
308 Likewise when combined with minlenp and the length of the string it
538e84ed 309 tells us how many characters must appear after the string we have
1de06328 310 found.
538e84ed 311
1de06328
YO
312 - max_offset
313 Only used for floating strings. This is the rightmost point that
ea3daa5d 314 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328 315 string can occur infinitely far to the right.
538e84ed 316
1de06328 317 - minlenp
2d608413
KW
318 A pointer to the minimum number of characters of the pattern that the
319 string was found inside. This is important as in the case of positive
538e84ed 320 lookahead or positive lookbehind we can have multiple patterns
1de06328 321 involved. Consider
538e84ed 322
1de06328 323 /(?=FOO).*F/
538e84ed 324
1de06328
YO
325 The minimum length of the pattern overall is 3, the minimum length
326 of the lookahead part is 3, but the minimum length of the part that
538e84ed 327 will actually match is 1. So 'FOO's minimum length is 3, but the
1de06328 328 minimum length for the F is 1. This is important as the minimum length
538e84ed 329 is used to determine offsets in front of and behind the string being
1de06328 330 looked for. Since strings can be composites this is the length of the
486ec47a 331 pattern at the time it was committed with a scan_commit. Note that
1de06328 332 the length is calculated by study_chunk, so that the minimum lengths
538e84ed 333 are not known until the full pattern has been compiled, thus the
1de06328 334 pointer to the value.
538e84ed 335
1de06328 336 - lookbehind
538e84ed 337
1de06328 338 In the case of lookbehind the string being searched for can be
538e84ed 339 offset past the start point of the final matching string.
1de06328
YO
340 If this value was just blithely removed from the min_offset it would
341 invalidate some of the calculations for how many chars must match
342 before or after (as they are derived from min_offset and minlen and
538e84ed 343 the length of the string being searched for).
1de06328
YO
344 When the final pattern is compiled and the data is moved from the
345 scan_data_t structure into the regexp structure the information
538e84ed
KW
346 about lookbehind is factored in, with the information that would
347 have been lost precalculated in the end_shift field for the
1de06328
YO
348 associated string.
349
350 The fields pos_min and pos_delta are used to store the minimum offset
538e84ed 351 and the delta to the maximum offset at the current point in the pattern.
1de06328
YO
352
353*/
2c2d71f5
JH
354
355typedef struct scan_data_t {
1de06328
YO
356 /*I32 len_min; unused */
357 /*I32 len_delta; unused */
49f55535 358 SSize_t pos_min;
ea3daa5d 359 SSize_t pos_delta;
2c2d71f5 360 SV *last_found;
ea3daa5d 361 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 362 SSize_t last_start_min;
ea3daa5d 363 SSize_t last_start_max;
1de06328
YO
364 SV **longest; /* Either &l_fixed, or &l_float. */
365 SV *longest_fixed; /* longest fixed string found in pattern */
49f55535 366 SSize_t offset_fixed; /* offset where it starts */
ea3daa5d 367 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
368 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
369 SV *longest_float; /* longest floating string found in pattern */
49f55535 370 SSize_t offset_float_min; /* earliest point in string it can appear */
ea3daa5d
FC
371 SSize_t offset_float_max; /* latest point in string it can appear */
372 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
49f55535 373 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
2c2d71f5
JH
374 I32 flags;
375 I32 whilem_c;
49f55535 376 SSize_t *last_closep;
b8f7bb16 377 regnode_ssc *start_class;
2c2d71f5
JH
378} scan_data_t;
379
a687059c 380/*
e50aee73 381 * Forward declarations for pregcomp()'s friends.
a687059c 382 */
a0d0e21e 383
27da23d5 384static const scan_data_t zero_scan_data =
1de06328 385 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
386
387#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
388#define SF_BEFORE_SEOL 0x0001
389#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
390#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
391#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
392
44e3dfd2
BF
393#define SF_FIX_SHIFT_EOL (+2)
394#define SF_FL_SHIFT_EOL (+4)
c277df42
IZ
395
396#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
397#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
398
399#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
400#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
401#define SF_IS_INF 0x0040
402#define SF_HAS_PAR 0x0080
403#define SF_IN_PAR 0x0100
404#define SF_HAS_EVAL 0x0200
405#define SCF_DO_SUBSTR 0x0400
653099ff
GS
406#define SCF_DO_STCLASS_AND 0x0800
407#define SCF_DO_STCLASS_OR 0x1000
408#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 409#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 410
786e8c11 411#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
538e84ed 412#define SCF_SEEN_ACCEPT 0x8000
688e0391 413#define SCF_TRIE_DOING_RESTUDY 0x10000
07be1b83 414
43fead97 415#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
416
417/* The enums for all these are ordered so things work out correctly */
a62b1201 418#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
538e84ed
KW
419#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
420 == REGEX_DEPENDS_CHARSET)
00b27cfc 421#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
538e84ed
KW
422#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
423 >= REGEX_UNICODE_CHARSET)
424#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
425 == REGEX_ASCII_RESTRICTED_CHARSET)
426#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
427 >= REGEX_ASCII_RESTRICTED_CHARSET)
428#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
429 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 430
43fead97 431#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 432
f2c2a6ab
KW
433/* For programs that want to be strictly Unicode compatible by dying if any
434 * attempt is made to match a non-Unicode code point against a Unicode
435 * property. */
436#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
437
93733859 438#define OOB_NAMEDCLASS -1
b8c5462f 439
8e661ac5
KW
440/* There is no code point that is out-of-bounds, so this is problematic. But
441 * its only current use is to initialize a variable that is always set before
442 * looked at. */
443#define OOB_UNICODE 0xDEADBEEF
444
a0ed51b3
LW
445#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
446#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
447
8615cb43 448
b45f050a
JF
449/* length of regex to show in messages that don't mark a position within */
450#define RegexLengthToShowInErrorMessages 127
451
452/*
453 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
454 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
455 * op/pragma/warn/regcomp.
456 */
7253e4e3
RK
457#define MARKER1 "<-- HERE" /* marker as it appears in the description */
458#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 459
538e84ed
KW
460#define REPORT_LOCATION " in regex; marked by " MARKER1 \
461 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
b45f050a 462
c1d900c3
BF
463#define REPORT_LOCATION_ARGS(offset) \
464 UTF8fARG(UTF, offset, RExC_precomp), \
465 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
466
b45f050a
JF
467/*
468 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
469 * arg. Show regex, up to a maximum length. If it's too long, chop and add
470 * "...".
471 */
58e23c8d 472#define _FAIL(code) STMT_START { \
bfed75c6 473 const char *ellipses = ""; \
ccb2c380
MP
474 IV len = RExC_end - RExC_precomp; \
475 \
476 if (!SIZE_ONLY) \
a5e7bc51 477 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
478 if (len > RegexLengthToShowInErrorMessages) { \
479 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
480 len = RegexLengthToShowInErrorMessages - 10; \
481 ellipses = "..."; \
482 } \
58e23c8d 483 code; \
ccb2c380 484} STMT_END
8615cb43 485
58e23c8d 486#define FAIL(msg) _FAIL( \
c1d900c3
BF
487 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
488 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
489
490#define FAIL2(msg,arg) _FAIL( \
c1d900c3
BF
491 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
492 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 493
b45f050a 494/*
b45f050a
JF
495 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
496 */
ccb2c380 497#define Simple_vFAIL(m) STMT_START { \
a28509cc 498 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380 499 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
c1d900c3 500 m, REPORT_LOCATION_ARGS(offset)); \
ccb2c380 501} STMT_END
b45f050a
JF
502
503/*
504 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
505 */
ccb2c380
MP
506#define vFAIL(m) STMT_START { \
507 if (!SIZE_ONLY) \
a5e7bc51 508 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
509 Simple_vFAIL(m); \
510} STMT_END
b45f050a
JF
511
512/*
513 * Like Simple_vFAIL(), but accepts two arguments.
514 */
ccb2c380 515#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 516 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
517 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
518 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 519} STMT_END
b45f050a
JF
520
521/*
522 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
523 */
ccb2c380
MP
524#define vFAIL2(m,a1) STMT_START { \
525 if (!SIZE_ONLY) \
a5e7bc51 526 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
527 Simple_vFAIL2(m, a1); \
528} STMT_END
b45f050a
JF
529
530
531/*
532 * Like Simple_vFAIL(), but accepts three arguments.
533 */
ccb2c380 534#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 535 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
536 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
537 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 538} STMT_END
b45f050a
JF
539
540/*
541 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
542 */
ccb2c380
MP
543#define vFAIL3(m,a1,a2) STMT_START { \
544 if (!SIZE_ONLY) \
a5e7bc51 545 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
546 Simple_vFAIL3(m, a1, a2); \
547} STMT_END
b45f050a
JF
548
549/*
550 * Like Simple_vFAIL(), but accepts four arguments.
551 */
ccb2c380 552#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 553 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
554 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
555 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 556} STMT_END
b45f050a 557
95db3ffa
KW
558#define vFAIL4(m,a1,a2,a3) STMT_START { \
559 if (!SIZE_ONLY) \
560 SAVEFREESV(RExC_rx_sv); \
561 Simple_vFAIL4(m, a1, a2, a3); \
562} STMT_END
563
946095af
BF
564/* A specialized version of vFAIL2 that works with UTF8f */
565#define vFAIL2utf8f(m, a1) STMT_START { \
ef3f731d
BF
566 const IV offset = RExC_parse - RExC_precomp; \
567 if (!SIZE_ONLY) \
568 SAVEFREESV(RExC_rx_sv); \
569 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
570 REPORT_LOCATION_ARGS(offset)); \
946095af
BF
571} STMT_END
572
573
5e0a247b
KW
574/* m is not necessarily a "literal string", in this macro */
575#define reg_warn_non_literal_string(loc, m) STMT_START { \
576 const IV offset = loc - RExC_precomp; \
577 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
c1d900c3 578 m, REPORT_LOCATION_ARGS(offset)); \
5e0a247b
KW
579} STMT_END
580
668c081a 581#define ckWARNreg(loc,m) STMT_START { \
a28509cc 582 const IV offset = loc - RExC_precomp; \
f10f4c18 583 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 584 REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
585} STMT_END
586
0d6106aa
KW
587#define vWARN_dep(loc, m) STMT_START { \
588 const IV offset = loc - RExC_precomp; \
589 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
c1d900c3 590 REPORT_LOCATION_ARGS(offset)); \
0d6106aa
KW
591} STMT_END
592
147508a2
KW
593#define ckWARNdep(loc,m) STMT_START { \
594 const IV offset = loc - RExC_precomp; \
595 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
596 m REPORT_LOCATION, \
c1d900c3 597 REPORT_LOCATION_ARGS(offset)); \
147508a2
KW
598} STMT_END
599
668c081a 600#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 601 const IV offset = loc - RExC_precomp; \
d1d15184 602 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18 603 m REPORT_LOCATION, \
c1d900c3 604 REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
605} STMT_END
606
b23eb183 607#define ckWARN2reg_d(loc,m, a1) STMT_START { \
2335b3d3 608 const IV offset = loc - RExC_precomp; \
b23eb183 609 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
2335b3d3 610 m REPORT_LOCATION, \
c1d900c3 611 a1, REPORT_LOCATION_ARGS(offset)); \
2335b3d3
KW
612} STMT_END
613
668c081a 614#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 615 const IV offset = loc - RExC_precomp; \
668c081a 616 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 617 a1, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
618} STMT_END
619
620#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 621 const IV offset = loc - RExC_precomp; \
ccb2c380 622 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 623 a1, a2, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
624} STMT_END
625
668c081a
NC
626#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
627 const IV offset = loc - RExC_precomp; \
628 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 629 a1, a2, REPORT_LOCATION_ARGS(offset)); \
668c081a
NC
630} STMT_END
631
ccb2c380 632#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 633 const IV offset = loc - RExC_precomp; \
ccb2c380 634 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 635 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
636} STMT_END
637
668c081a
NC
638#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
639 const IV offset = loc - RExC_precomp; \
640 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 641 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
668c081a
NC
642} STMT_END
643
ccb2c380 644#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 645 const IV offset = loc - RExC_precomp; \
ccb2c380 646 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 647 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
ccb2c380 648} STMT_END
9d1d55b5 649
8615cb43 650
cd439c50 651/* Allow for side effects in s */
ccb2c380
MP
652#define REGC(c,s) STMT_START { \
653 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
654} STMT_END
cd439c50 655
538e84ed 656/* Macros for recording node offsets. 20001227 mjd@plover.com
fac92740
MJD
657 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
658 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
659 * Element 0 holds the number n.
07be1b83 660 * Position is 1 indexed.
fac92740 661 */
7122b237
YO
662#ifndef RE_TRACK_PATTERN_OFFSETS
663#define Set_Node_Offset_To_R(node,byte)
664#define Set_Node_Offset(node,byte)
665#define Set_Cur_Node_Offset
666#define Set_Node_Length_To_R(node,len)
667#define Set_Node_Length(node,len)
6a86c6ad 668#define Set_Node_Cur_Length(node,start)
538e84ed
KW
669#define Node_Offset(n)
670#define Node_Length(n)
7122b237
YO
671#define Set_Node_Offset_Length(node,offset,len)
672#define ProgLen(ri) ri->u.proglen
673#define SetProgLen(ri,x) ri->u.proglen = x
674#else
675#define ProgLen(ri) ri->u.offsets[0]
676#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
677#define Set_Node_Offset_To_R(node,byte) STMT_START { \
678 if (! SIZE_ONLY) { \
679 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 680 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 681 if((node) < 0) { \
538e84ed
KW
682 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
683 (int)(node)); \
ccb2c380
MP
684 } else { \
685 RExC_offsets[2*(node)-1] = (byte); \
686 } \
687 } \
688} STMT_END
689
690#define Set_Node_Offset(node,byte) \
691 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
692#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
693
694#define Set_Node_Length_To_R(node,len) STMT_START { \
695 if (! SIZE_ONLY) { \
696 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 697 __LINE__, (int)(node), (int)(len))); \
ccb2c380 698 if((node) < 0) { \
538e84ed
KW
699 Perl_croak(aTHX_ "value of node is %d in Length macro", \
700 (int)(node)); \
ccb2c380
MP
701 } else { \
702 RExC_offsets[2*(node)] = (len); \
703 } \
704 } \
705} STMT_END
706
707#define Set_Node_Length(node,len) \
708 Set_Node_Length_To_R((node)-RExC_emit_start, len)
6a86c6ad
NC
709#define Set_Node_Cur_Length(node, start) \
710 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
711
712/* Get offsets and lengths */
713#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
714#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
715
07be1b83
YO
716#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
717 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
718 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
719} STMT_END
7122b237 720#endif
07be1b83
YO
721
722#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
723#define EXPERIMENTAL_INPLACESCAN
f427392e 724#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 725
9e9ecfdf 726#define DEBUG_RExC_seen() \
538e84ed
KW
727 DEBUG_OPTIMISE_MORE_r({ \
728 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
729 \
e384d5c1
YO
730 if (RExC_seen & REG_ZERO_LEN_SEEN) \
731 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \
538e84ed 732 \
e384d5c1
YO
733 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
734 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \
538e84ed 735 \
e384d5c1
YO
736 if (RExC_seen & REG_GPOS_SEEN) \
737 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
538e84ed 738 \
e384d5c1
YO
739 if (RExC_seen & REG_CANY_SEEN) \
740 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \
538e84ed 741 \
e384d5c1
YO
742 if (RExC_seen & REG_RECURSE_SEEN) \
743 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
538e84ed 744 \
e384d5c1
YO
745 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
746 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \
538e84ed 747 \
e384d5c1
YO
748 if (RExC_seen & REG_VERBARG_SEEN) \
749 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \
538e84ed 750 \
e384d5c1
YO
751 if (RExC_seen & REG_CUTGROUP_SEEN) \
752 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \
538e84ed 753 \
e384d5c1
YO
754 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
755 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \
538e84ed 756 \
e384d5c1
YO
757 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
758 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \
538e84ed 759 \
e384d5c1
YO
760 if (RExC_seen & REG_GOSTART_SEEN) \
761 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \
538e84ed 762 \
ee273784
YO
763 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
764 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \
765 \
538e84ed 766 PerlIO_printf(Perl_debug_log,"\n"); \
9e9ecfdf
YO
767 });
768
304ee84b
YO
769#define DEBUG_STUDYDATA(str,data,depth) \
770DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 771 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
772 "%*s" str "Pos:%"IVdf"/%"IVdf \
773 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
774 (int)(depth)*2, "", \
775 (IV)((data)->pos_min), \
776 (IV)((data)->pos_delta), \
304ee84b 777 (UV)((data)->flags), \
1de06328 778 (IV)((data)->whilem_c), \
304ee84b
YO
779 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
780 is_inf ? "INF " : "" \
1de06328
YO
781 ); \
782 if ((data)->last_found) \
783 PerlIO_printf(Perl_debug_log, \
784 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
785 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
786 SvPVX_const((data)->last_found), \
787 (IV)((data)->last_end), \
788 (IV)((data)->last_start_min), \
789 (IV)((data)->last_start_max), \
790 ((data)->longest && \
791 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
792 SvPVX_const((data)->longest_fixed), \
793 (IV)((data)->offset_fixed), \
794 ((data)->longest && \
795 (data)->longest==&((data)->longest_float)) ? "*" : "", \
796 SvPVX_const((data)->longest_float), \
797 (IV)((data)->offset_float_min), \
798 (IV)((data)->offset_float_max) \
799 ); \
800 PerlIO_printf(Perl_debug_log,"\n"); \
801});
802
653099ff 803/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 804 Update the longest found anchored substring and the longest found
653099ff
GS
805 floating substrings if needed. */
806
4327152a 807STATIC void
ea3daa5d
FC
808S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
809 SSize_t *minlenp, int is_inf)
c277df42 810{
e1ec3a88
AL
811 const STRLEN l = CHR_SVLEN(data->last_found);
812 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 813 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 814
7918f24d
NC
815 PERL_ARGS_ASSERT_SCAN_COMMIT;
816
c277df42 817 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 818 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
819 if (*data->longest == data->longest_fixed) {
820 data->offset_fixed = l ? data->last_start_min : data->pos_min;
821 if (data->flags & SF_BEFORE_EOL)
b81d288d 822 data->flags
c277df42
IZ
823 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
824 else
825 data->flags &= ~SF_FIX_BEFORE_EOL;
686b73d4 826 data->minlen_fixed=minlenp;
1de06328 827 data->lookbehind_fixed=0;
a0ed51b3 828 }
304ee84b 829 else { /* *data->longest == data->longest_float */
c277df42 830 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
831 data->offset_float_max = (l
832 ? data->last_start_max
ea3daa5d
FC
833 : (data->pos_delta == SSize_t_MAX
834 ? SSize_t_MAX
835 : data->pos_min + data->pos_delta));
836 if (is_inf
837 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
838 data->offset_float_max = SSize_t_MAX;
c277df42 839 if (data->flags & SF_BEFORE_EOL)
b81d288d 840 data->flags
c277df42
IZ
841 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
842 else
843 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
844 data->minlen_float=minlenp;
845 data->lookbehind_float=0;
c277df42
IZ
846 }
847 }
848 SvCUR_set(data->last_found, 0);
0eda9292 849 {
a28509cc 850 SV * const sv = data->last_found;
097eb12c
AL
851 if (SvUTF8(sv) && SvMAGICAL(sv)) {
852 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
853 if (mg)
854 mg->mg_len = 0;
855 }
0eda9292 856 }
c277df42
IZ
857 data->last_end = -1;
858 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 859 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
860}
861
cdd87c1d
KW
862/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
863 * list that describes which code points it matches */
864
653099ff 865STATIC void
3420edd7 866S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 867{
cdd87c1d
KW
868 /* Set the SSC 'ssc' to match an empty string or any code point */
869
557bd3fb 870 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 871
71068078 872 assert(is_ANYOF_SYNTHETIC(ssc));
3fffb88a 873
cdd87c1d
KW
874 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
875 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
93e92956 876 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
653099ff
GS
877}
878
653099ff 879STATIC int
dc3bf405 880S_ssc_is_anything(const regnode_ssc *ssc)
653099ff 881{
c144baaa
KW
882 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
883 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
884 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
885 * in any way, so there's no point in using it */
cdd87c1d
KW
886
887 UV start, end;
888 bool ret;
653099ff 889
557bd3fb 890 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 891
71068078 892 assert(is_ANYOF_SYNTHETIC(ssc));
cdd87c1d 893
93e92956 894 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
cdd87c1d
KW
895 return FALSE;
896 }
897
898 /* See if the list consists solely of the range 0 - Infinity */
899 invlist_iterinit(ssc->invlist);
900 ret = invlist_iternext(ssc->invlist, &start, &end)
901 && start == 0
902 && end == UV_MAX;
903
904 invlist_iterfinish(ssc->invlist);
905
906 if (ret) {
907 return TRUE;
908 }
909
910 /* If e.g., both \w and \W are set, matches everything */
e0e1be5f 911 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
912 int i;
913 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
914 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
915 return TRUE;
916 }
917 }
918 }
919
920 return FALSE;
653099ff
GS
921}
922
653099ff 923STATIC void
cdd87c1d 924S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 925{
cdd87c1d
KW
926 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
927 * string, any code point, or any posix class under locale */
928
557bd3fb 929 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 930
557bd3fb 931 Zero(ssc, 1, regnode_ssc);
71068078 932 set_ANYOF_SYNTHETIC(ssc);
93e92956 933 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
3420edd7 934 ssc_anything(ssc);
cdd87c1d
KW
935
936 /* If any portion of the regex is to operate under locale rules,
937 * initialization includes it. The reason this isn't done for all regexes
938 * is that the optimizer was written under the assumption that locale was
939 * all-or-nothing. Given the complexity and lack of documentation in the
940 * optimizer, and that there are inadequate test cases for locale, many
941 * parts of it may not work properly, it is safest to avoid locale unless
942 * necessary. */
943 if (RExC_contains_locale) {
944 ANYOF_POSIXL_SETALL(ssc);
cdd87c1d
KW
945 }
946 else {
947 ANYOF_POSIXL_ZERO(ssc);
948 }
653099ff
GS
949}
950
b423522f 951STATIC int
dc3bf405
BF
952S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
953 const regnode_ssc *ssc)
b423522f
KW
954{
955 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
956 * to the list of code points matched, and locale posix classes; hence does
957 * not check its flags) */
958
959 UV start, end;
960 bool ret;
961
962 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
963
71068078 964 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
965
966 invlist_iterinit(ssc->invlist);
967 ret = invlist_iternext(ssc->invlist, &start, &end)
968 && start == 0
969 && end == UV_MAX;
970
971 invlist_iterfinish(ssc->invlist);
972
973 if (! ret) {
974 return FALSE;
975 }
976
e0e1be5f 977 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
31f05a37 978 return FALSE;
b423522f
KW
979 }
980
981 return TRUE;
982}
983
984STATIC SV*
985S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
5c0f85ef 986 const regnode_charclass* const node)
b423522f
KW
987{
988 /* Returns a mortal inversion list defining which code points are matched
989 * by 'node', which is of type ANYOF. Handles complementing the result if
990 * appropriate. If some code points aren't knowable at this time, the
31f05a37
KW
991 * returned list must, and will, contain every code point that is a
992 * possibility. */
b423522f
KW
993
994 SV* invlist = sv_2mortal(_new_invlist(0));
1ee208c4 995 SV* only_utf8_locale_invlist = NULL;
b423522f
KW
996 unsigned int i;
997 const U32 n = ARG(node);
31f05a37 998 bool new_node_has_latin1 = FALSE;
b423522f
KW
999
1000 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1001
1002 /* Look at the data structure created by S_set_ANYOF_arg() */
93e92956 1003 if (n != ANYOF_ONLY_HAS_BITMAP) {
b423522f
KW
1004 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1005 AV * const av = MUTABLE_AV(SvRV(rv));
1006 SV **const ary = AvARRAY(av);
1007 assert(RExC_rxi->data->what[n] == 's');
1008
1009 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1010 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1011 }
1012 else if (ary[0] && ary[0] != &PL_sv_undef) {
1013
1014 /* Here, no compile-time swash, and there are things that won't be
1015 * known until runtime -- we have to assume it could be anything */
1016 return _add_range_to_invlist(invlist, 0, UV_MAX);
1017 }
1ee208c4 1018 else if (ary[3] && ary[3] != &PL_sv_undef) {
b423522f
KW
1019
1020 /* Here no compile-time swash, and no run-time only data. Use the
1021 * node's inversion list */
1ee208c4
KW
1022 invlist = sv_2mortal(invlist_clone(ary[3]));
1023 }
1024
1025 /* Get the code points valid only under UTF-8 locales */
1026 if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1027 && ary[2] && ary[2] != &PL_sv_undef)
1028 {
1029 only_utf8_locale_invlist = ary[2];
b423522f
KW
1030 }
1031 }
1032
dcb20b36
KW
1033 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1034 * code points, and an inversion list for the others, but if there are code
1035 * points that should match only conditionally on the target string being
1036 * UTF-8, those are placed in the inversion list, and not the bitmap.
1037 * Since there are circumstances under which they could match, they are
1038 * included in the SSC. But if the ANYOF node is to be inverted, we have
1039 * to exclude them here, so that when we invert below, the end result
1040 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1041 * have to do this here before we add the unconditionally matched code
1042 * points */
b423522f
KW
1043 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1044 _invlist_intersection_complement_2nd(invlist,
1045 PL_UpperLatin1,
1046 &invlist);
1047 }
1048
1049 /* Add in the points from the bit map */
dcb20b36 1050 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
b423522f
KW
1051 if (ANYOF_BITMAP_TEST(node, i)) {
1052 invlist = add_cp_to_invlist(invlist, i);
31f05a37 1053 new_node_has_latin1 = TRUE;
b423522f
KW
1054 }
1055 }
1056
1057 /* If this can match all upper Latin1 code points, have to add them
1058 * as well */
93e92956 1059 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
b423522f
KW
1060 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1061 }
1062
1063 /* Similarly for these */
93e92956 1064 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
e0a1ff7a 1065 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
b423522f
KW
1066 }
1067
1068 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1069 _invlist_invert(invlist);
1070 }
31f05a37
KW
1071 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1072
1073 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1074 * locale. We can skip this if there are no 0-255 at all. */
1075 _invlist_union(invlist, PL_Latin1, &invlist);
1076 }
1077
1ee208c4
KW
1078 /* Similarly add the UTF-8 locale possible matches. These have to be
1079 * deferred until after the non-UTF-8 locale ones are taken care of just
1080 * above, or it leads to wrong results under ANYOF_INVERT */
1081 if (only_utf8_locale_invlist) {
31f05a37 1082 _invlist_union_maybe_complement_2nd(invlist,
1ee208c4 1083 only_utf8_locale_invlist,
31f05a37
KW
1084 ANYOF_FLAGS(node) & ANYOF_INVERT,
1085 &invlist);
1086 }
b423522f
KW
1087
1088 return invlist;
1089}
1090
1051e1c4 1091/* These two functions currently do the exact same thing */
557bd3fb 1092#define ssc_init_zero ssc_init
653099ff 1093
cdd87c1d
KW
1094#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1095#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1096
557bd3fb 1097/* 'AND' a given class with another one. Can create false positives. 'ssc'
93e92956
KW
1098 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1099 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1100
653099ff 1101STATIC void
b423522f 1102S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1103 const regnode_charclass *and_with)
653099ff 1104{
cdd87c1d
KW
1105 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1106 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1107
a0dd4231
KW
1108 SV* anded_cp_list;
1109 U8 anded_flags;
1e6ade67 1110
cdd87c1d 1111 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1112
71068078 1113 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1114
1115 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1116 * the code point inversion list and just the relevant flags */
71068078 1117 if (is_ANYOF_SYNTHETIC(and_with)) {
7dcac5f6 1118 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
a0dd4231 1119 anded_flags = ANYOF_FLAGS(and_with);
e9b08962
KW
1120
1121 /* XXX This is a kludge around what appears to be deficiencies in the
1122 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1123 * there are paths through the optimizer where it doesn't get weeded
1124 * out when it should. And if we don't make some extra provision for
1125 * it like the code just below, it doesn't get added when it should.
1126 * This solution is to add it only when AND'ing, which is here, and
1127 * only when what is being AND'ed is the pristine, original node
1128 * matching anything. Thus it is like adding it to ssc_anything() but
1129 * only when the result is to be AND'ed. Probably the same solution
1130 * could be adopted for the same problem we have with /l matching,
1131 * which is solved differently in S_ssc_init(), and that would lead to
1132 * fewer false positives than that solution has. But if this solution
1133 * creates bugs, the consequences are only that a warning isn't raised
1134 * that should be; while the consequences for having /l bugs is
1135 * incorrect matches */
7dcac5f6 1136 if (ssc_is_anything((regnode_ssc *)and_with)) {
e9b08962
KW
1137 anded_flags |= ANYOF_WARN_SUPER;
1138 }
a0dd4231
KW
1139 }
1140 else {
5c0f85ef 1141 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
eff8b7dc 1142 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
a0dd4231
KW
1143 }
1144
1145 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1146
1147 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1148 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1149 * 'and_with' may be inverted. When not inverted, we have the situation of
1150 * computing:
1151 * (C1 | P1) & (C2 | P2)
1152 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1153 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1154 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1155 * <= ((C1 & C2) | P1 | P2)
1156 * Alternatively, the last few steps could be:
1157 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1158 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1159 * <= (C1 | C2 | (P1 & P2))
1160 * We favor the second approach if either P1 or P2 is non-empty. This is
1161 * because these components are a barrier to doing optimizations, as what
1162 * they match cannot be known until the moment of matching as they are
1163 * dependent on the current locale, 'AND"ing them likely will reduce or
1164 * eliminate them.
1165 * But we can do better if we know that C1,P1 are in their initial state (a
1166 * frequent occurrence), each matching everything:
1167 * (<everything>) & (C2 | P2) = C2 | P2
1168 * Similarly, if C2,P2 are in their initial state (again a frequent
1169 * occurrence), the result is a no-op
1170 * (C1 | P1) & (<everything>) = C1 | P1
1171 *
1172 * Inverted, we have
1173 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1174 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1175 * <= (C1 & ~C2) | (P1 & ~P2)
1176 * */
1aa99e6b 1177
a0dd4231 1178 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
71068078 1179 && ! is_ANYOF_SYNTHETIC(and_with))
a0dd4231 1180 {
cdd87c1d 1181 unsigned int i;
8951c461 1182
cdd87c1d
KW
1183 ssc_intersection(ssc,
1184 anded_cp_list,
1185 FALSE /* Has already been inverted */
1186 );
c6b76537 1187
cdd87c1d
KW
1188 /* If either P1 or P2 is empty, the intersection will be also; can skip
1189 * the loop */
93e92956 1190 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
cdd87c1d
KW
1191 ANYOF_POSIXL_ZERO(ssc);
1192 }
e0e1be5f 1193 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1194
1195 /* Note that the Posix class component P from 'and_with' actually
1196 * looks like:
1197 * P = Pa | Pb | ... | Pn
1198 * where each component is one posix class, such as in [\w\s].
1199 * Thus
1200 * ~P = ~(Pa | Pb | ... | Pn)
1201 * = ~Pa & ~Pb & ... & ~Pn
1202 * <= ~Pa | ~Pb | ... | ~Pn
1203 * The last is something we can easily calculate, but unfortunately
1204 * is likely to have many false positives. We could do better
1205 * in some (but certainly not all) instances if two classes in
1206 * P have known relationships. For example
1207 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1208 * So
1209 * :lower: & :print: = :lower:
1210 * And similarly for classes that must be disjoint. For example,
1211 * since \s and \w can have no elements in common based on rules in
1212 * the POSIX standard,
1213 * \w & ^\S = nothing
1214 * Unfortunately, some vendor locales do not meet the Posix
1215 * standard, in particular almost everything by Microsoft.
1216 * The loop below just changes e.g., \w into \W and vice versa */
1217
1ee208c4 1218 regnode_charclass_posixl temp;
cdd87c1d
KW
1219 int add = 1; /* To calculate the index of the complement */
1220
1221 ANYOF_POSIXL_ZERO(&temp);
1222 for (i = 0; i < ANYOF_MAX; i++) {
1223 assert(i % 2 != 0
7dcac5f6
KW
1224 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1225 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
cdd87c1d 1226
7dcac5f6 1227 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
cdd87c1d
KW
1228 ANYOF_POSIXL_SET(&temp, i + add);
1229 }
1230 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1231 }
1232 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1233
cdd87c1d
KW
1234 } /* else ssc already has no posixes */
1235 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1236 in its initial state */
71068078 1237 else if (! is_ANYOF_SYNTHETIC(and_with)
7dcac5f6 1238 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
cdd87c1d
KW
1239 {
1240 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1241 * copy it over 'ssc' */
1242 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
71068078 1243 if (is_ANYOF_SYNTHETIC(and_with)) {
cdd87c1d
KW
1244 StructCopy(and_with, ssc, regnode_ssc);
1245 }
1246 else {
1247 ssc->invlist = anded_cp_list;
1248 ANYOF_POSIXL_ZERO(ssc);
93e92956 1249 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1250 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
cdd87c1d
KW
1251 }
1252 }
1253 }
e0e1be5f 1254 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
93e92956 1255 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
cdd87c1d
KW
1256 {
1257 /* One or the other of P1, P2 is non-empty. */
93e92956 1258 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1ea8b7fe
KW
1259 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1260 }
cdd87c1d
KW
1261 ssc_union(ssc, anded_cp_list, FALSE);
1262 }
1263 else { /* P1 = P2 = empty */
1264 ssc_intersection(ssc, anded_cp_list, FALSE);
1265 }
137165a6 1266 }
653099ff
GS
1267}
1268
653099ff 1269STATIC void
cdd87c1d 1270S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1271 const regnode_charclass *or_with)
653099ff 1272{
cdd87c1d
KW
1273 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1274 * another SSC or a regular ANYOF class. Can create false positives if
1275 * 'or_with' is to be inverted. */
7918f24d 1276
a0dd4231
KW
1277 SV* ored_cp_list;
1278 U8 ored_flags;
c6b76537 1279
cdd87c1d 1280 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 1281
71068078 1282 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1283
1284 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1285 * the code point inversion list and just the relevant flags */
71068078 1286 if (is_ANYOF_SYNTHETIC(or_with)) {
7dcac5f6 1287 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
a0dd4231
KW
1288 ored_flags = ANYOF_FLAGS(or_with);
1289 }
1290 else {
5c0f85ef 1291 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
eff8b7dc 1292 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
a0dd4231
KW
1293 }
1294
1295 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
1296
1297 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1298 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1299 * 'or_with' may be inverted. When not inverted, we have the simple
1300 * situation of computing:
1301 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1302 * If P1|P2 yields a situation with both a class and its complement are
1303 * set, like having both \w and \W, this matches all code points, and we
1304 * can delete these from the P component of the ssc going forward. XXX We
1305 * might be able to delete all the P components, but I (khw) am not certain
1306 * about this, and it is better to be safe.
1307 *
1308 * Inverted, we have
1309 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1310 * <= (C1 | P1) | ~C2
1311 * <= (C1 | ~C2) | P1
1312 * (which results in actually simpler code than the non-inverted case)
1313 * */
9826f543 1314
a0dd4231 1315 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
71068078 1316 && ! is_ANYOF_SYNTHETIC(or_with))
a0dd4231 1317 {
cdd87c1d 1318 /* We ignore P2, leaving P1 going forward */
1ea8b7fe 1319 } /* else Not inverted */
93e92956 1320 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1321 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
e0e1be5f 1322 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1323 unsigned int i;
1324 for (i = 0; i < ANYOF_MAX; i += 2) {
1325 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1326 {
1327 ssc_match_all_cp(ssc);
1328 ANYOF_POSIXL_CLEAR(ssc, i);
1329 ANYOF_POSIXL_CLEAR(ssc, i+1);
cdd87c1d
KW
1330 }
1331 }
1332 }
1aa99e6b 1333 }
cdd87c1d
KW
1334
1335 ssc_union(ssc,
1336 ored_cp_list,
1337 FALSE /* Already has been inverted */
1338 );
653099ff
GS
1339}
1340
b423522f
KW
1341PERL_STATIC_INLINE void
1342S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1343{
1344 PERL_ARGS_ASSERT_SSC_UNION;
1345
71068078 1346 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1347
1348 _invlist_union_maybe_complement_2nd(ssc->invlist,
1349 invlist,
1350 invert2nd,
1351 &ssc->invlist);
1352}
1353
1354PERL_STATIC_INLINE void
1355S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1356 SV* const invlist,
1357 const bool invert2nd)
1358{
1359 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1360
71068078 1361 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1362
1363 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1364 invlist,
1365 invert2nd,
1366 &ssc->invlist);
1367}
1368
1369PERL_STATIC_INLINE void
1370S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1371{
1372 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1373
71068078 1374 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1375
1376 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1377}
1378
1379PERL_STATIC_INLINE void
1380S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1381{
1382 /* AND just the single code point 'cp' into the SSC 'ssc' */
1383
1384 SV* cp_list = _new_invlist(2);
1385
1386 PERL_ARGS_ASSERT_SSC_CP_AND;
1387
71068078 1388 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1389
1390 cp_list = add_cp_to_invlist(cp_list, cp);
1391 ssc_intersection(ssc, cp_list,
1392 FALSE /* Not inverted */
1393 );
1394 SvREFCNT_dec_NN(cp_list);
1395}
1396
1397PERL_STATIC_INLINE void
dc3bf405 1398S_ssc_clear_locale(regnode_ssc *ssc)
b423522f
KW
1399{
1400 /* Set the SSC 'ssc' to not match any locale things */
b423522f
KW
1401 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1402
71068078 1403 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1404
1405 ANYOF_POSIXL_ZERO(ssc);
1406 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1407}
1408
1409STATIC void
1410S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1411{
1412 /* The inversion list in the SSC is marked mortal; now we need a more
1413 * permanent copy, which is stored the same way that is done in a regular
dcb20b36
KW
1414 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1415 * map */
b423522f
KW
1416
1417 SV* invlist = invlist_clone(ssc->invlist);
1418
1419 PERL_ARGS_ASSERT_SSC_FINALIZE;
1420
71068078 1421 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f 1422
a0dd4231 1423 /* The code in this file assumes that all but these flags aren't relevant
93e92956
KW
1424 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1425 * by the time we reach here */
eff8b7dc 1426 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
a0dd4231 1427
b423522f
KW
1428 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1429
1ee208c4
KW
1430 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1431 NULL, NULL, NULL, FALSE);
b423522f 1432
85c8e306
KW
1433 /* Make sure is clone-safe */
1434 ssc->invlist = NULL;
1435
e0e1be5f 1436 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
93e92956 1437 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
e0e1be5f 1438 }
1462525b
KW
1439
1440 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
b423522f
KW
1441}
1442
a3621e74
YO
1443#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1444#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1445#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
538e84ed
KW
1446#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1447 ? (TRIE_LIST_CUR( idx ) - 1) \
1448 : 0 )
a3621e74 1449
3dab1dad
YO
1450
1451#ifdef DEBUGGING
07be1b83 1452/*
2b8b4781
NC
1453 dump_trie(trie,widecharmap,revcharmap)
1454 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1455 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
1456
1457 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
1458 The _interim_ variants are used for debugging the interim
1459 tables that are used to generate the final compressed
1460 representation which is what dump_trie expects.
1461
486ec47a 1462 Part of the reason for their existence is to provide a form
3dab1dad 1463 of documentation as to how the different representations function.
07be1b83
YO
1464
1465*/
3dab1dad
YO
1466
1467/*
3dab1dad
YO
1468 Dumps the final compressed table form of the trie to Perl_debug_log.
1469 Used for debugging make_trie().
1470*/
b9a59e08 1471
3dab1dad 1472STATIC void
2b8b4781
NC
1473S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1474 AV *revcharmap, U32 depth)
3dab1dad
YO
1475{
1476 U32 state;
ab3bbdeb 1477 SV *sv=sv_newmortal();
55eed653 1478 int colwidth= widecharmap ? 6 : 4;
2e64971a 1479 U16 word;
3dab1dad
YO
1480 GET_RE_DEBUG_FLAGS_DECL;
1481
7918f24d 1482 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 1483
3dab1dad
YO
1484 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1485 (int)depth * 2 + 2,"",
1486 "Match","Base","Ofs" );
1487
1488 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 1489 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 1490 if ( tmp ) {
538e84ed 1491 PerlIO_printf( Perl_debug_log, "%*s",
ab3bbdeb 1492 colwidth,
538e84ed 1493 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1494 PL_colors[0], PL_colors[1],
1495 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
1496 PERL_PV_ESCAPE_FIRSTCHAR
1497 )
ab3bbdeb 1498 );
3dab1dad
YO
1499 }
1500 }
1501 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1502 (int)depth * 2 + 2,"");
1503
1504 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 1505 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
1506 PerlIO_printf( Perl_debug_log, "\n");
1507
1e2e3d02 1508 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 1509 const U32 base = trie->states[ state ].trans.base;
3dab1dad 1510
538e84ed
KW
1511 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1512 (int)depth * 2 + 2,"", (UV)state);
3dab1dad
YO
1513
1514 if ( trie->states[ state ].wordnum ) {
538e84ed
KW
1515 PerlIO_printf( Perl_debug_log, " W%4X",
1516 trie->states[ state ].wordnum );
3dab1dad
YO
1517 } else {
1518 PerlIO_printf( Perl_debug_log, "%6s", "" );
1519 }
1520
1521 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1522
1523 if ( base ) {
1524 U32 ofs = 0;
1525
1526 while( ( base + ofs < trie->uniquecharcount ) ||
1527 ( base + ofs - trie->uniquecharcount < trie->lasttrans
538e84ed
KW
1528 && trie->trans[ base + ofs - trie->uniquecharcount ].check
1529 != state))
3dab1dad
YO
1530 ofs++;
1531
1532 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1533
1534 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
538e84ed
KW
1535 if ( ( base + ofs >= trie->uniquecharcount )
1536 && ( base + ofs - trie->uniquecharcount
1537 < trie->lasttrans )
1538 && trie->trans[ base + ofs
1539 - trie->uniquecharcount ].check == state )
3dab1dad 1540 {
ab3bbdeb
YO
1541 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1542 colwidth,
538e84ed
KW
1543 (UV)trie->trans[ base + ofs
1544 - trie->uniquecharcount ].next );
3dab1dad 1545 } else {
ab3bbdeb 1546 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
1547 }
1548 }
1549
1550 PerlIO_printf( Perl_debug_log, "]");
1551
1552 }
1553 PerlIO_printf( Perl_debug_log, "\n" );
1554 }
538e84ed
KW
1555 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1556 (int)depth*2, "");
2e64971a
DM
1557 for (word=1; word <= trie->wordcount; word++) {
1558 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1559 (int)word, (int)(trie->wordinfo[word].prev),
1560 (int)(trie->wordinfo[word].len));
1561 }
1562 PerlIO_printf(Perl_debug_log, "\n" );
538e84ed 1563}
3dab1dad 1564/*
3dab1dad 1565 Dumps a fully constructed but uncompressed trie in list form.
538e84ed 1566 List tries normally only are used for construction when the number of
3dab1dad
YO
1567 possible chars (trie->uniquecharcount) is very high.
1568 Used for debugging make_trie().
1569*/
1570STATIC void
55eed653 1571S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1572 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1573 U32 depth)
3dab1dad
YO
1574{
1575 U32 state;
ab3bbdeb 1576 SV *sv=sv_newmortal();
55eed653 1577 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1578 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1579
1580 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1581
3dab1dad 1582 /* print out the table precompression. */
ab3bbdeb
YO
1583 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1584 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1585 "------:-----+-----------------\n" );
538e84ed 1586
3dab1dad
YO
1587 for( state=1 ; state < next_alloc ; state ++ ) {
1588 U16 charid;
538e84ed 1589
ab3bbdeb 1590 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1591 (int)depth * 2 + 2,"", (UV)state );
1592 if ( ! trie->states[ state ].wordnum ) {
1593 PerlIO_printf( Perl_debug_log, "%5s| ","");
1594 } else {
1595 PerlIO_printf( Perl_debug_log, "W%4x| ",
1596 trie->states[ state ].wordnum
1597 );
1598 }
1599 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
538e84ed
KW
1600 SV ** const tmp = av_fetch( revcharmap,
1601 TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1602 if ( tmp ) {
1603 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1604 colwidth,
538e84ed
KW
1605 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1606 colwidth,
1607 PL_colors[0], PL_colors[1],
1608 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1609 | PERL_PV_ESCAPE_FIRSTCHAR
ab3bbdeb 1610 ) ,
1e2e3d02
YO
1611 TRIE_LIST_ITEM(state,charid).forid,
1612 (UV)TRIE_LIST_ITEM(state,charid).newstate
1613 );
538e84ed 1614 if (!(charid % 10))
664e119d
RGS
1615 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1616 (int)((depth * 2) + 14), "");
1e2e3d02 1617 }
ab3bbdeb
YO
1618 }
1619 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad 1620 }
538e84ed 1621}
3dab1dad
YO
1622
1623/*
3dab1dad 1624 Dumps a fully constructed but uncompressed trie in table form.
538e84ed
KW
1625 This is the normal DFA style state transition table, with a few
1626 twists to facilitate compression later.
3dab1dad
YO
1627 Used for debugging make_trie().
1628*/
1629STATIC void
55eed653 1630S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1631 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1632 U32 depth)
3dab1dad
YO
1633{
1634 U32 state;
1635 U16 charid;
ab3bbdeb 1636 SV *sv=sv_newmortal();
55eed653 1637 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1638 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1639
1640 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
538e84ed 1641
3dab1dad
YO
1642 /*
1643 print out the table precompression so that we can do a visual check
1644 that they are identical.
1645 */
538e84ed 1646
3dab1dad
YO
1647 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1648
1649 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1650 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1651 if ( tmp ) {
538e84ed 1652 PerlIO_printf( Perl_debug_log, "%*s",
ab3bbdeb 1653 colwidth,
538e84ed 1654 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1655 PL_colors[0], PL_colors[1],
1656 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
1657 PERL_PV_ESCAPE_FIRSTCHAR
1658 )
ab3bbdeb 1659 );
3dab1dad
YO
1660 }
1661 }
1662
1663 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1664
1665 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1666 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1667 }
1668
1669 PerlIO_printf( Perl_debug_log, "\n" );
1670
1671 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1672
538e84ed 1673 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
3dab1dad
YO
1674 (int)depth * 2 + 2,"",
1675 (UV)TRIE_NODENUM( state ) );
1676
1677 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1678 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1679 if (v)
1680 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1681 else
1682 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1683 }
1684 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
538e84ed
KW
1685 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1686 (UV)trie->trans[ state ].check );
3dab1dad 1687 } else {
538e84ed
KW
1688 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1689 (UV)trie->trans[ state ].check,
3dab1dad
YO
1690 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1691 }
1692 }
07be1b83 1693}
3dab1dad
YO
1694
1695#endif
1696
2e64971a 1697
786e8c11
YO
1698/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1699 startbranch: the first branch in the whole branch sequence
1700 first : start branch of sequence of branch-exact nodes.
1701 May be the same as startbranch
1702 last : Thing following the last branch.
1703 May be the same as tail.
1704 tail : item following the branch sequence
1705 count : words in the sequence
cce29a1d 1706 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
786e8c11 1707 depth : indent depth
3dab1dad 1708
786e8c11 1709Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1710
786e8c11
YO
1711A trie is an N'ary tree where the branches are determined by digital
1712decomposition of the key. IE, at the root node you look up the 1st character and
1713follow that branch repeat until you find the end of the branches. Nodes can be
1714marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1715
786e8c11 1716 /he|she|his|hers/
72f13be8 1717
786e8c11
YO
1718would convert into the following structure. Numbers represent states, letters
1719following numbers represent valid transitions on the letter from that state, if
1720the number is in square brackets it represents an accepting state, otherwise it
1721will be in parenthesis.
07be1b83 1722
786e8c11
YO
1723 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1724 | |
1725 | (2)
1726 | |
1727 (1) +-i->(6)-+-s->[7]
1728 |
1729 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1730
786e8c11
YO
1731 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1732
1733This shows that when matching against the string 'hers' we will begin at state 1
1734read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1735then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1736is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1737single traverse. We store a mapping from accepting to state to which word was
1738matched, and then when we have multiple possibilities we try to complete the
1739rest of the regex in the order in which they occured in the alternation.
1740
1741The only prior NFA like behaviour that would be changed by the TRIE support is
1742the silent ignoring of duplicate alternations which are of the form:
1743
1744 / (DUPE|DUPE) X? (?{ ... }) Y /x
1745
4b714af6 1746Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 1747and without the optimisation. With the optimisations dupes will be silently
486ec47a 1748ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
1749the following demonstrates:
1750
1751 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1752
1753which prints out 'word' three times, but
1754
1755 'words'=~/(word|word|word)(?{ print $1 })S/
1756
1757which doesnt print it out at all. This is due to other optimisations kicking in.
1758
1759Example of what happens on a structural level:
1760
486ec47a 1761The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
1762
1763 1: CURLYM[1] {1,32767}(18)
1764 5: BRANCH(8)
1765 6: EXACT <ac>(16)
1766 8: BRANCH(11)
1767 9: EXACT <ad>(16)
1768 11: BRANCH(14)
1769 12: EXACT <ab>(16)
1770 16: SUCCEED(0)
1771 17: NOTHING(18)
1772 18: END(0)
1773
1774This would be optimizable with startbranch=5, first=5, last=16, tail=16
1775and should turn into:
1776
1777 1: CURLYM[1] {1,32767}(18)
1778 5: TRIE(16)
1779 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1780 <ac>
1781 <ad>
1782 <ab>
1783 16: SUCCEED(0)
1784 17: NOTHING(18)
1785 18: END(0)
1786
1787Cases where tail != last would be like /(?foo|bar)baz/:
1788
1789 1: BRANCH(4)
1790 2: EXACT <foo>(8)
1791 4: BRANCH(7)
1792 5: EXACT <bar>(8)
1793 7: TAIL(8)
1794 8: EXACT <baz>(10)
1795 10: END(0)
1796
1797which would be optimizable with startbranch=1, first=1, last=7, tail=8
1798and would end up looking like:
1799
1800 1: TRIE(8)
1801 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1802 <foo>
1803 <bar>
1804 7: TAIL(8)
1805 8: EXACT <baz>(10)
1806 10: END(0)
1807
c80e42f3 1808 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
1809
1810is the recommended Unicode-aware way of saying
1811
1812 *(d++) = uv;
1813*/
1814
fab2782b 1815#define TRIE_STORE_REVCHAR(val) \
786e8c11 1816 STMT_START { \
73031816 1817 if (UTF) { \
fab2782b 1818 SV *zlopp = newSV(7); /* XXX: optimize me */ \
88c9ea1e 1819 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
c80e42f3 1820 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
73031816
NC
1821 SvCUR_set(zlopp, kapow - flrbbbbb); \
1822 SvPOK_on(zlopp); \
1823 SvUTF8_on(zlopp); \
1824 av_push(revcharmap, zlopp); \
1825 } else { \
fab2782b 1826 char ooooff = (char)val; \
73031816
NC
1827 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1828 } \
1829 } STMT_END
786e8c11 1830
914a25d5
KW
1831/* This gets the next character from the input, folding it if not already
1832 * folded. */
1833#define TRIE_READ_CHAR STMT_START { \
1834 wordlen++; \
1835 if ( UTF ) { \
1836 /* if it is UTF then it is either already folded, or does not need \
1837 * folding */ \
1c1d615a 1838 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
1839 } \
1840 else if (folder == PL_fold_latin1) { \
7d006b13
KW
1841 /* This folder implies Unicode rules, which in the range expressible \
1842 * by not UTF is the lower case, with the two exceptions, one of \
1843 * which should have been taken care of before calling this */ \
1844 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1845 uvc = toLOWER_L1(*uc); \
1846 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1847 len = 1; \
914a25d5
KW
1848 } else { \
1849 /* raw data, will be folded later if needed */ \
1850 uvc = (U32)*uc; \
1851 len = 1; \
1852 } \
786e8c11
YO
1853} STMT_END
1854
1855
1856
1857#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1858 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1859 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1860 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1861 } \
1862 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1863 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1864 TRIE_LIST_CUR( state )++; \
1865} STMT_END
07be1b83 1866
786e8c11
YO
1867#define TRIE_LIST_NEW(state) STMT_START { \
1868 Newxz( trie->states[ state ].trans.list, \
1869 4, reg_trie_trans_le ); \
1870 TRIE_LIST_CUR( state ) = 1; \
1871 TRIE_LIST_LEN( state ) = 4; \
1872} STMT_END
07be1b83 1873
786e8c11
YO
1874#define TRIE_HANDLE_WORD(state) STMT_START { \
1875 U16 dupe= trie->states[ state ].wordnum; \
1876 regnode * const noper_next = regnext( noper ); \
1877 \
786e8c11
YO
1878 DEBUG_r({ \
1879 /* store the word for dumping */ \
1880 SV* tmp; \
1881 if (OP(noper) != NOTHING) \
740cce10 1882 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1883 else \
740cce10 1884 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1885 av_push( trie_words, tmp ); \
786e8c11
YO
1886 }); \
1887 \
1888 curword++; \
2e64971a
DM
1889 trie->wordinfo[curword].prev = 0; \
1890 trie->wordinfo[curword].len = wordlen; \
1891 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1892 \
1893 if ( noper_next < tail ) { \
1894 if (!trie->jump) \
538e84ed
KW
1895 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1896 sizeof(U16) ); \
7f69552c 1897 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1898 if (!jumper) \
1899 jumper = noper_next; \
1900 if (!nextbranch) \
1901 nextbranch= regnext(cur); \
1902 } \
1903 \
1904 if ( dupe ) { \
2e64971a
DM
1905 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1906 /* chain, so that when the bits of chain are later */\
1907 /* linked together, the dups appear in the chain */\
1908 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1909 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1910 } else { \
1911 /* we haven't inserted this word yet. */ \
1912 trie->states[ state ].wordnum = curword; \
1913 } \
1914} STMT_END
07be1b83 1915
3dab1dad 1916
786e8c11
YO
1917#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1918 ( ( base + charid >= ucharcount \
1919 && base + charid < ubound \
1920 && state == trie->trans[ base - ucharcount + charid ].check \
1921 && trie->trans[ base - ucharcount + charid ].next ) \
1922 ? trie->trans[ base - ucharcount + charid ].next \
1923 : ( state==1 ? special : 0 ) \
1924 )
3dab1dad 1925
786e8c11
YO
1926#define MADE_TRIE 1
1927#define MADE_JUMP_TRIE 2
1928#define MADE_EXACT_TRIE 4
3dab1dad 1929
a3621e74 1930STATIC I32
538e84ed
KW
1931S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1932 regnode *first, regnode *last, regnode *tail,
1933 U32 word_count, U32 flags, U32 depth)
a3621e74
YO
1934{
1935 /* first pass, loop through and scan words */
1936 reg_trie_data *trie;
55eed653 1937 HV *widecharmap = NULL;
2b8b4781 1938 AV *revcharmap = newAV();
a3621e74 1939 regnode *cur;
a3621e74
YO
1940 STRLEN len = 0;
1941 UV uvc = 0;
1942 U16 curword = 0;
1943 U32 next_alloc = 0;
786e8c11
YO
1944 regnode *jumper = NULL;
1945 regnode *nextbranch = NULL;
7f69552c 1946 regnode *convert = NULL;
2e64971a 1947 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1948 /* we just use folder as a flag in utf8 */
1e696034 1949 const U8 * folder = NULL;
a3621e74 1950
2b8b4781 1951#ifdef DEBUGGING
cf78de0b 1952 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2b8b4781
NC
1953 AV *trie_words = NULL;
1954 /* along with revcharmap, this only used during construction but both are
1955 * useful during debugging so we store them in the struct when debugging.
8e11feef 1956 */
2b8b4781 1957#else
cf78de0b 1958 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 1959 STRLEN trie_charcount=0;
3dab1dad 1960#endif
2b8b4781 1961 SV *re_trie_maxbuff;
a3621e74 1962 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1963
1964 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1965#ifndef DEBUGGING
1966 PERL_UNUSED_ARG(depth);
1967#endif
a3621e74 1968
1e696034 1969 switch (flags) {
79a81a6e 1970 case EXACT: break;
2f7f8cb1 1971 case EXACTFA:
fab2782b 1972 case EXACTFU_SS:
1e696034
KW
1973 case EXACTFU: folder = PL_fold_latin1; break;
1974 case EXACTF: folder = PL_fold; break;
fab2782b 1975 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
1976 }
1977
c944940b 1978 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1979 trie->refcount = 1;
3dab1dad 1980 trie->startstate = 1;
786e8c11 1981 trie->wordcount = word_count;
f8fc2ecf 1982 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1983 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
fab2782b 1984 if (flags == EXACT)
c944940b 1985 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1986 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1987 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1988
a3621e74 1989 DEBUG_r({
2b8b4781 1990 trie_words = newAV();
a3621e74 1991 });
a3621e74 1992
0111c4fd 1993 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
316ebaf2 1994 assert(re_trie_maxbuff);
a3621e74 1995 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1996 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1997 }
df826430 1998 DEBUG_TRIE_COMPILE_r({
538e84ed
KW
1999 PerlIO_printf( Perl_debug_log,
2000 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2001 (int)depth * 2 + 2, "",
2002 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2003 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
3dab1dad 2004 });
538e84ed 2005
7f69552c
YO
2006 /* Find the node we are going to overwrite */
2007 if ( first == startbranch && OP( last ) != BRANCH ) {
2008 /* whole branch chain */
2009 convert = first;
2010 } else {
2011 /* branch sub-chain */
2012 convert = NEXTOPER( first );
2013 }
538e84ed 2014
a3621e74
YO
2015 /* -- First loop and Setup --
2016
2017 We first traverse the branches and scan each word to determine if it
2018 contains widechars, and how many unique chars there are, this is
2019 important as we have to build a table with at least as many columns as we
2020 have unique chars.
2021
2022 We use an array of integers to represent the character codes 0..255
538e84ed
KW
2023 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2024 the native representation of the character value as the key and IV's for
2025 the coded index.
a3621e74
YO
2026
2027 *TODO* If we keep track of how many times each character is used we can
2028 remap the columns so that the table compression later on is more
3b753521 2029 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
2030 middle and the least common are on the outside. IMO this would be better
2031 than a most to least common mapping as theres a decent chance the most
2032 common letter will share a node with the least common, meaning the node
486ec47a 2033 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
2034 case is when we have the least common nodes twice.
2035
2036 */
2037
a3621e74 2038 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 2039 regnode *noper = NEXTOPER( cur );
e1ec3a88 2040 const U8 *uc = (U8*)STRING( noper );
df826430 2041 const U8 *e = uc + STR_LEN( noper );
bc031a7d 2042 int foldlen = 0;
07be1b83 2043 U32 wordlen = 0; /* required init */
bc031a7d
KW
2044 STRLEN minchars = 0;
2045 STRLEN maxchars = 0;
538e84ed
KW
2046 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2047 bitmap?*/
a3621e74 2048
3dab1dad 2049 if (OP(noper) == NOTHING) {
df826430
YO
2050 regnode *noper_next= regnext(noper);
2051 if (noper_next != tail && OP(noper_next) == flags) {
2052 noper = noper_next;
2053 uc= (U8*)STRING(noper);
2054 e= uc + STR_LEN(noper);
2055 trie->minlen= STR_LEN(noper);
2056 } else {
2057 trie->minlen= 0;
2058 continue;
2059 }
3dab1dad 2060 }
df826430 2061
fab2782b 2062 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2063 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2064 regardless of encoding */
fab2782b
YO
2065 if (OP( noper ) == EXACTFU_SS) {
2066 /* false positives are ok, so just set this */
0dc4a61d 2067 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2068 }
2069 }
bc031a7d
KW
2070 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2071 branch */
3dab1dad 2072 TRIE_CHARCOUNT(trie)++;
a3621e74 2073 TRIE_READ_CHAR;
645de4ce 2074
bc031a7d
KW
2075 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2076 * is in effect. Under /i, this character can match itself, or
2077 * anything that folds to it. If not under /i, it can match just
2078 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2079 * all fold to k, and all are single characters. But some folds
2080 * expand to more than one character, so for example LATIN SMALL
2081 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2082 * the string beginning at 'uc' is 'ffi', it could be matched by
2083 * three characters, or just by the one ligature character. (It
2084 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2085 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2086 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2087 * match.) The trie needs to know the minimum and maximum number
2088 * of characters that could match so that it can use size alone to
2089 * quickly reject many match attempts. The max is simple: it is
2090 * the number of folded characters in this branch (since a fold is
2091 * never shorter than what folds to it. */
2092
2093 maxchars++;
2094
2095 /* And the min is equal to the max if not under /i (indicated by
2096 * 'folder' being NULL), or there are no multi-character folds. If
2097 * there is a multi-character fold, the min is incremented just
2098 * once, for the character that folds to the sequence. Each
2099 * character in the sequence needs to be added to the list below of
2100 * characters in the trie, but we count only the first towards the
2101 * min number of characters needed. This is done through the
2102 * variable 'foldlen', which is returned by the macros that look
2103 * for these sequences as the number of bytes the sequence
2104 * occupies. Each time through the loop, we decrement 'foldlen' by
2105 * how many bytes the current char occupies. Only when it reaches
2106 * 0 do we increment 'minchars' or look for another multi-character
2107 * sequence. */
2108 if (folder == NULL) {
2109 minchars++;
2110 }
2111 else if (foldlen > 0) {
2112 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
645de4ce
KW
2113 }
2114 else {
bc031a7d
KW
2115 minchars++;
2116
2117 /* See if *uc is the beginning of a multi-character fold. If
2118 * so, we decrement the length remaining to look at, to account
2119 * for the current character this iteration. (We can use 'uc'
2120 * instead of the fold returned by TRIE_READ_CHAR because for
2121 * non-UTF, the latin1_safe macro is smart enough to account
2122 * for all the unfolded characters, and because for UTF, the
2123 * string will already have been folded earlier in the
2124 * compilation process */
2125 if (UTF) {
2126 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2127 foldlen -= UTF8SKIP(uc);
645de4ce
KW
2128 }
2129 }
bc031a7d
KW
2130 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2131 foldlen--;
2132 }
645de4ce 2133 }
bc031a7d
KW
2134
2135 /* The current character (and any potential folds) should be added
2136 * to the possible matching characters for this position in this
2137 * branch */
a3621e74 2138 if ( uvc < 256 ) {
fab2782b
YO
2139 if ( folder ) {
2140 U8 folded= folder[ (U8) uvc ];
2141 if ( !trie->charmap[ folded ] ) {
2142 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2143 TRIE_STORE_REVCHAR( folded );
2144 }
2145 }
a3621e74
YO
2146 if ( !trie->charmap[ uvc ] ) {
2147 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2148 TRIE_STORE_REVCHAR( uvc );
a3621e74 2149 }
02daf0ab 2150 if ( set_bit ) {
62012aee
KW
2151 /* store the codepoint in the bitmap, and its folded
2152 * equivalent. */
fab2782b 2153 TRIE_BITMAP_SET(trie, uvc);
0921ee73
T
2154
2155 /* store the folded codepoint */
fab2782b 2156 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
0921ee73
T
2157
2158 if ( !UTF ) {
2159 /* store first byte of utf8 representation of
acdf4139 2160 variant codepoints */
6f2d5cbc 2161 if (! UVCHR_IS_INVARIANT(uvc)) {
acdf4139 2162 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
2163 }
2164 }
02daf0ab
YO
2165 set_bit = 0; /* We've done our bit :-) */
2166 }
a3621e74 2167 } else {
bc031a7d
KW
2168
2169 /* XXX We could come up with the list of code points that fold
2170 * to this using PL_utf8_foldclosures, except not for
2171 * multi-char folds, as there may be multiple combinations
2172 * there that could work, which needs to wait until runtime to
2173 * resolve (The comment about LIGATURE FFI above is such an
2174 * example */
2175
a3621e74 2176 SV** svpp;
55eed653
NC
2177 if ( !widecharmap )
2178 widecharmap = newHV();
a3621e74 2179
55eed653 2180 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
2181
2182 if ( !svpp )
e4584336 2183 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
2184
2185 if ( !SvTRUE( *svpp ) ) {
2186 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 2187 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
2188 }
2189 }
bc031a7d
KW
2190 } /* end loop through characters in this branch of the trie */
2191
2192 /* We take the min and max for this branch and combine to find the min
2193 * and max for all branches processed so far */
3dab1dad 2194 if( cur == first ) {
bc031a7d
KW
2195 trie->minlen = minchars;
2196 trie->maxlen = maxchars;
2197 } else if (minchars < trie->minlen) {
2198 trie->minlen = minchars;
2199 } else if (maxchars > trie->maxlen) {
2200 trie->maxlen = maxchars;
fab2782b 2201 }
a3621e74
YO
2202 } /* end first pass */
2203 DEBUG_TRIE_COMPILE_r(
538e84ed
KW
2204 PerlIO_printf( Perl_debug_log,
2205 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
3dab1dad 2206 (int)depth * 2 + 2,"",
55eed653 2207 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
2208 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2209 (int)trie->minlen, (int)trie->maxlen )
a3621e74 2210 );
a3621e74
YO
2211
2212 /*
2213 We now know what we are dealing with in terms of unique chars and
2214 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
2215 representation using a flat table will take. If it's over a reasonable
2216 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
2217 conservative but potentially much slower representation using an array
2218 of lists.
2219
2220 At the end we convert both representations into the same compressed
2221 form that will be used in regexec.c for matching with. The latter
2222 is a form that cannot be used to construct with but has memory
2223 properties similar to the list form and access properties similar
2224 to the table form making it both suitable for fast searches and
2225 small enough that its feasable to store for the duration of a program.
2226
2227 See the comment in the code where the compressed table is produced
2228 inplace from the flat tabe representation for an explanation of how
2229 the compression works.
2230
2231 */
2232
2233
2e64971a
DM
2234 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2235 prev_states[1] = 0;
2236
538e84ed
KW
2237 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2238 > SvIV(re_trie_maxbuff) )
2239 {
a3621e74
YO
2240 /*
2241 Second Pass -- Array Of Lists Representation
2242
2243 Each state will be represented by a list of charid:state records
2244 (reg_trie_trans_le) the first such element holds the CUR and LEN
2245 points of the allocated array. (See defines above).
2246
2247 We build the initial structure using the lists, and then convert
2248 it into the compressed table form which allows faster lookups
2249 (but cant be modified once converted).
a3621e74
YO
2250 */
2251
a3621e74
YO
2252 STRLEN transcount = 1;
2253
538e84ed 2254 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1e2e3d02
YO
2255 "%*sCompiling trie using list compiler\n",
2256 (int)depth * 2 + 2, ""));
686b73d4 2257
c944940b
JH
2258 trie->states = (reg_trie_state *)
2259 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2260 sizeof(reg_trie_state) );
a3621e74
YO
2261 TRIE_LIST_NEW(1);
2262 next_alloc = 2;
2263
2264 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2265
df826430 2266 regnode *noper = NEXTOPER( cur );
c445ea15 2267 U8 *uc = (U8*)STRING( noper );
df826430 2268 const U8 *e = uc + STR_LEN( noper );
c445ea15
AL
2269 U32 state = 1; /* required init */
2270 U16 charid = 0; /* sanity init */
07be1b83 2271 U32 wordlen = 0; /* required init */
c445ea15 2272
df826430
YO
2273 if (OP(noper) == NOTHING) {
2274 regnode *noper_next= regnext(noper);
2275 if (noper_next != tail && OP(noper_next) == flags) {
2276 noper = noper_next;
2277 uc= (U8*)STRING(noper);
2278 e= uc + STR_LEN(noper);
2279 }
2280 }
2281
3dab1dad 2282 if (OP(noper) != NOTHING) {
786e8c11 2283 for ( ; uc < e ; uc += len ) {
c445ea15 2284
786e8c11 2285 TRIE_READ_CHAR;
c445ea15 2286
786e8c11
YO
2287 if ( uvc < 256 ) {
2288 charid = trie->charmap[ uvc ];
c445ea15 2289 } else {
538e84ed
KW
2290 SV** const svpp = hv_fetch( widecharmap,
2291 (char*)&uvc,
2292 sizeof( UV ),
2293 0);
786e8c11
YO
2294 if ( !svpp ) {
2295 charid = 0;
2296 } else {
2297 charid=(U16)SvIV( *svpp );
2298 }
c445ea15 2299 }
538e84ed
KW
2300 /* charid is now 0 if we dont know the char read, or
2301 * nonzero if we do */
786e8c11 2302 if ( charid ) {
a3621e74 2303
786e8c11
YO
2304 U16 check;
2305 U32 newstate = 0;
a3621e74 2306
786e8c11
YO
2307 charid--;
2308 if ( !trie->states[ state ].trans.list ) {
2309 TRIE_LIST_NEW( state );
c445ea15 2310 }
538e84ed
KW
2311 for ( check = 1;
2312 check <= TRIE_LIST_USED( state );
2313 check++ )
2314 {
2315 if ( TRIE_LIST_ITEM( state, check ).forid
2316 == charid )
2317 {
786e8c11
YO
2318 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2319 break;
2320 }
2321 }
2322 if ( ! newstate ) {
2323 newstate = next_alloc++;
2e64971a 2324 prev_states[newstate] = state;
786e8c11
YO
2325 TRIE_LIST_PUSH( state, charid, newstate );
2326 transcount++;
2327 }
2328 state = newstate;
2329 } else {
2330 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 2331 }
a28509cc 2332 }
c445ea15 2333 }
3dab1dad 2334 TRIE_HANDLE_WORD(state);
a3621e74
YO
2335
2336 } /* end second pass */
2337
1e2e3d02 2338 /* next alloc is the NEXT state to be allocated */
538e84ed 2339 trie->statecount = next_alloc;
c944940b
JH
2340 trie->states = (reg_trie_state *)
2341 PerlMemShared_realloc( trie->states,
2342 next_alloc
2343 * sizeof(reg_trie_state) );
a3621e74 2344
3dab1dad 2345 /* and now dump it out before we compress it */
2b8b4781
NC
2346 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2347 revcharmap, next_alloc,
2348 depth+1)
1e2e3d02 2349 );
a3621e74 2350
c944940b
JH
2351 trie->trans = (reg_trie_trans *)
2352 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
2353 {
2354 U32 state;
a3621e74
YO
2355 U32 tp = 0;
2356 U32 zp = 0;
2357
2358
2359 for( state=1 ; state < next_alloc ; state ++ ) {
2360 U32 base=0;
2361
2362 /*
2363 DEBUG_TRIE_COMPILE_MORE_r(
2364 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2365 );
2366 */
2367
2368 if (trie->states[state].trans.list) {
2369 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2370 U16 maxid=minid;
a28509cc 2371 U16 idx;
a3621e74
YO
2372
2373 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
2374 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2375 if ( forid < minid ) {
2376 minid=forid;
2377 } else if ( forid > maxid ) {
2378 maxid=forid;
2379 }
a3621e74
YO
2380 }
2381 if ( transcount < tp + maxid - minid + 1) {
2382 transcount *= 2;
c944940b
JH
2383 trie->trans = (reg_trie_trans *)
2384 PerlMemShared_realloc( trie->trans,
446bd890
NC
2385 transcount
2386 * sizeof(reg_trie_trans) );
538e84ed
KW
2387 Zero( trie->trans + (transcount / 2),
2388 transcount / 2,
2389 reg_trie_trans );
a3621e74
YO
2390 }
2391 base = trie->uniquecharcount + tp - minid;
2392 if ( maxid == minid ) {
2393 U32 set = 0;
2394 for ( ; zp < tp ; zp++ ) {
2395 if ( ! trie->trans[ zp ].next ) {
2396 base = trie->uniquecharcount + zp - minid;
538e84ed
KW
2397 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2398 1).newstate;
a3621e74
YO
2399 trie->trans[ zp ].check = state;
2400 set = 1;
2401 break;
2402 }
2403 }
2404 if ( !set ) {
538e84ed
KW
2405 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2406 1).newstate;
a3621e74
YO
2407 trie->trans[ tp ].check = state;
2408 tp++;
2409 zp = tp;
2410 }
2411 } else {
2412 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
538e84ed
KW
2413 const U32 tid = base
2414 - trie->uniquecharcount
2415 + TRIE_LIST_ITEM( state, idx ).forid;
2416 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2417 idx ).newstate;
a3621e74
YO
2418 trie->trans[ tid ].check = state;
2419 }
2420 tp += ( maxid - minid + 1 );
2421 }
2422 Safefree(trie->states[ state ].trans.list);
2423 }
2424 /*
2425 DEBUG_TRIE_COMPILE_MORE_r(
2426 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2427 );
2428 */
2429 trie->states[ state ].trans.base=base;
2430 }
cc601c31 2431 trie->lasttrans = tp + 1;
a3621e74
YO
2432 }
2433 } else {
2434 /*
2435 Second Pass -- Flat Table Representation.
2436
b423522f
KW
2437 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2438 each. We know that we will need Charcount+1 trans at most to store
2439 the data (one row per char at worst case) So we preallocate both
2440 structures assuming worst case.
a3621e74
YO
2441
2442 We then construct the trie using only the .next slots of the entry
2443 structs.
2444
b423522f
KW
2445 We use the .check field of the first entry of the node temporarily
2446 to make compression both faster and easier by keeping track of how
2447 many non zero fields are in the node.
a3621e74
YO
2448
2449 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2450 transition.
2451
b423522f
KW
2452 There are two terms at use here: state as a TRIE_NODEIDX() which is
2453 a number representing the first entry of the node, and state as a
2454 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2455 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2456 if there are 2 entrys per node. eg:
a3621e74
YO
2457
2458 A B A B
2459 1. 2 4 1. 3 7
2460 2. 0 3 3. 0 5
2461 3. 0 0 5. 0 0
2462 4. 0 0 7. 0 0
2463
b423522f
KW
2464 The table is internally in the right hand, idx form. However as we
2465 also have to deal with the states array which is indexed by nodenum
2466 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
2467
2468 */
538e84ed 2469 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1e2e3d02
YO
2470 "%*sCompiling trie using table compiler\n",
2471 (int)depth * 2 + 2, ""));
3dab1dad 2472
c944940b
JH
2473 trie->trans = (reg_trie_trans *)
2474 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2475 * trie->uniquecharcount + 1,
2476 sizeof(reg_trie_trans) );
2477 trie->states = (reg_trie_state *)
2478 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2479 sizeof(reg_trie_state) );
a3621e74
YO
2480 next_alloc = trie->uniquecharcount + 1;
2481
3dab1dad 2482
a3621e74
YO
2483 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2484
df826430 2485 regnode *noper = NEXTOPER( cur );
a28509cc 2486 const U8 *uc = (U8*)STRING( noper );
df826430 2487 const U8 *e = uc + STR_LEN( noper );
a3621e74
YO
2488
2489 U32 state = 1; /* required init */
2490
2491 U16 charid = 0; /* sanity init */
2492 U32 accept_state = 0; /* sanity init */
a3621e74 2493
07be1b83 2494 U32 wordlen = 0; /* required init */
a3621e74 2495
df826430
YO
2496 if (OP(noper) == NOTHING) {
2497 regnode *noper_next= regnext(noper);
2498 if (noper_next != tail && OP(noper_next) == flags) {
2499 noper = noper_next;
2500 uc= (U8*)STRING(noper);
2501 e= uc + STR_LEN(noper);
2502 }
2503 }
fab2782b 2504
3dab1dad 2505 if ( OP(noper) != NOTHING ) {
786e8c11 2506 for ( ; uc < e ; uc += len ) {
a3621e74 2507
786e8c11 2508 TRIE_READ_CHAR;
a3621e74 2509
786e8c11
YO
2510 if ( uvc < 256 ) {
2511 charid = trie->charmap[ uvc ];
2512 } else {
538e84ed
KW
2513 SV* const * const svpp = hv_fetch( widecharmap,
2514 (char*)&uvc,
2515 sizeof( UV ),
2516 0);
786e8c11 2517 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 2518 }
786e8c11
YO
2519 if ( charid ) {
2520 charid--;
2521 if ( !trie->trans[ state + charid ].next ) {
2522 trie->trans[ state + charid ].next = next_alloc;
2523 trie->trans[ state ].check++;
2e64971a
DM
2524 prev_states[TRIE_NODENUM(next_alloc)]
2525 = TRIE_NODENUM(state);
786e8c11
YO
2526 next_alloc += trie->uniquecharcount;
2527 }
2528 state = trie->trans[ state + charid ].next;
2529 } else {
2530 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2531 }
538e84ed
KW
2532 /* charid is now 0 if we dont know the char read, or
2533 * nonzero if we do */
a3621e74 2534 }
a3621e74 2535 }
3dab1dad
YO
2536 accept_state = TRIE_NODENUM( state );
2537 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
2538
2539 } /* end second pass */
2540
3dab1dad 2541 /* and now dump it out before we compress it */
2b8b4781
NC
2542 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2543 revcharmap,
2544 next_alloc, depth+1));
a3621e74 2545
a3621e74
YO
2546 {
2547 /*
2548 * Inplace compress the table.*
2549
2550 For sparse data sets the table constructed by the trie algorithm will
2551 be mostly 0/FAIL transitions or to put it another way mostly empty.
2552 (Note that leaf nodes will not contain any transitions.)
2553
2554 This algorithm compresses the tables by eliminating most such
2555 transitions, at the cost of a modest bit of extra work during lookup:
2556
2557 - Each states[] entry contains a .base field which indicates the
2558 index in the state[] array wheres its transition data is stored.
2559
3b753521 2560 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
2561
2562 - If .base is nonzero then charid is added to it to find an entry in
2563 the trans array.
2564
2565 -If trans[states[state].base+charid].check!=state then the
2566 transition is taken to be a 0/Fail transition. Thus if there are fail
2567 transitions at the front of the node then the .base offset will point
2568 somewhere inside the previous nodes data (or maybe even into a node
2569 even earlier), but the .check field determines if the transition is
2570 valid.
2571
786e8c11 2572 XXX - wrong maybe?
a3621e74 2573 The following process inplace converts the table to the compressed
3b753521 2574 table: We first do not compress the root node 1,and mark all its
a3621e74 2575 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
2576 allows us to do a DFA construction from the compressed table later,
2577 and ensures that any .base pointers we calculate later are greater
2578 than 0.
a3621e74
YO
2579
2580 - We set 'pos' to indicate the first entry of the second node.
2581
2582 - We then iterate over the columns of the node, finding the first and
2583 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2584 and set the .check pointers accordingly, and advance pos
2585 appropriately and repreat for the next node. Note that when we copy
2586 the next pointers we have to convert them from the original
2587 NODEIDX form to NODENUM form as the former is not valid post
2588 compression.
2589
2590 - If a node has no transitions used we mark its base as 0 and do not
2591 advance the pos pointer.
2592
2593 - If a node only has one transition we use a second pointer into the
2594 structure to fill in allocated fail transitions from other states.
2595 This pointer is independent of the main pointer and scans forward
2596 looking for null transitions that are allocated to a state. When it
2597 finds one it writes the single transition into the "hole". If the
786e8c11 2598 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
2599
2600 - Once compressed we can Renew/realloc the structures to release the
2601 excess space.
2602
2603 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2604 specifically Fig 3.47 and the associated pseudocode.
2605
2606 demq
2607 */
a3b680e6 2608 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 2609 U32 state, charid;
a3621e74 2610 U32 pos = 0, zp=0;
1e2e3d02 2611 trie->statecount = laststate;
a3621e74
YO
2612
2613 for ( state = 1 ; state < laststate ; state++ ) {
2614 U8 flag = 0;
a28509cc
AL
2615 const U32 stateidx = TRIE_NODEIDX( state );
2616 const U32 o_used = trie->trans[ stateidx ].check;
2617 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
2618 trie->trans[ stateidx ].check = 0;
2619
538e84ed
KW
2620 for ( charid = 0;
2621 used && charid < trie->uniquecharcount;
2622 charid++ )
2623 {
a3621e74
YO
2624 if ( flag || trie->trans[ stateidx + charid ].next ) {
2625 if ( trie->trans[ stateidx + charid ].next ) {
2626 if (o_used == 1) {
2627 for ( ; zp < pos ; zp++ ) {
2628 if ( ! trie->trans[ zp ].next ) {
2629 break;
2630 }
2631 }
538e84ed
KW
2632 trie->states[ state ].trans.base
2633 = zp
2634 + trie->uniquecharcount
2635 - charid ;
2636 trie->trans[ zp ].next
2637 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2638 + charid ].next );
a3621e74
YO
2639 trie->trans[ zp ].check = state;
2640 if ( ++zp > pos ) pos = zp;
2641 break;
2642 }
2643 used--;
2644 }
2645 if ( !flag ) {
2646 flag = 1;
538e84ed
KW
2647 trie->states[ state ].trans.base
2648 = pos + trie->uniquecharcount - charid ;
a3621e74 2649 }
538e84ed
KW
2650 trie->trans[ pos ].next
2651 = SAFE_TRIE_NODENUM(
2652 trie->trans[ stateidx + charid ].next );
a3621e74
YO
2653 trie->trans[ pos ].check = state;
2654 pos++;
2655 }
2656 }
2657 }
cc601c31 2658 trie->lasttrans = pos + 1;
c944940b
JH
2659 trie->states = (reg_trie_state *)
2660 PerlMemShared_realloc( trie->states, laststate
2661 * sizeof(reg_trie_state) );
a3621e74 2662 DEBUG_TRIE_COMPILE_MORE_r(
538e84ed
KW
2663 PerlIO_printf( Perl_debug_log,
2664 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2665 (int)depth * 2 + 2,"",
2666 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2667 + 1 ),
2668 (IV)next_alloc,
2669 (IV)pos,
2670 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
a3621e74
YO
2671 );
2672
2673 } /* end table compress */
2674 }
1e2e3d02 2675 DEBUG_TRIE_COMPILE_MORE_r(
538e84ed
KW
2676 PerlIO_printf(Perl_debug_log,
2677 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1e2e3d02
YO
2678 (int)depth * 2 + 2, "",
2679 (UV)trie->statecount,
2680 (UV)trie->lasttrans)
2681 );
cc601c31 2682 /* resize the trans array to remove unused space */
c944940b
JH
2683 trie->trans = (reg_trie_trans *)
2684 PerlMemShared_realloc( trie->trans, trie->lasttrans
2685 * sizeof(reg_trie_trans) );
a3621e74 2686
538e84ed 2687 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
2688 U8 nodetype =(U8)(flags & 0xFF);
2689 char *str=NULL;
538e84ed 2690
07be1b83 2691#ifdef DEBUGGING
e62cc96a 2692 regnode *optimize = NULL;
7122b237
YO
2693#ifdef RE_TRACK_PATTERN_OFFSETS
2694
b57a0404
JH
2695 U32 mjd_offset = 0;
2696 U32 mjd_nodelen = 0;
7122b237
YO
2697#endif /* RE_TRACK_PATTERN_OFFSETS */
2698#endif /* DEBUGGING */
a3621e74 2699 /*
3dab1dad
YO
2700 This means we convert either the first branch or the first Exact,
2701 depending on whether the thing following (in 'last') is a branch
2702 or not and whther first is the startbranch (ie is it a sub part of
2703 the alternation or is it the whole thing.)
3b753521 2704 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 2705 the whole branch sequence, including the first.
a3621e74 2706 */
3dab1dad 2707 /* Find the node we are going to overwrite */
7f69552c 2708 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 2709 /* branch sub-chain */
3dab1dad 2710 NEXT_OFF( first ) = (U16)(last - first);
7122b237 2711#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
2712 DEBUG_r({
2713 mjd_offset= Node_Offset((convert));
2714 mjd_nodelen= Node_Length((convert));
2715 });
7122b237 2716#endif
7f69552c 2717 /* whole branch chain */
7122b237
YO
2718 }
2719#ifdef RE_TRACK_PATTERN_OFFSETS
2720 else {
7f69552c
YO
2721 DEBUG_r({
2722 const regnode *nop = NEXTOPER( convert );
2723 mjd_offset= Node_Offset((nop));
2724 mjd_nodelen= Node_Length((nop));
2725 });
07be1b83
YO
2726 }
2727 DEBUG_OPTIMISE_r(
538e84ed
KW
2728 PerlIO_printf(Perl_debug_log,
2729 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
07be1b83 2730 (int)depth * 2 + 2, "",
786e8c11 2731 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2732 );
7122b237 2733#endif
538e84ed 2734 /* But first we check to see if there is a common prefix we can
3dab1dad
YO
2735 split out as an EXACT and put in front of the TRIE node. */
2736 trie->startstate= 1;
55eed653 2737 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2738 U32 state;
1e2e3d02 2739 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2740 U32 ofs = 0;
8e11feef
RGS
2741 I32 idx = -1;
2742 U32 count = 0;
2743 const U32 base = trie->states[ state ].trans.base;
a3621e74 2744
3dab1dad 2745 if ( trie->states[state].wordnum )
8e11feef 2746 count = 1;
a3621e74 2747
8e11feef 2748 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2749 if ( ( base + ofs >= trie->uniquecharcount ) &&
2750 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2751 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2752 {
3dab1dad 2753 if ( ++count > 1 ) {
2b8b4781 2754 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2755 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2756 if ( state == 1 ) break;
3dab1dad
YO
2757 if ( count == 2 ) {
2758 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2759 DEBUG_OPTIMISE_r(
8e11feef
RGS
2760 PerlIO_printf(Perl_debug_log,
2761 "%*sNew Start State=%"UVuf" Class: [",
2762 (int)depth * 2 + 2, "",
786e8c11 2763 (UV)state));
be8e71aa 2764 if (idx >= 0) {
2b8b4781 2765 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2766 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2767
3dab1dad 2768 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2769 if ( folder )
2770 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2771 DEBUG_OPTIMISE_r(
f1f66076 2772 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2773 );
8e11feef
RGS
2774 }
2775 }
2776 TRIE_BITMAP_SET(trie,*ch);
2777 if ( folder )
2778 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2779 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2780 }
2781 idx = ofs;
2782 }
3dab1dad
YO
2783 }
2784 if ( count == 1 ) {
2b8b4781 2785 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2786 STRLEN len;
2787 char *ch = SvPV( *tmp, len );
de734bd5
A
2788 DEBUG_OPTIMISE_r({
2789 SV *sv=sv_newmortal();
8e11feef
RGS
2790 PerlIO_printf( Perl_debug_log,
2791 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2792 (int)depth * 2 + 2, "",
538e84ed
KW
2793 (UV)state, (UV)idx,
2794 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
de734bd5
A
2795 PL_colors[0], PL_colors[1],
2796 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed 2797 PERL_PV_ESCAPE_FIRSTCHAR
de734bd5
A
2798 )
2799 );
2800 });
3dab1dad
YO
2801 if ( state==1 ) {
2802 OP( convert ) = nodetype;
2803 str=STRING(convert);
2804 STR_LEN(convert)=0;
2805 }
c490c714
YO
2806 STR_LEN(convert) += len;
2807 while (len--)
de734bd5 2808 *str++ = *ch++;
8e11feef 2809 } else {
538e84ed 2810#ifdef DEBUGGING
8e11feef
RGS
2811 if (state>1)
2812 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2813#endif
8e11feef
RGS
2814 break;
2815 }
2816 }
2e64971a 2817 trie->prefixlen = (state-1);
3dab1dad 2818 if (str) {
8e11feef 2819 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2820 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2821 trie->startstate = state;
07be1b83
YO
2822 trie->minlen -= (state - 1);
2823 trie->maxlen -= (state - 1);
33809eae
JH
2824#ifdef DEBUGGING
2825 /* At least the UNICOS C compiler choked on this
2826 * being argument to DEBUG_r(), so let's just have
2827 * it right here. */
2828 if (
2829#ifdef PERL_EXT_RE_BUILD
2830 1
2831#else
2832 DEBUG_r_TEST
2833#endif
2834 ) {
2835 regnode *fix = convert;
2836 U32 word = trie->wordcount;
2837 mjd_nodelen++;
2838 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2839 while( ++fix < n ) {
2840 Set_Node_Offset_Length(fix, 0, 0);
2841 }
2842 while (word--) {
2843 SV ** const tmp = av_fetch( trie_words, word, 0 );
2844 if (tmp) {
2845 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2846 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2847 else
2848 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2849 }
2850 }
2851 }
2852#endif
8e11feef
RGS
2853 if (trie->maxlen) {
2854 convert = n;
2855 } else {
3dab1dad 2856 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2857 DEBUG_r(optimize= n);
3dab1dad
YO
2858 }
2859 }
2860 }
538e84ed
KW
2861 if (!jumper)
2862 jumper = last;
3dab1dad 2863 if ( trie->maxlen ) {
8e11feef
RGS
2864 NEXT_OFF( convert ) = (U16)(tail - convert);
2865 ARG_SET( convert, data_slot );
538e84ed
KW
2866 /* Store the offset to the first unabsorbed branch in
2867 jump[0], which is otherwise unused by the jump logic.
786e8c11 2868 We use this when dumping a trie and during optimisation. */
538e84ed 2869 if (trie->jump)
7f69552c 2870 trie->jump[0] = (U16)(nextbranch - convert);
538e84ed 2871
6c48061a
YO
2872 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2873 * and there is a bitmap
2874 * and the first "jump target" node we found leaves enough room
2875 * then convert the TRIE node into a TRIEC node, with the bitmap
2876 * embedded inline in the opcode - this is hypothetically faster.
2877 */
2878 if ( !trie->states[trie->startstate].wordnum
2879 && trie->bitmap
2880 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2881 {
2882 OP( convert ) = TRIEC;
2883 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2884 PerlMemShared_free(trie->bitmap);
786e8c11 2885 trie->bitmap= NULL;
538e84ed 2886 } else
786e8c11 2887 OP( convert ) = TRIE;
a3621e74 2888
3dab1dad
YO
2889 /* store the type in the flags */
2890 convert->flags = nodetype;
a5ca303d 2891 DEBUG_r({
538e84ed
KW
2892 optimize = convert
2893 + NODE_STEP_REGNODE
a5ca303d
YO
2894 + regarglen[ OP( convert ) ];
2895 });
538e84ed 2896 /* XXX We really should free up the resource in trie now,
a5ca303d 2897 as we won't use them - (which resources?) dmq */
3dab1dad 2898 }
a3621e74 2899 /* needed for dumping*/
e62cc96a 2900 DEBUG_r(if (optimize) {
07be1b83 2901 regnode *opt = convert;
bcdf7404 2902
e62cc96a 2903 while ( ++opt < optimize) {
07be1b83
YO
2904 Set_Node_Offset_Length(opt,0,0);
2905 }
538e84ed
KW
2906 /*
2907 Try to clean up some of the debris left after the
786e8c11 2908 optimisation.
a3621e74 2909 */
786e8c11 2910 while( optimize < jumper ) {
07be1b83 2911 mjd_nodelen += Node_Length((optimize));
a3621e74 2912 OP( optimize ) = OPTIMIZED;
07be1b83 2913 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2914 optimize++;
2915 }
07be1b83 2916 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2917 });
2918 } /* end node insert */
2e64971a
DM
2919
2920 /* Finish populating the prev field of the wordinfo array. Walk back
2921 * from each accept state until we find another accept state, and if
2922 * so, point the first word's .prev field at the second word. If the
2923 * second already has a .prev field set, stop now. This will be the
2924 * case either if we've already processed that word's accept state,
3b753521
FN
2925 * or that state had multiple words, and the overspill words were
2926 * already linked up earlier.
2e64971a
DM
2927 */
2928 {
2929 U16 word;
2930 U32 state;
2931 U16 prev;
2932
2933 for (word=1; word <= trie->wordcount; word++) {
2934 prev = 0;
2935 if (trie->wordinfo[word].prev)
2936 continue;
2937 state = trie->wordinfo[word].accept;
2938 while (state) {
2939 state = prev_states[state];
2940 if (!state)
2941 break;
2942 prev = trie->states[state].wordnum;
2943 if (prev)
2944 break;
2945 }
2946 trie->wordinfo[word].prev = prev;
2947 }
2948 Safefree(prev_states);
2949 }
2950
2951
2952 /* and now dump out the compressed format */
2953 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2954
55eed653 2955 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2956#ifdef DEBUGGING
2957 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2958 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2959#else
03e70be4 2960 SvREFCNT_dec_NN(revcharmap);
07be1b83 2961#endif
538e84ed
KW
2962 return trie->jump
2963 ? MADE_JUMP_TRIE
2964 : trie->startstate>1
2965 ? MADE_EXACT_TRIE
786e8c11
YO
2966 : MADE_TRIE;
2967}
2968
615a2e7f
YO
2969STATIC regnode *
2970S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
786e8c11 2971{
b423522f
KW
2972/* The Trie is constructed and compressed now so we can build a fail array if
2973 * it's needed
786e8c11 2974
b423522f
KW
2975 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2976 3.32 in the
2977 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2978 Ullman 1985/88
786e8c11
YO
2979 ISBN 0-201-10088-6
2980
b423522f
KW
2981 We find the fail state for each state in the trie, this state is the longest
2982 proper suffix of the current state's 'word' that is also a proper prefix of
2983 another word in our trie. State 1 represents the word '' and is thus the
2984 default fail state. This allows the DFA not to have to restart after its
2985 tried and failed a word at a given point, it simply continues as though it
2986 had been matching the other word in the first place.
786e8c11
YO
2987 Consider
2988 'abcdgu'=~/abcdefg|cdgu/
b423522f
KW
2989 When we get to 'd' we are still matching the first word, we would encounter
2990 'g' which would fail, which would bring us to the state representing 'd' in
2991 the second word where we would try 'g' and succeed, proceeding to match
2992 'cdgu'.
786e8c11
YO
2993 */
2994 /* add a fail transition */
3251b653
NC
2995 const U32 trie_offset = ARG(source);
2996 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2997 U32 *q;
2998 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2999 const U32 numstates = trie->statecount;
786e8c11
YO
3000 const U32 ubound = trie->lasttrans + ucharcount;
3001 U32 q_read = 0;
3002 U32 q_write = 0;
3003 U32 charid;
3004 U32 base = trie->states[ 1 ].trans.base;
3005 U32 *fail;
3006 reg_ac_data *aho;
cf78de0b 3007 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
615a2e7f 3008 regnode *stclass;
786e8c11 3009 GET_RE_DEBUG_FLAGS_DECL;
7918f24d 3010
615a2e7f 3011 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
81611534 3012 PERL_UNUSED_CONTEXT;
786e8c11
YO
3013#ifndef DEBUGGING
3014 PERL_UNUSED_ARG(depth);
3015#endif
3016
615a2e7f
YO
3017 if ( OP(source) == TRIE ) {
3018 struct regnode_1 *op = (struct regnode_1 *)
3019 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3020 StructCopy(source,op,struct regnode_1);
3021 stclass = (regnode *)op;
3022 } else {
3023 struct regnode_charclass *op = (struct regnode_charclass *)
3024 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3025 StructCopy(source,op,struct regnode_charclass);
3026 stclass = (regnode *)op;
3027 }
3028 OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
786e8c11
YO
3029
3030 ARG_SET( stclass, data_slot );
c944940b 3031 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 3032 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 3033 aho->trie=trie_offset;
446bd890
NC
3034 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3035 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 3036 Newxz( q, numstates, U32);
c944940b 3037 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
3038 aho->refcount = 1;
3039 fail = aho->fail;
3040 /* initialize fail[0..1] to be 1 so that we always have
3041 a valid final fail state */
3042 fail[ 0 ] = fail[ 1 ] = 1;
3043
3044 for ( charid = 0; charid < ucharcount ; charid++ ) {
3045 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3046 if ( newstate ) {
3047 q[ q_write ] = newstate;
3048 /* set to point at the root */
3049 fail[ q[ q_write++ ] ]=1;
3050 }
3051 }
3052 while ( q_read < q_write) {
3053 const U32 cur = q[ q_read++ % numstates ];
3054 base = trie->states[ cur ].trans.base;
3055
3056 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3057 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3058 if (ch_state) {
3059 U32 fail_state = cur;
3060 U32 fail_base;
3061 do {
3062 fail_state = fail[ fail_state ];
3063 fail_base = aho->states[ fail_state ].trans.base;
3064 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3065
3066 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3067 fail[ ch_state ] = fail_state;
3068 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3069 {
3070 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3071 }
3072 q[ q_write++ % numstates] = ch_state;
3073 }
3074 }
3075 }
3076 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3077 when we fail in state 1, this allows us to use the
3078 charclass scan to find a valid start char. This is based on the principle
3079 that theres a good chance the string being searched contains lots of stuff
3080 that cant be a start char.
3081 */
3082 fail[ 0 ] = fail[ 1 ] = 0;
3083 DEBUG_TRIE_COMPILE_r({
6d99fb9b 3084 PerlIO_printf(Perl_debug_log,
538e84ed 3085 "%*sStclass Failtable (%"UVuf" states): 0",
6d99fb9b 3086 (int)(depth * 2), "", (UV)numstates
1e2e3d02 3087 );
786e8c11
YO
3088 for( q_read=1; q_read<numstates; q_read++ ) {
3089 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3090 }
3091 PerlIO_printf(Perl_debug_log, "\n");
3092 });
3093 Safefree(q);
e384d5c1 3094 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
615a2e7f 3095 return stclass;
a3621e74
YO
3096}
3097
786e8c11 3098
07be1b83 3099#define DEBUG_PEEP(str,scan,depth) \
b515a41d 3100 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
3101 SV * const mysv=sv_newmortal(); \
3102 regnode *Next = regnext(scan); \
2395827c 3103 regprop(RExC_rx, mysv, scan, NULL); \
7f69552c 3104 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
3105 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3106 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 3107 }});
07be1b83 3108
1de06328 3109
bb914485 3110/* The below joins as many adjacent EXACTish nodes as possible into a single
0a982f06
KW
3111 * one. The regop may be changed if the node(s) contain certain sequences that
3112 * require special handling. The joining is only done if:
bb914485
KW
3113 * 1) there is room in the current conglomerated node to entirely contain the
3114 * next one.
3115 * 2) they are the exact same node type
3116 *
87b8b349 3117 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
bb914485
KW
3118 * these get optimized out
3119 *
0a982f06
KW
3120 * If a node is to match under /i (folded), the number of characters it matches
3121 * can be different than its character length if it contains a multi-character
31f05a37
KW
3122 * fold. *min_subtract is set to the total delta number of characters of the
3123 * input nodes.
bb914485 3124 *
fb29cc72 3125 * And *unfolded_multi_char is set to indicate whether or not the node contains
31f05a37
KW
3126 * an unfolded multi-char fold. This happens when whether the fold is valid or
3127 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3128 * SMALL LETTER SHARP S, as only if the target string being matched against
3129 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3130 * folding rules depend on the locale in force at runtime. (Multi-char folds
3131 * whose components are all above the Latin1 range are not run-time locale
3132 * dependent, and have already been folded by the time this function is
3133 * called.)
f758bddf 3134 *
bb914485 3135 * This is as good a place as any to discuss the design of handling these
0a982f06
KW
3136 * multi-character fold sequences. It's been wrong in Perl for a very long
3137 * time. There are three code points in Unicode whose multi-character folds
3138 * were long ago discovered to mess things up. The previous designs for
3139 * dealing with these involved assigning a special node for them. This
538e84ed 3140 * approach doesn't always work, as evidenced by this example:
a0c4c608 3141 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
538e84ed 3142 * Both sides fold to "sss", but if the pattern is parsed to create a node that
0a982f06 3143 * would match just the \xDF, it won't be able to handle the case where a
bb914485
KW
3144 * successful match would have to cross the node's boundary. The new approach
3145 * that hopefully generally solves the problem generates an EXACTFU_SS node
538e84ed 3146 * that is "sss" in this case.
bb914485 3147 *
0a982f06 3148 * It turns out that there are problems with all multi-character folds, and not
cb117658
KW
3149 * just these three. Now the code is general, for all such cases. The
3150 * approach taken is:
0a982f06 3151 * 1) This routine examines each EXACTFish node that could contain multi-
538e84ed
KW
3152 * character folded sequences. Since a single character can fold into
3153 * such a sequence, the minimum match length for this node is less than
3154 * the number of characters in the node. This routine returns in
31f05a37
KW
3155 * *min_subtract how many characters to subtract from the the actual
3156 * length of the string to get a real minimum match length; it is 0 if
3157 * there are no multi-char foldeds. This delta is used by the caller to
3158 * adjust the min length of the match, and the delta between min and max,
3159 * so that the optimizer doesn't reject these possibilities based on size
3160 * constraints.
cb117658 3161 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
0a982f06
KW
3162 * is used for an EXACTFU node that contains at least one "ss" sequence in
3163 * it. For non-UTF-8 patterns and strings, this is the only case where
3164 * there is a possible fold length change. That means that a regular
3165 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3166 * with length changes, and so can be processed faster. regexec.c takes
3167 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
31f05a37
KW
3168 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3169 * known until runtime). This saves effort in regex matching. However,
3170 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3171 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3172 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3173 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
0a982f06
KW
3174 * possibilities for the non-UTF8 patterns are quite simple, except for
3175 * the sharp s. All the ones that don't involve a UTF-8 target string are
3176 * members of a fold-pair, and arrays are set up for all of them so that
3177 * the other member of the pair can be found quickly. Code elsewhere in
3178 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3179 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3180 * described in the next item.
31f05a37
KW
3181 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3182 * validity of the fold won't be known until runtime, and so must remain
3183 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3184 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3185 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3186 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3187 * The reason this is a problem is that the optimizer part of regexec.c
3188 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3189 * that a character in the pattern corresponds to at most a single
3190 * character in the target string. (And I do mean character, and not byte
3191 * here, unlike other parts of the documentation that have never been
3192 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3193 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3194 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3195 * nodes, violate the assumption, and they are the only instances where it
3196 * is violated. I'm reluctant to try to change the assumption, as the
3197 * code involved is impenetrable to me (khw), so instead the code here
3198 * punts. This routine examines EXACTFL nodes, and (when the pattern
3199 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3200 * boolean indicating whether or not the node contains such a fold. When
3201 * it is true, the caller sets a flag that later causes the optimizer in
3202 * this file to not set values for the floating and fixed string lengths,
3203 * and thus avoids the optimizer code in regexec.c that makes the invalid
1ca267a5 3204 * assumption. Thus, there is no optimization based on string lengths for
31f05a37
KW
3205 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3206 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3207 * assumption is wrong only in these cases is that all other non-UTF-8
3208 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3209 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3210 * EXACTF nodes because we don't know at compile time if it actually
3211 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3212 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3213 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3214 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3215 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3216 * string would require the pattern to be forced into UTF-8, the overhead
3217 * of which we want to avoid. Similarly the unfolded multi-char folds in
3218 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3219 * locale.)
098b07d5
KW
3220 *
3221 * Similarly, the code that generates tries doesn't currently handle
3222 * not-already-folded multi-char folds, and it looks like a pain to change
3223 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3224 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3225 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3226 * using /iaa matching will be doing so almost entirely with ASCII
3227 * strings, so this should rarely be encountered in practice */
1de06328 3228
fb29cc72 3229#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
07be1b83 3230 if (PL_regkind[OP(scan)] == EXACT) \
fb29cc72 3231 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
07be1b83 3232
be8e71aa 3233STATIC U32
538e84ed 3234S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
fb29cc72 3235 UV *min_subtract, bool *unfolded_multi_char,
538e84ed
KW
3236 U32 flags,regnode *val, U32 depth)
3237{
07be1b83
YO
3238 /* Merge several consecutive EXACTish nodes into one. */
3239 regnode *n = regnext(scan);
3240 U32 stringok = 1;
3241 regnode *next = scan + NODE_SZ_STR(scan);
3242 U32 merged = 0;
3243 U32 stopnow = 0;
3244#ifdef DEBUGGING
3245 regnode *stop = scan;
72f13be8 3246 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 3247#else
d47053eb
RGS
3248 PERL_UNUSED_ARG(depth);
3249#endif
7918f24d
NC
3250
3251 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 3252#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
3253 PERL_UNUSED_ARG(flags);
3254 PERL_UNUSED_ARG(val);
07be1b83 3255#endif
07be1b83 3256 DEBUG_PEEP("join",scan,depth);
bb914485 3257
3f410cf6
KW
3258 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3259 * EXACT ones that are mergeable to the current one. */
3260 while (n
3261 && (PL_regkind[OP(n)] == NOTHING
3262 || (stringok && OP(n) == OP(scan)))
07be1b83 3263 && NEXT_OFF(n)
3f410cf6
KW
3264 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3265 {
538e84ed 3266
07be1b83
YO
3267 if (OP(n) == TAIL || n > next)
3268 stringok = 0;
3269 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
3270 DEBUG_PEEP("skip:",n,depth);
3271 NEXT_OFF(scan) += NEXT_OFF(n);
3272 next = n + NODE_STEP_REGNODE;
3273#ifdef DEBUGGING
3274 if (stringok)
3275 stop = n;
3276#endif
3277 n = regnext(n);
3278 }
3279 else if (stringok) {
786e8c11 3280 const unsigned int oldl = STR_LEN(scan);
07be1b83 3281 regnode * const nnext = regnext(n);
b2230d39 3282
baa60164
KW
3283 /* XXX I (khw) kind of doubt that this works on platforms (should
3284 * Perl ever run on one) where U8_MAX is above 255 because of lots
3285 * of other assumptions */
79a81a6e 3286 /* Don't join if the sum can't fit into a single node */
b2230d39
KW
3287 if (oldl + STR_LEN(n) > U8_MAX)
3288 break;
538e84ed 3289
07be1b83 3290 DEBUG_PEEP("merg",n,depth);
07be1b83 3291 merged++;
b2230d39 3292
07be1b83
YO
3293 NEXT_OFF(scan) += NEXT_OFF(n);
3294 STR_LEN(scan) += STR_LEN(n);
3295 next = n + NODE_SZ_STR(n);
3296 /* Now we can overwrite *n : */
3297 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3298#ifdef DEBUGGING
3299 stop = next - 1;
3300#endif
3301 n = nnext;
3302 if (stopnow) break;
3303 }
3304
d47053eb
RGS
3305#ifdef EXPERIMENTAL_INPLACESCAN
3306 if (flags && !NEXT_OFF(n)) {
3307 DEBUG_PEEP("atch", val, depth);
3308 if (reg_off_by_arg[OP(n)]) {
3309 ARG_SET(n, val - n);
3310 }
3311 else {
3312 NEXT_OFF(n) = val - n;
3313 }
3314 stopnow = 1;
3315 }
07be1b83
YO
3316#endif
3317 }
2c2b7f86 3318
9d071ca8 3319 *min_subtract = 0;
fb29cc72 3320 *unfolded_multi_char = FALSE;
f646642f 3321
3f410cf6
KW
3322 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3323 * can now analyze for sequences of problematic code points. (Prior to
3324 * this final joining, sequences could have been split over boundaries, and
a0c4c608
KW
3325 * hence missed). The sequences only happen in folding, hence for any
3326 * non-EXACT EXACTish node */
86d6fcad 3327 if (OP(scan) != EXACT) {
31f05a37
KW
3328 U8* s0 = (U8*) STRING(scan);
3329 U8* s = s0;
3330 U8* s_end = s0 + STR_LEN(scan);
3331
3332 int total_count_delta = 0; /* Total delta number of characters that
3333 multi-char folds expand to */
f758bddf
KW
3334
3335 /* One pass is made over the node's string looking for all the
baa60164 3336 * possibilities. To avoid some tests in the loop, there are two main
f758bddf
KW
3337 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3338 * non-UTF-8 */
3339 if (UTF) {
31f05a37
KW
3340 U8* folded = NULL;
3341
3342 if (OP(scan) == EXACTFL) {
3343 U8 *d;
3344
3345 /* An EXACTFL node would already have been changed to another
3346 * node type unless there is at least one character in it that
3347 * is problematic; likely a character whose fold definition
3348 * won't be known until runtime, and so has yet to be folded.
3349 * For all but the UTF-8 locale, folds are 1-1 in length, but
3350 * to handle the UTF-8 case, we need to create a temporary
3351 * folded copy using UTF-8 locale rules in order to analyze it.
3352 * This is because our macros that look to see if a sequence is
3353 * a multi-char fold assume everything is folded (otherwise the
3354 * tests in those macros would be too complicated and slow).
3355 * Note that here, the non-problematic folds will have already
3356 * been done, so we can just copy such characters. We actually
3357 * don't completely fold the EXACTFL string. We skip the
3358 * unfolded multi-char folds, as that would just create work
3359 * below to figure out the size they already are */
3360
3361 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3362 d = folded;
3363 while (s < s_end) {
3364 STRLEN s_len = UTF8SKIP(s);
3365 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3366 Copy(s, d, s_len, U8);
3367 d += s_len;
3368 }
3369 else if (is_FOLDS_TO_MULTI_utf8(s)) {
fb29cc72 3370 *unfolded_multi_char = TRUE;
31f05a37
KW
3371 Copy(s, d, s_len, U8);
3372 d += s_len;
3373 }
3374 else if (isASCII(*s)) {
3375 *(d++) = toFOLD(*s);
3376 }
3377 else {
3378 STRLEN len;
3379 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3380 d += len;
3381 }
3382 s += s_len;
3383 }
3384
3385 /* Point the remainder of the routine to look at our temporary
3386 * folded copy */
3387 s = folded;
3388 s_end = d;
3389 } /* End of creating folded copy of EXACTFL string */
86d6fcad 3390
0a982f06
KW
3391 /* Examine the string for a multi-character fold sequence. UTF-8
3392 * patterns have all characters pre-folded by the time this code is
3393 * executed */
3394 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3395 length sequence we are looking for is 2 */
86d6fcad 3396 {
538e84ed 3397 int count = 0; /* How many characters in a multi-char fold */
251b239f 3398 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
0a982f06
KW
3399 if (! len) { /* Not a multi-char fold: get next char */
3400 s += UTF8SKIP(s);
3401 continue;
3402 }
bb914485 3403
31f05a37
KW
3404 /* Nodes with 'ss' require special handling, except for
3405 * EXACTFA-ish for which there is no multi-char fold to this */
0a982f06 3406 if (len == 2 && *s == 's' && *(s+1) == 's'
098b07d5
KW
3407 && OP(scan) != EXACTFA
3408 && OP(scan) != EXACTFA_NO_TRIE)
0a982f06
KW
3409 {
3410 count = 2;
31f05a37
KW
3411 if (OP(scan) != EXACTFL) {
3412 OP(scan) = EXACTFU_SS;
3413 }
0a982f06
KW
3414 s += 2;
3415 }
0a982f06 3416 else { /* Here is a generic multi-char fold. */
31f05a37
KW
3417 U8* multi_end = s + len;
3418
2fe3656f
KW
3419 /* Count how many characters are in it. In the case of
3420 * /aa, no folds which contain ASCII code points are
3421 * allowed, so check for those, and skip if found. */
31f05a37 3422 if (OP(scan) != EXACTFA && OP(scan) != EXACTFA_NO_TRIE) {
0a982f06
KW
3423 count = utf8_length(s, multi_end);
3424 s = multi_end;
3425 }
3426 else {
3427 while (s < multi_end) {
3428 if (isASCII(*s)) {
3429 s++;
3430 goto next_iteration;
3431 }
3432 else {
3433 s += UTF8SKIP(s);
3434 }
3435 count++;
3436 }
3437 }
3438 }
f758bddf 3439
0a982f06
KW
3440 /* The delta is how long the sequence is minus 1 (1 is how long
3441 * the character that folds to the sequence is) */
31f05a37
KW
3442 total_count_delta += count - 1;
3443 next_iteration: ;
bb914485 3444 }
31f05a37
KW
3445
3446 /* We created a temporary folded copy of the string in EXACTFL
3447 * nodes. Therefore we need to be sure it doesn't go below zero,
3448 * as the real string could be shorter */
3449 if (OP(scan) == EXACTFL) {
3450 int total_chars = utf8_length((U8*) STRING(scan),
3451 (U8*) STRING(scan) + STR_LEN(scan));
3452 if (total_count_delta > total_chars) {
3453 total_count_delta = total_chars;
3454 }
3455 }
3456
3457 *min_subtract += total_count_delta;
3458 Safefree(folded);
bb914485 3459 }
1ca267a5
KW
3460 else if (OP(scan) == EXACTFA) {
3461
3462 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3463 * fold to the ASCII range (and there are no existing ones in the