This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Move macro definition earlier in file
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
afda64e8 84EXTERN_C const struct regexp_engine my_reg_engine;
54df2634
NC
85#else
86# include "regcomp.h"
87#endif
a687059c 88
04e98a4d 89#include "dquote_static.c"
26faadbd 90#include "charclass_invlists.h"
81e983c1 91#include "inline_invlist.c"
1b0f46bf 92#include "unicode_constants.h"
04e98a4d 93
538e84ed
KW
94#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
95 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
f12c0118
KW
96#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
97 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 98#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 99#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 100
a687059c
LW
101#ifndef STATIC
102#define STATIC static
103#endif
104
dea37815
KW
105#ifndef MIN
106#define MIN(a,b) ((a) < (b) ? (a) : (b))
107#endif
b1603ef8 108
09b2b2e6 109struct RExC_state_t {
514a91f1
DM
110 U32 flags; /* RXf_* are we folding, multilining? */
111 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 112 char *precomp; /* uncompiled string. */
288b8c02 113 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf 114 regexp *rx; /* perl core regexp structure */
538e84ed
KW
115 regexp_internal *rxi; /* internal data for regexp object
116 pprivate field */
fac92740 117 char *start; /* Start of input for compile */
830247a4
IZ
118 char *end; /* End of input for compile */
119 char *parse; /* Input-scan pointer. */
ea3daa5d 120 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 121 regnode *emit_start; /* Start of emitted-code area */
538e84ed
KW
122 regnode *emit_bound; /* First regnode outside of the
123 allocated space */
f7c7e32a
DM
124 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
125 implies compiling, so don't emit */
9a81a976
KW
126 regnode_ssc emit_dummy; /* placeholder for emit to point to;
127 large enough for the largest
128 non-EXACTish node, so can use it as
129 scratch in pass1 */
830247a4
IZ
130 I32 naughty; /* How bad is this pattern? */
131 I32 sawback; /* Did we see \1, ...? */
132 U32 seen;
ea3daa5d 133 SSize_t size; /* Code size. */
538e84ed
KW
134 I32 npar; /* Capture buffer count, (OPEN) plus
135 one. ("par" 0 is the whole
136 pattern)*/
137 I32 nestroot; /* root parens we are in - used by
138 accept */
830247a4
IZ
139 I32 extralen;
140 I32 seen_zerolen;
40d049e4
YO
141 regnode **open_parens; /* pointers to open parens */
142 regnode **close_parens; /* pointers to close parens */
143 regnode *opend; /* END node in program */
02daf0ab
YO
144 I32 utf8; /* whether the pattern is utf8 or not */
145 I32 orig_utf8; /* whether the pattern was originally in utf8 */
146 /* XXX use this for future optimisation of case
147 * where pattern must be upgraded to utf8. */
e40e74fe
KW
148 I32 uni_semantics; /* If a d charset modifier should use unicode
149 rules, even if the pattern is not in
150 utf8 */
81714fb9 151 HV *paren_names; /* Paren names */
538e84ed 152
40d049e4
YO
153 regnode **recurse; /* Recurse regops */
154 I32 recurse_count; /* Number of recurse regops */
538e84ed
KW
155 U8 *study_chunk_recursed; /* bitmap of which parens we have moved
156 through */
09a65838 157 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 158 I32 in_lookbehind;
4624b182 159 I32 contains_locale;
cfafade5 160 I32 contains_i;
bb3f3ed2 161 I32 override_recoding;
9d53c457 162 I32 in_multi_char_class;
3d2bd50a 163 struct reg_code_block *code_blocks; /* positions of literal (?{})
68e2671b 164 within pattern */
b1603ef8
DM
165 int num_code_blocks; /* size of code_blocks[] */
166 int code_index; /* next code_blocks[] slot */
ee273784 167 SSize_t maxlen; /* mininum possible number of chars in string to match */
dc6d7f5c 168#ifdef ADD_TO_REGEXEC
830247a4
IZ
169 char *starttry; /* -Dr: where regtry was called. */
170#define RExC_starttry (pRExC_state->starttry)
171#endif
d24ca0c5 172 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 173#ifdef DEBUGGING
be8e71aa 174 const char *lastparse;
3dab1dad 175 I32 lastnum;
1f1031fe 176 AV *paren_name_list; /* idx -> name */
d9a72fcc 177 U32 study_chunk_recursed_count;
3dab1dad
YO
178#define RExC_lastparse (pRExC_state->lastparse)
179#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 180#define RExC_paren_name_list (pRExC_state->paren_name_list)
d9a72fcc 181#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
3dab1dad 182#endif
09b2b2e6 183};
830247a4 184
e2509266 185#define RExC_flags (pRExC_state->flags)
514a91f1 186#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 187#define RExC_precomp (pRExC_state->precomp)
288b8c02 188#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 189#define RExC_rx (pRExC_state->rx)
f8fc2ecf 190#define RExC_rxi (pRExC_state->rxi)
fac92740 191#define RExC_start (pRExC_state->start)
830247a4
IZ
192#define RExC_end (pRExC_state->end)
193#define RExC_parse (pRExC_state->parse)
194#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237 195#ifdef RE_TRACK_PATTERN_OFFSETS
538e84ed
KW
196#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
197 others */
7122b237 198#endif
830247a4 199#define RExC_emit (pRExC_state->emit)
f7c7e32a 200#define RExC_emit_dummy (pRExC_state->emit_dummy)
fac92740 201#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 202#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
203#define RExC_naughty (pRExC_state->naughty)
204#define RExC_sawback (pRExC_state->sawback)
205#define RExC_seen (pRExC_state->seen)
206#define RExC_size (pRExC_state->size)
ee273784 207#define RExC_maxlen (pRExC_state->maxlen)
830247a4 208#define RExC_npar (pRExC_state->npar)
e2e6a0f1 209#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
210#define RExC_extralen (pRExC_state->extralen)
211#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 212#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 213#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 214#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
215#define RExC_open_parens (pRExC_state->open_parens)
216#define RExC_close_parens (pRExC_state->close_parens)
217#define RExC_opend (pRExC_state->opend)
81714fb9 218#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
219#define RExC_recurse (pRExC_state->recurse)
220#define RExC_recurse_count (pRExC_state->recurse_count)
09a65838 221#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
538e84ed
KW
222#define RExC_study_chunk_recursed_bytes \
223 (pRExC_state->study_chunk_recursed_bytes)
b57e4118 224#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 225#define RExC_contains_locale (pRExC_state->contains_locale)
cfafade5 226#define RExC_contains_i (pRExC_state->contains_i)
9d53c457
KW
227#define RExC_override_recoding (pRExC_state->override_recoding)
228#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
830247a4 229
cde0cee5 230
a687059c
LW
231#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
232#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
412f55bb 233 ((*s) == '{' && regcurly(s)))
a687059c
LW
234
235/*
236 * Flags to be passed up and down.
237 */
a687059c 238#define WORST 0 /* Worst case. */
a3b492c3 239#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee 240
e64f369d 241/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
242 * character. (There needs to be a case: in the switch statement in regexec.c
243 * for any node marked SIMPLE.) Note that this is not the same thing as
244 * REGNODE_SIMPLE */
fda99bee 245#define SIMPLE 0x02
e64f369d 246#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
247#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
248#define TRYAGAIN 0x10 /* Weeded out a declaration. */
249#define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
a687059c 250
3dab1dad
YO
251#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
252
07be1b83
YO
253/* whether trie related optimizations are enabled */
254#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
255#define TRIE_STUDY_OPT
786e8c11 256#define FULL_TRIE_STUDY
07be1b83
YO
257#define TRIE_STCLASS
258#endif
1de06328
YO
259
260
40d049e4
YO
261
262#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
263#define PBITVAL(paren) (1 << ((paren) & 7))
264#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
265#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
266#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
267
bbd61b5f 268#define REQUIRE_UTF8 STMT_START { \
8d9c2815
NC
269 if (!UTF) { \
270 *flagp = RESTART_UTF8; \
271 return NULL; \
272 } \
bbd61b5f 273 } STMT_END
40d049e4 274
f19b1a63
KW
275/* This converts the named class defined in regcomp.h to its equivalent class
276 * number defined in handy.h. */
277#define namedclass_to_classnum(class) ((int) ((class) / 2))
278#define classnum_to_namedclass(classnum) ((classnum) * 2)
279
de92f5e6
KW
280#define _invlist_union_complement_2nd(a, b, output) \
281 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
282#define _invlist_intersection_complement_2nd(a, b, output) \
283 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
284
1de06328
YO
285/* About scan_data_t.
286
287 During optimisation we recurse through the regexp program performing
288 various inplace (keyhole style) optimisations. In addition study_chunk
289 and scan_commit populate this data structure with information about
538e84ed 290 what strings MUST appear in the pattern. We look for the longest
3b753521 291 string that must appear at a fixed location, and we look for the
1de06328
YO
292 longest string that may appear at a floating location. So for instance
293 in the pattern:
538e84ed 294
1de06328 295 /FOO[xX]A.*B[xX]BAR/
538e84ed 296
1de06328
YO
297 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
298 strings (because they follow a .* construct). study_chunk will identify
299 both FOO and BAR as being the longest fixed and floating strings respectively.
538e84ed 300
1de06328 301 The strings can be composites, for instance
538e84ed 302
1de06328 303 /(f)(o)(o)/
538e84ed 304
1de06328 305 will result in a composite fixed substring 'foo'.
538e84ed 306
1de06328 307 For each string some basic information is maintained:
538e84ed 308
1de06328
YO
309 - offset or min_offset
310 This is the position the string must appear at, or not before.
311 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
312 characters must match before the string we are searching for.
313 Likewise when combined with minlenp and the length of the string it
538e84ed 314 tells us how many characters must appear after the string we have
1de06328 315 found.
538e84ed 316
1de06328
YO
317 - max_offset
318 Only used for floating strings. This is the rightmost point that
ea3daa5d 319 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328 320 string can occur infinitely far to the right.
538e84ed 321
1de06328 322 - minlenp
2d608413
KW
323 A pointer to the minimum number of characters of the pattern that the
324 string was found inside. This is important as in the case of positive
538e84ed 325 lookahead or positive lookbehind we can have multiple patterns
1de06328 326 involved. Consider
538e84ed 327
1de06328 328 /(?=FOO).*F/
538e84ed 329
1de06328
YO
330 The minimum length of the pattern overall is 3, the minimum length
331 of the lookahead part is 3, but the minimum length of the part that
538e84ed 332 will actually match is 1. So 'FOO's minimum length is 3, but the
1de06328 333 minimum length for the F is 1. This is important as the minimum length
538e84ed 334 is used to determine offsets in front of and behind the string being
1de06328 335 looked for. Since strings can be composites this is the length of the
486ec47a 336 pattern at the time it was committed with a scan_commit. Note that
1de06328 337 the length is calculated by study_chunk, so that the minimum lengths
538e84ed 338 are not known until the full pattern has been compiled, thus the
1de06328 339 pointer to the value.
538e84ed 340
1de06328 341 - lookbehind
538e84ed 342
1de06328 343 In the case of lookbehind the string being searched for can be
538e84ed 344 offset past the start point of the final matching string.
1de06328
YO
345 If this value was just blithely removed from the min_offset it would
346 invalidate some of the calculations for how many chars must match
347 before or after (as they are derived from min_offset and minlen and
538e84ed 348 the length of the string being searched for).
1de06328
YO
349 When the final pattern is compiled and the data is moved from the
350 scan_data_t structure into the regexp structure the information
538e84ed
KW
351 about lookbehind is factored in, with the information that would
352 have been lost precalculated in the end_shift field for the
1de06328
YO
353 associated string.
354
355 The fields pos_min and pos_delta are used to store the minimum offset
538e84ed 356 and the delta to the maximum offset at the current point in the pattern.
1de06328
YO
357
358*/
2c2d71f5
JH
359
360typedef struct scan_data_t {
1de06328
YO
361 /*I32 len_min; unused */
362 /*I32 len_delta; unused */
49f55535 363 SSize_t pos_min;
ea3daa5d 364 SSize_t pos_delta;
2c2d71f5 365 SV *last_found;
ea3daa5d 366 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 367 SSize_t last_start_min;
ea3daa5d 368 SSize_t last_start_max;
1de06328
YO
369 SV **longest; /* Either &l_fixed, or &l_float. */
370 SV *longest_fixed; /* longest fixed string found in pattern */
49f55535 371 SSize_t offset_fixed; /* offset where it starts */
ea3daa5d 372 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
373 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
374 SV *longest_float; /* longest floating string found in pattern */
49f55535 375 SSize_t offset_float_min; /* earliest point in string it can appear */
ea3daa5d
FC
376 SSize_t offset_float_max; /* latest point in string it can appear */
377 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
49f55535 378 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
2c2d71f5
JH
379 I32 flags;
380 I32 whilem_c;
49f55535 381 SSize_t *last_closep;
b8f7bb16 382 regnode_ssc *start_class;
2c2d71f5
JH
383} scan_data_t;
384
a687059c 385/*
e50aee73 386 * Forward declarations for pregcomp()'s friends.
a687059c 387 */
a0d0e21e 388
27da23d5 389static const scan_data_t zero_scan_data =
1de06328 390 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
391
392#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
393#define SF_BEFORE_SEOL 0x0001
394#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
395#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
396#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
397
44e3dfd2
BF
398#define SF_FIX_SHIFT_EOL (+2)
399#define SF_FL_SHIFT_EOL (+4)
c277df42
IZ
400
401#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
402#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
403
404#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
405#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
406#define SF_IS_INF 0x0040
407#define SF_HAS_PAR 0x0080
408#define SF_IN_PAR 0x0100
409#define SF_HAS_EVAL 0x0200
410#define SCF_DO_SUBSTR 0x0400
653099ff
GS
411#define SCF_DO_STCLASS_AND 0x0800
412#define SCF_DO_STCLASS_OR 0x1000
413#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 414#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 415
786e8c11 416#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
538e84ed 417#define SCF_SEEN_ACCEPT 0x8000
688e0391 418#define SCF_TRIE_DOING_RESTUDY 0x10000
07be1b83 419
43fead97 420#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
421
422/* The enums for all these are ordered so things work out correctly */
a62b1201 423#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
538e84ed
KW
424#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
425 == REGEX_DEPENDS_CHARSET)
00b27cfc 426#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
538e84ed
KW
427#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
428 >= REGEX_UNICODE_CHARSET)
429#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
430 == REGEX_ASCII_RESTRICTED_CHARSET)
431#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
432 >= REGEX_ASCII_RESTRICTED_CHARSET)
433#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
434 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 435
43fead97 436#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 437
f2c2a6ab
KW
438/* For programs that want to be strictly Unicode compatible by dying if any
439 * attempt is made to match a non-Unicode code point against a Unicode
440 * property. */
441#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
442
93733859 443#define OOB_NAMEDCLASS -1
b8c5462f 444
8e661ac5
KW
445/* There is no code point that is out-of-bounds, so this is problematic. But
446 * its only current use is to initialize a variable that is always set before
447 * looked at. */
448#define OOB_UNICODE 0xDEADBEEF
449
a0ed51b3
LW
450#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
451#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
452
8615cb43 453
b45f050a
JF
454/* length of regex to show in messages that don't mark a position within */
455#define RegexLengthToShowInErrorMessages 127
456
457/*
458 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
459 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
460 * op/pragma/warn/regcomp.
461 */
7253e4e3
RK
462#define MARKER1 "<-- HERE" /* marker as it appears in the description */
463#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 464
538e84ed
KW
465#define REPORT_LOCATION " in regex; marked by " MARKER1 \
466 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
b45f050a 467
c1d900c3
BF
468#define REPORT_LOCATION_ARGS(offset) \
469 UTF8fARG(UTF, offset, RExC_precomp), \
470 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
471
b45f050a
JF
472/*
473 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
474 * arg. Show regex, up to a maximum length. If it's too long, chop and add
475 * "...".
476 */
58e23c8d 477#define _FAIL(code) STMT_START { \
bfed75c6 478 const char *ellipses = ""; \
ccb2c380
MP
479 IV len = RExC_end - RExC_precomp; \
480 \
481 if (!SIZE_ONLY) \
a5e7bc51 482 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
483 if (len > RegexLengthToShowInErrorMessages) { \
484 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
485 len = RegexLengthToShowInErrorMessages - 10; \
486 ellipses = "..."; \
487 } \
58e23c8d 488 code; \
ccb2c380 489} STMT_END
8615cb43 490
58e23c8d 491#define FAIL(msg) _FAIL( \
c1d900c3
BF
492 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
493 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
494
495#define FAIL2(msg,arg) _FAIL( \
c1d900c3
BF
496 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
497 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 498
b45f050a 499/*
b45f050a
JF
500 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
501 */
ccb2c380 502#define Simple_vFAIL(m) STMT_START { \
8fb0127d
YO
503 const IV offset = \
504 (RExC_parse > RExC_end ? RExC_end : RExC_parse) - RExC_precomp; \
ccb2c380 505 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
c1d900c3 506 m, REPORT_LOCATION_ARGS(offset)); \
ccb2c380 507} STMT_END
b45f050a
JF
508
509/*
510 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
511 */
ccb2c380
MP
512#define vFAIL(m) STMT_START { \
513 if (!SIZE_ONLY) \
a5e7bc51 514 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
515 Simple_vFAIL(m); \
516} STMT_END
b45f050a
JF
517
518/*
519 * Like Simple_vFAIL(), but accepts two arguments.
520 */
ccb2c380 521#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 522 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
523 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
524 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 525} STMT_END
b45f050a
JF
526
527/*
528 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
529 */
ccb2c380
MP
530#define vFAIL2(m,a1) STMT_START { \
531 if (!SIZE_ONLY) \
a5e7bc51 532 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
533 Simple_vFAIL2(m, a1); \
534} STMT_END
b45f050a
JF
535
536
537/*
538 * Like Simple_vFAIL(), but accepts three arguments.
539 */
ccb2c380 540#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 541 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
542 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
543 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 544} STMT_END
b45f050a
JF
545
546/*
547 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
548 */
ccb2c380
MP
549#define vFAIL3(m,a1,a2) STMT_START { \
550 if (!SIZE_ONLY) \
a5e7bc51 551 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
552 Simple_vFAIL3(m, a1, a2); \
553} STMT_END
b45f050a
JF
554
555/*
556 * Like Simple_vFAIL(), but accepts four arguments.
557 */
ccb2c380 558#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 559 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
560 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
561 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 562} STMT_END
b45f050a 563
95db3ffa
KW
564#define vFAIL4(m,a1,a2,a3) STMT_START { \
565 if (!SIZE_ONLY) \
566 SAVEFREESV(RExC_rx_sv); \
567 Simple_vFAIL4(m, a1, a2, a3); \
568} STMT_END
569
946095af
BF
570/* A specialized version of vFAIL2 that works with UTF8f */
571#define vFAIL2utf8f(m, a1) STMT_START { \
ef3f731d
BF
572 const IV offset = RExC_parse - RExC_precomp; \
573 if (!SIZE_ONLY) \
574 SAVEFREESV(RExC_rx_sv); \
575 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
576 REPORT_LOCATION_ARGS(offset)); \
946095af
BF
577} STMT_END
578
499333dc
KW
579/* These have asserts in them because of [perl #122671] Many warnings in
580 * regcomp.c can occur twice. If they get output in pass1 and later in that
581 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
582 * would get output again. So they should be output in pass2, and these
583 * asserts make sure new warnings follow that paradigm. */
946095af 584
5e0a247b
KW
585/* m is not necessarily a "literal string", in this macro */
586#define reg_warn_non_literal_string(loc, m) STMT_START { \
587 const IV offset = loc - RExC_precomp; \
499333dc 588 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
c1d900c3 589 m, REPORT_LOCATION_ARGS(offset)); \
5e0a247b
KW
590} STMT_END
591
668c081a 592#define ckWARNreg(loc,m) STMT_START { \
a28509cc 593 const IV offset = loc - RExC_precomp; \
499333dc 594 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 595 REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
596} STMT_END
597
0d6106aa
KW
598#define vWARN_dep(loc, m) STMT_START { \
599 const IV offset = loc - RExC_precomp; \
499333dc 600 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
c1d900c3 601 REPORT_LOCATION_ARGS(offset)); \
0d6106aa
KW
602} STMT_END
603
147508a2
KW
604#define ckWARNdep(loc,m) STMT_START { \
605 const IV offset = loc - RExC_precomp; \
499333dc 606 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
147508a2 607 m REPORT_LOCATION, \
c1d900c3 608 REPORT_LOCATION_ARGS(offset)); \
147508a2
KW
609} STMT_END
610
668c081a 611#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 612 const IV offset = loc - RExC_precomp; \
499333dc 613 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18 614 m REPORT_LOCATION, \
c1d900c3 615 REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
616} STMT_END
617
b23eb183 618#define ckWARN2reg_d(loc,m, a1) STMT_START { \
2335b3d3 619 const IV offset = loc - RExC_precomp; \
499333dc 620 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
2335b3d3 621 m REPORT_LOCATION, \
c1d900c3 622 a1, REPORT_LOCATION_ARGS(offset)); \
2335b3d3
KW
623} STMT_END
624
668c081a 625#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 626 const IV offset = loc - RExC_precomp; \
499333dc 627 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 628 a1, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
629} STMT_END
630
631#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 632 const IV offset = loc - RExC_precomp; \
499333dc 633 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 634 a1, a2, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
635} STMT_END
636
668c081a
NC
637#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
638 const IV offset = loc - RExC_precomp; \
499333dc 639 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 640 a1, a2, REPORT_LOCATION_ARGS(offset)); \
668c081a
NC
641} STMT_END
642
ccb2c380 643#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 644 const IV offset = loc - RExC_precomp; \
499333dc 645 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 646 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
647} STMT_END
648
668c081a
NC
649#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
650 const IV offset = loc - RExC_precomp; \
499333dc 651 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 652 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
668c081a
NC
653} STMT_END
654
ccb2c380 655#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 656 const IV offset = loc - RExC_precomp; \
499333dc 657 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 658 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
ccb2c380 659} STMT_END
9d1d55b5 660
8615cb43 661
cd439c50 662/* Allow for side effects in s */
ccb2c380
MP
663#define REGC(c,s) STMT_START { \
664 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
665} STMT_END
cd439c50 666
538e84ed 667/* Macros for recording node offsets. 20001227 mjd@plover.com
fac92740
MJD
668 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
669 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
670 * Element 0 holds the number n.
07be1b83 671 * Position is 1 indexed.
fac92740 672 */
7122b237
YO
673#ifndef RE_TRACK_PATTERN_OFFSETS
674#define Set_Node_Offset_To_R(node,byte)
675#define Set_Node_Offset(node,byte)
676#define Set_Cur_Node_Offset
677#define Set_Node_Length_To_R(node,len)
678#define Set_Node_Length(node,len)
6a86c6ad 679#define Set_Node_Cur_Length(node,start)
538e84ed
KW
680#define Node_Offset(n)
681#define Node_Length(n)
7122b237
YO
682#define Set_Node_Offset_Length(node,offset,len)
683#define ProgLen(ri) ri->u.proglen
684#define SetProgLen(ri,x) ri->u.proglen = x
685#else
686#define ProgLen(ri) ri->u.offsets[0]
687#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
688#define Set_Node_Offset_To_R(node,byte) STMT_START { \
689 if (! SIZE_ONLY) { \
690 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 691 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 692 if((node) < 0) { \
538e84ed
KW
693 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
694 (int)(node)); \
ccb2c380
MP
695 } else { \
696 RExC_offsets[2*(node)-1] = (byte); \
697 } \
698 } \
699} STMT_END
700
701#define Set_Node_Offset(node,byte) \
702 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
703#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
704
705#define Set_Node_Length_To_R(node,len) STMT_START { \
706 if (! SIZE_ONLY) { \
707 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 708 __LINE__, (int)(node), (int)(len))); \
ccb2c380 709 if((node) < 0) { \
538e84ed
KW
710 Perl_croak(aTHX_ "value of node is %d in Length macro", \
711 (int)(node)); \
ccb2c380
MP
712 } else { \
713 RExC_offsets[2*(node)] = (len); \
714 } \
715 } \
716} STMT_END
717
718#define Set_Node_Length(node,len) \
719 Set_Node_Length_To_R((node)-RExC_emit_start, len)
6a86c6ad
NC
720#define Set_Node_Cur_Length(node, start) \
721 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
722
723/* Get offsets and lengths */
724#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
725#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
726
07be1b83
YO
727#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
728 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
729 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
730} STMT_END
7122b237 731#endif
07be1b83
YO
732
733#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
734#define EXPERIMENTAL_INPLACESCAN
f427392e 735#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 736
9e9ecfdf 737#define DEBUG_RExC_seen() \
538e84ed
KW
738 DEBUG_OPTIMISE_MORE_r({ \
739 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
740 \
e384d5c1
YO
741 if (RExC_seen & REG_ZERO_LEN_SEEN) \
742 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \
538e84ed 743 \
e384d5c1
YO
744 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
745 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \
538e84ed 746 \
e384d5c1
YO
747 if (RExC_seen & REG_GPOS_SEEN) \
748 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
538e84ed 749 \
e384d5c1
YO
750 if (RExC_seen & REG_CANY_SEEN) \
751 PerlIO_printf(Perl_debug_log,"REG_CANY_SEEN "); \
538e84ed 752 \
e384d5c1
YO
753 if (RExC_seen & REG_RECURSE_SEEN) \
754 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
538e84ed 755 \
e384d5c1
YO
756 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
757 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \
538e84ed 758 \
e384d5c1
YO
759 if (RExC_seen & REG_VERBARG_SEEN) \
760 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \
538e84ed 761 \
e384d5c1
YO
762 if (RExC_seen & REG_CUTGROUP_SEEN) \
763 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \
538e84ed 764 \
e384d5c1
YO
765 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
766 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \
538e84ed 767 \
e384d5c1
YO
768 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
769 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \
538e84ed 770 \
e384d5c1
YO
771 if (RExC_seen & REG_GOSTART_SEEN) \
772 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \
538e84ed 773 \
ee273784
YO
774 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
775 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \
776 \
538e84ed 777 PerlIO_printf(Perl_debug_log,"\n"); \
9e9ecfdf
YO
778 });
779
304ee84b
YO
780#define DEBUG_STUDYDATA(str,data,depth) \
781DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 782 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
783 "%*s" str "Pos:%"IVdf"/%"IVdf \
784 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
785 (int)(depth)*2, "", \
786 (IV)((data)->pos_min), \
787 (IV)((data)->pos_delta), \
304ee84b 788 (UV)((data)->flags), \
1de06328 789 (IV)((data)->whilem_c), \
304ee84b
YO
790 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
791 is_inf ? "INF " : "" \
1de06328
YO
792 ); \
793 if ((data)->last_found) \
794 PerlIO_printf(Perl_debug_log, \
795 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
796 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
797 SvPVX_const((data)->last_found), \
798 (IV)((data)->last_end), \
799 (IV)((data)->last_start_min), \
800 (IV)((data)->last_start_max), \
801 ((data)->longest && \
802 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
803 SvPVX_const((data)->longest_fixed), \
804 (IV)((data)->offset_fixed), \
805 ((data)->longest && \
806 (data)->longest==&((data)->longest_float)) ? "*" : "", \
807 SvPVX_const((data)->longest_float), \
808 (IV)((data)->offset_float_min), \
809 (IV)((data)->offset_float_max) \
810 ); \
811 PerlIO_printf(Perl_debug_log,"\n"); \
812});
813
549b4e78
KW
814#ifdef DEBUGGING
815
8e35b056
KW
816/* is c a control character for which we have a mnemonic? */
817#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
818
549b4e78
KW
819STATIC const char *
820S_cntrl_to_mnemonic(const U8 c)
821{
822 /* Returns the mnemonic string that represents character 'c', if one
823 * exists; NULL otherwise. The only ones that exist for the purposes of
824 * this routine are a few control characters */
825
826 switch (c) {
827 case '\a': return "\\a";
828 case '\b': return "\\b";
829 case ESC_NATIVE: return "\\e";
830 case '\f': return "\\f";
831 case '\n': return "\\n";
832 case '\r': return "\\r";
833 case '\t': return "\\t";
834 }
835
836 return NULL;
837}
838
839#endif
840
653099ff 841/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 842 Update the longest found anchored substring and the longest found
653099ff
GS
843 floating substrings if needed. */
844
4327152a 845STATIC void
ea3daa5d
FC
846S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
847 SSize_t *minlenp, int is_inf)
c277df42 848{
e1ec3a88
AL
849 const STRLEN l = CHR_SVLEN(data->last_found);
850 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 851 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 852
7918f24d
NC
853 PERL_ARGS_ASSERT_SCAN_COMMIT;
854
c277df42 855 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 856 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
857 if (*data->longest == data->longest_fixed) {
858 data->offset_fixed = l ? data->last_start_min : data->pos_min;
859 if (data->flags & SF_BEFORE_EOL)
b81d288d 860 data->flags
c277df42
IZ
861 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
862 else
863 data->flags &= ~SF_FIX_BEFORE_EOL;
686b73d4 864 data->minlen_fixed=minlenp;
1de06328 865 data->lookbehind_fixed=0;
a0ed51b3 866 }
304ee84b 867 else { /* *data->longest == data->longest_float */
c277df42 868 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
869 data->offset_float_max = (l
870 ? data->last_start_max
ea3daa5d
FC
871 : (data->pos_delta == SSize_t_MAX
872 ? SSize_t_MAX
873 : data->pos_min + data->pos_delta));
874 if (is_inf
875 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
876 data->offset_float_max = SSize_t_MAX;
c277df42 877 if (data->flags & SF_BEFORE_EOL)
b81d288d 878 data->flags
c277df42
IZ
879 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
880 else
881 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
882 data->minlen_float=minlenp;
883 data->lookbehind_float=0;
c277df42
IZ
884 }
885 }
886 SvCUR_set(data->last_found, 0);
0eda9292 887 {
a28509cc 888 SV * const sv = data->last_found;
097eb12c
AL
889 if (SvUTF8(sv) && SvMAGICAL(sv)) {
890 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
891 if (mg)
892 mg->mg_len = 0;
893 }
0eda9292 894 }
c277df42
IZ
895 data->last_end = -1;
896 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 897 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
898}
899
cdd87c1d
KW
900/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
901 * list that describes which code points it matches */
902
653099ff 903STATIC void
3420edd7 904S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 905{
cdd87c1d
KW
906 /* Set the SSC 'ssc' to match an empty string or any code point */
907
557bd3fb 908 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 909
71068078 910 assert(is_ANYOF_SYNTHETIC(ssc));
3fffb88a 911
cdd87c1d
KW
912 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
913 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
93e92956 914 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
653099ff
GS
915}
916
653099ff 917STATIC int
dc3bf405 918S_ssc_is_anything(const regnode_ssc *ssc)
653099ff 919{
c144baaa
KW
920 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
921 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
922 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
923 * in any way, so there's no point in using it */
cdd87c1d
KW
924
925 UV start, end;
926 bool ret;
653099ff 927
557bd3fb 928 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 929
71068078 930 assert(is_ANYOF_SYNTHETIC(ssc));
cdd87c1d 931
93e92956 932 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
cdd87c1d
KW
933 return FALSE;
934 }
935
936 /* See if the list consists solely of the range 0 - Infinity */
937 invlist_iterinit(ssc->invlist);
938 ret = invlist_iternext(ssc->invlist, &start, &end)
939 && start == 0
940 && end == UV_MAX;
941
942 invlist_iterfinish(ssc->invlist);
943
944 if (ret) {
945 return TRUE;
946 }
947
948 /* If e.g., both \w and \W are set, matches everything */
e0e1be5f 949 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
950 int i;
951 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
952 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
953 return TRUE;
954 }
955 }
956 }
957
958 return FALSE;
653099ff
GS
959}
960
653099ff 961STATIC void
cdd87c1d 962S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 963{
cdd87c1d
KW
964 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
965 * string, any code point, or any posix class under locale */
966
557bd3fb 967 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 968
557bd3fb 969 Zero(ssc, 1, regnode_ssc);
71068078 970 set_ANYOF_SYNTHETIC(ssc);
93e92956 971 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
3420edd7 972 ssc_anything(ssc);
cdd87c1d
KW
973
974 /* If any portion of the regex is to operate under locale rules,
975 * initialization includes it. The reason this isn't done for all regexes
976 * is that the optimizer was written under the assumption that locale was
977 * all-or-nothing. Given the complexity and lack of documentation in the
978 * optimizer, and that there are inadequate test cases for locale, many
979 * parts of it may not work properly, it is safest to avoid locale unless
980 * necessary. */
981 if (RExC_contains_locale) {
982 ANYOF_POSIXL_SETALL(ssc);
cdd87c1d
KW
983 }
984 else {
985 ANYOF_POSIXL_ZERO(ssc);
986 }
653099ff
GS
987}
988
b423522f 989STATIC int
dc3bf405
BF
990S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
991 const regnode_ssc *ssc)
b423522f
KW
992{
993 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
994 * to the list of code points matched, and locale posix classes; hence does
995 * not check its flags) */
996
997 UV start, end;
998 bool ret;
999
1000 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1001
71068078 1002 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1003
1004 invlist_iterinit(ssc->invlist);
1005 ret = invlist_iternext(ssc->invlist, &start, &end)
1006 && start == 0
1007 && end == UV_MAX;
1008
1009 invlist_iterfinish(ssc->invlist);
1010
1011 if (! ret) {
1012 return FALSE;
1013 }
1014
e0e1be5f 1015 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
31f05a37 1016 return FALSE;
b423522f
KW
1017 }
1018
1019 return TRUE;
1020}
1021
1022STATIC SV*
1023S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
5c0f85ef 1024 const regnode_charclass* const node)
b423522f
KW
1025{
1026 /* Returns a mortal inversion list defining which code points are matched
1027 * by 'node', which is of type ANYOF. Handles complementing the result if
1028 * appropriate. If some code points aren't knowable at this time, the
31f05a37
KW
1029 * returned list must, and will, contain every code point that is a
1030 * possibility. */
b423522f
KW
1031
1032 SV* invlist = sv_2mortal(_new_invlist(0));
1ee208c4 1033 SV* only_utf8_locale_invlist = NULL;
b423522f
KW
1034 unsigned int i;
1035 const U32 n = ARG(node);
31f05a37 1036 bool new_node_has_latin1 = FALSE;
b423522f
KW
1037
1038 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1039
1040 /* Look at the data structure created by S_set_ANYOF_arg() */
93e92956 1041 if (n != ANYOF_ONLY_HAS_BITMAP) {
b423522f
KW
1042 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1043 AV * const av = MUTABLE_AV(SvRV(rv));
1044 SV **const ary = AvARRAY(av);
1045 assert(RExC_rxi->data->what[n] == 's');
1046
1047 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1048 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1049 }
1050 else if (ary[0] && ary[0] != &PL_sv_undef) {
1051
1052 /* Here, no compile-time swash, and there are things that won't be
1053 * known until runtime -- we have to assume it could be anything */
1054 return _add_range_to_invlist(invlist, 0, UV_MAX);
1055 }
1ee208c4 1056 else if (ary[3] && ary[3] != &PL_sv_undef) {
b423522f
KW
1057
1058 /* Here no compile-time swash, and no run-time only data. Use the
1059 * node's inversion list */
1ee208c4
KW
1060 invlist = sv_2mortal(invlist_clone(ary[3]));
1061 }
1062
1063 /* Get the code points valid only under UTF-8 locales */
1064 if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1065 && ary[2] && ary[2] != &PL_sv_undef)
1066 {
1067 only_utf8_locale_invlist = ary[2];
b423522f
KW
1068 }
1069 }
1070
dcb20b36
KW
1071 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1072 * code points, and an inversion list for the others, but if there are code
1073 * points that should match only conditionally on the target string being
1074 * UTF-8, those are placed in the inversion list, and not the bitmap.
1075 * Since there are circumstances under which they could match, they are
1076 * included in the SSC. But if the ANYOF node is to be inverted, we have
1077 * to exclude them here, so that when we invert below, the end result
1078 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1079 * have to do this here before we add the unconditionally matched code
1080 * points */
b423522f
KW
1081 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1082 _invlist_intersection_complement_2nd(invlist,
1083 PL_UpperLatin1,
1084 &invlist);
1085 }
1086
1087 /* Add in the points from the bit map */
dcb20b36 1088 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
b423522f
KW
1089 if (ANYOF_BITMAP_TEST(node, i)) {
1090 invlist = add_cp_to_invlist(invlist, i);
31f05a37 1091 new_node_has_latin1 = TRUE;
b423522f
KW
1092 }
1093 }
1094
1095 /* If this can match all upper Latin1 code points, have to add them
1096 * as well */
93e92956 1097 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_NON_UTF8_NON_ASCII) {
b423522f
KW
1098 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1099 }
1100
1101 /* Similarly for these */
93e92956 1102 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
e0a1ff7a 1103 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
b423522f
KW
1104 }
1105
1106 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1107 _invlist_invert(invlist);
1108 }
31f05a37
KW
1109 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1110
1111 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1112 * locale. We can skip this if there are no 0-255 at all. */
1113 _invlist_union(invlist, PL_Latin1, &invlist);
1114 }
1115
1ee208c4
KW
1116 /* Similarly add the UTF-8 locale possible matches. These have to be
1117 * deferred until after the non-UTF-8 locale ones are taken care of just
1118 * above, or it leads to wrong results under ANYOF_INVERT */
1119 if (only_utf8_locale_invlist) {
31f05a37 1120 _invlist_union_maybe_complement_2nd(invlist,
1ee208c4 1121 only_utf8_locale_invlist,
31f05a37
KW
1122 ANYOF_FLAGS(node) & ANYOF_INVERT,
1123 &invlist);
1124 }
b423522f
KW
1125
1126 return invlist;
1127}
1128
1051e1c4 1129/* These two functions currently do the exact same thing */
557bd3fb 1130#define ssc_init_zero ssc_init
653099ff 1131
cdd87c1d
KW
1132#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1133#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1134
557bd3fb 1135/* 'AND' a given class with another one. Can create false positives. 'ssc'
93e92956
KW
1136 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1137 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1138
653099ff 1139STATIC void
b423522f 1140S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1141 const regnode_charclass *and_with)
653099ff 1142{
cdd87c1d
KW
1143 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1144 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1145
a0dd4231
KW
1146 SV* anded_cp_list;
1147 U8 anded_flags;
1e6ade67 1148
cdd87c1d 1149 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1150
71068078 1151 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1152
1153 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1154 * the code point inversion list and just the relevant flags */
71068078 1155 if (is_ANYOF_SYNTHETIC(and_with)) {
7dcac5f6 1156 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
a0dd4231 1157 anded_flags = ANYOF_FLAGS(and_with);
e9b08962
KW
1158
1159 /* XXX This is a kludge around what appears to be deficiencies in the
1160 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1161 * there are paths through the optimizer where it doesn't get weeded
1162 * out when it should. And if we don't make some extra provision for
1163 * it like the code just below, it doesn't get added when it should.
1164 * This solution is to add it only when AND'ing, which is here, and
1165 * only when what is being AND'ed is the pristine, original node
1166 * matching anything. Thus it is like adding it to ssc_anything() but
1167 * only when the result is to be AND'ed. Probably the same solution
1168 * could be adopted for the same problem we have with /l matching,
1169 * which is solved differently in S_ssc_init(), and that would lead to
1170 * fewer false positives than that solution has. But if this solution
1171 * creates bugs, the consequences are only that a warning isn't raised
1172 * that should be; while the consequences for having /l bugs is
1173 * incorrect matches */
7dcac5f6 1174 if (ssc_is_anything((regnode_ssc *)and_with)) {
e9b08962
KW
1175 anded_flags |= ANYOF_WARN_SUPER;
1176 }
a0dd4231
KW
1177 }
1178 else {
5c0f85ef 1179 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
eff8b7dc 1180 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
a0dd4231
KW
1181 }
1182
1183 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1184
1185 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1186 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1187 * 'and_with' may be inverted. When not inverted, we have the situation of
1188 * computing:
1189 * (C1 | P1) & (C2 | P2)
1190 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1191 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1192 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1193 * <= ((C1 & C2) | P1 | P2)
1194 * Alternatively, the last few steps could be:
1195 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1196 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1197 * <= (C1 | C2 | (P1 & P2))
1198 * We favor the second approach if either P1 or P2 is non-empty. This is
1199 * because these components are a barrier to doing optimizations, as what
1200 * they match cannot be known until the moment of matching as they are
1201 * dependent on the current locale, 'AND"ing them likely will reduce or
1202 * eliminate them.
1203 * But we can do better if we know that C1,P1 are in their initial state (a
1204 * frequent occurrence), each matching everything:
1205 * (<everything>) & (C2 | P2) = C2 | P2
1206 * Similarly, if C2,P2 are in their initial state (again a frequent
1207 * occurrence), the result is a no-op
1208 * (C1 | P1) & (<everything>) = C1 | P1
1209 *
1210 * Inverted, we have
1211 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1212 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1213 * <= (C1 & ~C2) | (P1 & ~P2)
1214 * */
1aa99e6b 1215
a0dd4231 1216 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
71068078 1217 && ! is_ANYOF_SYNTHETIC(and_with))
a0dd4231 1218 {
cdd87c1d 1219 unsigned int i;
8951c461 1220
cdd87c1d
KW
1221 ssc_intersection(ssc,
1222 anded_cp_list,
1223 FALSE /* Has already been inverted */
1224 );
c6b76537 1225
cdd87c1d
KW
1226 /* If either P1 or P2 is empty, the intersection will be also; can skip
1227 * the loop */
93e92956 1228 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
cdd87c1d
KW
1229 ANYOF_POSIXL_ZERO(ssc);
1230 }
e0e1be5f 1231 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1232
1233 /* Note that the Posix class component P from 'and_with' actually
1234 * looks like:
1235 * P = Pa | Pb | ... | Pn
1236 * where each component is one posix class, such as in [\w\s].
1237 * Thus
1238 * ~P = ~(Pa | Pb | ... | Pn)
1239 * = ~Pa & ~Pb & ... & ~Pn
1240 * <= ~Pa | ~Pb | ... | ~Pn
1241 * The last is something we can easily calculate, but unfortunately
1242 * is likely to have many false positives. We could do better
1243 * in some (but certainly not all) instances if two classes in
1244 * P have known relationships. For example
1245 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1246 * So
1247 * :lower: & :print: = :lower:
1248 * And similarly for classes that must be disjoint. For example,
1249 * since \s and \w can have no elements in common based on rules in
1250 * the POSIX standard,
1251 * \w & ^\S = nothing
1252 * Unfortunately, some vendor locales do not meet the Posix
1253 * standard, in particular almost everything by Microsoft.
1254 * The loop below just changes e.g., \w into \W and vice versa */
1255
1ee208c4 1256 regnode_charclass_posixl temp;
cdd87c1d
KW
1257 int add = 1; /* To calculate the index of the complement */
1258
1259 ANYOF_POSIXL_ZERO(&temp);
1260 for (i = 0; i < ANYOF_MAX; i++) {
1261 assert(i % 2 != 0
7dcac5f6
KW
1262 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1263 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
cdd87c1d 1264
7dcac5f6 1265 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
cdd87c1d
KW
1266 ANYOF_POSIXL_SET(&temp, i + add);
1267 }
1268 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1269 }
1270 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1271
cdd87c1d
KW
1272 } /* else ssc already has no posixes */
1273 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1274 in its initial state */
71068078 1275 else if (! is_ANYOF_SYNTHETIC(and_with)
7dcac5f6 1276 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
cdd87c1d
KW
1277 {
1278 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1279 * copy it over 'ssc' */
1280 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
71068078 1281 if (is_ANYOF_SYNTHETIC(and_with)) {
cdd87c1d
KW
1282 StructCopy(and_with, ssc, regnode_ssc);
1283 }
1284 else {
1285 ssc->invlist = anded_cp_list;
1286 ANYOF_POSIXL_ZERO(ssc);
93e92956 1287 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1288 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
cdd87c1d
KW
1289 }
1290 }
1291 }
e0e1be5f 1292 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
93e92956 1293 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
cdd87c1d
KW
1294 {
1295 /* One or the other of P1, P2 is non-empty. */
93e92956 1296 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1ea8b7fe
KW
1297 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1298 }
cdd87c1d
KW
1299 ssc_union(ssc, anded_cp_list, FALSE);
1300 }
1301 else { /* P1 = P2 = empty */
1302 ssc_intersection(ssc, anded_cp_list, FALSE);
1303 }
137165a6 1304 }
653099ff
GS
1305}
1306
653099ff 1307STATIC void
cdd87c1d 1308S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1309 const regnode_charclass *or_with)
653099ff 1310{
cdd87c1d
KW
1311 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1312 * another SSC or a regular ANYOF class. Can create false positives if
1313 * 'or_with' is to be inverted. */
7918f24d 1314
a0dd4231
KW
1315 SV* ored_cp_list;
1316 U8 ored_flags;
c6b76537 1317
cdd87c1d 1318 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 1319
71068078 1320 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1321
1322 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1323 * the code point inversion list and just the relevant flags */
71068078 1324 if (is_ANYOF_SYNTHETIC(or_with)) {
7dcac5f6 1325 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
a0dd4231
KW
1326 ored_flags = ANYOF_FLAGS(or_with);
1327 }
1328 else {
5c0f85ef 1329 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
eff8b7dc 1330 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
a0dd4231
KW
1331 }
1332
1333 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
1334
1335 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1336 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1337 * 'or_with' may be inverted. When not inverted, we have the simple
1338 * situation of computing:
1339 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1340 * If P1|P2 yields a situation with both a class and its complement are
1341 * set, like having both \w and \W, this matches all code points, and we
1342 * can delete these from the P component of the ssc going forward. XXX We
1343 * might be able to delete all the P components, but I (khw) am not certain
1344 * about this, and it is better to be safe.
1345 *
1346 * Inverted, we have
1347 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1348 * <= (C1 | P1) | ~C2
1349 * <= (C1 | ~C2) | P1
1350 * (which results in actually simpler code than the non-inverted case)
1351 * */
9826f543 1352
a0dd4231 1353 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
71068078 1354 && ! is_ANYOF_SYNTHETIC(or_with))
a0dd4231 1355 {
cdd87c1d 1356 /* We ignore P2, leaving P1 going forward */
1ea8b7fe 1357 } /* else Not inverted */
93e92956 1358 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1359 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
e0e1be5f 1360 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1361 unsigned int i;
1362 for (i = 0; i < ANYOF_MAX; i += 2) {
1363 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1364 {
1365 ssc_match_all_cp(ssc);
1366 ANYOF_POSIXL_CLEAR(ssc, i);
1367 ANYOF_POSIXL_CLEAR(ssc, i+1);
cdd87c1d
KW
1368 }
1369 }
1370 }
1aa99e6b 1371 }
cdd87c1d
KW
1372
1373 ssc_union(ssc,
1374 ored_cp_list,
1375 FALSE /* Already has been inverted */
1376 );
653099ff
GS
1377}
1378
b423522f
KW
1379PERL_STATIC_INLINE void
1380S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1381{
1382 PERL_ARGS_ASSERT_SSC_UNION;
1383
71068078 1384 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1385
1386 _invlist_union_maybe_complement_2nd(ssc->invlist,
1387 invlist,
1388 invert2nd,
1389 &ssc->invlist);
1390}
1391
1392PERL_STATIC_INLINE void
1393S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1394 SV* const invlist,
1395 const bool invert2nd)
1396{
1397 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1398
71068078 1399 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1400
1401 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1402 invlist,
1403 invert2nd,
1404 &ssc->invlist);
1405}
1406
1407PERL_STATIC_INLINE void
1408S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1409{
1410 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1411
71068078 1412 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1413
1414 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1415}
1416
1417PERL_STATIC_INLINE void
1418S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1419{
1420 /* AND just the single code point 'cp' into the SSC 'ssc' */
1421
1422 SV* cp_list = _new_invlist(2);
1423
1424 PERL_ARGS_ASSERT_SSC_CP_AND;
1425
71068078 1426 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1427
1428 cp_list = add_cp_to_invlist(cp_list, cp);
1429 ssc_intersection(ssc, cp_list,
1430 FALSE /* Not inverted */
1431 );
1432 SvREFCNT_dec_NN(cp_list);
1433}
1434
1435PERL_STATIC_INLINE void
dc3bf405 1436S_ssc_clear_locale(regnode_ssc *ssc)
b423522f
KW
1437{
1438 /* Set the SSC 'ssc' to not match any locale things */
b423522f
KW
1439 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1440
71068078 1441 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1442
1443 ANYOF_POSIXL_ZERO(ssc);
1444 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1445}
1446
1447STATIC void
1448S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1449{
1450 /* The inversion list in the SSC is marked mortal; now we need a more
1451 * permanent copy, which is stored the same way that is done in a regular
dcb20b36
KW
1452 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1453 * map */
b423522f
KW
1454
1455 SV* invlist = invlist_clone(ssc->invlist);
1456
1457 PERL_ARGS_ASSERT_SSC_FINALIZE;
1458
71068078 1459 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f 1460
a0dd4231 1461 /* The code in this file assumes that all but these flags aren't relevant
93e92956
KW
1462 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1463 * by the time we reach here */
eff8b7dc 1464 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
a0dd4231 1465
b423522f
KW
1466 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1467
1ee208c4
KW
1468 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1469 NULL, NULL, NULL, FALSE);
b423522f 1470
85c8e306
KW
1471 /* Make sure is clone-safe */
1472 ssc->invlist = NULL;
1473
e0e1be5f 1474 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
93e92956 1475 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
e0e1be5f 1476 }
1462525b
KW
1477
1478 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
b423522f
KW
1479}
1480
a3621e74
YO
1481#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1482#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1483#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
538e84ed
KW
1484#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1485 ? (TRIE_LIST_CUR( idx ) - 1) \
1486 : 0 )
a3621e74 1487
3dab1dad
YO
1488
1489#ifdef DEBUGGING
07be1b83 1490/*
2b8b4781
NC
1491 dump_trie(trie,widecharmap,revcharmap)
1492 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1493 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
1494
1495 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
1496 The _interim_ variants are used for debugging the interim
1497 tables that are used to generate the final compressed
1498 representation which is what dump_trie expects.
1499
486ec47a 1500 Part of the reason for their existence is to provide a form
3dab1dad 1501 of documentation as to how the different representations function.
07be1b83
YO
1502
1503*/
3dab1dad
YO
1504
1505/*
3dab1dad
YO
1506 Dumps the final compressed table form of the trie to Perl_debug_log.
1507 Used for debugging make_trie().
1508*/
b9a59e08 1509
3dab1dad 1510STATIC void
2b8b4781
NC
1511S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1512 AV *revcharmap, U32 depth)
3dab1dad
YO
1513{
1514 U32 state;
ab3bbdeb 1515 SV *sv=sv_newmortal();
55eed653 1516 int colwidth= widecharmap ? 6 : 4;
2e64971a 1517 U16 word;
3dab1dad
YO
1518 GET_RE_DEBUG_FLAGS_DECL;
1519
7918f24d 1520 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 1521
3dab1dad
YO
1522 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1523 (int)depth * 2 + 2,"",
1524 "Match","Base","Ofs" );
1525
1526 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 1527 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 1528 if ( tmp ) {
538e84ed 1529 PerlIO_printf( Perl_debug_log, "%*s",
ab3bbdeb 1530 colwidth,
538e84ed 1531 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1532 PL_colors[0], PL_colors[1],
1533 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
1534 PERL_PV_ESCAPE_FIRSTCHAR
1535 )
ab3bbdeb 1536 );
3dab1dad
YO
1537 }
1538 }
1539 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1540 (int)depth * 2 + 2,"");
1541
1542 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 1543 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
1544 PerlIO_printf( Perl_debug_log, "\n");
1545
1e2e3d02 1546 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 1547 const U32 base = trie->states[ state ].trans.base;
3dab1dad 1548
538e84ed
KW
1549 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1550 (int)depth * 2 + 2,"", (UV)state);
3dab1dad
YO
1551
1552 if ( trie->states[ state ].wordnum ) {
538e84ed
KW
1553 PerlIO_printf( Perl_debug_log, " W%4X",
1554 trie->states[ state ].wordnum );
3dab1dad
YO
1555 } else {
1556 PerlIO_printf( Perl_debug_log, "%6s", "" );
1557 }
1558
1559 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1560
1561 if ( base ) {
1562 U32 ofs = 0;
1563
1564 while( ( base + ofs < trie->uniquecharcount ) ||
1565 ( base + ofs - trie->uniquecharcount < trie->lasttrans
538e84ed
KW
1566 && trie->trans[ base + ofs - trie->uniquecharcount ].check
1567 != state))
3dab1dad
YO
1568 ofs++;
1569
1570 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1571
1572 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
538e84ed
KW
1573 if ( ( base + ofs >= trie->uniquecharcount )
1574 && ( base + ofs - trie->uniquecharcount
1575 < trie->lasttrans )
1576 && trie->trans[ base + ofs
1577 - trie->uniquecharcount ].check == state )
3dab1dad 1578 {
ab3bbdeb
YO
1579 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1580 colwidth,
538e84ed
KW
1581 (UV)trie->trans[ base + ofs
1582 - trie->uniquecharcount ].next );
3dab1dad 1583 } else {
ab3bbdeb 1584 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
1585 }
1586 }
1587
1588 PerlIO_printf( Perl_debug_log, "]");
1589
1590 }
1591 PerlIO_printf( Perl_debug_log, "\n" );
1592 }
538e84ed
KW
1593 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1594 (int)depth*2, "");
2e64971a
DM
1595 for (word=1; word <= trie->wordcount; word++) {
1596 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1597 (int)word, (int)(trie->wordinfo[word].prev),
1598 (int)(trie->wordinfo[word].len));
1599 }
1600 PerlIO_printf(Perl_debug_log, "\n" );
538e84ed 1601}
3dab1dad 1602/*
3dab1dad 1603 Dumps a fully constructed but uncompressed trie in list form.
538e84ed 1604 List tries normally only are used for construction when the number of
3dab1dad
YO
1605 possible chars (trie->uniquecharcount) is very high.
1606 Used for debugging make_trie().
1607*/
1608STATIC void
55eed653 1609S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1610 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1611 U32 depth)
3dab1dad
YO
1612{
1613 U32 state;
ab3bbdeb 1614 SV *sv=sv_newmortal();
55eed653 1615 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1616 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1617
1618 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1619
3dab1dad 1620 /* print out the table precompression. */
ab3bbdeb
YO
1621 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1622 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1623 "------:-----+-----------------\n" );
538e84ed 1624
3dab1dad
YO
1625 for( state=1 ; state < next_alloc ; state ++ ) {
1626 U16 charid;
538e84ed 1627
ab3bbdeb 1628 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1629 (int)depth * 2 + 2,"", (UV)state );
1630 if ( ! trie->states[ state ].wordnum ) {
1631 PerlIO_printf( Perl_debug_log, "%5s| ","");
1632 } else {
1633 PerlIO_printf( Perl_debug_log, "W%4x| ",
1634 trie->states[ state ].wordnum
1635 );
1636 }
1637 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
538e84ed
KW
1638 SV ** const tmp = av_fetch( revcharmap,
1639 TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1640 if ( tmp ) {
1641 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1642 colwidth,
538e84ed
KW
1643 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1644 colwidth,
1645 PL_colors[0], PL_colors[1],
1646 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1647 | PERL_PV_ESCAPE_FIRSTCHAR
ab3bbdeb 1648 ) ,
1e2e3d02
YO
1649 TRIE_LIST_ITEM(state,charid).forid,
1650 (UV)TRIE_LIST_ITEM(state,charid).newstate
1651 );
538e84ed 1652 if (!(charid % 10))
664e119d
RGS
1653 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1654 (int)((depth * 2) + 14), "");
1e2e3d02 1655 }
ab3bbdeb
YO
1656 }
1657 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad 1658 }
538e84ed 1659}
3dab1dad
YO
1660
1661/*
3dab1dad 1662 Dumps a fully constructed but uncompressed trie in table form.
538e84ed
KW
1663 This is the normal DFA style state transition table, with a few
1664 twists to facilitate compression later.
3dab1dad
YO
1665 Used for debugging make_trie().
1666*/
1667STATIC void
55eed653 1668S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1669 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1670 U32 depth)
3dab1dad
YO
1671{
1672 U32 state;
1673 U16 charid;
ab3bbdeb 1674 SV *sv=sv_newmortal();
55eed653 1675 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1676 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1677
1678 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
538e84ed 1679
3dab1dad
YO
1680 /*
1681 print out the table precompression so that we can do a visual check
1682 that they are identical.
1683 */
538e84ed 1684
3dab1dad
YO
1685 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1686
1687 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1688 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1689 if ( tmp ) {
538e84ed 1690 PerlIO_printf( Perl_debug_log, "%*s",
ab3bbdeb 1691 colwidth,
538e84ed 1692 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1693 PL_colors[0], PL_colors[1],
1694 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
1695 PERL_PV_ESCAPE_FIRSTCHAR
1696 )
ab3bbdeb 1697 );
3dab1dad
YO
1698 }
1699 }
1700
1701 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1702
1703 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1704 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1705 }
1706
1707 PerlIO_printf( Perl_debug_log, "\n" );
1708
1709 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1710
538e84ed 1711 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
3dab1dad
YO
1712 (int)depth * 2 + 2,"",
1713 (UV)TRIE_NODENUM( state ) );
1714
1715 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1716 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1717 if (v)
1718 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1719 else
1720 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1721 }
1722 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
538e84ed
KW
1723 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1724 (UV)trie->trans[ state ].check );
3dab1dad 1725 } else {
538e84ed
KW
1726 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1727 (UV)trie->trans[ state ].check,
3dab1dad
YO
1728 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1729 }
1730 }
07be1b83 1731}
3dab1dad
YO
1732
1733#endif
1734
2e64971a 1735
786e8c11
YO
1736/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1737 startbranch: the first branch in the whole branch sequence
1738 first : start branch of sequence of branch-exact nodes.
1739 May be the same as startbranch
1740 last : Thing following the last branch.
1741 May be the same as tail.
1742 tail : item following the branch sequence
1743 count : words in the sequence
cce29a1d 1744 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS)/
786e8c11 1745 depth : indent depth
3dab1dad 1746
786e8c11 1747Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1748
786e8c11
YO
1749A trie is an N'ary tree where the branches are determined by digital
1750decomposition of the key. IE, at the root node you look up the 1st character and
1751follow that branch repeat until you find the end of the branches. Nodes can be
1752marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1753
786e8c11 1754 /he|she|his|hers/
72f13be8 1755
786e8c11
YO
1756would convert into the following structure. Numbers represent states, letters
1757following numbers represent valid transitions on the letter from that state, if
1758the number is in square brackets it represents an accepting state, otherwise it
1759will be in parenthesis.
07be1b83 1760
786e8c11
YO
1761 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1762 | |
1763 | (2)
1764 | |
1765 (1) +-i->(6)-+-s->[7]
1766 |
1767 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1768
786e8c11
YO
1769 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1770
1771This shows that when matching against the string 'hers' we will begin at state 1
1772read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1773then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1774is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1775single traverse. We store a mapping from accepting to state to which word was
1776matched, and then when we have multiple possibilities we try to complete the
1777rest of the regex in the order in which they occured in the alternation.
1778
1779The only prior NFA like behaviour that would be changed by the TRIE support is
1780the silent ignoring of duplicate alternations which are of the form:
1781
1782 / (DUPE|DUPE) X? (?{ ... }) Y /x
1783
4b714af6 1784Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 1785and without the optimisation. With the optimisations dupes will be silently
486ec47a 1786ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
1787the following demonstrates:
1788
1789 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1790
1791which prints out 'word' three times, but
1792
1793 'words'=~/(word|word|word)(?{ print $1 })S/
1794
1795which doesnt print it out at all. This is due to other optimisations kicking in.
1796
1797Example of what happens on a structural level:
1798
486ec47a 1799The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
1800
1801 1: CURLYM[1] {1,32767}(18)
1802 5: BRANCH(8)
1803 6: EXACT <ac>(16)
1804 8: BRANCH(11)
1805 9: EXACT <ad>(16)
1806 11: BRANCH(14)
1807 12: EXACT <ab>(16)
1808 16: SUCCEED(0)
1809 17: NOTHING(18)
1810 18: END(0)
1811
1812This would be optimizable with startbranch=5, first=5, last=16, tail=16
1813and should turn into:
1814
1815 1: CURLYM[1] {1,32767}(18)
1816 5: TRIE(16)
1817 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1818 <ac>
1819 <ad>
1820 <ab>
1821 16: SUCCEED(0)
1822 17: NOTHING(18)
1823 18: END(0)
1824
1825Cases where tail != last would be like /(?foo|bar)baz/:
1826
1827 1: BRANCH(4)
1828 2: EXACT <foo>(8)
1829 4: BRANCH(7)
1830 5: EXACT <bar>(8)
1831 7: TAIL(8)
1832 8: EXACT <baz>(10)
1833 10: END(0)
1834
1835which would be optimizable with startbranch=1, first=1, last=7, tail=8
1836and would end up looking like:
1837
1838 1: TRIE(8)
1839 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1840 <foo>
1841 <bar>
1842 7: TAIL(8)
1843 8: EXACT <baz>(10)
1844 10: END(0)
1845
c80e42f3 1846 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
1847
1848is the recommended Unicode-aware way of saying
1849
1850 *(d++) = uv;
1851*/
1852
fab2782b 1853#define TRIE_STORE_REVCHAR(val) \
786e8c11 1854 STMT_START { \
73031816 1855 if (UTF) { \
fab2782b 1856 SV *zlopp = newSV(7); /* XXX: optimize me */ \
88c9ea1e 1857 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
c80e42f3 1858 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
73031816
NC
1859 SvCUR_set(zlopp, kapow - flrbbbbb); \
1860 SvPOK_on(zlopp); \
1861 SvUTF8_on(zlopp); \
1862 av_push(revcharmap, zlopp); \
1863 } else { \
fab2782b 1864 char ooooff = (char)val; \
73031816
NC
1865 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1866 } \
1867 } STMT_END
786e8c11 1868
914a25d5
KW
1869/* This gets the next character from the input, folding it if not already
1870 * folded. */
1871#define TRIE_READ_CHAR STMT_START { \
1872 wordlen++; \
1873 if ( UTF ) { \
1874 /* if it is UTF then it is either already folded, or does not need \
1875 * folding */ \
1c1d615a 1876 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
1877 } \
1878 else if (folder == PL_fold_latin1) { \
7d006b13
KW
1879 /* This folder implies Unicode rules, which in the range expressible \
1880 * by not UTF is the lower case, with the two exceptions, one of \
1881 * which should have been taken care of before calling this */ \
1882 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1883 uvc = toLOWER_L1(*uc); \
1884 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1885 len = 1; \
914a25d5
KW
1886 } else { \
1887 /* raw data, will be folded later if needed */ \
1888 uvc = (U32)*uc; \
1889 len = 1; \
1890 } \
786e8c11
YO
1891} STMT_END
1892
1893
1894
1895#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1896 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1897 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1898 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1899 } \
1900 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1901 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1902 TRIE_LIST_CUR( state )++; \
1903} STMT_END
07be1b83 1904
786e8c11
YO
1905#define TRIE_LIST_NEW(state) STMT_START { \
1906 Newxz( trie->states[ state ].trans.list, \
1907 4, reg_trie_trans_le ); \
1908 TRIE_LIST_CUR( state ) = 1; \
1909 TRIE_LIST_LEN( state ) = 4; \
1910} STMT_END
07be1b83 1911
786e8c11
YO
1912#define TRIE_HANDLE_WORD(state) STMT_START { \
1913 U16 dupe= trie->states[ state ].wordnum; \
1914 regnode * const noper_next = regnext( noper ); \
1915 \
786e8c11
YO
1916 DEBUG_r({ \
1917 /* store the word for dumping */ \
1918 SV* tmp; \
1919 if (OP(noper) != NOTHING) \
740cce10 1920 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1921 else \
740cce10 1922 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1923 av_push( trie_words, tmp ); \
786e8c11
YO
1924 }); \
1925 \
1926 curword++; \
2e64971a
DM
1927 trie->wordinfo[curword].prev = 0; \
1928 trie->wordinfo[curword].len = wordlen; \
1929 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1930 \
1931 if ( noper_next < tail ) { \
1932 if (!trie->jump) \
538e84ed
KW
1933 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
1934 sizeof(U16) ); \
7f69552c 1935 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1936 if (!jumper) \
1937 jumper = noper_next; \
1938 if (!nextbranch) \
1939 nextbranch= regnext(cur); \
1940 } \
1941 \
1942 if ( dupe ) { \
2e64971a
DM
1943 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1944 /* chain, so that when the bits of chain are later */\
1945 /* linked together, the dups appear in the chain */\
1946 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1947 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1948 } else { \
1949 /* we haven't inserted this word yet. */ \
1950 trie->states[ state ].wordnum = curword; \
1951 } \
1952} STMT_END
07be1b83 1953
3dab1dad 1954
786e8c11
YO
1955#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1956 ( ( base + charid >= ucharcount \
1957 && base + charid < ubound \
1958 && state == trie->trans[ base - ucharcount + charid ].check \
1959 && trie->trans[ base - ucharcount + charid ].next ) \
1960 ? trie->trans[ base - ucharcount + charid ].next \
1961 : ( state==1 ? special : 0 ) \
1962 )
3dab1dad 1963
786e8c11
YO
1964#define MADE_TRIE 1
1965#define MADE_JUMP_TRIE 2
1966#define MADE_EXACT_TRIE 4
3dab1dad 1967
a3621e74 1968STATIC I32
538e84ed
KW
1969S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
1970 regnode *first, regnode *last, regnode *tail,
1971 U32 word_count, U32 flags, U32 depth)
a3621e74
YO
1972{
1973 /* first pass, loop through and scan words */
1974 reg_trie_data *trie;
55eed653 1975 HV *widecharmap = NULL;
2b8b4781 1976 AV *revcharmap = newAV();
a3621e74 1977 regnode *cur;
a3621e74
YO
1978 STRLEN len = 0;
1979 UV uvc = 0;
1980 U16 curword = 0;
1981 U32 next_alloc = 0;
786e8c11
YO
1982 regnode *jumper = NULL;
1983 regnode *nextbranch = NULL;
7f69552c 1984 regnode *convert = NULL;
2e64971a 1985 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1986 /* we just use folder as a flag in utf8 */
1e696034 1987 const U8 * folder = NULL;
a3621e74 1988
2b8b4781 1989#ifdef DEBUGGING
cf78de0b 1990 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2b8b4781
NC
1991 AV *trie_words = NULL;
1992 /* along with revcharmap, this only used during construction but both are
1993 * useful during debugging so we store them in the struct when debugging.
8e11feef 1994 */
2b8b4781 1995#else
cf78de0b 1996 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 1997 STRLEN trie_charcount=0;
3dab1dad 1998#endif
2b8b4781 1999 SV *re_trie_maxbuff;
a3621e74 2000 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2001
2002 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
2003#ifndef DEBUGGING
2004 PERL_UNUSED_ARG(depth);
2005#endif
a3621e74 2006
1e696034 2007 switch (flags) {
79a81a6e 2008 case EXACT: break;
2f7f8cb1 2009 case EXACTFA:
fab2782b 2010 case EXACTFU_SS:
1e696034
KW
2011 case EXACTFU: folder = PL_fold_latin1; break;
2012 case EXACTF: folder = PL_fold; break;
fab2782b 2013 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
2014 }
2015
c944940b 2016 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 2017 trie->refcount = 1;
3dab1dad 2018 trie->startstate = 1;
786e8c11 2019 trie->wordcount = word_count;
f8fc2ecf 2020 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 2021 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
fab2782b 2022 if (flags == EXACT)
c944940b 2023 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
2024 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2025 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2026
a3621e74 2027 DEBUG_r({
2b8b4781 2028 trie_words = newAV();
a3621e74 2029 });
a3621e74 2030
0111c4fd 2031 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
316ebaf2 2032 assert(re_trie_maxbuff);
a3621e74 2033 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 2034 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 2035 }
df826430 2036 DEBUG_TRIE_COMPILE_r({
538e84ed
KW
2037 PerlIO_printf( Perl_debug_log,
2038 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2039 (int)depth * 2 + 2, "",
2040 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2041 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
3dab1dad 2042 });
538e84ed 2043
7f69552c
YO
2044 /* Find the node we are going to overwrite */
2045 if ( first == startbranch && OP( last ) != BRANCH ) {
2046 /* whole branch chain */
2047 convert = first;
2048 } else {
2049 /* branch sub-chain */
2050 convert = NEXTOPER( first );
2051 }
538e84ed 2052
a3621e74
YO
2053 /* -- First loop and Setup --
2054
2055 We first traverse the branches and scan each word to determine if it
2056 contains widechars, and how many unique chars there are, this is
2057 important as we have to build a table with at least as many columns as we
2058 have unique chars.
2059
2060 We use an array of integers to represent the character codes 0..255
538e84ed
KW
2061 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2062 the native representation of the character value as the key and IV's for
2063 the coded index.
a3621e74
YO
2064
2065 *TODO* If we keep track of how many times each character is used we can
2066 remap the columns so that the table compression later on is more
3b753521 2067 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
2068 middle and the least common are on the outside. IMO this would be better
2069 than a most to least common mapping as theres a decent chance the most
2070 common letter will share a node with the least common, meaning the node
486ec47a 2071 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
2072 case is when we have the least common nodes twice.
2073
2074 */
2075
a3621e74 2076 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 2077 regnode *noper = NEXTOPER( cur );
e1ec3a88 2078 const U8 *uc = (U8*)STRING( noper );
df826430 2079 const U8 *e = uc + STR_LEN( noper );
bc031a7d 2080 int foldlen = 0;
07be1b83 2081 U32 wordlen = 0; /* required init */
bc031a7d
KW
2082 STRLEN minchars = 0;
2083 STRLEN maxchars = 0;
538e84ed
KW
2084 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2085 bitmap?*/
a3621e74 2086
3dab1dad 2087 if (OP(noper) == NOTHING) {
df826430
YO
2088 regnode *noper_next= regnext(noper);
2089 if (noper_next != tail && OP(noper_next) == flags) {
2090 noper = noper_next;
2091 uc= (U8*)STRING(noper);
2092 e= uc + STR_LEN(noper);
2093 trie->minlen= STR_LEN(noper);
2094 } else {
2095 trie->minlen= 0;
2096 continue;
2097 }
3dab1dad 2098 }
df826430 2099
fab2782b 2100 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2101 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2102 regardless of encoding */
fab2782b
YO
2103 if (OP( noper ) == EXACTFU_SS) {
2104 /* false positives are ok, so just set this */
0dc4a61d 2105 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2106 }
2107 }
bc031a7d
KW
2108 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2109 branch */
3dab1dad 2110 TRIE_CHARCOUNT(trie)++;
a3621e74 2111 TRIE_READ_CHAR;
645de4ce 2112
bc031a7d
KW
2113 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2114 * is in effect. Under /i, this character can match itself, or
2115 * anything that folds to it. If not under /i, it can match just
2116 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2117 * all fold to k, and all are single characters. But some folds
2118 * expand to more than one character, so for example LATIN SMALL
2119 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2120 * the string beginning at 'uc' is 'ffi', it could be matched by
2121 * three characters, or just by the one ligature character. (It
2122 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2123 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2124 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2125 * match.) The trie needs to know the minimum and maximum number
2126 * of characters that could match so that it can use size alone to
2127 * quickly reject many match attempts. The max is simple: it is
2128 * the number of folded characters in this branch (since a fold is
2129 * never shorter than what folds to it. */
2130
2131 maxchars++;
2132
2133 /* And the min is equal to the max if not under /i (indicated by
2134 * 'folder' being NULL), or there are no multi-character folds. If
2135 * there is a multi-character fold, the min is incremented just
2136 * once, for the character that folds to the sequence. Each
2137 * character in the sequence needs to be added to the list below of
2138 * characters in the trie, but we count only the first towards the
2139 * min number of characters needed. This is done through the
2140 * variable 'foldlen', which is returned by the macros that look
2141 * for these sequences as the number of bytes the sequence
2142 * occupies. Each time through the loop, we decrement 'foldlen' by
2143 * how many bytes the current char occupies. Only when it reaches
2144 * 0 do we increment 'minchars' or look for another multi-character
2145 * sequence. */
2146 if (folder == NULL) {
2147 minchars++;
2148 }
2149 else if (foldlen > 0) {
2150 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
645de4ce
KW
2151 }
2152 else {
bc031a7d
KW
2153 minchars++;
2154
2155 /* See if *uc is the beginning of a multi-character fold. If
2156 * so, we decrement the length remaining to look at, to account
2157 * for the current character this iteration. (We can use 'uc'
2158 * instead of the fold returned by TRIE_READ_CHAR because for
2159 * non-UTF, the latin1_safe macro is smart enough to account
2160 * for all the unfolded characters, and because for UTF, the
2161 * string will already have been folded earlier in the
2162 * compilation process */
2163 if (UTF) {
2164 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2165 foldlen -= UTF8SKIP(uc);
645de4ce
KW
2166 }
2167 }
bc031a7d
KW
2168 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2169 foldlen--;
2170 }
645de4ce 2171 }
bc031a7d
KW
2172
2173 /* The current character (and any potential folds) should be added
2174 * to the possible matching characters for this position in this
2175 * branch */
a3621e74 2176 if ( uvc < 256 ) {
fab2782b
YO
2177 if ( folder ) {
2178 U8 folded= folder[ (U8) uvc ];
2179 if ( !trie->charmap[ folded ] ) {
2180 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2181 TRIE_STORE_REVCHAR( folded );
2182 }
2183 }
a3621e74
YO
2184 if ( !trie->charmap[ uvc ] ) {
2185 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2186 TRIE_STORE_REVCHAR( uvc );
a3621e74 2187 }
02daf0ab 2188 if ( set_bit ) {
62012aee
KW
2189 /* store the codepoint in the bitmap, and its folded
2190 * equivalent. */
fab2782b 2191 TRIE_BITMAP_SET(trie, uvc);
0921ee73
T
2192
2193 /* store the folded codepoint */
fab2782b 2194 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
0921ee73
T
2195
2196 if ( !UTF ) {
2197 /* store first byte of utf8 representation of
acdf4139 2198 variant codepoints */
6f2d5cbc 2199 if (! UVCHR_IS_INVARIANT(uvc)) {
acdf4139 2200 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
2201 }
2202 }
02daf0ab
YO
2203 set_bit = 0; /* We've done our bit :-) */
2204 }
a3621e74 2205 } else {
bc031a7d
KW
2206
2207 /* XXX We could come up with the list of code points that fold
2208 * to this using PL_utf8_foldclosures, except not for
2209 * multi-char folds, as there may be multiple combinations
2210 * there that could work, which needs to wait until runtime to
2211 * resolve (The comment about LIGATURE FFI above is such an
2212 * example */
2213
a3621e74 2214 SV** svpp;
55eed653
NC
2215 if ( !widecharmap )
2216 widecharmap = newHV();
a3621e74 2217
55eed653 2218 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
2219
2220 if ( !svpp )
e4584336 2221 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
2222
2223 if ( !SvTRUE( *svpp ) ) {
2224 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 2225 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
2226 }
2227 }
bc031a7d
KW
2228 } /* end loop through characters in this branch of the trie */
2229
2230 /* We take the min and max for this branch and combine to find the min
2231 * and max for all branches processed so far */
3dab1dad 2232 if( cur == first ) {
bc031a7d
KW
2233 trie->minlen = minchars;
2234 trie->maxlen = maxchars;
2235 } else if (minchars < trie->minlen) {
2236 trie->minlen = minchars;
2237 } else if (maxchars > trie->maxlen) {
2238 trie->maxlen = maxchars;
fab2782b 2239 }
a3621e74
YO
2240 } /* end first pass */
2241 DEBUG_TRIE_COMPILE_r(
538e84ed
KW
2242 PerlIO_printf( Perl_debug_log,
2243 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
3dab1dad 2244 (int)depth * 2 + 2,"",
55eed653 2245 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
2246 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2247 (int)trie->minlen, (int)trie->maxlen )
a3621e74 2248 );
a3621e74
YO
2249
2250 /*
2251 We now know what we are dealing with in terms of unique chars and
2252 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
2253 representation using a flat table will take. If it's over a reasonable
2254 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
2255 conservative but potentially much slower representation using an array
2256 of lists.
2257
2258 At the end we convert both representations into the same compressed
2259 form that will be used in regexec.c for matching with. The latter
2260 is a form that cannot be used to construct with but has memory
2261 properties similar to the list form and access properties similar
2262 to the table form making it both suitable for fast searches and
2263 small enough that its feasable to store for the duration of a program.
2264
2265 See the comment in the code where the compressed table is produced
2266 inplace from the flat tabe representation for an explanation of how
2267 the compression works.
2268
2269 */
2270
2271
2e64971a
DM
2272 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2273 prev_states[1] = 0;
2274
538e84ed
KW
2275 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2276 > SvIV(re_trie_maxbuff) )
2277 {
a3621e74
YO
2278 /*
2279 Second Pass -- Array Of Lists Representation
2280
2281 Each state will be represented by a list of charid:state records
2282 (reg_trie_trans_le) the first such element holds the CUR and LEN
2283 points of the allocated array. (See defines above).
2284
2285 We build the initial structure using the lists, and then convert
2286 it into the compressed table form which allows faster lookups
2287 (but cant be modified once converted).
a3621e74
YO
2288 */
2289
a3621e74
YO
2290 STRLEN transcount = 1;
2291
538e84ed 2292 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1e2e3d02
YO
2293 "%*sCompiling trie using list compiler\n",
2294 (int)depth * 2 + 2, ""));
686b73d4 2295
c944940b
JH
2296 trie->states = (reg_trie_state *)
2297 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2298 sizeof(reg_trie_state) );
a3621e74
YO
2299 TRIE_LIST_NEW(1);
2300 next_alloc = 2;
2301
2302 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2303
df826430 2304 regnode *noper = NEXTOPER( cur );
c445ea15 2305 U8 *uc = (U8*)STRING( noper );
df826430 2306 const U8 *e = uc + STR_LEN( noper );
c445ea15
AL
2307 U32 state = 1; /* required init */
2308 U16 charid = 0; /* sanity init */
07be1b83 2309 U32 wordlen = 0; /* required init */
c445ea15 2310
df826430
YO
2311 if (OP(noper) == NOTHING) {
2312 regnode *noper_next= regnext(noper);
2313 if (noper_next != tail && OP(noper_next) == flags) {
2314 noper = noper_next;
2315 uc= (U8*)STRING(noper);
2316 e= uc + STR_LEN(noper);
2317 }
2318 }
2319
3dab1dad 2320 if (OP(noper) != NOTHING) {
786e8c11 2321 for ( ; uc < e ; uc += len ) {
c445ea15 2322
786e8c11 2323 TRIE_READ_CHAR;
c445ea15 2324
786e8c11
YO
2325 if ( uvc < 256 ) {
2326 charid = trie->charmap[ uvc ];
c445ea15 2327 } else {
538e84ed
KW
2328 SV** const svpp = hv_fetch( widecharmap,
2329 (char*)&uvc,
2330 sizeof( UV ),
2331 0);
786e8c11
YO
2332 if ( !svpp ) {
2333 charid = 0;
2334 } else {
2335 charid=(U16)SvIV( *svpp );
2336 }
c445ea15 2337 }
538e84ed
KW
2338 /* charid is now 0 if we dont know the char read, or
2339 * nonzero if we do */
786e8c11 2340 if ( charid ) {
a3621e74 2341
786e8c11
YO
2342 U16 check;
2343 U32 newstate = 0;
a3621e74 2344
786e8c11
YO
2345 charid--;
2346 if ( !trie->states[ state ].trans.list ) {
2347 TRIE_LIST_NEW( state );
c445ea15 2348 }
538e84ed
KW
2349 for ( check = 1;
2350 check <= TRIE_LIST_USED( state );
2351 check++ )
2352 {
2353 if ( TRIE_LIST_ITEM( state, check ).forid
2354 == charid )
2355 {
786e8c11
YO
2356 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2357 break;
2358 }
2359 }
2360 if ( ! newstate ) {
2361 newstate = next_alloc++;
2e64971a 2362 prev_states[newstate] = state;
786e8c11
YO
2363 TRIE_LIST_PUSH( state, charid, newstate );
2364 transcount++;
2365 }
2366 state = newstate;
2367 } else {
2368 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 2369 }
a28509cc 2370 }
c445ea15 2371 }
3dab1dad 2372 TRIE_HANDLE_WORD(state);
a3621e74
YO
2373
2374 } /* end second pass */
2375
1e2e3d02 2376 /* next alloc is the NEXT state to be allocated */
538e84ed 2377 trie->statecount = next_alloc;
c944940b
JH
2378 trie->states = (reg_trie_state *)
2379 PerlMemShared_realloc( trie->states,
2380 next_alloc
2381 * sizeof(reg_trie_state) );
a3621e74 2382
3dab1dad 2383 /* and now dump it out before we compress it */
2b8b4781
NC
2384 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2385 revcharmap, next_alloc,
2386 depth+1)
1e2e3d02 2387 );
a3621e74 2388
c944940b
JH
2389 trie->trans = (reg_trie_trans *)
2390 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
2391 {
2392 U32 state;
a3621e74
YO
2393 U32 tp = 0;
2394 U32 zp = 0;
2395
2396
2397 for( state=1 ; state < next_alloc ; state ++ ) {
2398 U32 base=0;
2399
2400 /*
2401 DEBUG_TRIE_COMPILE_MORE_r(
2402 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2403 );
2404 */
2405
2406 if (trie->states[state].trans.list) {
2407 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2408 U16 maxid=minid;
a28509cc 2409 U16 idx;
a3621e74
YO
2410
2411 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
2412 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2413 if ( forid < minid ) {
2414 minid=forid;
2415 } else if ( forid > maxid ) {
2416 maxid=forid;
2417 }
a3621e74
YO
2418 }
2419 if ( transcount < tp + maxid - minid + 1) {
2420 transcount *= 2;
c944940b
JH
2421 trie->trans = (reg_trie_trans *)
2422 PerlMemShared_realloc( trie->trans,
446bd890
NC
2423 transcount
2424 * sizeof(reg_trie_trans) );
538e84ed
KW
2425 Zero( trie->trans + (transcount / 2),
2426 transcount / 2,
2427 reg_trie_trans );
a3621e74
YO
2428 }
2429 base = trie->uniquecharcount + tp - minid;
2430 if ( maxid == minid ) {
2431 U32 set = 0;
2432 for ( ; zp < tp ; zp++ ) {
2433 if ( ! trie->trans[ zp ].next ) {
2434 base = trie->uniquecharcount + zp - minid;
538e84ed
KW
2435 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2436 1).newstate;
a3621e74
YO
2437 trie->trans[ zp ].check = state;
2438 set = 1;
2439 break;
2440 }
2441 }
2442 if ( !set ) {
538e84ed
KW
2443 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2444 1).newstate;
a3621e74
YO
2445 trie->trans[ tp ].check = state;
2446 tp++;
2447 zp = tp;
2448 }
2449 } else {
2450 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
538e84ed
KW
2451 const U32 tid = base
2452 - trie->uniquecharcount
2453 + TRIE_LIST_ITEM( state, idx ).forid;
2454 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2455 idx ).newstate;
a3621e74
YO
2456 trie->trans[ tid ].check = state;
2457 }
2458 tp += ( maxid - minid + 1 );
2459 }
2460 Safefree(trie->states[ state ].trans.list);
2461 }
2462 /*
2463 DEBUG_TRIE_COMPILE_MORE_r(
2464 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2465 );
2466 */
2467 trie->states[ state ].trans.base=base;
2468 }
cc601c31 2469 trie->lasttrans = tp + 1;
a3621e74
YO
2470 }
2471 } else {
2472 /*
2473 Second Pass -- Flat Table Representation.
2474
b423522f
KW
2475 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2476 each. We know that we will need Charcount+1 trans at most to store
2477 the data (one row per char at worst case) So we preallocate both
2478 structures assuming worst case.
a3621e74
YO
2479
2480 We then construct the trie using only the .next slots of the entry
2481 structs.
2482
b423522f
KW
2483 We use the .check field of the first entry of the node temporarily
2484 to make compression both faster and easier by keeping track of how
2485 many non zero fields are in the node.
a3621e74
YO
2486
2487 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2488 transition.
2489
b423522f
KW
2490 There are two terms at use here: state as a TRIE_NODEIDX() which is
2491 a number representing the first entry of the node, and state as a
2492 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2493 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2494 if there are 2 entrys per node. eg:
a3621e74
YO
2495
2496 A B A B
2497 1. 2 4 1. 3 7
2498 2. 0 3 3. 0 5
2499 3. 0 0 5. 0 0
2500 4. 0 0 7. 0 0
2501
b423522f
KW
2502 The table is internally in the right hand, idx form. However as we
2503 also have to deal with the states array which is indexed by nodenum
2504 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
2505
2506 */
538e84ed 2507 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1e2e3d02
YO
2508 "%*sCompiling trie using table compiler\n",
2509 (int)depth * 2 + 2, ""));
3dab1dad 2510
c944940b
JH
2511 trie->trans = (reg_trie_trans *)
2512 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2513 * trie->uniquecharcount + 1,
2514 sizeof(reg_trie_trans) );
2515 trie->states = (reg_trie_state *)
2516 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2517 sizeof(reg_trie_state) );
a3621e74
YO
2518 next_alloc = trie->uniquecharcount + 1;
2519
3dab1dad 2520
a3621e74
YO
2521 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2522
df826430 2523 regnode *noper = NEXTOPER( cur );
a28509cc 2524 const U8 *uc = (U8*)STRING( noper );
df826430 2525 const U8 *e = uc + STR_LEN( noper );
a3621e74
YO
2526
2527 U32 state = 1; /* required init */
2528
2529 U16 charid = 0; /* sanity init */
2530 U32 accept_state = 0; /* sanity init */
a3621e74 2531
07be1b83 2532 U32 wordlen = 0; /* required init */
a3621e74 2533
df826430
YO
2534 if (OP(noper) == NOTHING) {
2535 regnode *noper_next= regnext(noper);
2536 if (noper_next != tail && OP(noper_next) == flags) {
2537 noper = noper_next;
2538 uc= (U8*)STRING(noper);
2539 e= uc + STR_LEN(noper);
2540 }
2541 }
fab2782b 2542
3dab1dad 2543 if ( OP(noper) != NOTHING ) {
786e8c11 2544 for ( ; uc < e ; uc += len ) {
a3621e74 2545
786e8c11 2546 TRIE_READ_CHAR;
a3621e74 2547
786e8c11
YO
2548 if ( uvc < 256 ) {
2549 charid = trie->charmap[ uvc ];
2550 } else {
538e84ed
KW
2551 SV* const * const svpp = hv_fetch( widecharmap,
2552 (char*)&uvc,
2553 sizeof( UV ),
2554 0);
786e8c11 2555 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 2556 }
786e8c11
YO
2557 if ( charid ) {
2558 charid--;
2559 if ( !trie->trans[ state + charid ].next ) {
2560 trie->trans[ state + charid ].next = next_alloc;
2561 trie->trans[ state ].check++;
2e64971a
DM
2562 prev_states[TRIE_NODENUM(next_alloc)]
2563 = TRIE_NODENUM(state);
786e8c11
YO
2564 next_alloc += trie->uniquecharcount;
2565 }
2566 state = trie->trans[ state + charid ].next;
2567 } else {
2568 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2569 }
538e84ed
KW
2570 /* charid is now 0 if we dont know the char read, or
2571 * nonzero if we do */
a3621e74 2572 }
a3621e74 2573 }
3dab1dad
YO
2574 accept_state = TRIE_NODENUM( state );
2575 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
2576
2577 } /* end second pass */
2578
3dab1dad 2579 /* and now dump it out before we compress it */
2b8b4781
NC
2580 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2581 revcharmap,
2582 next_alloc, depth+1));
a3621e74 2583
a3621e74
YO
2584 {
2585 /*
2586 * Inplace compress the table.*
2587
2588 For sparse data sets the table constructed by the trie algorithm will
2589 be mostly 0/FAIL transitions or to put it another way mostly empty.
2590 (Note that leaf nodes will not contain any transitions.)
2591
2592 This algorithm compresses the tables by eliminating most such
2593 transitions, at the cost of a modest bit of extra work during lookup:
2594
2595 - Each states[] entry contains a .base field which indicates the
2596 index in the state[] array wheres its transition data is stored.
2597
3b753521 2598 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
2599
2600 - If .base is nonzero then charid is added to it to find an entry in
2601 the trans array.
2602
2603 -If trans[states[state].base+charid].check!=state then the
2604 transition is taken to be a 0/Fail transition. Thus if there are fail
2605 transitions at the front of the node then the .base offset will point
2606 somewhere inside the previous nodes data (or maybe even into a node
2607 even earlier), but the .check field determines if the transition is
2608 valid.
2609
786e8c11 2610 XXX - wrong maybe?
a3621e74 2611 The following process inplace converts the table to the compressed
3b753521 2612 table: We first do not compress the root node 1,and mark all its
a3621e74 2613 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
2614 allows us to do a DFA construction from the compressed table later,
2615 and ensures that any .base pointers we calculate later are greater
2616 than 0.
a3621e74
YO
2617
2618 - We set 'pos' to indicate the first entry of the second node.
2619
2620 - We then iterate over the columns of the node, finding the first and
2621 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2622 and set the .check pointers accordingly, and advance pos
2623 appropriately and repreat for the next node. Note that when we copy
2624 the next pointers we have to convert them from the original
2625 NODEIDX form to NODENUM form as the former is not valid post
2626 compression.
2627
2628 - If a node has no transitions used we mark its base as 0 and do not
2629 advance the pos pointer.
2630
2631 - If a node only has one transition we use a second pointer into the
2632 structure to fill in allocated fail transitions from other states.
2633 This pointer is independent of the main pointer and scans forward
2634 looking for null transitions that are allocated to a state. When it
2635 finds one it writes the single transition into the "hole". If the
786e8c11 2636 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
2637
2638 - Once compressed we can Renew/realloc the structures to release the
2639 excess space.
2640
2641 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2642 specifically Fig 3.47 and the associated pseudocode.
2643
2644 demq
2645 */
a3b680e6 2646 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 2647 U32 state, charid;
a3621e74 2648 U32 pos = 0, zp=0;
1e2e3d02 2649 trie->statecount = laststate;
a3621e74
YO
2650
2651 for ( state = 1 ; state < laststate ; state++ ) {
2652 U8 flag = 0;
a28509cc
AL
2653 const U32 stateidx = TRIE_NODEIDX( state );
2654 const U32 o_used = trie->trans[ stateidx ].check;
2655 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
2656 trie->trans[ stateidx ].check = 0;
2657
538e84ed
KW
2658 for ( charid = 0;
2659 used && charid < trie->uniquecharcount;
2660 charid++ )
2661 {
a3621e74
YO
2662 if ( flag || trie->trans[ stateidx + charid ].next ) {
2663 if ( trie->trans[ stateidx + charid ].next ) {
2664 if (o_used == 1) {
2665 for ( ; zp < pos ; zp++ ) {
2666 if ( ! trie->trans[ zp ].next ) {
2667 break;
2668 }
2669 }
538e84ed
KW
2670 trie->states[ state ].trans.base
2671 = zp
2672 + trie->uniquecharcount
2673 - charid ;
2674 trie->trans[ zp ].next
2675 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2676 + charid ].next );
a3621e74
YO
2677 trie->trans[ zp ].check = state;
2678 if ( ++zp > pos ) pos = zp;
2679 break;
2680 }
2681 used--;
2682 }
2683 if ( !flag ) {
2684 flag = 1;
538e84ed
KW
2685 trie->states[ state ].trans.base
2686 = pos + trie->uniquecharcount - charid ;
a3621e74 2687 }
538e84ed
KW
2688 trie->trans[ pos ].next
2689 = SAFE_TRIE_NODENUM(
2690 trie->trans[ stateidx + charid ].next );
a3621e74
YO
2691 trie->trans[ pos ].check = state;
2692 pos++;
2693 }
2694 }
2695 }
cc601c31 2696 trie->lasttrans = pos + 1;
c944940b
JH
2697 trie->states = (reg_trie_state *)
2698 PerlMemShared_realloc( trie->states, laststate
2699 * sizeof(reg_trie_state) );
a3621e74 2700 DEBUG_TRIE_COMPILE_MORE_r(
538e84ed
KW
2701 PerlIO_printf( Perl_debug_log,
2702 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2703 (int)depth * 2 + 2,"",
2704 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2705 + 1 ),
2706 (IV)next_alloc,
2707 (IV)pos,
2708 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
a3621e74
YO
2709 );
2710
2711 } /* end table compress */
2712 }
1e2e3d02 2713 DEBUG_TRIE_COMPILE_MORE_r(
538e84ed
KW
2714 PerlIO_printf(Perl_debug_log,
2715 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1e2e3d02
YO
2716 (int)depth * 2 + 2, "",
2717 (UV)trie->statecount,
2718 (UV)trie->lasttrans)
2719 );
cc601c31 2720 /* resize the trans array to remove unused space */
c944940b
JH
2721 trie->trans = (reg_trie_trans *)
2722 PerlMemShared_realloc( trie->trans, trie->lasttrans
2723 * sizeof(reg_trie_trans) );
a3621e74 2724
538e84ed 2725 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
2726 U8 nodetype =(U8)(flags & 0xFF);
2727 char *str=NULL;
538e84ed 2728
07be1b83 2729#ifdef DEBUGGING
e62cc96a 2730 regnode *optimize = NULL;
7122b237
YO
2731#ifdef RE_TRACK_PATTERN_OFFSETS
2732
b57a0404
JH
2733 U32 mjd_offset = 0;
2734 U32 mjd_nodelen = 0;
7122b237
YO
2735#endif /* RE_TRACK_PATTERN_OFFSETS */
2736#endif /* DEBUGGING */
a3621e74 2737 /*
3dab1dad
YO
2738 This means we convert either the first branch or the first Exact,
2739 depending on whether the thing following (in 'last') is a branch
2740 or not and whther first is the startbranch (ie is it a sub part of
2741 the alternation or is it the whole thing.)
3b753521 2742 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 2743 the whole branch sequence, including the first.
a3621e74 2744 */
3dab1dad 2745 /* Find the node we are going to overwrite */
7f69552c 2746 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 2747 /* branch sub-chain */
3dab1dad 2748 NEXT_OFF( first ) = (U16)(last - first);
7122b237 2749#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
2750 DEBUG_r({
2751 mjd_offset= Node_Offset((convert));
2752 mjd_nodelen= Node_Length((convert));
2753 });
7122b237 2754#endif
7f69552c 2755 /* whole branch chain */
7122b237
YO
2756 }
2757#ifdef RE_TRACK_PATTERN_OFFSETS
2758 else {
7f69552c
YO
2759 DEBUG_r({
2760 const regnode *nop = NEXTOPER( convert );
2761 mjd_offset= Node_Offset((nop));
2762 mjd_nodelen= Node_Length((nop));
2763 });
07be1b83
YO
2764 }
2765 DEBUG_OPTIMISE_r(
538e84ed
KW
2766 PerlIO_printf(Perl_debug_log,
2767 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
07be1b83 2768 (int)depth * 2 + 2, "",
786e8c11 2769 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2770 );
7122b237 2771#endif
538e84ed 2772 /* But first we check to see if there is a common prefix we can
3dab1dad
YO
2773 split out as an EXACT and put in front of the TRIE node. */
2774 trie->startstate= 1;
55eed653 2775 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2776 U32 state;
1e2e3d02 2777 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2778 U32 ofs = 0;
8e11feef
RGS
2779 I32 idx = -1;
2780 U32 count = 0;
2781 const U32 base = trie->states[ state ].trans.base;
a3621e74 2782
3dab1dad 2783 if ( trie->states[state].wordnum )
8e11feef 2784 count = 1;
a3621e74 2785
8e11feef 2786 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2787 if ( ( base + ofs >= trie->uniquecharcount ) &&
2788 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2789 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2790 {
3dab1dad 2791 if ( ++count > 1 ) {
2b8b4781 2792 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2793 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2794 if ( state == 1 ) break;
3dab1dad
YO
2795 if ( count == 2 ) {
2796 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2797 DEBUG_OPTIMISE_r(
8e11feef
RGS
2798 PerlIO_printf(Perl_debug_log,
2799 "%*sNew Start State=%"UVuf" Class: [",
2800 (int)depth * 2 + 2, "",
786e8c11 2801 (UV)state));
be8e71aa 2802 if (idx >= 0) {
2b8b4781 2803 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2804 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2805
3dab1dad 2806 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2807 if ( folder )
2808 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2809 DEBUG_OPTIMISE_r(
f1f66076 2810 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2811 );
8e11feef
RGS
2812 }
2813 }
2814 TRIE_BITMAP_SET(trie,*ch);
2815 if ( folder )
2816 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2817 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2818 }
2819 idx = ofs;
2820 }
3dab1dad
YO
2821 }
2822 if ( count == 1 ) {
2b8b4781 2823 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2824 STRLEN len;
2825 char *ch = SvPV( *tmp, len );
de734bd5
A
2826 DEBUG_OPTIMISE_r({
2827 SV *sv=sv_newmortal();
8e11feef
RGS
2828 PerlIO_printf( Perl_debug_log,
2829 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2830 (int)depth * 2 + 2, "",
538e84ed
KW
2831 (UV)state, (UV)idx,
2832 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
de734bd5
A
2833 PL_colors[0], PL_colors[1],
2834 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed 2835 PERL_PV_ESCAPE_FIRSTCHAR
de734bd5
A
2836 )
2837 );
2838 });
3dab1dad
YO
2839 if ( state==1 ) {
2840 OP( convert ) = nodetype;
2841 str=STRING(convert);
2842 STR_LEN(convert)=0;
2843 }
c490c714
YO
2844 STR_LEN(convert) += len;
2845 while (len--)
de734bd5 2846 *str++ = *ch++;
8e11feef 2847 } else {
538e84ed 2848#ifdef DEBUGGING
8e11feef
RGS
2849 if (state>1)
2850 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2851#endif
8e11feef
RGS
2852 break;
2853 }
2854 }
2e64971a 2855 trie->prefixlen = (state-1);
3dab1dad 2856 if (str) {
8e11feef 2857 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2858 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2859 trie->startstate = state;
07be1b83
YO
2860 trie->minlen -= (state - 1);
2861 trie->maxlen -= (state - 1);
33809eae
JH
2862#ifdef DEBUGGING
2863 /* At least the UNICOS C compiler choked on this
2864 * being argument to DEBUG_r(), so let's just have
2865 * it right here. */
2866 if (
2867#ifdef PERL_EXT_RE_BUILD
2868 1
2869#else
2870 DEBUG_r_TEST
2871#endif
2872 ) {
2873 regnode *fix = convert;
2874 U32 word = trie->wordcount;
2875 mjd_nodelen++;
2876 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2877 while( ++fix < n ) {
2878 Set_Node_Offset_Length(fix, 0, 0);
2879 }
2880 while (word--) {
2881 SV ** const tmp = av_fetch( trie_words, word, 0 );
2882 if (tmp) {
2883 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2884 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2885 else
2886 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2887 }
2888 }
2889 }
2890#endif
8e11feef
RGS
2891 if (trie->maxlen) {
2892 convert = n;
2893 } else {
3dab1dad 2894 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2895 DEBUG_r(optimize= n);
3dab1dad
YO
2896 }
2897 }
2898 }
538e84ed
KW
2899 if (!jumper)
2900 jumper = last;
3dab1dad 2901 if ( trie->maxlen ) {
8e11feef
RGS
2902 NEXT_OFF( convert ) = (U16)(tail - convert);
2903 ARG_SET( convert, data_slot );
538e84ed
KW
2904 /* Store the offset to the first unabsorbed branch in
2905 jump[0], which is otherwise unused by the jump logic.
786e8c11 2906 We use this when dumping a trie and during optimisation. */
538e84ed 2907 if (trie->jump)
7f69552c 2908 trie->jump[0] = (U16)(nextbranch - convert);
538e84ed 2909
6c48061a
YO
2910 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2911 * and there is a bitmap
2912 * and the first "jump target" node we found leaves enough room
2913 * then convert the TRIE node into a TRIEC node, with the bitmap
2914 * embedded inline in the opcode - this is hypothetically faster.
2915 */
2916 if ( !trie->states[trie->startstate].wordnum
2917 && trie->bitmap
2918 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2919 {
2920 OP( convert ) = TRIEC;
2921 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2922 PerlMemShared_free(trie->bitmap);
786e8c11 2923 trie->bitmap= NULL;
538e84ed 2924 } else
786e8c11 2925 OP( convert ) = TRIE;
a3621e74 2926
3dab1dad
YO
2927 /* store the type in the flags */
2928 convert->flags = nodetype;
a5ca303d 2929 DEBUG_r({
538e84ed
KW
2930 optimize = convert
2931 + NODE_STEP_REGNODE
a5ca303d
YO
2932 + regarglen[ OP( convert ) ];
2933 });
538e84ed 2934 /* XXX We really should free up the resource in trie now,
a5ca303d 2935 as we won't use them - (which resources?) dmq */
3dab1dad 2936 }
a3621e74 2937 /* needed for dumping*/
e62cc96a 2938 DEBUG_r(if (optimize) {
07be1b83 2939 regnode *opt = convert;
bcdf7404 2940
e62cc96a 2941 while ( ++opt < optimize) {
07be1b83
YO
2942 Set_Node_Offset_Length(opt,0,0);
2943 }
538e84ed
KW
2944 /*
2945 Try to clean up some of the debris left after the
786e8c11 2946 optimisation.
a3621e74 2947 */
786e8c11 2948 while( optimize < jumper ) {
07be1b83 2949 mjd_nodelen += Node_Length((optimize));
a3621e74 2950 OP( optimize ) = OPTIMIZED;
07be1b83 2951 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2952 optimize++;
2953 }
07be1b83 2954 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2955 });
2956 } /* end node insert */
2e64971a
DM
2957
2958 /* Finish populating the prev field of the wordinfo array. Walk back
2959 * from each accept state until we find another accept state, and if
2960 * so, point the first word's .prev field at the second word. If the
2961 * second already has a .prev field set, stop now. This will be the
2962 * case either if we've already processed that word's accept state,
3b753521
FN
2963 * or that state had multiple words, and the overspill words were
2964 * already linked up earlier.
2e64971a
DM
2965 */
2966 {
2967 U16 word;
2968 U32 state;
2969 U16 prev;
2970
2971 for (word=1; word <= trie->wordcount; word++) {
2972 prev = 0;
2973 if (trie->wordinfo[word].prev)
2974 continue;
2975 state = trie->wordinfo[word].accept;
2976 while (state) {
2977 state = prev_states[state];
2978 if (!state)
2979 break;
2980 prev = trie->states[state].wordnum;
2981 if (prev)
2982 break;
2983 }
2984 trie->wordinfo[word].prev = prev;
2985 }
2986 Safefree(prev_states);
2987 }
2988
2989
2990 /* and now dump out the compressed format */
2991 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2992
55eed653 2993 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2994#ifdef DEBUGGING
2995 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2996 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2997#else
03e70be4 2998 SvREFCNT_dec_NN(revcharmap);
07be1b83 2999#endif
538e84ed
KW
3000 return trie->jump
3001 ? MADE_JUMP_TRIE
3002 : trie->startstate>1
3003 ? MADE_EXACT_TRIE
786e8c11
YO
3004 : MADE_TRIE;
3005}
3006
615a2e7f
YO
3007STATIC regnode *
3008S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
786e8c11 3009{
b423522f
KW
3010/* The Trie is constructed and compressed now so we can build a fail array if
3011 * it's needed
786e8c11 3012
b423522f
KW
3013 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3014 3.32 in the
3015 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3016 Ullman 1985/88
786e8c11
YO
3017 ISBN 0-201-10088-6
3018
b423522f
KW
3019 We find the fail state for each state in the trie, this state is the longest
3020 proper suffix of the current state's 'word' that is also a proper prefix of
3021 another word in our trie. State 1 represents the word '' and is thus the
3022 default fail state. This allows the DFA not to have to restart after its
3023 tried and failed a word at a given point, it simply continues as though it
3024 had been matching the other word in the first place.
786e8c11
YO
3025 Consider
3026 'abcdgu'=~/abcdefg|cdgu/
b423522f
KW
3027 When we get to 'd' we are still matching the first word, we would encounter
3028 'g' which would fail, which would bring us to the state representing 'd' in
3029 the second word where we would try 'g' and succeed, proceeding to match
3030 'cdgu'.
786e8c11
YO
3031 */
3032 /* add a fail transition */
3251b653
NC
3033 const U32 trie_offset = ARG(source);
3034 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
3035 U32 *q;
3036 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 3037 const U32 numstates = trie->statecount;
786e8c11
YO
3038 const U32 ubound = trie->lasttrans + ucharcount;
3039 U32 q_read = 0;
3040 U32 q_write = 0;
3041 U32 charid;
3042 U32 base = trie->states[ 1 ].trans.base;
3043 U32 *fail;
3044 reg_ac_data *aho;
cf78de0b 3045 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
615a2e7f 3046 regnode *stclass;
786e8c11 3047 GET_RE_DEBUG_FLAGS_DECL;
7918f24d 3048
615a2e7f 3049 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
81611534 3050 PERL_UNUSED_CONTEXT;
786e8c11
YO
3051#ifndef DEBUGGING
3052 PERL_UNUSED_ARG(depth);
3053#endif
3054
615a2e7f
YO
3055 if ( OP(source) == TRIE ) {
3056 struct regnode_1 *op = (struct regnode_1 *)
3057 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3058 StructCopy(source,op,struct regnode_1);
3059 stclass = (regnode *)op;
3060 } else {
3061 struct regnode_charclass *op = (struct regnode_charclass *)
3062 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3063 StructCopy(source,op,struct regnode_charclass);
3064 stclass = (regnode *)op;
3065 }
3066 OP(stclass)+=2; /* covert the TRIE type to its AHO-CORASICK equivalent */
786e8c11
YO
3067
3068 ARG_SET( stclass, data_slot );
c944940b 3069 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 3070 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 3071 aho->trie=trie_offset;
446bd890
NC
3072 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3073 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 3074 Newxz( q, numstates, U32);
c944940b 3075 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
3076 aho->refcount = 1;
3077 fail = aho->fail;
3078 /* initialize fail[0..1] to be 1 so that we always have
3079 a valid final fail state */
3080 fail[ 0 ] = fail[ 1 ] = 1;
3081
3082 for ( charid = 0; charid < ucharcount ; charid++ ) {
3083 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3084 if ( newstate ) {
3085 q[ q_write ] = newstate;
3086 /* set to point at the root */
3087 fail[ q[ q_write++ ] ]=1;
3088 }
3089 }
3090 while ( q_read < q_write) {
3091 const U32 cur = q[ q_read++ % numstates ];
3092 base = trie->states[ cur ].trans.base;
3093
3094 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3095 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3096 if (ch_state) {
3097 U32 fail_state = cur;
3098 U32 fail_base;
3099 do {
3100 fail_state = fail[ fail_state ];
3101 fail_base = aho->states[ fail_state ].trans.base;
3102 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3103
3104 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3105 fail[ ch_state ] = fail_state;
3106 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3107 {
3108 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3109 }
3110 q[ q_write++ % numstates] = ch_state;
3111 }
3112 }
3113 }
3114 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3115 when we fail in state 1, this allows us to use the
3116 charclass scan to find a valid start char. This is based on the principle
3117 that theres a good chance the string being searched contains lots of stuff
3118 that cant be a start char.
3119 */
3120 fail[ 0 ] = fail[ 1 ] = 0;
3121 DEBUG_TRIE_COMPILE_r({
6d99fb9b 3122 PerlIO_printf(Perl_debug_log,
538e84ed 3123 "%*sStclass Failtable (%"UVuf" states): 0",
6d99fb9b 3124 (int)(depth * 2), "", (UV)numstates
1e2e3d02 3125 );
786e8c11
YO
3126 for( q_read=1; q_read<numstates; q_read++ ) {
3127 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3128 }
3129 PerlIO_printf(Perl_debug_log, "\n");
3130 });
3131 Safefree(q);
e384d5c1 3132 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
615a2e7f 3133 return stclass;
a3621e74
YO
3134}
3135
786e8c11 3136
07be1b83 3137#define DEBUG_PEEP(str,scan,depth) \
b515a41d 3138 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
3139 SV * const mysv=sv_newmortal(); \
3140 regnode *Next = regnext(scan); \
2395827c 3141 regprop(RExC_rx, mysv, scan, NULL); \
7f69552c 3142 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
3143 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3144 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 3145 }});
07be1b83 3146
1de06328 3147
bb914485 3148/* The below joins as many adjacent EXACTish nodes as possible into a single
0a982f06
KW
3149 * one. The regop may be changed if the node(s) contain certain sequences that
3150 * require special handling. The joining is only done if:
bb914485
KW
3151 * 1) there is room in the current conglomerated node to entirely contain the
3152 * next one.
3153 * 2) they are the exact same node type
3154 *
87b8b349 3155 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
bb914485
KW
3156 * these get optimized out
3157 *
0a982f06
KW
3158 * If a node is to match under /i (folded), the number of characters it matches
3159 * can be different than its character length if it contains a multi-character
31f05a37
KW
3160 * fold. *min_subtract is set to the total delta number of characters of the
3161 * input nodes.
bb914485 3162 *
fb29cc72 3163 * And *unfolded_multi_char is set to indicate whether or not the node contains
31f05a37
KW
3164 * an unfolded multi-char fold. This happens when whether the fold is valid or
3165 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3166 * SMALL LETTER SHARP S, as only if the target string being matched against
3167 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3168 * folding rules depend on the locale in force at runtime. (Multi-char folds
3169 * whose components are all above the Latin1 range are not run-time locale
3170 * dependent, and have already been folded by the time this function is
3171 * called.)
f758bddf 3172 *
bb914485 3173 * This is as good a place as any to discuss the design of handling these
0a982f06
KW
3174 * multi-character fold sequences. It's been wrong in Perl for a very long
3175 * time. There are three code points in Unicode whose multi-character folds
3176 * were long ago discovered to mess things up. The previous designs for
3177 * dealing with these involved assigning a special node for them. This
538e84ed 3178 * approach doesn't always work, as evidenced by this example:
a0c4c608 3179 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
538e84ed 3180 * Both sides fold to "sss", but if the pattern is parsed to create a node that
0a982f06 3181 * would match just the \xDF, it won't be able to handle the case where a
bb914485
KW
3182 * successful match would have to cross the node's boundary. The new approach
3183 * that hopefully generally solves the problem generates an EXACTFU_SS node
538e84ed 3184 * that is "sss" in this case.
bb914485 3185 *
0a982f06 3186 * It turns out that there are problems with all multi-character folds, and not
cb117658
KW
3187 * just these three. Now the code is general, for all such cases. The
3188 * approach taken is:
0a982f06 3189 * 1) This routine examines each EXACTFish node that could contain multi-
538e84ed
KW
3190 * character folded sequences. Since a single character can fold into
3191 * such a sequence, the minimum match length for this node is less than
3192 * the number of characters in the node. This routine returns in
31f05a37
KW
3193 * *min_subtract how many characters to subtract from the the actual
3194 * length of the string to get a real minimum match length; it is 0 if
3195 * there are no multi-char foldeds. This delta is used by the caller to
3196 * adjust the min length of the match, and the delta between min and max,
3197 * so that the optimizer doesn't reject these possibilities based on size
3198 * constraints.
cb117658 3199 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
0a982f06
KW
3200 * is used for an EXACTFU node that contains at least one "ss" sequence in
3201 * it. For non-UTF-8 patterns and strings, this is the only case where
3202 * there is a possible fold length change. That means that a regular
3203 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3204 * with length changes, and so can be processed faster. regexec.c takes
3205 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
31f05a37
KW
3206 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3207 * known until runtime). This saves effort in regex matching. However,
3208 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3209 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3210 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3211 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
0a982f06
KW
3212 * possibilities for the non-UTF8 patterns are quite simple, except for
3213 * the sharp s. All the ones that don't involve a UTF-8 target string are
3214 * members of a fold-pair, and arrays are set up for all of them so that
3215 * the other member of the pair can be found quickly. Code elsewhere in
3216 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3217 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3218 * described in the next item.
31f05a37
KW
3219 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3220 * validity of the fold won't be known until runtime, and so must remain
3221 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3222 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3223 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3224 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3225 * The reason this is a problem is that the optimizer part of regexec.c
3226 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3227 * that a character in the pattern corresponds to at most a single
3228 * character in the target string. (And I do mean character, and not byte
3229 * here, unlike other parts of the documentation that have never been
3230 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3231 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3232 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3233 * nodes, violate the assumption, and they are the only instances where it
3234 * is violated. I'm reluctant to try to change the assumption, as the
3235 * code involved is impenetrable to me (khw), so instead the code here
3236 * punts. This routine examines EXACTFL nodes, and (when the pattern
3237 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3238 * boolean indicating whether or not the node contains such a fold. When
3239 * it is true, the caller sets a flag that later causes the optimizer in
3240 * this file to not set values for the floating and fixed string lengths,
3241 * and thus avoids the optimizer code in regexec.c that makes the invalid
1ca267a5 3242 * assumption. Thus, there is no optimization based on string lengths for
31f05a37
KW
3243 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3244 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3245 * assumption is wrong only in these cases is that all other non-UTF-8
3246 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3247 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3248 * EXACTF nodes because we don't know at compile time if it actually
3249 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3250 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3251 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3252 * a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
3253 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
3254 * string would require the pattern to be forced into UTF-8, the overhead
3255 * of which we want to avoid. Similarly the unfolded multi-char folds in
3256 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
3257 * locale.)
098b07d5
KW
3258 *
3259 * Similarly, the code that generates tries doesn't currently handle
3260 * not-already-folded multi-char folds, and it looks like a pain to change
3261 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3262 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3263 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3264 * using /iaa matching will be doing so almost entirely with ASCII
3265 * strings, so this should rarely be encountered in practice */
1de06328 3266
fb29cc72 3267#define JOIN_EXACT(scan,min_subtract,unfolded_multi_char, flags) \
07be1b83 3268 if (PL_regkind[OP(scan)] == EXACT) \
fb29cc72 3269 join_exact(pRExC_state,(scan),(min_subtract),unfolded_multi_char, (flags),NULL,depth+1)
07be1b83 3270
be8e71aa 3271STATIC U32
538e84ed 3272S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
fb29cc72 3273 UV *min_subtract, bool *unfolded_multi_char,
538e84ed
KW
3274 U32 flags,regnode *val, U32 depth)
3275{
07be1b83
YO
3276 /* Merge several consecutive EXACTish nodes into one. */
3277 regnode *n = regnext(scan);
3278 U32 stringok = 1;
3279 regnode *next = scan + NODE_SZ_STR(scan);
3280 U32 merged = 0;
3281 U32 stopnow = 0;
3282#ifdef DEBUGGING
3283 regnode *stop = scan;
72f13be8 3284 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 3285#else
d47053eb
RGS
3286 PERL_UNUSED_ARG(depth);
3287#endif
7918f24d
NC
3288
3289 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 3290#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
3291 PERL_UNUSED_ARG(flags);
3292 PERL_UNUSED_ARG(val);
07be1b83 3293#endif
07be1b83 3294 DEBUG_PEEP("join",scan,depth);
bb914485 3295
3f410cf6
KW
3296 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3297 * EXACT ones that are mergeable to the current one. */
3298 while (n
3299 && (PL_regkind[OP(n)] == NOTHING
3300 || (stringok && OP(n) == OP(scan)))
07be1b83 3301 && NEXT_OFF(n)
3f410cf6
KW
3302 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3303 {
538e84ed 3304
07be1b83
YO
3305 if (OP(n) == TAIL || n > next)
3306 stringok = 0;
3307 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
3308 DEBUG_PEEP("skip:",n,depth);
3309 NEXT_OFF(scan) += NEXT_OFF(n);
3310 next = n + NODE_STEP_REGNODE;
3311#ifdef DEBUGGING
3312 if (stringok)
3313 stop = n;
3314#endif
3315 n = regnext(n);
3316 }
3317 else if (stringok) {
786e8c11 3318 const unsigned int oldl = STR_LEN(scan);
07be1b83 3319 regnode * const nnext = regnext(n);
b2230d39 3320
baa60164
KW
3321 /* XXX I (khw) kind of doubt that this works on platforms (should
3322 * Perl ever run on one) where U8_MAX is above 255 because of lots
3323 * of other assumptions */
79a81a6e 3324 /* Don't join if the sum can't fit into a single node */
b2230d39
KW
3325 if (oldl + STR_LEN(n) > U8_MAX)
3326 break;
538e84ed 3327
07be1b83 3328 DEBUG_PEEP("merg",n,depth);
07be1b83 3329 merged++;
b2230d39 3330
07be1b83
YO
3331 NEXT_OFF(scan) += NEXT_OFF(n);
3332 STR_LEN(scan) += STR_LEN(n);
3333 next = n + NODE_SZ_STR(n);
3334 /* Now we can overwrite *n : */
3335 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3336#ifdef DEBUGGING
3337 stop = next - 1;
3338#endif
3339 n = nnext;
3340 if (stopnow) break;
3341 }
3342
d47053eb
RGS
3343#ifdef EXPERIMENTAL_INPLACESCAN
3344 if (flags && !NEXT_OFF(n)) {
3345 DEBUG_PEEP("atch", val, depth);
3346 if (reg_off_by_arg[OP(n)]) {
3347 ARG_SET(n, val - n);
3348 }
3349 else {
3350 NEXT_OFF(n) = val - n;
3351 }
3352 stopnow = 1;
3353 }
07be1b83
YO
3354#endif
3355 }
2c2b7f86 3356
9d071ca8 3357 *min_subtract = 0;
fb29cc72 3358 *unfolded_multi_char = FALSE;
f646642f 3359
3f410cf6
KW
3360 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3361 * can now analyze for sequences of problematic code points. (Prior to
3362 * this final joining, sequences could have been split over boundaries, and
a0c4c608
KW
3363 * hence missed). The sequences only happen in folding, hence for any
3364 * non-EXACT EXACTish node */
86d6fcad 3365 if (OP(scan) != EXACT) {
31f05a37
KW
3366 U8* s0 = (U8*) STRING(scan);
3367 U8* s = s0;
3368 U8* s_end = s0 + STR_LEN(scan);
3369
3370 int total_count_delta = 0; /* Total delta number of characters that
3371 multi-char folds expand to */
f758bddf
KW
3372
3373 /* One pass is made over the node's string looking for all the
baa60164 3374 * possibilities. To avoid some tests in the loop, there are two main
f758bddf
KW
3375 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3376 * non-UTF-8 */
3377 if (UTF) {
31f05a37
KW
3378 U8* folded = NULL;
3379
3380 if (OP(scan) == EXACTFL) {
3381 U8 *d;
3382
3383 /* An EXACTFL node would already have been changed to another
3384 * node type unless there is at least one character in it that
3385 * is problematic; likely a character whose fold definition
3386 * won't be known until runtime, and so has yet to be folded.
3387 * For all but the UTF-8 locale, folds are 1-1 in length, but
3388 * to handle the UTF-8 case, we need to create a temporary
3389 * folded copy using UTF-8 locale rules in order to analyze it.
3390 * This is because our macros that look to see if a sequence is
3391 * a multi-char fold assume everything is folded (otherwise the
3392 * tests in those macros would be too complicated and slow).
3393 * Note that here, the non-problematic folds will have already
3394 * been done, so we can just copy such characters. We actually
3395 * don't completely fold the EXACTFL string. We skip the
3396 * unfolded multi-char folds, as that would just create work
3397 * below to figure out the size they already are */
3398
3399 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
3400 d = folded;
3401 while (s < s_end) {
3402 STRLEN s_len = UTF8SKIP(s);
3403 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
3404 Copy(s, d, s_len, U8);
3405 d += s_len;
3406 }
3407 else if (is_FOLDS_TO_MULTI_utf8(s)) {
fb29cc72 3408 *unfolded_multi_char = TRUE;
31f05a37
KW
3409 Copy(s, d, s_len, U8);
3410 d += s_len;
3411 }
3412 else if (isASCII(*s)) {
3413 *(d++) = toFOLD(*s);
3414 }
3415 else {
3416 STRLEN len;
3417 _to_utf8_fold_flags(s, d, &len, FOLD_FLAGS_FULL);
3418 d += len;
3419 }
3420 s += s_len;
3421 }
3422
3423 /* Point the remainder of the routine to look at our temporary
3424 * folded copy */
3425 s = folded;
3426 s_end = d;
3427 } /* End of creating folded copy of EXACTFL string */
86d6fcad 3428
0a982f06
KW
3429 /* Examine the string for a multi-character fold sequence. UTF-8
3430 * patterns have all characters pre-folded by the time this code is
3431 * executed */
3432 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3433 length sequence we are looking for is 2 */
86d6fcad 3434 {
538e84ed 3435 int count = 0; /* How many characters in a multi-char fold */
251b239f 3436 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
0a982f06
KW
3437 if (! len) { /* Not a multi-char fold: get next char */
3438 s += UTF8SKIP(s);
3439 continue;
3440 }
bb914485 3441
31f05a37
KW
3442 /* Nodes with 'ss' require special handling, except for
3443 * EXACTFA-ish for which there is no multi-char fold to this */
0a982f06 3444 if (len == 2 && *s == 's' && *(s+1) == 's'
098b07d5
KW
3445 && OP(scan) != EXACTFA
3446 && OP(scan) != EXACTFA_NO_TRIE)
0a982f06
KW
3447 {
3448 count = 2;
31f05a37
KW
3449 if (OP(scan) != EXACTFL) {
3450 OP(scan) = EXACTFU_SS;
3451 }
0a982f06
KW
3452 s += 2;
3453 }
0a982f06 3454 else { /* Here is a generic multi-char fold. */
31f05a37
KW
3455 U8* multi_end = s + len;
3456
2fe3656f
KW
3457 /* Count how many characters are in it. In the case of
3458 * /aa, no folds which contain ASCII code points are
3459 * allowed, so check for those, and skip if found. */