This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Don't output empty debugging text
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
afda64e8 84EXTERN_C const struct regexp_engine my_reg_engine;
54df2634
NC
85#else
86# include "regcomp.h"
87#endif
a687059c 88
04e98a4d 89#include "dquote_static.c"
26faadbd 90#include "charclass_invlists.h"
81e983c1 91#include "inline_invlist.c"
1b0f46bf 92#include "unicode_constants.h"
04e98a4d 93
94dc5c2d 94#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 95#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 96#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
b1603ef8 102
09b2b2e6 103struct RExC_state_t {
514a91f1
DM
104 U32 flags; /* RXf_* are we folding, multilining? */
105 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 106 char *precomp; /* uncompiled string. */
288b8c02 107 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
108 regexp *rx; /* perl core regexp structure */
109 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 110 char *start; /* Start of input for compile */
830247a4
IZ
111 char *end; /* End of input for compile */
112 char *parse; /* Input-scan pointer. */
ea3daa5d 113 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 114 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 115 regnode *emit_bound; /* First regnode outside of the allocated space */
f7c7e32a
DM
116 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
117 implies compiling, so don't emit */
9a81a976
KW
118 regnode_ssc emit_dummy; /* placeholder for emit to point to;
119 large enough for the largest
120 non-EXACTish node, so can use it as
121 scratch in pass1 */
830247a4
IZ
122 I32 naughty; /* How bad is this pattern? */
123 I32 sawback; /* Did we see \1, ...? */
124 U32 seen;
ea3daa5d 125 SSize_t size; /* Code size. */
09a65838 126 I32 npar; /* Capture buffer count, (OPEN) plus one. ("par" 0 is the whole pattern)*/
e2e6a0f1 127 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
128 I32 extralen;
129 I32 seen_zerolen;
40d049e4
YO
130 regnode **open_parens; /* pointers to open parens */
131 regnode **close_parens; /* pointers to close parens */
132 regnode *opend; /* END node in program */
02daf0ab
YO
133 I32 utf8; /* whether the pattern is utf8 or not */
134 I32 orig_utf8; /* whether the pattern was originally in utf8 */
135 /* XXX use this for future optimisation of case
136 * where pattern must be upgraded to utf8. */
e40e74fe
KW
137 I32 uni_semantics; /* If a d charset modifier should use unicode
138 rules, even if the pattern is not in
139 utf8 */
81714fb9 140 HV *paren_names; /* Paren names */
1f1031fe 141
40d049e4
YO
142 regnode **recurse; /* Recurse regops */
143 I32 recurse_count; /* Number of recurse regops */
09a65838
YO
144 U8 *study_chunk_recursed; /* bitmap of which parens we have moved through */
145 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 146 I32 in_lookbehind;
4624b182 147 I32 contains_locale;
cfafade5 148 I32 contains_i;
bb3f3ed2 149 I32 override_recoding;
9d53c457 150 I32 in_multi_char_class;
3d2bd50a 151 struct reg_code_block *code_blocks; /* positions of literal (?{})
68e2671b 152 within pattern */
b1603ef8
DM
153 int num_code_blocks; /* size of code_blocks[] */
154 int code_index; /* next code_blocks[] slot */
830247a4
IZ
155#if ADD_TO_REGEXEC
156 char *starttry; /* -Dr: where regtry was called. */
157#define RExC_starttry (pRExC_state->starttry)
158#endif
d24ca0c5 159 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 160#ifdef DEBUGGING
be8e71aa 161 const char *lastparse;
3dab1dad 162 I32 lastnum;
1f1031fe 163 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
164#define RExC_lastparse (pRExC_state->lastparse)
165#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 166#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 167#endif
09b2b2e6 168};
830247a4 169
e2509266 170#define RExC_flags (pRExC_state->flags)
514a91f1 171#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 172#define RExC_precomp (pRExC_state->precomp)
288b8c02 173#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 174#define RExC_rx (pRExC_state->rx)
f8fc2ecf 175#define RExC_rxi (pRExC_state->rxi)
fac92740 176#define RExC_start (pRExC_state->start)
830247a4
IZ
177#define RExC_end (pRExC_state->end)
178#define RExC_parse (pRExC_state->parse)
179#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
180#ifdef RE_TRACK_PATTERN_OFFSETS
181#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
182#endif
830247a4 183#define RExC_emit (pRExC_state->emit)
f7c7e32a 184#define RExC_emit_dummy (pRExC_state->emit_dummy)
fac92740 185#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 186#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
187#define RExC_naughty (pRExC_state->naughty)
188#define RExC_sawback (pRExC_state->sawback)
189#define RExC_seen (pRExC_state->seen)
190#define RExC_size (pRExC_state->size)
191#define RExC_npar (pRExC_state->npar)
e2e6a0f1 192#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
193#define RExC_extralen (pRExC_state->extralen)
194#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 195#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 196#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 197#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
198#define RExC_open_parens (pRExC_state->open_parens)
199#define RExC_close_parens (pRExC_state->close_parens)
200#define RExC_opend (pRExC_state->opend)
81714fb9 201#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
202#define RExC_recurse (pRExC_state->recurse)
203#define RExC_recurse_count (pRExC_state->recurse_count)
09a65838
YO
204#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
205#define RExC_study_chunk_recursed_bytes (pRExC_state->study_chunk_recursed_bytes)
b57e4118 206#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 207#define RExC_contains_locale (pRExC_state->contains_locale)
cfafade5 208#define RExC_contains_i (pRExC_state->contains_i)
9d53c457
KW
209#define RExC_override_recoding (pRExC_state->override_recoding)
210#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
830247a4 211
cde0cee5 212
a687059c
LW
213#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
214#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
4d68ffa0 215 ((*s) == '{' && regcurly(s, FALSE)))
a687059c
LW
216
217/*
218 * Flags to be passed up and down.
219 */
a687059c 220#define WORST 0 /* Worst case. */
a3b492c3 221#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee 222
e64f369d 223/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
224 * character. (There needs to be a case: in the switch statement in regexec.c
225 * for any node marked SIMPLE.) Note that this is not the same thing as
226 * REGNODE_SIMPLE */
fda99bee 227#define SIMPLE 0x02
e64f369d 228#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
229#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
230#define TRYAGAIN 0x10 /* Weeded out a declaration. */
231#define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
a687059c 232
3dab1dad
YO
233#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
234
07be1b83
YO
235/* whether trie related optimizations are enabled */
236#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
237#define TRIE_STUDY_OPT
786e8c11 238#define FULL_TRIE_STUDY
07be1b83
YO
239#define TRIE_STCLASS
240#endif
1de06328
YO
241
242
40d049e4
YO
243
244#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
245#define PBITVAL(paren) (1 << ((paren) & 7))
246#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
247#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
248#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
249
bbd61b5f 250#define REQUIRE_UTF8 STMT_START { \
8d9c2815
NC
251 if (!UTF) { \
252 *flagp = RESTART_UTF8; \
253 return NULL; \
254 } \
bbd61b5f 255 } STMT_END
40d049e4 256
f19b1a63
KW
257/* This converts the named class defined in regcomp.h to its equivalent class
258 * number defined in handy.h. */
259#define namedclass_to_classnum(class) ((int) ((class) / 2))
260#define classnum_to_namedclass(classnum) ((classnum) * 2)
261
de92f5e6
KW
262#define _invlist_union_complement_2nd(a, b, output) \
263 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
264#define _invlist_intersection_complement_2nd(a, b, output) \
265 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
266
1de06328
YO
267/* About scan_data_t.
268
269 During optimisation we recurse through the regexp program performing
270 various inplace (keyhole style) optimisations. In addition study_chunk
271 and scan_commit populate this data structure with information about
272 what strings MUST appear in the pattern. We look for the longest
3b753521 273 string that must appear at a fixed location, and we look for the
1de06328
YO
274 longest string that may appear at a floating location. So for instance
275 in the pattern:
276
277 /FOO[xX]A.*B[xX]BAR/
278
279 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
280 strings (because they follow a .* construct). study_chunk will identify
281 both FOO and BAR as being the longest fixed and floating strings respectively.
282
283 The strings can be composites, for instance
284
285 /(f)(o)(o)/
286
287 will result in a composite fixed substring 'foo'.
288
289 For each string some basic information is maintained:
290
291 - offset or min_offset
292 This is the position the string must appear at, or not before.
293 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
294 characters must match before the string we are searching for.
295 Likewise when combined with minlenp and the length of the string it
1de06328
YO
296 tells us how many characters must appear after the string we have
297 found.
298
299 - max_offset
300 Only used for floating strings. This is the rightmost point that
ea3daa5d 301 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328
YO
302 string can occur infinitely far to the right.
303
304 - minlenp
2d608413
KW
305 A pointer to the minimum number of characters of the pattern that the
306 string was found inside. This is important as in the case of positive
1de06328
YO
307 lookahead or positive lookbehind we can have multiple patterns
308 involved. Consider
309
310 /(?=FOO).*F/
311
312 The minimum length of the pattern overall is 3, the minimum length
313 of the lookahead part is 3, but the minimum length of the part that
314 will actually match is 1. So 'FOO's minimum length is 3, but the
315 minimum length for the F is 1. This is important as the minimum length
316 is used to determine offsets in front of and behind the string being
317 looked for. Since strings can be composites this is the length of the
486ec47a 318 pattern at the time it was committed with a scan_commit. Note that
1de06328
YO
319 the length is calculated by study_chunk, so that the minimum lengths
320 are not known until the full pattern has been compiled, thus the
321 pointer to the value.
322
323 - lookbehind
324
325 In the case of lookbehind the string being searched for can be
326 offset past the start point of the final matching string.
327 If this value was just blithely removed from the min_offset it would
328 invalidate some of the calculations for how many chars must match
329 before or after (as they are derived from min_offset and minlen and
330 the length of the string being searched for).
331 When the final pattern is compiled and the data is moved from the
332 scan_data_t structure into the regexp structure the information
333 about lookbehind is factored in, with the information that would
334 have been lost precalculated in the end_shift field for the
335 associated string.
336
337 The fields pos_min and pos_delta are used to store the minimum offset
338 and the delta to the maximum offset at the current point in the pattern.
339
340*/
2c2d71f5
JH
341
342typedef struct scan_data_t {
1de06328
YO
343 /*I32 len_min; unused */
344 /*I32 len_delta; unused */
49f55535 345 SSize_t pos_min;
ea3daa5d 346 SSize_t pos_delta;
2c2d71f5 347 SV *last_found;
ea3daa5d 348 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 349 SSize_t last_start_min;
ea3daa5d 350 SSize_t last_start_max;
1de06328
YO
351 SV **longest; /* Either &l_fixed, or &l_float. */
352 SV *longest_fixed; /* longest fixed string found in pattern */
49f55535 353 SSize_t offset_fixed; /* offset where it starts */
ea3daa5d 354 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
355 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
356 SV *longest_float; /* longest floating string found in pattern */
49f55535 357 SSize_t offset_float_min; /* earliest point in string it can appear */
ea3daa5d
FC
358 SSize_t offset_float_max; /* latest point in string it can appear */
359 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
49f55535 360 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
2c2d71f5
JH
361 I32 flags;
362 I32 whilem_c;
49f55535 363 SSize_t *last_closep;
b8f7bb16 364 regnode_ssc *start_class;
2c2d71f5
JH
365} scan_data_t;
366
c02c3054
KW
367/* The below is perhaps overboard, but this allows us to save a test at the
368 * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
369 * and 'a' differ by a single bit; the same with the upper and lower case of
370 * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
371 * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
372 * then inverts it to form a mask, with just a single 0, in the bit position
373 * where the upper- and lowercase differ. XXX There are about 40 other
374 * instances in the Perl core where this micro-optimization could be used.
375 * Should decide if maintenance cost is worse, before changing those
376 *
377 * Returns a boolean as to whether or not 'v' is either a lowercase or
378 * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
379 * compile-time constant, the generated code is better than some optimizing
380 * compilers figure out, amounting to a mask and test. The results are
381 * meaningless if 'c' is not one of [A-Za-z] */
382#define isARG2_lower_or_UPPER_ARG1(c, v) \
383 (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
384
a687059c 385/*
e50aee73 386 * Forward declarations for pregcomp()'s friends.
a687059c 387 */
a0d0e21e 388
27da23d5 389static const scan_data_t zero_scan_data =
1de06328 390 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
391
392#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
393#define SF_BEFORE_SEOL 0x0001
394#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
395#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
396#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
397
44e3dfd2
BF
398#define SF_FIX_SHIFT_EOL (+2)
399#define SF_FL_SHIFT_EOL (+4)
c277df42
IZ
400
401#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
402#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
403
404#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
405#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
406#define SF_IS_INF 0x0040
407#define SF_HAS_PAR 0x0080
408#define SF_IN_PAR 0x0100
409#define SF_HAS_EVAL 0x0200
410#define SCF_DO_SUBSTR 0x0400
653099ff
GS
411#define SCF_DO_STCLASS_AND 0x0800
412#define SCF_DO_STCLASS_OR 0x1000
413#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 414#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 415
786e8c11 416#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 417#define SCF_SEEN_ACCEPT 0x8000
688e0391 418#define SCF_TRIE_DOING_RESTUDY 0x10000
07be1b83 419
43fead97 420#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
421
422/* The enums for all these are ordered so things work out correctly */
a62b1201 423#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
cfaf538b 424#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
00b27cfc 425#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
cfaf538b
KW
426#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
427#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
2f7f8cb1 428#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
a725e29c 429#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 430
43fead97 431#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 432
f2c2a6ab
KW
433/* For programs that want to be strictly Unicode compatible by dying if any
434 * attempt is made to match a non-Unicode code point against a Unicode
435 * property. */
436#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
437
93733859 438#define OOB_NAMEDCLASS -1
b8c5462f 439
8e661ac5
KW
440/* There is no code point that is out-of-bounds, so this is problematic. But
441 * its only current use is to initialize a variable that is always set before
442 * looked at. */
443#define OOB_UNICODE 0xDEADBEEF
444
a0ed51b3
LW
445#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
446#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
447
8615cb43 448
b45f050a
JF
449/* length of regex to show in messages that don't mark a position within */
450#define RegexLengthToShowInErrorMessages 127
451
452/*
453 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
454 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
455 * op/pragma/warn/regcomp.
456 */
7253e4e3
RK
457#define MARKER1 "<-- HERE" /* marker as it appears in the description */
458#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 459
c1d900c3 460#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
b45f050a 461
c1d900c3
BF
462#define REPORT_LOCATION_ARGS(offset) \
463 UTF8fARG(UTF, offset, RExC_precomp), \
464 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
465
b45f050a
JF
466/*
467 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
468 * arg. Show regex, up to a maximum length. If it's too long, chop and add
469 * "...".
470 */
58e23c8d 471#define _FAIL(code) STMT_START { \
bfed75c6 472 const char *ellipses = ""; \
ccb2c380
MP
473 IV len = RExC_end - RExC_precomp; \
474 \
475 if (!SIZE_ONLY) \
a5e7bc51 476 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
477 if (len > RegexLengthToShowInErrorMessages) { \
478 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
479 len = RegexLengthToShowInErrorMessages - 10; \
480 ellipses = "..."; \
481 } \
58e23c8d 482 code; \
ccb2c380 483} STMT_END
8615cb43 484
58e23c8d 485#define FAIL(msg) _FAIL( \
c1d900c3
BF
486 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
487 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
488
489#define FAIL2(msg,arg) _FAIL( \
c1d900c3
BF
490 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
491 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 492
b45f050a 493/*
b45f050a
JF
494 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
495 */
ccb2c380 496#define Simple_vFAIL(m) STMT_START { \
a28509cc 497 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380 498 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
c1d900c3 499 m, REPORT_LOCATION_ARGS(offset)); \
ccb2c380 500} STMT_END
b45f050a
JF
501
502/*
503 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
504 */
ccb2c380
MP
505#define vFAIL(m) STMT_START { \
506 if (!SIZE_ONLY) \
a5e7bc51 507 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
508 Simple_vFAIL(m); \
509} STMT_END
b45f050a
JF
510
511/*
512 * Like Simple_vFAIL(), but accepts two arguments.
513 */
ccb2c380 514#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 515 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
516 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
517 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 518} STMT_END
b45f050a
JF
519
520/*
521 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
522 */
ccb2c380
MP
523#define vFAIL2(m,a1) STMT_START { \
524 if (!SIZE_ONLY) \
a5e7bc51 525 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
526 Simple_vFAIL2(m, a1); \
527} STMT_END
b45f050a
JF
528
529
530/*
531 * Like Simple_vFAIL(), but accepts three arguments.
532 */
ccb2c380 533#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 534 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
535 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
536 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 537} STMT_END
b45f050a
JF
538
539/*
540 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
541 */
ccb2c380
MP
542#define vFAIL3(m,a1,a2) STMT_START { \
543 if (!SIZE_ONLY) \
a5e7bc51 544 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
545 Simple_vFAIL3(m, a1, a2); \
546} STMT_END
b45f050a
JF
547
548/*
549 * Like Simple_vFAIL(), but accepts four arguments.
550 */
ccb2c380 551#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 552 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
553 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
554 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 555} STMT_END
b45f050a 556
95db3ffa
KW
557#define vFAIL4(m,a1,a2,a3) STMT_START { \
558 if (!SIZE_ONLY) \
559 SAVEFREESV(RExC_rx_sv); \
560 Simple_vFAIL4(m, a1, a2, a3); \
561} STMT_END
562
946095af
BF
563/* A specialized version of vFAIL2 that works with UTF8f */
564#define vFAIL2utf8f(m, a1) STMT_START { \
ef3f731d
BF
565 const IV offset = RExC_parse - RExC_precomp; \
566 if (!SIZE_ONLY) \
567 SAVEFREESV(RExC_rx_sv); \
568 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
569 REPORT_LOCATION_ARGS(offset)); \
946095af
BF
570} STMT_END
571
572
5e0a247b
KW
573/* m is not necessarily a "literal string", in this macro */
574#define reg_warn_non_literal_string(loc, m) STMT_START { \
575 const IV offset = loc - RExC_precomp; \
576 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
c1d900c3 577 m, REPORT_LOCATION_ARGS(offset)); \
5e0a247b
KW
578} STMT_END
579
668c081a 580#define ckWARNreg(loc,m) STMT_START { \
a28509cc 581 const IV offset = loc - RExC_precomp; \
f10f4c18 582 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 583 REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
584} STMT_END
585
0d6106aa
KW
586#define vWARN_dep(loc, m) STMT_START { \
587 const IV offset = loc - RExC_precomp; \
588 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
c1d900c3 589 REPORT_LOCATION_ARGS(offset)); \
0d6106aa
KW
590} STMT_END
591
147508a2
KW
592#define ckWARNdep(loc,m) STMT_START { \
593 const IV offset = loc - RExC_precomp; \
594 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
595 m REPORT_LOCATION, \
c1d900c3 596 REPORT_LOCATION_ARGS(offset)); \
147508a2
KW
597} STMT_END
598
668c081a 599#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 600 const IV offset = loc - RExC_precomp; \
d1d15184 601 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18 602 m REPORT_LOCATION, \
c1d900c3 603 REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
604} STMT_END
605
b23eb183 606#define ckWARN2reg_d(loc,m, a1) STMT_START { \
2335b3d3 607 const IV offset = loc - RExC_precomp; \
b23eb183 608 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
2335b3d3 609 m REPORT_LOCATION, \
c1d900c3 610 a1, REPORT_LOCATION_ARGS(offset)); \
2335b3d3
KW
611} STMT_END
612
668c081a 613#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 614 const IV offset = loc - RExC_precomp; \
668c081a 615 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 616 a1, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
617} STMT_END
618
619#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 620 const IV offset = loc - RExC_precomp; \
ccb2c380 621 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 622 a1, a2, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
623} STMT_END
624
668c081a
NC
625#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
626 const IV offset = loc - RExC_precomp; \
627 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 628 a1, a2, REPORT_LOCATION_ARGS(offset)); \
668c081a
NC
629} STMT_END
630
ccb2c380 631#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 632 const IV offset = loc - RExC_precomp; \
ccb2c380 633 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 634 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
635} STMT_END
636
668c081a
NC
637#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
638 const IV offset = loc - RExC_precomp; \
639 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 640 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
668c081a
NC
641} STMT_END
642
ccb2c380 643#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 644 const IV offset = loc - RExC_precomp; \
ccb2c380 645 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 646 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
ccb2c380 647} STMT_END
9d1d55b5 648
8615cb43 649
cd439c50 650/* Allow for side effects in s */
ccb2c380
MP
651#define REGC(c,s) STMT_START { \
652 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
653} STMT_END
cd439c50 654
fac92740
MJD
655/* Macros for recording node offsets. 20001227 mjd@plover.com
656 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
657 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
658 * Element 0 holds the number n.
07be1b83 659 * Position is 1 indexed.
fac92740 660 */
7122b237
YO
661#ifndef RE_TRACK_PATTERN_OFFSETS
662#define Set_Node_Offset_To_R(node,byte)
663#define Set_Node_Offset(node,byte)
664#define Set_Cur_Node_Offset
665#define Set_Node_Length_To_R(node,len)
666#define Set_Node_Length(node,len)
6a86c6ad 667#define Set_Node_Cur_Length(node,start)
7122b237
YO
668#define Node_Offset(n)
669#define Node_Length(n)
670#define Set_Node_Offset_Length(node,offset,len)
671#define ProgLen(ri) ri->u.proglen
672#define SetProgLen(ri,x) ri->u.proglen = x
673#else
674#define ProgLen(ri) ri->u.offsets[0]
675#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
676#define Set_Node_Offset_To_R(node,byte) STMT_START { \
677 if (! SIZE_ONLY) { \
678 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 679 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 680 if((node) < 0) { \
551405c4 681 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
682 } else { \
683 RExC_offsets[2*(node)-1] = (byte); \
684 } \
685 } \
686} STMT_END
687
688#define Set_Node_Offset(node,byte) \
689 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
690#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
691
692#define Set_Node_Length_To_R(node,len) STMT_START { \
693 if (! SIZE_ONLY) { \
694 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 695 __LINE__, (int)(node), (int)(len))); \
ccb2c380 696 if((node) < 0) { \
551405c4 697 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
698 } else { \
699 RExC_offsets[2*(node)] = (len); \
700 } \
701 } \
702} STMT_END
703
704#define Set_Node_Length(node,len) \
705 Set_Node_Length_To_R((node)-RExC_emit_start, len)
6a86c6ad
NC
706#define Set_Node_Cur_Length(node, start) \
707 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
708
709/* Get offsets and lengths */
710#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
711#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
712
07be1b83
YO
713#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
714 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
715 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
716} STMT_END
7122b237 717#endif
07be1b83
YO
718
719#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
720#define EXPERIMENTAL_INPLACESCAN
f427392e 721#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 722
9e9ecfdf
YO
723#define DEBUG_RExC_seen() \
724 DEBUG_OPTIMISE_MORE_r({ \
725 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
726 \
727 if (RExC_seen & REG_SEEN_ZERO_LEN) \
728 PerlIO_printf(Perl_debug_log,"REG_SEEN_ZERO_LEN "); \
729 \
730 if (RExC_seen & REG_SEEN_LOOKBEHIND) \
731 PerlIO_printf(Perl_debug_log,"REG_SEEN_LOOKBEHIND "); \
732 \
733 if (RExC_seen & REG_SEEN_GPOS) \
734 PerlIO_printf(Perl_debug_log,"REG_SEEN_GPOS "); \
735 \
736 if (RExC_seen & REG_SEEN_CANY) \
737 PerlIO_printf(Perl_debug_log,"REG_SEEN_CANY "); \
738 \
739 if (RExC_seen & REG_SEEN_RECURSE) \
740 PerlIO_printf(Perl_debug_log,"REG_SEEN_RECURSE "); \
741 \
742 if (RExC_seen & REG_TOP_LEVEL_BRANCHES) \
743 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES "); \
744 \
745 if (RExC_seen & REG_SEEN_VERBARG) \
746 PerlIO_printf(Perl_debug_log,"REG_SEEN_VERBARG "); \
747 \
748 if (RExC_seen & REG_SEEN_CUTGROUP) \
749 PerlIO_printf(Perl_debug_log,"REG_SEEN_CUTGROUP "); \
750 \
751 if (RExC_seen & REG_SEEN_RUN_ON_COMMENT) \
752 PerlIO_printf(Perl_debug_log,"REG_SEEN_RUN_ON_COMMENT "); \
753 \
754 if (RExC_seen & REG_SEEN_EXACTF_SHARP_S) \
755 PerlIO_printf(Perl_debug_log,"REG_SEEN_EXACTF_SHARP_S "); \
756 \
757 if (RExC_seen & REG_SEEN_GOSTART) \
758 PerlIO_printf(Perl_debug_log,"REG_SEEN_GOSTART "); \
759 \
760 PerlIO_printf(Perl_debug_log,"\n"); \
761 });
762
304ee84b
YO
763#define DEBUG_STUDYDATA(str,data,depth) \
764DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 765 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
766 "%*s" str "Pos:%"IVdf"/%"IVdf \
767 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
768 (int)(depth)*2, "", \
769 (IV)((data)->pos_min), \
770 (IV)((data)->pos_delta), \
304ee84b 771 (UV)((data)->flags), \
1de06328 772 (IV)((data)->whilem_c), \
304ee84b
YO
773 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
774 is_inf ? "INF " : "" \
1de06328
YO
775 ); \
776 if ((data)->last_found) \
777 PerlIO_printf(Perl_debug_log, \
778 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
779 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
780 SvPVX_const((data)->last_found), \
781 (IV)((data)->last_end), \
782 (IV)((data)->last_start_min), \
783 (IV)((data)->last_start_max), \
784 ((data)->longest && \
785 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
786 SvPVX_const((data)->longest_fixed), \
787 (IV)((data)->offset_fixed), \
788 ((data)->longest && \
789 (data)->longest==&((data)->longest_float)) ? "*" : "", \
790 SvPVX_const((data)->longest_float), \
791 (IV)((data)->offset_float_min), \
792 (IV)((data)->offset_float_max) \
793 ); \
794 PerlIO_printf(Perl_debug_log,"\n"); \
795});
796
653099ff 797/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 798 Update the longest found anchored substring and the longest found
653099ff
GS
799 floating substrings if needed. */
800
4327152a 801STATIC void
ea3daa5d
FC
802S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
803 SSize_t *minlenp, int is_inf)
c277df42 804{
e1ec3a88
AL
805 const STRLEN l = CHR_SVLEN(data->last_found);
806 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 807 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 808
7918f24d
NC
809 PERL_ARGS_ASSERT_SCAN_COMMIT;
810
c277df42 811 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 812 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
813 if (*data->longest == data->longest_fixed) {
814 data->offset_fixed = l ? data->last_start_min : data->pos_min;
815 if (data->flags & SF_BEFORE_EOL)
b81d288d 816 data->flags
c277df42
IZ
817 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
818 else
819 data->flags &= ~SF_FIX_BEFORE_EOL;
686b73d4 820 data->minlen_fixed=minlenp;
1de06328 821 data->lookbehind_fixed=0;
a0ed51b3 822 }
304ee84b 823 else { /* *data->longest == data->longest_float */
c277df42 824 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
825 data->offset_float_max = (l
826 ? data->last_start_max
ea3daa5d
FC
827 : (data->pos_delta == SSize_t_MAX
828 ? SSize_t_MAX
829 : data->pos_min + data->pos_delta));
830 if (is_inf
831 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
832 data->offset_float_max = SSize_t_MAX;
c277df42 833 if (data->flags & SF_BEFORE_EOL)
b81d288d 834 data->flags
c277df42
IZ
835 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
836 else
837 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
838 data->minlen_float=minlenp;
839 data->lookbehind_float=0;
c277df42
IZ
840 }
841 }
842 SvCUR_set(data->last_found, 0);
0eda9292 843 {
a28509cc 844 SV * const sv = data->last_found;
097eb12c
AL
845 if (SvUTF8(sv) && SvMAGICAL(sv)) {
846 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
847 if (mg)
848 mg->mg_len = 0;
849 }
0eda9292 850 }
c277df42
IZ
851 data->last_end = -1;
852 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 853 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
854}
855
cdd87c1d
KW
856/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
857 * list that describes which code points it matches */
858
653099ff 859STATIC void
3420edd7 860S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 861{
cdd87c1d
KW
862 /* Set the SSC 'ssc' to match an empty string or any code point */
863
557bd3fb 864 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 865
71068078 866 assert(is_ANYOF_SYNTHETIC(ssc));
3fffb88a 867
cdd87c1d
KW
868 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
869 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
a0dd4231 870 ANYOF_FLAGS(ssc) |= ANYOF_EMPTY_STRING; /* Plus match empty string */
653099ff
GS
871}
872
653099ff 873STATIC int
cdd87c1d 874S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
653099ff 875{
c144baaa
KW
876 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
877 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
878 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
879 * in any way, so there's no point in using it */
cdd87c1d
KW
880
881 UV start, end;
882 bool ret;
653099ff 883
557bd3fb 884 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 885
71068078 886 assert(is_ANYOF_SYNTHETIC(ssc));
cdd87c1d 887
77ebeeba 888 if (! (ANYOF_FLAGS(ssc) & ANYOF_EMPTY_STRING)) {
cdd87c1d
KW
889 return FALSE;
890 }
891
892 /* See if the list consists solely of the range 0 - Infinity */
893 invlist_iterinit(ssc->invlist);
894 ret = invlist_iternext(ssc->invlist, &start, &end)
895 && start == 0
896 && end == UV_MAX;
897
898 invlist_iterfinish(ssc->invlist);
899
900 if (ret) {
901 return TRUE;
902 }
903
904 /* If e.g., both \w and \W are set, matches everything */
905 if (ANYOF_FLAGS(ssc) & ANYOF_POSIXL) {
906 int i;
907 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
908 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
909 return TRUE;
910 }
911 }
912 }
913
914 return FALSE;
653099ff
GS
915}
916
653099ff 917STATIC void
cdd87c1d 918S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 919{
cdd87c1d
KW
920 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
921 * string, any code point, or any posix class under locale */
922
557bd3fb 923 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 924
557bd3fb 925 Zero(ssc, 1, regnode_ssc);
71068078 926 set_ANYOF_SYNTHETIC(ssc);
cdd87c1d 927 ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
3420edd7 928 ssc_anything(ssc);
cdd87c1d
KW
929
930 /* If any portion of the regex is to operate under locale rules,
931 * initialization includes it. The reason this isn't done for all regexes
932 * is that the optimizer was written under the assumption that locale was
933 * all-or-nothing. Given the complexity and lack of documentation in the
934 * optimizer, and that there are inadequate test cases for locale, many
935 * parts of it may not work properly, it is safest to avoid locale unless
936 * necessary. */
937 if (RExC_contains_locale) {
938 ANYOF_POSIXL_SETALL(ssc);
cfafade5
KW
939 ANYOF_FLAGS(ssc) |= ANYOF_LOCALE|ANYOF_POSIXL;
940 if (RExC_contains_i) {
941 ANYOF_FLAGS(ssc) |= ANYOF_LOC_FOLD;
942 }
cdd87c1d
KW
943 }
944 else {
945 ANYOF_POSIXL_ZERO(ssc);
946 }
653099ff
GS
947}
948
b423522f
KW
949STATIC int
950S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
951 const regnode_ssc *ssc)
952{
953 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
954 * to the list of code points matched, and locale posix classes; hence does
955 * not check its flags) */
956
957 UV start, end;
958 bool ret;
959
960 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
961
71068078 962 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
963
964 invlist_iterinit(ssc->invlist);
965 ret = invlist_iternext(ssc->invlist, &start, &end)
966 && start == 0
967 && end == UV_MAX;
968
969 invlist_iterfinish(ssc->invlist);
970
971 if (! ret) {
972 return FALSE;
973 }
974
975 if (RExC_contains_locale) {
976 if (! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE)
977 || ! (ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
978 || ! ANYOF_POSIXL_TEST_ALL_SET(ssc))
979 {
980 return FALSE;
981 }
982 if (RExC_contains_i && ! (ANYOF_FLAGS(ssc) & ANYOF_LOC_FOLD)) {
983 return FALSE;
984 }
985 }
986
987 return TRUE;
988}
989
990STATIC SV*
991S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
992 const regnode_charclass_posixl* const node)
993{
994 /* Returns a mortal inversion list defining which code points are matched
995 * by 'node', which is of type ANYOF. Handles complementing the result if
996 * appropriate. If some code points aren't knowable at this time, the
997 * returned list must, and will, contain every possible code point. */
998
999 SV* invlist = sv_2mortal(_new_invlist(0));
1000 unsigned int i;
1001 const U32 n = ARG(node);
1002
1003 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1004
1005 /* Look at the data structure created by S_set_ANYOF_arg() */
1006 if (n != ANYOF_NONBITMAP_EMPTY) {
1007 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1008 AV * const av = MUTABLE_AV(SvRV(rv));
1009 SV **const ary = AvARRAY(av);
1010 assert(RExC_rxi->data->what[n] == 's');
1011
1012 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1013 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1014 }
1015 else if (ary[0] && ary[0] != &PL_sv_undef) {
1016
1017 /* Here, no compile-time swash, and there are things that won't be
1018 * known until runtime -- we have to assume it could be anything */
1019 return _add_range_to_invlist(invlist, 0, UV_MAX);
1020 }
1021 else {
1022
1023 /* Here no compile-time swash, and no run-time only data. Use the
1024 * node's inversion list */
1025 invlist = sv_2mortal(invlist_clone(ary[2]));
1026 }
1027 }
1028
1029 /* An ANYOF node contains a bitmap for the first 256 code points, and an
1030 * inversion list for the others, but if there are code points that should
1031 * match only conditionally on the target string being UTF-8, those are
1032 * placed in the inversion list, and not the bitmap. Since there are
1033 * circumstances under which they could match, they are included in the
1034 * SSC. But if the ANYOF node is to be inverted, we have to exclude them
1035 * here, so that when we invert below, the end result actually does include
1036 * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here
1037 * before we add the unconditionally matched code points */
1038 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1039 _invlist_intersection_complement_2nd(invlist,
1040 PL_UpperLatin1,
1041 &invlist);
1042 }
1043
1044 /* Add in the points from the bit map */
1045 for (i = 0; i < 256; i++) {
1046 if (ANYOF_BITMAP_TEST(node, i)) {
1047 invlist = add_cp_to_invlist(invlist, i);
1048 }
1049 }
1050
1051 /* If this can match all upper Latin1 code points, have to add them
1052 * as well */
51ca120f 1053 if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_NON_ASCII_ALL) {
b423522f
KW
1054 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1055 }
1056
1057 /* Similarly for these */
1058 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
1059 invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
1060 }
1061
1062 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1063 _invlist_invert(invlist);
1064 }
1065
1066 return invlist;
1067}
1068
1051e1c4 1069/* These two functions currently do the exact same thing */
557bd3fb 1070#define ssc_init_zero ssc_init
653099ff 1071
cdd87c1d
KW
1072#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1073#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1074
b423522f
KW
1075STATIC void
1076S_ssc_flags_and(regnode_ssc *ssc, const U8 and_with)
1077{
1078 /* Take the flags 'and_with' and accumulate them anded into the flags for
a0dd4231
KW
1079 * the SSC 'ssc'. The non-SSC related flags in 'and_with' are ignored.
1080 * The flags 'and_with' should not come from another SSC (otherwise the
1081 * EMPTY_STRING flag won't work) */
b423522f 1082
eff8b7dc 1083 const U8 ssc_only_flags = ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS;
b423522f
KW
1084
1085 PERL_ARGS_ASSERT_SSC_FLAGS_AND;
1086
1087 /* Use just the SSC-related flags from 'and_with' */
eff8b7dc 1088 ANYOF_FLAGS(ssc) &= (and_with & ANYOF_COMMON_FLAGS);
b423522f
KW
1089 ANYOF_FLAGS(ssc) |= ssc_only_flags;
1090}
1091
557bd3fb 1092/* 'AND' a given class with another one. Can create false positives. 'ssc'
8efd3f97 1093 * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if
b8f7bb16 1094 * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1095
653099ff 1096STATIC void
b423522f
KW
1097S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1098 const regnode_ssc *and_with)
653099ff 1099{
cdd87c1d
KW
1100 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1101 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1102
a0dd4231
KW
1103 SV* anded_cp_list;
1104 U8 anded_flags;
1e6ade67 1105
cdd87c1d 1106 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1107
71068078 1108 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1109
1110 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1111 * the code point inversion list and just the relevant flags */
71068078 1112 if (is_ANYOF_SYNTHETIC(and_with)) {
a0dd4231
KW
1113 anded_cp_list = and_with->invlist;
1114 anded_flags = ANYOF_FLAGS(and_with);
e9b08962
KW
1115
1116 /* XXX This is a kludge around what appears to be deficiencies in the
1117 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1118 * there are paths through the optimizer where it doesn't get weeded
1119 * out when it should. And if we don't make some extra provision for
1120 * it like the code just below, it doesn't get added when it should.
1121 * This solution is to add it only when AND'ing, which is here, and
1122 * only when what is being AND'ed is the pristine, original node
1123 * matching anything. Thus it is like adding it to ssc_anything() but
1124 * only when the result is to be AND'ed. Probably the same solution
1125 * could be adopted for the same problem we have with /l matching,
1126 * which is solved differently in S_ssc_init(), and that would lead to
1127 * fewer false positives than that solution has. But if this solution
1128 * creates bugs, the consequences are only that a warning isn't raised
1129 * that should be; while the consequences for having /l bugs is
1130 * incorrect matches */
1131 if (ssc_is_anything(and_with)) {
1132 anded_flags |= ANYOF_WARN_SUPER;
1133 }
a0dd4231
KW
1134 }
1135 else {
1136 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1137 (regnode_charclass_posixl*) and_with);
eff8b7dc 1138 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
a0dd4231
KW
1139 }
1140
1141 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1142
1143 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1144 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1145 * 'and_with' may be inverted. When not inverted, we have the situation of
1146 * computing:
1147 * (C1 | P1) & (C2 | P2)
1148 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1149 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1150 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1151 * <= ((C1 & C2) | P1 | P2)
1152 * Alternatively, the last few steps could be:
1153 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1154 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1155 * <= (C1 | C2 | (P1 & P2))
1156 * We favor the second approach if either P1 or P2 is non-empty. This is
1157 * because these components are a barrier to doing optimizations, as what
1158 * they match cannot be known until the moment of matching as they are
1159 * dependent on the current locale, 'AND"ing them likely will reduce or
1160 * eliminate them.
1161 * But we can do better if we know that C1,P1 are in their initial state (a
1162 * frequent occurrence), each matching everything:
1163 * (<everything>) & (C2 | P2) = C2 | P2
1164 * Similarly, if C2,P2 are in their initial state (again a frequent
1165 * occurrence), the result is a no-op
1166 * (C1 | P1) & (<everything>) = C1 | P1
1167 *
1168 * Inverted, we have
1169 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1170 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1171 * <= (C1 & ~C2) | (P1 & ~P2)
1172 * */
1aa99e6b 1173
a0dd4231 1174 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
71068078 1175 && ! is_ANYOF_SYNTHETIC(and_with))
a0dd4231 1176 {
cdd87c1d 1177 unsigned int i;
8951c461 1178
cdd87c1d
KW
1179 ssc_intersection(ssc,
1180 anded_cp_list,
1181 FALSE /* Has already been inverted */
1182 );
c6b76537 1183
cdd87c1d
KW
1184 /* If either P1 or P2 is empty, the intersection will be also; can skip
1185 * the loop */
1186 if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1187 ANYOF_POSIXL_ZERO(ssc);
1188 }
1189 else if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1190
1191 /* Note that the Posix class component P from 'and_with' actually
1192 * looks like:
1193 * P = Pa | Pb | ... | Pn
1194 * where each component is one posix class, such as in [\w\s].
1195 * Thus
1196 * ~P = ~(Pa | Pb | ... | Pn)
1197 * = ~Pa & ~Pb & ... & ~Pn
1198 * <= ~Pa | ~Pb | ... | ~Pn
1199 * The last is something we can easily calculate, but unfortunately
1200 * is likely to have many false positives. We could do better
1201 * in some (but certainly not all) instances if two classes in
1202 * P have known relationships. For example
1203 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1204 * So
1205 * :lower: & :print: = :lower:
1206 * And similarly for classes that must be disjoint. For example,
1207 * since \s and \w can have no elements in common based on rules in
1208 * the POSIX standard,
1209 * \w & ^\S = nothing
1210 * Unfortunately, some vendor locales do not meet the Posix
1211 * standard, in particular almost everything by Microsoft.
1212 * The loop below just changes e.g., \w into \W and vice versa */
1213
1214 regnode_charclass_posixl temp;
1215 int add = 1; /* To calculate the index of the complement */
1216
1217 ANYOF_POSIXL_ZERO(&temp);
1218 for (i = 0; i < ANYOF_MAX; i++) {
1219 assert(i % 2 != 0
1220 || ! ANYOF_POSIXL_TEST(and_with, i)
1221 || ! ANYOF_POSIXL_TEST(and_with, i + 1));
1222
1223 if (ANYOF_POSIXL_TEST(and_with, i)) {
1224 ANYOF_POSIXL_SET(&temp, i + add);
1225 }
1226 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1227 }
1228 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1229
cdd87c1d
KW
1230 } /* else ssc already has no posixes */
1231 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1232 in its initial state */
71068078 1233 else if (! is_ANYOF_SYNTHETIC(and_with)
cdd87c1d
KW
1234 || ! ssc_is_cp_posixl_init(pRExC_state, and_with))
1235 {
1236 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1237 * copy it over 'ssc' */
1238 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
71068078 1239 if (is_ANYOF_SYNTHETIC(and_with)) {
cdd87c1d
KW
1240 StructCopy(and_with, ssc, regnode_ssc);
1241 }
1242 else {
1243 ssc->invlist = anded_cp_list;
1244 ANYOF_POSIXL_ZERO(ssc);
1245 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1246 ANYOF_POSIXL_OR(and_with, ssc);
1247 }
1248 }
1249 }
1250 else if ((ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
1251 || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1252 {
1253 /* One or the other of P1, P2 is non-empty. */
1254 ANYOF_POSIXL_AND(and_with, ssc);
1255 ssc_union(ssc, anded_cp_list, FALSE);
1256 }
1257 else { /* P1 = P2 = empty */
1258 ssc_intersection(ssc, anded_cp_list, FALSE);
1259 }
137165a6 1260 }
653099ff
GS
1261}
1262
653099ff 1263STATIC void
cdd87c1d
KW
1264S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1265 const regnode_ssc *or_with)
653099ff 1266{
cdd87c1d
KW
1267 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1268 * another SSC or a regular ANYOF class. Can create false positives if
1269 * 'or_with' is to be inverted. */
7918f24d 1270
a0dd4231
KW
1271 SV* ored_cp_list;
1272 U8 ored_flags;
c6b76537 1273
cdd87c1d 1274 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 1275
71068078 1276 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1277
1278 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1279 * the code point inversion list and just the relevant flags */
71068078 1280 if (is_ANYOF_SYNTHETIC(or_with)) {
a0dd4231
KW
1281 ored_cp_list = or_with->invlist;
1282 ored_flags = ANYOF_FLAGS(or_with);
1283 }
1284 else {
1285 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state,
1286 (regnode_charclass_posixl*) or_with);
eff8b7dc 1287 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
a0dd4231
KW
1288 }
1289
1290 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
1291
1292 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1293 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1294 * 'or_with' may be inverted. When not inverted, we have the simple
1295 * situation of computing:
1296 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1297 * If P1|P2 yields a situation with both a class and its complement are
1298 * set, like having both \w and \W, this matches all code points, and we
1299 * can delete these from the P component of the ssc going forward. XXX We
1300 * might be able to delete all the P components, but I (khw) am not certain
1301 * about this, and it is better to be safe.
1302 *
1303 * Inverted, we have
1304 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1305 * <= (C1 | P1) | ~C2
1306 * <= (C1 | ~C2) | P1
1307 * (which results in actually simpler code than the non-inverted case)
1308 * */
9826f543 1309
a0dd4231 1310 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
71068078 1311 && ! is_ANYOF_SYNTHETIC(or_with))
a0dd4231 1312 {
cdd87c1d
KW
1313 /* We ignore P2, leaving P1 going forward */
1314 }
1315 else { /* Not inverted */
1316 ANYOF_POSIXL_OR(or_with, ssc);
1317 if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1318 unsigned int i;
1319 for (i = 0; i < ANYOF_MAX; i += 2) {
1320 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1321 {
1322 ssc_match_all_cp(ssc);
1323 ANYOF_POSIXL_CLEAR(ssc, i);
1324 ANYOF_POSIXL_CLEAR(ssc, i+1);
1325 if (! ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1326 ANYOF_FLAGS(ssc) &= ~ANYOF_POSIXL;
1327 }
1328 }
1329 }
1330 }
1aa99e6b 1331 }
cdd87c1d
KW
1332
1333 ssc_union(ssc,
1334 ored_cp_list,
1335 FALSE /* Already has been inverted */
1336 );
653099ff
GS
1337}
1338
b423522f
KW
1339PERL_STATIC_INLINE void
1340S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1341{
1342 PERL_ARGS_ASSERT_SSC_UNION;
1343
71068078 1344 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1345
1346 _invlist_union_maybe_complement_2nd(ssc->invlist,
1347 invlist,
1348 invert2nd,
1349 &ssc->invlist);
1350}
1351
1352PERL_STATIC_INLINE void
1353S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1354 SV* const invlist,
1355 const bool invert2nd)
1356{
1357 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1358
71068078 1359 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1360
1361 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1362 invlist,
1363 invert2nd,
1364 &ssc->invlist);
1365}
1366
1367PERL_STATIC_INLINE void
1368S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1369{
1370 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1371
71068078 1372 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1373
1374 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1375}
1376
1377PERL_STATIC_INLINE void
1378S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1379{
1380 /* AND just the single code point 'cp' into the SSC 'ssc' */
1381
1382 SV* cp_list = _new_invlist(2);
1383
1384 PERL_ARGS_ASSERT_SSC_CP_AND;
1385
71068078 1386 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1387
1388 cp_list = add_cp_to_invlist(cp_list, cp);
1389 ssc_intersection(ssc, cp_list,
1390 FALSE /* Not inverted */
1391 );
1392 SvREFCNT_dec_NN(cp_list);
1393}
1394
1395PERL_STATIC_INLINE void
1396S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
1397{
1398 /* Set the SSC 'ssc' to not match any locale things */
1399
1400 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1401
71068078 1402 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1403
1404 ANYOF_POSIXL_ZERO(ssc);
1405 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1406}
1407
1408STATIC void
1409S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1410{
1411 /* The inversion list in the SSC is marked mortal; now we need a more
1412 * permanent copy, which is stored the same way that is done in a regular
1413 * ANYOF node, with the first 256 code points in a bit map */
1414
1415 SV* invlist = invlist_clone(ssc->invlist);
1416
1417 PERL_ARGS_ASSERT_SSC_FINALIZE;
1418
71068078 1419 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f 1420
a0dd4231
KW
1421 /* The code in this file assumes that all but these flags aren't relevant
1422 * to the SSC, except ANYOF_EMPTY_STRING, which should be cleared by the
1423 * time we reach here */
eff8b7dc 1424 assert(! (ANYOF_FLAGS(ssc) & ~ANYOF_COMMON_FLAGS));
a0dd4231 1425
b423522f
KW
1426 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1427
1428 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL, FALSE);
1429
1430 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE) || RExC_contains_locale);
1431}
1432
a3621e74
YO
1433#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1434#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1435#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1436#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1437
3dab1dad
YO
1438
1439#ifdef DEBUGGING
07be1b83 1440/*
2b8b4781
NC
1441 dump_trie(trie,widecharmap,revcharmap)
1442 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1443 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
1444
1445 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
1446 The _interim_ variants are used for debugging the interim
1447 tables that are used to generate the final compressed
1448 representation which is what dump_trie expects.
1449
486ec47a 1450 Part of the reason for their existence is to provide a form
3dab1dad 1451 of documentation as to how the different representations function.
07be1b83
YO
1452
1453*/
3dab1dad
YO
1454
1455/*
3dab1dad
YO
1456 Dumps the final compressed table form of the trie to Perl_debug_log.
1457 Used for debugging make_trie().
1458*/
b9a59e08 1459
3dab1dad 1460STATIC void
2b8b4781
NC
1461S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1462 AV *revcharmap, U32 depth)
3dab1dad
YO
1463{
1464 U32 state;
ab3bbdeb 1465 SV *sv=sv_newmortal();
55eed653 1466 int colwidth= widecharmap ? 6 : 4;
2e64971a 1467 U16 word;
3dab1dad
YO
1468 GET_RE_DEBUG_FLAGS_DECL;
1469
7918f24d 1470 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 1471
3dab1dad
YO
1472 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1473 (int)depth * 2 + 2,"",
1474 "Match","Base","Ofs" );
1475
1476 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 1477 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 1478 if ( tmp ) {
ab3bbdeb
YO
1479 PerlIO_printf( Perl_debug_log, "%*s",
1480 colwidth,
ddc5bc0f 1481 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1482 PL_colors[0], PL_colors[1],
1483 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1484 PERL_PV_ESCAPE_FIRSTCHAR
1485 )
1486 );
3dab1dad
YO
1487 }
1488 }
1489 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1490 (int)depth * 2 + 2,"");
1491
1492 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 1493 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
1494 PerlIO_printf( Perl_debug_log, "\n");
1495
1e2e3d02 1496 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 1497 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
1498
1499 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1500
1501 if ( trie->states[ state ].wordnum ) {
1502 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1503 } else {
1504 PerlIO_printf( Perl_debug_log, "%6s", "" );
1505 }
1506
1507 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1508
1509 if ( base ) {
1510 U32 ofs = 0;
1511
1512 while( ( base + ofs < trie->uniquecharcount ) ||
1513 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1514 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1515 ofs++;
1516
1517 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1518
1519 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1520 if ( ( base + ofs >= trie->uniquecharcount ) &&
1521 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1522 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1523 {
ab3bbdeb
YO
1524 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1525 colwidth,
3dab1dad
YO
1526 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1527 } else {
ab3bbdeb 1528 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
1529 }
1530 }
1531
1532 PerlIO_printf( Perl_debug_log, "]");
1533
1534 }
1535 PerlIO_printf( Perl_debug_log, "\n" );
1536 }
2e64971a
DM
1537 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1538 for (word=1; word <= trie->wordcount; word++) {
1539 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1540 (int)word, (int)(trie->wordinfo[word].prev),
1541 (int)(trie->wordinfo[word].len));
1542 }
1543 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
1544}
1545/*
3dab1dad
YO
1546 Dumps a fully constructed but uncompressed trie in list form.
1547 List tries normally only are used for construction when the number of
1548 possible chars (trie->uniquecharcount) is very high.
1549 Used for debugging make_trie().
1550*/
1551STATIC void
55eed653 1552S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1553 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1554 U32 depth)
3dab1dad
YO
1555{
1556 U32 state;
ab3bbdeb 1557 SV *sv=sv_newmortal();
55eed653 1558 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1559 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1560
1561 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1562
3dab1dad 1563 /* print out the table precompression. */
ab3bbdeb
YO
1564 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1565 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1566 "------:-----+-----------------\n" );
3dab1dad
YO
1567
1568 for( state=1 ; state < next_alloc ; state ++ ) {
1569 U16 charid;
1570
ab3bbdeb 1571 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1572 (int)depth * 2 + 2,"", (UV)state );
1573 if ( ! trie->states[ state ].wordnum ) {
1574 PerlIO_printf( Perl_debug_log, "%5s| ","");
1575 } else {
1576 PerlIO_printf( Perl_debug_log, "W%4x| ",
1577 trie->states[ state ].wordnum
1578 );
1579 }
1580 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1581 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1582 if ( tmp ) {
1583 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1584 colwidth,
ddc5bc0f 1585 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1586 PL_colors[0], PL_colors[1],
1587 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1588 PERL_PV_ESCAPE_FIRSTCHAR
1589 ) ,
1e2e3d02
YO
1590 TRIE_LIST_ITEM(state,charid).forid,
1591 (UV)TRIE_LIST_ITEM(state,charid).newstate
1592 );
1593 if (!(charid % 10))
664e119d
RGS
1594 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1595 (int)((depth * 2) + 14), "");
1e2e3d02 1596 }
ab3bbdeb
YO
1597 }
1598 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1599 }
1600}
1601
1602/*
3dab1dad
YO
1603 Dumps a fully constructed but uncompressed trie in table form.
1604 This is the normal DFA style state transition table, with a few
1605 twists to facilitate compression later.
1606 Used for debugging make_trie().
1607*/
1608STATIC void
55eed653 1609S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1610 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1611 U32 depth)
3dab1dad
YO
1612{
1613 U32 state;
1614 U16 charid;
ab3bbdeb 1615 SV *sv=sv_newmortal();
55eed653 1616 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1617 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1618
1619 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1620
1621 /*
1622 print out the table precompression so that we can do a visual check
1623 that they are identical.
1624 */
1625
1626 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1627
1628 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1629 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1630 if ( tmp ) {
ab3bbdeb
YO
1631 PerlIO_printf( Perl_debug_log, "%*s",
1632 colwidth,
ddc5bc0f 1633 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1634 PL_colors[0], PL_colors[1],
1635 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1636 PERL_PV_ESCAPE_FIRSTCHAR
1637 )
1638 );
3dab1dad
YO
1639 }
1640 }
1641
1642 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1643
1644 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1645 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1646 }
1647
1648 PerlIO_printf( Perl_debug_log, "\n" );
1649
1650 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1651
1652 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1653 (int)depth * 2 + 2,"",
1654 (UV)TRIE_NODENUM( state ) );
1655
1656 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1657 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1658 if (v)
1659 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1660 else
1661 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1662 }
1663 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1664 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1665 } else {
1666 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1667 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1668 }
1669 }
07be1b83 1670}
3dab1dad
YO
1671
1672#endif
1673
2e64971a 1674
786e8c11
YO
1675/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1676 startbranch: the first branch in the whole branch sequence
1677 first : start branch of sequence of branch-exact nodes.
1678 May be the same as startbranch
1679 last : Thing following the last branch.
1680 May be the same as tail.
1681 tail : item following the branch sequence
1682 count : words in the sequence
1683 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1684 depth : indent depth
3dab1dad 1685
786e8c11 1686Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1687
786e8c11
YO
1688A trie is an N'ary tree where the branches are determined by digital
1689decomposition of the key. IE, at the root node you look up the 1st character and
1690follow that branch repeat until you find the end of the branches. Nodes can be
1691marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1692
786e8c11 1693 /he|she|his|hers/
72f13be8 1694
786e8c11
YO
1695would convert into the following structure. Numbers represent states, letters
1696following numbers represent valid transitions on the letter from that state, if
1697the number is in square brackets it represents an accepting state, otherwise it
1698will be in parenthesis.
07be1b83 1699
786e8c11
YO
1700 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1701 | |
1702 | (2)
1703 | |
1704 (1) +-i->(6)-+-s->[7]
1705 |
1706 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1707
786e8c11
YO
1708 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1709
1710This shows that when matching against the string 'hers' we will begin at state 1
1711read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1712then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1713is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1714single traverse. We store a mapping from accepting to state to which word was
1715matched, and then when we have multiple possibilities we try to complete the
1716rest of the regex in the order in which they occured in the alternation.
1717
1718The only prior NFA like behaviour that would be changed by the TRIE support is
1719the silent ignoring of duplicate alternations which are of the form:
1720
1721 / (DUPE|DUPE) X? (?{ ... }) Y /x
1722
4b714af6 1723Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 1724and without the optimisation. With the optimisations dupes will be silently
486ec47a 1725ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
1726the following demonstrates:
1727
1728 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1729
1730which prints out 'word' three times, but
1731
1732 'words'=~/(word|word|word)(?{ print $1 })S/
1733
1734which doesnt print it out at all. This is due to other optimisations kicking in.
1735
1736Example of what happens on a structural level:
1737
486ec47a 1738The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
1739
1740 1: CURLYM[1] {1,32767}(18)
1741 5: BRANCH(8)
1742 6: EXACT <ac>(16)
1743 8: BRANCH(11)
1744 9: EXACT <ad>(16)
1745 11: BRANCH(14)
1746 12: EXACT <ab>(16)
1747 16: SUCCEED(0)
1748 17: NOTHING(18)
1749 18: END(0)
1750
1751This would be optimizable with startbranch=5, first=5, last=16, tail=16
1752and should turn into:
1753
1754 1: CURLYM[1] {1,32767}(18)
1755 5: TRIE(16)
1756 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1757 <ac>
1758 <ad>
1759 <ab>
1760 16: SUCCEED(0)
1761 17: NOTHING(18)
1762 18: END(0)
1763
1764Cases where tail != last would be like /(?foo|bar)baz/:
1765
1766 1: BRANCH(4)
1767 2: EXACT <foo>(8)
1768 4: BRANCH(7)
1769 5: EXACT <bar>(8)
1770 7: TAIL(8)
1771 8: EXACT <baz>(10)
1772 10: END(0)
1773
1774which would be optimizable with startbranch=1, first=1, last=7, tail=8
1775and would end up looking like:
1776
1777 1: TRIE(8)
1778 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1779 <foo>
1780 <bar>
1781 7: TAIL(8)
1782 8: EXACT <baz>(10)
1783 10: END(0)
1784
c80e42f3 1785 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
1786
1787is the recommended Unicode-aware way of saying
1788
1789 *(d++) = uv;
1790*/
1791
fab2782b 1792#define TRIE_STORE_REVCHAR(val) \
786e8c11 1793 STMT_START { \
73031816 1794 if (UTF) { \
fab2782b 1795 SV *zlopp = newSV(7); /* XXX: optimize me */ \
88c9ea1e 1796 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
c80e42f3 1797 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
73031816
NC
1798 SvCUR_set(zlopp, kapow - flrbbbbb); \
1799 SvPOK_on(zlopp); \
1800 SvUTF8_on(zlopp); \
1801 av_push(revcharmap, zlopp); \
1802 } else { \
fab2782b 1803 char ooooff = (char)val; \
73031816
NC
1804 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1805 } \
1806 } STMT_END
786e8c11 1807
914a25d5
KW
1808/* This gets the next character from the input, folding it if not already
1809 * folded. */
1810#define TRIE_READ_CHAR STMT_START { \
1811 wordlen++; \
1812 if ( UTF ) { \
1813 /* if it is UTF then it is either already folded, or does not need \
1814 * folding */ \
1c1d615a 1815 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
1816 } \
1817 else if (folder == PL_fold_latin1) { \
7d006b13
KW
1818 /* This folder implies Unicode rules, which in the range expressible \
1819 * by not UTF is the lower case, with the two exceptions, one of \
1820 * which should have been taken care of before calling this */ \
1821 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1822 uvc = toLOWER_L1(*uc); \
1823 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1824 len = 1; \
914a25d5
KW
1825 } else { \
1826 /* raw data, will be folded later if needed */ \
1827 uvc = (U32)*uc; \
1828 len = 1; \
1829 } \
786e8c11
YO
1830} STMT_END
1831
1832
1833
1834#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1835 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1836 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1837 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1838 } \
1839 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1840 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1841 TRIE_LIST_CUR( state )++; \
1842} STMT_END
07be1b83 1843
786e8c11
YO
1844#define TRIE_LIST_NEW(state) STMT_START { \
1845 Newxz( trie->states[ state ].trans.list, \
1846 4, reg_trie_trans_le ); \
1847 TRIE_LIST_CUR( state ) = 1; \
1848 TRIE_LIST_LEN( state ) = 4; \
1849} STMT_END
07be1b83 1850
786e8c11
YO
1851#define TRIE_HANDLE_WORD(state) STMT_START { \
1852 U16 dupe= trie->states[ state ].wordnum; \
1853 regnode * const noper_next = regnext( noper ); \
1854 \
786e8c11
YO
1855 DEBUG_r({ \
1856 /* store the word for dumping */ \
1857 SV* tmp; \
1858 if (OP(noper) != NOTHING) \
740cce10 1859 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1860 else \
740cce10 1861 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1862 av_push( trie_words, tmp ); \
786e8c11
YO
1863 }); \
1864 \
1865 curword++; \
2e64971a
DM
1866 trie->wordinfo[curword].prev = 0; \
1867 trie->wordinfo[curword].len = wordlen; \
1868 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1869 \
1870 if ( noper_next < tail ) { \
1871 if (!trie->jump) \
c944940b 1872 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1873 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1874 if (!jumper) \
1875 jumper = noper_next; \
1876 if (!nextbranch) \
1877 nextbranch= regnext(cur); \
1878 } \
1879 \
1880 if ( dupe ) { \
2e64971a
DM
1881 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1882 /* chain, so that when the bits of chain are later */\
1883 /* linked together, the dups appear in the chain */\
1884 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1885 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1886 } else { \
1887 /* we haven't inserted this word yet. */ \
1888 trie->states[ state ].wordnum = curword; \
1889 } \
1890} STMT_END
07be1b83 1891
3dab1dad 1892
786e8c11
YO
1893#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1894 ( ( base + charid >= ucharcount \
1895 && base + charid < ubound \
1896 && state == trie->trans[ base - ucharcount + charid ].check \
1897 && trie->trans[ base - ucharcount + charid ].next ) \
1898 ? trie->trans[ base - ucharcount + charid ].next \
1899 : ( state==1 ? special : 0 ) \
1900 )
3dab1dad 1901
786e8c11
YO
1902#define MADE_TRIE 1
1903#define MADE_JUMP_TRIE 2
1904#define MADE_EXACT_TRIE 4
3dab1dad 1905
a3621e74 1906STATIC I32
786e8c11 1907S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1908{
27da23d5 1909 dVAR;
a3621e74
YO
1910 /* first pass, loop through and scan words */
1911 reg_trie_data *trie;
55eed653 1912 HV *widecharmap = NULL;
2b8b4781 1913 AV *revcharmap = newAV();
a3621e74 1914 regnode *cur;
a3621e74
YO
1915 STRLEN len = 0;
1916 UV uvc = 0;
1917 U16 curword = 0;
1918 U32 next_alloc = 0;
786e8c11
YO
1919 regnode *jumper = NULL;
1920 regnode *nextbranch = NULL;
7f69552c 1921 regnode *convert = NULL;
2e64971a 1922 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1923 /* we just use folder as a flag in utf8 */
1e696034 1924 const U8 * folder = NULL;
a3621e74 1925
2b8b4781 1926#ifdef DEBUGGING
cf78de0b 1927 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2b8b4781
NC
1928 AV *trie_words = NULL;
1929 /* along with revcharmap, this only used during construction but both are
1930 * useful during debugging so we store them in the struct when debugging.
8e11feef 1931 */
2b8b4781 1932#else
cf78de0b 1933 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 1934 STRLEN trie_charcount=0;
3dab1dad 1935#endif
2b8b4781 1936 SV *re_trie_maxbuff;
a3621e74 1937 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1938
1939 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1940#ifndef DEBUGGING
1941 PERL_UNUSED_ARG(depth);
1942#endif
a3621e74 1943
1e696034 1944 switch (flags) {
79a81a6e 1945 case EXACT: break;
2f7f8cb1 1946 case EXACTFA:
fab2782b 1947 case EXACTFU_SS:
1e696034
KW
1948 case EXACTFU: folder = PL_fold_latin1; break;
1949 case EXACTF: folder = PL_fold; break;
fab2782b 1950 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
1951 }
1952
c944940b 1953 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1954 trie->refcount = 1;
3dab1dad 1955 trie->startstate = 1;
786e8c11 1956 trie->wordcount = word_count;
f8fc2ecf 1957 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1958 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
fab2782b 1959 if (flags == EXACT)
c944940b 1960 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1961 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1962 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1963
a3621e74 1964 DEBUG_r({
2b8b4781 1965 trie_words = newAV();
a3621e74 1966 });
a3621e74 1967
0111c4fd 1968 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1969 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1970 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1971 }
df826430 1972 DEBUG_TRIE_COMPILE_r({
3dab1dad 1973 PerlIO_printf( Perl_debug_log,
786e8c11 1974 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1975 (int)depth * 2 + 2, "",
1976 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1977 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1978 (int)depth);
3dab1dad 1979 });
7f69552c
YO
1980
1981 /* Find the node we are going to overwrite */
1982 if ( first == startbranch && OP( last ) != BRANCH ) {
1983 /* whole branch chain */
1984 convert = first;
1985 } else {
1986 /* branch sub-chain */
1987 convert = NEXTOPER( first );
1988 }
1989
a3621e74
YO
1990 /* -- First loop and Setup --
1991
1992 We first traverse the branches and scan each word to determine if it
1993 contains widechars, and how many unique chars there are, this is
1994 important as we have to build a table with at least as many columns as we
1995 have unique chars.
1996
1997 We use an array of integers to represent the character codes 0..255
38a44b82 1998 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1999 native representation of the character value as the key and IV's for the
2000 coded index.
2001
2002 *TODO* If we keep track of how many times each character is used we can
2003 remap the columns so that the table compression later on is more
3b753521 2004 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
2005 middle and the least common are on the outside. IMO this would be better
2006 than a most to least common mapping as theres a decent chance the most
2007 common letter will share a node with the least common, meaning the node
486ec47a 2008 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
2009 case is when we have the least common nodes twice.
2010
2011 */
2012
a3621e74 2013 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 2014 regnode *noper = NEXTOPER( cur );
e1ec3a88 2015 const U8 *uc = (U8*)STRING( noper );
df826430 2016 const U8 *e = uc + STR_LEN( noper );
a3621e74 2017 STRLEN foldlen = 0;
07be1b83 2018 U32 wordlen = 0; /* required init */
afa96d92
KW
2019 STRLEN minbytes = 0;
2020 STRLEN maxbytes = 0;
02daf0ab 2021 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 2022
3dab1dad 2023 if (OP(noper) == NOTHING) {
df826430
YO
2024 regnode *noper_next= regnext(noper);
2025 if (noper_next != tail && OP(noper_next) == flags) {
2026 noper = noper_next;
2027 uc= (U8*)STRING(noper);
2028 e= uc + STR_LEN(noper);
2029 trie->minlen= STR_LEN(noper);
2030 } else {
2031 trie->minlen= 0;
2032 continue;
2033 }
3dab1dad 2034 }
df826430 2035
fab2782b 2036 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2037 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2038 regardless of encoding */
fab2782b
YO
2039 if (OP( noper ) == EXACTFU_SS) {
2040 /* false positives are ok, so just set this */
0dc4a61d 2041 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2042 }
2043 }
a3621e74 2044 for ( ; uc < e ; uc += len ) {
3dab1dad 2045 TRIE_CHARCOUNT(trie)++;
a3621e74 2046 TRIE_READ_CHAR;
645de4ce
KW
2047
2048 /* Acummulate to the current values, the range in the number of
2049 * bytes that this character could match. The max is presumed to
2050 * be the same as the folded input (which TRIE_READ_CHAR returns),
2051 * except that when this is not in UTF-8, it could be matched
2052 * against a string which is UTF-8, and the variant characters
2053 * could be 2 bytes instead of the 1 here. Likewise, for the
2054 * minimum number of bytes when not folded. When folding, the min
2055 * is assumed to be 1 byte could fold to match the single character
2056 * here, or in the case of a multi-char fold, 1 byte can fold to
2057 * the whole sequence. 'foldlen' is used to denote whether we are
2058 * in such a sequence, skipping the min setting if so. XXX TODO
2059 * Use the exact list of what folds to each character, from
2060 * PL_utf8_foldclosures */
2061 if (UTF) {
2062 maxbytes += UTF8SKIP(uc);
2063 if (! folder) {
2064 /* A non-UTF-8 string could be 1 byte to match our 2 */
2065 minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
2066 ? 1
2067 : UTF8SKIP(uc);
2068 }
2069 else {
2070 if (foldlen) {
2071 foldlen -= UTF8SKIP(uc);
2072 }
2073 else {
2074 foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
2075 minbytes++;
2076 }
2077 }
2078 }
2079 else {
2080 maxbytes += (UNI_IS_INVARIANT(*uc))
2081 ? 1
2082 : 2;
2083 if (! folder) {
2084 minbytes++;
2085 }
2086 else {
2087 if (foldlen) {
2088 foldlen--;
2089 }
2090 else {
2091 foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
2092 minbytes++;
2093 }
2094 }
2095 }
a3621e74 2096 if ( uvc < 256 ) {
fab2782b
YO
2097 if ( folder ) {
2098 U8 folded= folder[ (U8) uvc ];
2099 if ( !trie->charmap[ folded ] ) {
2100 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2101 TRIE_STORE_REVCHAR( folded );
2102 }
2103 }
a3621e74
YO
2104 if ( !trie->charmap[ uvc ] ) {
2105 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2106 TRIE_STORE_REVCHAR( uvc );
a3621e74 2107 }
02daf0ab 2108 if ( set_bit ) {
62012aee
KW
2109 /* store the codepoint in the bitmap, and its folded
2110 * equivalent. */
fab2782b 2111 TRIE_BITMAP_SET(trie, uvc);
0921ee73
T
2112
2113 /* store the folded codepoint */
fab2782b 2114 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
0921ee73
T
2115
2116 if ( !UTF ) {
2117 /* store first byte of utf8 representation of
acdf4139 2118 variant codepoints */
6f2d5cbc 2119 if (! UVCHR_IS_INVARIANT(uvc)) {
acdf4139 2120 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
2121 }
2122 }
02daf0ab
YO
2123 set_bit = 0; /* We've done our bit :-) */
2124 }
a3621e74
YO
2125 } else {
2126 SV** svpp;
55eed653
NC
2127 if ( !widecharmap )
2128 widecharmap = newHV();
a3621e74 2129
55eed653 2130 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
2131
2132 if ( !svpp )
e4584336 2133 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
2134
2135 if ( !SvTRUE( *svpp ) ) {
2136 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 2137 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
2138 }
2139 }
2140 }
3dab1dad 2141 if( cur == first ) {
afa96d92
KW
2142 trie->minlen = minbytes;
2143 trie->maxlen = maxbytes;
2144 } else if (minbytes < trie->minlen) {
2145 trie->minlen = minbytes;
2146 } else if (maxbytes > trie->maxlen) {
2147 trie->maxlen = maxbytes;
fab2782b 2148 }
a3621e74
YO
2149 } /* end first pass */
2150 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
2151 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2152 (int)depth * 2 + 2,"",
55eed653 2153 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
2154 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2155 (int)trie->minlen, (int)trie->maxlen )
a3621e74 2156 );
a3621e74
YO
2157
2158 /*
2159 We now know what we are dealing with in terms of unique chars and
2160 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
2161 representation using a flat table will take. If it's over a reasonable
2162 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
2163 conservative but potentially much slower representation using an array
2164 of lists.
2165
2166 At the end we convert both representations into the same compressed
2167 form that will be used in regexec.c for matching with. The latter
2168 is a form that cannot be used to construct with but has memory
2169 properties similar to the list form and access properties similar
2170 to the table form making it both suitable for fast searches and
2171 small enough that its feasable to store for the duration of a program.
2172
2173 See the comment in the code where the compressed table is produced
2174 inplace from the flat tabe representation for an explanation of how
2175 the compression works.
2176
2177 */
2178
2179
2e64971a
DM
2180 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2181 prev_states[1] = 0;
2182
3dab1dad 2183 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
2184 /*
2185 Second Pass -- Array Of Lists Representation
2186
2187 Each state will be represented by a list of charid:state records
2188 (reg_trie_trans_le) the first such element holds the CUR and LEN
2189 points of the allocated array. (See defines above).
2190
2191 We build the initial structure using the lists, and then convert
2192 it into the compressed table form which allows faster lookups
2193 (but cant be modified once converted).
a3621e74
YO
2194 */
2195
a3621e74
YO
2196 STRLEN transcount = 1;
2197
1e2e3d02
YO
2198 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2199 "%*sCompiling trie using list compiler\n",
2200 (int)depth * 2 + 2, ""));
686b73d4 2201
c944940b
JH
2202 trie->states = (reg_trie_state *)
2203 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2204 sizeof(reg_trie_state) );
a3621e74
YO
2205 TRIE_LIST_NEW(1);
2206 next_alloc = 2;
2207
2208 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2209
df826430 2210 regnode *noper = NEXTOPER( cur );
c445ea15 2211 U8 *uc = (U8*)STRING( noper );
df826430 2212 const U8 *e = uc + STR_LEN( noper );
c445ea15
AL
2213 U32 state = 1; /* required init */
2214 U16 charid = 0; /* sanity init */
07be1b83 2215 U32 wordlen = 0; /* required init */
c445ea15 2216
df826430
YO
2217 if (OP(noper) == NOTHING) {
2218 regnode *noper_next= regnext(noper);
2219 if (noper_next != tail && OP(noper_next) == flags) {
2220 noper = noper_next;
2221 uc= (U8*)STRING(noper);
2222 e= uc + STR_LEN(noper);
2223 }
2224 }
2225
3dab1dad 2226 if (OP(noper) != NOTHING) {
786e8c11 2227 for ( ; uc < e ; uc += len ) {
c445ea15 2228
786e8c11 2229 TRIE_READ_CHAR;
c445ea15 2230
786e8c11
YO
2231 if ( uvc < 256 ) {
2232 charid = trie->charmap[ uvc ];
c445ea15 2233 } else {
55eed653 2234 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
2235 if ( !svpp ) {
2236 charid = 0;
2237 } else {
2238 charid=(U16)SvIV( *svpp );
2239 }
c445ea15 2240 }
786e8c11
YO
2241 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2242 if ( charid ) {
a3621e74 2243
786e8c11
YO
2244 U16 check;
2245 U32 newstate = 0;
a3621e74 2246
786e8c11
YO
2247 charid--;
2248 if ( !trie->states[ state ].trans.list ) {
2249 TRIE_LIST_NEW( state );
c445ea15 2250 }
786e8c11
YO
2251 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
2252 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
2253 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2254 break;
2255 }
2256 }
2257 if ( ! newstate ) {
2258 newstate = next_alloc++;
2e64971a 2259 prev_states[newstate] = state;
786e8c11
YO
2260 TRIE_LIST_PUSH( state, charid, newstate );
2261 transcount++;
2262 }
2263 state = newstate;
2264 } else {
2265 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 2266 }
a28509cc 2267 }
c445ea15 2268 }
3dab1dad 2269 TRIE_HANDLE_WORD(state);
a3621e74
YO
2270
2271 } /* end second pass */
2272
1e2e3d02
YO
2273 /* next alloc is the NEXT state to be allocated */
2274 trie->statecount = next_alloc;
c944940b
JH
2275 trie->states = (reg_trie_state *)
2276 PerlMemShared_realloc( trie->states,
2277 next_alloc
2278 * sizeof(reg_trie_state) );
a3621e74 2279
3dab1dad 2280 /* and now dump it out before we compress it */
2b8b4781
NC
2281 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2282 revcharmap, next_alloc,
2283 depth+1)
1e2e3d02 2284 );
a3621e74 2285
c944940b
JH
2286 trie->trans = (reg_trie_trans *)
2287 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
2288 {
2289 U32 state;
a3621e74
YO
2290 U32 tp = 0;
2291 U32 zp = 0;
2292
2293
2294 for( state=1 ; state < next_alloc ; state ++ ) {
2295 U32 base=0;
2296
2297 /*
2298 DEBUG_TRIE_COMPILE_MORE_r(
2299 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2300 );
2301 */
2302
2303 if (trie->states[state].trans.list) {
2304 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2305 U16 maxid=minid;
a28509cc 2306 U16 idx;
a3621e74
YO
2307
2308 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
2309 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2310 if ( forid < minid ) {
2311 minid=forid;
2312 } else if ( forid > maxid ) {
2313 maxid=forid;
2314 }
a3621e74
YO
2315 }
2316 if ( transcount < tp + maxid - minid + 1) {
2317 transcount *= 2;
c944940b
JH
2318 trie->trans = (reg_trie_trans *)
2319 PerlMemShared_realloc( trie->trans,
446bd890
NC
2320 transcount
2321 * sizeof(reg_trie_trans) );
a3621e74
YO
2322 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
2323 }
2324 base = trie->uniquecharcount + tp - minid;
2325 if ( maxid == minid ) {
2326 U32 set = 0;
2327 for ( ; zp < tp ; zp++ ) {
2328 if ( ! trie->trans[ zp ].next ) {
2329 base = trie->uniquecharcount + zp - minid;
2330 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2331 trie->trans[ zp ].check = state;
2332 set = 1;
2333 break;
2334 }
2335 }
2336 if ( !set ) {
2337 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2338 trie->trans[ tp ].check = state;
2339 tp++;
2340 zp = tp;
2341 }
2342 } else {
2343 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 2344 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
2345 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
2346 trie->trans[ tid ].check = state;
2347 }
2348 tp += ( maxid - minid + 1 );
2349 }
2350 Safefree(trie->states[ state ].trans.list);
2351 }
2352 /*
2353 DEBUG_TRIE_COMPILE_MORE_r(
2354 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2355 );
2356 */
2357 trie->states[ state ].trans.base=base;
2358 }
cc601c31 2359 trie->lasttrans = tp + 1;
a3621e74
YO
2360 }
2361 } else {
2362 /*
2363 Second Pass -- Flat Table Representation.
2364
b423522f
KW
2365 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2366 each. We know that we will need Charcount+1 trans at most to store
2367 the data (one row per char at worst case) So we preallocate both
2368 structures assuming worst case.
a3621e74
YO
2369
2370 We then construct the trie using only the .next slots of the entry
2371 structs.
2372
b423522f
KW
2373 We use the .check field of the first entry of the node temporarily
2374 to make compression both faster and easier by keeping track of how
2375 many non zero fields are in the node.
a3621e74
YO
2376
2377 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2378 transition.
2379
b423522f
KW
2380 There are two terms at use here: state as a TRIE_NODEIDX() which is
2381 a number representing the first entry of the node, and state as a
2382 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2383 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2384 if there are 2 entrys per node. eg:
a3621e74
YO
2385
2386 A B A B
2387 1. 2 4 1. 3 7
2388 2. 0 3 3. 0 5
2389 3. 0 0 5. 0 0
2390 4. 0 0 7. 0 0
2391
b423522f
KW
2392 The table is internally in the right hand, idx form. However as we
2393 also have to deal with the states array which is indexed by nodenum
2394 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
2395
2396 */
1e2e3d02
YO
2397 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2398 "%*sCompiling trie using table compiler\n",
2399 (int)depth * 2 + 2, ""));
3dab1dad 2400
c944940b
JH
2401 trie->trans = (reg_trie_trans *)
2402 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2403 * trie->uniquecharcount + 1,
2404 sizeof(reg_trie_trans) );
2405 trie->states = (reg_trie_state *)
2406 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2407 sizeof(reg_trie_state) );
a3621e74
YO
2408 next_alloc = trie->uniquecharcount + 1;
2409
3dab1dad 2410
a3621e74
YO
2411 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2412
df826430 2413 regnode *noper = NEXTOPER( cur );
a28509cc 2414 const U8 *uc = (U8*)STRING( noper );
df826430 2415 const U8 *e = uc + STR_LEN( noper );
a3621e74
YO
2416
2417 U32 state = 1; /* required init */
2418
2419 U16 charid = 0; /* sanity init */
2420 U32 accept_state = 0; /* sanity init */
a3621e74 2421
07be1b83 2422 U32 wordlen = 0; /* required init */
a3621e74 2423
df826430
YO
2424 if (OP(noper) == NOTHING) {
2425 regnode *noper_next= regnext(noper);
2426 if (noper_next != tail && OP(noper_next) == flags) {
2427 noper = noper_next;
2428 uc= (U8*)STRING(noper);
2429 e= uc + STR_LEN(noper);
2430 }
2431 }
fab2782b 2432
3dab1dad 2433 if ( OP(noper) != NOTHING ) {
786e8c11 2434 for ( ; uc < e ; uc += len ) {
a3621e74 2435
786e8c11 2436 TRIE_READ_CHAR;
a3621e74 2437
786e8c11
YO
2438 if ( uvc < 256 ) {
2439 charid = trie->charmap[ uvc ];
2440 } else {
55eed653 2441 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 2442 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 2443 }
786e8c11
YO
2444 if ( charid ) {
2445 charid--;
2446 if ( !trie->trans[ state + charid ].next ) {
2447 trie->trans[ state + charid ].next = next_alloc;
2448 trie->trans[ state ].check++;
2e64971a
DM
2449 prev_states[TRIE_NODENUM(next_alloc)]
2450 = TRIE_NODENUM(state);
786e8c11
YO
2451 next_alloc += trie->uniquecharcount;
2452 }
2453 state = trie->trans[ state + charid ].next;
2454 } else {
2455 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2456 }
2457 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 2458 }
a3621e74 2459 }
3dab1dad
YO
2460 accept_state = TRIE_NODENUM( state );
2461 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
2462
2463 } /* end second pass */
2464
3dab1dad 2465 /* and now dump it out before we compress it */
2b8b4781
NC
2466 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2467 revcharmap,
2468 next_alloc, depth+1));
a3621e74 2469
a3621e74
YO
2470 {
2471 /*
2472 * Inplace compress the table.*
2473
2474 For sparse data sets the table constructed by the trie algorithm will
2475 be mostly 0/FAIL transitions or to put it another way mostly empty.
2476 (Note that leaf nodes will not contain any transitions.)
2477
2478 This algorithm compresses the tables by eliminating most such
2479 transitions, at the cost of a modest bit of extra work during lookup:
2480
2481 - Each states[] entry contains a .base field which indicates the
2482 index in the state[] array wheres its transition data is stored.
2483
3b753521 2484 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
2485
2486 - If .base is nonzero then charid is added to it to find an entry in
2487 the trans array.
2488
2489 -If trans[states[state].base+charid].check!=state then the
2490 transition is taken to be a 0/Fail transition. Thus if there are fail
2491 transitions at the front of the node then the .base offset will point
2492 somewhere inside the previous nodes data (or maybe even into a node
2493 even earlier), but the .check field determines if the transition is
2494 valid.
2495
786e8c11 2496 XXX - wrong maybe?
a3621e74 2497 The following process inplace converts the table to the compressed
3b753521 2498 table: We first do not compress the root node 1,and mark all its
a3621e74 2499 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
2500 allows us to do a DFA construction from the compressed table later,
2501 and ensures that any .base pointers we calculate later are greater
2502 than 0.
a3621e74
YO
2503
2504 - We set 'pos' to indicate the first entry of the second node.
2505
2506 - We then iterate over the columns of the node, finding the first and
2507 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2508 and set the .check pointers accordingly, and advance pos
2509 appropriately and repreat for the next node. Note that when we copy
2510 the next pointers we have to convert them from the original
2511 NODEIDX form to NODENUM form as the former is not valid post
2512 compression.
2513
2514 - If a node has no transitions used we mark its base as 0 and do not
2515 advance the pos pointer.
2516
2517 - If a node only has one transition we use a second pointer into the
2518 structure to fill in allocated fail transitions from other states.
2519 This pointer is independent of the main pointer and scans forward
2520 looking for null transitions that are allocated to a state. When it
2521 finds one it writes the single transition into the "hole". If the
786e8c11 2522 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
2523
2524 - Once compressed we can Renew/realloc the structures to release the
2525 excess space.
2526
2527 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2528 specifically Fig 3.47 and the associated pseudocode.
2529
2530 demq
2531 */
a3b680e6 2532 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 2533 U32 state, charid;
a3621e74 2534 U32 pos = 0, zp=0;
1e2e3d02 2535 trie->statecount = laststate;
a3621e74
YO
2536
2537 for ( state = 1 ; state < laststate ; state++ ) {
2538 U8 flag = 0;
a28509cc
AL
2539 const U32 stateidx = TRIE_NODEIDX( state );
2540 const U32 o_used = trie->trans[ stateidx ].check;
2541 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
2542 trie->trans[ stateidx ].check = 0;
2543
2544 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2545 if ( flag || trie->trans[ stateidx + charid ].next ) {
2546 if ( trie->trans[ stateidx + charid ].next ) {
2547 if (o_used == 1) {
2548 for ( ; zp < pos ; zp++ ) {
2549 if ( ! trie->trans[ zp ].next ) {
2550 break;
2551 }
2552 }
2553 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2554 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2555 trie->trans[ zp ].check = state;
2556 if ( ++zp > pos ) pos = zp;
2557 break;
2558 }
2559 used--;
2560 }
2561 if ( !flag ) {
2562 flag = 1;
2563 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2564 }
2565 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2566 trie->trans[ pos ].check = state;
2567 pos++;
2568 }
2569 }
2570 }
cc601c31 2571 trie->lasttrans = pos + 1;
c944940b
JH
2572 trie->states = (reg_trie_state *)
2573 PerlMemShared_realloc( trie->states, laststate
2574 * sizeof(reg_trie_state) );
a3621e74 2575 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 2576 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2577 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2578 (int)depth * 2 + 2,"",
2579 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
2580 (IV)next_alloc,
2581 (IV)pos,
a3621e74
YO
2582 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2583 );
2584
2585 } /* end table compress */
2586 }
1e2e3d02
YO
2587 DEBUG_TRIE_COMPILE_MORE_r(
2588 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2589 (int)depth * 2 + 2, "",
2590 (UV)trie->statecount,
2591 (UV)trie->lasttrans)
2592 );
cc601c31 2593 /* resize the trans array to remove unused space */
c944940b
JH
2594 trie->trans = (reg_trie_trans *)
2595 PerlMemShared_realloc( trie->trans, trie->lasttrans
2596 * sizeof(reg_trie_trans) );
a3621e74 2597
3b753521 2598 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
2599 U8 nodetype =(U8)(flags & 0xFF);
2600 char *str=NULL;
786e8c11 2601
07be1b83 2602#ifdef DEBUGGING
e62cc96a 2603 regnode *optimize = NULL;
7122b237
YO
2604#ifdef RE_TRACK_PATTERN_OFFSETS
2605
b57a0404
JH
2606 U32 mjd_offset = 0;
2607 U32 mjd_nodelen = 0;
7122b237
YO
2608#endif /* RE_TRACK_PATTERN_OFFSETS */
2609#endif /* DEBUGGING */
a3621e74 2610 /*
3dab1dad
YO
2611 This means we convert either the first branch or the first Exact,
2612 depending on whether the thing following (in 'last') is a branch
2613 or not and whther first is the startbranch (ie is it a sub part of
2614 the alternation or is it the whole thing.)
3b753521 2615 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 2616 the whole branch sequence, including the first.
a3621e74 2617 */
3dab1dad 2618 /* Find the node we are going to overwrite */
7f69552c 2619 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 2620 /* branch sub-chain */
3dab1dad 2621 NEXT_OFF( first ) = (U16)(last - first);
7122b237 2622#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
2623 DEBUG_r({
2624 mjd_offset= Node_Offset((convert));
2625 mjd_nodelen= Node_Length((convert));
2626 });
7122b237 2627#endif
7f69552c 2628 /* whole branch chain */
7122b237
YO
2629 }
2630#ifdef RE_TRACK_PATTERN_OFFSETS
2631 else {
7f69552c
YO
2632 DEBUG_r({
2633 const regnode *nop = NEXTOPER( convert );
2634 mjd_offset= Node_Offset((nop));
2635 mjd_nodelen= Node_Length((nop));
2636 });
07be1b83
YO
2637 }
2638 DEBUG_OPTIMISE_r(
2639 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2640 (int)depth * 2 + 2, "",
786e8c11 2641 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2642 );
7122b237 2643#endif
3dab1dad
YO
2644 /* But first we check to see if there is a common prefix we can
2645 split out as an EXACT and put in front of the TRIE node. */
2646 trie->startstate= 1;
55eed653 2647 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2648 U32 state;
1e2e3d02 2649 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2650 U32 ofs = 0;
8e11feef
RGS
2651 I32 idx = -1;
2652 U32 count = 0;
2653 const U32 base = trie->states[ state ].trans.base;
a3621e74 2654
3dab1dad 2655 if ( trie->states[state].wordnum )
8e11feef 2656 count = 1;
a3621e74 2657
8e11feef 2658 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2659 if ( ( base + ofs >= trie->uniquecharcount ) &&
2660 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2661 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2662 {
3dab1dad 2663 if ( ++count > 1 ) {
2b8b4781 2664 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2665 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2666 if ( state == 1 ) break;
3dab1dad
YO
2667 if ( count == 2 ) {
2668 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2669 DEBUG_OPTIMISE_r(
8e11feef
RGS
2670 PerlIO_printf(Perl_debug_log,
2671 "%*sNew Start State=%"UVuf" Class: [",
2672 (int)depth * 2 + 2, "",
786e8c11 2673 (UV)state));
be8e71aa 2674 if (idx >= 0) {
2b8b4781 2675 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2676 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2677
3dab1dad 2678 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2679 if ( folder )
2680 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2681 DEBUG_OPTIMISE_r(
f1f66076 2682 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2683 );
8e11feef
RGS
2684 }
2685 }
2686 TRIE_BITMAP_SET(trie,*ch);
2687 if ( folder )
2688 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2689 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2690 }
2691 idx = ofs;
2692 }
3dab1dad
YO
2693 }
2694 if ( count == 1 ) {
2b8b4781 2695 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2696 STRLEN len;
2697 char *ch = SvPV( *tmp, len );
de734bd5
A
2698 DEBUG_OPTIMISE_r({
2699 SV *sv=sv_newmortal();
8e11feef
RGS
2700 PerlIO_printf( Perl_debug_log,
2701 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2702 (int)depth * 2 + 2, "",
de734bd5
A
2703 (UV)state, (UV)idx,
2704 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2705 PL_colors[0], PL_colors[1],
2706 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2707 PERL_PV_ESCAPE_FIRSTCHAR
2708 )
2709 );
2710 });
3dab1dad
YO
2711 if ( state==1 ) {
2712 OP( convert ) = nodetype;
2713 str=STRING(convert);
2714 STR_LEN(convert)=0;
2715 }
c490c714
YO
2716 STR_LEN(convert) += len;
2717 while (len--)
de734bd5 2718 *str++ = *ch++;
8e11feef 2719 } else {
f9049ba1 2720#ifdef DEBUGGING
8e11feef
RGS
2721 if (state>1)
2722 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2723#endif
8e11feef
RGS
2724 break;
2725 }
2726 }
2e64971a 2727 trie->prefixlen = (state-1);
3dab1dad 2728 if (str) {
8e11feef 2729 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2730 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2731 trie->startstate = state;
07be1b83
YO
2732 trie->minlen -= (state - 1);
2733 trie->maxlen -= (state - 1);
33809eae
JH
2734#ifdef DEBUGGING
2735 /* At least the UNICOS C compiler choked on this
2736 * being argument to DEBUG_r(), so let's just have
2737 * it right here. */
2738 if (
2739#ifdef PERL_EXT_RE_BUILD
2740 1
2741#else
2742 DEBUG_r_TEST
2743#endif
2744 ) {
2745 regnode *fix = convert;
2746 U32 word = trie->wordcount;
2747 mjd_nodelen++;
2748 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2749 while( ++fix < n ) {
2750 Set_Node_Offset_Length(fix, 0, 0);
2751 }
2752 while (word--) {
2753 SV ** const tmp = av_fetch( trie_words, word, 0 );
2754 if (tmp) {
2755 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2756 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2757 else
2758 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2759 }
2760 }
2761 }
2762#endif
8e11feef
RGS
2763 if (trie->maxlen) {
2764 convert = n;
2765 } else {
3dab1dad 2766 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2767 DEBUG_r(optimize= n);
3dab1dad
YO
2768 }
2769 }
2770 }
a5ca303d
YO
2771 if (!jumper)
2772 jumper = last;
3dab1dad 2773 if ( trie->maxlen ) {
8e11feef
RGS
2774 NEXT_OFF( convert ) = (U16)(tail - convert);
2775 ARG_SET( convert, data_slot );
786e8c11
YO
2776 /* Store the offset to the first unabsorbed branch in
2777 jump[0], which is otherwise unused by the jump logic.
2778 We use this when dumping a trie and during optimisation. */
2779 if (trie->jump)
7f69552c 2780 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2781
6c48061a
YO
2782 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2783 * and there is a bitmap
2784 * and the first "jump target" node we found leaves enough room
2785 * then convert the TRIE node into a TRIEC node, with the bitmap
2786 * embedded inline in the opcode - this is hypothetically faster.
2787 */
2788 if ( !trie->states[trie->startstate].wordnum
2789 && trie->bitmap
2790 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2791 {
2792 OP( convert ) = TRIEC;
2793 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2794 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2795 trie->bitmap= NULL;
2796 } else
2797 OP( convert ) = TRIE;
a3621e74 2798
3dab1dad
YO
2799 /* store the type in the flags */
2800 convert->flags = nodetype;
a5ca303d
YO
2801 DEBUG_r({
2802 optimize = convert
2803 + NODE_STEP_REGNODE
2804 + regarglen[ OP( convert ) ];
2805 });
2806 /* XXX We really should free up the resource in trie now,
2807 as we won't use them - (which resources?) dmq */
3dab1dad 2808 }
a3621e74 2809 /* needed for dumping*/
e62cc96a 2810 DEBUG_r(if (optimize) {
07be1b83 2811 regnode *opt = convert;
bcdf7404 2812
e62cc96a 2813 while ( ++opt < optimize) {
07be1b83
YO
2814 Set_Node_Offset_Length(opt,0,0);
2815 }
786e8c11
YO
2816 /*
2817 Try to clean up some of the debris left after the
2818 optimisation.
a3621e74 2819 */
786e8c11 2820 while( optimize < jumper ) {
07be1b83 2821 mjd_nodelen += Node_Length((optimize));
a3621e74 2822 OP( optimize ) = OPTIMIZED;
07be1b83 2823 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2824 optimize++;
2825 }
07be1b83 2826 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2827 });
2828 } /* end node insert */
2e64971a
DM
2829
2830 /* Finish populating the prev field of the wordinfo array. Walk back
2831 * from each accept state until we find another accept state, and if
2832 * so, point the first word's .prev field at the second word. If the
2833 * second already has a .prev field set, stop now. This will be the
2834 * case either if we've already processed that word's accept state,
3b753521
FN
2835 * or that state had multiple words, and the overspill words were
2836 * already linked up earlier.
2e64971a
DM
2837 */
2838 {
2839 U16 word;
2840 U32 state;
2841 U16 prev;
2842
2843 for (word=1; word <= trie->wordcount; word++) {
2844 prev = 0;
2845 if (trie->wordinfo[word].prev)
2846 continue;
2847 state = trie->wordinfo[word].accept;
2848 while (state) {
2849 state = prev_states[state];
2850 if (!state)
2851 break;
2852 prev = trie->states[state].wordnum;
2853 if (prev)
2854 break;
2855 }
2856 trie->wordinfo[word].prev = prev;
2857 }
2858 Safefree(prev_states);
2859 }
2860
2861
2862 /* and now dump out the compressed format */
2863 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2864
55eed653 2865 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2866#ifdef DEBUGGING
2867 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2868 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2869#else
03e70be4 2870 SvREFCNT_dec_NN(revcharmap);
07be1b83 2871#endif
786e8c11
YO
2872 return trie->jump
2873 ? MADE_JUMP_TRIE
2874 : trie->startstate>1
2875 ? MADE_EXACT_TRIE
2876 : MADE_TRIE;
2877}
2878
2879STATIC void
2880S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2881{
b423522f
KW
2882/* The Trie is constructed and compressed now so we can build a fail array if
2883 * it's needed
786e8c11 2884
b423522f
KW
2885 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
2886 3.32 in the
2887 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
2888 Ullman 1985/88
786e8c11
YO
2889 ISBN 0-201-10088-6
2890
b423522f
KW
2891 We find the fail state for each state in the trie, this state is the longest
2892 proper suffix of the current state's 'word' that is also a proper prefix of
2893 another word in our trie. State 1 represents the word '' and is thus the
2894 default fail state. This allows the DFA not to have to restart after its
2895 tried and failed a word at a given point, it simply continues as though it
2896 had been matching the other word in the first place.
786e8c11
YO
2897 Consider
2898 'abcdgu'=~/abcdefg|cdgu/
b423522f
KW
2899 When we get to 'd' we are still matching the first word, we would encounter
2900 'g' which would fail, which would bring us to the state representing 'd' in
2901 the second word where we would try 'g' and succeed, proceeding to match
2902 'cdgu'.
786e8c11
YO
2903 */
2904 /* add a fail transition */
3251b653
NC
2905 const U32 trie_offset = ARG(source);
2906 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2907 U32 *q;
2908 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2909 const U32 numstates = trie->statecount;
786e8c11
YO
2910 const U32 ubound = trie->lasttrans + ucharcount;
2911 U32 q_read = 0;
2912 U32 q_write = 0;
2913 U32 charid;
2914 U32 base = trie->states[ 1 ].trans.base;
2915 U32 *fail;
2916 reg_ac_data *aho;
cf78de0b 2917 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
786e8c11 2918 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2919
2920 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2921#ifndef DEBUGGING
2922 PERL_UNUSED_ARG(depth);
2923#endif
2924
2925
2926 ARG_SET( stclass, data_slot );
c944940b 2927 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2928 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2929 aho->trie=trie_offset;
446bd890
NC
2930 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2931 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2932 Newxz( q, numstates, U32);
c944940b 2933 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2934 aho->refcount = 1;
2935 fail = aho->fail;
2936 /* initialize fail[0..1] to be 1 so that we always have
2937 a valid final fail state */
2938 fail[ 0 ] = fail[ 1 ] = 1;
2939
2940 for ( charid = 0; charid < ucharcount ; charid++ ) {
2941 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2942 if ( newstate ) {
2943 q[ q_write ] = newstate;
2944 /* set to point at the root */
2945 fail[ q[ q_write++ ] ]=1;
2946 }
2947 }
2948 while ( q_read < q_write) {
2949 const U32 cur = q[ q_read++ % numstates ];
2950 base = trie->states[ cur ].trans.base;
2951
2952 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2953 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2954 if (ch_state) {
2955 U32 fail_state = cur;
2956 U32 fail_base;
2957 do {
2958 fail_state = fail[ fail_state ];
2959 fail_base = aho->states[ fail_state ].trans.base;
2960 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2961
2962 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2963 fail[ ch_state ] = fail_state;
2964 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2965 {
2966 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2967 }
2968 q[ q_write++ % numstates] = ch_state;
2969 }
2970 }
2971 }
2972 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2973 when we fail in state 1, this allows us to use the
2974 charclass scan to find a valid start char. This is based on the principle
2975 that theres a good chance the string being searched contains lots of stuff
2976 that cant be a start char.
2977 */
2978 fail[ 0 ] = fail[ 1 ] = 0;
2979 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2980 PerlIO_printf(Perl_debug_log,
2981 "%*sStclass Failtable (%"UVuf" states): 0",
2982 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2983 );
786e8c11
YO
2984 for( q_read=1; q_read<numstates; q_read++ ) {
2985 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2986 }
2987 PerlIO_printf(Perl_debug_log, "\n");
2988 });
2989 Safefree(q);
2990 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2991}
2992
786e8c11 2993
07be1b83 2994#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2995 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2996 SV * const mysv=sv_newmortal(); \
2997 regnode *Next = regnext(scan); \
2998 regprop(RExC_rx, mysv, scan); \
7f69552c 2999 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
3000 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
3001 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 3002 }});
07be1b83 3003
1de06328 3004
bb914485 3005/* The below joins as many adjacent EXACTish nodes as possible into a single
0a982f06
KW
3006 * one. The regop may be changed if the node(s) contain certain sequences that
3007 * require special handling. The joining is only done if:
bb914485
KW
3008 * 1) there is room in the current conglomerated node to entirely contain the
3009 * next one.
3010 * 2) they are the exact same node type
3011 *
87b8b349 3012 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
bb914485
KW
3013 * these get optimized out
3014 *
0a982f06
KW
3015 * If a node is to match under /i (folded), the number of characters it matches
3016 * can be different than its character length if it contains a multi-character
3017 * fold. *min_subtract is set to the total delta of the input nodes.
bb914485 3018 *
a0c4c608
KW
3019 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
3020 * and contains LATIN SMALL LETTER SHARP S
f758bddf 3021 *
bb914485 3022 * This is as good a place as any to discuss the design of handling these
0a982f06
KW
3023 * multi-character fold sequences. It's been wrong in Perl for a very long
3024 * time. There are three code points in Unicode whose multi-character folds
3025 * were long ago discovered to mess things up. The previous designs for
3026 * dealing with these involved assigning a special node for them. This
3027 * approach doesn't work, as evidenced by this example:
a0c4c608 3028 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
0a982f06
KW
3029 * Both these fold to "sss", but if the pattern is parsed to create a node that
3030 * would match just the \xDF, it won't be able to handle the case where a
bb914485
KW
3031 * successful match would have to cross the node's boundary. The new approach
3032 * that hopefully generally solves the problem generates an EXACTFU_SS node
3033 * that is "sss".
3034 *
0a982f06 3035 * It turns out that there are problems with all multi-character folds, and not
cb117658
KW
3036 * just these three. Now the code is general, for all such cases. The
3037 * approach taken is:
0a982f06
KW
3038 * 1) This routine examines each EXACTFish node that could contain multi-
3039 * character fold sequences. It returns in *min_subtract how much to
9d071ca8 3040 * subtract from the the actual length of the string to get a real minimum
0a982f06
KW
3041 * match length; it is 0 if there are no multi-char folds. This delta is
3042 * used by the caller to adjust the min length of the match, and the delta
3043 * between min and max, so that the optimizer doesn't reject these
3044 * possibilities based on size constraints.
cb117658 3045 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
0a982f06
KW
3046 * is used for an EXACTFU node that contains at least one "ss" sequence in
3047 * it. For non-UTF-8 patterns and strings, this is the only case where
3048 * there is a possible fold length change. That means that a regular
3049 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3050 * with length changes, and so can be processed faster. regexec.c takes
3051 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
3052 * pre-folded by regcomp.c. This saves effort in regex matching.
87b8b349 3053 * However, the pre-folding isn't done for non-UTF8 patterns because the
0a982f06
KW
3054 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
3055 * down by forcing the pattern into UTF8 unless necessary. Also what
3056 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
3057 * possibilities for the non-UTF8 patterns are quite simple, except for
3058 * the sharp s. All the ones that don't involve a UTF-8 target string are
3059 * members of a fold-pair, and arrays are set up for all of them so that
3060 * the other member of the pair can be found quickly. Code elsewhere in
3061 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3062 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3063 * described in the next item.
098b07d5 3064 * 3) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
1ca267a5
KW
3065 * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
3066 * UTF-8 pattern.) An assumption that the optimizer part of regexec.c
3067 * (probably unwittingly, in Perl_regexec_flags()) makes is that a
3068 * character in the pattern corresponds to at most a single character in
3069 * the target string. (And I do mean character, and not byte here, unlike
3070 * other parts of the documentation that have never been updated to
3071 * account for multibyte Unicode.) sharp s in EXACTF nodes can match the
3072 * two character string 'ss'; in EXACTFA nodes it can match
3073 * "\x{17F}\x{17F}". These violate the assumption, and they are the only
3074 * instances where it is violated. I'm reluctant to try to change the
3075 * assumption, as the code involved is impenetrable to me (khw), so
3076 * instead the code here punts. This routine examines (when the pattern
3077 * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
3078 * boolean indicating whether or not the node contains a sharp s. When it
3079 * is true, the caller sets a flag that later causes the optimizer in this
3080 * file to not set values for the floating and fixed string lengths, and
3081 * thus avoids the optimizer code in regexec.c that makes the invalid
3082 * assumption. Thus, there is no optimization based on string lengths for
3083 * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
3084 * (The reason the assumption is wrong only in these two cases is that all
3085 * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
3086 * other folds to their expanded versions. We can't prefold sharp s to
3087 * 'ss' in EXACTF nodes because we don't know at compile time if it
3088 * actually matches 'ss' or not. It will match iff the target string is
3089 * in UTF-8, unlike the EXACTFU nodes, where it always matches; and
3090 * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8
3091 * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
3092 * but in a non-UTF8 pattern, folding it to that above-Latin1 string would
3093 * require the pattern to be forced into UTF-8, the overhead of which we
3094 * want to avoid.)
098b07d5
KW
3095 *
3096 * Similarly, the code that generates tries doesn't currently handle
3097 * not-already-folded multi-char folds, and it looks like a pain to change
3098 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3099 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3100 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3101 * using /iaa matching will be doing so almost entirely with ASCII
3102 * strings, so this should rarely be encountered in practice */
1de06328 3103
9d071ca8 3104#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
07be1b83 3105 if (PL_regkind[OP(scan)] == EXACT) \
9d071ca8 3106 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
07be1b83 3107
be8e71aa 3108STATIC U32
9d071ca8 3109S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
07be1b83
YO
3110 /* Merge several consecutive EXACTish nodes into one. */
3111 regnode *n = regnext(scan);
3112 U32 stringok = 1;
3113 regnode *next = scan + NODE_SZ_STR(scan);
3114 U32 merged = 0;
3115 U32 stopnow = 0;
3116#ifdef DEBUGGING
3117 regnode *stop = scan;
72f13be8 3118 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 3119#else
d47053eb
RGS
3120 PERL_UNUSED_ARG(depth);
3121#endif
7918f24d
NC
3122
3123 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 3124#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
3125 PERL_UNUSED_ARG(flags);
3126 PERL_UNUSED_ARG(val);
07be1b83 3127#endif
07be1b83 3128 DEBUG_PEEP("join",scan,depth);
bb914485 3129
3f410cf6
KW
3130 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3131 * EXACT ones that are mergeable to the current one. */
3132 while (n
3133 && (PL_regkind[OP(n)] == NOTHING
3134 || (stringok && OP(n) == OP(scan)))
07be1b83 3135 && NEXT_OFF(n)
3f410cf6
KW
3136 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3137 {
07be1b83
YO
3138
3139 if (OP(n) == TAIL || n > next)
3140 stringok = 0;
3141 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
3142 DEBUG_PEEP("skip:",n,depth);
3143 NEXT_OFF(scan) += NEXT_OFF(n);
3144 next = n + NODE_STEP_REGNODE;
3145#ifdef DEBUGGING
3146 if (stringok)
3147 stop = n;
3148#endif
3149 n = regnext(n);
3150 }
3151 else if (stringok) {
786e8c11 3152 const unsigned int oldl = STR_LEN(scan);
07be1b83 3153 regnode * const nnext = regnext(n);
b2230d39 3154
87b8b349
KW
3155 /* XXX I (khw) kind of doubt that this works on platforms where
3156 * U8_MAX is above 255 because of lots of other assumptions */
79a81a6e 3157 /* Don't join if the sum can't fit into a single node */
b2230d39
KW
3158 if (oldl + STR_LEN(n) > U8_MAX)
3159 break;
07be1b83
YO
3160
3161 DEBUG_PEEP("merg",n,depth);
07be1b83 3162 merged++;
b2230d39 3163
07be1b83
YO
3164 NEXT_OFF(scan) += NEXT_OFF(n);
3165 STR_LEN(scan) += STR_LEN(n);
3166 next = n + NODE_SZ_STR(n);
3167 /* Now we can overwrite *n : */
3168 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3169#ifdef DEBUGGING
3170 stop = next - 1;
3171#endif
3172 n = nnext;
3173 if (stopnow) break;
3174 }
3175
d47053eb
RGS
3176#ifdef EXPERIMENTAL_INPLACESCAN
3177 if (flags && !NEXT_OFF(n)) {
3178 DEBUG_PEEP("atch", val, depth);
3179 if (reg_off_by_arg[OP(n)]) {
3180 ARG_SET(n, val - n);
3181 }
3182 else {
3183 NEXT_OFF(n) = val - n;
3184 }
3185 stopnow = 1;
3186 }
07be1b83
YO
3187#endif
3188 }
2c2b7f86 3189
9d071ca8 3190 *min_subtract = 0;
f758bddf 3191 *has_exactf_sharp_s = FALSE;
f646642f 3192
3f410cf6
KW
3193 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3194 * can now analyze for sequences of problematic code points. (Prior to
3195 * this final joining, sequences could have been split over boundaries, and
a0c4c608
KW
3196 * hence missed). The sequences only happen in folding, hence for any
3197 * non-EXACT EXACTish node */
86d6fcad 3198 if (OP(scan) != EXACT) {
0a982f06
KW
3199 const U8 * const s0 = (U8*) STRING(scan);
3200 const U8 * s = s0;
3201 const U8 * const s_end = s0 + STR_LEN(scan);
f758bddf
KW
3202
3203 /* One pass is made over the node's string looking for all the
3204 * possibilities. to avoid some tests in the loop, there are two main
3205 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3206 * non-UTF-8 */
3207 if (UTF) {
86d6fcad 3208
0a982f06
KW
3209 /* Examine the string for a multi-character fold sequence. UTF-8
3210 * patterns have all characters pre-folded by the time this code is
3211 * executed */
3212 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3213 length sequence we are looking for is 2 */
86d6fcad 3214 {
0a982f06
KW
3215 int count = 0;
3216 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3217 if (! len) { /* Not a multi-char fold: get next char */
3218 s += UTF8SKIP(s);
3219 continue;
3220 }
bb914485 3221
0a982f06 3222 /* Nodes with 'ss' require special handling, except for EXACTFL
098b07d5
KW
3223 * and EXACTFA-ish for which there is no multi-char fold to
3224 * this */
0a982f06 3225 if (len == 2 && *s == 's' && *(s+1) == 's'
098b07d5
KW
3226 && OP(scan) != EXACTFL
3227 && OP(scan) != EXACTFA
3228 && OP(scan) != EXACTFA_NO_TRIE)
0a982f06
KW
3229 {
3230 count = 2;
3231 OP(scan) = EXACTFU_SS;
3232 s += 2;
3233 }
0a982f06
KW
3234 else { /* Here is a generic multi-char fold. */
3235 const U8* multi_end = s + len;
3236
3237 /* Count how many characters in it. In the case of /l and
3238 * /aa, no folds which contain ASCII code points are
3239 * allowed, so check for those, and skip if found. (In
3240 * EXACTFL, no folds are allowed to any Latin1 code point,
3241 * not just ASCII. But there aren't any of these
3242 * currently, nor ever likely, so don't take the time to
3243 * test for them. The code that generates the
3244 * is_MULTI_foo() macros croaks should one actually get put
3245 * into Unicode .) */
098b07d5
KW
3246 if (OP(scan) != EXACTFL
3247 && OP(scan) != EXACTFA
3248 && OP(scan) != EXACTFA_NO_TRIE)
3249 {
0a982f06
KW
3250 count = utf8_length(s, multi_end);
3251 s = multi_end;
3252 }
3253 else {
3254 while (s < multi_end) {
3255 if (isASCII(*s)) {
3256 s++;
3257 goto next_iteration;
3258 }
3259 else {
3260 s += UTF8SKIP(s);
3261 }
3262 count++;
3263 }
3264 }
3265 }
f758bddf 3266
0a982f06
KW
3267 /* The delta is how long the sequence is minus 1 (1 is how long
3268 * the character that folds to the sequence is) */
3269 *min_subtract += count - 1;
3270 next_iteration: ;
bb914485
KW
3271 }
3272 }
1ca267a5
KW
3273 else if (OP(scan) == EXACTFA) {
3274
3275 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3276 * fold to the ASCII range (and there are no existing ones in the
3277 * upper latin1 range). But, as outlined in the comments preceding
098b07d5
KW
3278 * this function, we need to flag any occurrences of the sharp s.
3279 * This character forbids trie formation (because of added
3280 * complexity) */
1ca267a5
KW
3281 while (s < s_end) {
3282 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
098b07d5 3283 OP(scan) = EXACTFA_NO_TRIE;
1ca267a5
KW
3284 *has_exactf_sharp_s = TRUE;
3285 break;
3286 }
3287 s++;
3288 continue;
3289 }
3290 }
3291 else if (OP(scan) != EXACTFL) {
3292
3293 /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the
3294 * multi-char folds that are all Latin1. (This code knows that
3295 * there are no current multi-char folds possible with EXACTFL,
3296 * relying on fold_grind.t to catch any errors if the very unlikely
3297 * event happens that some get added in future Unicode versions.)
3298 * As explained in the comments preceding this function, we look
3299 * also for the sharp s in EXACTF nodes; it can be in the final
0a982f06
KW
3300 * position. Otherwise we can stop looking 1 byte earlier because
3301 * have to find at least two characters for a multi-fold */
f758bddf
KW
3302 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
3303
0a982f06 3304 while (s < upper) {
40b1ba4f 3305 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
0a982f06
KW
3306 if (! len) { /* Not a multi-char fold. */
3307 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
3308 {
3309 *has_exactf_sharp_s = TRUE;
3310 }
3311 s++;
3312 continue;
3313 }
3314
3315 if (len == 2
c02c3054
KW
3316 && isARG2_lower_or_UPPER_ARG1('s', *s)
3317 && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
0a982f06
KW
3318 {
3319
3320 /* EXACTF nodes need to know that the minimum length
3321 * changed so that a sharp s in the string can match this
3322 * ss in the pattern, but they remain EXACTF nodes, as they
3323 * won't match this unless the target string is is UTF-8,
3324 * which we don't know until runtime */
3325 if (OP(scan) != EXACTF) {
3326 OP(scan) = EXACTFU_SS;
3327 }
86d6fcad 3328 }
0a982f06
KW
3329
3330 *min_subtract += len - 1;
3331 s += len;
86d6fcad
KW
3332 }
3333 }
07be1b83 3334 }
3f410cf6 3335
07be1b83 3336#ifdef DEBUGGING
bb789b09
DM
3337 /* Allow dumping but overwriting the collection of skipped
3338 * ops and/or strings with fake optimized ops */
07be1b83
YO
3339 n = scan + NODE_SZ_STR(scan);
3340 while (n <= stop) {
bb789b09
DM
3341 OP(n) = OPTIMIZED;
3342 FLAGS(n) = 0;
3343 NEXT_OFF(n) = 0;
07be1b83
YO
3344 n++;
3345 }
3346#endif
3347 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3348 return stopnow;
3349}
3350
486ec47a 3351/* REx optimizer. Converts nodes into quicker variants "in place".
653099ff
GS
3352 Finds fixed substrings. */
3353
a0288114 3354/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
3355 to the position after last scanned or to NULL. */
3356
40d049e4
YO
3357#define INIT_AND_WITHP \
3358 assert(!and_withp); \
b8f7bb16 3359 Newx(and_withp,1, regnode_ssc); \
40d049e4 3360 SAVEFREEPV(and_withp)
07be1b83 3361
b515a41d 3362/* this is a chain of data about sub patterns we are processing that
486ec47a 3363 need to be handled separately/specially in study_chunk. Its so
b515a41d
YO
3364 we can simulate recursion without losing state. */
3365struct scan_frame;
3366typedef struct scan_frame {
3367 regnode *last; /* last node to process in this frame */
3368 regnode *next; /* next node to process when last is reached */
3369 struct scan_frame *prev; /*previous frame*/
09a65838 3370 U32 prev_recursed_depth;
b515a41d
YO
3371 I32 stop; /* what stopparen do we use */
3372} scan_frame;
3373
304ee84b
YO
3374
3375#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3376
49f55535 3377STATIC SSize_t
40d049e4 3378S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
ea3daa5d 3379 SSize_t *minlenp, SSize_t *deltap,
40d049e4
YO
3380 regnode *last,
3381 scan_data_t *data,
3382 I32 stopparen,
09a65838 3383 U32 recursed_depth,
b8f7bb16 3384 regnode_ssc *and_withp,
40d049e4 3385 U32 flags, U32 depth)
c277df42
IZ
3386 /* scanp: Start here (read-write). */
3387 /* deltap: Write maxlen-minlen here. */
3388 /* last: Stop before this one. */
40d049e4
YO
3389 /* data: string data about the pattern */
3390 /* stopparen: treat close N as END */
3391 /* recursed: which subroutines have we recursed into */
3392 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 3393{
97aff369 3394 dVAR;
ea3daa5d
FC
3395 /* There must be at least this number of characters to match */
3396 SSize_t min = 0;
2d608413 3397 I32 pars = 0, code;
c277df42 3398 regnode *scan = *scanp, *next;
ea3daa5d 3399 SSize_t delta = 0;
c277df42 3400 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 3401 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
3402 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3403 scan_data_t data_fake;
a3621e74 3404 SV *re_trie_maxbuff = NULL;
786e8c11 3405 regnode *first_non_open = scan;
ea3daa5d 3406 SSize_t stopmin = SSize_t_MAX;
8aa23a47 3407 scan_frame *frame = NULL;
a3621e74 3408 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 3409
7918f24d
NC
3410 PERL_ARGS_ASSERT_STUDY_CHUNK;
3411
13a24bad 3412#ifdef DEBUGGING
40d049e4 3413 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 3414#endif
786e8c11 3415 if ( depth == 0 ) {
40d049e4 3416 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
3417 first_non_open=regnext(first_non_open);
3418 }
3419
b81d288d 3420
8aa23a47
YO
3421 fake_study_recurse:
3422 while ( scan && OP(scan) != END && scan < last ){
2d608413
KW
3423 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3424 node length to get a real minimum (because
3425 the folded version may be shorter) */
f758bddf 3426 bool has_exactf_sharp_s = FALSE;
8aa23a47 3427 /* Peephole optimizer: */
09a65838
YO
3428 DEBUG_OPTIMISE_MORE_r(
3429 {
9adf3763 3430 PerlIO_printf(Perl_debug_log,"%*sstudy_chunk stopparen=%ld depth=%lu recursed_depth=%lu ",
c2606296
TC
3431 ((int) depth*2), "", (long)stopparen,
3432 (unsigned long)depth, (unsigned long)recursed_depth);
09a65838
YO
3433 if (recursed_depth) {
3434 U32 i;
3435 U32 j;
3436 for ( j = 0 ; j < recursed_depth ; j++ ) {
3437 PerlIO_printf(Perl_debug_log,"[");
9e9ecfdf 3438 for ( i = 0 ; i < (U32)RExC_npar ; i++ )
09a65838
YO
3439 PerlIO_printf(Perl_debug_log,"%d",
3440 PAREN_TEST(RExC_study_chunk_recursed +
3441 (j * RExC_study_chunk_recursed_bytes), i)
3442 ? 1 : 0
3443 );
3444 PerlIO_printf(Perl_debug_log,"]");
3445 }
0ac75573 3446 }
09a65838
YO
3447 PerlIO_printf(Perl_debug_log,"\n");
3448 }
3449 );
0ac75573
YO
3450 DEBUG_STUDYDATA("Peep:", data, depth);
3451 DEBUG_PEEP("Peep", scan, depth);
3452
a0c4c608
KW
3453
3454 /* Its not clear to khw or hv why this is done here, and not in the
3455 * clauses that deal with EXACT nodes. khw's guess is that it's
3456 * because of a previous design */
9d071ca8 3457 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
8aa23a47
YO
3458
3459 /* Follow the next-chain of the current node and optimize
3460 away all the NOTHINGs from it. */
3461 if (OP(scan) != CURLYX) {
3462 const int max = (reg_off_by_arg[OP(scan)]
3463 ? I32_MAX
3464 /* I32 may be smaller than U16 on CRAYs! */
3465 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3466 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3467 int noff;
3468 regnode *n = scan;
686b73d4 3469
8aa23a47
YO
3470 /* Skip NOTHING and LONGJMP. */
3471 while ((n = regnext(n))
3472 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3473 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3474 && off + noff < max)
3475 off += noff;
3476 if (reg_off_by_arg[OP(scan)])
3477 ARG(scan) = off;
3478 else
3479 NEXT_OFF(scan) = off;
3480 }
a3621e74 3481
c277df42 3482
8aa23a47
YO
3483
3484 /* The principal pseudo-switch. Cannot be a switch, since we
3485 look into several different things. */
3486 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3487 || OP(scan) == IFTHEN) {
3488 next = regnext(scan);
3489 code = OP(scan);
3490 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
686b73d4 3491
8aa23a47 3492 if (OP(next) == code || code == IFTHEN) {
b423522f
KW
3493 /* NOTE - There is similar code to this block below for
3494 * handling TRIE nodes on a re-study. If you change stuff here
3495 * check there too. */
49f55535 3496 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
b8f7bb16 3497 regnode_ssc accum;
8aa23a47 3498 regnode * const startbranch=scan;
686b73d4 3499
8aa23a47 3500 if (flags & SCF_DO_SUBSTR)
304ee84b 3501 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47 3502 if (flags & SCF_DO_STCLASS)
557bd3fb 3503 ssc_init_zero(pRExC_state, &accum);
8aa23a47
YO
3504
3505 while (OP(scan) == code) {
49f55535
FC
3506 SSize_t deltanext, minnext, fake;
3507 I32 f = 0;
b8f7bb16 3508 regnode_ssc this_class;
8aa23a47
YO
3509
3510 num++;
3511 data_fake.flags = 0;
3512 if (data) {
3513 data_fake.whilem_c = data->whilem_c;
3514 data_fake.last_closep = data->last_closep;
3515 }
3516 else
3517 data_fake.last_closep = &fake;
58e23c8d
YO
3518
3519 data_fake.pos_delta = delta;
8aa23a47
YO
3520 next = regnext(scan);
3521 scan = NEXTOPER(scan);
3522 if (code != BRANCH)
c277df42 3523 scan = NEXTOPER(scan);
8aa23a47 3524 if (flags & SCF_DO_STCLASS) {
557bd3fb 3525 ssc_init(pRExC_state, &this_class);
8aa23a47
YO
3526 data_fake.start_class = &this_class;
3527 f = SCF_DO_STCLASS_AND;
58e23c8d 3528 }
8aa23a47
YO
3529 if (flags & SCF_WHILEM_VISITED_POS)
3530 f |= SCF_WHILEM_VISITED_POS;
3531
3532 /* we suppose the run is continuous, last=next...*/
3533 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3534 next, &data_fake,
09a65838 3535 stopparen, recursed_depth, NULL, f,depth+1);
8aa23a47
YO
3536 if (min1 > minnext)
3537 min1 = minnext;
ea3daa5d 3538 if (deltanext == SSize_t_MAX) {
8aa23a47 3539 is_inf = is_inf_internal = 1;
ea3daa5d 3540 max1 = SSize_t_MAX;
9b139d09
GG
3541 } else if (max1 < minnext + deltanext)
3542 max1 = minnext + deltanext;
8aa23a47
YO
3543 scan = next;
3544 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3545 pars++;
3546 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3547 if ( stopmin > minnext)
3548 stopmin = min + min1;
3549 flags &= ~SCF_DO_SUBSTR;
3550 if (data)
3551 data->flags |= SCF_SEEN_ACCEPT;
3552 }
3553 if (data) {
3554 if (data_fake.flags & SF_HAS_EVAL)
3555 data->flags |= SF_HAS_EVAL;
3556 data->whilem_c = data_fake.whilem_c;
3dab1dad 3557 }
8aa23a47 3558 if (flags & SCF_DO_STCLASS)
557bd3fb 3559 ssc_or(pRExC_state, &accum, &this_class);
8aa23a47
YO
3560 }
3561 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3562 min1 = 0;
3563 if (flags & SCF_DO_SUBSTR) {
3564 data->pos_min += min1;
ea3daa5d
FC
3565 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3566 data->pos_delta = SSize_t_MAX;
9b139d09
GG
3567 else
3568 data->pos_delta += max1 - min1;
8aa23a47
YO
3569 if (max1 != min1 || is_inf)
3570 data->longest = &(data->longest_float);
3571 }
3572 min += min1;
ea3daa5d
FC
3573 if (delta == SSize_t_MAX
3574 || SSize_t_MAX - delta - (max1 - min1) < 0)
3575 delta = SSize_t_MAX;
9b139d09
GG
3576 else
3577 delta += max1 - min1;
8aa23a47 3578 if (flags & SCF_DO_STCLASS_OR) {
557bd3fb 3579 ssc_or(pRExC_state, data->start_class, &accum);
8aa23a47 3580 if (min1) {
1ca93ef1 3581 ssc_and(pRExC_state, data->start_class, and_withp);
8aa23a47 3582 flags &= ~SCF_DO_STCLASS;
653099ff