This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: update RExC_start when parsing outside input
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
afda64e8 84EXTERN_C const struct regexp_engine my_reg_engine;
54df2634
NC
85#else
86# include "regcomp.h"
87#endif
a687059c 88
f7e03a10 89#include "dquote_inline.h"
b992490d 90#include "invlist_inline.h"
1b0f46bf 91#include "unicode_constants.h"
04e98a4d 92
538e84ed
KW
93#define HAS_NONLATIN1_FOLD_CLOSURE(i) \
94 _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
f12c0118
KW
95#define HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(i) \
96 _HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 97#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 98#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 99
a687059c
LW
100#ifndef STATIC
101#define STATIC static
102#endif
103
dea37815
KW
104#ifndef MIN
105#define MIN(a,b) ((a) < (b) ? (a) : (b))
106#endif
b1603ef8 107
3f910e62
YO
108/* this is a chain of data about sub patterns we are processing that
109 need to be handled separately/specially in study_chunk. Its so
110 we can simulate recursion without losing state. */
111struct scan_frame;
112typedef struct scan_frame {
113 regnode *last_regnode; /* last node to process in this frame */
114 regnode *next_regnode; /* next node to process when last is reached */
115 U32 prev_recursed_depth;
116 I32 stopparen; /* what stopparen do we use */
117 U32 is_top_frame; /* what flags do we use? */
118
119 struct scan_frame *this_prev_frame; /* this previous frame */
120 struct scan_frame *prev_frame; /* previous frame */
121 struct scan_frame *next_frame; /* next frame */
122} scan_frame;
123
f4ae5a27
KW
124/* Certain characters are output as a sequence with the first being a
125 * backslash. */
126#define isBACKSLASHED_PUNCT(c) \
127 ((c) == '-' || (c) == ']' || (c) == '\\' || (c) == '^')
128
129
09b2b2e6 130struct RExC_state_t {
514a91f1
DM
131 U32 flags; /* RXf_* are we folding, multilining? */
132 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 133 char *precomp; /* uncompiled string. */
711b303b 134 char *precomp_end; /* pointer to end of uncompiled string. */
288b8c02 135 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf 136 regexp *rx; /* perl core regexp structure */
538e84ed
KW
137 regexp_internal *rxi; /* internal data for regexp object
138 pprivate field */
fac92740 139 char *start; /* Start of input for compile */
830247a4
IZ
140 char *end; /* End of input for compile */
141 char *parse; /* Input-scan pointer. */
ea3daa5d 142 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 143 regnode *emit_start; /* Start of emitted-code area */
538e84ed
KW
144 regnode *emit_bound; /* First regnode outside of the
145 allocated space */
f7c7e32a
DM
146 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
147 implies compiling, so don't emit */
9a81a976
KW
148 regnode_ssc emit_dummy; /* placeholder for emit to point to;
149 large enough for the largest
150 non-EXACTish node, so can use it as
151 scratch in pass1 */
830247a4
IZ
152 I32 naughty; /* How bad is this pattern? */
153 I32 sawback; /* Did we see \1, ...? */
154 U32 seen;
ea3daa5d 155 SSize_t size; /* Code size. */
538e84ed
KW
156 I32 npar; /* Capture buffer count, (OPEN) plus
157 one. ("par" 0 is the whole
158 pattern)*/
159 I32 nestroot; /* root parens we are in - used by
160 accept */
830247a4
IZ
161 I32 extralen;
162 I32 seen_zerolen;
40d049e4
YO
163 regnode **open_parens; /* pointers to open parens */
164 regnode **close_parens; /* pointers to close parens */
165 regnode *opend; /* END node in program */
02daf0ab
YO
166 I32 utf8; /* whether the pattern is utf8 or not */
167 I32 orig_utf8; /* whether the pattern was originally in utf8 */
168 /* XXX use this for future optimisation of case
169 * where pattern must be upgraded to utf8. */
e40e74fe
KW
170 I32 uni_semantics; /* If a d charset modifier should use unicode
171 rules, even if the pattern is not in
172 utf8 */
81714fb9 173 HV *paren_names; /* Paren names */
538e84ed 174
40d049e4
YO
175 regnode **recurse; /* Recurse regops */
176 I32 recurse_count; /* Number of recurse regops */
4286711a 177 U8 *study_chunk_recursed; /* bitmap of which subs we have moved
538e84ed 178 through */
09a65838 179 U32 study_chunk_recursed_bytes; /* bytes in bitmap */
b57e4118 180 I32 in_lookbehind;
4624b182 181 I32 contains_locale;
cfafade5 182 I32 contains_i;
bb3f3ed2 183 I32 override_recoding;
b6d67071
KW
184#ifdef EBCDIC
185 I32 recode_x_to_native;
186#endif
9d53c457 187 I32 in_multi_char_class;
3d2bd50a 188 struct reg_code_block *code_blocks; /* positions of literal (?{})
68e2671b 189 within pattern */
b1603ef8
DM
190 int num_code_blocks; /* size of code_blocks[] */
191 int code_index; /* next code_blocks[] slot */
ee273784 192 SSize_t maxlen; /* mininum possible number of chars in string to match */
3f910e62
YO
193 scan_frame *frame_head;
194 scan_frame *frame_last;
195 U32 frame_count;
67cdf558 196 U32 strict;
dc6d7f5c 197#ifdef ADD_TO_REGEXEC
830247a4
IZ
198 char *starttry; /* -Dr: where regtry was called. */
199#define RExC_starttry (pRExC_state->starttry)
200#endif
d24ca0c5 201 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 202#ifdef DEBUGGING
be8e71aa 203 const char *lastparse;
3dab1dad 204 I32 lastnum;
1f1031fe 205 AV *paren_name_list; /* idx -> name */
d9a72fcc 206 U32 study_chunk_recursed_count;
c9f0d54c
YO
207 SV *mysv1;
208 SV *mysv2;
3dab1dad
YO
209#define RExC_lastparse (pRExC_state->lastparse)
210#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 211#define RExC_paren_name_list (pRExC_state->paren_name_list)
d9a72fcc 212#define RExC_study_chunk_recursed_count (pRExC_state->study_chunk_recursed_count)
c9f0d54c
YO
213#define RExC_mysv (pRExC_state->mysv1)
214#define RExC_mysv1 (pRExC_state->mysv1)
215#define RExC_mysv2 (pRExC_state->mysv2)
216
3dab1dad 217#endif
512c0f5a 218 bool seen_unfolded_sharp_s;
09b2b2e6 219};
830247a4 220
e2509266 221#define RExC_flags (pRExC_state->flags)
514a91f1 222#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 223#define RExC_precomp (pRExC_state->precomp)
711b303b 224#define RExC_precomp_end (pRExC_state->precomp_end)
288b8c02 225#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 226#define RExC_rx (pRExC_state->rx)
f8fc2ecf 227#define RExC_rxi (pRExC_state->rxi)
fac92740 228#define RExC_start (pRExC_state->start)
830247a4
IZ
229#define RExC_end (pRExC_state->end)
230#define RExC_parse (pRExC_state->parse)
231#define RExC_whilem_seen (pRExC_state->whilem_seen)
512c0f5a
KW
232
233/* Set during the sizing pass when there is a LATIN SMALL LETTER SHARP S in any
234 * EXACTF node, hence was parsed under /di rules. If later in the parse,
235 * something forces the pattern into using /ui rules, the sharp s should be
236 * folded into the sequence 'ss', which takes up more space than previously
237 * calculated. This means that the sizing pass needs to be restarted. (The
238 * node also becomes an EXACTFU_SS.) For all other characters, an EXACTF node
239 * that gets converted to /ui (and EXACTFU) occupies the same amount of space,
240 * so there is no need to resize [perl #125990]. */
241#define RExC_seen_unfolded_sharp_s (pRExC_state->seen_unfolded_sharp_s)
242
7122b237 243#ifdef RE_TRACK_PATTERN_OFFSETS
538e84ed
KW
244#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the
245 others */
7122b237 246#endif
830247a4 247#define RExC_emit (pRExC_state->emit)
f7c7e32a 248#define RExC_emit_dummy (pRExC_state->emit_dummy)
fac92740 249#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 250#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
251#define RExC_sawback (pRExC_state->sawback)
252#define RExC_seen (pRExC_state->seen)
253#define RExC_size (pRExC_state->size)
ee273784 254#define RExC_maxlen (pRExC_state->maxlen)
830247a4 255#define RExC_npar (pRExC_state->npar)
e2e6a0f1 256#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
257#define RExC_extralen (pRExC_state->extralen)
258#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 259#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 260#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 261#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
262#define RExC_open_parens (pRExC_state->open_parens)
263#define RExC_close_parens (pRExC_state->close_parens)
264#define RExC_opend (pRExC_state->opend)
81714fb9 265#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
266#define RExC_recurse (pRExC_state->recurse)
267#define RExC_recurse_count (pRExC_state->recurse_count)
09a65838 268#define RExC_study_chunk_recursed (pRExC_state->study_chunk_recursed)
538e84ed
KW
269#define RExC_study_chunk_recursed_bytes \
270 (pRExC_state->study_chunk_recursed_bytes)
b57e4118 271#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 272#define RExC_contains_locale (pRExC_state->contains_locale)
cfafade5 273#define RExC_contains_i (pRExC_state->contains_i)
9d53c457 274#define RExC_override_recoding (pRExC_state->override_recoding)
b6d67071
KW
275#ifdef EBCDIC
276# define RExC_recode_x_to_native (pRExC_state->recode_x_to_native)
277#endif
9d53c457 278#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
3f910e62
YO
279#define RExC_frame_head (pRExC_state->frame_head)
280#define RExC_frame_last (pRExC_state->frame_last)
281#define RExC_frame_count (pRExC_state->frame_count)
67cdf558 282#define RExC_strict (pRExC_state->strict)
830247a4 283
99807a43
HS
284/* Heuristic check on the complexity of the pattern: if TOO_NAUGHTY, we set
285 * a flag to disable back-off on the fixed/floating substrings - if it's
286 * a high complexity pattern we assume the benefit of avoiding a full match
287 * is worth the cost of checking for the substrings even if they rarely help.
288 */
289#define RExC_naughty (pRExC_state->naughty)
290#define TOO_NAUGHTY (10)
291#define MARK_NAUGHTY(add) \
292 if (RExC_naughty < TOO_NAUGHTY) \
293 RExC_naughty += (add)
294#define MARK_NAUGHTY_EXP(exp, add) \
295 if (RExC_naughty < TOO_NAUGHTY) \
296 RExC_naughty += RExC_naughty / (exp) + (add)
cde0cee5 297
a687059c
LW
298#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
299#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
412f55bb 300 ((*s) == '{' && regcurly(s)))
a687059c
LW
301
302/*
303 * Flags to be passed up and down.
304 */
a687059c 305#define WORST 0 /* Worst case. */
a3b492c3 306#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee 307
e64f369d 308/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
309 * character. (There needs to be a case: in the switch statement in regexec.c
310 * for any node marked SIMPLE.) Note that this is not the same thing as
311 * REGNODE_SIMPLE */
fda99bee 312#define SIMPLE 0x02
e64f369d 313#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
314#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
315#define TRYAGAIN 0x10 /* Weeded out a declaration. */
b97943f7
KW
316#define RESTART_PASS1 0x20 /* Need to restart sizing pass */
317#define NEED_UTF8 0x40 /* In conjunction with RESTART_PASS1, need to
318 calcuate sizes as UTF-8 */
a687059c 319
3dab1dad
YO
320#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
321
07be1b83
YO
322/* whether trie related optimizations are enabled */
323#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
324#define TRIE_STUDY_OPT
786e8c11 325#define FULL_TRIE_STUDY
07be1b83
YO
326#define TRIE_STCLASS
327#endif
1de06328
YO
328
329
40d049e4
YO
330
331#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
332#define PBITVAL(paren) (1 << ((paren) & 7))
333#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
334#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
335#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
336
82a6ada4 337#define REQUIRE_UTF8(flagp) STMT_START { \
8d9c2815 338 if (!UTF) { \
82a6ada4
KW
339 assert(PASS1); \
340 *flagp = RESTART_PASS1|NEED_UTF8; \
8d9c2815
NC
341 return NULL; \
342 } \
82a6ada4 343 } STMT_END
40d049e4 344
512c0f5a
KW
345/* Change from /d into /u rules, and restart the parse if we've already seen
346 * something whose size would increase as a result, by setting *flagp and
347 * returning 'restart_retval'. RExC_uni_semantics is a flag that indicates
348 * we've change to /u during the parse. */
349#define REQUIRE_UNI_RULES(flagp, restart_retval) \
350 STMT_START { \
351 if (DEPENDS_SEMANTICS) { \
352 assert(PASS1); \
353 set_regex_charset(&RExC_flags, REGEX_UNICODE_CHARSET); \
354 RExC_uni_semantics = 1; \
355 if (RExC_seen_unfolded_sharp_s) { \
356 *flagp |= RESTART_PASS1; \
357 return restart_retval; \
358 } \
359 } \
360 } STMT_END
361
f19b1a63
KW
362/* This converts the named class defined in regcomp.h to its equivalent class
363 * number defined in handy.h. */
364#define namedclass_to_classnum(class) ((int) ((class) / 2))
365#define classnum_to_namedclass(classnum) ((classnum) * 2)
366
de92f5e6
KW
367#define _invlist_union_complement_2nd(a, b, output) \
368 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
369#define _invlist_intersection_complement_2nd(a, b, output) \
370 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
371
1de06328
YO
372/* About scan_data_t.
373
374 During optimisation we recurse through the regexp program performing
375 various inplace (keyhole style) optimisations. In addition study_chunk
376 and scan_commit populate this data structure with information about
538e84ed 377 what strings MUST appear in the pattern. We look for the longest
3b753521 378 string that must appear at a fixed location, and we look for the
1de06328
YO
379 longest string that may appear at a floating location. So for instance
380 in the pattern:
538e84ed 381
1de06328 382 /FOO[xX]A.*B[xX]BAR/
538e84ed 383
1de06328
YO
384 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
385 strings (because they follow a .* construct). study_chunk will identify
386 both FOO and BAR as being the longest fixed and floating strings respectively.
538e84ed 387
1de06328 388 The strings can be composites, for instance
538e84ed 389
1de06328 390 /(f)(o)(o)/
538e84ed 391
1de06328 392 will result in a composite fixed substring 'foo'.
538e84ed 393
1de06328 394 For each string some basic information is maintained:
538e84ed 395
1de06328
YO
396 - offset or min_offset
397 This is the position the string must appear at, or not before.
398 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
399 characters must match before the string we are searching for.
400 Likewise when combined with minlenp and the length of the string it
538e84ed 401 tells us how many characters must appear after the string we have
1de06328 402 found.
538e84ed 403
1de06328
YO
404 - max_offset
405 Only used for floating strings. This is the rightmost point that
ea3daa5d 406 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328 407 string can occur infinitely far to the right.
538e84ed 408
1de06328 409 - minlenp
2d608413
KW
410 A pointer to the minimum number of characters of the pattern that the
411 string was found inside. This is important as in the case of positive
538e84ed 412 lookahead or positive lookbehind we can have multiple patterns
1de06328 413 involved. Consider
538e84ed 414
1de06328 415 /(?=FOO).*F/
538e84ed 416
1de06328
YO
417 The minimum length of the pattern overall is 3, the minimum length
418 of the lookahead part is 3, but the minimum length of the part that
538e84ed 419 will actually match is 1. So 'FOO's minimum length is 3, but the
1de06328 420 minimum length for the F is 1. This is important as the minimum length
538e84ed 421 is used to determine offsets in front of and behind the string being
1de06328 422 looked for. Since strings can be composites this is the length of the
486ec47a 423 pattern at the time it was committed with a scan_commit. Note that
1de06328 424 the length is calculated by study_chunk, so that the minimum lengths
538e84ed 425 are not known until the full pattern has been compiled, thus the
1de06328 426 pointer to the value.
538e84ed 427
1de06328 428 - lookbehind
538e84ed 429
1de06328 430 In the case of lookbehind the string being searched for can be
538e84ed 431 offset past the start point of the final matching string.
1de06328
YO
432 If this value was just blithely removed from the min_offset it would
433 invalidate some of the calculations for how many chars must match
434 before or after (as they are derived from min_offset and minlen and
538e84ed 435 the length of the string being searched for).
1de06328
YO
436 When the final pattern is compiled and the data is moved from the
437 scan_data_t structure into the regexp structure the information
538e84ed
KW
438 about lookbehind is factored in, with the information that would
439 have been lost precalculated in the end_shift field for the
1de06328
YO
440 associated string.
441
442 The fields pos_min and pos_delta are used to store the minimum offset
538e84ed 443 and the delta to the maximum offset at the current point in the pattern.
1de06328
YO
444
445*/
2c2d71f5
JH
446
447typedef struct scan_data_t {
1de06328
YO
448 /*I32 len_min; unused */
449 /*I32 len_delta; unused */
49f55535 450 SSize_t pos_min;
ea3daa5d 451 SSize_t pos_delta;
2c2d71f5 452 SV *last_found;
ea3daa5d 453 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 454 SSize_t last_start_min;
ea3daa5d 455 SSize_t last_start_max;
1de06328
YO
456 SV **longest; /* Either &l_fixed, or &l_float. */
457 SV *longest_fixed; /* longest fixed string found in pattern */
49f55535 458 SSize_t offset_fixed; /* offset where it starts */
ea3daa5d 459 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
460 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
461 SV *longest_float; /* longest floating string found in pattern */
49f55535 462 SSize_t offset_float_min; /* earliest point in string it can appear */
ea3daa5d
FC
463 SSize_t offset_float_max; /* latest point in string it can appear */
464 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
49f55535 465 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
2c2d71f5
JH
466 I32 flags;
467 I32 whilem_c;
49f55535 468 SSize_t *last_closep;
b8f7bb16 469 regnode_ssc *start_class;
2c2d71f5
JH
470} scan_data_t;
471
a687059c 472/*
e50aee73 473 * Forward declarations for pregcomp()'s friends.
a687059c 474 */
a0d0e21e 475
27da23d5 476static const scan_data_t zero_scan_data =
1de06328 477 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
478
479#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
480#define SF_BEFORE_SEOL 0x0001
481#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
482#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
483#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
484
44e3dfd2
BF
485#define SF_FIX_SHIFT_EOL (+2)
486#define SF_FL_SHIFT_EOL (+4)
c277df42
IZ
487
488#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
489#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
490
491#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
492#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
493#define SF_IS_INF 0x0040
494#define SF_HAS_PAR 0x0080
495#define SF_IN_PAR 0x0100
496#define SF_HAS_EVAL 0x0200
497#define SCF_DO_SUBSTR 0x0400
653099ff
GS
498#define SCF_DO_STCLASS_AND 0x0800
499#define SCF_DO_STCLASS_OR 0x1000
500#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 501#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 502
786e8c11 503#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
538e84ed 504#define SCF_SEEN_ACCEPT 0x8000
688e0391 505#define SCF_TRIE_DOING_RESTUDY 0x10000
4286711a
YO
506#define SCF_IN_DEFINE 0x20000
507
508
509
07be1b83 510
43fead97 511#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
512
513/* The enums for all these are ordered so things work out correctly */
a62b1201 514#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
538e84ed
KW
515#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) \
516 == REGEX_DEPENDS_CHARSET)
00b27cfc 517#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
538e84ed
KW
518#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) \
519 >= REGEX_UNICODE_CHARSET)
520#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
521 == REGEX_ASCII_RESTRICTED_CHARSET)
522#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) \
523 >= REGEX_ASCII_RESTRICTED_CHARSET)
524#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) \
525 == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 526
43fead97 527#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 528
f2c2a6ab
KW
529/* For programs that want to be strictly Unicode compatible by dying if any
530 * attempt is made to match a non-Unicode code point against a Unicode
531 * property. */
532#define ALWAYS_WARN_SUPER ckDEAD(packWARN(WARN_NON_UNICODE))
533
93733859 534#define OOB_NAMEDCLASS -1
b8c5462f 535
8e661ac5
KW
536/* There is no code point that is out-of-bounds, so this is problematic. But
537 * its only current use is to initialize a variable that is always set before
538 * looked at. */
539#define OOB_UNICODE 0xDEADBEEF
540
a0ed51b3
LW
541#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
542#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
543
8615cb43 544
b45f050a
JF
545/* length of regex to show in messages that don't mark a position within */
546#define RegexLengthToShowInErrorMessages 127
547
548/*
549 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
550 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
551 * op/pragma/warn/regcomp.
552 */
7253e4e3
RK
553#define MARKER1 "<-- HERE" /* marker as it appears in the description */
554#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 555
538e84ed
KW
556#define REPORT_LOCATION " in regex; marked by " MARKER1 \
557 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
b45f050a 558
d528642a
KW
559#define REPORT_LOCATION_ARGS(loc) \
560 UTF8fARG(UTF, \
561 ((loc) > RExC_end) \
711b303b 562 ? RExC_precomp_end - RExC_precomp \
d528642a
KW
563 : (loc) - RExC_precomp, \
564 RExC_precomp), \
565 UTF8fARG(UTF, \
566 ((loc) > RExC_end) \
567 ? 0 \
711b303b 568 : RExC_precomp_end - (loc), \
d528642a 569 ((loc) > RExC_end) \
711b303b 570 ? RExC_precomp_end \
d528642a 571 : (loc))
c1d900c3 572
8a6d8ec6
HS
573/* Used to point after bad bytes for an error message, but avoid skipping
574 * past a nul byte. */
575#define SKIP_IF_CHAR(s) (!*(s) ? 0 : UTF ? UTF8SKIP(s) : 1)
576
b45f050a
JF
577/*
578 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
579 * arg. Show regex, up to a maximum length. If it's too long, chop and add
580 * "...".
581 */
58e23c8d 582#define _FAIL(code) STMT_START { \
bfed75c6 583 const char *ellipses = ""; \
711b303b 584 IV len = RExC_precomp_end - RExC_precomp; \
ccb2c380
MP
585 \
586 if (!SIZE_ONLY) \
a5e7bc51 587 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
588 if (len > RegexLengthToShowInErrorMessages) { \
589 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
590 len = RegexLengthToShowInErrorMessages - 10; \
591 ellipses = "..."; \
592 } \
58e23c8d 593 code; \
ccb2c380 594} STMT_END
8615cb43 595
58e23c8d 596#define FAIL(msg) _FAIL( \
c1d900c3
BF
597 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
598 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
599
600#define FAIL2(msg,arg) _FAIL( \
c1d900c3
BF
601 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
602 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 603
b45f050a 604/*
b45f050a
JF
605 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
606 */
ccb2c380 607#define Simple_vFAIL(m) STMT_START { \
ccb2c380 608 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
d528642a 609 m, REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 610} STMT_END
b45f050a
JF
611
612/*
613 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
614 */
ccb2c380
MP
615#define vFAIL(m) STMT_START { \
616 if (!SIZE_ONLY) \
a5e7bc51 617 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
618 Simple_vFAIL(m); \
619} STMT_END
b45f050a
JF
620
621/*
622 * Like Simple_vFAIL(), but accepts two arguments.
623 */
ccb2c380 624#define Simple_vFAIL2(m,a1) STMT_START { \
d528642a
KW
625 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
626 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 627} STMT_END
b45f050a
JF
628
629/*
630 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
631 */
ccb2c380
MP
632#define vFAIL2(m,a1) STMT_START { \
633 if (!SIZE_ONLY) \
a5e7bc51 634 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
635 Simple_vFAIL2(m, a1); \
636} STMT_END
b45f050a
JF
637
638
639/*
640 * Like Simple_vFAIL(), but accepts three arguments.
641 */
ccb2c380 642#define Simple_vFAIL3(m, a1, a2) STMT_START { \
c1d900c3 643 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 644 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 645} STMT_END
b45f050a
JF
646
647/*
648 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
649 */
ccb2c380
MP
650#define vFAIL3(m,a1,a2) STMT_START { \
651 if (!SIZE_ONLY) \
a5e7bc51 652 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
653 Simple_vFAIL3(m, a1, a2); \
654} STMT_END
b45f050a
JF
655
656/*
657 * Like Simple_vFAIL(), but accepts four arguments.
658 */
ccb2c380 659#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
d528642a
KW
660 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
661 REPORT_LOCATION_ARGS(RExC_parse)); \
ccb2c380 662} STMT_END
b45f050a 663
95db3ffa
KW
664#define vFAIL4(m,a1,a2,a3) STMT_START { \
665 if (!SIZE_ONLY) \
666 SAVEFREESV(RExC_rx_sv); \
667 Simple_vFAIL4(m, a1, a2, a3); \
668} STMT_END
669
946095af 670/* A specialized version of vFAIL2 that works with UTF8f */
d528642a
KW
671#define vFAIL2utf8f(m, a1) STMT_START { \
672 if (!SIZE_ONLY) \
673 SAVEFREESV(RExC_rx_sv); \
674 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
675 REPORT_LOCATION_ARGS(RExC_parse)); \
946095af
BF
676} STMT_END
677
3ba22297 678#define vFAIL3utf8f(m, a1, a2) STMT_START { \
3ba22297
KW
679 if (!SIZE_ONLY) \
680 SAVEFREESV(RExC_rx_sv); \
681 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
d528642a 682 REPORT_LOCATION_ARGS(RExC_parse)); \
3ba22297
KW
683} STMT_END
684
499333dc
KW
685/* These have asserts in them because of [perl #122671] Many warnings in
686 * regcomp.c can occur twice. If they get output in pass1 and later in that
687 * pass, the pattern has to be converted to UTF-8 and the pass restarted, they
688 * would get output again. So they should be output in pass2, and these
689 * asserts make sure new warnings follow that paradigm. */
946095af 690
5e0a247b
KW
691/* m is not necessarily a "literal string", in this macro */
692#define reg_warn_non_literal_string(loc, m) STMT_START { \
d528642a
KW
693 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
694 "%s" REPORT_LOCATION, \
695 m, REPORT_LOCATION_ARGS(loc)); \
5e0a247b
KW
696} STMT_END
697
668c081a 698#define ckWARNreg(loc,m) STMT_START { \
d528642a
KW
699 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
700 m REPORT_LOCATION, \
701 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
702} STMT_END
703
b927b7e9 704#define vWARN(loc, m) STMT_START { \
d528642a
KW
705 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
706 m REPORT_LOCATION, \
707 REPORT_LOCATION_ARGS(loc)); \
b927b7e9
KW
708} STMT_END
709
0d6106aa 710#define vWARN_dep(loc, m) STMT_START { \
d528642a
KW
711 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), \
712 m REPORT_LOCATION, \
713 REPORT_LOCATION_ARGS(loc)); \
0d6106aa
KW
714} STMT_END
715
147508a2 716#define ckWARNdep(loc,m) STMT_START { \
d528642a
KW
717 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
718 m REPORT_LOCATION, \
719 REPORT_LOCATION_ARGS(loc)); \
147508a2
KW
720} STMT_END
721
d528642a
KW
722#define ckWARNregdep(loc,m) STMT_START { \
723 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, \
724 WARN_REGEXP), \
725 m REPORT_LOCATION, \
726 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
727} STMT_END
728
d528642a
KW
729#define ckWARN2reg_d(loc,m, a1) STMT_START { \
730 __ASSERT_(PASS2) Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
731 m REPORT_LOCATION, \
732 a1, REPORT_LOCATION_ARGS(loc)); \
2335b3d3
KW
733} STMT_END
734
d528642a
KW
735#define ckWARN2reg(loc, m, a1) STMT_START { \
736 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
737 m REPORT_LOCATION, \
738 a1, REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
739} STMT_END
740
d528642a
KW
741#define vWARN3(loc, m, a1, a2) STMT_START { \
742 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
743 m REPORT_LOCATION, \
744 a1, a2, REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
745} STMT_END
746
d528642a
KW
747#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
748 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
749 m REPORT_LOCATION, \
750 a1, a2, \
751 REPORT_LOCATION_ARGS(loc)); \
668c081a
NC
752} STMT_END
753
ccb2c380 754#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
d528642a
KW
755 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
756 m REPORT_LOCATION, \
757 a1, a2, a3, \
758 REPORT_LOCATION_ARGS(loc)); \
ccb2c380
MP
759} STMT_END
760
668c081a 761#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
d528642a
KW
762 __ASSERT_(PASS2) Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), \
763 m REPORT_LOCATION, \
764 a1, a2, a3, \
765 REPORT_LOCATION_ARGS(loc)); \
668c081a
NC
766} STMT_END
767
ccb2c380 768#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
d528642a
KW
769 __ASSERT_(PASS2) Perl_warner(aTHX_ packWARN(WARN_REGEXP), \
770 m REPORT_LOCATION, \
771 a1, a2, a3, a4, \
772 REPORT_LOCATION_ARGS(loc)); \
ccb2c380 773} STMT_END
9d1d55b5 774
538e84ed 775/* Macros for recording node offsets. 20001227 mjd@plover.com
fac92740
MJD
776 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
777 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
778 * Element 0 holds the number n.
07be1b83 779 * Position is 1 indexed.
fac92740 780 */
7122b237
YO
781#ifndef RE_TRACK_PATTERN_OFFSETS
782#define Set_Node_Offset_To_R(node,byte)
783#define Set_Node_Offset(node,byte)
784#define Set_Cur_Node_Offset
785#define Set_Node_Length_To_R(node,len)
786#define Set_Node_Length(node,len)
6a86c6ad 787#define Set_Node_Cur_Length(node,start)
538e84ed
KW
788#define Node_Offset(n)
789#define Node_Length(n)
7122b237
YO
790#define Set_Node_Offset_Length(node,offset,len)
791#define ProgLen(ri) ri->u.proglen
792#define SetProgLen(ri,x) ri->u.proglen = x
793#else
794#define ProgLen(ri) ri->u.offsets[0]
795#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
796#define Set_Node_Offset_To_R(node,byte) STMT_START { \
797 if (! SIZE_ONLY) { \
798 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 799 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 800 if((node) < 0) { \
538e84ed
KW
801 Perl_croak(aTHX_ "value of node is %d in Offset macro", \
802 (int)(node)); \
ccb2c380
MP
803 } else { \
804 RExC_offsets[2*(node)-1] = (byte); \
805 } \
806 } \
807} STMT_END
808
809#define Set_Node_Offset(node,byte) \
810 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
811#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
812
813#define Set_Node_Length_To_R(node,len) STMT_START { \
814 if (! SIZE_ONLY) { \
815 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 816 __LINE__, (int)(node), (int)(len))); \
ccb2c380 817 if((node) < 0) { \
538e84ed
KW
818 Perl_croak(aTHX_ "value of node is %d in Length macro", \
819 (int)(node)); \
ccb2c380
MP
820 } else { \
821 RExC_offsets[2*(node)] = (len); \
822 } \
823 } \
824} STMT_END
825
826#define Set_Node_Length(node,len) \
827 Set_Node_Length_To_R((node)-RExC_emit_start, len)
6a86c6ad
NC
828#define Set_Node_Cur_Length(node, start) \
829 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
830
831/* Get offsets and lengths */
832#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
833#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
834
07be1b83
YO
835#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
836 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
837 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
838} STMT_END
7122b237 839#endif
07be1b83
YO
840
841#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
842#define EXPERIMENTAL_INPLACESCAN
f427392e 843#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 844
9e9ecfdf 845#define DEBUG_RExC_seen() \
538e84ed
KW
846 DEBUG_OPTIMISE_MORE_r({ \
847 PerlIO_printf(Perl_debug_log,"RExC_seen: "); \
848 \
e384d5c1
YO
849 if (RExC_seen & REG_ZERO_LEN_SEEN) \
850 PerlIO_printf(Perl_debug_log,"REG_ZERO_LEN_SEEN "); \
538e84ed 851 \
e384d5c1
YO
852 if (RExC_seen & REG_LOOKBEHIND_SEEN) \
853 PerlIO_printf(Perl_debug_log,"REG_LOOKBEHIND_SEEN "); \
538e84ed 854 \
e384d5c1
YO
855 if (RExC_seen & REG_GPOS_SEEN) \
856 PerlIO_printf(Perl_debug_log,"REG_GPOS_SEEN "); \
538e84ed 857 \
e384d5c1
YO
858 if (RExC_seen & REG_RECURSE_SEEN) \
859 PerlIO_printf(Perl_debug_log,"REG_RECURSE_SEEN "); \
538e84ed 860 \
e384d5c1
YO
861 if (RExC_seen & REG_TOP_LEVEL_BRANCHES_SEEN) \
862 PerlIO_printf(Perl_debug_log,"REG_TOP_LEVEL_BRANCHES_SEEN "); \
538e84ed 863 \
e384d5c1
YO
864 if (RExC_seen & REG_VERBARG_SEEN) \
865 PerlIO_printf(Perl_debug_log,"REG_VERBARG_SEEN "); \
538e84ed 866 \
e384d5c1
YO
867 if (RExC_seen & REG_CUTGROUP_SEEN) \
868 PerlIO_printf(Perl_debug_log,"REG_CUTGROUP_SEEN "); \
538e84ed 869 \
e384d5c1
YO
870 if (RExC_seen & REG_RUN_ON_COMMENT_SEEN) \
871 PerlIO_printf(Perl_debug_log,"REG_RUN_ON_COMMENT_SEEN "); \
538e84ed 872 \
e384d5c1
YO
873 if (RExC_seen & REG_UNFOLDED_MULTI_SEEN) \
874 PerlIO_printf(Perl_debug_log,"REG_UNFOLDED_MULTI_SEEN "); \
538e84ed 875 \
e384d5c1
YO
876 if (RExC_seen & REG_GOSTART_SEEN) \
877 PerlIO_printf(Perl_debug_log,"REG_GOSTART_SEEN "); \
538e84ed 878 \
ee273784
YO
879 if (RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN) \
880 PerlIO_printf(Perl_debug_log,"REG_UNBOUNDED_QUANTIFIER_SEEN "); \
881 \
538e84ed 882 PerlIO_printf(Perl_debug_log,"\n"); \
9e9ecfdf
YO
883 });
884
fdfb4f21
YO
885#define DEBUG_SHOW_STUDY_FLAG(flags,flag) \
886 if ((flags) & flag) PerlIO_printf(Perl_debug_log, "%s ", #flag)
887
888#define DEBUG_SHOW_STUDY_FLAGS(flags,open_str,close_str) \
889 if ( ( flags ) ) { \
890 PerlIO_printf(Perl_debug_log, "%s", open_str); \
891 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_SEOL); \
892 DEBUG_SHOW_STUDY_FLAG(flags,SF_FL_BEFORE_MEOL); \
893 DEBUG_SHOW_STUDY_FLAG(flags,SF_IS_INF); \
894 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_PAR); \
895 DEBUG_SHOW_STUDY_FLAG(flags,SF_IN_PAR); \
896 DEBUG_SHOW_STUDY_FLAG(flags,SF_HAS_EVAL); \
897 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_SUBSTR); \
898 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_AND); \
899 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS_OR); \
900 DEBUG_SHOW_STUDY_FLAG(flags,SCF_DO_STCLASS); \
901 DEBUG_SHOW_STUDY_FLAG(flags,SCF_WHILEM_VISITED_POS); \
902 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_RESTUDY); \
903 DEBUG_SHOW_STUDY_FLAG(flags,SCF_SEEN_ACCEPT); \
904 DEBUG_SHOW_STUDY_FLAG(flags,SCF_TRIE_DOING_RESTUDY); \
905 DEBUG_SHOW_STUDY_FLAG(flags,SCF_IN_DEFINE); \
906 PerlIO_printf(Perl_debug_log, "%s", close_str); \
907 }
908
909
304ee84b
YO
910#define DEBUG_STUDYDATA(str,data,depth) \
911DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 912 PerlIO_printf(Perl_debug_log, \
304ee84b 913 "%*s" str "Pos:%"IVdf"/%"IVdf \
fdfb4f21 914 " Flags: 0x%"UVXf, \
1de06328
YO
915 (int)(depth)*2, "", \
916 (IV)((data)->pos_min), \
917 (IV)((data)->pos_delta), \
fdfb4f21
YO
918 (UV)((data)->flags) \
919 ); \
920 DEBUG_SHOW_STUDY_FLAGS((data)->flags," [ ","]"); \
921 PerlIO_printf(Perl_debug_log, \
922 " Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328 923 (IV)((data)->whilem_c), \
304ee84b
YO
924 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
925 is_inf ? "INF " : "" \
1de06328
YO
926 ); \
927 if ((data)->last_found) \
928 PerlIO_printf(Perl_debug_log, \
929 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
930 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
931 SvPVX_const((data)->last_found), \
932 (IV)((data)->last_end), \
933 (IV)((data)->last_start_min), \
934 (IV)((data)->last_start_max), \
935 ((data)->longest && \
936 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
937 SvPVX_const((data)->longest_fixed), \
938 (IV)((data)->offset_fixed), \
939 ((data)->longest && \
940 (data)->longest==&((data)->longest_float)) ? "*" : "", \
941 SvPVX_const((data)->longest_float), \
942 (IV)((data)->offset_float_min), \
943 (IV)((data)->offset_float_max) \
944 ); \
945 PerlIO_printf(Perl_debug_log,"\n"); \
946});
947
8e35b056
KW
948/* is c a control character for which we have a mnemonic? */
949#define isMNEMONIC_CNTRL(c) _IS_MNEMONIC_CNTRL_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
950
549b4e78
KW
951STATIC const char *
952S_cntrl_to_mnemonic(const U8 c)
953{
954 /* Returns the mnemonic string that represents character 'c', if one
955 * exists; NULL otherwise. The only ones that exist for the purposes of
956 * this routine are a few control characters */
957
958 switch (c) {
959 case '\a': return "\\a";
960 case '\b': return "\\b";
961 case ESC_NATIVE: return "\\e";
962 case '\f': return "\\f";
963 case '\n': return "\\n";
964 case '\r': return "\\r";
965 case '\t': return "\\t";
966 }
967
968 return NULL;
969}
970
653099ff 971/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 972 Update the longest found anchored substring and the longest found
653099ff
GS
973 floating substrings if needed. */
974
4327152a 975STATIC void
ea3daa5d
FC
976S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
977 SSize_t *minlenp, int is_inf)
c277df42 978{
e1ec3a88
AL
979 const STRLEN l = CHR_SVLEN(data->last_found);
980 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 981 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 982
7918f24d
NC
983 PERL_ARGS_ASSERT_SCAN_COMMIT;
984
c277df42 985 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 986 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
987 if (*data->longest == data->longest_fixed) {
988 data->offset_fixed = l ? data->last_start_min : data->pos_min;
989 if (data->flags & SF_BEFORE_EOL)
b81d288d 990 data->flags
c277df42
IZ
991 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
992 else
993 data->flags &= ~SF_FIX_BEFORE_EOL;
686b73d4 994 data->minlen_fixed=minlenp;
1de06328 995 data->lookbehind_fixed=0;
a0ed51b3 996 }
304ee84b 997 else { /* *data->longest == data->longest_float */
c277df42 998 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d 999 data->offset_float_max = (l
646e8787
DM
1000 ? data->last_start_max
1001 : (data->pos_delta > SSize_t_MAX - data->pos_min
ea3daa5d
FC
1002 ? SSize_t_MAX
1003 : data->pos_min + data->pos_delta));
1004 if (is_inf
1005 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
1006 data->offset_float_max = SSize_t_MAX;
c277df42 1007 if (data->flags & SF_BEFORE_EOL)
b81d288d 1008 data->flags
c277df42
IZ
1009 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
1010 else
1011 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
1012 data->minlen_float=minlenp;
1013 data->lookbehind_float=0;
c277df42
IZ
1014 }
1015 }
1016 SvCUR_set(data->last_found, 0);
0eda9292 1017 {
a28509cc 1018 SV * const sv = data->last_found;
097eb12c
AL
1019 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1020 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1021 if (mg)
1022 mg->mg_len = 0;
1023 }
0eda9292 1024 }
c277df42
IZ
1025 data->last_end = -1;
1026 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 1027 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
1028}
1029
cdd87c1d
KW
1030/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1031 * list that describes which code points it matches */
1032
653099ff 1033STATIC void
3420edd7 1034S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 1035{
cdd87c1d
KW
1036 /* Set the SSC 'ssc' to match an empty string or any code point */
1037
557bd3fb 1038 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 1039
71068078 1040 assert(is_ANYOF_SYNTHETIC(ssc));
3fffb88a 1041
cdd87c1d
KW
1042 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
1043 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
93e92956 1044 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
653099ff
GS
1045}
1046
653099ff 1047STATIC int
dc3bf405 1048S_ssc_is_anything(const regnode_ssc *ssc)
653099ff 1049{
c144baaa
KW
1050 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
1051 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
1052 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
1053 * in any way, so there's no point in using it */
cdd87c1d
KW
1054
1055 UV start, end;
1056 bool ret;
653099ff 1057
557bd3fb 1058 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 1059
71068078 1060 assert(is_ANYOF_SYNTHETIC(ssc));
cdd87c1d 1061
93e92956 1062 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
cdd87c1d
KW
1063 return FALSE;
1064 }
1065
1066 /* See if the list consists solely of the range 0 - Infinity */
1067 invlist_iterinit(ssc->invlist);
1068 ret = invlist_iternext(ssc->invlist, &start, &end)
1069 && start == 0
1070 && end == UV_MAX;
1071
1072 invlist_iterfinish(ssc->invlist);
1073
1074 if (ret) {
1075 return TRUE;
1076 }
1077
1078 /* If e.g., both \w and \W are set, matches everything */
e0e1be5f 1079 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1080 int i;
1081 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1082 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1083 return TRUE;
1084 }
1085 }
1086 }
1087
1088 return FALSE;
653099ff
GS
1089}
1090
653099ff 1091STATIC void
cdd87c1d 1092S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 1093{
cdd87c1d
KW
1094 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1095 * string, any code point, or any posix class under locale */
1096
557bd3fb 1097 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 1098
557bd3fb 1099 Zero(ssc, 1, regnode_ssc);
71068078 1100 set_ANYOF_SYNTHETIC(ssc);
93e92956 1101 ARG_SET(ssc, ANYOF_ONLY_HAS_BITMAP);
3420edd7 1102 ssc_anything(ssc);
cdd87c1d 1103
2f306ab9
KW
1104 /* If any portion of the regex is to operate under locale rules that aren't
1105 * fully known at compile time, initialization includes it. The reason
1106 * this isn't done for all regexes is that the optimizer was written under
1107 * the assumption that locale was all-or-nothing. Given the complexity and
1108 * lack of documentation in the optimizer, and that there are inadequate
1109 * test cases for locale, many parts of it may not work properly, it is
1110 * safest to avoid locale unless necessary. */
cdd87c1d
KW
1111 if (RExC_contains_locale) {
1112 ANYOF_POSIXL_SETALL(ssc);
cdd87c1d
KW
1113 }
1114 else {
1115 ANYOF_POSIXL_ZERO(ssc);
1116 }
653099ff
GS
1117}
1118
b423522f 1119STATIC int
dc3bf405
BF
1120S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
1121 const regnode_ssc *ssc)
b423522f
KW
1122{
1123 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
1124 * to the list of code points matched, and locale posix classes; hence does
1125 * not check its flags) */
1126
1127 UV start, end;
1128 bool ret;
1129
1130 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
1131
71068078 1132 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1133
1134 invlist_iterinit(ssc->invlist);
1135 ret = invlist_iternext(ssc->invlist, &start, &end)
1136 && start == 0
1137 && end == UV_MAX;
1138
1139 invlist_iterfinish(ssc->invlist);
1140
1141 if (! ret) {
1142 return FALSE;
1143 }
1144
e0e1be5f 1145 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
31f05a37 1146 return FALSE;
b423522f
KW
1147 }
1148
1149 return TRUE;
1150}
1151
1152STATIC SV*
1153S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
5c0f85ef 1154 const regnode_charclass* const node)
b423522f
KW
1155{
1156 /* Returns a mortal inversion list defining which code points are matched
1157 * by 'node', which is of type ANYOF. Handles complementing the result if
1158 * appropriate. If some code points aren't knowable at this time, the
31f05a37
KW
1159 * returned list must, and will, contain every code point that is a
1160 * possibility. */
b423522f
KW
1161
1162 SV* invlist = sv_2mortal(_new_invlist(0));
1ee208c4 1163 SV* only_utf8_locale_invlist = NULL;
b423522f
KW
1164 unsigned int i;
1165 const U32 n = ARG(node);
31f05a37 1166 bool new_node_has_latin1 = FALSE;
b423522f
KW
1167
1168 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
1169
1170 /* Look at the data structure created by S_set_ANYOF_arg() */
93e92956 1171 if (n != ANYOF_ONLY_HAS_BITMAP) {
b423522f
KW
1172 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
1173 AV * const av = MUTABLE_AV(SvRV(rv));
1174 SV **const ary = AvARRAY(av);
1175 assert(RExC_rxi->data->what[n] == 's');
1176
1177 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
1178 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
1179 }
1180 else if (ary[0] && ary[0] != &PL_sv_undef) {
1181
1182 /* Here, no compile-time swash, and there are things that won't be
1183 * known until runtime -- we have to assume it could be anything */
1184 return _add_range_to_invlist(invlist, 0, UV_MAX);
1185 }
1ee208c4 1186 else if (ary[3] && ary[3] != &PL_sv_undef) {
b423522f
KW
1187
1188 /* Here no compile-time swash, and no run-time only data. Use the
1189 * node's inversion list */
1ee208c4
KW
1190 invlist = sv_2mortal(invlist_clone(ary[3]));
1191 }
1192
1193 /* Get the code points valid only under UTF-8 locales */
1194 if ((ANYOF_FLAGS(node) & ANYOF_LOC_FOLD)
1195 && ary[2] && ary[2] != &PL_sv_undef)
1196 {
1197 only_utf8_locale_invlist = ary[2];
b423522f
KW
1198 }
1199 }
1200
dcb20b36
KW
1201 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
1202 * code points, and an inversion list for the others, but if there are code
1203 * points that should match only conditionally on the target string being
1204 * UTF-8, those are placed in the inversion list, and not the bitmap.
1205 * Since there are circumstances under which they could match, they are
1206 * included in the SSC. But if the ANYOF node is to be inverted, we have
1207 * to exclude them here, so that when we invert below, the end result
1208 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
1209 * have to do this here before we add the unconditionally matched code
1210 * points */
b423522f
KW
1211 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1212 _invlist_intersection_complement_2nd(invlist,
1213 PL_UpperLatin1,
1214 &invlist);
1215 }
1216
1217 /* Add in the points from the bit map */
dcb20b36 1218 for (i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
b423522f
KW
1219 if (ANYOF_BITMAP_TEST(node, i)) {
1220 invlist = add_cp_to_invlist(invlist, i);
31f05a37 1221 new_node_has_latin1 = TRUE;
b423522f
KW
1222 }
1223 }
1224
1225 /* If this can match all upper Latin1 code points, have to add them
1226 * as well */
f240c685
KW
1227 if (OP(node) == ANYOFD
1228 && (ANYOF_FLAGS(node) & ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER))
1229 {
b423522f
KW
1230 _invlist_union(invlist, PL_UpperLatin1, &invlist);
1231 }
1232
1233 /* Similarly for these */
93e92956 1234 if (ANYOF_FLAGS(node) & ANYOF_MATCHES_ALL_ABOVE_BITMAP) {
e0a1ff7a 1235 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
b423522f
KW
1236 }
1237
1238 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
1239 _invlist_invert(invlist);
1240 }
31f05a37
KW
1241 else if (new_node_has_latin1 && ANYOF_FLAGS(node) & ANYOF_LOC_FOLD) {
1242
1243 /* Under /li, any 0-255 could fold to any other 0-255, depending on the
1244 * locale. We can skip this if there are no 0-255 at all. */
1245 _invlist_union(invlist, PL_Latin1, &invlist);
1246 }
1247
1ee208c4
KW
1248 /* Similarly add the UTF-8 locale possible matches. These have to be
1249 * deferred until after the non-UTF-8 locale ones are taken care of just
1250 * above, or it leads to wrong results under ANYOF_INVERT */
1251 if (only_utf8_locale_invlist) {
31f05a37 1252 _invlist_union_maybe_complement_2nd(invlist,
1ee208c4 1253 only_utf8_locale_invlist,
31f05a37
KW
1254 ANYOF_FLAGS(node) & ANYOF_INVERT,
1255 &invlist);
1256 }
b423522f
KW
1257
1258 return invlist;
1259}
1260
1051e1c4 1261/* These two functions currently do the exact same thing */
557bd3fb 1262#define ssc_init_zero ssc_init
653099ff 1263
cdd87c1d
KW
1264#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1265#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1266
557bd3fb 1267/* 'AND' a given class with another one. Can create false positives. 'ssc'
93e92956
KW
1268 * should not be inverted. 'and_with->flags & ANYOF_MATCHES_POSIXL' should be
1269 * 0 if 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1270
653099ff 1271STATIC void
b423522f 1272S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1273 const regnode_charclass *and_with)
653099ff 1274{
cdd87c1d
KW
1275 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1276 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1277
a0dd4231
KW
1278 SV* anded_cp_list;
1279 U8 anded_flags;
1e6ade67 1280
cdd87c1d 1281 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1282
71068078 1283 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1284
1285 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
1286 * the code point inversion list and just the relevant flags */
71068078 1287 if (is_ANYOF_SYNTHETIC(and_with)) {
7dcac5f6 1288 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
a0dd4231 1289 anded_flags = ANYOF_FLAGS(and_with);
e9b08962
KW
1290
1291 /* XXX This is a kludge around what appears to be deficiencies in the
1292 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
1293 * there are paths through the optimizer where it doesn't get weeded
1294 * out when it should. And if we don't make some extra provision for
1295 * it like the code just below, it doesn't get added when it should.
1296 * This solution is to add it only when AND'ing, which is here, and
1297 * only when what is being AND'ed is the pristine, original node
1298 * matching anything. Thus it is like adding it to ssc_anything() but
1299 * only when the result is to be AND'ed. Probably the same solution
1300 * could be adopted for the same problem we have with /l matching,
1301 * which is solved differently in S_ssc_init(), and that would lead to
1302 * fewer false positives than that solution has. But if this solution
1303 * creates bugs, the consequences are only that a warning isn't raised
1304 * that should be; while the consequences for having /l bugs is
1305 * incorrect matches */
7dcac5f6 1306 if (ssc_is_anything((regnode_ssc *)and_with)) {
f240c685 1307 anded_flags |= ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER;
e9b08962 1308 }
a0dd4231
KW
1309 }
1310 else {
5c0f85ef 1311 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
f240c685
KW
1312 if (OP(and_with) == ANYOFD) {
1313 anded_flags = ANYOF_FLAGS(and_with) & ANYOF_COMMON_FLAGS;
1314 }
1315 else {
1316 anded_flags = ANYOF_FLAGS(and_with)
1317 &( ANYOF_COMMON_FLAGS
108316fb
KW
1318 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1319 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
f240c685 1320 }
a0dd4231
KW
1321 }
1322
1323 ANYOF_FLAGS(ssc) &= anded_flags;
cdd87c1d
KW
1324
1325 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1326 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1327 * 'and_with' may be inverted. When not inverted, we have the situation of
1328 * computing:
1329 * (C1 | P1) & (C2 | P2)
1330 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1331 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1332 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1333 * <= ((C1 & C2) | P1 | P2)
1334 * Alternatively, the last few steps could be:
1335 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1336 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1337 * <= (C1 | C2 | (P1 & P2))
1338 * We favor the second approach if either P1 or P2 is non-empty. This is
1339 * because these components are a barrier to doing optimizations, as what
1340 * they match cannot be known until the moment of matching as they are
1341 * dependent on the current locale, 'AND"ing them likely will reduce or
1342 * eliminate them.
1343 * But we can do better if we know that C1,P1 are in their initial state (a
1344 * frequent occurrence), each matching everything:
1345 * (<everything>) & (C2 | P2) = C2 | P2
1346 * Similarly, if C2,P2 are in their initial state (again a frequent
1347 * occurrence), the result is a no-op
1348 * (C1 | P1) & (<everything>) = C1 | P1
1349 *
1350 * Inverted, we have
1351 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1352 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1353 * <= (C1 & ~C2) | (P1 & ~P2)
1354 * */
1aa99e6b 1355
a0dd4231 1356 if ((ANYOF_FLAGS(and_with) & ANYOF_INVERT)
71068078 1357 && ! is_ANYOF_SYNTHETIC(and_with))
a0dd4231 1358 {
cdd87c1d 1359 unsigned int i;
8951c461 1360
cdd87c1d
KW
1361 ssc_intersection(ssc,
1362 anded_cp_list,
1363 FALSE /* Has already been inverted */
1364 );
c6b76537 1365
cdd87c1d
KW
1366 /* If either P1 or P2 is empty, the intersection will be also; can skip
1367 * the loop */
93e92956 1368 if (! (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL)) {
cdd87c1d
KW
1369 ANYOF_POSIXL_ZERO(ssc);
1370 }
e0e1be5f 1371 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1372
1373 /* Note that the Posix class component P from 'and_with' actually
1374 * looks like:
1375 * P = Pa | Pb | ... | Pn
1376 * where each component is one posix class, such as in [\w\s].
1377 * Thus
1378 * ~P = ~(Pa | Pb | ... | Pn)
1379 * = ~Pa & ~Pb & ... & ~Pn
1380 * <= ~Pa | ~Pb | ... | ~Pn
1381 * The last is something we can easily calculate, but unfortunately
1382 * is likely to have many false positives. We could do better
1383 * in some (but certainly not all) instances if two classes in
1384 * P have known relationships. For example
1385 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1386 * So
1387 * :lower: & :print: = :lower:
1388 * And similarly for classes that must be disjoint. For example,
1389 * since \s and \w can have no elements in common based on rules in
1390 * the POSIX standard,
1391 * \w & ^\S = nothing
1392 * Unfortunately, some vendor locales do not meet the Posix
1393 * standard, in particular almost everything by Microsoft.
1394 * The loop below just changes e.g., \w into \W and vice versa */
1395
1ee208c4 1396 regnode_charclass_posixl temp;
cdd87c1d
KW
1397 int add = 1; /* To calculate the index of the complement */
1398
1399 ANYOF_POSIXL_ZERO(&temp);
1400 for (i = 0; i < ANYOF_MAX; i++) {
1401 assert(i % 2 != 0
7dcac5f6
KW
1402 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
1403 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
cdd87c1d 1404
7dcac5f6 1405 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
cdd87c1d
KW
1406 ANYOF_POSIXL_SET(&temp, i + add);
1407 }
1408 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1409 }
1410 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1411
cdd87c1d
KW
1412 } /* else ssc already has no posixes */
1413 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1414 in its initial state */
71068078 1415 else if (! is_ANYOF_SYNTHETIC(and_with)
7dcac5f6 1416 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
cdd87c1d
KW
1417 {
1418 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1419 * copy it over 'ssc' */
1420 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
71068078 1421 if (is_ANYOF_SYNTHETIC(and_with)) {
cdd87c1d
KW
1422 StructCopy(and_with, ssc, regnode_ssc);
1423 }
1424 else {
1425 ssc->invlist = anded_cp_list;
1426 ANYOF_POSIXL_ZERO(ssc);
93e92956 1427 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1428 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
cdd87c1d
KW
1429 }
1430 }
1431 }
e0e1be5f 1432 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
93e92956 1433 || (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL))
cdd87c1d
KW
1434 {
1435 /* One or the other of P1, P2 is non-empty. */
93e92956 1436 if (ANYOF_FLAGS(and_with) & ANYOF_MATCHES_POSIXL) {
1ea8b7fe
KW
1437 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
1438 }
cdd87c1d
KW
1439 ssc_union(ssc, anded_cp_list, FALSE);
1440 }
1441 else { /* P1 = P2 = empty */
1442 ssc_intersection(ssc, anded_cp_list, FALSE);
1443 }
137165a6 1444 }
653099ff
GS
1445}
1446
653099ff 1447STATIC void
cdd87c1d 1448S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
7dcac5f6 1449 const regnode_charclass *or_with)
653099ff 1450{
cdd87c1d
KW
1451 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1452 * another SSC or a regular ANYOF class. Can create false positives if
1453 * 'or_with' is to be inverted. */
7918f24d 1454
a0dd4231
KW
1455 SV* ored_cp_list;
1456 U8 ored_flags;
c6b76537 1457
cdd87c1d 1458 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 1459
71068078 1460 assert(is_ANYOF_SYNTHETIC(ssc));
a0dd4231
KW
1461
1462 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
1463 * the code point inversion list and just the relevant flags */
71068078 1464 if (is_ANYOF_SYNTHETIC(or_with)) {
7dcac5f6 1465 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
a0dd4231
KW
1466 ored_flags = ANYOF_FLAGS(or_with);
1467 }
1468 else {
5c0f85ef 1469 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
eff8b7dc 1470 ored_flags = ANYOF_FLAGS(or_with) & ANYOF_COMMON_FLAGS;
f240c685
KW
1471 if (OP(or_with) != ANYOFD) {
1472 ored_flags
1473 |= ANYOF_FLAGS(or_with)
108316fb
KW
1474 & ( ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1475 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP);
f240c685 1476 }
a0dd4231
KW
1477 }
1478
1479 ANYOF_FLAGS(ssc) |= ored_flags;
cdd87c1d
KW
1480
1481 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1482 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1483 * 'or_with' may be inverted. When not inverted, we have the simple
1484 * situation of computing:
1485 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1486 * If P1|P2 yields a situation with both a class and its complement are
1487 * set, like having both \w and \W, this matches all code points, and we
1488 * can delete these from the P component of the ssc going forward. XXX We
1489 * might be able to delete all the P components, but I (khw) am not certain
1490 * about this, and it is better to be safe.
1491 *
1492 * Inverted, we have
1493 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1494 * <= (C1 | P1) | ~C2
1495 * <= (C1 | ~C2) | P1
1496 * (which results in actually simpler code than the non-inverted case)
1497 * */
9826f543 1498
a0dd4231 1499 if ((ANYOF_FLAGS(or_with) & ANYOF_INVERT)
71068078 1500 && ! is_ANYOF_SYNTHETIC(or_with))
a0dd4231 1501 {
cdd87c1d 1502 /* We ignore P2, leaving P1 going forward */
1ea8b7fe 1503 } /* else Not inverted */
93e92956 1504 else if (ANYOF_FLAGS(or_with) & ANYOF_MATCHES_POSIXL) {
7dcac5f6 1505 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
e0e1be5f 1506 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
cdd87c1d
KW
1507 unsigned int i;
1508 for (i = 0; i < ANYOF_MAX; i += 2) {
1509 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1510 {
1511 ssc_match_all_cp(ssc);
1512 ANYOF_POSIXL_CLEAR(ssc, i);
1513 ANYOF_POSIXL_CLEAR(ssc, i+1);
cdd87c1d
KW
1514 }
1515 }
1516 }
1aa99e6b 1517 }
cdd87c1d
KW
1518
1519 ssc_union(ssc,
1520 ored_cp_list,
1521 FALSE /* Already has been inverted */
1522 );
653099ff
GS
1523}
1524
b423522f
KW
1525PERL_STATIC_INLINE void
1526S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
1527{
1528 PERL_ARGS_ASSERT_SSC_UNION;
1529
71068078 1530 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1531
1532 _invlist_union_maybe_complement_2nd(ssc->invlist,
1533 invlist,
1534 invert2nd,
1535 &ssc->invlist);
1536}
1537
1538PERL_STATIC_INLINE void
1539S_ssc_intersection(pTHX_ regnode_ssc *ssc,
1540 SV* const invlist,
1541 const bool invert2nd)
1542{
1543 PERL_ARGS_ASSERT_SSC_INTERSECTION;
1544
71068078 1545 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1546
1547 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
1548 invlist,
1549 invert2nd,
1550 &ssc->invlist);
1551}
1552
1553PERL_STATIC_INLINE void
1554S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
1555{
1556 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
1557
71068078 1558 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1559
1560 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
1561}
1562
1563PERL_STATIC_INLINE void
1564S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
1565{
1566 /* AND just the single code point 'cp' into the SSC 'ssc' */
1567
1568 SV* cp_list = _new_invlist(2);
1569
1570 PERL_ARGS_ASSERT_SSC_CP_AND;
1571
71068078 1572 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1573
1574 cp_list = add_cp_to_invlist(cp_list, cp);
1575 ssc_intersection(ssc, cp_list,
1576 FALSE /* Not inverted */
1577 );
1578 SvREFCNT_dec_NN(cp_list);
1579}
1580
1581PERL_STATIC_INLINE void
dc3bf405 1582S_ssc_clear_locale(regnode_ssc *ssc)
b423522f
KW
1583{
1584 /* Set the SSC 'ssc' to not match any locale things */
b423522f
KW
1585 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
1586
71068078 1587 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f
KW
1588
1589 ANYOF_POSIXL_ZERO(ssc);
1590 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
1591}
1592
b35552de
KW
1593#define NON_OTHER_COUNT NON_OTHER_COUNT_FOR_USE_ONLY_BY_REGCOMP_DOT_C
1594
1595STATIC bool
1596S_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
1597{
1598 /* The synthetic start class is used to hopefully quickly winnow down
1599 * places where a pattern could start a match in the target string. If it
1600 * doesn't really narrow things down that much, there isn't much point to
1601 * having the overhead of using it. This function uses some very crude
1602 * heuristics to decide if to use the ssc or not.
1603 *
1604 * It returns TRUE if 'ssc' rules out more than half what it considers to
1605 * be the "likely" possible matches, but of course it doesn't know what the
1606 * actual things being matched are going to be; these are only guesses
1607 *
1608 * For /l matches, it assumes that the only likely matches are going to be
1609 * in the 0-255 range, uniformly distributed, so half of that is 127
1610 * For /a and /d matches, it assumes that the likely matches will be just
1611 * the ASCII range, so half of that is 63
1612 * For /u and there isn't anything matching above the Latin1 range, it
1613 * assumes that that is the only range likely to be matched, and uses
1614 * half that as the cut-off: 127. If anything matches above Latin1,
1615 * it assumes that all of Unicode could match (uniformly), except for
1616 * non-Unicode code points and things in the General Category "Other"
1617 * (unassigned, private use, surrogates, controls and formats). This
1618 * is a much large number. */
1619
1620 const U32 max_match = (LOC)
1621 ? 127
1622 : (! UNI_SEMANTICS)
1623 ? 63
1624 : (invlist_highest(ssc->invlist) < 256)
1625 ? 127
1626 : ((NON_OTHER_COUNT + 1) / 2) - 1;
1627 U32 count = 0; /* Running total of number of code points matched by
1628 'ssc' */
1629 UV start, end; /* Start and end points of current range in inversion
1630 list */
1631
1632 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
1633
1634 invlist_iterinit(ssc->invlist);
1635 while (invlist_iternext(ssc->invlist, &start, &end)) {
1636
1637 /* /u is the only thing that we expect to match above 255; so if not /u
1638 * and even if there are matches above 255, ignore them. This catches
1639 * things like \d under /d which does match the digits above 255, but
1640 * since the pattern is /d, it is not likely to be expecting them */
1641 if (! UNI_SEMANTICS) {
1642 if (start > 255) {
1643 break;
1644 }
1645 end = MIN(end, 255);
1646 }
1647 count += end - start + 1;
1648 if (count > max_match) {
1649 invlist_iterfinish(ssc->invlist);
1650 return FALSE;
1651 }
1652 }
1653
1654 return TRUE;
1655}
1656
1657
b423522f
KW
1658STATIC void
1659S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
1660{
1661 /* The inversion list in the SSC is marked mortal; now we need a more
1662 * permanent copy, which is stored the same way that is done in a regular
dcb20b36
KW
1663 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
1664 * map */
b423522f
KW
1665
1666 SV* invlist = invlist_clone(ssc->invlist);
1667
1668 PERL_ARGS_ASSERT_SSC_FINALIZE;
1669
71068078 1670 assert(is_ANYOF_SYNTHETIC(ssc));
b423522f 1671
a0dd4231 1672 /* The code in this file assumes that all but these flags aren't relevant
93e92956
KW
1673 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
1674 * by the time we reach here */
f240c685
KW
1675 assert(! (ANYOF_FLAGS(ssc)
1676 & ~( ANYOF_COMMON_FLAGS
108316fb
KW
1677 |ANYOF_SHARED_d_MATCHES_ALL_NON_UTF8_NON_ASCII_non_d_WARN_SUPER
1678 |ANYOF_SHARED_d_UPPER_LATIN1_UTF8_STRING_MATCHES_non_d_RUNTIME_USER_PROP)));
a0dd4231 1679
b423522f
KW
1680 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
1681
1ee208c4
KW
1682 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist,
1683 NULL, NULL, NULL, FALSE);
b423522f 1684
85c8e306
KW
1685 /* Make sure is clone-safe */
1686 ssc->invlist = NULL;
1687
e0e1be5f 1688 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
93e92956 1689 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
e0e1be5f 1690 }
1462525b 1691
b2e90ddf
KW
1692 if (RExC_contains_locale) {
1693 OP(ssc) = ANYOFL;
1694 }
1695
1462525b 1696 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
b423522f
KW
1697}
1698
a3621e74
YO
1699#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1700#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1701#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
538e84ed
KW
1702#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list \
1703 ? (TRIE_LIST_CUR( idx ) - 1) \
1704 : 0 )
a3621e74 1705
3dab1dad
YO
1706
1707#ifdef DEBUGGING
07be1b83 1708/*
2b8b4781
NC
1709 dump_trie(trie,widecharmap,revcharmap)
1710 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1711 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
1712
1713 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
1714 The _interim_ variants are used for debugging the interim
1715 tables that are used to generate the final compressed
1716 representation which is what dump_trie expects.
1717
486ec47a 1718 Part of the reason for their existence is to provide a form
3dab1dad 1719 of documentation as to how the different representations function.
07be1b83
YO
1720
1721*/
3dab1dad
YO
1722
1723/*
3dab1dad
YO
1724 Dumps the final compressed table form of the trie to Perl_debug_log.
1725 Used for debugging make_trie().
1726*/
b9a59e08 1727
3dab1dad 1728STATIC void
2b8b4781
NC
1729S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1730 AV *revcharmap, U32 depth)
3dab1dad
YO
1731{
1732 U32 state;
ab3bbdeb 1733 SV *sv=sv_newmortal();
55eed653 1734 int colwidth= widecharmap ? 6 : 4;
2e64971a 1735 U16 word;
3dab1dad
YO
1736 GET_RE_DEBUG_FLAGS_DECL;
1737
7918f24d 1738 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 1739
3dab1dad
YO
1740 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1741 (int)depth * 2 + 2,"",
1742 "Match","Base","Ofs" );
1743
1744 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 1745 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 1746 if ( tmp ) {
538e84ed 1747 PerlIO_printf( Perl_debug_log, "%*s",
ab3bbdeb 1748 colwidth,
538e84ed 1749 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1750 PL_colors[0], PL_colors[1],
1751 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
1752 PERL_PV_ESCAPE_FIRSTCHAR
1753 )
ab3bbdeb 1754 );
3dab1dad
YO
1755 }
1756 }
1757 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1758 (int)depth * 2 + 2,"");
1759
1760 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 1761 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
1762 PerlIO_printf( Perl_debug_log, "\n");
1763
1e2e3d02 1764 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 1765 const U32 base = trie->states[ state ].trans.base;
3dab1dad 1766
538e84ed
KW
1767 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|",
1768 (int)depth * 2 + 2,"", (UV)state);
3dab1dad
YO
1769
1770 if ( trie->states[ state ].wordnum ) {
538e84ed
KW
1771 PerlIO_printf( Perl_debug_log, " W%4X",
1772 trie->states[ state ].wordnum );
3dab1dad
YO
1773 } else {
1774 PerlIO_printf( Perl_debug_log, "%6s", "" );
1775 }
1776
1777 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1778
1779 if ( base ) {
1780 U32 ofs = 0;
1781
1782 while( ( base + ofs < trie->uniquecharcount ) ||
1783 ( base + ofs - trie->uniquecharcount < trie->lasttrans
538e84ed
KW
1784 && trie->trans[ base + ofs - trie->uniquecharcount ].check
1785 != state))
3dab1dad
YO
1786 ofs++;
1787
1788 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1789
1790 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
538e84ed
KW
1791 if ( ( base + ofs >= trie->uniquecharcount )
1792 && ( base + ofs - trie->uniquecharcount
1793 < trie->lasttrans )
1794 && trie->trans[ base + ofs
1795 - trie->uniquecharcount ].check == state )
3dab1dad 1796 {
ab3bbdeb
YO
1797 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1798 colwidth,
538e84ed
KW
1799 (UV)trie->trans[ base + ofs
1800 - trie->uniquecharcount ].next );
3dab1dad 1801 } else {
ab3bbdeb 1802 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
1803 }
1804 }
1805
1806 PerlIO_printf( Perl_debug_log, "]");
1807
1808 }
1809 PerlIO_printf( Perl_debug_log, "\n" );
1810 }
538e84ed
KW
1811 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=",
1812 (int)depth*2, "");
2e64971a
DM
1813 for (word=1; word <= trie->wordcount; word++) {
1814 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1815 (int)word, (int)(trie->wordinfo[word].prev),
1816 (int)(trie->wordinfo[word].len));
1817 }
1818 PerlIO_printf(Perl_debug_log, "\n" );
538e84ed 1819}
3dab1dad 1820/*
3dab1dad 1821 Dumps a fully constructed but uncompressed trie in list form.
538e84ed 1822 List tries normally only are used for construction when the number of
3dab1dad
YO
1823 possible chars (trie->uniquecharcount) is very high.
1824 Used for debugging make_trie().
1825*/
1826STATIC void
55eed653 1827S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1828 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1829 U32 depth)
3dab1dad
YO
1830{
1831 U32 state;
ab3bbdeb 1832 SV *sv=sv_newmortal();
55eed653 1833 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1834 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1835
1836 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1837
3dab1dad 1838 /* print out the table precompression. */
ab3bbdeb
YO
1839 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1840 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1841 "------:-----+-----------------\n" );
538e84ed 1842
3dab1dad
YO
1843 for( state=1 ; state < next_alloc ; state ++ ) {
1844 U16 charid;
538e84ed 1845
ab3bbdeb 1846 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1847 (int)depth * 2 + 2,"", (UV)state );
1848 if ( ! trie->states[ state ].wordnum ) {
1849 PerlIO_printf( Perl_debug_log, "%5s| ","");
1850 } else {
1851 PerlIO_printf( Perl_debug_log, "W%4x| ",
1852 trie->states[ state ].wordnum
1853 );
1854 }
1855 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
538e84ed
KW
1856 SV ** const tmp = av_fetch( revcharmap,
1857 TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1858 if ( tmp ) {
1859 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1860 colwidth,
538e84ed
KW
1861 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp),
1862 colwidth,
1863 PL_colors[0], PL_colors[1],
1864 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0)
1865 | PERL_PV_ESCAPE_FIRSTCHAR
ab3bbdeb 1866 ) ,
1e2e3d02
YO
1867 TRIE_LIST_ITEM(state,charid).forid,
1868 (UV)TRIE_LIST_ITEM(state,charid).newstate
1869 );
538e84ed 1870 if (!(charid % 10))
664e119d
RGS
1871 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1872 (int)((depth * 2) + 14), "");
1e2e3d02 1873 }
ab3bbdeb
YO
1874 }
1875 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad 1876 }
538e84ed 1877}
3dab1dad
YO
1878
1879/*
3dab1dad 1880 Dumps a fully constructed but uncompressed trie in table form.
538e84ed
KW
1881 This is the normal DFA style state transition table, with a few
1882 twists to facilitate compression later.
3dab1dad
YO
1883 Used for debugging make_trie().
1884*/
1885STATIC void
55eed653 1886S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1887 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1888 U32 depth)
3dab1dad
YO
1889{
1890 U32 state;
1891 U16 charid;
ab3bbdeb 1892 SV *sv=sv_newmortal();
55eed653 1893 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1894 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1895
1896 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
538e84ed 1897
3dab1dad
YO
1898 /*
1899 print out the table precompression so that we can do a visual check
1900 that they are identical.
1901 */
538e84ed 1902
3dab1dad
YO
1903 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1904
1905 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1906 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1907 if ( tmp ) {
538e84ed 1908 PerlIO_printf( Perl_debug_log, "%*s",
ab3bbdeb 1909 colwidth,
538e84ed 1910 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1911 PL_colors[0], PL_colors[1],
1912 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed
KW
1913 PERL_PV_ESCAPE_FIRSTCHAR
1914 )
ab3bbdeb 1915 );
3dab1dad
YO
1916 }
1917 }
1918
1919 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1920
1921 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1922 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1923 }
1924
1925 PerlIO_printf( Perl_debug_log, "\n" );
1926
1927 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1928
538e84ed 1929 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
3dab1dad
YO
1930 (int)depth * 2 + 2,"",
1931 (UV)TRIE_NODENUM( state ) );
1932
1933 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1934 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1935 if (v)
1936 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1937 else
1938 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1939 }
1940 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
538e84ed
KW
1941 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n",
1942 (UV)trie->trans[ state ].check );
3dab1dad 1943 } else {
538e84ed
KW
1944 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n",
1945 (UV)trie->trans[ state ].check,
3dab1dad
YO
1946 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1947 }
1948 }
07be1b83 1949}
3dab1dad
YO
1950
1951#endif
1952
2e64971a 1953
786e8c11
YO
1954/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1955 startbranch: the first branch in the whole branch sequence
1956 first : start branch of sequence of branch-exact nodes.
1957 May be the same as startbranch
1958 last : Thing following the last branch.
1959 May be the same as tail.
1960 tail : item following the branch sequence
1961 count : words in the sequence
a4525e78 1962 flags : currently the OP() type we will be building one of /EXACT(|F|FA|FU|FU_SS|L|FLU8)/
786e8c11 1963 depth : indent depth
3dab1dad 1964
786e8c11 1965Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1966
786e8c11
YO
1967A trie is an N'ary tree where the branches are determined by digital
1968decomposition of the key. IE, at the root node you look up the 1st character and
1969follow that branch repeat until you find the end of the branches. Nodes can be
1970marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1971
786e8c11 1972 /he|she|his|hers/
72f13be8 1973
786e8c11
YO
1974would convert into the following structure. Numbers represent states, letters
1975following numbers represent valid transitions on the letter from that state, if
1976the number is in square brackets it represents an accepting state, otherwise it
1977will be in parenthesis.
07be1b83 1978
786e8c11
YO
1979 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1980 | |
1981 | (2)
1982 | |
1983 (1) +-i->(6)-+-s->[7]
1984 |
1985 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1986
786e8c11
YO
1987 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1988
1989This shows that when matching against the string 'hers' we will begin at state 1
1990read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1991then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1992is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1993single traverse. We store a mapping from accepting to state to which word was
1994matched, and then when we have multiple possibilities we try to complete the
b8fda935 1995rest of the regex in the order in which they occurred in the alternation.
786e8c11
YO
1996
1997The only prior NFA like behaviour that would be changed by the TRIE support is
1998the silent ignoring of duplicate alternations which are of the form:
1999
2000 / (DUPE|DUPE) X? (?{ ... }) Y /x
2001
4b714af6 2002Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 2003and without the optimisation. With the optimisations dupes will be silently
486ec47a 2004ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
2005the following demonstrates:
2006
2007 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
2008
2009which prints out 'word' three times, but
2010
2011 'words'=~/(word|word|word)(?{ print $1 })S/
2012
2013which doesnt print it out at all. This is due to other optimisations kicking in.
2014
2015Example of what happens on a structural level:
2016
486ec47a 2017The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
2018
2019 1: CURLYM[1] {1,32767}(18)
2020 5: BRANCH(8)
2021 6: EXACT <ac>(16)
2022 8: BRANCH(11)
2023 9: EXACT <ad>(16)
2024 11: BRANCH(14)
2025 12: EXACT <ab>(16)
2026 16: SUCCEED(0)
2027 17: NOTHING(18)
2028 18: END(0)
2029
2030This would be optimizable with startbranch=5, first=5, last=16, tail=16
2031and should turn into:
2032
2033 1: CURLYM[1] {1,32767}(18)
2034 5: TRIE(16)
2035 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
2036 <ac>
2037 <ad>
2038 <ab>
2039 16: SUCCEED(0)
2040 17: NOTHING(18)
2041 18: END(0)
2042
2043Cases where tail != last would be like /(?foo|bar)baz/:
2044
2045 1: BRANCH(4)
2046 2: EXACT <foo>(8)
2047 4: BRANCH(7)
2048 5: EXACT <bar>(8)
2049 7: TAIL(8)
2050 8: EXACT <baz>(10)
2051 10: END(0)
2052
2053which would be optimizable with startbranch=1, first=1, last=7, tail=8
2054and would end up looking like:
2055
2056 1: TRIE(8)
2057 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
2058 <foo>
2059 <bar>
2060 7: TAIL(8)
2061 8: EXACT <baz>(10)
2062 10: END(0)
2063
c80e42f3 2064 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
2065
2066is the recommended Unicode-aware way of saying
2067
2068 *(d++) = uv;
2069*/
2070
fab2782b 2071#define TRIE_STORE_REVCHAR(val) \
786e8c11 2072 STMT_START { \
73031816 2073 if (UTF) { \
668fcfea 2074 SV *zlopp = newSV(UTF8_MAXBYTES); \
88c9ea1e 2075 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
c80e42f3 2076 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
73031816
NC
2077 SvCUR_set(zlopp, kapow - flrbbbbb); \
2078 SvPOK_on(zlopp); \
2079 SvUTF8_on(zlopp); \
2080 av_push(revcharmap, zlopp); \
2081 } else { \
fab2782b 2082 char ooooff = (char)val; \
73031816
NC
2083 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
2084 } \
2085 } STMT_END
786e8c11 2086
914a25d5
KW
2087/* This gets the next character from the input, folding it if not already
2088 * folded. */
2089#define TRIE_READ_CHAR STMT_START { \
2090 wordlen++; \
2091 if ( UTF ) { \
2092 /* if it is UTF then it is either already folded, or does not need \
2093 * folding */ \
1c1d615a 2094 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
2095 } \
2096 else if (folder == PL_fold_latin1) { \
7d006b13
KW
2097 /* This folder implies Unicode rules, which in the range expressible \
2098 * by not UTF is the lower case, with the two exceptions, one of \
2099 * which should have been taken care of before calling this */ \
2100 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
2101 uvc = toLOWER_L1(*uc); \
2102 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
2103 len = 1; \
914a25d5
KW
2104 } else { \
2105 /* raw data, will be folded later if needed */ \
2106 uvc = (U32)*uc; \
2107 len = 1; \
2108 } \
786e8c11
YO
2109} STMT_END
2110
2111
2112
2113#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
2114 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
2115 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
2116 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
2117 } \
2118 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
2119 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
2120 TRIE_LIST_CUR( state )++; \
2121} STMT_END
07be1b83 2122
786e8c11
YO
2123#define TRIE_LIST_NEW(state) STMT_START { \
2124 Newxz( trie->states[ state ].trans.list, \
2125 4, reg_trie_trans_le ); \
2126 TRIE_LIST_CUR( state ) = 1; \
2127 TRIE_LIST_LEN( state ) = 4; \
2128} STMT_END
07be1b83 2129
786e8c11
YO
2130#define TRIE_HANDLE_WORD(state) STMT_START { \
2131 U16 dupe= trie->states[ state ].wordnum; \
2132 regnode * const noper_next = regnext( noper ); \
2133 \
786e8c11
YO
2134 DEBUG_r({ \
2135 /* store the word for dumping */ \
2136 SV* tmp; \
2137 if (OP(noper) != NOTHING) \
740cce10 2138 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 2139 else \
740cce10 2140 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 2141 av_push( trie_words, tmp ); \
786e8c11
YO
2142 }); \
2143 \
2144 curword++; \
2e64971a
DM
2145 trie->wordinfo[curword].prev = 0; \
2146 trie->wordinfo[curword].len = wordlen; \
2147 trie->wordinfo[curword].accept = state; \
786e8c11
YO
2148 \
2149 if ( noper_next < tail ) { \
2150 if (!trie->jump) \
538e84ed
KW
2151 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, \
2152 sizeof(U16) ); \
7f69552c 2153 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
2154 if (!jumper) \
2155 jumper = noper_next; \
2156 if (!nextbranch) \
2157 nextbranch= regnext(cur); \
2158 } \
2159 \
2160 if ( dupe ) { \
2e64971a
DM
2161 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
2162 /* chain, so that when the bits of chain are later */\
2163 /* linked together, the dups appear in the chain */\
2164 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
2165 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
2166 } else { \
2167 /* we haven't inserted this word yet. */ \
2168 trie->states[ state ].wordnum = curword; \
2169 } \
2170} STMT_END
07be1b83 2171
3dab1dad 2172
786e8c11
YO
2173#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
2174 ( ( base + charid >= ucharcount \
2175 && base + charid < ubound \
2176 && state == trie->trans[ base - ucharcount + charid ].check \
2177 && trie->trans[ base - ucharcount + charid ].next ) \
2178 ? trie->trans[ base - ucharcount + charid ].next \
2179 : ( state==1 ? special : 0 ) \
2180 )
3dab1dad 2181
786e8c11
YO
2182#define MADE_TRIE 1
2183#define MADE_JUMP_TRIE 2
2184#define MADE_EXACT_TRIE 4
3dab1dad 2185
a3621e74 2186STATIC I32
538e84ed
KW
2187S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch,
2188 regnode *first, regnode *last, regnode *tail,
2189 U32 word_count, U32 flags, U32 depth)
a3621e74
YO
2190{
2191 /* first pass, loop through and scan words */
2192 reg_trie_data *trie;
55eed653 2193 HV *widecharmap = NULL;
2b8b4781 2194 AV *revcharmap = newAV();
a3621e74 2195 regnode *cur;
a3621e74
YO
2196 STRLEN len = 0;
2197 UV uvc = 0;
2198 U16 curword = 0;
2199 U32 next_alloc = 0;
786e8c11
YO
2200 regnode *jumper = NULL;
2201 regnode *nextbranch = NULL;
7f69552c 2202 regnode *convert = NULL;
2e64971a 2203 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 2204 /* we just use folder as a flag in utf8 */
1e696034 2205 const U8 * folder = NULL;
a3621e74 2206
2b8b4781 2207#ifdef DEBUGGING
cf78de0b 2208 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2b8b4781
NC
2209 AV *trie_words = NULL;
2210 /* along with revcharmap, this only used during construction but both are
2211 * useful during debugging so we store them in the struct when debugging.
8e11feef 2212 */
2b8b4781 2213#else
cf78de0b 2214 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 2215 STRLEN trie_charcount=0;
3dab1dad 2216#endif
2b8b4781 2217 SV *re_trie_maxbuff;
a3621e74 2218 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2219
2220 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
2221#ifndef DEBUGGING
2222 PERL_UNUSED_ARG(depth);
2223#endif
a3621e74 2224
1e696034 2225 switch (flags) {
a4525e78 2226 case EXACT: case EXACTL: break;
2f7f8cb1 2227 case EXACTFA:
fab2782b 2228 case EXACTFU_SS:
a4525e78
KW
2229 case EXACTFU:
2230 case EXACTFLU8: folder = PL_fold_latin1; break;
1e696034 2231 case EXACTF: folder = PL_fold; break;
fab2782b 2232 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
2233 }
2234
c944940b 2235 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 2236 trie->refcount = 1;
3dab1dad 2237 trie->startstate = 1;
786e8c11 2238 trie->wordcount = word_count;
f8fc2ecf 2239 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 2240 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
a4525e78 2241 if (flags == EXACT || flags == EXACTL)
c944940b 2242 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
2243 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
2244 trie->wordcount+1, sizeof(reg_trie_wordinfo));
2245
a3621e74 2246 DEBUG_r({
2b8b4781 2247 trie_words = newAV();
a3621e74 2248 });
a3621e74 2249
0111c4fd 2250 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
316ebaf2 2251 assert(re_trie_maxbuff);
a3621e74 2252 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 2253 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 2254 }
df826430 2255 DEBUG_TRIE_COMPILE_r({
538e84ed
KW
2256 PerlIO_printf( Perl_debug_log,
2257 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
2258 (int)depth * 2 + 2, "",
2259 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
2260 REG_NODE_NUM(last), REG_NODE_NUM(tail), (int)depth);
3dab1dad 2261 });
538e84ed 2262
7f69552c
YO
2263 /* Find the node we are going to overwrite */
2264 if ( first == startbranch && OP( last ) != BRANCH ) {
2265 /* whole branch chain */
2266 convert = first;
2267 } else {
2268 /* branch sub-chain */
2269 convert = NEXTOPER( first );
2270 }
538e84ed 2271
a3621e74
YO
2272 /* -- First loop and Setup --
2273
2274 We first traverse the branches and scan each word to determine if it
2275 contains widechars, and how many unique chars there are, this is
2276 important as we have to build a table with at least as many columns as we
2277 have unique chars.
2278
2279 We use an array of integers to represent the character codes 0..255
538e84ed
KW
2280 (trie->charmap) and we use a an HV* to store Unicode characters. We use
2281 the native representation of the character value as the key and IV's for
2282 the coded index.
a3621e74
YO
2283
2284 *TODO* If we keep track of how many times each character is used we can
2285 remap the columns so that the table compression later on is more
3b753521 2286 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
2287 middle and the least common are on the outside. IMO this would be better
2288 than a most to least common mapping as theres a decent chance the most
2289 common letter will share a node with the least common, meaning the node
486ec47a 2290 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
2291 case is when we have the least common nodes twice.
2292
2293 */
2294
a3621e74 2295 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 2296 regnode *noper = NEXTOPER( cur );
e1ec3a88 2297 const U8 *uc = (U8*)STRING( noper );
df826430 2298 const U8 *e = uc + STR_LEN( noper );
bc031a7d 2299 int foldlen = 0;
07be1b83 2300 U32 wordlen = 0; /* required init */
bc031a7d
KW
2301 STRLEN minchars = 0;
2302 STRLEN maxchars = 0;
538e84ed
KW
2303 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the
2304 bitmap?*/
a3621e74 2305
3dab1dad 2306 if (OP(noper) == NOTHING) {
df826430
YO
2307 regnode *noper_next= regnext(noper);
2308 if (noper_next != tail && OP(noper_next) == flags) {
2309 noper = noper_next;
2310 uc= (U8*)STRING(noper);
2311 e= uc + STR_LEN(noper);
2312 trie->minlen= STR_LEN(noper);
2313 } else {
2314 trie->minlen= 0;
2315 continue;
2316 }
3dab1dad 2317 }
df826430 2318
fab2782b 2319 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
2320 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
2321 regardless of encoding */
fab2782b
YO
2322 if (OP( noper ) == EXACTFU_SS) {
2323 /* false positives are ok, so just set this */
0dc4a61d 2324 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
2325 }
2326 }
bc031a7d
KW
2327 for ( ; uc < e ; uc += len ) { /* Look at each char in the current
2328 branch */
3dab1dad 2329 TRIE_CHARCOUNT(trie)++;
a3621e74 2330 TRIE_READ_CHAR;
645de4ce 2331
bc031a7d
KW
2332 /* TRIE_READ_CHAR returns the current character, or its fold if /i
2333 * is in effect. Under /i, this character can match itself, or
2334 * anything that folds to it. If not under /i, it can match just
2335 * itself. Most folds are 1-1, for example k, K, and KELVIN SIGN
2336 * all fold to k, and all are single characters. But some folds
2337 * expand to more than one character, so for example LATIN SMALL
2338 * LIGATURE FFI folds to the three character sequence 'ffi'. If
2339 * the string beginning at 'uc' is 'ffi', it could be matched by
2340 * three characters, or just by the one ligature character. (It
2341 * could also be matched by two characters: LATIN SMALL LIGATURE FF
2342 * followed by 'i', or by 'f' followed by LATIN SMALL LIGATURE FI).
2343 * (Of course 'I' and/or 'F' instead of 'i' and 'f' can also
2344 * match.) The trie needs to know the minimum and maximum number
2345 * of characters that could match so that it can use size alone to
2346 * quickly reject many match attempts. The max is simple: it is
2347 * the number of folded characters in this branch (since a fold is
2348 * never shorter than what folds to it. */
2349
2350 maxchars++;
2351
2352 /* And the min is equal to the max if not under /i (indicated by
2353 * 'folder' being NULL), or there are no multi-character folds. If
2354 * there is a multi-character fold, the min is incremented just
2355 * once, for the character that folds to the sequence. Each
2356 * character in the sequence needs to be added to the list below of
2357 * characters in the trie, but we count only the first towards the
2358 * min number of characters needed. This is done through the
2359 * variable 'foldlen', which is returned by the macros that look
2360 * for these sequences as the number of bytes the sequence
2361 * occupies. Each time through the loop, we decrement 'foldlen' by
2362 * how many bytes the current char occupies. Only when it reaches
2363 * 0 do we increment 'minchars' or look for another multi-character
2364 * sequence. */
2365 if (folder == NULL) {
2366 minchars++;
2367 }
2368 else if (foldlen > 0) {
2369 foldlen -= (UTF) ? UTF8SKIP(uc) : 1;
645de4ce
KW
2370 }
2371 else {
bc031a7d
KW
2372 minchars++;
2373
2374 /* See if *uc is the beginning of a multi-character fold. If
2375 * so, we decrement the length remaining to look at, to account
2376 * for the current character this iteration. (We can use 'uc'
2377 * instead of the fold returned by TRIE_READ_CHAR because for
2378 * non-UTF, the latin1_safe macro is smart enough to account
2379 * for all the unfolded characters, and because for UTF, the
2380 * string will already have been folded earlier in the
2381 * compilation process */
2382 if (UTF) {
2383 if ((foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e))) {
2384 foldlen -= UTF8SKIP(uc);
645de4ce
KW
2385 }
2386 }
bc031a7d
KW
2387 else if ((foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e))) {
2388 foldlen--;
2389 }
645de4ce 2390 }
bc031a7d
KW
2391
2392 /* The current character (and any potential folds) should be added
2393 * to the possible matching characters for this position in this
2394 * branch */
a3621e74 2395 if ( uvc < 256 ) {
fab2782b
YO
2396 if ( folder ) {
2397 U8 folded= folder[ (U8) uvc ];
2398 if ( !trie->charmap[ folded ] ) {
2399 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2400 TRIE_STORE_REVCHAR( folded );
2401 }
2402 }
a3621e74
YO
2403 if ( !trie->charmap[ uvc ] ) {
2404 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2405 TRIE_STORE_REVCHAR( uvc );
a3621e74 2406 }
02daf0ab 2407 if ( set_bit ) {
62012aee
KW
2408 /* store the codepoint in the bitmap, and its folded
2409 * equivalent. */
fab2782b 2410 TRIE_BITMAP_SET(trie, uvc);
0921ee73
T
2411
2412 /* store the folded codepoint */
fab2782b 2413 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
0921ee73
T
2414
2415 if ( !UTF ) {
2416 /* store first byte of utf8 representation of
acdf4139 2417 variant codepoints */
6f2d5cbc 2418 if (! UVCHR_IS_INVARIANT(uvc)) {
acdf4139 2419 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
2420 }
2421 }
02daf0ab
YO
2422 set_bit = 0; /* We've done our bit :-) */
2423 }
a3621e74 2424 } else {
bc031a7d
KW
2425
2426 /* XXX We could come up with the list of code points that fold
2427 * to this using PL_utf8_foldclosures, except not for
2428 * multi-char folds, as there may be multiple combinations
2429 * there that could work, which needs to wait until runtime to
2430 * resolve (The comment about LIGATURE FFI above is such an
2431 * example */
2432
a3621e74 2433 SV** svpp;
55eed653
NC
2434 if ( !widecharmap )
2435 widecharmap = newHV();
a3621e74 2436
55eed653 2437 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
2438
2439 if ( !svpp )
e4584336 2440 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
2441
2442 if ( !SvTRUE( *svpp ) ) {
2443 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 2444 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
2445 }
2446 }
bc031a7d
KW
2447 } /* end loop through characters in this branch of the trie */
2448
2449 /* We take the min and max for this branch and combine to find the min
2450 * and max for all branches processed so far */
3dab1dad 2451 if( cur == first ) {
bc031a7d
KW
2452 trie->minlen = minchars;
2453 trie->maxlen = maxchars;
2454 } else if (minchars < trie->minlen) {
2455 trie->minlen = minchars;
2456 } else if (maxchars > trie->maxlen) {
2457 trie->maxlen = maxchars;
fab2782b 2458 }
a3621e74
YO
2459 } /* end first pass */
2460 DEBUG_TRIE_COMPILE_r(
538e84ed
KW
2461 PerlIO_printf( Perl_debug_log,
2462 "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
3dab1dad 2463 (int)depth * 2 + 2,"",
55eed653 2464 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
2465 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2466 (int)trie->minlen, (int)trie->maxlen )
a3621e74 2467 );
a3621e74
YO
2468
2469 /*
2470 We now know what we are dealing with in terms of unique chars and
2471 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
2472 representation using a flat table will take. If it's over a reasonable
2473 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
2474 conservative but potentially much slower representation using an array
2475 of lists.
2476
2477 At the end we convert both representations into the same compressed
2478 form that will be used in regexec.c for matching with. The latter
2479 is a form that cannot be used to construct with but has memory
2480 properties similar to the list form and access properties similar
2481 to the table form making it both suitable for fast searches and
2482 small enough that its feasable to store for the duration of a program.
2483
2484 See the comment in the code where the compressed table is produced
2485 inplace from the flat tabe representation for an explanation of how
2486 the compression works.
2487
2488 */
2489
2490
2e64971a
DM
2491 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2492 prev_states[1] = 0;
2493
538e84ed
KW
2494 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1)
2495 > SvIV(re_trie_maxbuff) )
2496 {
a3621e74
YO
2497 /*
2498 Second Pass -- Array Of Lists Representation
2499
2500 Each state will be represented by a list of charid:state records
2501 (reg_trie_trans_le) the first such element holds the CUR and LEN
2502 points of the allocated array. (See defines above).
2503
2504 We build the initial structure using the lists, and then convert
2505 it into the compressed table form which allows faster lookups
2506 (but cant be modified once converted).
a3621e74
YO
2507 */
2508
a3621e74
YO
2509 STRLEN transcount = 1;
2510
538e84ed 2511 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1e2e3d02
YO
2512 "%*sCompiling trie using list compiler\n",
2513 (int)depth * 2 + 2, ""));
686b73d4 2514
c944940b
JH
2515 trie->states = (reg_trie_state *)
2516 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2517 sizeof(reg_trie_state) );
a3621e74
YO
2518 TRIE_LIST_NEW(1);
2519 next_alloc = 2;
2520
2521 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2522
df826430 2523 regnode *noper = NEXTOPER( cur );
c445ea15 2524 U8 *uc = (U8*)STRING( noper );
df826430 2525 const U8 *e = uc + STR_LEN( noper );
c445ea15
AL
2526 U32 state = 1; /* required init */
2527 U16 charid = 0; /* sanity init */
07be1b83 2528 U32 wordlen = 0; /* required init */
c445ea15 2529
df826430
YO
2530 if (OP(noper) == NOTHING) {
2531 regnode *noper_next= regnext(noper);
2532 if (noper_next != tail && OP(noper_next) == flags) {
2533 noper = noper_next;
2534 uc= (U8*)STRING(noper);
2535 e= uc + STR_LEN(noper);
2536 }
2537 }
2538
3dab1dad 2539 if (OP(noper) != NOTHING) {
786e8c11 2540 for ( ; uc < e ; uc += len ) {
c445ea15 2541
786e8c11 2542 TRIE_READ_CHAR;
c445ea15 2543
786e8c11
YO
2544 if ( uvc < 256 ) {
2545 charid = trie->charmap[ uvc ];
c445ea15 2546 } else {
538e84ed
KW
2547 SV** const svpp = hv_fetch( widecharmap,
2548 (char*)&uvc,
2549 sizeof( UV ),
2550 0);
786e8c11
YO
2551 if ( !svpp ) {
2552 charid = 0;
2553 } else {
2554 charid=(U16)SvIV( *svpp );
2555 }
c445ea15 2556 }
538e84ed
KW
2557 /* charid is now 0 if we dont know the char read, or
2558 * nonzero if we do */
786e8c11 2559 if ( charid ) {
a3621e74 2560
786e8c11
YO
2561 U16 check;
2562 U32 newstate = 0;
a3621e74 2563
786e8c11
YO
2564 charid--;
2565 if ( !trie->states[ state ].trans.list ) {
2566 TRIE_LIST_NEW( state );
c445ea15 2567 }
538e84ed
KW
2568 for ( check = 1;
2569 check <= TRIE_LIST_USED( state );
2570 check++ )
2571 {
2572 if ( TRIE_LIST_ITEM( state, check ).forid
2573 == charid )
2574 {
786e8c11
YO
2575 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2576 break;
2577 }
2578 }
2579 if ( ! newstate ) {
2580 newstate = next_alloc++;
2e64971a 2581 prev_states[newstate] = state;
786e8c11
YO
2582 TRIE_LIST_PUSH( state, charid, newstate );
2583 transcount++;
2584 }
2585 state = newstate;
2586 } else {
2587 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 2588 }
a28509cc 2589 }
c445ea15 2590 }
3dab1dad 2591 TRIE_HANDLE_WORD(state);
a3621e74
YO
2592
2593 } /* end second pass */
2594
1e2e3d02 2595 /* next alloc is the NEXT state to be allocated */
538e84ed 2596 trie->statecount = next_alloc;
c944940b
JH
2597 trie->states = (reg_trie_state *)
2598 PerlMemShared_realloc( trie->states,
2599 next_alloc
2600 * sizeof(reg_trie_state) );
a3621e74 2601
3dab1dad 2602 /* and now dump it out before we compress it */
2b8b4781
NC
2603 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2604 revcharmap, next_alloc,
2605 depth+1)
1e2e3d02 2606 );
a3621e74 2607
c944940b
JH
2608 trie->trans = (reg_trie_trans *)
2609 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
2610 {
2611 U32 state;
a3621e74
YO
2612 U32 tp = 0;
2613 U32 zp = 0;
2614
2615
2616 for( state=1 ; state < next_alloc ; state ++ ) {
2617 U32 base=0;
2618
2619 /*
2620 DEBUG_TRIE_COMPILE_MORE_r(
2621 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2622 );
2623 */
2624
2625 if (trie->states[state].trans.list) {
2626 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2627 U16 maxid=minid;
a28509cc 2628 U16 idx;
a3621e74
YO
2629
2630 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
2631 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2632 if ( forid < minid ) {
2633 minid=forid;
2634 } else if ( forid > maxid ) {
2635 maxid=forid;
2636 }
a3621e74
YO
2637 }
2638 if ( transcount < tp + maxid - minid + 1) {
2639 transcount *= 2;
c944940b
JH
2640 trie->trans = (reg_trie_trans *)
2641 PerlMemShared_realloc( trie->trans,
446bd890
NC
2642 transcount
2643 * sizeof(reg_trie_trans) );
538e84ed
KW
2644 Zero( trie->trans + (transcount / 2),
2645 transcount / 2,
2646 reg_trie_trans );
a3621e74
YO
2647 }
2648 base = trie->uniquecharcount + tp - minid;
2649 if ( maxid == minid ) {
2650 U32 set = 0;
2651 for ( ; zp < tp ; zp++ ) {
2652 if ( ! trie->trans[ zp ].next ) {
2653 base = trie->uniquecharcount + zp - minid;
538e84ed
KW
2654 trie->trans[ zp ].next = TRIE_LIST_ITEM( state,
2655 1).newstate;
a3621e74
YO
2656 trie->trans[ zp ].check = state;
2657 set = 1;
2658 break;
2659 }
2660 }
2661 if ( !set ) {
538e84ed
KW
2662 trie->trans[ tp ].next = TRIE_LIST_ITEM( state,
2663 1).newstate;
a3621e74
YO
2664 trie->trans[ tp ].check = state;
2665 tp++;
2666 zp = tp;
2667 }
2668 } else {
2669 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
538e84ed
KW
2670 const U32 tid = base
2671 - trie->uniquecharcount
2672 + TRIE_LIST_ITEM( state, idx ).forid;
2673 trie->trans[ tid ].next = TRIE_LIST_ITEM( state,
2674 idx ).newstate;
a3621e74
YO
2675 trie->trans[ tid ].check = state;
2676 }
2677 tp += ( maxid - minid + 1 );
2678 }
2679 Safefree(trie->states[ state ].trans.list);
2680 }
2681 /*
2682 DEBUG_TRIE_COMPILE_MORE_r(
2683 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2684 );
2685 */
2686 trie->states[ state ].trans.base=base;
2687 }
cc601c31 2688 trie->lasttrans = tp + 1;
a3621e74
YO
2689 }
2690 } else {
2691 /*
2692 Second Pass -- Flat Table Representation.
2693
b423522f
KW
2694 we dont use the 0 slot of either trans[] or states[] so we add 1 to
2695 each. We know that we will need Charcount+1 trans at most to store
2696 the data (one row per char at worst case) So we preallocate both
2697 structures assuming worst case.
a3621e74
YO
2698
2699 We then construct the trie using only the .next slots of the entry
2700 structs.
2701
b423522f
KW
2702 We use the .check field of the first entry of the node temporarily
2703 to make compression both faster and easier by keeping track of how
2704 many non zero fields are in the node.
a3621e74
YO
2705
2706 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2707 transition.
2708
b423522f
KW
2709 There are two terms at use here: state as a TRIE_NODEIDX() which is
2710 a number representing the first entry of the node, and state as a
2711 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1)
2712 and TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3)
2713 if there are 2 entrys per node. eg:
a3621e74
YO
2714
2715 A B A B
2716 1. 2 4 1. 3 7
2717 2. 0 3 3. 0 5
2718 3. 0 0 5. 0 0
2719 4. 0 0 7. 0 0
2720
b423522f
KW
2721 The table is internally in the right hand, idx form. However as we
2722 also have to deal with the states array which is indexed by nodenum
2723 we have to use TRIE_NODENUM() to convert.
a3621e74
YO
2724
2725 */
538e84ed 2726 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
1e2e3d02
YO
2727 "%*sCompiling trie using table compiler\n",
2728 (int)depth * 2 + 2, ""));
3dab1dad 2729
c944940b
JH
2730 trie->trans = (reg_trie_trans *)
2731 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2732 * trie->uniquecharcount + 1,
2733 sizeof(reg_trie_trans) );
2734 trie->states = (reg_trie_state *)
2735 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2736 sizeof(reg_trie_state) );
a3621e74
YO
2737 next_alloc = trie->uniquecharcount + 1;
2738
3dab1dad 2739
a3621e74
YO
2740 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2741
df826430 2742 regnode *noper = NEXTOPER( cur );
a28509cc 2743 const U8 *uc = (U8*)STRING( noper );
df826430 2744 const U8 *e = uc + STR_LEN( noper );
a3621e74
YO
2745
2746 U32 state = 1; /* required init */
2747
2748 U16 charid = 0; /* sanity init */
2749 U32 accept_state = 0; /* sanity init */
a3621e74 2750
07be1b83 2751 U32 wordlen = 0; /* required init */
a3621e74 2752
df826430
YO
2753 if (OP(noper) == NOTHING) {
2754 regnode *noper_next= regnext(noper);
2755 if (noper_next != tail && OP(noper_next) == flags) {
2756 noper = noper_next;
2757 uc= (U8*)STRING(noper);
2758 e= uc + STR_LEN(noper);
2759 }
2760 }
fab2782b 2761
3dab1dad 2762 if ( OP(noper) != NOTHING ) {
786e8c11 2763 for ( ; uc < e ; uc += len ) {
a3621e74 2764
786e8c11 2765 TRIE_READ_CHAR;
a3621e74 2766
786e8c11
YO
2767 if ( uvc < 256 ) {
2768 charid = trie->charmap[ uvc ];
2769 } else {
538e84ed
KW
2770 SV* const * const svpp = hv_fetch( widecharmap,
2771 (char*)&uvc,
2772 sizeof( UV ),
2773 0);
786e8c11 2774 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 2775 }
786e8c11
YO
2776 if ( charid ) {
2777 charid--;
2778 if ( !trie->trans[ state + charid ].next ) {
2779 trie->trans[ state + charid ].next = next_alloc;
2780 trie->trans[ state ].check++;
2e64971a
DM
2781 prev_states[TRIE_NODENUM(next_alloc)]
2782 = TRIE_NODENUM(state);
786e8c11
YO
2783 next_alloc += trie->uniquecharcount;
2784 }
2785 state = trie->trans[ state + charid ].next;
2786 } else {
2787 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2788 }
538e84ed
KW
2789 /* charid is now 0 if we dont know the char read, or
2790 * nonzero if we do */
a3621e74 2791 }
a3621e74 2792 }
3dab1dad
YO
2793 accept_state = TRIE_NODENUM( state );
2794 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
2795
2796 } /* end second pass */
2797
3dab1dad 2798 /* and now dump it out before we compress it */
2b8b4781
NC
2799 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2800 revcharmap,
2801 next_alloc, depth+1));
a3621e74 2802
a3621e74
YO
2803 {
2804 /*
2805 * Inplace compress the table.*
2806
2807 For sparse data sets the table constructed by the trie algorithm will
2808 be mostly 0/FAIL transitions or to put it another way mostly empty.
2809 (Note that leaf nodes will not contain any transitions.)
2810
2811 This algorithm compresses the tables by eliminating most such
2812 transitions, at the cost of a modest bit of extra work during lookup:
2813
2814 - Each states[] entry contains a .base field which indicates the
2815 index in the state[] array wheres its transition data is stored.
2816
3b753521 2817 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
2818
2819 - If .base is nonzero then charid is added to it to find an entry in
2820 the trans array.
2821
2822 -If trans[states[state].base+charid].check!=state then the
2823 transition is taken to be a 0/Fail transition. Thus if there are fail
2824 transitions at the front of the node then the .base offset will point
2825 somewhere inside the previous nodes data (or maybe even into a node
2826 even earlier), but the .check field determines if the transition is
2827 valid.
2828
786e8c11 2829 XXX - wrong maybe?
a3621e74 2830 The following process inplace converts the table to the compressed
3b753521 2831 table: We first do not compress the root node 1,and mark all its
a3621e74 2832 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
2833 allows us to do a DFA construction from the compressed table later,
2834 and ensures that any .base pointers we calculate later are greater
2835 than 0.
a3621e74
YO
2836
2837 - We set 'pos' to indicate the first entry of the second node.
2838
2839 - We then iterate over the columns of the node, finding the first and
2840 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2841 and set the .check pointers accordingly, and advance pos
2842 appropriately and repreat for the next node. Note that when we copy
2843 the next pointers we have to convert them from the original
2844 NODEIDX form to NODENUM form as the former is not valid post
2845 compression.
2846
2847 - If a node has no transitions used we mark its base as 0 and do not
2848 advance the pos pointer.
2849
2850 - If a node only has one transition we use a second pointer into the
2851 structure to fill in allocated fail transitions from other states.
2852 This pointer is independent of the main pointer and scans forward
2853 looking for null transitions that are allocated to a state. When it
2854 finds one it writes the single transition into the "hole". If the
786e8c11 2855 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
2856
2857 - Once compressed we can Renew/realloc the structures to release the
2858 excess space.
2859
2860 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2861 specifically Fig 3.47 and the associated pseudocode.
2862
2863 demq
2864 */
a3b680e6 2865 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 2866 U32 state, charid;
a3621e74 2867 U32 pos = 0, zp=0;
1e2e3d02 2868 trie->statecount = laststate;
a3621e74
YO
2869
2870 for ( state = 1 ; state < laststate ; state++ ) {
2871 U8 flag = 0;
a28509cc
AL
2872 const U32 stateidx = TRIE_NODEIDX( state );
2873 const U32 o_used = trie->trans[ stateidx ].check;
2874 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
2875 trie->trans[ stateidx ].check = 0;
2876
538e84ed
KW
2877 for ( charid = 0;
2878 used && charid < trie->uniquecharcount;
2879 charid++ )
2880 {
a3621e74
YO
2881 if ( flag || trie->trans[ stateidx + charid ].next ) {
2882 if ( trie->trans[ stateidx + charid ].next ) {
2883 if (o_used == 1) {
2884 for ( ; zp < pos ; zp++ ) {
2885 if ( ! trie->trans[ zp ].next ) {
2886 break;
2887 }
2888 }
538e84ed
KW
2889 trie->states[ state ].trans.base
2890 = zp
2891 + trie->uniquecharcount
2892 - charid ;
2893 trie->trans[ zp ].next
2894 = SAFE_TRIE_NODENUM( trie->trans[ stateidx
2895 + charid ].next );
a3621e74
YO
2896 trie->trans[ zp ].check = state;
2897 if ( ++zp > pos ) pos = zp;
2898 break;
2899 }
2900 used--;
2901 }
2902 if ( !flag ) {
2903 flag = 1;
538e84ed
KW
2904 trie->states[ state ].trans.base
2905 = pos + trie->uniquecharcount - charid ;
a3621e74 2906 }
538e84ed
KW
2907 trie->trans[ pos ].next
2908 = SAFE_TRIE_NODENUM(
2909 trie->trans[ stateidx + charid ].next );
a3621e74
YO
2910 trie->trans[ pos ].check = state;
2911 pos++;
2912 }
2913 }
2914 }
cc601c31 2915 trie->lasttrans = pos + 1;
c944940b
JH
2916 trie->states = (reg_trie_state *)
2917 PerlMemShared_realloc( trie->states, laststate
2918 * sizeof(reg_trie_state) );
a3621e74 2919 DEBUG_TRIE_COMPILE_MORE_r(
538e84ed
KW
2920 PerlIO_printf( Perl_debug_log,
2921 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2922 (int)depth * 2 + 2,"",
2923 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount
2924 + 1 ),
2925 (IV)next_alloc,
2926 (IV)pos,
2927 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
a3621e74
YO
2928 );
2929
2930 } /* end table compress */
2931 }
1e2e3d02 2932 DEBUG_TRIE_COMPILE_MORE_r(
538e84ed
KW
2933 PerlIO_printf(Perl_debug_log,
2934 "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
1e2e3d02
YO
2935 (int)depth * 2 + 2, "",
2936 (UV)trie->statecount,
2937 (UV)trie->lasttrans)
2938 );
cc601c31 2939 /* resize the trans array to remove unused space */
c944940b
JH
2940 trie->trans = (reg_trie_trans *)
2941 PerlMemShared_realloc( trie->trans, trie->lasttrans
2942 * sizeof(reg_trie_trans) );
a3621e74 2943
538e84ed 2944 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
2945 U8 nodetype =(U8)(flags & 0xFF);
2946 char *str=NULL;
538e84ed 2947
07be1b83 2948#ifdef DEBUGGING
e62cc96a 2949 regnode *optimize = NULL;
7122b237
YO
2950#ifdef RE_TRACK_PATTERN_OFFSETS
2951
b57a0404
JH
2952 U32 mjd_offset = 0;
2953 U32 mjd_nodelen = 0;
7122b237
YO
2954#endif /* RE_TRACK_PATTERN_OFFSETS */
2955#endif /* DEBUGGING */
a3621e74 2956 /*
3dab1dad
YO
2957 This means we convert either the first branch or the first Exact,
2958 depending on whether the thing following (in 'last') is a branch
2959 or not and whther first is the startbranch (ie is it a sub part of
2960 the alternation or is it the whole thing.)
3b753521 2961 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 2962 the whole branch sequence, including the first.
a3621e74 2963 */
3dab1dad 2964 /* Find the node we are going to overwrite */
7f69552c 2965 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 2966 /* branch sub-chain */
3dab1dad 2967 NEXT_OFF( first ) = (U16)(last - first);
7122b237 2968#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
2969 DEBUG_r({
2970 mjd_offset= Node_Offset((convert));
2971 mjd_nodelen= Node_Length((convert));
2972 });
7122b237 2973#endif
7f69552c 2974 /* whole branch chain */
7122b237
YO
2975 }
2976#ifdef RE_TRACK_PATTERN_OFFSETS
2977 else {
7f69552c
YO
2978 DEBUG_r({
2979 const regnode *nop = NEXTOPER( convert );
2980 mjd_offset= Node_Offset((nop));
2981 mjd_nodelen= Node_Length((nop));
2982 });
07be1b83
YO
2983 }
2984 DEBUG_OPTIMISE_r(
538e84ed
KW
2985 PerlIO_printf(Perl_debug_log,
2986 "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
07be1b83 2987 (int)depth * 2 + 2, "",
786e8c11 2988 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2989 );
7122b237 2990#endif
538e84ed 2991 /* But first we check to see if there is a common prefix we can
3dab1dad
YO
2992 split out as an EXACT and put in front of the TRIE node. */
2993 trie->startstate= 1;
55eed653 2994 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2995 U32 state;
1e2e3d02 2996 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2997 U32 ofs = 0;
8e11feef
RGS
2998 I32 idx = -1;
2999 U32 count = 0;
3000 const U32 base = trie->states[ state ].trans.base;
a3621e74 3001
3dab1dad 3002 if ( trie->states[state].wordnum )
8e11feef 3003 count = 1;
a3621e74 3004
8e11feef 3005 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
3006 if ( ( base + ofs >= trie->uniquecharcount ) &&
3007 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
3008 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
3009 {
3dab1dad 3010 if ( ++count > 1 ) {
2b8b4781 3011 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 3012 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 3013 if ( state == 1 ) break;
3dab1dad
YO
3014 if ( count == 2 ) {
3015 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
3016 DEBUG_OPTIMISE_r(
8e11feef
RGS
3017 PerlIO_printf(Perl_debug_log,
3018 "%*sNew Start State=%"UVuf" Class: [",
3019 (int)depth * 2 + 2, "",
786e8c11 3020 (UV)state));
be8e71aa 3021 if (idx >= 0) {
2b8b4781 3022 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 3023 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 3024
3dab1dad 3025 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
3026 if ( folder )
3027 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 3028 DEBUG_OPTIMISE_r(
f1f66076 3029 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 3030 );
8e11feef
RGS
3031 }
3032 }
3033 TRIE_BITMAP_SET(trie,*ch);
3034 if ( folder )
3035 TRIE_BITMAP_SET(trie,folder[ *ch ]);
3036 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
3037 }
3038 idx = ofs;
3039 }
3dab1dad
YO
3040 }
3041 if ( count == 1 ) {
2b8b4781 3042 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
3043 STRLEN len;
3044 char *ch = SvPV( *tmp, len );
de734bd5
A
3045 DEBUG_OPTIMISE_r({
3046 SV *sv=sv_newmortal();
8e11feef
RGS
3047 PerlIO_printf( Perl_debug_log,
3048 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
3049 (int)depth * 2 + 2, "",
538e84ed
KW
3050 (UV)state, (UV)idx,
3051 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
de734bd5
A
3052 PL_colors[0], PL_colors[1],
3053 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
538e84ed 3054 PERL_PV_ESCAPE_FIRSTCHAR
de734bd5
A
3055 )
3056 );
3057 });
3dab1dad
YO
3058 if ( state==1 ) {
3059 OP( convert ) = nodetype;
3060 str=STRING(convert);
3061 STR_LEN(convert)=0;
3062 }
c490c714
YO
3063 STR_LEN(convert) += len;
3064 while (len--)
de734bd5 3065 *str++ = *ch++;
8e11feef 3066 } else {
538e84ed 3067#ifdef DEBUGGING
8e11feef
RGS
3068 if (state>1)
3069 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 3070#endif
8e11feef
RGS
3071 break;
3072 }
3073 }
2e64971a 3074 trie->prefixlen = (state-1);
3dab1dad 3075 if (str) {
8e11feef 3076 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 3077 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 3078 trie->startstate = state;
07be1b83
YO
3079 trie->minlen -= (state - 1);
3080 trie->maxlen -= (state - 1);
33809eae
JH
3081#ifdef DEBUGGING
3082 /* At least the UNICOS C compiler choked on this
3083 * being argument to DEBUG_r(), so let's just have
3084 * it right here. */
3085 if (
3086#ifdef PERL_EXT_RE_BUILD
3087 1
3088#else
3089 DEBUG_r_TEST
3090#endif
3091 ) {
3092 regnode *fix = convert;
3093 U32 word = trie->wordcount;
3094 mjd_nodelen++;
3095 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
3096 while( ++fix < n ) {
3097 Set_Node_Offset_Length(fix, 0, 0);
3098 }
3099 while (word--) {
3100 SV ** const tmp = av_fetch( trie_words, word, 0 );
3101 if (tmp) {
3102 if ( STR_LEN(convert) <= SvCUR(*tmp) )
3103 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
3104 else
3105 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
3106 }
3107 }
3108 }
3109#endif
8e11feef
RGS
3110 if (trie->maxlen) {
3111 convert = n;
3112 } else {
3dab1dad 3113 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 3114 DEBUG_r(optimize= n);
3dab1dad
YO
3115 }
3116 }
3117 }
538e84ed
KW
3118 if (!jumper)
3119 jumper = last;
3dab1dad 3120 if ( trie->maxlen ) {
8e11feef
RGS
3121 NEXT_OFF( convert ) = (U16)(tail - convert);
3122 ARG_SET( convert, data_slot );
538e84ed
KW
3123 /* Store the offset to the first unabsorbed branch in
3124 jump[0], which is otherwise unused by the jump logic.
786e8c11 3125 We use this when dumping a trie and during optimisation. */
538e84ed 3126 if (trie->jump)
7f69552c 3127 trie->jump[0] = (U16)(nextbranch - convert);
538e84ed 3128
6c48061a
YO
3129 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
3130 * and there is a bitmap
3131 * and the first "jump target" node we found leaves enough room
3132 * then convert the TRIE node into a TRIEC node, with the bitmap
3133 * embedded inline in the opcode - this is hypothetically faster.
3134 */
3135 if ( !trie->states[trie->startstate].wordnum
3136 && trie->bitmap
3137 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
3138 {
3139 OP( convert ) = TRIEC;
3140 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 3141 PerlMemShared_free(trie->bitmap);
786e8c11 3142 trie->bitmap= NULL;
538e84ed 3143 } else
786e8c11 3144 OP( convert ) = TRIE;
a3621e74 3145
3dab1dad
YO
3146 /* store the type in the flags */
3147 convert->flags = nodetype;
a5ca303d 3148 DEBUG_r({
538e84ed
KW
3149 optimize = convert
3150 + NODE_STEP_REGNODE
a5ca303d
YO
3151 + regarglen[ OP( convert ) ];
3152 });
538e84ed 3153 /* XXX We really should free up the resource in trie now,
a5ca303d 3154 as we won't use them - (which resources?) dmq */
3dab1dad 3155 }
a3621e74 3156 /* needed for dumping*/
e62cc96a 3157 DEBUG_r(if (optimize) {
07be1b83 3158 regnode *opt = convert;
bcdf7404 3159
e62cc96a 3160 while ( ++opt < optimize) {
07be1b83
YO
3161 Set_Node_Offset_Length(opt,0,0);
3162 }
538e84ed
KW
3163 /*
3164 Try to clean up some of the debris left after the
786e8c11 3165 optimisation.
a3621e74 3166 */
786e8c11 3167 while( optimize < jumper ) {
07be1b83 3168 mjd_nodelen += Node_Length((optimize));
a3621e74 3169 OP( optimize ) = OPTIMIZED;
07be1b83 3170 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
3171 optimize++;
3172 }
07be1b83 3173 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
3174 });
3175 } /* end node insert */
2e64971a
DM
3176
3177 /* Finish populating the prev field of the wordinfo array. Walk back
3178 * from each accept state until we find another accept state, and if
3179 * so, point the first word's .prev field at the second word. If the
3180 * second already has a .prev field set, stop now. This will be the
3181 * case either if we've already processed that word's accept state,
3b753521
FN
3182 * or that state had multiple words, and the overspill words were
3183 * already linked up earlier.
2e64971a
DM
3184 */
3185 {
3186 U16 word;
3187 U32 state;
3188 U16 prev;
3189
3190 for (word=1; word <= trie->wordcount; word++) {
3191 prev = 0;
3192 if (trie->wordinfo[word].prev)
3193 continue;
3194 state = trie->wordinfo[word].accept;
3195 while (state) {
3196 state = prev_states[state];
3197 if (!state)
3198 break;
3199 prev = trie->states[state].wordnum;
3200 if (prev)
3201 break;
3202 }
3203 trie->wordinfo[word].prev = prev;
3204 }
3205 Safefree(prev_states);
3206 }
3207
3208
3209 /* and now dump out the compressed format */
3210 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
3211
55eed653 3212 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
3213#ifdef DEBUGGING
3214 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
3215 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
3216#else
03e70be4 3217 SvREFCNT_dec_NN(revcharmap);
07be1b83 3218#endif
538e84ed
KW
3219 return trie->jump
3220 ? MADE_JUMP_TRIE
3221 : trie->startstate>1
3222 ? MADE_EXACT_TRIE
786e8c11
YO
3223 : MADE_TRIE;
3224}
3225
615a2e7f
YO
3226STATIC regnode *
3227S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *source, U32 depth)
786e8c11 3228{
b423522f
KW
3229/* The Trie is constructed and compressed now so we can build a fail array if
3230 * it's needed
786e8c11 3231
b423522f
KW
3232 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and
3233 3.32 in the
3234 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi,
3235 Ullman 1985/88
786e8c11
YO
3236 ISBN 0-201-10088-6
3237
b423522f
KW
3238 We find the fail state for each state in the trie, this state is the longest
3239 proper suffix of the current state's 'word' that is also a proper prefix of
3240 another word in our trie. State 1 represents the word '' and is thus the
3241 default fail state. This allows the DFA not to have to restart after its
3242 tried and failed a word at a given point, it simply continues as though it
3243 had been matching the other word in the first place.
786e8c11
YO
3244 Consider
3245 'abcdgu'=~/abcdefg|cdgu/
b423522f
KW
3246 When we get to 'd' we are still matching the first word, we would encounter
3247 'g' which would fail, which would bring us to the state representing 'd' in
3248 the second word where we would try 'g' and succeed, proceeding to match
3249 'cdgu'.
786e8c11
YO
3250 */
3251 /* add a fail transition */
3251b653
NC
3252 const U32 trie_offset = ARG(source);
3253 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
3254 U32 *q;
3255 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 3256 const U32 numstates = trie->statecount;
786e8c11
YO
3257 const U32 ubound = trie->lasttrans + ucharcount;
3258 U32 q_read = 0;
3259 U32 q_write = 0;
3260 U32 charid;
3261 U32 base = trie->states[ 1 ].trans.base;
3262 U32 *fail;
3263 reg_ac_data *aho;
cf78de0b 3264 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
615a2e7f 3265 regnode *stclass;
786e8c11 3266 GET_RE_DEBUG_FLAGS_DECL;
7918f24d 3267
615a2e7f 3268 PERL_ARGS_ASSERT_CONSTRUCT_AHOCORASICK_FROM_TRIE;
81611534 3269 PERL_UNUSED_CONTEXT;
786e8c11
YO
3270#ifndef DEBUGGING
3271 PERL_UNUSED_ARG(depth);
3272#endif
3273
615a2e7f
YO
3274 if ( OP(source) == TRIE ) {
3275 struct regnode_1 *op = (struct regnode_1 *)
3276 PerlMemShared_calloc(1, sizeof(struct regnode_1));
3277 StructCopy(source,op,struct regnode_1);
3278 stclass = (regnode *)op;
3279 } else {
3280 struct regnode_charclass *op = (struct regnode_charclass *)
3281 PerlMemShared_calloc(1, sizeof(struct regnode_charclass));
3282 StructCopy(source,op,struct regnode_charclass);
3283 stclass = (regnode *)op;
3284 }
2f306ab9 3285 OP(stclass)+=2; /* convert the TRIE type to its AHO-CORASICK equivalent */
786e8c11
YO
3286
3287 ARG_SET( stclass, data_slot );
c944940b 3288 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 3289 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 3290 aho->trie=trie_offset;
446bd890
NC
3291 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
3292 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 3293 Newxz( q, numstates, U32);
c944940b 3294 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
3295 aho->refcount = 1;
3296 fail = aho->fail;
3297 /* initialize fail[0..1] to be 1 so that we always have
3298 a valid final fail state */
3299 fail[ 0 ] = fail[ 1 ] = 1;
3300
3301 for ( charid = 0; charid < ucharcount ; charid++ ) {
3302 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
3303 if ( newstate ) {
3304 q[ q_write ] = newstate;
3305 /* set to point at the root */
3306 fail[ q[ q_write++ ] ]=1;
3307 }
3308 }
3309 while ( q_read < q_write) {
3310 const U32 cur = q[ q_read++ % numstates ];
3311 base = trie->states[ cur ].trans.base;
3312
3313 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
3314 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
3315 if (ch_state) {
3316 U32 fail_state = cur;
3317 U32 fail_base;
3318 do {
3319 fail_state = fail[ fail_state ];
3320 fail_base = aho->states[ fail_state ].trans.base;
3321 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
3322
3323 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
3324 fail[ ch_state ] = fail_state;
3325 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
3326 {
3327 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
3328 }
3329 q[ q_write++ % numstates] = ch_state;
3330 }
3331 }
3332 }
3333 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
3334 when we fail in state 1, this allows us to use the
3335 charclass scan to find a valid start char. This is based on the principle
3336 that theres a good chance the string being searched contains lots of stuff
3337 that cant be a start char.
3338 */
3339 fail[ 0 ] = fail[ 1 ] = 0;
3340 DEBUG_TRIE_COMPILE_r({
6d99fb9b 3341 PerlIO_printf(Perl_debug_log,
538e84ed 3342 "%*sStclass Failtable (%"UVuf" states): 0",
6d99fb9b 3343 (int)(depth * 2), "", (UV)numstates
1e2e3d02 3344 );
786e8c11
YO
3345 for( q_read=1; q_read<numstates; q_read++ ) {
3346 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
3347 }
3348 PerlIO_printf(Perl_debug_log, "\n");
3349 });
3350 Safefree(q);
e384d5c1 3351 /*RExC_seen |= REG_TRIEDFA_SEEN;*/
615a2e7f 3352 return stclass;
a3621e74
YO
3353}
3354
786e8c11 3355
07be1b83 3356#define DEBUG_PEEP(str,scan,depth) \
b515a41d 3357 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83 3358 regnode *Next = regnext(scan); \
8b9781c9 3359 regprop(RExC_rx, RExC_mysv, scan, NULL, pRExC_state); \
fdfb4f21 3360 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)", \
c9f0d54c
YO
3361 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(RExC_mysv),\
3362 Next ? (REG_NODE_NUM(Next)) : 0 ); \
fdfb4f21
YO
3363 DEBUG_SHOW_STUDY_FLAGS(flags," [ ","]");\
3364 PerlIO_printf(Perl_debug_log, "\n"); \
b515a41d 3365 }});
07be1b83 3366
bb914485 3367/* The below joins as many adjacent EXACTish nodes as possible into a single
0a982f06
KW
3368 * one. The regop may be changed if the node(s) contain certain sequences that
3369 * require special handling. The joining is only done if:
bb914485
KW
3370 * 1) there is room in the current conglomerated node to entirely contain the
3371 * next one.
3372 * 2) they are the exact same node type
3373 *
87b8b349 3374 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
bb914485
KW
3375 * these get optimized out
3376 *
0e7784f3
KW
3377 * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
3378 * as possible, even if that means splitting an existing node so that its first
3379 * part is moved to the preceeding node. This would maximise the efficiency of
3380 * memEQ during matching. Elsewhere in this file, khw proposes splitting
3381 * EXACTFish nodes into portions that don't change under folding vs those that
3382 * do. Those portions that don't change may be the only things in the pattern that
3383 * could be used to find fixed and floating strings.
3384 *
0a982f06
KW
3385 * If a node is to match under /i (folded), the number of characters it matches
3386 * can be different than its character length if it contains a multi-character
31f05a37
KW
3387 * fold. *min_subtract is set to the total delta number of characters of the
3388 * input nodes.
bb914485 3389 *
fb29cc72 3390 * And *unfolded_multi_char is set to indicate whether or not the node contains
31f05a37
KW
3391 * an unfolded multi-char fold. This happens when whether the fold is valid or
3392 * not won't be known until runtime; namely for EXACTF nodes that contain LATIN
3393 * SMALL LETTER SHARP S, as only if the target string being matched against
3394 * turns out to be UTF-8 is that fold valid; and also for EXACTFL nodes whose
3395 * folding rules depend on the locale in force at runtime. (Multi-char folds
3396 * whose components are all above the Latin1 range are not run-time locale
3397 * dependent, and have already been folded by the time this function is
3398 * called.)
f758bddf 3399 *
bb914485 3400 * This is as good a place as any to discuss the design of handling these
0a982f06
KW
3401 * multi-character fold sequences. It's been wrong in Perl for a very long
3402 * time. There are three code points in Unicode whose multi-character folds
3403 * were long ago discovered to mess things up. The previous designs for
3404 * dealing with these involved assigning a special node for them. This
538e84ed 3405 * approach doesn't always work, as evidenced by this example:
a0c4c608 3406 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
538e84ed 3407 * Both sides fold to "sss", but if the pattern is parsed to create a node that
0a982f06 3408 * would match just the \xDF, it won't be able to handle the case where a
bb914485
KW
3409 * successful match would have to cross the node's boundary. The new approach
3410 * that hopefully generally solves the problem generates an EXACTFU_SS node
538e84ed 3411 * that is "sss" in this case.
bb914485 3412 *
0a982f06 3413 * It turns out that there are problems with all multi-character folds, and not
cb117658
KW
3414 * just these three. Now the code is general, for all such cases. The
3415 * approach taken is:
0a982f06 3416 * 1) This routine examines each EXACTFish node that could contain multi-
538e84ed
KW
3417 * character folded sequences. Since a single character can fold into
3418 * such a sequence, the minimum match length for this node is less than
3419 * the number of characters in the node. This routine returns in
31f05a37
KW
3420 * *min_subtract how many characters to subtract from the the actual
3421 * length of the string to get a real minimum match length; it is 0 if
3422 * there are no multi-char foldeds. This delta is used by the caller to
3423 * adjust the min length of the match, and the delta between min and max,
3424 * so that the optimizer doesn't reject these possibilities based on size
3425 * constraints.
cb117658 3426 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
0a982f06
KW
3427 * is used for an EXACTFU node that contains at least one "ss" sequence in
3428 * it. For non-UTF-8 patterns and strings, this is the only case where
3429 * there is a possible fold length change. That means that a regular
3430 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
3431 * with length changes, and so can be processed faster. regexec.c takes
3432 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
31f05a37
KW
3433 * pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
3434 * known until runtime). This saves effort in regex matching. However,
3435 * the pre-folding isn't done for non-UTF8 patterns because the fold of
3436 * the MICRO SIGN requires UTF-8, and we don't want to slow things down by
3437 * forcing the pattern into UTF8 unless necessary. Also what EXACTF (and,
3438 * again, EXACTFL) nodes fold to isn't known until runtime. The fold
0a982f06
KW
3439 * possibilities for the non-UTF8 patterns are quite simple, except for
3440 * the sharp s. All the ones that don't involve a UTF-8 target string are
3441 * members of a fold-pair, and arrays are set up for all of them so that
3442 * the other member of the pair can be found quickly. Code elsewhere in
3443 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
3444 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
3445 * described in the next item.
31f05a37
KW
3446 * 3) A problem remains for unfolded multi-char folds. (These occur when the
3447 * validity of the fold won't be known until runtime, and so must remain
3448 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFA
3449 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
3450 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
3451 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
3452 * The reason this is a problem is that the optimizer part of regexec.c
3453 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
3454 * that a character in the pattern corresponds to at most a single
3455 * character in the target string. (And I do mean character, and not byte
3456 * here, unlike other parts of the documentation that have never been
3457 * updated to account for multibyte Unicode.) sharp s in EXACTF and
3458 * EXACTFL nodes can match the two character string 'ss'; in EXACTFA nodes
3459 * it can match "\x{17F}\x{17F}". These, along with other ones in EXACTFL
3460 * nodes, violate the assumption, and they are the only instances where it
3461 * is violated. I'm reluctant to try to change the assumption, as the
3462 * code involved is impenetrable to me (khw), so instead the code here
3463 * punts. This routine examines EXACTFL nodes, and (when the pattern
3464 * isn't UTF-8) EXACTF and EXACTFA for such unfolded folds, and returns a
3465 * boolean indicating whether or not the node contains such a fold. When
3466 * it is true, the caller sets a flag that later causes the optimizer in
3467 * this file to not set values for the floating and fixed string lengths,
3468 * and thus avoids the optimizer code in regexec.c that makes the invalid
1ca267a5 3469 * assumption. Thus, there is no optimization based on string lengths for
31f05a37
KW
3470 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
3471 * EXACTF and EXACTFA nodes that contain the sharp s. (The reason the
3472 * assumption is wrong only in these cases is that all other non-UTF-8
3473 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
3474 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
3475 * EXACTF nodes because we don't know at compile time if it actually
3476 * matches 'ss' or not. For EXACTF nodes it will match iff the target
3477 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
3478 * always matches; and EXACTFA where it never does. In an EXACTFA node in
3479<