This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Remove unused parameter in static function
[perl5.git] / regcomp.c
CommitLineData
a0d0e21e
LW
1/* regcomp.c
2 */
3
4/*
4ac71550
TC
5 * 'A fair jaw-cracker dwarf-language must be.' --Samwise Gamgee
6 *
7 * [p.285 of _The Lord of the Rings_, II/iii: "The Ring Goes South"]
a0d0e21e
LW
8 */
9
61296642
DM
10/* This file contains functions for compiling a regular expression. See
11 * also regexec.c which funnily enough, contains functions for executing
166f8a29 12 * a regular expression.
e4a054ea
DM
13 *
14 * This file is also copied at build time to ext/re/re_comp.c, where
15 * it's built with -DPERL_EXT_RE_BUILD -DPERL_EXT_RE_DEBUG -DPERL_EXT.
16 * This causes the main functions to be compiled under new names and with
17 * debugging support added, which makes "use re 'debug'" work.
166f8a29
DM
18 */
19
a687059c
LW
20/* NOTE: this is derived from Henry Spencer's regexp code, and should not
21 * confused with the original package (see point 3 below). Thanks, Henry!
22 */
23
24/* Additional note: this code is very heavily munged from Henry's version
25 * in places. In some spots I've traded clarity for efficiency, so don't
26 * blame Henry for some of the lack of readability.
27 */
28
e50aee73 29/* The names of the functions have been changed from regcomp and
3b753521 30 * regexec to pregcomp and pregexec in order to avoid conflicts
e50aee73
AD
31 * with the POSIX routines of the same names.
32*/
33
b9d5759e 34#ifdef PERL_EXT_RE_BUILD
54df2634 35#include "re_top.h"
b81d288d 36#endif
56953603 37
a687059c 38/*
e50aee73 39 * pregcomp and pregexec -- regsub and regerror are not used in perl
a687059c
LW
40 *
41 * Copyright (c) 1986 by University of Toronto.
42 * Written by Henry Spencer. Not derived from licensed software.
43 *
44 * Permission is granted to anyone to use this software for any
45 * purpose on any computer system, and to redistribute it freely,
46 * subject to the following restrictions:
47 *
48 * 1. The author is not responsible for the consequences of use of
49 * this software, no matter how awful, even if they arise
50 * from defects in it.
51 *
52 * 2. The origin of this software must not be misrepresented, either
53 * by explicit claim or by omission.
54 *
55 * 3. Altered versions must be plainly marked as such, and must not
56 * be misrepresented as being the original software.
57 *
58 *
59 **** Alterations to Henry's code are...
60 ****
4bb101f2 61 **** Copyright (C) 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999,
1129b882
NC
62 **** 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
63 **** by Larry Wall and others
a687059c 64 ****
9ef589d8
LW
65 **** You may distribute under the terms of either the GNU General Public
66 **** License or the Artistic License, as specified in the README file.
67
a687059c
LW
68 *
69 * Beware that some of this code is subtly aware of the way operator
70 * precedence is structured in regular expressions. Serious changes in
71 * regular-expression syntax might require a total rethink.
72 */
73#include "EXTERN.h"
864dbfa3 74#define PERL_IN_REGCOMP_C
a687059c 75#include "perl.h"
d06ea78c 76
acfe0abc 77#ifndef PERL_IN_XSUB_RE
d06ea78c
GS
78# include "INTERN.h"
79#endif
c277df42
IZ
80
81#define REG_COMP_C
54df2634
NC
82#ifdef PERL_IN_XSUB_RE
83# include "re_comp.h"
40b6423c 84extern const struct regexp_engine my_reg_engine;
54df2634
NC
85#else
86# include "regcomp.h"
87#endif
a687059c 88
04e98a4d 89#include "dquote_static.c"
26faadbd 90#include "charclass_invlists.h"
81e983c1 91#include "inline_invlist.c"
1b0f46bf 92#include "unicode_constants.h"
04e98a4d 93
94dc5c2d 94#define HAS_NONLATIN1_FOLD_CLOSURE(i) _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(i)
26faadbd 95#define IS_NON_FINAL_FOLD(c) _IS_NON_FINAL_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
2c61f163 96#define IS_IN_SOME_FOLD_L1(c) _IS_IN_SOME_FOLD_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c)
94dc5c2d 97
a687059c
LW
98#ifndef STATIC
99#define STATIC static
100#endif
101
b1603ef8 102
09b2b2e6 103struct RExC_state_t {
514a91f1
DM
104 U32 flags; /* RXf_* are we folding, multilining? */
105 U32 pm_flags; /* PMf_* stuff from the calling PMOP */
830247a4 106 char *precomp; /* uncompiled string. */
288b8c02 107 REGEXP *rx_sv; /* The SV that is the regexp. */
f8fc2ecf
YO
108 regexp *rx; /* perl core regexp structure */
109 regexp_internal *rxi; /* internal data for regexp object pprivate field */
fac92740 110 char *start; /* Start of input for compile */
830247a4
IZ
111 char *end; /* End of input for compile */
112 char *parse; /* Input-scan pointer. */
ea3daa5d 113 SSize_t whilem_seen; /* number of WHILEM in this expr */
fac92740 114 regnode *emit_start; /* Start of emitted-code area */
3b57cd43 115 regnode *emit_bound; /* First regnode outside of the allocated space */
f7c7e32a
DM
116 regnode *emit; /* Code-emit pointer; if = &emit_dummy,
117 implies compiling, so don't emit */
9a81a976
KW
118 regnode_ssc emit_dummy; /* placeholder for emit to point to;
119 large enough for the largest
120 non-EXACTish node, so can use it as
121 scratch in pass1 */
830247a4
IZ
122 I32 naughty; /* How bad is this pattern? */
123 I32 sawback; /* Did we see \1, ...? */
124 U32 seen;
ea3daa5d 125 SSize_t size; /* Code size. */
c74340f9
YO
126 I32 npar; /* Capture buffer count, (OPEN). */
127 I32 cpar; /* Capture buffer count, (CLOSE). */
e2e6a0f1 128 I32 nestroot; /* root parens we are in - used by accept */
830247a4
IZ
129 I32 extralen;
130 I32 seen_zerolen;
40d049e4
YO
131 regnode **open_parens; /* pointers to open parens */
132 regnode **close_parens; /* pointers to close parens */
133 regnode *opend; /* END node in program */
02daf0ab
YO
134 I32 utf8; /* whether the pattern is utf8 or not */
135 I32 orig_utf8; /* whether the pattern was originally in utf8 */
136 /* XXX use this for future optimisation of case
137 * where pattern must be upgraded to utf8. */
e40e74fe
KW
138 I32 uni_semantics; /* If a d charset modifier should use unicode
139 rules, even if the pattern is not in
140 utf8 */
81714fb9 141 HV *paren_names; /* Paren names */
1f1031fe 142
40d049e4
YO
143 regnode **recurse; /* Recurse regops */
144 I32 recurse_count; /* Number of recurse regops */
b57e4118 145 I32 in_lookbehind;
4624b182 146 I32 contains_locale;
cfafade5 147 I32 contains_i;
bb3f3ed2 148 I32 override_recoding;
9d53c457 149 I32 in_multi_char_class;
3d2bd50a 150 struct reg_code_block *code_blocks; /* positions of literal (?{})
68e2671b 151 within pattern */
b1603ef8
DM
152 int num_code_blocks; /* size of code_blocks[] */
153 int code_index; /* next code_blocks[] slot */
830247a4
IZ
154#if ADD_TO_REGEXEC
155 char *starttry; /* -Dr: where regtry was called. */
156#define RExC_starttry (pRExC_state->starttry)
157#endif
d24ca0c5 158 SV *runtime_code_qr; /* qr with the runtime code blocks */
3dab1dad 159#ifdef DEBUGGING
be8e71aa 160 const char *lastparse;
3dab1dad 161 I32 lastnum;
1f1031fe 162 AV *paren_name_list; /* idx -> name */
3dab1dad
YO
163#define RExC_lastparse (pRExC_state->lastparse)
164#define RExC_lastnum (pRExC_state->lastnum)
1f1031fe 165#define RExC_paren_name_list (pRExC_state->paren_name_list)
3dab1dad 166#endif
09b2b2e6 167};
830247a4 168
e2509266 169#define RExC_flags (pRExC_state->flags)
514a91f1 170#define RExC_pm_flags (pRExC_state->pm_flags)
830247a4 171#define RExC_precomp (pRExC_state->precomp)
288b8c02 172#define RExC_rx_sv (pRExC_state->rx_sv)
830247a4 173#define RExC_rx (pRExC_state->rx)
f8fc2ecf 174#define RExC_rxi (pRExC_state->rxi)
fac92740 175#define RExC_start (pRExC_state->start)
830247a4
IZ
176#define RExC_end (pRExC_state->end)
177#define RExC_parse (pRExC_state->parse)
178#define RExC_whilem_seen (pRExC_state->whilem_seen)
7122b237
YO
179#ifdef RE_TRACK_PATTERN_OFFSETS
180#define RExC_offsets (pRExC_state->rxi->u.offsets) /* I am not like the others */
181#endif
830247a4 182#define RExC_emit (pRExC_state->emit)
f7c7e32a 183#define RExC_emit_dummy (pRExC_state->emit_dummy)
fac92740 184#define RExC_emit_start (pRExC_state->emit_start)
3b57cd43 185#define RExC_emit_bound (pRExC_state->emit_bound)
830247a4
IZ
186#define RExC_naughty (pRExC_state->naughty)
187#define RExC_sawback (pRExC_state->sawback)
188#define RExC_seen (pRExC_state->seen)
189#define RExC_size (pRExC_state->size)
190#define RExC_npar (pRExC_state->npar)
e2e6a0f1 191#define RExC_nestroot (pRExC_state->nestroot)
830247a4
IZ
192#define RExC_extralen (pRExC_state->extralen)
193#define RExC_seen_zerolen (pRExC_state->seen_zerolen)
1aa99e6b 194#define RExC_utf8 (pRExC_state->utf8)
e40e74fe 195#define RExC_uni_semantics (pRExC_state->uni_semantics)
02daf0ab 196#define RExC_orig_utf8 (pRExC_state->orig_utf8)
40d049e4
YO
197#define RExC_open_parens (pRExC_state->open_parens)
198#define RExC_close_parens (pRExC_state->close_parens)
199#define RExC_opend (pRExC_state->opend)
81714fb9 200#define RExC_paren_names (pRExC_state->paren_names)
40d049e4
YO
201#define RExC_recurse (pRExC_state->recurse)
202#define RExC_recurse_count (pRExC_state->recurse_count)
b57e4118 203#define RExC_in_lookbehind (pRExC_state->in_lookbehind)
4624b182 204#define RExC_contains_locale (pRExC_state->contains_locale)
cfafade5 205#define RExC_contains_i (pRExC_state->contains_i)
9d53c457
KW
206#define RExC_override_recoding (pRExC_state->override_recoding)
207#define RExC_in_multi_char_class (pRExC_state->in_multi_char_class)
830247a4 208
cde0cee5 209
a687059c
LW
210#define ISMULT1(c) ((c) == '*' || (c) == '+' || (c) == '?')
211#define ISMULT2(s) ((*s) == '*' || (*s) == '+' || (*s) == '?' || \
4d68ffa0 212 ((*s) == '{' && regcurly(s, FALSE)))
a687059c
LW
213
214/*
215 * Flags to be passed up and down.
216 */
a687059c 217#define WORST 0 /* Worst case. */
a3b492c3 218#define HASWIDTH 0x01 /* Known to match non-null strings. */
fda99bee 219
e64f369d 220/* Simple enough to be STAR/PLUS operand; in an EXACTish node must be a single
2fd92675
KW
221 * character. (There needs to be a case: in the switch statement in regexec.c
222 * for any node marked SIMPLE.) Note that this is not the same thing as
223 * REGNODE_SIMPLE */
fda99bee 224#define SIMPLE 0x02
e64f369d 225#define SPSTART 0x04 /* Starts with * or + */
8d9c2815
NC
226#define POSTPONED 0x08 /* (?1),(?&name), (??{...}) or similar */
227#define TRYAGAIN 0x10 /* Weeded out a declaration. */
228#define RESTART_UTF8 0x20 /* Restart, need to calcuate sizes as UTF-8 */
a687059c 229
3dab1dad
YO
230#define REG_NODE_NUM(x) ((x) ? (int)((x)-RExC_emit_start) : -1)
231
07be1b83
YO
232/* whether trie related optimizations are enabled */
233#if PERL_ENABLE_EXTENDED_TRIE_OPTIMISATION
234#define TRIE_STUDY_OPT
786e8c11 235#define FULL_TRIE_STUDY
07be1b83
YO
236#define TRIE_STCLASS
237#endif
1de06328
YO
238
239
40d049e4
YO
240
241#define PBYTE(u8str,paren) ((U8*)(u8str))[(paren) >> 3]
242#define PBITVAL(paren) (1 << ((paren) & 7))
243#define PAREN_TEST(u8str,paren) ( PBYTE(u8str,paren) & PBITVAL(paren))
244#define PAREN_SET(u8str,paren) PBYTE(u8str,paren) |= PBITVAL(paren)
245#define PAREN_UNSET(u8str,paren) PBYTE(u8str,paren) &= (~PBITVAL(paren))
246
bbd61b5f 247#define REQUIRE_UTF8 STMT_START { \
8d9c2815
NC
248 if (!UTF) { \
249 *flagp = RESTART_UTF8; \
250 return NULL; \
251 } \
bbd61b5f 252 } STMT_END
40d049e4 253
f19b1a63
KW
254/* This converts the named class defined in regcomp.h to its equivalent class
255 * number defined in handy.h. */
256#define namedclass_to_classnum(class) ((int) ((class) / 2))
257#define classnum_to_namedclass(classnum) ((classnum) * 2)
258
de92f5e6
KW
259#define _invlist_union_complement_2nd(a, b, output) \
260 _invlist_union_maybe_complement_2nd(a, b, TRUE, output)
261#define _invlist_intersection_complement_2nd(a, b, output) \
262 _invlist_intersection_maybe_complement_2nd(a, b, TRUE, output)
263
fb38762f
KW
264STATIC void
265S_ssc_flags_and(regnode_ssc *ssc, const U8 and_with)
266{
267 /* Take the flags 'and_with' and accumulate them anded into the flags for
268 * the SSC 'ssc'. The non-SSC related flags in 'and_with' are ignored. */
269
270 const U8 ssc_only_flags = ANYOF_FLAGS(ssc) & ~ANYOF_LOCALE_FLAGS;
271
272 PERL_ARGS_ASSERT_SSC_FLAGS_AND;
273
274 /* Use just the SSC-related flags from 'and_with' */
275 ANYOF_FLAGS(ssc) &= (and_with & ANYOF_LOCALE_FLAGS);
276 ANYOF_FLAGS(ssc) |= ssc_only_flags;
277}
278
279STATIC int
280S_ssc_is_cp_posixl_init(pTHX_ const RExC_state_t *pRExC_state,
281 const regnode_ssc *ssc)
282{
283 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
284 * to the list of code points matched, and locale posix classes; hence does
285 * not check its flags) */
286
287 UV start, end;
288 bool ret;
289
290 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
291
292 assert(OP(ssc) == ANYOF_SYNTHETIC);
293
294 invlist_iterinit(ssc->invlist);
295 ret = invlist_iternext(ssc->invlist, &start, &end)
296 && start == 0
297 && end == UV_MAX;
298
299 invlist_iterfinish(ssc->invlist);
300
301 if (! ret) {
302 return FALSE;
303 }
304
305 if (RExC_contains_locale) {
306 if (! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE)
307 || ! (ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
308 || ! ANYOF_POSIXL_TEST_ALL_SET(ssc))
309 {
310 return FALSE;
311 }
312 }
313
314 return TRUE;
315}
316
317STATIC SV*
318S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
319 const regnode_charclass_posixl* const node)
320{
321 /* Returns a mortal inversion list defining which code points are matched
322 * by 'node', which is of type ANYOF. Handles complementing the result if
323 * appropriate. If some code points aren't knowable at this time, the
324 * returned list must, and will, contain every possible code point. */
325
326 SV* invlist = sv_2mortal(_new_invlist(0));
327 unsigned int i;
328 const U32 n = ARG(node);
329
330 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
331
332 /* Look at the data structure created by S_set_ANYOF_arg() */
333 if (n != ANYOF_NONBITMAP_EMPTY) {
334 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
335 AV * const av = MUTABLE_AV(SvRV(rv));
336 SV **const ary = AvARRAY(av);
337 assert(RExC_rxi->data->what[n] == 's');
338
339 if (ary[1] && ary[1] != &PL_sv_undef) { /* Has compile-time swash */
340 invlist = sv_2mortal(invlist_clone(_get_swash_invlist(ary[1])));
341 }
342 else if (ary[0] && ary[0] != &PL_sv_undef) {
343
344 /* Here, no compile-time swash, and there are things that won't be
345 * known until runtime -- we have to assume it could be anything */
346 return _add_range_to_invlist(invlist, 0, UV_MAX);
347 }
348 else {
349
350 /* Here no compile-time swash, and no run-time only data. Use the
351 * node's inversion list */
352 invlist = sv_2mortal(invlist_clone(ary[2]));
353 }
354 }
355
356 /* An ANYOF node contains a bitmap for the first 256 code points, and an
357 * inversion list for the others, but if there are code points that should
358 * match only conditionally on the target string being UTF-8, those are
359 * placed in the inversion list, and not the bitmap. Since there are
360 * circumstances under which they could match, they are included in the
361 * SSC. But if the ANYOF node is to be inverted, we have to exclude them
362 * here, so that when we invert below, the end result actually does include
363 * them. (Think about "\xe0" =~ /[^\xc0]/di;). We have to do this here
364 * before we add the unconditionally matched code points */
365 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
366 _invlist_intersection_complement_2nd(invlist,
367 PL_UpperLatin1,
368 &invlist);
369 }
370
371 /* Add in the points from the bit map */
372 for (i = 0; i < 256; i++) {
373 if (ANYOF_BITMAP_TEST(node, i)) {
374 invlist = add_cp_to_invlist(invlist, i);
375 }
376 }
377
378 /* If this can match all upper Latin1 code points, have to add them
379 * as well */
380 if (ANYOF_FLAGS(node) & ANYOF_NON_UTF8_LATIN1_ALL) {
381 _invlist_union(invlist, PL_UpperLatin1, &invlist);
382 }
383
384 /* Similarly for these */
385 if (ANYOF_FLAGS(node) & ANYOF_ABOVE_LATIN1_ALL) {
386 invlist = _add_range_to_invlist(invlist, 256, UV_MAX);
387 }
388
389 if (ANYOF_FLAGS(node) & ANYOF_INVERT) {
390 _invlist_invert(invlist);
391 }
392
393 return invlist;
394}
395
396PERL_STATIC_INLINE void
397S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
398{
399 PERL_ARGS_ASSERT_SSC_UNION;
400
401 assert(OP(ssc) == ANYOF_SYNTHETIC);
402
403 _invlist_union_maybe_complement_2nd(ssc->invlist,
404 invlist,
405 invert2nd,
406 &ssc->invlist);
407}
408
409PERL_STATIC_INLINE void
410S_ssc_intersection(pTHX_ regnode_ssc *ssc,
411 SV* const invlist,
412 const bool invert2nd)
413{
414 PERL_ARGS_ASSERT_SSC_INTERSECTION;
415
416 assert(OP(ssc) == ANYOF_SYNTHETIC);
417
418 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
419 invlist,
420 invert2nd,
421 &ssc->invlist);
422}
423
424PERL_STATIC_INLINE void
425S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
426{
427 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
428
429 assert(OP(ssc) == ANYOF_SYNTHETIC);
430
431 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
432}
433
434PERL_STATIC_INLINE void
435S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
436{
437 /* AND just the single code point 'cp' into the SSC 'ssc' */
438
439 SV* cp_list = _new_invlist(2);
440
441 PERL_ARGS_ASSERT_SSC_CP_AND;
442
443 assert(OP(ssc) == ANYOF_SYNTHETIC);
444
445 cp_list = add_cp_to_invlist(cp_list, cp);
446 ssc_intersection(ssc, cp_list,
447 FALSE /* Not inverted */
448 );
449 SvREFCNT_dec_NN(cp_list);
450}
451
452PERL_STATIC_INLINE void
453S_ssc_clear_locale(pTHX_ regnode_ssc *ssc)
454{
455 /* Set the SSC 'ssc' to not match any locale things */
456
457 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
458
459 assert(OP(ssc) == ANYOF_SYNTHETIC);
460
461 ANYOF_POSIXL_ZERO(ssc);
462 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
463}
464
465STATIC void
466S_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
467{
468 /* The inversion list in the SSC is marked mortal; now we need a more
469 * permanent copy, which is stored the same way that is done in a regular
470 * ANYOF node, with the first 256 code points in a bit map */
471
472 SV* invlist = invlist_clone(ssc->invlist);
473
474 PERL_ARGS_ASSERT_SSC_FINALIZE;
475
476 assert(OP(ssc) == ANYOF_SYNTHETIC);
477
478 populate_ANYOF_from_invlist( (regnode *) ssc, &invlist);
479
480 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL, FALSE);
481
482 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE) || RExC_contains_locale);
483}
484
1de06328
YO
485/* About scan_data_t.
486
487 During optimisation we recurse through the regexp program performing
488 various inplace (keyhole style) optimisations. In addition study_chunk
489 and scan_commit populate this data structure with information about
490 what strings MUST appear in the pattern. We look for the longest
3b753521 491 string that must appear at a fixed location, and we look for the
1de06328
YO
492 longest string that may appear at a floating location. So for instance
493 in the pattern:
494
495 /FOO[xX]A.*B[xX]BAR/
496
497 Both 'FOO' and 'A' are fixed strings. Both 'B' and 'BAR' are floating
498 strings (because they follow a .* construct). study_chunk will identify
499 both FOO and BAR as being the longest fixed and floating strings respectively.
500
501 The strings can be composites, for instance
502
503 /(f)(o)(o)/
504
505 will result in a composite fixed substring 'foo'.
506
507 For each string some basic information is maintained:
508
509 - offset or min_offset
510 This is the position the string must appear at, or not before.
511 It also implicitly (when combined with minlenp) tells us how many
3b753521
FN
512 characters must match before the string we are searching for.
513 Likewise when combined with minlenp and the length of the string it
1de06328
YO
514 tells us how many characters must appear after the string we have
515 found.
516
517 - max_offset
518 Only used for floating strings. This is the rightmost point that
ea3daa5d 519 the string can appear at. If set to SSize_t_MAX it indicates that the
1de06328
YO
520 string can occur infinitely far to the right.
521
522 - minlenp
2d608413
KW
523 A pointer to the minimum number of characters of the pattern that the
524 string was found inside. This is important as in the case of positive
1de06328
YO
525 lookahead or positive lookbehind we can have multiple patterns
526 involved. Consider
527
528 /(?=FOO).*F/
529
530 The minimum length of the pattern overall is 3, the minimum length
531 of the lookahead part is 3, but the minimum length of the part that
532 will actually match is 1. So 'FOO's minimum length is 3, but the
533 minimum length for the F is 1. This is important as the minimum length
534 is used to determine offsets in front of and behind the string being
535 looked for. Since strings can be composites this is the length of the
486ec47a 536 pattern at the time it was committed with a scan_commit. Note that
1de06328
YO
537 the length is calculated by study_chunk, so that the minimum lengths
538 are not known until the full pattern has been compiled, thus the
539 pointer to the value.
540
541 - lookbehind
542
543 In the case of lookbehind the string being searched for can be
544 offset past the start point of the final matching string.
545 If this value was just blithely removed from the min_offset it would
546 invalidate some of the calculations for how many chars must match
547 before or after (as they are derived from min_offset and minlen and
548 the length of the string being searched for).
549 When the final pattern is compiled and the data is moved from the
550 scan_data_t structure into the regexp structure the information
551 about lookbehind is factored in, with the information that would
552 have been lost precalculated in the end_shift field for the
553 associated string.
554
555 The fields pos_min and pos_delta are used to store the minimum offset
556 and the delta to the maximum offset at the current point in the pattern.
557
558*/
2c2d71f5
JH
559
560typedef struct scan_data_t {
1de06328
YO
561 /*I32 len_min; unused */
562 /*I32 len_delta; unused */
49f55535 563 SSize_t pos_min;
ea3daa5d 564 SSize_t pos_delta;
2c2d71f5 565 SV *last_found;
ea3daa5d 566 SSize_t last_end; /* min value, <0 unless valid. */
49f55535 567 SSize_t last_start_min;
ea3daa5d 568 SSize_t last_start_max;
1de06328
YO
569 SV **longest; /* Either &l_fixed, or &l_float. */
570 SV *longest_fixed; /* longest fixed string found in pattern */
49f55535 571 SSize_t offset_fixed; /* offset where it starts */
ea3daa5d 572 SSize_t *minlen_fixed; /* pointer to the minlen relevant to the string */
1de06328
YO
573 I32 lookbehind_fixed; /* is the position of the string modfied by LB */
574 SV *longest_float; /* longest floating string found in pattern */
49f55535 575 SSize_t offset_float_min; /* earliest point in string it can appear */
ea3daa5d
FC
576 SSize_t offset_float_max; /* latest point in string it can appear */
577 SSize_t *minlen_float; /* pointer to the minlen relevant to the string */
49f55535 578 SSize_t lookbehind_float; /* is the pos of the string modified by LB */
2c2d71f5
JH
579 I32 flags;
580 I32 whilem_c;
49f55535 581 SSize_t *last_closep;
b8f7bb16 582 regnode_ssc *start_class;
2c2d71f5
JH
583} scan_data_t;
584
c02c3054
KW
585/* The below is perhaps overboard, but this allows us to save a test at the
586 * expense of a mask. This is because on both EBCDIC and ASCII machines, 'A'
587 * and 'a' differ by a single bit; the same with the upper and lower case of
588 * all other ASCII-range alphabetics. On ASCII platforms, they are 32 apart;
589 * on EBCDIC, they are 64. This uses an exclusive 'or' to find that bit and
590 * then inverts it to form a mask, with just a single 0, in the bit position
591 * where the upper- and lowercase differ. XXX There are about 40 other
592 * instances in the Perl core where this micro-optimization could be used.
593 * Should decide if maintenance cost is worse, before changing those
594 *
595 * Returns a boolean as to whether or not 'v' is either a lowercase or
596 * uppercase instance of 'c', where 'c' is in [A-Za-z]. If 'c' is a
597 * compile-time constant, the generated code is better than some optimizing
598 * compilers figure out, amounting to a mask and test. The results are
599 * meaningless if 'c' is not one of [A-Za-z] */
600#define isARG2_lower_or_UPPER_ARG1(c, v) \
601 (((v) & ~('A' ^ 'a')) == ((c) & ~('A' ^ 'a')))
602
a687059c 603/*
e50aee73 604 * Forward declarations for pregcomp()'s friends.
a687059c 605 */
a0d0e21e 606
27da23d5 607static const scan_data_t zero_scan_data =
1de06328 608 { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ,0};
c277df42
IZ
609
610#define SF_BEFORE_EOL (SF_BEFORE_SEOL|SF_BEFORE_MEOL)
07be1b83
YO
611#define SF_BEFORE_SEOL 0x0001
612#define SF_BEFORE_MEOL 0x0002
c277df42
IZ
613#define SF_FIX_BEFORE_EOL (SF_FIX_BEFORE_SEOL|SF_FIX_BEFORE_MEOL)
614#define SF_FL_BEFORE_EOL (SF_FL_BEFORE_SEOL|SF_FL_BEFORE_MEOL)
615
44e3dfd2
BF
616#define SF_FIX_SHIFT_EOL (+2)
617#define SF_FL_SHIFT_EOL (+4)
c277df42
IZ
618
619#define SF_FIX_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FIX_SHIFT_EOL)
620#define SF_FIX_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FIX_SHIFT_EOL)
621
622#define SF_FL_BEFORE_SEOL (SF_BEFORE_SEOL << SF_FL_SHIFT_EOL)
623#define SF_FL_BEFORE_MEOL (SF_BEFORE_MEOL << SF_FL_SHIFT_EOL) /* 0x20 */
07be1b83
YO
624#define SF_IS_INF 0x0040
625#define SF_HAS_PAR 0x0080
626#define SF_IN_PAR 0x0100
627#define SF_HAS_EVAL 0x0200
628#define SCF_DO_SUBSTR 0x0400
653099ff
GS
629#define SCF_DO_STCLASS_AND 0x0800
630#define SCF_DO_STCLASS_OR 0x1000
631#define SCF_DO_STCLASS (SCF_DO_STCLASS_AND|SCF_DO_STCLASS_OR)
e1901655 632#define SCF_WHILEM_VISITED_POS 0x2000
c277df42 633
786e8c11 634#define SCF_TRIE_RESTUDY 0x4000 /* Do restudy? */
e2e6a0f1 635#define SCF_SEEN_ACCEPT 0x8000
688e0391 636#define SCF_TRIE_DOING_RESTUDY 0x10000
07be1b83 637
43fead97 638#define UTF cBOOL(RExC_utf8)
00b27cfc
KW
639
640/* The enums for all these are ordered so things work out correctly */
a62b1201 641#define LOC (get_regex_charset(RExC_flags) == REGEX_LOCALE_CHARSET)
cfaf538b 642#define DEPENDS_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_DEPENDS_CHARSET)
00b27cfc 643#define UNI_SEMANTICS (get_regex_charset(RExC_flags) == REGEX_UNICODE_CHARSET)
cfaf538b
KW
644#define AT_LEAST_UNI_SEMANTICS (get_regex_charset(RExC_flags) >= REGEX_UNICODE_CHARSET)
645#define ASCII_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_RESTRICTED_CHARSET)
2f7f8cb1 646#define AT_LEAST_ASCII_RESTRICTED (get_regex_charset(RExC_flags) >= REGEX_ASCII_RESTRICTED_CHARSET)
a725e29c 647#define ASCII_FOLD_RESTRICTED (get_regex_charset(RExC_flags) == REGEX_ASCII_MORE_RESTRICTED_CHARSET)
a62b1201 648
43fead97 649#define FOLD cBOOL(RExC_flags & RXf_PMf_FOLD)
a0ed51b3 650
93733859 651#define OOB_NAMEDCLASS -1
b8c5462f 652
8e661ac5
KW
653/* There is no code point that is out-of-bounds, so this is problematic. But
654 * its only current use is to initialize a variable that is always set before
655 * looked at. */
656#define OOB_UNICODE 0xDEADBEEF
657
a0ed51b3
LW
658#define CHR_SVLEN(sv) (UTF ? sv_len_utf8(sv) : SvCUR(sv))
659#define CHR_DIST(a,b) (UTF ? utf8_distance(a,b) : a - b)
660
8615cb43 661
b45f050a
JF
662/* length of regex to show in messages that don't mark a position within */
663#define RegexLengthToShowInErrorMessages 127
664
665/*
666 * If MARKER[12] are adjusted, be sure to adjust the constants at the top
667 * of t/op/regmesg.t, the tests in t/op/re_tests, and those in
668 * op/pragma/warn/regcomp.
669 */
7253e4e3
RK
670#define MARKER1 "<-- HERE" /* marker as it appears in the description */
671#define MARKER2 " <-- HERE " /* marker as it appears within the regex */
b81d288d 672
c1d900c3 673#define REPORT_LOCATION " in regex; marked by " MARKER1 " in m/%"UTF8f MARKER2 "%"UTF8f"/"
b45f050a 674
c1d900c3
BF
675#define REPORT_LOCATION_ARGS(offset) \
676 UTF8fARG(UTF, offset, RExC_precomp), \
677 UTF8fARG(UTF, RExC_end - RExC_precomp - offset, RExC_precomp + offset)
678
b45f050a
JF
679/*
680 * Calls SAVEDESTRUCTOR_X if needed, then calls Perl_croak with the given
681 * arg. Show regex, up to a maximum length. If it's too long, chop and add
682 * "...".
683 */
58e23c8d 684#define _FAIL(code) STMT_START { \
bfed75c6 685 const char *ellipses = ""; \
ccb2c380
MP
686 IV len = RExC_end - RExC_precomp; \
687 \
688 if (!SIZE_ONLY) \
a5e7bc51 689 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
690 if (len > RegexLengthToShowInErrorMessages) { \
691 /* chop 10 shorter than the max, to ensure meaning of "..." */ \
692 len = RegexLengthToShowInErrorMessages - 10; \
693 ellipses = "..."; \
694 } \
58e23c8d 695 code; \
ccb2c380 696} STMT_END
8615cb43 697
58e23c8d 698#define FAIL(msg) _FAIL( \
c1d900c3
BF
699 Perl_croak(aTHX_ "%s in regex m/%"UTF8f"%s/", \
700 msg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d
YO
701
702#define FAIL2(msg,arg) _FAIL( \
c1d900c3
BF
703 Perl_croak(aTHX_ msg " in regex m/%"UTF8f"%s/", \
704 arg, UTF8fARG(UTF, len, RExC_precomp), ellipses))
58e23c8d 705
b45f050a 706/*
b45f050a
JF
707 * Simple_vFAIL -- like FAIL, but marks the current location in the scan
708 */
ccb2c380 709#define Simple_vFAIL(m) STMT_START { \
a28509cc 710 const IV offset = RExC_parse - RExC_precomp; \
ccb2c380 711 Perl_croak(aTHX_ "%s" REPORT_LOCATION, \
c1d900c3 712 m, REPORT_LOCATION_ARGS(offset)); \
ccb2c380 713} STMT_END
b45f050a
JF
714
715/*
716 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL()
717 */
ccb2c380
MP
718#define vFAIL(m) STMT_START { \
719 if (!SIZE_ONLY) \
a5e7bc51 720 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
721 Simple_vFAIL(m); \
722} STMT_END
b45f050a
JF
723
724/*
725 * Like Simple_vFAIL(), but accepts two arguments.
726 */
ccb2c380 727#define Simple_vFAIL2(m,a1) STMT_START { \
a28509cc 728 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
729 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
730 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 731} STMT_END
b45f050a
JF
732
733/*
734 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL2().
735 */
ccb2c380
MP
736#define vFAIL2(m,a1) STMT_START { \
737 if (!SIZE_ONLY) \
a5e7bc51 738 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
739 Simple_vFAIL2(m, a1); \
740} STMT_END
b45f050a
JF
741
742
743/*
744 * Like Simple_vFAIL(), but accepts three arguments.
745 */
ccb2c380 746#define Simple_vFAIL3(m, a1, a2) STMT_START { \
a28509cc 747 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
748 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, \
749 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 750} STMT_END
b45f050a
JF
751
752/*
753 * Calls SAVEDESTRUCTOR_X if needed, then Simple_vFAIL3().
754 */
ccb2c380
MP
755#define vFAIL3(m,a1,a2) STMT_START { \
756 if (!SIZE_ONLY) \
a5e7bc51 757 SAVEFREESV(RExC_rx_sv); \
ccb2c380
MP
758 Simple_vFAIL3(m, a1, a2); \
759} STMT_END
b45f050a
JF
760
761/*
762 * Like Simple_vFAIL(), but accepts four arguments.
763 */
ccb2c380 764#define Simple_vFAIL4(m, a1, a2, a3) STMT_START { \
a28509cc 765 const IV offset = RExC_parse - RExC_precomp; \
c1d900c3
BF
766 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, a2, a3, \
767 REPORT_LOCATION_ARGS(offset)); \
ccb2c380 768} STMT_END
b45f050a 769
95db3ffa
KW
770#define vFAIL4(m,a1,a2,a3) STMT_START { \
771 if (!SIZE_ONLY) \
772 SAVEFREESV(RExC_rx_sv); \
773 Simple_vFAIL4(m, a1, a2, a3); \
774} STMT_END
775
946095af
BF
776/* A specialized version of vFAIL2 that works with UTF8f */
777#define vFAIL2utf8f(m, a1) STMT_START { \
ef3f731d
BF
778 const IV offset = RExC_parse - RExC_precomp; \
779 if (!SIZE_ONLY) \
780 SAVEFREESV(RExC_rx_sv); \
781 S_re_croak2(aTHX_ UTF, m, REPORT_LOCATION, a1, \
782 REPORT_LOCATION_ARGS(offset)); \
946095af
BF
783} STMT_END
784
785
5e0a247b
KW
786/* m is not necessarily a "literal string", in this macro */
787#define reg_warn_non_literal_string(loc, m) STMT_START { \
788 const IV offset = loc - RExC_precomp; \
789 Perl_warner(aTHX_ packWARN(WARN_REGEXP), "%s" REPORT_LOCATION, \
c1d900c3 790 m, REPORT_LOCATION_ARGS(offset)); \
5e0a247b
KW
791} STMT_END
792
668c081a 793#define ckWARNreg(loc,m) STMT_START { \
a28509cc 794 const IV offset = loc - RExC_precomp; \
f10f4c18 795 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 796 REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
797} STMT_END
798
0d6106aa
KW
799#define vWARN_dep(loc, m) STMT_START { \
800 const IV offset = loc - RExC_precomp; \
801 Perl_warner(aTHX_ packWARN(WARN_DEPRECATED), m REPORT_LOCATION, \
c1d900c3 802 REPORT_LOCATION_ARGS(offset)); \
0d6106aa
KW
803} STMT_END
804
147508a2
KW
805#define ckWARNdep(loc,m) STMT_START { \
806 const IV offset = loc - RExC_precomp; \
807 Perl_ck_warner_d(aTHX_ packWARN(WARN_DEPRECATED), \
808 m REPORT_LOCATION, \
c1d900c3 809 REPORT_LOCATION_ARGS(offset)); \
147508a2
KW
810} STMT_END
811
668c081a 812#define ckWARNregdep(loc,m) STMT_START { \
a28509cc 813 const IV offset = loc - RExC_precomp; \
d1d15184 814 Perl_ck_warner_d(aTHX_ packWARN2(WARN_DEPRECATED, WARN_REGEXP), \
f10f4c18 815 m REPORT_LOCATION, \
c1d900c3 816 REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
817} STMT_END
818
b23eb183 819#define ckWARN2reg_d(loc,m, a1) STMT_START { \
2335b3d3 820 const IV offset = loc - RExC_precomp; \
b23eb183 821 Perl_ck_warner_d(aTHX_ packWARN(WARN_REGEXP), \
2335b3d3 822 m REPORT_LOCATION, \
c1d900c3 823 a1, REPORT_LOCATION_ARGS(offset)); \
2335b3d3
KW
824} STMT_END
825
668c081a 826#define ckWARN2reg(loc, m, a1) STMT_START { \
a28509cc 827 const IV offset = loc - RExC_precomp; \
668c081a 828 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 829 a1, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
830} STMT_END
831
832#define vWARN3(loc, m, a1, a2) STMT_START { \
a28509cc 833 const IV offset = loc - RExC_precomp; \
ccb2c380 834 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 835 a1, a2, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
836} STMT_END
837
668c081a
NC
838#define ckWARN3reg(loc, m, a1, a2) STMT_START { \
839 const IV offset = loc - RExC_precomp; \
840 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 841 a1, a2, REPORT_LOCATION_ARGS(offset)); \
668c081a
NC
842} STMT_END
843
ccb2c380 844#define vWARN4(loc, m, a1, a2, a3) STMT_START { \
a28509cc 845 const IV offset = loc - RExC_precomp; \
ccb2c380 846 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 847 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
ccb2c380
MP
848} STMT_END
849
668c081a
NC
850#define ckWARN4reg(loc, m, a1, a2, a3) STMT_START { \
851 const IV offset = loc - RExC_precomp; \
852 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 853 a1, a2, a3, REPORT_LOCATION_ARGS(offset)); \
668c081a
NC
854} STMT_END
855
ccb2c380 856#define vWARN5(loc, m, a1, a2, a3, a4) STMT_START { \
a28509cc 857 const IV offset = loc - RExC_precomp; \
ccb2c380 858 Perl_warner(aTHX_ packWARN(WARN_REGEXP), m REPORT_LOCATION, \
c1d900c3 859 a1, a2, a3, a4, REPORT_LOCATION_ARGS(offset)); \
ccb2c380 860} STMT_END
9d1d55b5 861
8615cb43 862
cd439c50 863/* Allow for side effects in s */
ccb2c380
MP
864#define REGC(c,s) STMT_START { \
865 if (!SIZE_ONLY) *(s) = (c); else (void)(s); \
866} STMT_END
cd439c50 867
fac92740
MJD
868/* Macros for recording node offsets. 20001227 mjd@plover.com
869 * Nodes are numbered 1, 2, 3, 4. Node #n's position is recorded in
870 * element 2*n-1 of the array. Element #2n holds the byte length node #n.
871 * Element 0 holds the number n.
07be1b83 872 * Position is 1 indexed.
fac92740 873 */
7122b237
YO
874#ifndef RE_TRACK_PATTERN_OFFSETS
875#define Set_Node_Offset_To_R(node,byte)
876#define Set_Node_Offset(node,byte)
877#define Set_Cur_Node_Offset
878#define Set_Node_Length_To_R(node,len)
879#define Set_Node_Length(node,len)
6a86c6ad 880#define Set_Node_Cur_Length(node,start)
7122b237
YO
881#define Node_Offset(n)
882#define Node_Length(n)
883#define Set_Node_Offset_Length(node,offset,len)
884#define ProgLen(ri) ri->u.proglen
885#define SetProgLen(ri,x) ri->u.proglen = x
886#else
887#define ProgLen(ri) ri->u.offsets[0]
888#define SetProgLen(ri,x) ri->u.offsets[0] = x
ccb2c380
MP
889#define Set_Node_Offset_To_R(node,byte) STMT_START { \
890 if (! SIZE_ONLY) { \
891 MJD_OFFSET_DEBUG(("** (%d) offset of node %d is %d.\n", \
2a49f0f5 892 __LINE__, (int)(node), (int)(byte))); \
ccb2c380 893 if((node) < 0) { \
551405c4 894 Perl_croak(aTHX_ "value of node is %d in Offset macro", (int)(node)); \
ccb2c380
MP
895 } else { \
896 RExC_offsets[2*(node)-1] = (byte); \
897 } \
898 } \
899} STMT_END
900
901#define Set_Node_Offset(node,byte) \
902 Set_Node_Offset_To_R((node)-RExC_emit_start, (byte)-RExC_start)
903#define Set_Cur_Node_Offset Set_Node_Offset(RExC_emit, RExC_parse)
904
905#define Set_Node_Length_To_R(node,len) STMT_START { \
906 if (! SIZE_ONLY) { \
907 MJD_OFFSET_DEBUG(("** (%d) size of node %d is %d.\n", \
551405c4 908 __LINE__, (int)(node), (int)(len))); \
ccb2c380 909 if((node) < 0) { \
551405c4 910 Perl_croak(aTHX_ "value of node is %d in Length macro", (int)(node)); \
ccb2c380
MP
911 } else { \
912 RExC_offsets[2*(node)] = (len); \
913 } \
914 } \
915} STMT_END
916
917#define Set_Node_Length(node,len) \
918 Set_Node_Length_To_R((node)-RExC_emit_start, len)
6a86c6ad
NC
919#define Set_Node_Cur_Length(node, start) \
920 Set_Node_Length(node, RExC_parse - start)
fac92740
MJD
921
922/* Get offsets and lengths */
923#define Node_Offset(n) (RExC_offsets[2*((n)-RExC_emit_start)-1])
924#define Node_Length(n) (RExC_offsets[2*((n)-RExC_emit_start)])
925
07be1b83
YO
926#define Set_Node_Offset_Length(node,offset,len) STMT_START { \
927 Set_Node_Offset_To_R((node)-RExC_emit_start, (offset)); \
928 Set_Node_Length_To_R((node)-RExC_emit_start, (len)); \
929} STMT_END
7122b237 930#endif
07be1b83
YO
931
932#if PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS
933#define EXPERIMENTAL_INPLACESCAN
f427392e 934#endif /*PERL_ENABLE_EXPERIMENTAL_REGEX_OPTIMISATIONS*/
07be1b83 935
304ee84b
YO
936#define DEBUG_STUDYDATA(str,data,depth) \
937DEBUG_OPTIMISE_MORE_r(if(data){ \
1de06328 938 PerlIO_printf(Perl_debug_log, \
304ee84b
YO
939 "%*s" str "Pos:%"IVdf"/%"IVdf \
940 " Flags: 0x%"UVXf" Whilem_c: %"IVdf" Lcp: %"IVdf" %s", \
1de06328
YO
941 (int)(depth)*2, "", \
942 (IV)((data)->pos_min), \
943 (IV)((data)->pos_delta), \
304ee84b 944 (UV)((data)->flags), \
1de06328 945 (IV)((data)->whilem_c), \
304ee84b
YO
946 (IV)((data)->last_closep ? *((data)->last_closep) : -1), \
947 is_inf ? "INF " : "" \
1de06328
YO
948 ); \
949 if ((data)->last_found) \
950 PerlIO_printf(Perl_debug_log, \
951 "Last:'%s' %"IVdf":%"IVdf"/%"IVdf" %sFixed:'%s' @ %"IVdf \
952 " %sFloat: '%s' @ %"IVdf"/%"IVdf"", \
953 SvPVX_const((data)->last_found), \
954 (IV)((data)->last_end), \
955 (IV)((data)->last_start_min), \
956 (IV)((data)->last_start_max), \
957 ((data)->longest && \
958 (data)->longest==&((data)->longest_fixed)) ? "*" : "", \
959 SvPVX_const((data)->longest_fixed), \
960 (IV)((data)->offset_fixed), \
961 ((data)->longest && \
962 (data)->longest==&((data)->longest_float)) ? "*" : "", \
963 SvPVX_const((data)->longest_float), \
964 (IV)((data)->offset_float_min), \
965 (IV)((data)->offset_float_max) \
966 ); \
967 PerlIO_printf(Perl_debug_log,"\n"); \
968});
969
653099ff 970/* Mark that we cannot extend a found fixed substring at this point.
786e8c11 971 Update the longest found anchored substring and the longest found
653099ff
GS
972 floating substrings if needed. */
973
4327152a 974STATIC void
ea3daa5d
FC
975S_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
976 SSize_t *minlenp, int is_inf)
c277df42 977{
e1ec3a88
AL
978 const STRLEN l = CHR_SVLEN(data->last_found);
979 const STRLEN old_l = CHR_SVLEN(*data->longest);
1de06328 980 GET_RE_DEBUG_FLAGS_DECL;
b81d288d 981
7918f24d
NC
982 PERL_ARGS_ASSERT_SCAN_COMMIT;
983
c277df42 984 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
6b43b216 985 SvSetMagicSV(*data->longest, data->last_found);
c277df42
IZ
986 if (*data->longest == data->longest_fixed) {
987 data->offset_fixed = l ? data->last_start_min : data->pos_min;
988 if (data->flags & SF_BEFORE_EOL)
b81d288d 989 data->flags
c277df42
IZ
990 |= ((data->flags & SF_BEFORE_EOL) << SF_FIX_SHIFT_EOL);
991 else
992 data->flags &= ~SF_FIX_BEFORE_EOL;
686b73d4 993 data->minlen_fixed=minlenp;
1de06328 994 data->lookbehind_fixed=0;
a0ed51b3 995 }
304ee84b 996 else { /* *data->longest == data->longest_float */
c277df42 997 data->offset_float_min = l ? data->last_start_min : data->pos_min;
b81d288d
AB
998 data->offset_float_max = (l
999 ? data->last_start_max
ea3daa5d
FC
1000 : (data->pos_delta == SSize_t_MAX
1001 ? SSize_t_MAX
1002 : data->pos_min + data->pos_delta));
1003 if (is_inf
1004 || (STRLEN)data->offset_float_max > (STRLEN)SSize_t_MAX)
1005 data->offset_float_max = SSize_t_MAX;
c277df42 1006 if (data->flags & SF_BEFORE_EOL)
b81d288d 1007 data->flags
c277df42
IZ
1008 |= ((data->flags & SF_BEFORE_EOL) << SF_FL_SHIFT_EOL);
1009 else
1010 data->flags &= ~SF_FL_BEFORE_EOL;
1de06328
YO
1011 data->minlen_float=minlenp;
1012 data->lookbehind_float=0;
c277df42
IZ
1013 }
1014 }
1015 SvCUR_set(data->last_found, 0);
0eda9292 1016 {
a28509cc 1017 SV * const sv = data->last_found;
097eb12c
AL
1018 if (SvUTF8(sv) && SvMAGICAL(sv)) {
1019 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
1020 if (mg)
1021 mg->mg_len = 0;
1022 }
0eda9292 1023 }
c277df42
IZ
1024 data->last_end = -1;
1025 data->flags &= ~SF_BEFORE_EOL;
bcdf7404 1026 DEBUG_STUDYDATA("commit: ",data,0);
c277df42
IZ
1027}
1028
899d20b9
KW
1029/* These macros set, clear and test whether the synthetic start class ('ssc',
1030 * given by the parameter) matches an empty string (EOS). This uses the
1031 * 'next_off' field in the node, to save a bit in the flags field. The ssc
1032 * stands alone, so there is never a next_off, so this field is otherwise
1033 * unused. The EOS information is used only for compilation, but theoretically
1034 * it could be passed on to the execution code. This could be used to store
cdd87c1d
KW
1035 * more than one bit of information, but only this one is currently used. This
1036 * flag could be moved back to the bitmap instead, shared with INVERT, as no
1037 * SSC is ever inverted */
899d20b9
KW
1038#define SET_SSC_EOS(node) STMT_START { (node)->next_off = TRUE; } STMT_END
1039#define CLEAR_SSC_EOS(node) STMT_START { (node)->next_off = FALSE; } STMT_END
1040#define TEST_SSC_EOS(node) cBOOL((node)->next_off)
1041
cdd87c1d
KW
1042/* An SSC is just a regnode_charclass_posix with an extra field: the inversion
1043 * list that describes which code points it matches */
1044
653099ff 1045STATIC void
3420edd7 1046S_ssc_anything(pTHX_ regnode_ssc *ssc)
653099ff 1047{
cdd87c1d
KW
1048 /* Set the SSC 'ssc' to match an empty string or any code point */
1049
557bd3fb 1050 PERL_ARGS_ASSERT_SSC_ANYTHING;
7918f24d 1051
cdd87c1d 1052 assert(OP(ssc) == ANYOF_SYNTHETIC);
3fffb88a 1053
cdd87c1d
KW
1054 ssc->invlist = sv_2mortal(_new_invlist(2)); /* mortalize so won't leak */
1055 _append_range_to_invlist(ssc->invlist, 0, UV_MAX);
1056 SET_SSC_EOS(ssc); /* Plus match empty string */
653099ff
GS
1057}
1058
653099ff 1059STATIC int
cdd87c1d 1060S_ssc_is_anything(pTHX_ const regnode_ssc *ssc)
653099ff 1061{
cdd87c1d
KW
1062 /* Returns TRUE if the SSC 'ssc' can match the empty string or any code
1063 * point */
1064
1065 UV start, end;
1066 bool ret;
653099ff 1067
557bd3fb 1068 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
7918f24d 1069
cdd87c1d
KW
1070 assert(OP(ssc) == ANYOF_SYNTHETIC);
1071
1072 if (! TEST_SSC_EOS(ssc)) {
1073 return FALSE;
1074 }
1075
1076 /* See if the list consists solely of the range 0 - Infinity */
1077 invlist_iterinit(ssc->invlist);
1078 ret = invlist_iternext(ssc->invlist, &start, &end)
1079 && start == 0
1080 && end == UV_MAX;
1081
1082 invlist_iterfinish(ssc->invlist);
1083
1084 if (ret) {
1085 return TRUE;
1086 }
1087
1088 /* If e.g., both \w and \W are set, matches everything */
1089 if (ANYOF_FLAGS(ssc) & ANYOF_POSIXL) {
1090 int i;
1091 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
1092 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
1093 return TRUE;
1094 }
1095 }
1096 }
1097
1098 return FALSE;
653099ff
GS
1099}
1100
653099ff 1101STATIC void
cdd87c1d 1102S_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
653099ff 1103{
cdd87c1d
KW
1104 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
1105 * string, any code point, or any posix class under locale */
1106
557bd3fb 1107 PERL_ARGS_ASSERT_SSC_INIT;
7918f24d 1108
557bd3fb 1109 Zero(ssc, 1, regnode_ssc);
557bd3fb 1110 OP(ssc) = ANYOF_SYNTHETIC;
cdd87c1d 1111 ARG_SET(ssc, ANYOF_NONBITMAP_EMPTY);
3420edd7 1112 ssc_anything(ssc);
cdd87c1d
KW
1113
1114 /* If any portion of the regex is to operate under locale rules,
1115 * initialization includes it. The reason this isn't done for all regexes
1116 * is that the optimizer was written under the assumption that locale was
1117 * all-or-nothing. Given the complexity and lack of documentation in the
1118 * optimizer, and that there are inadequate test cases for locale, many
1119 * parts of it may not work properly, it is safest to avoid locale unless
1120 * necessary. */
1121 if (RExC_contains_locale) {
1122 ANYOF_POSIXL_SETALL(ssc);
cfafade5
KW
1123 ANYOF_FLAGS(ssc) |= ANYOF_LOCALE|ANYOF_POSIXL;
1124 if (RExC_contains_i) {
1125 ANYOF_FLAGS(ssc) |= ANYOF_LOC_FOLD;
1126 }
cdd87c1d
KW
1127 }
1128 else {
1129 ANYOF_POSIXL_ZERO(ssc);
1130 }
653099ff
GS
1131}
1132
1051e1c4 1133/* These two functions currently do the exact same thing */
557bd3fb 1134#define ssc_init_zero ssc_init
653099ff 1135
cdd87c1d
KW
1136#define ssc_add_cp(ssc, cp) ssc_add_range((ssc), (cp), (cp))
1137#define ssc_match_all_cp(ssc) ssc_add_range(ssc, 0, UV_MAX)
1138
557bd3fb 1139/* 'AND' a given class with another one. Can create false positives. 'ssc'
8efd3f97 1140 * should not be inverted. 'and_with->flags & ANYOF_POSIXL' should be 0 if
b8f7bb16 1141 * 'and_with' is a regnode_charclass instead of a regnode_ssc. */
cdd87c1d 1142
653099ff 1143STATIC void
1ca93ef1 1144S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc, const regnode_ssc *and_with)
653099ff 1145{
cdd87c1d
KW
1146 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
1147 * another SSC or a regular ANYOF class. Can create false positives. */
40d049e4 1148
cdd87c1d
KW
1149 /* If 'and_with' is an SSC, we already have its inversion list; otherwise
1150 * have to calculate it */
1151 SV* anded_cp_list = (OP(and_with) == ANYOF_SYNTHETIC)
1152 ? and_with->invlist
1153 : get_ANYOF_cp_list_for_ssc(pRExC_state,
1154 (regnode_charclass_posixl*) and_with);
1e6ade67 1155
cdd87c1d 1156 PERL_ARGS_ASSERT_SSC_AND;
653099ff 1157
cdd87c1d
KW
1158 assert(OP(ssc) == ANYOF_SYNTHETIC);
1159 assert(! (ANYOF_FLAGS(ssc) & ANYOF_INVERT)); /* SSCs are never inverted */
1160
1161 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1162 * C2 is the list of code points in 'and-with'; P2, its posix classes.
1163 * 'and_with' may be inverted. When not inverted, we have the situation of
1164 * computing:
1165 * (C1 | P1) & (C2 | P2)
1166 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
1167 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1168 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
1169 * <= ((C1 & C2) | P1 | P2)
1170 * Alternatively, the last few steps could be:
1171 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
1172 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
1173 * <= (C1 | C2 | (P1 & P2))
1174 * We favor the second approach if either P1 or P2 is non-empty. This is
1175 * because these components are a barrier to doing optimizations, as what
1176 * they match cannot be known until the moment of matching as they are
1177 * dependent on the current locale, 'AND"ing them likely will reduce or
1178 * eliminate them.
1179 * But we can do better if we know that C1,P1 are in their initial state (a
1180 * frequent occurrence), each matching everything:
1181 * (<everything>) & (C2 | P2) = C2 | P2
1182 * Similarly, if C2,P2 are in their initial state (again a frequent
1183 * occurrence), the result is a no-op
1184 * (C1 | P1) & (<everything>) = C1 | P1
1185 *
1186 * Inverted, we have
1187 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
1188 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
1189 * <= (C1 & ~C2) | (P1 & ~P2)
1190 * */
1aa99e6b 1191
1867fb0b 1192 if (ANYOF_FLAGS(and_with) & ANYOF_INVERT) {
cdd87c1d 1193 unsigned int i;
8951c461 1194
cdd87c1d 1195 assert(OP(and_with) != ANYOF_SYNTHETIC);
3ad98780 1196
cdd87c1d
KW
1197 ssc_intersection(ssc,
1198 anded_cp_list,
1199 FALSE /* Has already been inverted */
1200 );
c6b76537 1201
cdd87c1d
KW
1202 /* If either P1 or P2 is empty, the intersection will be also; can skip
1203 * the loop */
1204 if (! (ANYOF_FLAGS(and_with) & ANYOF_POSIXL)) {
1205 ANYOF_POSIXL_ZERO(ssc);
1206 }
1207 else if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1208
1209 /* Note that the Posix class component P from 'and_with' actually
1210 * looks like:
1211 * P = Pa | Pb | ... | Pn
1212 * where each component is one posix class, such as in [\w\s].
1213 * Thus
1214 * ~P = ~(Pa | Pb | ... | Pn)
1215 * = ~Pa & ~Pb & ... & ~Pn
1216 * <= ~Pa | ~Pb | ... | ~Pn
1217 * The last is something we can easily calculate, but unfortunately
1218 * is likely to have many false positives. We could do better
1219 * in some (but certainly not all) instances if two classes in
1220 * P have known relationships. For example
1221 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
1222 * So
1223 * :lower: & :print: = :lower:
1224 * And similarly for classes that must be disjoint. For example,
1225 * since \s and \w can have no elements in common based on rules in
1226 * the POSIX standard,
1227 * \w & ^\S = nothing
1228 * Unfortunately, some vendor locales do not meet the Posix
1229 * standard, in particular almost everything by Microsoft.
1230 * The loop below just changes e.g., \w into \W and vice versa */
1231
1232 regnode_charclass_posixl temp;
1233 int add = 1; /* To calculate the index of the complement */
1234
1235 ANYOF_POSIXL_ZERO(&temp);
1236 for (i = 0; i < ANYOF_MAX; i++) {
1237 assert(i % 2 != 0
1238 || ! ANYOF_POSIXL_TEST(and_with, i)
1239 || ! ANYOF_POSIXL_TEST(and_with, i + 1));
1240
1241 if (ANYOF_POSIXL_TEST(and_with, i)) {
1242 ANYOF_POSIXL_SET(&temp, i + add);
1243 }
1244 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
1245 }
1246 ANYOF_POSIXL_AND(&temp, ssc);
c6b76537 1247
cdd87c1d
KW
1248 } /* else ssc already has no posixes */
1249 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
1250 in its initial state */
1251 else if (OP(and_with) != ANYOF_SYNTHETIC
1252 || ! ssc_is_cp_posixl_init(pRExC_state, and_with))
1253 {
1254 /* But if 'ssc' is in its initial state, the result is just 'and_with';
1255 * copy it over 'ssc' */
1256 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
1257 if (OP(and_with) == ANYOF_SYNTHETIC) {
1258 StructCopy(and_with, ssc, regnode_ssc);
1259 }
1260 else {
1261 ssc->invlist = anded_cp_list;
1262 ANYOF_POSIXL_ZERO(ssc);
1263 if (ANYOF_FLAGS(and_with) & ANYOF_POSIXL) {
1264 ANYOF_POSIXL_OR(and_with, ssc);
1265 }
1266 }
1267 }
1268 else if ((ANYOF_FLAGS(ssc) & ANYOF_POSIXL)
1269 || (ANYOF_FLAGS(and_with) & ANYOF_POSIXL))
1270 {
1271 /* One or the other of P1, P2 is non-empty. */
1272 ANYOF_POSIXL_AND(and_with, ssc);
1273 ssc_union(ssc, anded_cp_list, FALSE);
1274 }
1275 else { /* P1 = P2 = empty */
1276 ssc_intersection(ssc, anded_cp_list, FALSE);
1277 }
137165a6 1278 }
cdd87c1d
KW
1279
1280 ssc_flags_and(ssc, ANYOF_FLAGS(and_with));
653099ff
GS
1281}
1282
653099ff 1283STATIC void
cdd87c1d
KW
1284S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
1285 const regnode_ssc *or_with)
653099ff 1286{
cdd87c1d
KW
1287 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
1288 * another SSC or a regular ANYOF class. Can create false positives if
1289 * 'or_with' is to be inverted. */
7918f24d 1290
cdd87c1d
KW
1291 /* If 'or_with' is an SSC, we already have its inversion list; otherwise
1292 * have to calculate it */
1293 SV* ored_cp_list = (OP(or_with) == ANYOF_SYNTHETIC)
1294 ? or_with->invlist
1295 : get_ANYOF_cp_list_for_ssc(pRExC_state,
1296 (regnode_charclass_posixl*) or_with);
c6b76537 1297
cdd87c1d 1298 PERL_ARGS_ASSERT_SSC_OR;
c6b76537 1299
cdd87c1d
KW
1300 assert(OP(ssc) == ANYOF_SYNTHETIC);
1301 assert(! (ANYOF_FLAGS(ssc) & ANYOF_INVERT));
1302
1303 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
1304 * C2 is the list of code points in 'or-with'; P2, its posix classes.
1305 * 'or_with' may be inverted. When not inverted, we have the simple
1306 * situation of computing:
1307 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
1308 * If P1|P2 yields a situation with both a class and its complement are
1309 * set, like having both \w and \W, this matches all code points, and we
1310 * can delete these from the P component of the ssc going forward. XXX We
1311 * might be able to delete all the P components, but I (khw) am not certain
1312 * about this, and it is better to be safe.
1313 *
1314 * Inverted, we have
1315 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
1316 * <= (C1 | P1) | ~C2
1317 * <= (C1 | ~C2) | P1
1318 * (which results in actually simpler code than the non-inverted case)
1319 * */
9826f543 1320
cdd87c1d
KW
1321 /* Use just the SSC-related flags from 'or_with' */
1322 ANYOF_FLAGS(ssc) |= (ANYOF_FLAGS(or_with) & ANYOF_LOCALE_FLAGS);
d94b1d13 1323
cdd87c1d
KW
1324 if (ANYOF_FLAGS(or_with) & ANYOF_INVERT) {
1325 assert(OP(or_with) != ANYOF_SYNTHETIC);
1326 /* We ignore P2, leaving P1 going forward */
1327 }
1328 else { /* Not inverted */
1329 ANYOF_POSIXL_OR(or_with, ssc);
1330 if (ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1331 unsigned int i;
1332 for (i = 0; i < ANYOF_MAX; i += 2) {
1333 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
1334 {
1335 ssc_match_all_cp(ssc);
1336 ANYOF_POSIXL_CLEAR(ssc, i);
1337 ANYOF_POSIXL_CLEAR(ssc, i+1);
1338 if (! ANYOF_POSIXL_TEST_ANY_SET(ssc)) {
1339 ANYOF_FLAGS(ssc) &= ~ANYOF_POSIXL;
1340 }
1341 }
1342 }
1343 }
1aa99e6b 1344 }
cdd87c1d
KW
1345
1346 ssc_union(ssc,
1347 ored_cp_list,
1348 FALSE /* Already has been inverted */
1349 );
653099ff
GS
1350}
1351
a3621e74
YO
1352#define TRIE_LIST_ITEM(state,idx) (trie->states[state].trans.list)[ idx ]
1353#define TRIE_LIST_CUR(state) ( TRIE_LIST_ITEM( state, 0 ).forid )
1354#define TRIE_LIST_LEN(state) ( TRIE_LIST_ITEM( state, 0 ).newstate )
1355#define TRIE_LIST_USED(idx) ( trie->states[state].trans.list ? (TRIE_LIST_CUR( idx ) - 1) : 0 )
1356
3dab1dad
YO
1357
1358#ifdef DEBUGGING
07be1b83 1359/*
2b8b4781
NC
1360 dump_trie(trie,widecharmap,revcharmap)
1361 dump_trie_interim_list(trie,widecharmap,revcharmap,next_alloc)
1362 dump_trie_interim_table(trie,widecharmap,revcharmap,next_alloc)
3dab1dad
YO
1363
1364 These routines dump out a trie in a somewhat readable format.
07be1b83
YO
1365 The _interim_ variants are used for debugging the interim
1366 tables that are used to generate the final compressed
1367 representation which is what dump_trie expects.
1368
486ec47a 1369 Part of the reason for their existence is to provide a form
3dab1dad 1370 of documentation as to how the different representations function.
07be1b83
YO
1371
1372*/
3dab1dad
YO
1373
1374/*
3dab1dad
YO
1375 Dumps the final compressed table form of the trie to Perl_debug_log.
1376 Used for debugging make_trie().
1377*/
b9a59e08 1378
3dab1dad 1379STATIC void
2b8b4781
NC
1380S_dump_trie(pTHX_ const struct _reg_trie_data *trie, HV *widecharmap,
1381 AV *revcharmap, U32 depth)
3dab1dad
YO
1382{
1383 U32 state;
ab3bbdeb 1384 SV *sv=sv_newmortal();
55eed653 1385 int colwidth= widecharmap ? 6 : 4;
2e64971a 1386 U16 word;
3dab1dad
YO
1387 GET_RE_DEBUG_FLAGS_DECL;
1388
7918f24d 1389 PERL_ARGS_ASSERT_DUMP_TRIE;
ab3bbdeb 1390
3dab1dad
YO
1391 PerlIO_printf( Perl_debug_log, "%*sChar : %-6s%-6s%-4s ",
1392 (int)depth * 2 + 2,"",
1393 "Match","Base","Ofs" );
1394
1395 for( state = 0 ; state < trie->uniquecharcount ; state++ ) {
2b8b4781 1396 SV ** const tmp = av_fetch( revcharmap, state, 0);
3dab1dad 1397 if ( tmp ) {
ab3bbdeb
YO
1398 PerlIO_printf( Perl_debug_log, "%*s",
1399 colwidth,
ddc5bc0f 1400 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1401 PL_colors[0], PL_colors[1],
1402 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1403 PERL_PV_ESCAPE_FIRSTCHAR
1404 )
1405 );
3dab1dad
YO
1406 }
1407 }
1408 PerlIO_printf( Perl_debug_log, "\n%*sState|-----------------------",
1409 (int)depth * 2 + 2,"");
1410
1411 for( state = 0 ; state < trie->uniquecharcount ; state++ )
ab3bbdeb 1412 PerlIO_printf( Perl_debug_log, "%.*s", colwidth, "--------");
3dab1dad
YO
1413 PerlIO_printf( Perl_debug_log, "\n");
1414
1e2e3d02 1415 for( state = 1 ; state < trie->statecount ; state++ ) {
be8e71aa 1416 const U32 base = trie->states[ state ].trans.base;
3dab1dad
YO
1417
1418 PerlIO_printf( Perl_debug_log, "%*s#%4"UVXf"|", (int)depth * 2 + 2,"", (UV)state);
1419
1420 if ( trie->states[ state ].wordnum ) {
1421 PerlIO_printf( Perl_debug_log, " W%4X", trie->states[ state ].wordnum );
1422 } else {
1423 PerlIO_printf( Perl_debug_log, "%6s", "" );
1424 }
1425
1426 PerlIO_printf( Perl_debug_log, " @%4"UVXf" ", (UV)base );
1427
1428 if ( base ) {
1429 U32 ofs = 0;
1430
1431 while( ( base + ofs < trie->uniquecharcount ) ||
1432 ( base + ofs - trie->uniquecharcount < trie->lasttrans
1433 && trie->trans[ base + ofs - trie->uniquecharcount ].check != state))
1434 ofs++;
1435
1436 PerlIO_printf( Perl_debug_log, "+%2"UVXf"[ ", (UV)ofs);
1437
1438 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
1439 if ( ( base + ofs >= trie->uniquecharcount ) &&
1440 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
1441 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
1442 {
ab3bbdeb
YO
1443 PerlIO_printf( Perl_debug_log, "%*"UVXf,
1444 colwidth,
3dab1dad
YO
1445 (UV)trie->trans[ base + ofs - trie->uniquecharcount ].next );
1446 } else {
ab3bbdeb 1447 PerlIO_printf( Perl_debug_log, "%*s",colwidth," ." );
3dab1dad
YO
1448 }
1449 }
1450
1451 PerlIO_printf( Perl_debug_log, "]");
1452
1453 }
1454 PerlIO_printf( Perl_debug_log, "\n" );
1455 }
2e64971a
DM
1456 PerlIO_printf(Perl_debug_log, "%*sword_info N:(prev,len)=", (int)depth*2, "");
1457 for (word=1; word <= trie->wordcount; word++) {
1458 PerlIO_printf(Perl_debug_log, " %d:(%d,%d)",
1459 (int)word, (int)(trie->wordinfo[word].prev),
1460 (int)(trie->wordinfo[word].len));
1461 }
1462 PerlIO_printf(Perl_debug_log, "\n" );
3dab1dad
YO
1463}
1464/*
3dab1dad
YO
1465 Dumps a fully constructed but uncompressed trie in list form.
1466 List tries normally only are used for construction when the number of
1467 possible chars (trie->uniquecharcount) is very high.
1468 Used for debugging make_trie().
1469*/
1470STATIC void
55eed653 1471S_dump_trie_interim_list(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1472 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1473 U32 depth)
3dab1dad
YO
1474{
1475 U32 state;
ab3bbdeb 1476 SV *sv=sv_newmortal();
55eed653 1477 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1478 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1479
1480 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_LIST;
1481
3dab1dad 1482 /* print out the table precompression. */
ab3bbdeb
YO
1483 PerlIO_printf( Perl_debug_log, "%*sState :Word | Transition Data\n%*s%s",
1484 (int)depth * 2 + 2,"", (int)depth * 2 + 2,"",
1485 "------:-----+-----------------\n" );
3dab1dad
YO
1486
1487 for( state=1 ; state < next_alloc ; state ++ ) {
1488 U16 charid;
1489
ab3bbdeb 1490 PerlIO_printf( Perl_debug_log, "%*s %4"UVXf" :",
3dab1dad
YO
1491 (int)depth * 2 + 2,"", (UV)state );
1492 if ( ! trie->states[ state ].wordnum ) {
1493 PerlIO_printf( Perl_debug_log, "%5s| ","");
1494 } else {
1495 PerlIO_printf( Perl_debug_log, "W%4x| ",
1496 trie->states[ state ].wordnum
1497 );
1498 }
1499 for( charid = 1 ; charid <= TRIE_LIST_USED( state ) ; charid++ ) {
2b8b4781 1500 SV ** const tmp = av_fetch( revcharmap, TRIE_LIST_ITEM(state,charid).forid, 0);
ab3bbdeb
YO
1501 if ( tmp ) {
1502 PerlIO_printf( Perl_debug_log, "%*s:%3X=%4"UVXf" | ",
1503 colwidth,
ddc5bc0f 1504 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1505 PL_colors[0], PL_colors[1],
1506 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1507 PERL_PV_ESCAPE_FIRSTCHAR
1508 ) ,
1e2e3d02
YO
1509 TRIE_LIST_ITEM(state,charid).forid,
1510 (UV)TRIE_LIST_ITEM(state,charid).newstate
1511 );
1512 if (!(charid % 10))
664e119d
RGS
1513 PerlIO_printf(Perl_debug_log, "\n%*s| ",
1514 (int)((depth * 2) + 14), "");
1e2e3d02 1515 }
ab3bbdeb
YO
1516 }
1517 PerlIO_printf( Perl_debug_log, "\n");
3dab1dad
YO
1518 }
1519}
1520
1521/*
3dab1dad
YO
1522 Dumps a fully constructed but uncompressed trie in table form.
1523 This is the normal DFA style state transition table, with a few
1524 twists to facilitate compression later.
1525 Used for debugging make_trie().
1526*/
1527STATIC void
55eed653 1528S_dump_trie_interim_table(pTHX_ const struct _reg_trie_data *trie,
2b8b4781
NC
1529 HV *widecharmap, AV *revcharmap, U32 next_alloc,
1530 U32 depth)
3dab1dad
YO
1531{
1532 U32 state;
1533 U16 charid;
ab3bbdeb 1534 SV *sv=sv_newmortal();
55eed653 1535 int colwidth= widecharmap ? 6 : 4;
3dab1dad 1536 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1537
1538 PERL_ARGS_ASSERT_DUMP_TRIE_INTERIM_TABLE;
3dab1dad
YO
1539
1540 /*
1541 print out the table precompression so that we can do a visual check
1542 that they are identical.
1543 */
1544
1545 PerlIO_printf( Perl_debug_log, "%*sChar : ",(int)depth * 2 + 2,"" );
1546
1547 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
2b8b4781 1548 SV ** const tmp = av_fetch( revcharmap, charid, 0);
3dab1dad 1549 if ( tmp ) {
ab3bbdeb
YO
1550 PerlIO_printf( Perl_debug_log, "%*s",
1551 colwidth,
ddc5bc0f 1552 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), colwidth,
ab3bbdeb
YO
1553 PL_colors[0], PL_colors[1],
1554 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
1555 PERL_PV_ESCAPE_FIRSTCHAR
1556 )
1557 );
3dab1dad
YO
1558 }
1559 }
1560
1561 PerlIO_printf( Perl_debug_log, "\n%*sState+-",(int)depth * 2 + 2,"" );
1562
1563 for( charid=0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb 1564 PerlIO_printf( Perl_debug_log, "%.*s", colwidth,"--------");
3dab1dad
YO
1565 }
1566
1567 PerlIO_printf( Perl_debug_log, "\n" );
1568
1569 for( state=1 ; state < next_alloc ; state += trie->uniquecharcount ) {
1570
1571 PerlIO_printf( Perl_debug_log, "%*s%4"UVXf" : ",
1572 (int)depth * 2 + 2,"",
1573 (UV)TRIE_NODENUM( state ) );
1574
1575 for( charid = 0 ; charid < trie->uniquecharcount ; charid++ ) {
ab3bbdeb
YO
1576 UV v=(UV)SAFE_TRIE_NODENUM( trie->trans[ state + charid ].next );
1577 if (v)
1578 PerlIO_printf( Perl_debug_log, "%*"UVXf, colwidth, v );
1579 else
1580 PerlIO_printf( Perl_debug_log, "%*s", colwidth, "." );
3dab1dad
YO
1581 }
1582 if ( ! trie->states[ TRIE_NODENUM( state ) ].wordnum ) {
1583 PerlIO_printf( Perl_debug_log, " (%4"UVXf")\n", (UV)trie->trans[ state ].check );
1584 } else {
1585 PerlIO_printf( Perl_debug_log, " (%4"UVXf") W%4X\n", (UV)trie->trans[ state ].check,
1586 trie->states[ TRIE_NODENUM( state ) ].wordnum );
1587 }
1588 }
07be1b83 1589}
3dab1dad
YO
1590
1591#endif
1592
2e64971a 1593
786e8c11
YO
1594/* make_trie(startbranch,first,last,tail,word_count,flags,depth)
1595 startbranch: the first branch in the whole branch sequence
1596 first : start branch of sequence of branch-exact nodes.
1597 May be the same as startbranch
1598 last : Thing following the last branch.
1599 May be the same as tail.
1600 tail : item following the branch sequence
1601 count : words in the sequence
1602 flags : currently the OP() type we will be building one of /EXACT(|F|Fl)/
1603 depth : indent depth
3dab1dad 1604
786e8c11 1605Inplace optimizes a sequence of 2 or more Branch-Exact nodes into a TRIE node.
07be1b83 1606
786e8c11
YO
1607A trie is an N'ary tree where the branches are determined by digital
1608decomposition of the key. IE, at the root node you look up the 1st character and
1609follow that branch repeat until you find the end of the branches. Nodes can be
1610marked as "accepting" meaning they represent a complete word. Eg:
07be1b83 1611
786e8c11 1612 /he|she|his|hers/
72f13be8 1613
786e8c11
YO
1614would convert into the following structure. Numbers represent states, letters
1615following numbers represent valid transitions on the letter from that state, if
1616the number is in square brackets it represents an accepting state, otherwise it
1617will be in parenthesis.
07be1b83 1618
786e8c11
YO
1619 +-h->+-e->[3]-+-r->(8)-+-s->[9]
1620 | |
1621 | (2)
1622 | |
1623 (1) +-i->(6)-+-s->[7]
1624 |
1625 +-s->(3)-+-h->(4)-+-e->[5]
07be1b83 1626
786e8c11
YO
1627 Accept Word Mapping: 3=>1 (he),5=>2 (she), 7=>3 (his), 9=>4 (hers)
1628
1629This shows that when matching against the string 'hers' we will begin at state 1
1630read 'h' and move to state 2, read 'e' and move to state 3 which is accepting,
1631then read 'r' and go to state 8 followed by 's' which takes us to state 9 which
1632is also accepting. Thus we know that we can match both 'he' and 'hers' with a
1633single traverse. We store a mapping from accepting to state to which word was
1634matched, and then when we have multiple possibilities we try to complete the
1635rest of the regex in the order in which they occured in the alternation.
1636
1637The only prior NFA like behaviour that would be changed by the TRIE support is
1638the silent ignoring of duplicate alternations which are of the form:
1639
1640 / (DUPE|DUPE) X? (?{ ... }) Y /x
1641
4b714af6 1642Thus EVAL blocks following a trie may be called a different number of times with
786e8c11 1643and without the optimisation. With the optimisations dupes will be silently
486ec47a 1644ignored. This inconsistent behaviour of EVAL type nodes is well established as
786e8c11
YO
1645the following demonstrates:
1646
1647 'words'=~/(word|word|word)(?{ print $1 })[xyz]/
1648
1649which prints out 'word' three times, but
1650
1651 'words'=~/(word|word|word)(?{ print $1 })S/
1652
1653which doesnt print it out at all. This is due to other optimisations kicking in.
1654
1655Example of what happens on a structural level:
1656
486ec47a 1657The regexp /(ac|ad|ab)+/ will produce the following debug output:
786e8c11
YO
1658
1659 1: CURLYM[1] {1,32767}(18)
1660 5: BRANCH(8)
1661 6: EXACT <ac>(16)
1662 8: BRANCH(11)
1663 9: EXACT <ad>(16)
1664 11: BRANCH(14)
1665 12: EXACT <ab>(16)
1666 16: SUCCEED(0)
1667 17: NOTHING(18)
1668 18: END(0)
1669
1670This would be optimizable with startbranch=5, first=5, last=16, tail=16
1671and should turn into:
1672
1673 1: CURLYM[1] {1,32767}(18)
1674 5: TRIE(16)
1675 [Words:3 Chars Stored:6 Unique Chars:4 States:5 NCP:1]
1676 <ac>
1677 <ad>
1678 <ab>
1679 16: SUCCEED(0)
1680 17: NOTHING(18)
1681 18: END(0)
1682
1683Cases where tail != last would be like /(?foo|bar)baz/:
1684
1685 1: BRANCH(4)
1686 2: EXACT <foo>(8)
1687 4: BRANCH(7)
1688 5: EXACT <bar>(8)
1689 7: TAIL(8)
1690 8: EXACT <baz>(10)
1691 10: END(0)
1692
1693which would be optimizable with startbranch=1, first=1, last=7, tail=8
1694and would end up looking like:
1695
1696 1: TRIE(8)
1697 [Words:2 Chars Stored:6 Unique Chars:5 States:7 NCP:1]
1698 <foo>
1699 <bar>
1700 7: TAIL(8)
1701 8: EXACT <baz>(10)
1702 10: END(0)
1703
c80e42f3 1704 d = uvchr_to_utf8_flags(d, uv, 0);
786e8c11
YO
1705
1706is the recommended Unicode-aware way of saying
1707
1708 *(d++) = uv;
1709*/
1710
fab2782b 1711#define TRIE_STORE_REVCHAR(val) \
786e8c11 1712 STMT_START { \
73031816 1713 if (UTF) { \
fab2782b 1714 SV *zlopp = newSV(7); /* XXX: optimize me */ \
88c9ea1e 1715 unsigned char *flrbbbbb = (unsigned char *) SvPVX(zlopp); \
c80e42f3 1716 unsigned const char *const kapow = uvchr_to_utf8(flrbbbbb, val); \
73031816
NC
1717 SvCUR_set(zlopp, kapow - flrbbbbb); \
1718 SvPOK_on(zlopp); \
1719 SvUTF8_on(zlopp); \
1720 av_push(revcharmap, zlopp); \
1721 } else { \
fab2782b 1722 char ooooff = (char)val; \
73031816
NC
1723 av_push(revcharmap, newSVpvn(&ooooff, 1)); \
1724 } \
1725 } STMT_END
786e8c11 1726
914a25d5
KW
1727/* This gets the next character from the input, folding it if not already
1728 * folded. */
1729#define TRIE_READ_CHAR STMT_START { \
1730 wordlen++; \
1731 if ( UTF ) { \
1732 /* if it is UTF then it is either already folded, or does not need \
1733 * folding */ \
1c1d615a 1734 uvc = valid_utf8_to_uvchr( (const U8*) uc, &len); \
914a25d5
KW
1735 } \
1736 else if (folder == PL_fold_latin1) { \
7d006b13
KW
1737 /* This folder implies Unicode rules, which in the range expressible \
1738 * by not UTF is the lower case, with the two exceptions, one of \
1739 * which should have been taken care of before calling this */ \
1740 assert(*uc != LATIN_SMALL_LETTER_SHARP_S); \
1741 uvc = toLOWER_L1(*uc); \
1742 if (UNLIKELY(uvc == MICRO_SIGN)) uvc = GREEK_SMALL_LETTER_MU; \
1743 len = 1; \
914a25d5
KW
1744 } else { \
1745 /* raw data, will be folded later if needed */ \
1746 uvc = (U32)*uc; \
1747 len = 1; \
1748 } \
786e8c11
YO
1749} STMT_END
1750
1751
1752
1753#define TRIE_LIST_PUSH(state,fid,ns) STMT_START { \
1754 if ( TRIE_LIST_CUR( state ) >=TRIE_LIST_LEN( state ) ) { \
f9003953
NC
1755 U32 ging = TRIE_LIST_LEN( state ) *= 2; \
1756 Renew( trie->states[ state ].trans.list, ging, reg_trie_trans_le ); \
786e8c11
YO
1757 } \
1758 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).forid = fid; \
1759 TRIE_LIST_ITEM( state, TRIE_LIST_CUR( state ) ).newstate = ns; \
1760 TRIE_LIST_CUR( state )++; \
1761} STMT_END
07be1b83 1762
786e8c11
YO
1763#define TRIE_LIST_NEW(state) STMT_START { \
1764 Newxz( trie->states[ state ].trans.list, \
1765 4, reg_trie_trans_le ); \
1766 TRIE_LIST_CUR( state ) = 1; \
1767 TRIE_LIST_LEN( state ) = 4; \
1768} STMT_END
07be1b83 1769
786e8c11
YO
1770#define TRIE_HANDLE_WORD(state) STMT_START { \
1771 U16 dupe= trie->states[ state ].wordnum; \
1772 regnode * const noper_next = regnext( noper ); \
1773 \
786e8c11
YO
1774 DEBUG_r({ \
1775 /* store the word for dumping */ \
1776 SV* tmp; \
1777 if (OP(noper) != NOTHING) \
740cce10 1778 tmp = newSVpvn_utf8(STRING(noper), STR_LEN(noper), UTF); \
786e8c11 1779 else \
740cce10 1780 tmp = newSVpvn_utf8( "", 0, UTF ); \
2b8b4781 1781 av_push( trie_words, tmp ); \
786e8c11
YO
1782 }); \
1783 \
1784 curword++; \
2e64971a
DM
1785 trie->wordinfo[curword].prev = 0; \
1786 trie->wordinfo[curword].len = wordlen; \
1787 trie->wordinfo[curword].accept = state; \
786e8c11
YO
1788 \
1789 if ( noper_next < tail ) { \
1790 if (!trie->jump) \
c944940b 1791 trie->jump = (U16 *) PerlMemShared_calloc( word_count + 1, sizeof(U16) ); \
7f69552c 1792 trie->jump[curword] = (U16)(noper_next - convert); \
786e8c11
YO
1793 if (!jumper) \
1794 jumper = noper_next; \
1795 if (!nextbranch) \
1796 nextbranch= regnext(cur); \
1797 } \
1798 \
1799 if ( dupe ) { \
2e64971a
DM
1800 /* It's a dupe. Pre-insert into the wordinfo[].prev */\
1801 /* chain, so that when the bits of chain are later */\
1802 /* linked together, the dups appear in the chain */\
1803 trie->wordinfo[curword].prev = trie->wordinfo[dupe].prev; \
1804 trie->wordinfo[dupe].prev = curword; \
786e8c11
YO
1805 } else { \
1806 /* we haven't inserted this word yet. */ \
1807 trie->states[ state ].wordnum = curword; \
1808 } \
1809} STMT_END
07be1b83 1810
3dab1dad 1811
786e8c11
YO
1812#define TRIE_TRANS_STATE(state,base,ucharcount,charid,special) \
1813 ( ( base + charid >= ucharcount \
1814 && base + charid < ubound \
1815 && state == trie->trans[ base - ucharcount + charid ].check \
1816 && trie->trans[ base - ucharcount + charid ].next ) \
1817 ? trie->trans[ base - ucharcount + charid ].next \
1818 : ( state==1 ? special : 0 ) \
1819 )
3dab1dad 1820
786e8c11
YO
1821#define MADE_TRIE 1
1822#define MADE_JUMP_TRIE 2
1823#define MADE_EXACT_TRIE 4
3dab1dad 1824
a3621e74 1825STATIC I32
786e8c11 1826S_make_trie(pTHX_ RExC_state_t *pRExC_state, regnode *startbranch, regnode *first, regnode *last, regnode *tail, U32 word_count, U32 flags, U32 depth)
a3621e74 1827{
27da23d5 1828 dVAR;
a3621e74
YO
1829 /* first pass, loop through and scan words */
1830 reg_trie_data *trie;
55eed653 1831 HV *widecharmap = NULL;
2b8b4781 1832 AV *revcharmap = newAV();
a3621e74 1833 regnode *cur;
a3621e74
YO
1834 STRLEN len = 0;
1835 UV uvc = 0;
1836 U16 curword = 0;
1837 U32 next_alloc = 0;
786e8c11
YO
1838 regnode *jumper = NULL;
1839 regnode *nextbranch = NULL;
7f69552c 1840 regnode *convert = NULL;
2e64971a 1841 U32 *prev_states; /* temp array mapping each state to previous one */
a3621e74 1842 /* we just use folder as a flag in utf8 */
1e696034 1843 const U8 * folder = NULL;
a3621e74 1844
2b8b4781 1845#ifdef DEBUGGING
cf78de0b 1846 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tuuu"));
2b8b4781
NC
1847 AV *trie_words = NULL;
1848 /* along with revcharmap, this only used during construction but both are
1849 * useful during debugging so we store them in the struct when debugging.
8e11feef 1850 */
2b8b4781 1851#else
cf78de0b 1852 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("tu"));
3dab1dad 1853 STRLEN trie_charcount=0;
3dab1dad 1854#endif
2b8b4781 1855 SV *re_trie_maxbuff;
a3621e74 1856 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
1857
1858 PERL_ARGS_ASSERT_MAKE_TRIE;
72f13be8
YO
1859#ifndef DEBUGGING
1860 PERL_UNUSED_ARG(depth);
1861#endif
a3621e74 1862
1e696034 1863 switch (flags) {
79a81a6e 1864 case EXACT: break;
2f7f8cb1 1865 case EXACTFA:
fab2782b 1866 case EXACTFU_SS:
1e696034
KW
1867 case EXACTFU: folder = PL_fold_latin1; break;
1868 case EXACTF: folder = PL_fold; break;
1869 case EXACTFL: folder = PL_fold_locale; break;
fab2782b 1870 default: Perl_croak( aTHX_ "panic! In trie construction, unknown node type %u %s", (unsigned) flags, PL_reg_name[flags] );
1e696034
KW
1871 }
1872
c944940b 1873 trie = (reg_trie_data *) PerlMemShared_calloc( 1, sizeof(reg_trie_data) );
a3621e74 1874 trie->refcount = 1;
3dab1dad 1875 trie->startstate = 1;
786e8c11 1876 trie->wordcount = word_count;
f8fc2ecf 1877 RExC_rxi->data->data[ data_slot ] = (void*)trie;
c944940b 1878 trie->charmap = (U16 *) PerlMemShared_calloc( 256, sizeof(U16) );
fab2782b 1879 if (flags == EXACT)
c944940b 1880 trie->bitmap = (char *) PerlMemShared_calloc( ANYOF_BITMAP_SIZE, 1 );
2e64971a
DM
1881 trie->wordinfo = (reg_trie_wordinfo *) PerlMemShared_calloc(
1882 trie->wordcount+1, sizeof(reg_trie_wordinfo));
1883
a3621e74 1884 DEBUG_r({
2b8b4781 1885 trie_words = newAV();
a3621e74 1886 });
a3621e74 1887
0111c4fd 1888 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
a3621e74 1889 if (!SvIOK(re_trie_maxbuff)) {
0111c4fd 1890 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
a3621e74 1891 }
df826430 1892 DEBUG_TRIE_COMPILE_r({
3dab1dad 1893 PerlIO_printf( Perl_debug_log,
786e8c11 1894 "%*smake_trie start==%d, first==%d, last==%d, tail==%d depth=%d\n",
3dab1dad
YO
1895 (int)depth * 2 + 2, "",
1896 REG_NODE_NUM(startbranch),REG_NODE_NUM(first),
786e8c11 1897 REG_NODE_NUM(last), REG_NODE_NUM(tail),
85c3142d 1898 (int)depth);
3dab1dad 1899 });
7f69552c
YO
1900
1901 /* Find the node we are going to overwrite */
1902 if ( first == startbranch && OP( last ) != BRANCH ) {
1903 /* whole branch chain */
1904 convert = first;
1905 } else {
1906 /* branch sub-chain */
1907 convert = NEXTOPER( first );
1908 }
1909
a3621e74
YO
1910 /* -- First loop and Setup --
1911
1912 We first traverse the branches and scan each word to determine if it
1913 contains widechars, and how many unique chars there are, this is
1914 important as we have to build a table with at least as many columns as we
1915 have unique chars.
1916
1917 We use an array of integers to represent the character codes 0..255
38a44b82 1918 (trie->charmap) and we use a an HV* to store Unicode characters. We use the
a3621e74
YO
1919 native representation of the character value as the key and IV's for the
1920 coded index.
1921
1922 *TODO* If we keep track of how many times each character is used we can
1923 remap the columns so that the table compression later on is more
3b753521 1924 efficient in terms of memory by ensuring the most common value is in the
a3621e74
YO
1925 middle and the least common are on the outside. IMO this would be better
1926 than a most to least common mapping as theres a decent chance the most
1927 common letter will share a node with the least common, meaning the node
486ec47a 1928 will not be compressible. With a middle is most common approach the worst
a3621e74
YO
1929 case is when we have the least common nodes twice.
1930
1931 */
1932
a3621e74 1933 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
df826430 1934 regnode *noper = NEXTOPER( cur );
e1ec3a88 1935 const U8 *uc = (U8*)STRING( noper );
df826430 1936 const U8 *e = uc + STR_LEN( noper );
a3621e74 1937 STRLEN foldlen = 0;
07be1b83 1938 U32 wordlen = 0; /* required init */
afa96d92
KW
1939 STRLEN minbytes = 0;
1940 STRLEN maxbytes = 0;
02daf0ab 1941 bool set_bit = trie->bitmap ? 1 : 0; /*store the first char in the bitmap?*/
a3621e74 1942
3dab1dad 1943 if (OP(noper) == NOTHING) {
df826430
YO
1944 regnode *noper_next= regnext(noper);
1945 if (noper_next != tail && OP(noper_next) == flags) {
1946 noper = noper_next;
1947 uc= (U8*)STRING(noper);
1948 e= uc + STR_LEN(noper);
1949 trie->minlen= STR_LEN(noper);
1950 } else {
1951 trie->minlen= 0;
1952 continue;
1953 }
3dab1dad 1954 }
df826430 1955
fab2782b 1956 if ( set_bit ) { /* bitmap only alloced when !(UTF&&Folding) */
02daf0ab
YO
1957 TRIE_BITMAP_SET(trie,*uc); /* store the raw first byte
1958 regardless of encoding */
fab2782b
YO
1959 if (OP( noper ) == EXACTFU_SS) {
1960 /* false positives are ok, so just set this */
0dc4a61d 1961 TRIE_BITMAP_SET(trie, LATIN_SMALL_LETTER_SHARP_S);
fab2782b
YO
1962 }
1963 }
a3621e74 1964 for ( ; uc < e ; uc += len ) {
3dab1dad 1965 TRIE_CHARCOUNT(trie)++;
a3621e74 1966 TRIE_READ_CHAR;
645de4ce
KW
1967
1968 /* Acummulate to the current values, the range in the number of
1969 * bytes that this character could match. The max is presumed to
1970 * be the same as the folded input (which TRIE_READ_CHAR returns),
1971 * except that when this is not in UTF-8, it could be matched
1972 * against a string which is UTF-8, and the variant characters
1973 * could be 2 bytes instead of the 1 here. Likewise, for the
1974 * minimum number of bytes when not folded. When folding, the min
1975 * is assumed to be 1 byte could fold to match the single character
1976 * here, or in the case of a multi-char fold, 1 byte can fold to
1977 * the whole sequence. 'foldlen' is used to denote whether we are
1978 * in such a sequence, skipping the min setting if so. XXX TODO
1979 * Use the exact list of what folds to each character, from
1980 * PL_utf8_foldclosures */
1981 if (UTF) {
1982 maxbytes += UTF8SKIP(uc);
1983 if (! folder) {
1984 /* A non-UTF-8 string could be 1 byte to match our 2 */
1985 minbytes += (UTF8_IS_DOWNGRADEABLE_START(*uc))
1986 ? 1
1987 : UTF8SKIP(uc);
1988 }
1989 else {
1990 if (foldlen) {
1991 foldlen -= UTF8SKIP(uc);
1992 }
1993 else {
1994 foldlen = is_MULTI_CHAR_FOLD_utf8_safe(uc, e);
1995 minbytes++;
1996 }
1997 }
1998 }
1999 else {
2000 maxbytes += (UNI_IS_INVARIANT(*uc))
2001 ? 1
2002 : 2;
2003 if (! folder) {
2004 minbytes++;
2005 }
2006 else {
2007 if (foldlen) {
2008 foldlen--;
2009 }
2010 else {
2011 foldlen = is_MULTI_CHAR_FOLD_latin1_safe(uc, e);
2012 minbytes++;
2013 }
2014 }
2015 }
a3621e74 2016 if ( uvc < 256 ) {
fab2782b
YO
2017 if ( folder ) {
2018 U8 folded= folder[ (U8) uvc ];
2019 if ( !trie->charmap[ folded ] ) {
2020 trie->charmap[ folded ]=( ++trie->uniquecharcount );
2021 TRIE_STORE_REVCHAR( folded );
2022 }
2023 }
a3621e74
YO
2024 if ( !trie->charmap[ uvc ] ) {
2025 trie->charmap[ uvc ]=( ++trie->uniquecharcount );
fab2782b 2026 TRIE_STORE_REVCHAR( uvc );
a3621e74 2027 }
02daf0ab 2028 if ( set_bit ) {
62012aee
KW
2029 /* store the codepoint in the bitmap, and its folded
2030 * equivalent. */
fab2782b 2031 TRIE_BITMAP_SET(trie, uvc);
0921ee73
T
2032
2033 /* store the folded codepoint */
fab2782b 2034 if ( folder ) TRIE_BITMAP_SET(trie, folder[(U8) uvc ]);
0921ee73
T
2035
2036 if ( !UTF ) {
2037 /* store first byte of utf8 representation of
acdf4139 2038 variant codepoints */
6f2d5cbc 2039 if (! UVCHR_IS_INVARIANT(uvc)) {
acdf4139 2040 TRIE_BITMAP_SET(trie, UTF8_TWO_BYTE_HI(uvc));
0921ee73
T
2041 }
2042 }
02daf0ab
YO
2043 set_bit = 0; /* We've done our bit :-) */
2044 }
a3621e74
YO
2045 } else {
2046 SV** svpp;
55eed653
NC
2047 if ( !widecharmap )
2048 widecharmap = newHV();
a3621e74 2049
55eed653 2050 svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 1 );
a3621e74
YO
2051
2052 if ( !svpp )
e4584336 2053 Perl_croak( aTHX_ "error creating/fetching widecharmap entry for 0x%"UVXf, uvc );
a3621e74
YO
2054
2055 if ( !SvTRUE( *svpp ) ) {
2056 sv_setiv( *svpp, ++trie->uniquecharcount );
fab2782b 2057 TRIE_STORE_REVCHAR(uvc);
a3621e74
YO
2058 }
2059 }
2060 }
3dab1dad 2061 if( cur == first ) {
afa96d92
KW
2062 trie->minlen = minbytes;
2063 trie->maxlen = maxbytes;
2064 } else if (minbytes < trie->minlen) {
2065 trie->minlen = minbytes;
2066 } else if (maxbytes > trie->maxlen) {
2067 trie->maxlen = maxbytes;
fab2782b 2068 }
a3621e74
YO
2069 } /* end first pass */
2070 DEBUG_TRIE_COMPILE_r(
3dab1dad
YO
2071 PerlIO_printf( Perl_debug_log, "%*sTRIE(%s): W:%d C:%d Uq:%d Min:%d Max:%d\n",
2072 (int)depth * 2 + 2,"",
55eed653 2073 ( widecharmap ? "UTF8" : "NATIVE" ), (int)word_count,
be8e71aa
YO
2074 (int)TRIE_CHARCOUNT(trie), trie->uniquecharcount,
2075 (int)trie->minlen, (int)trie->maxlen )
a3621e74 2076 );
a3621e74
YO
2077
2078 /*
2079 We now know what we are dealing with in terms of unique chars and
2080 string sizes so we can calculate how much memory a naive
0111c4fd
RGS
2081 representation using a flat table will take. If it's over a reasonable
2082 limit (as specified by ${^RE_TRIE_MAXBUF}) we use a more memory
a3621e74
YO
2083 conservative but potentially much slower representation using an array
2084 of lists.
2085
2086 At the end we convert both representations into the same compressed
2087 form that will be used in regexec.c for matching with. The latter
2088 is a form that cannot be used to construct with but has memory
2089 properties similar to the list form and access properties similar
2090 to the table form making it both suitable for fast searches and
2091 small enough that its feasable to store for the duration of a program.
2092
2093 See the comment in the code where the compressed table is produced
2094 inplace from the flat tabe representation for an explanation of how
2095 the compression works.
2096
2097 */
2098
2099
2e64971a
DM
2100 Newx(prev_states, TRIE_CHARCOUNT(trie) + 2, U32);
2101 prev_states[1] = 0;
2102
3dab1dad 2103 if ( (IV)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1) > SvIV(re_trie_maxbuff) ) {
a3621e74
YO
2104 /*
2105 Second Pass -- Array Of Lists Representation
2106
2107 Each state will be represented by a list of charid:state records
2108 (reg_trie_trans_le) the first such element holds the CUR and LEN
2109 points of the allocated array. (See defines above).
2110
2111 We build the initial structure using the lists, and then convert
2112 it into the compressed table form which allows faster lookups
2113 (but cant be modified once converted).
a3621e74
YO
2114 */
2115
a3621e74
YO
2116 STRLEN transcount = 1;
2117
1e2e3d02
YO
2118 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2119 "%*sCompiling trie using list compiler\n",
2120 (int)depth * 2 + 2, ""));
686b73d4 2121
c944940b
JH
2122 trie->states = (reg_trie_state *)
2123 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2124 sizeof(reg_trie_state) );
a3621e74
YO
2125 TRIE_LIST_NEW(1);
2126 next_alloc = 2;
2127
2128 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2129
df826430 2130 regnode *noper = NEXTOPER( cur );
c445ea15 2131 U8 *uc = (U8*)STRING( noper );
df826430 2132 const U8 *e = uc + STR_LEN( noper );
c445ea15
AL
2133 U32 state = 1; /* required init */
2134 U16 charid = 0; /* sanity init */
07be1b83 2135 U32 wordlen = 0; /* required init */
c445ea15 2136
df826430
YO
2137 if (OP(noper) == NOTHING) {
2138 regnode *noper_next= regnext(noper);
2139 if (noper_next != tail && OP(noper_next) == flags) {
2140 noper = noper_next;
2141 uc= (U8*)STRING(noper);
2142 e= uc + STR_LEN(noper);
2143 }
2144 }
2145
3dab1dad 2146 if (OP(noper) != NOTHING) {
786e8c11 2147 for ( ; uc < e ; uc += len ) {
c445ea15 2148
786e8c11 2149 TRIE_READ_CHAR;
c445ea15 2150
786e8c11
YO
2151 if ( uvc < 256 ) {
2152 charid = trie->charmap[ uvc ];
c445ea15 2153 } else {
55eed653 2154 SV** const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11
YO
2155 if ( !svpp ) {
2156 charid = 0;
2157 } else {
2158 charid=(U16)SvIV( *svpp );
2159 }
c445ea15 2160 }
786e8c11
YO
2161 /* charid is now 0 if we dont know the char read, or nonzero if we do */
2162 if ( charid ) {
a3621e74 2163
786e8c11
YO
2164 U16 check;
2165 U32 newstate = 0;
a3621e74 2166
786e8c11
YO
2167 charid--;
2168 if ( !trie->states[ state ].trans.list ) {
2169 TRIE_LIST_NEW( state );
c445ea15 2170 }
786e8c11
YO
2171 for ( check = 1; check <= TRIE_LIST_USED( state ); check++ ) {
2172 if ( TRIE_LIST_ITEM( state, check ).forid == charid ) {
2173 newstate = TRIE_LIST_ITEM( state, check ).newstate;
2174 break;
2175 }
2176 }
2177 if ( ! newstate ) {
2178 newstate = next_alloc++;
2e64971a 2179 prev_states[newstate] = state;
786e8c11
YO
2180 TRIE_LIST_PUSH( state, charid, newstate );
2181 transcount++;
2182 }
2183 state = newstate;
2184 } else {
2185 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
c445ea15 2186 }
a28509cc 2187 }
c445ea15 2188 }
3dab1dad 2189 TRIE_HANDLE_WORD(state);
a3621e74
YO
2190
2191 } /* end second pass */
2192
1e2e3d02
YO
2193 /* next alloc is the NEXT state to be allocated */
2194 trie->statecount = next_alloc;
c944940b
JH
2195 trie->states = (reg_trie_state *)
2196 PerlMemShared_realloc( trie->states,
2197 next_alloc
2198 * sizeof(reg_trie_state) );
a3621e74 2199
3dab1dad 2200 /* and now dump it out before we compress it */
2b8b4781
NC
2201 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_list(trie, widecharmap,
2202 revcharmap, next_alloc,
2203 depth+1)
1e2e3d02 2204 );
a3621e74 2205
c944940b
JH
2206 trie->trans = (reg_trie_trans *)
2207 PerlMemShared_calloc( transcount, sizeof(reg_trie_trans) );
a3621e74
YO
2208 {
2209 U32 state;
a3621e74
YO
2210 U32 tp = 0;
2211 U32 zp = 0;
2212
2213
2214 for( state=1 ; state < next_alloc ; state ++ ) {
2215 U32 base=0;
2216
2217 /*
2218 DEBUG_TRIE_COMPILE_MORE_r(
2219 PerlIO_printf( Perl_debug_log, "tp: %d zp: %d ",tp,zp)
2220 );
2221 */
2222
2223 if (trie->states[state].trans.list) {
2224 U16 minid=TRIE_LIST_ITEM( state, 1).forid;
2225 U16 maxid=minid;
a28509cc 2226 U16 idx;
a3621e74
YO
2227
2228 for( idx = 2 ; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15
AL
2229 const U16 forid = TRIE_LIST_ITEM( state, idx).forid;
2230 if ( forid < minid ) {
2231 minid=forid;
2232 } else if ( forid > maxid ) {
2233 maxid=forid;
2234 }
a3621e74
YO
2235 }
2236 if ( transcount < tp + maxid - minid + 1) {
2237 transcount *= 2;
c944940b
JH
2238 trie->trans = (reg_trie_trans *)
2239 PerlMemShared_realloc( trie->trans,
446bd890
NC
2240 transcount
2241 * sizeof(reg_trie_trans) );
a3621e74
YO
2242 Zero( trie->trans + (transcount / 2), transcount / 2 , reg_trie_trans );
2243 }
2244 base = trie->uniquecharcount + tp - minid;
2245 if ( maxid == minid ) {
2246 U32 set = 0;
2247 for ( ; zp < tp ; zp++ ) {
2248 if ( ! trie->trans[ zp ].next ) {
2249 base = trie->uniquecharcount + zp - minid;
2250 trie->trans[ zp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2251 trie->trans[ zp ].check = state;
2252 set = 1;
2253 break;
2254 }
2255 }
2256 if ( !set ) {
2257 trie->trans[ tp ].next = TRIE_LIST_ITEM( state, 1).newstate;
2258 trie->trans[ tp ].check = state;
2259 tp++;
2260 zp = tp;
2261 }
2262 } else {
2263 for ( idx=1; idx <= TRIE_LIST_USED( state ) ; idx++ ) {
c445ea15 2264 const U32 tid = base - trie->uniquecharcount + TRIE_LIST_ITEM( state, idx ).forid;
a3621e74
YO
2265 trie->trans[ tid ].next = TRIE_LIST_ITEM( state, idx ).newstate;
2266 trie->trans[ tid ].check = state;
2267 }
2268 tp += ( maxid - minid + 1 );
2269 }
2270 Safefree(trie->states[ state ].trans.list);
2271 }
2272 /*
2273 DEBUG_TRIE_COMPILE_MORE_r(
2274 PerlIO_printf( Perl_debug_log, " base: %d\n",base);
2275 );
2276 */
2277 trie->states[ state ].trans.base=base;
2278 }
cc601c31 2279 trie->lasttrans = tp + 1;
a3621e74
YO
2280 }
2281 } else {
2282 /*
2283 Second Pass -- Flat Table Representation.
2284
2285 we dont use the 0 slot of either trans[] or states[] so we add 1 to each.
2286 We know that we will need Charcount+1 trans at most to store the data
2287 (one row per char at worst case) So we preallocate both structures
2288 assuming worst case.
2289
2290 We then construct the trie using only the .next slots of the entry
2291 structs.
2292
3b753521 2293 We use the .check field of the first entry of the node temporarily to
a3621e74
YO
2294 make compression both faster and easier by keeping track of how many non
2295 zero fields are in the node.
2296
2297 Since trans are numbered from 1 any 0 pointer in the table is a FAIL
2298 transition.
2299
2300 There are two terms at use here: state as a TRIE_NODEIDX() which is a
2301 number representing the first entry of the node, and state as a
2302 TRIE_NODENUM() which is the trans number. state 1 is TRIE_NODEIDX(1) and
2303 TRIE_NODENUM(1), state 2 is TRIE_NODEIDX(2) and TRIE_NODENUM(3) if there
2304 are 2 entrys per node. eg:
2305
2306 A B A B
2307 1. 2 4 1. 3 7
2308 2. 0 3 3. 0 5
2309 3. 0 0 5. 0 0
2310 4. 0 0 7. 0 0
2311
2312 The table is internally in the right hand, idx form. However as we also
2313 have to deal with the states array which is indexed by nodenum we have to
2314 use TRIE_NODENUM() to convert.
2315
2316 */
1e2e3d02
YO
2317 DEBUG_TRIE_COMPILE_MORE_r( PerlIO_printf( Perl_debug_log,
2318 "%*sCompiling trie using table compiler\n",
2319 (int)depth * 2 + 2, ""));
3dab1dad 2320
c944940b
JH
2321 trie->trans = (reg_trie_trans *)
2322 PerlMemShared_calloc( ( TRIE_CHARCOUNT(trie) + 1 )
2323 * trie->uniquecharcount + 1,
2324 sizeof(reg_trie_trans) );
2325 trie->states = (reg_trie_state *)
2326 PerlMemShared_calloc( TRIE_CHARCOUNT(trie) + 2,
2327 sizeof(reg_trie_state) );
a3621e74
YO
2328 next_alloc = trie->uniquecharcount + 1;
2329
3dab1dad 2330
a3621e74
YO
2331 for ( cur = first ; cur < last ; cur = regnext( cur ) ) {
2332
df826430 2333 regnode *noper = NEXTOPER( cur );
a28509cc 2334 const U8 *uc = (U8*)STRING( noper );
df826430 2335 const U8 *e = uc + STR_LEN( noper );
a3621e74
YO
2336
2337 U32 state = 1; /* required init */
2338
2339 U16 charid = 0; /* sanity init */
2340 U32 accept_state = 0; /* sanity init */
a3621e74 2341
07be1b83 2342 U32 wordlen = 0; /* required init */
a3621e74 2343
df826430
YO
2344 if (OP(noper) == NOTHING) {
2345 regnode *noper_next= regnext(noper);
2346 if (noper_next != tail && OP(noper_next) == flags) {
2347 noper = noper_next;
2348 uc= (U8*)STRING(noper);
2349 e= uc + STR_LEN(noper);
2350 }
2351 }
fab2782b 2352
3dab1dad 2353 if ( OP(noper) != NOTHING ) {
786e8c11 2354 for ( ; uc < e ; uc += len ) {
a3621e74 2355
786e8c11 2356 TRIE_READ_CHAR;
a3621e74 2357
786e8c11
YO
2358 if ( uvc < 256 ) {
2359 charid = trie->charmap[ uvc ];
2360 } else {
55eed653 2361 SV* const * const svpp = hv_fetch( widecharmap, (char*)&uvc, sizeof( UV ), 0);
786e8c11 2362 charid = svpp ? (U16)SvIV(*svpp) : 0;
a3621e74 2363 }
786e8c11
YO
2364 if ( charid ) {
2365 charid--;
2366 if ( !trie->trans[ state + charid ].next ) {
2367 trie->trans[ state + charid ].next = next_alloc;
2368 trie->trans[ state ].check++;
2e64971a
DM
2369 prev_states[TRIE_NODENUM(next_alloc)]
2370 = TRIE_NODENUM(state);
786e8c11
YO
2371 next_alloc += trie->uniquecharcount;
2372 }
2373 state = trie->trans[ state + charid ].next;
2374 } else {
2375 Perl_croak( aTHX_ "panic! In trie construction, no char mapping for %"IVdf, uvc );
2376 }
2377 /* charid is now 0 if we dont know the char read, or nonzero if we do */
a3621e74 2378 }
a3621e74 2379 }
3dab1dad
YO
2380 accept_state = TRIE_NODENUM( state );
2381 TRIE_HANDLE_WORD(accept_state);
a3621e74
YO
2382
2383 } /* end second pass */
2384
3dab1dad 2385 /* and now dump it out before we compress it */
2b8b4781
NC
2386 DEBUG_TRIE_COMPILE_MORE_r(dump_trie_interim_table(trie, widecharmap,
2387 revcharmap,
2388 next_alloc, depth+1));
a3621e74 2389
a3621e74
YO
2390 {
2391 /*
2392 * Inplace compress the table.*
2393
2394 For sparse data sets the table constructed by the trie algorithm will
2395 be mostly 0/FAIL transitions or to put it another way mostly empty.
2396 (Note that leaf nodes will not contain any transitions.)
2397
2398 This algorithm compresses the tables by eliminating most such
2399 transitions, at the cost of a modest bit of extra work during lookup:
2400
2401 - Each states[] entry contains a .base field which indicates the
2402 index in the state[] array wheres its transition data is stored.
2403
3b753521 2404 - If .base is 0 there are no valid transitions from that node.
a3621e74
YO
2405
2406 - If .base is nonzero then charid is added to it to find an entry in
2407 the trans array.
2408
2409 -If trans[states[state].base+charid].check!=state then the
2410 transition is taken to be a 0/Fail transition. Thus if there are fail
2411 transitions at the front of the node then the .base offset will point
2412 somewhere inside the previous nodes data (or maybe even into a node
2413 even earlier), but the .check field determines if the transition is
2414 valid.
2415
786e8c11 2416 XXX - wrong maybe?
a3621e74 2417 The following process inplace converts the table to the compressed
3b753521 2418 table: We first do not compress the root node 1,and mark all its
a3621e74 2419 .check pointers as 1 and set its .base pointer as 1 as well. This
3b753521
FN
2420 allows us to do a DFA construction from the compressed table later,
2421 and ensures that any .base pointers we calculate later are greater
2422 than 0.
a3621e74
YO
2423
2424 - We set 'pos' to indicate the first entry of the second node.
2425
2426 - We then iterate over the columns of the node, finding the first and
2427 last used entry at l and m. We then copy l..m into pos..(pos+m-l),
2428 and set the .check pointers accordingly, and advance pos
2429 appropriately and repreat for the next node. Note that when we copy
2430 the next pointers we have to convert them from the original
2431 NODEIDX form to NODENUM form as the former is not valid post
2432 compression.
2433
2434 - If a node has no transitions used we mark its base as 0 and do not
2435 advance the pos pointer.
2436
2437 - If a node only has one transition we use a second pointer into the
2438 structure to fill in allocated fail transitions from other states.
2439 This pointer is independent of the main pointer and scans forward
2440 looking for null transitions that are allocated to a state. When it
2441 finds one it writes the single transition into the "hole". If the
786e8c11 2442 pointer doesnt find one the single transition is appended as normal.
a3621e74
YO
2443
2444 - Once compressed we can Renew/realloc the structures to release the
2445 excess space.
2446
2447 See "Table-Compression Methods" in sec 3.9 of the Red Dragon,
2448 specifically Fig 3.47 and the associated pseudocode.
2449
2450 demq
2451 */
a3b680e6 2452 const U32 laststate = TRIE_NODENUM( next_alloc );
a28509cc 2453 U32 state, charid;
a3621e74 2454 U32 pos = 0, zp=0;
1e2e3d02 2455 trie->statecount = laststate;
a3621e74
YO
2456
2457 for ( state = 1 ; state < laststate ; state++ ) {
2458 U8 flag = 0;
a28509cc
AL
2459 const U32 stateidx = TRIE_NODEIDX( state );
2460 const U32 o_used = trie->trans[ stateidx ].check;
2461 U32 used = trie->trans[ stateidx ].check;
a3621e74
YO
2462 trie->trans[ stateidx ].check = 0;
2463
2464 for ( charid = 0 ; used && charid < trie->uniquecharcount ; charid++ ) {
2465 if ( flag || trie->trans[ stateidx + charid ].next ) {
2466 if ( trie->trans[ stateidx + charid ].next ) {
2467 if (o_used == 1) {
2468 for ( ; zp < pos ; zp++ ) {
2469 if ( ! trie->trans[ zp ].next ) {
2470 break;
2471 }
2472 }
2473 trie->states[ state ].trans.base = zp + trie->uniquecharcount - charid ;
2474 trie->trans[ zp ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2475 trie->trans[ zp ].check = state;
2476 if ( ++zp > pos ) pos = zp;
2477 break;
2478 }
2479 used--;
2480 }
2481 if ( !flag ) {
2482 flag = 1;
2483 trie->states[ state ].trans.base = pos + trie->uniquecharcount - charid ;
2484 }
2485 trie->trans[ pos ].next = SAFE_TRIE_NODENUM( trie->trans[ stateidx + charid ].next );
2486 trie->trans[ pos ].check = state;
2487 pos++;
2488 }
2489 }
2490 }
cc601c31 2491 trie->lasttrans = pos + 1;
c944940b
JH
2492 trie->states = (reg_trie_state *)
2493 PerlMemShared_realloc( trie->states, laststate
2494 * sizeof(reg_trie_state) );
a3621e74 2495 DEBUG_TRIE_COMPILE_MORE_r(
e4584336 2496 PerlIO_printf( Perl_debug_log,
3dab1dad
YO
2497 "%*sAlloc: %d Orig: %"IVdf" elements, Final:%"IVdf". Savings of %%%5.2f\n",
2498 (int)depth * 2 + 2,"",
2499 (int)( ( TRIE_CHARCOUNT(trie) + 1 ) * trie->uniquecharcount + 1 ),
5d7488b2
AL
2500 (IV)next_alloc,
2501 (IV)pos,
a3621e74
YO
2502 ( ( next_alloc - pos ) * 100 ) / (double)next_alloc );
2503 );
2504
2505 } /* end table compress */
2506 }
1e2e3d02
YO
2507 DEBUG_TRIE_COMPILE_MORE_r(
2508 PerlIO_printf(Perl_debug_log, "%*sStatecount:%"UVxf" Lasttrans:%"UVxf"\n",
2509 (int)depth * 2 + 2, "",
2510 (UV)trie->statecount,
2511 (UV)trie->lasttrans)
2512 );
cc601c31 2513 /* resize the trans array to remove unused space */
c944940b
JH
2514 trie->trans = (reg_trie_trans *)
2515 PerlMemShared_realloc( trie->trans, trie->lasttrans
2516 * sizeof(reg_trie_trans) );
a3621e74 2517
3b753521 2518 { /* Modify the program and insert the new TRIE node */
3dab1dad
YO
2519 U8 nodetype =(U8)(flags & 0xFF);
2520 char *str=NULL;
786e8c11 2521
07be1b83 2522#ifdef DEBUGGING
e62cc96a 2523 regnode *optimize = NULL;
7122b237
YO
2524#ifdef RE_TRACK_PATTERN_OFFSETS
2525
b57a0404
JH
2526 U32 mjd_offset = 0;
2527 U32 mjd_nodelen = 0;
7122b237
YO
2528#endif /* RE_TRACK_PATTERN_OFFSETS */
2529#endif /* DEBUGGING */
a3621e74 2530 /*
3dab1dad
YO
2531 This means we convert either the first branch or the first Exact,
2532 depending on whether the thing following (in 'last') is a branch
2533 or not and whther first is the startbranch (ie is it a sub part of
2534 the alternation or is it the whole thing.)
3b753521 2535 Assuming its a sub part we convert the EXACT otherwise we convert
3dab1dad 2536 the whole branch sequence, including the first.
a3621e74 2537 */
3dab1dad 2538 /* Find the node we are going to overwrite */
7f69552c 2539 if ( first != startbranch || OP( last ) == BRANCH ) {
07be1b83 2540 /* branch sub-chain */
3dab1dad 2541 NEXT_OFF( first ) = (U16)(last - first);
7122b237 2542#ifdef RE_TRACK_PATTERN_OFFSETS
07be1b83
YO
2543 DEBUG_r({
2544 mjd_offset= Node_Offset((convert));
2545 mjd_nodelen= Node_Length((convert));
2546 });
7122b237 2547#endif
7f69552c 2548 /* whole branch chain */
7122b237
YO
2549 }
2550#ifdef RE_TRACK_PATTERN_OFFSETS
2551 else {
7f69552c
YO
2552 DEBUG_r({
2553 const regnode *nop = NEXTOPER( convert );
2554 mjd_offset= Node_Offset((nop));
2555 mjd_nodelen= Node_Length((nop));
2556 });
07be1b83
YO
2557 }
2558 DEBUG_OPTIMISE_r(
2559 PerlIO_printf(Perl_debug_log, "%*sMJD offset:%"UVuf" MJD length:%"UVuf"\n",
2560 (int)depth * 2 + 2, "",
786e8c11 2561 (UV)mjd_offset, (UV)mjd_nodelen)
07be1b83 2562 );
7122b237 2563#endif
3dab1dad
YO
2564 /* But first we check to see if there is a common prefix we can
2565 split out as an EXACT and put in front of the TRIE node. */
2566 trie->startstate= 1;
55eed653 2567 if ( trie->bitmap && !widecharmap && !trie->jump ) {
3dab1dad 2568 U32 state;
1e2e3d02 2569 for ( state = 1 ; state < trie->statecount-1 ; state++ ) {
a3621e74 2570 U32 ofs = 0;
8e11feef
RGS
2571 I32 idx = -1;
2572 U32 count = 0;
2573 const U32 base = trie->states[ state ].trans.base;
a3621e74 2574
3dab1dad 2575 if ( trie->states[state].wordnum )
8e11feef 2576 count = 1;
a3621e74 2577
8e11feef 2578 for ( ofs = 0 ; ofs < trie->uniquecharcount ; ofs++ ) {
cc601c31
YO
2579 if ( ( base + ofs >= trie->uniquecharcount ) &&
2580 ( base + ofs - trie->uniquecharcount < trie->lasttrans ) &&
a3621e74
YO
2581 trie->trans[ base + ofs - trie->uniquecharcount ].check == state )
2582 {
3dab1dad 2583 if ( ++count > 1 ) {
2b8b4781 2584 SV **tmp = av_fetch( revcharmap, ofs, 0);
07be1b83 2585 const U8 *ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2586 if ( state == 1 ) break;
3dab1dad
YO
2587 if ( count == 2 ) {
2588 Zero(trie->bitmap, ANYOF_BITMAP_SIZE, char);
2589 DEBUG_OPTIMISE_r(
8e11feef
RGS
2590 PerlIO_printf(Perl_debug_log,
2591 "%*sNew Start State=%"UVuf" Class: [",
2592 (int)depth * 2 + 2, "",
786e8c11 2593 (UV)state));
be8e71aa 2594 if (idx >= 0) {
2b8b4781 2595 SV ** const tmp = av_fetch( revcharmap, idx, 0);
be8e71aa 2596 const U8 * const ch = (U8*)SvPV_nolen_const( *tmp );
8e11feef 2597
3dab1dad 2598 TRIE_BITMAP_SET(trie,*ch);
8e11feef
RGS
2599 if ( folder )
2600 TRIE_BITMAP_SET(trie, folder[ *ch ]);
3dab1dad 2601 DEBUG_OPTIMISE_r(
f1f66076 2602 PerlIO_printf(Perl_debug_log, "%s", (char*)ch)
3dab1dad 2603 );
8e11feef
RGS
2604 }
2605 }
2606 TRIE_BITMAP_SET(trie,*ch);
2607 if ( folder )
2608 TRIE_BITMAP_SET(trie,folder[ *ch ]);
2609 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"%s", ch));
2610 }
2611 idx = ofs;
2612 }
3dab1dad
YO
2613 }
2614 if ( count == 1 ) {
2b8b4781 2615 SV **tmp = av_fetch( revcharmap, idx, 0);
c490c714
YO
2616 STRLEN len;
2617 char *ch = SvPV( *tmp, len );
de734bd5
A
2618 DEBUG_OPTIMISE_r({
2619 SV *sv=sv_newmortal();
8e11feef
RGS
2620 PerlIO_printf( Perl_debug_log,
2621 "%*sPrefix State: %"UVuf" Idx:%"UVuf" Char='%s'\n",
2622 (int)depth * 2 + 2, "",
de734bd5
A
2623 (UV)state, (UV)idx,
2624 pv_pretty(sv, SvPV_nolen_const(*tmp), SvCUR(*tmp), 6,
2625 PL_colors[0], PL_colors[1],
2626 (SvUTF8(*tmp) ? PERL_PV_ESCAPE_UNI : 0) |
2627 PERL_PV_ESCAPE_FIRSTCHAR
2628 )
2629 );
2630 });
3dab1dad
YO
2631 if ( state==1 ) {
2632 OP( convert ) = nodetype;
2633 str=STRING(convert);
2634 STR_LEN(convert)=0;
2635 }
c490c714
YO
2636 STR_LEN(convert) += len;
2637 while (len--)
de734bd5 2638 *str++ = *ch++;
8e11feef 2639 } else {
f9049ba1 2640#ifdef DEBUGGING
8e11feef
RGS
2641 if (state>1)
2642 DEBUG_OPTIMISE_r(PerlIO_printf( Perl_debug_log,"]\n"));
f9049ba1 2643#endif
8e11feef
RGS
2644 break;
2645 }
2646 }
2e64971a 2647 trie->prefixlen = (state-1);
3dab1dad 2648 if (str) {
8e11feef 2649 regnode *n = convert+NODE_SZ_STR(convert);
07be1b83 2650 NEXT_OFF(convert) = NODE_SZ_STR(convert);
8e11feef 2651 trie->startstate = state;
07be1b83
YO
2652 trie->minlen -= (state - 1);
2653 trie->maxlen -= (state - 1);
33809eae
JH
2654#ifdef DEBUGGING
2655 /* At least the UNICOS C compiler choked on this
2656 * being argument to DEBUG_r(), so let's just have
2657 * it right here. */
2658 if (
2659#ifdef PERL_EXT_RE_BUILD
2660 1
2661#else
2662 DEBUG_r_TEST
2663#endif
2664 ) {
2665 regnode *fix = convert;
2666 U32 word = trie->wordcount;
2667 mjd_nodelen++;
2668 Set_Node_Offset_Length(convert, mjd_offset, state - 1);
2669 while( ++fix < n ) {
2670 Set_Node_Offset_Length(fix, 0, 0);
2671 }
2672 while (word--) {
2673 SV ** const tmp = av_fetch( trie_words, word, 0 );
2674 if (tmp) {
2675 if ( STR_LEN(convert) <= SvCUR(*tmp) )
2676 sv_chop(*tmp, SvPV_nolen(*tmp) + STR_LEN(convert));
2677 else
2678 sv_chop(*tmp, SvPV_nolen(*tmp) + SvCUR(*tmp));
2679 }
2680 }
2681 }
2682#endif
8e11feef
RGS
2683 if (trie->maxlen) {
2684 convert = n;
2685 } else {
3dab1dad 2686 NEXT_OFF(convert) = (U16)(tail - convert);
a5ca303d 2687 DEBUG_r(optimize= n);
3dab1dad
YO
2688 }
2689 }
2690 }
a5ca303d
YO
2691 if (!jumper)
2692 jumper = last;
3dab1dad 2693 if ( trie->maxlen ) {
8e11feef
RGS
2694 NEXT_OFF( convert ) = (U16)(tail - convert);
2695 ARG_SET( convert, data_slot );
786e8c11
YO
2696 /* Store the offset to the first unabsorbed branch in
2697 jump[0], which is otherwise unused by the jump logic.
2698 We use this when dumping a trie and during optimisation. */
2699 if (trie->jump)
7f69552c 2700 trie->jump[0] = (U16)(nextbranch - convert);
a5ca303d 2701
6c48061a
YO
2702 /* If the start state is not accepting (meaning there is no empty string/NOTHING)
2703 * and there is a bitmap
2704 * and the first "jump target" node we found leaves enough room
2705 * then convert the TRIE node into a TRIEC node, with the bitmap
2706 * embedded inline in the opcode - this is hypothetically faster.
2707 */
2708 if ( !trie->states[trie->startstate].wordnum
2709 && trie->bitmap
2710 && ( (char *)jumper - (char *)convert) >= (int)sizeof(struct regnode_charclass) )
786e8c11
YO
2711 {
2712 OP( convert ) = TRIEC;
2713 Copy(trie->bitmap, ((struct regnode_charclass *)convert)->bitmap, ANYOF_BITMAP_SIZE, char);
446bd890 2714 PerlMemShared_free(trie->bitmap);
786e8c11
YO
2715 trie->bitmap= NULL;
2716 } else
2717 OP( convert ) = TRIE;
a3621e74 2718
3dab1dad
YO
2719 /* store the type in the flags */
2720 convert->flags = nodetype;
a5ca303d
YO
2721 DEBUG_r({
2722 optimize = convert
2723 + NODE_STEP_REGNODE
2724 + regarglen[ OP( convert ) ];
2725 });
2726 /* XXX We really should free up the resource in trie now,
2727 as we won't use them - (which resources?) dmq */
3dab1dad 2728 }
a3621e74 2729 /* needed for dumping*/
e62cc96a 2730 DEBUG_r(if (optimize) {
07be1b83 2731 regnode *opt = convert;
bcdf7404 2732
e62cc96a 2733 while ( ++opt < optimize) {
07be1b83
YO
2734 Set_Node_Offset_Length(opt,0,0);
2735 }
786e8c11
YO
2736 /*
2737 Try to clean up some of the debris left after the
2738 optimisation.
a3621e74 2739 */
786e8c11 2740 while( optimize < jumper ) {
07be1b83 2741 mjd_nodelen += Node_Length((optimize));
a3621e74 2742 OP( optimize ) = OPTIMIZED;
07be1b83 2743 Set_Node_Offset_Length(optimize,0,0);
a3621e74
YO
2744 optimize++;
2745 }
07be1b83 2746 Set_Node_Offset_Length(convert,mjd_offset,mjd_nodelen);
a3621e74
YO
2747 });
2748 } /* end node insert */
2e64971a
DM
2749
2750 /* Finish populating the prev field of the wordinfo array. Walk back
2751 * from each accept state until we find another accept state, and if
2752 * so, point the first word's .prev field at the second word. If the
2753 * second already has a .prev field set, stop now. This will be the
2754 * case either if we've already processed that word's accept state,
3b753521
FN
2755 * or that state had multiple words, and the overspill words were
2756 * already linked up earlier.
2e64971a
DM
2757 */
2758 {
2759 U16 word;
2760 U32 state;
2761 U16 prev;
2762
2763 for (word=1; word <= trie->wordcount; word++) {
2764 prev = 0;
2765 if (trie->wordinfo[word].prev)
2766 continue;
2767 state = trie->wordinfo[word].accept;
2768 while (state) {
2769 state = prev_states[state];
2770 if (!state)
2771 break;
2772 prev = trie->states[state].wordnum;
2773 if (prev)
2774 break;
2775 }
2776 trie->wordinfo[word].prev = prev;
2777 }
2778 Safefree(prev_states);
2779 }
2780
2781
2782 /* and now dump out the compressed format */
2783 DEBUG_TRIE_COMPILE_r(dump_trie(trie, widecharmap, revcharmap, depth+1));
2784
55eed653 2785 RExC_rxi->data->data[ data_slot + 1 ] = (void*)widecharmap;
2b8b4781
NC
2786#ifdef DEBUGGING
2787 RExC_rxi->data->data[ data_slot + TRIE_WORDS_OFFSET ] = (void*)trie_words;
2788 RExC_rxi->data->data[ data_slot + 3 ] = (void*)revcharmap;
2789#else
03e70be4 2790 SvREFCNT_dec_NN(revcharmap);
07be1b83 2791#endif
786e8c11
YO
2792 return trie->jump
2793 ? MADE_JUMP_TRIE
2794 : trie->startstate>1
2795 ? MADE_EXACT_TRIE
2796 : MADE_TRIE;
2797}
2798
2799STATIC void
2800S_make_trie_failtable(pTHX_ RExC_state_t *pRExC_state, regnode *source, regnode *stclass, U32 depth)
2801{
3b753521 2802/* The Trie is constructed and compressed now so we can build a fail array if it's needed
786e8c11
YO
2803
2804 This is basically the Aho-Corasick algorithm. Its from exercise 3.31 and 3.32 in the
2805 "Red Dragon" -- Compilers, principles, techniques, and tools. Aho, Sethi, Ullman 1985/88
2806 ISBN 0-201-10088-6
2807
2808 We find the fail state for each state in the trie, this state is the longest proper
3b753521
FN
2809 suffix of the current state's 'word' that is also a proper prefix of another word in our
2810 trie. State 1 represents the word '' and is thus the default fail state. This allows
786e8c11
YO
2811 the DFA not to have to restart after its tried and failed a word at a given point, it
2812 simply continues as though it had been matching the other word in the first place.
2813 Consider
2814 'abcdgu'=~/abcdefg|cdgu/
2815 When we get to 'd' we are still matching the first word, we would encounter 'g' which would
3b753521
FN
2816 fail, which would bring us to the state representing 'd' in the second word where we would
2817 try 'g' and succeed, proceeding to match 'cdgu'.
786e8c11
YO
2818 */
2819 /* add a fail transition */
3251b653
NC
2820 const U32 trie_offset = ARG(source);
2821 reg_trie_data *trie=(reg_trie_data *)RExC_rxi->data->data[trie_offset];
786e8c11
YO
2822 U32 *q;
2823 const U32 ucharcount = trie->uniquecharcount;
1e2e3d02 2824 const U32 numstates = trie->statecount;
786e8c11
YO
2825 const U32 ubound = trie->lasttrans + ucharcount;
2826 U32 q_read = 0;
2827 U32 q_write = 0;
2828 U32 charid;
2829 U32 base = trie->states[ 1 ].trans.base;
2830 U32 *fail;
2831 reg_ac_data *aho;
cf78de0b 2832 const U32 data_slot = add_data( pRExC_state, STR_WITH_LEN("T"));
786e8c11 2833 GET_RE_DEBUG_FLAGS_DECL;
7918f24d
NC
2834
2835 PERL_ARGS_ASSERT_MAKE_TRIE_FAILTABLE;
786e8c11
YO
2836#ifndef DEBUGGING
2837 PERL_UNUSED_ARG(depth);
2838#endif
2839
2840
2841 ARG_SET( stclass, data_slot );
c944940b 2842 aho = (reg_ac_data *) PerlMemShared_calloc( 1, sizeof(reg_ac_data) );
f8fc2ecf 2843 RExC_rxi->data->data[ data_slot ] = (void*)aho;
3251b653 2844 aho->trie=trie_offset;
446bd890
NC
2845 aho->states=(reg_trie_state *)PerlMemShared_malloc( numstates * sizeof(reg_trie_state) );
2846 Copy( trie->states, aho->states, numstates, reg_trie_state );
786e8c11 2847 Newxz( q, numstates, U32);
c944940b 2848 aho->fail = (U32 *) PerlMemShared_calloc( numstates, sizeof(U32) );
786e8c11
YO
2849 aho->refcount = 1;
2850 fail = aho->fail;
2851 /* initialize fail[0..1] to be 1 so that we always have
2852 a valid final fail state */
2853 fail[ 0 ] = fail[ 1 ] = 1;
2854
2855 for ( charid = 0; charid < ucharcount ; charid++ ) {
2856 const U32 newstate = TRIE_TRANS_STATE( 1, base, ucharcount, charid, 0 );
2857 if ( newstate ) {
2858 q[ q_write ] = newstate;
2859 /* set to point at the root */
2860 fail[ q[ q_write++ ] ]=1;
2861 }
2862 }
2863 while ( q_read < q_write) {
2864 const U32 cur = q[ q_read++ % numstates ];
2865 base = trie->states[ cur ].trans.base;
2866
2867 for ( charid = 0 ; charid < ucharcount ; charid++ ) {
2868 const U32 ch_state = TRIE_TRANS_STATE( cur, base, ucharcount, charid, 1 );
2869 if (ch_state) {
2870 U32 fail_state = cur;
2871 U32 fail_base;
2872 do {
2873 fail_state = fail[ fail_state ];
2874 fail_base = aho->states[ fail_state ].trans.base;
2875 } while ( !TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 ) );
2876
2877 fail_state = TRIE_TRANS_STATE( fail_state, fail_base, ucharcount, charid, 1 );
2878 fail[ ch_state ] = fail_state;
2879 if ( !aho->states[ ch_state ].wordnum && aho->states[ fail_state ].wordnum )
2880 {
2881 aho->states[ ch_state ].wordnum = aho->states[ fail_state ].wordnum;
2882 }
2883 q[ q_write++ % numstates] = ch_state;
2884 }
2885 }
2886 }
2887 /* restore fail[0..1] to 0 so that we "fall out" of the AC loop
2888 when we fail in state 1, this allows us to use the
2889 charclass scan to find a valid start char. This is based on the principle
2890 that theres a good chance the string being searched contains lots of stuff
2891 that cant be a start char.
2892 */
2893 fail[ 0 ] = fail[ 1 ] = 0;
2894 DEBUG_TRIE_COMPILE_r({
6d99fb9b
JH
2895 PerlIO_printf(Perl_debug_log,
2896 "%*sStclass Failtable (%"UVuf" states): 0",
2897 (int)(depth * 2), "", (UV)numstates
1e2e3d02 2898 );
786e8c11
YO
2899 for( q_read=1; q_read<numstates; q_read++ ) {
2900 PerlIO_printf(Perl_debug_log, ", %"UVuf, (UV)fail[q_read]);
2901 }
2902 PerlIO_printf(Perl_debug_log, "\n");
2903 });
2904 Safefree(q);
2905 /*RExC_seen |= REG_SEEN_TRIEDFA;*/
a3621e74
YO
2906}
2907
786e8c11 2908
07be1b83 2909#define DEBUG_PEEP(str,scan,depth) \
b515a41d 2910 DEBUG_OPTIMISE_r({if (scan){ \
07be1b83
YO
2911 SV * const mysv=sv_newmortal(); \
2912 regnode *Next = regnext(scan); \
2913 regprop(RExC_rx, mysv, scan); \
7f69552c 2914 PerlIO_printf(Perl_debug_log, "%*s" str ">%3d: %s (%d)\n", \
07be1b83
YO
2915 (int)depth*2, "", REG_NODE_NUM(scan), SvPV_nolen_const(mysv),\
2916 Next ? (REG_NODE_NUM(Next)) : 0 ); \
b515a41d 2917 }});
07be1b83 2918
1de06328 2919
bb914485 2920/* The below joins as many adjacent EXACTish nodes as possible into a single
0a982f06
KW
2921 * one. The regop may be changed if the node(s) contain certain sequences that
2922 * require special handling. The joining is only done if:
bb914485
KW
2923 * 1) there is room in the current conglomerated node to entirely contain the
2924 * next one.
2925 * 2) they are the exact same node type
2926 *
87b8b349 2927 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
bb914485
KW
2928 * these get optimized out
2929 *
0a982f06
KW
2930 * If a node is to match under /i (folded), the number of characters it matches
2931 * can be different than its character length if it contains a multi-character
2932 * fold. *min_subtract is set to the total delta of the input nodes.
bb914485 2933 *
a0c4c608
KW
2934 * And *has_exactf_sharp_s is set to indicate whether or not the node is EXACTF
2935 * and contains LATIN SMALL LETTER SHARP S
f758bddf 2936 *
bb914485 2937 * This is as good a place as any to discuss the design of handling these
0a982f06
KW
2938 * multi-character fold sequences. It's been wrong in Perl for a very long
2939 * time. There are three code points in Unicode whose multi-character folds
2940 * were long ago discovered to mess things up. The previous designs for
2941 * dealing with these involved assigning a special node for them. This
2942 * approach doesn't work, as evidenced by this example:
a0c4c608 2943 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
0a982f06
KW
2944 * Both these fold to "sss", but if the pattern is parsed to create a node that
2945 * would match just the \xDF, it won't be able to handle the case where a
bb914485
KW
2946 * successful match would have to cross the node's boundary. The new approach
2947 * that hopefully generally solves the problem generates an EXACTFU_SS node
2948 * that is "sss".
2949 *
0a982f06 2950 * It turns out that there are problems with all multi-character folds, and not
cb117658
KW
2951 * just these three. Now the code is general, for all such cases. The
2952 * approach taken is:
0a982f06
KW
2953 * 1) This routine examines each EXACTFish node that could contain multi-
2954 * character fold sequences. It returns in *min_subtract how much to
9d071ca8 2955 * subtract from the the actual length of the string to get a real minimum
0a982f06
KW
2956 * match length; it is 0 if there are no multi-char folds. This delta is
2957 * used by the caller to adjust the min length of the match, and the delta
2958 * between min and max, so that the optimizer doesn't reject these
2959 * possibilities based on size constraints.
cb117658 2960 * 2) For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
0a982f06
KW
2961 * is used for an EXACTFU node that contains at least one "ss" sequence in
2962 * it. For non-UTF-8 patterns and strings, this is the only case where
2963 * there is a possible fold length change. That means that a regular
2964 * EXACTFU node without UTF-8 involvement doesn't have to concern itself
2965 * with length changes, and so can be processed faster. regexec.c takes
2966 * advantage of this. Generally, an EXACTFish node that is in UTF-8 is
2967 * pre-folded by regcomp.c. This saves effort in regex matching.
87b8b349 2968 * However, the pre-folding isn't done for non-UTF8 patterns because the
0a982f06
KW
2969 * fold of the MICRO SIGN requires UTF-8, and we don't want to slow things
2970 * down by forcing the pattern into UTF8 unless necessary. Also what
2971 * EXACTF and EXACTFL nodes fold to isn't known until runtime. The fold
2972 * possibilities for the non-UTF8 patterns are quite simple, except for
2973 * the sharp s. All the ones that don't involve a UTF-8 target string are
2974 * members of a fold-pair, and arrays are set up for all of them so that
2975 * the other member of the pair can be found quickly. Code elsewhere in
2976 * this file makes sure that in EXACTFU nodes, the sharp s gets folded to
2977 * 'ss', even if the pattern isn't UTF-8. This avoids the issues
2978 * described in the next item.
098b07d5 2979 * 3) A problem remains for the sharp s in EXACTF and EXACTFA nodes when the
1ca267a5
KW
2980 * pattern isn't in UTF-8. (BTW, there cannot be an EXACTF node with a
2981 * UTF-8 pattern.) An assumption that the optimizer part of regexec.c
2982 * (probably unwittingly, in Perl_regexec_flags()) makes is that a
2983 * character in the pattern corresponds to at most a single character in
2984 * the target string. (And I do mean character, and not byte here, unlike
2985 * other parts of the documentation that have never been updated to
2986 * account for multibyte Unicode.) sharp s in EXACTF nodes can match the
2987 * two character string 'ss'; in EXACTFA nodes it can match
2988 * "\x{17F}\x{17F}". These violate the assumption, and they are the only
2989 * instances where it is violated. I'm reluctant to try to change the
2990 * assumption, as the code involved is impenetrable to me (khw), so
2991 * instead the code here punts. This routine examines (when the pattern
2992 * isn't UTF-8) EXACTF and EXACTFA nodes for the sharp s, and returns a
2993 * boolean indicating whether or not the node contains a sharp s. When it
2994 * is true, the caller sets a flag that later causes the optimizer in this
2995 * file to not set values for the floating and fixed string lengths, and
2996 * thus avoids the optimizer code in regexec.c that makes the invalid
2997 * assumption. Thus, there is no optimization based on string lengths for
2998 * non-UTF8-pattern EXACTF and EXACTFA nodes that contain the sharp s.
2999 * (The reason the assumption is wrong only in these two cases is that all
3000 * other non-UTF-8 folds are 1-1; and, for UTF-8 patterns, we pre-fold all
3001 * other folds to their expanded versions. We can't prefold sharp s to
3002 * 'ss' in EXACTF nodes because we don't know at compile time if it
3003 * actually matches 'ss' or not. It will match iff the target string is
3004 * in UTF-8, unlike the EXACTFU nodes, where it always matches; and
3005 * EXACTFA and EXACTFL where it never does. In an EXACTFA node in a UTF-8
3006 * pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the problem;
3007 * but in a non-UTF8 pattern, folding it to that above-Latin1 string would
3008 * require the pattern to be forced into UTF-8, the overhead of which we
3009 * want to avoid.)
098b07d5
KW
3010 *
3011 * Similarly, the code that generates tries doesn't currently handle
3012 * not-already-folded multi-char folds, and it looks like a pain to change
3013 * that. Therefore, trie generation of EXACTFA nodes with the sharp s
3014 * doesn't work. Instead, such an EXACTFA is turned into a new regnode,
3015 * EXACTFA_NO_TRIE, which the trie code knows not to handle. Most people
3016 * using /iaa matching will be doing so almost entirely with ASCII
3017 * strings, so this should rarely be encountered in practice */
1de06328 3018
9d071ca8 3019#define JOIN_EXACT(scan,min_subtract,has_exactf_sharp_s, flags) \
07be1b83 3020 if (PL_regkind[OP(scan)] == EXACT) \
9d071ca8 3021 join_exact(pRExC_state,(scan),(min_subtract),has_exactf_sharp_s, (flags),NULL,depth+1)
07be1b83 3022
be8e71aa 3023STATIC U32
9d071ca8 3024S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan, UV *min_subtract, bool *has_exactf_sharp_s, U32 flags,regnode *val, U32 depth) {
07be1b83
YO
3025 /* Merge several consecutive EXACTish nodes into one. */
3026 regnode *n = regnext(scan);
3027 U32 stringok = 1;
3028 regnode *next = scan + NODE_SZ_STR(scan);
3029 U32 merged = 0;
3030 U32 stopnow = 0;
3031#ifdef DEBUGGING
3032 regnode *stop = scan;
72f13be8 3033 GET_RE_DEBUG_FLAGS_DECL;
f9049ba1 3034#else
d47053eb
RGS
3035 PERL_UNUSED_ARG(depth);
3036#endif
7918f24d
NC
3037
3038 PERL_ARGS_ASSERT_JOIN_EXACT;
d47053eb 3039#ifndef EXPERIMENTAL_INPLACESCAN
f9049ba1
SP
3040 PERL_UNUSED_ARG(flags);
3041 PERL_UNUSED_ARG(val);
07be1b83 3042#endif
07be1b83 3043 DEBUG_PEEP("join",scan,depth);
bb914485 3044
3f410cf6
KW
3045 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
3046 * EXACT ones that are mergeable to the current one. */
3047 while (n
3048 && (PL_regkind[OP(n)] == NOTHING
3049 || (stringok && OP(n) == OP(scan)))
07be1b83 3050 && NEXT_OFF(n)
3f410cf6
KW
3051 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
3052 {
07be1b83
YO
3053
3054 if (OP(n) == TAIL || n > next)
3055 stringok = 0;
3056 if (PL_regkind[OP(n)] == NOTHING) {
07be1b83
YO
3057 DEBUG_PEEP("skip:",n,depth);
3058 NEXT_OFF(scan) += NEXT_OFF(n);
3059 next = n + NODE_STEP_REGNODE;
3060#ifdef DEBUGGING
3061 if (stringok)
3062 stop = n;
3063#endif
3064 n = regnext(n);
3065 }
3066 else if (stringok) {
786e8c11 3067 const unsigned int oldl = STR_LEN(scan);
07be1b83 3068 regnode * const nnext = regnext(n);
b2230d39 3069
87b8b349
KW
3070 /* XXX I (khw) kind of doubt that this works on platforms where
3071 * U8_MAX is above 255 because of lots of other assumptions */
79a81a6e 3072 /* Don't join if the sum can't fit into a single node */
b2230d39
KW
3073 if (oldl + STR_LEN(n) > U8_MAX)
3074 break;
07be1b83
YO
3075
3076 DEBUG_PEEP("merg",n,depth);
07be1b83 3077 merged++;
b2230d39 3078
07be1b83
YO
3079 NEXT_OFF(scan) += NEXT_OFF(n);
3080 STR_LEN(scan) += STR_LEN(n);
3081 next = n + NODE_SZ_STR(n);
3082 /* Now we can overwrite *n : */
3083 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
3084#ifdef DEBUGGING
3085 stop = next - 1;
3086#endif
3087 n = nnext;
3088 if (stopnow) break;
3089 }
3090
d47053eb
RGS
3091#ifdef EXPERIMENTAL_INPLACESCAN
3092 if (flags && !NEXT_OFF(n)) {
3093 DEBUG_PEEP("atch", val, depth);
3094 if (reg_off_by_arg[OP(n)]) {
3095 ARG_SET(n, val - n);
3096 }
3097 else {
3098 NEXT_OFF(n) = val - n;
3099 }
3100 stopnow = 1;
3101 }
07be1b83
YO
3102#endif
3103 }
2c2b7f86 3104
9d071ca8 3105 *min_subtract = 0;
f758bddf 3106 *has_exactf_sharp_s = FALSE;
f646642f 3107
3f410cf6
KW
3108 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
3109 * can now analyze for sequences of problematic code points. (Prior to
3110 * this final joining, sequences could have been split over boundaries, and
a0c4c608
KW
3111 * hence missed). The sequences only happen in folding, hence for any
3112 * non-EXACT EXACTish node */
86d6fcad 3113 if (OP(scan) != EXACT) {
0a982f06
KW
3114 const U8 * const s0 = (U8*) STRING(scan);
3115 const U8 * s = s0;
3116 const U8 * const s_end = s0 + STR_LEN(scan);
f758bddf
KW
3117
3118 /* One pass is made over the node's string looking for all the
3119 * possibilities. to avoid some tests in the loop, there are two main
3120 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
3121 * non-UTF-8 */
3122 if (UTF) {
86d6fcad 3123
0a982f06
KW
3124 /* Examine the string for a multi-character fold sequence. UTF-8
3125 * patterns have all characters pre-folded by the time this code is
3126 * executed */
3127 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
3128 length sequence we are looking for is 2 */
86d6fcad 3129 {
0a982f06
KW
3130 int count = 0;
3131 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
3132 if (! len) { /* Not a multi-char fold: get next char */
3133 s += UTF8SKIP(s);
3134 continue;
3135 }
bb914485 3136
0a982f06 3137 /* Nodes with 'ss' require special handling, except for EXACTFL
098b07d5
KW
3138 * and EXACTFA-ish for which there is no multi-char fold to
3139 * this */
0a982f06 3140 if (len == 2 && *s == 's' && *(s+1) == 's'
098b07d5
KW
3141 && OP(scan) != EXACTFL
3142 && OP(scan) != EXACTFA
3143 && OP(scan) != EXACTFA_NO_TRIE)
0a982f06
KW
3144 {
3145 count = 2;
3146 OP(scan) = EXACTFU_SS;
3147 s += 2;
3148 }
0a982f06
KW
3149 else { /* Here is a generic multi-char fold. */
3150 const U8* multi_end = s + len;
3151
3152 /* Count how many characters in it. In the case of /l and
3153 * /aa, no folds which contain ASCII code points are
3154 * allowed, so check for those, and skip if found. (In
3155 * EXACTFL, no folds are allowed to any Latin1 code point,
3156 * not just ASCII. But there aren't any of these
3157 * currently, nor ever likely, so don't take the time to
3158 * test for them. The code that generates the
3159 * is_MULTI_foo() macros croaks should one actually get put
3160 * into Unicode .) */
098b07d5
KW
3161 if (OP(scan) != EXACTFL
3162 && OP(scan) != EXACTFA
3163 && OP(scan) != EXACTFA_NO_TRIE)
3164 {
0a982f06
KW
3165 count = utf8_length(s, multi_end);
3166 s = multi_end;
3167 }
3168 else {
3169 while (s < multi_end) {
3170 if (isASCII(*s)) {
3171 s++;
3172 goto next_iteration;
3173 }
3174 else {
3175 s += UTF8SKIP(s);
3176 }
3177 count++;
3178 }
3179 }
3180 }
f758bddf 3181
0a982f06
KW
3182 /* The delta is how long the sequence is minus 1 (1 is how long
3183 * the character that folds to the sequence is) */
3184 *min_subtract += count - 1;
3185 next_iteration: ;
bb914485
KW
3186 }
3187 }
1ca267a5
KW
3188 else if (OP(scan) == EXACTFA) {
3189
3190 /* Non-UTF-8 pattern, EXACTFA node. There can't be a multi-char
3191 * fold to the ASCII range (and there are no existing ones in the
3192 * upper latin1 range). But, as outlined in the comments preceding
098b07d5
KW
3193 * this function, we need to flag any occurrences of the sharp s.
3194 * This character forbids trie formation (because of added
3195 * complexity) */
1ca267a5
KW
3196 while (s < s_end) {
3197 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
098b07d5 3198 OP(scan) = EXACTFA_NO_TRIE;
1ca267a5
KW
3199 *has_exactf_sharp_s = TRUE;
3200 break;
3201 }
3202 s++;
3203 continue;
3204 }
3205 }
3206 else if (OP(scan) != EXACTFL) {
3207
3208 /* Non-UTF-8 pattern, not EXACTFA nor EXACTFL node. Look for the
3209 * multi-char folds that are all Latin1. (This code knows that
3210 * there are no current multi-char folds possible with EXACTFL,
3211 * relying on fold_grind.t to catch any errors if the very unlikely
3212 * event happens that some get added in future Unicode versions.)
3213 * As explained in the comments preceding this function, we look
3214 * also for the sharp s in EXACTF nodes; it can be in the final
0a982f06
KW
3215 * position. Otherwise we can stop looking 1 byte earlier because
3216 * have to find at least two characters for a multi-fold */
f758bddf
KW
3217 const U8* upper = (OP(scan) == EXACTF) ? s_end : s_end -1;
3218
0a982f06 3219 while (s < upper) {
40b1ba4f 3220 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
0a982f06
KW
3221 if (! len) { /* Not a multi-char fold. */
3222 if (*s == LATIN_SMALL_LETTER_SHARP_S && OP(scan) == EXACTF)
3223 {
3224 *has_exactf_sharp_s = TRUE;
3225 }
3226 s++;
3227 continue;
3228 }
3229
3230 if (len == 2
c02c3054
KW
3231 && isARG2_lower_or_UPPER_ARG1('s', *s)
3232 && isARG2_lower_or_UPPER_ARG1('s', *(s+1)))
0a982f06
KW
3233 {
3234
3235 /* EXACTF nodes need to know that the minimum length
3236 * changed so that a sharp s in the string can match this
3237 * ss in the pattern, but they remain EXACTF nodes, as they
3238 * won't match this unless the target string is is UTF-8,
3239 * which we don't know until runtime */
3240 if (OP(scan) != EXACTF) {
3241 OP(scan) = EXACTFU_SS;
3242 }
86d6fcad 3243 }
0a982f06
KW
3244
3245 *min_subtract += len - 1;
3246 s += len;
86d6fcad
KW
3247 }
3248 }
07be1b83 3249 }
3f410cf6 3250
07be1b83 3251#ifdef DEBUGGING
bb789b09
DM
3252 /* Allow dumping but overwriting the collection of skipped
3253 * ops and/or strings with fake optimized ops */
07be1b83
YO
3254 n = scan + NODE_SZ_STR(scan);
3255 while (n <= stop) {
bb789b09
DM
3256 OP(n) = OPTIMIZED;
3257 FLAGS(n) = 0;
3258 NEXT_OFF(n) = 0;
07be1b83
YO
3259 n++;
3260 }
3261#endif
3262 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl",scan,depth)});
3263 return stopnow;
3264}
3265
486ec47a 3266/* REx optimizer. Converts nodes into quicker variants "in place".
653099ff
GS
3267 Finds fixed substrings. */
3268
a0288114 3269/* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
c277df42
IZ
3270 to the position after last scanned or to NULL. */
3271
40d049e4
YO
3272#define INIT_AND_WITHP \
3273 assert(!and_withp); \
b8f7bb16 3274 Newx(and_withp,1, regnode_ssc); \
40d049e4 3275 SAVEFREEPV(and_withp)
07be1b83 3276
b515a41d 3277/* this is a chain of data about sub patterns we are processing that
486ec47a 3278 need to be handled separately/specially in study_chunk. Its so
b515a41d
YO
3279 we can simulate recursion without losing state. */
3280struct scan_frame;
3281typedef struct scan_frame {
3282 regnode *last; /* last node to process in this frame */
3283 regnode *next; /* next node to process when last is reached */
3284 struct scan_frame *prev; /*previous frame*/
3285 I32 stop; /* what stopparen do we use */
3286} scan_frame;
3287
304ee84b
YO
3288
3289#define SCAN_COMMIT(s, data, m) scan_commit(s, data, m, is_inf)
3290
49f55535 3291STATIC SSize_t
40d049e4 3292S_study_chunk(pTHX_ RExC_state_t *pRExC_state, regnode **scanp,
ea3daa5d 3293 SSize_t *minlenp, SSize_t *deltap,
40d049e4
YO
3294 regnode *last,
3295 scan_data_t *data,
3296 I32 stopparen,
3297 U8* recursed,
b8f7bb16 3298 regnode_ssc *and_withp,
40d049e4 3299 U32 flags, U32 depth)
c277df42
IZ
3300 /* scanp: Start here (read-write). */
3301 /* deltap: Write maxlen-minlen here. */
3302 /* last: Stop before this one. */
40d049e4
YO
3303 /* data: string data about the pattern */
3304 /* stopparen: treat close N as END */
3305 /* recursed: which subroutines have we recursed into */
3306 /* and_withp: Valid if flags & SCF_DO_STCLASS_OR */
c277df42 3307{
97aff369 3308 dVAR;
ea3daa5d
FC
3309 /* There must be at least this number of characters to match */
3310 SSize_t min = 0;
2d608413 3311 I32 pars = 0, code;
c277df42 3312 regnode *scan = *scanp, *next;
ea3daa5d 3313 SSize_t delta = 0;
c277df42 3314 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
aca2d497 3315 int is_inf_internal = 0; /* The studied chunk is infinite */
c277df42
IZ
3316 I32 is_par = OP(scan) == OPEN ? ARG(scan) : 0;
3317 scan_data_t data_fake;
a3621e74 3318 SV *re_trie_maxbuff = NULL;
786e8c11 3319 regnode *first_non_open = scan;
ea3daa5d 3320 SSize_t stopmin = SSize_t_MAX;
8aa23a47 3321 scan_frame *frame = NULL;
a3621e74 3322 GET_RE_DEBUG_FLAGS_DECL;
8aa23a47 3323
7918f24d
NC
3324 PERL_ARGS_ASSERT_STUDY_CHUNK;
3325
13a24bad 3326#ifdef DEBUGGING
40d049e4 3327 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
13a24bad 3328#endif
40d049e4 3329
786e8c11 3330 if ( depth == 0 ) {
40d049e4 3331 while (first_non_open && OP(first_non_open) == OPEN)
786e8c11
YO
3332 first_non_open=regnext(first_non_open);
3333 }
3334
b81d288d 3335
8aa23a47
YO
3336 fake_study_recurse:
3337 while ( scan && OP(scan) != END && scan < last ){
2d608413
KW
3338 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
3339 node length to get a real minimum (because
3340 the folded version may be shorter) */
f758bddf 3341 bool has_exactf_sharp_s = FALSE;
8aa23a47 3342 /* Peephole optimizer: */
304ee84b 3343 DEBUG_STUDYDATA("Peep:", data,depth);
8aa23a47 3344 DEBUG_PEEP("Peep",scan,depth);
a0c4c608
KW
3345
3346 /* Its not clear to khw or hv why this is done here, and not in the
3347 * clauses that deal with EXACT nodes. khw's guess is that it's
3348 * because of a previous design */
9d071ca8 3349 JOIN_EXACT(scan,&min_subtract, &has_exactf_sharp_s, 0);
8aa23a47
YO
3350
3351 /* Follow the next-chain of the current node and optimize
3352 away all the NOTHINGs from it. */
3353 if (OP(scan) != CURLYX) {
3354 const int max = (reg_off_by_arg[OP(scan)]
3355 ? I32_MAX
3356 /* I32 may be smaller than U16 on CRAYs! */
3357 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
3358 int off = (reg_off_by_arg[OP(scan)] ? ARG(scan) : NEXT_OFF(scan));
3359 int noff;
3360 regnode *n = scan;
686b73d4 3361
8aa23a47
YO
3362 /* Skip NOTHING and LONGJMP. */
3363 while ((n = regnext(n))
3364 && ((PL_regkind[OP(n)] == NOTHING && (noff = NEXT_OFF(n)))
3365 || ((OP(n) == LONGJMP) && (noff = ARG(n))))
3366 && off + noff < max)
3367 off += noff;
3368 if (reg_off_by_arg[OP(scan)])
3369 ARG(scan) = off;
3370 else
3371 NEXT_OFF(scan) = off;
3372 }
a3621e74 3373
c277df42 3374
8aa23a47
YO
3375
3376 /* The principal pseudo-switch. Cannot be a switch, since we
3377 look into several different things. */
3378 if (OP(scan) == BRANCH || OP(scan) == BRANCHJ
3379 || OP(scan) == IFTHEN) {
3380 next = regnext(scan);
3381 code = OP(scan);
3382 /* demq: the op(next)==code check is to see if we have "branch-branch" AFAICT */
686b73d4 3383
8aa23a47
YO
3384 if (OP(next) == code || code == IFTHEN) {
3385 /* NOTE - There is similar code to this block below for handling
3386 TRIE nodes on a re-study. If you change stuff here check there
3387 too. */
49f55535 3388 SSize_t max1 = 0, min1 = SSize_t_MAX, num = 0;
b8f7bb16 3389 regnode_ssc accum;
8aa23a47 3390 regnode * const startbranch=scan;
686b73d4 3391
8aa23a47 3392 if (flags & SCF_DO_SUBSTR)
304ee84b 3393 SCAN_COMMIT(pRExC_state, data, minlenp); /* Cannot merge strings after this. */
8aa23a47 3394 if (flags & SCF_DO_STCLASS)
557bd3fb 3395 ssc_init_zero(pRExC_state, &accum);
8aa23a47
YO
3396
3397 while (OP(scan) == code) {
49f55535
FC
3398 SSize_t deltanext, minnext, fake;
3399 I32 f = 0;
b8f7bb16 3400 regnode_ssc this_class;
8aa23a47
YO
3401
3402 num++;
3403 data_fake.flags = 0;
3404 if (data) {
3405 data_fake.whilem_c = data->whilem_c;
3406 data_fake.last_closep = data->last_closep;
3407 }
3408 else
3409 data_fake.last_closep = &fake;
58e23c8d
YO
3410
3411 data_fake.pos_delta = delta;
8aa23a47
YO
3412 next = regnext(scan);
3413 scan = NEXTOPER(scan);
3414 if (code != BRANCH)
c277df42 3415 scan = NEXTOPER(scan);
8aa23a47 3416 if (flags & SCF_DO_STCLASS) {
557bd3fb 3417 ssc_init(pRExC_state, &this_class);
8aa23a47
YO
3418 data_fake.start_class = &this_class;
3419 f = SCF_DO_STCLASS_AND;
58e23c8d 3420 }
8aa23a47
YO
3421 if (flags & SCF_WHILEM_VISITED_POS)
3422 f |= SCF_WHILEM_VISITED_POS;
3423
3424 /* we suppose the run is continuous, last=next...*/
3425 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
3426 next, &data_fake,
3427 stopparen, recursed, NULL, f,depth+1);
3428 if (min1 > minnext)
3429 min1 = minnext;
ea3daa5d 3430 if (deltanext == SSize_t_MAX) {
8aa23a47 3431 is_inf = is_inf_internal = 1;
ea3daa5d 3432 max1 = SSize_t_MAX;
9b139d09
GG
3433 } else if (max1 < minnext + deltanext)
3434 max1 = minnext + deltanext;
8aa23a47
YO
3435 scan = next;
3436 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3437 pars++;
3438 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3439 if ( stopmin > minnext)
3440 stopmin = min + min1;
3441 flags &= ~SCF_DO_SUBSTR;
3442 if (data)
3443 data->flags |= SCF_SEEN_ACCEPT;
3444 }
3445 if (data) {
3446 if (data_fake.flags & SF_HAS_EVAL)
3447 data->flags |= SF_HAS_EVAL;
3448 data->whilem_c = data_fake.whilem_c;
3dab1dad 3449 }
8aa23a47 3450 if (flags & SCF_DO_STCLASS)
557bd3fb 3451 ssc_or(pRExC_state, &accum, &this_class);
8aa23a47
YO
3452 }
3453 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
3454 min1 = 0;
3455 if (flags & SCF_DO_SUBSTR) {
3456 data->pos_min += min1;
ea3daa5d
FC
3457 if (data->pos_delta >= SSize_t_MAX - (max1 - min1))
3458 data->pos_delta = SSize_t_MAX;
9b139d09
GG
3459 else
3460 data->pos_delta += max1 - min1;
8aa23a47
YO
3461 if (max1 != min1 || is_inf)
3462 data->longest = &(data->longest_float);
3463 }
3464 min += min1;
ea3daa5d
FC
3465 if (delta == SSize_t_MAX
3466 || SSize_t_MAX - delta - (max1 - min1) < 0)
3467 delta = SSize_t_MAX;
9b139d09
GG
3468 else
3469 delta += max1 - min1;
8aa23a47 3470 if (flags & SCF_DO_STCLASS_OR) {
557bd3fb 3471 ssc_or(pRExC_state, data->start_class, &accum);
8aa23a47 3472 if (min1) {
1ca93ef1 3473 ssc_and(pRExC_state, data->start_class, and_withp);
8aa23a47 3474 flags &= ~SCF_DO_STCLASS;
653099ff 3475 }
8aa23a47
YO
3476 }
3477 else if (flags & SCF_DO_STCLASS_AND) {
3478 if (min1) {
1ca93ef1 3479 ssc_and(pRExC_state, data->start_class, &accum);
8aa23a47 3480 flags &= ~SCF_DO_STCLASS;
de0c8cb8 3481 }
8aa23a47
YO
3482 else {
3483 /* Switch to OR mode: cache the old value of
3484 * data->start_class */
3485 INIT_AND_WITHP;
b8f7bb16 3486 StructCopy(data->start_class, and_withp, regnode_ssc);
8aa23a47 3487 flags &= ~SCF_DO_STCLASS_AND;
b8f7bb16 3488 StructCopy(&accum, data->start_class, regnode_ssc);
8aa23a47 3489 flags |= SCF_DO_STCLASS_OR;
899d20b9 3490 SET_SSC_EOS(data->start_class);
de0c8cb8 3491 }
8aa23a47 3492 }
a3621e74 3493
8aa23a47
YO
3494 if (PERL_ENABLE_TRIE_OPTIMISATION && OP( startbranch ) == BRANCH ) {
3495 /* demq.
a3621e74 3496
8aa23a47
YO
3497 Assuming this was/is a branch we are dealing with: 'scan' now
3498 points at the item that follows the branch sequence, whatever
3499 it is. We now start at the beginning of the sequence and look
3500 for subsequences of
a3621e74 3501
8aa23a47
YO
3502 BRANCH->EXACT=>x1
3503 BRANCH->EXACT=>x2
3504 tail
a3621e74 3505
8aa23a47 3506 which would be constructed from a pattern like /A|LIST|OF|WORDS/
a3621e74 3507
486ec47a 3508 If we can find such a subsequence we need to turn the first
8aa23a47
YO
3509 element into a trie and then add the subsequent branch exact
3510 strings to the trie.
a3621e74 3511
8aa23a47 3512 We have two cases
a3621e74 3513
3b753521 3514 1. patterns where the whole set of branches can be converted.
a3621e74 3515
8aa23a47 3516 2. patterns where only a subset can be converted.
a3621e74 3517
8aa23a47
YO
3518 In case 1 we can replace the whole set with a single regop
3519 for the trie. In case 2 we need to keep the start and end
3b753521 3520 branches so
a3621e74 3521
8aa23a47
YO
3522 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
3523 becomes BRANCH TRIE; BRANCH X;
786e8c11 3524
8aa23a47
YO
3525 There is an additional case, that being where there is a
3526 common prefix, which gets split out into an EXACT like node
3527 preceding the TRIE node.
a3621e74 3528
8aa23a47
YO
3529 If x(1..n)==tail then we can do a simple trie, if not we make
3530 a "jump" trie, such that when we match the appropriate word
486ec47a 3531 we "jump" to the appropriate tail node. Essentially we turn
8aa23a47 3532 a nested if into a case structure of sorts.
b515a41d 3533
8aa23a47 3534 */
686b73d4 3535
8aa23a47
YO
3536 int made=0;
3537 if (!re_trie_maxbuff) {
3538 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
3539 if (!SvIOK(re_trie_maxbuff))
3540 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
3541 }
3542 if ( SvIV(re_trie_maxbuff)>=0 ) {
3543 regnode *cur;
3544 regnode *first = (regnode *)NULL;
3545 regnode *last = (regnode *)NULL;
3546 regnode *tail = scan;
fab2782b 3547 U8 trietype = 0;
8aa23a47 3548 U32 count=0;
a3621e74
YO
3549
3550#ifdef DEBUGGING
8aa23a47 3551 SV * const mysv = sv_newmortal(); /* for dumping */
a3621e74 3552#endif
8aa23a47
YO
3553 /* var tail is used because there may be a TAIL
3554 regop in the way. Ie, the exacts will point to the
3555 thing following the TAIL, but the last branch will
3556 point at the TAIL. So we advance tail. If we
3557 have nested (?:) we may have to move through several
3558 tails.
3559 */
3560
3561 while ( OP( tail ) == TAIL ) {
3562 /* this is the TAIL generated by (?:) */
3563 tail = regnext( tail );
3564 }
a3621e74 3565
8aa23a47 3566
df826430 3567 DEBUG_TRIE_COMPILE_r({
8aa23a47
YO
3568 regprop(RExC_rx, mysv, tail );
3569 PerlIO_printf( Perl_debug_log, "%*s%s%s\n",
3570 (int)depth * 2 + 2, "",
3571 "Looking for TRIE'able sequences. Tail node is: ",
3572 SvPV_nolen_const( mysv )
3573 );
3574 });
3575
3576 /*
3577
fab2782b
YO
3578 Step through the branches
3579 cur represents each branch,
3580 noper is the first thing to be matched as part of that branch
3581 noper_next is the regnext() of that node.
3582
3583 We normally handle a case like this /FOO[xyz]|BAR[pqr]/
3584 via a "jump trie" but we also support building with NOJUMPTRIE,
3585 which restricts the trie logic to structures like /FOO|BAR/.
3586
3587 If noper is a trieable nodetype then the branch is a possible optimization
3588 target. If we are building under NOJUMPTRIE then we require that noper_next
3589 is the same as scan (our current position in the regex program).
3590
3591 Once we have two or more consecutive such branches we can create a
3592 trie of the EXACT's contents and stitch it in place into the program.
3593
3594 If the sequence represents all of the branches in the alternation we
3595 replace the entire thing with a single TRIE node.
3596
3597 Otherwise when it is a subsequence we need to stitch it in place and
3598 replace only the relevant branches. This means the first branch has
3599 to remain as it is used by the alternation logic, and its next pointer,
3600 and needs to be repointed at the item on the branch chain following
3601 the last branch we have optimized away.
3602
3603 This could be either a BRANCH, in which case the subsequence is internal,
3604 or it could be the item following the branch sequence in which case the