1 #ifdef PERL_EXT_RE_BUILD
6 #define PERL_IN_REGEX_ENGINE
7 #define PERL_IN_REGCOMP_ANY
8 #define PERL_IN_REGCOMP_STUDY_C
11 #ifdef PERL_IN_XSUB_RE
17 #include "invlist_inline.h"
18 #include "unicode_constants.h"
19 #include "regcomp_internal.h"
21 #define INIT_AND_WITHP \
23 Newx(and_withp, 1, regnode_ssc); \
28 S_unwind_scan_frames(pTHX_ const void *p)
30 PERL_ARGS_ASSERT_UNWIND_SCAN_FRAMES;
31 scan_frame *f= (scan_frame *)p;
33 scan_frame *n= f->next_frame;
39 /* Follow the next-chain of the current node and optimize away
40 all the NOTHINGs from it.
43 S_rck_elide_nothing(pTHX_ regnode *node)
45 PERL_ARGS_ASSERT_RCK_ELIDE_NOTHING;
47 if (OP(node) != CURLYX) {
48 const int max = (REGNODE_OFF_BY_ARG(OP(node))
50 /* I32 may be smaller than U16 on CRAYs! */
51 : (I32_MAX < U16_MAX ? I32_MAX : U16_MAX));
52 int off = (REGNODE_OFF_BY_ARG(OP(node)) ? ARG(node) : NEXT_OFF(node));
56 /* Skip NOTHING and LONGJMP. */
60 (REGNODE_TYPE(OP(n)) == NOTHING && (noff = NEXT_OFF(n)))
61 || ((OP(n) == LONGJMP) && (noff = ARG(n)))
67 if (REGNODE_OFF_BY_ARG(OP(node)))
77 * As best we can, determine the characters that can match the start of
78 * the given EXACTF-ish node. This is for use in creating ssc nodes, so there
79 * can be false positive matches
81 * Returns the invlist as a new SV*; it is the caller's responsibility to
82 * call SvREFCNT_dec() when done with it.
85 S_make_exactf_invlist(pTHX_ RExC_state_t *pRExC_state, regnode *node)
87 const U8 * s = (U8*)STRING(node);
88 SSize_t bytelen = STR_LEN(node);
90 /* Start out big enough for 2 separate code points */
91 SV* invlist = _new_invlist(4);
93 PERL_ARGS_ASSERT_MAKE_EXACTF_INVLIST;
98 /* We punt and assume can match anything if the node begins
99 * with a multi-character fold. Things are complicated. For
100 * example, /ffi/i could match any of:
101 * "\N{LATIN SMALL LIGATURE FFI}"
102 * "\N{LATIN SMALL LIGATURE FF}I"
103 * "F\N{LATIN SMALL LIGATURE FI}"
104 * plus several other things; and making sure we have all the
105 * possibilities is hard. */
106 if (is_MULTI_CHAR_FOLD_latin1_safe(s, s + bytelen)) {
107 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
110 /* Any Latin1 range character can potentially match any
111 * other depending on the locale, and in Turkic locales, 'I' and
112 * 'i' can match U+130 and U+131 */
113 if (OP(node) == EXACTFL) {
114 _invlist_union(invlist, PL_Latin1, &invlist);
115 if (isALPHA_FOLD_EQ(uc, 'I')) {
116 invlist = add_cp_to_invlist(invlist,
117 LATIN_SMALL_LETTER_DOTLESS_I);
118 invlist = add_cp_to_invlist(invlist,
119 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
123 /* But otherwise, it matches at least itself. We can
124 * quickly tell if it has a distinct fold, and if so,
125 * it matches that as well */
126 invlist = add_cp_to_invlist(invlist, uc);
127 if (IS_IN_SOME_FOLD_L1(uc))
128 invlist = add_cp_to_invlist(invlist, PL_fold_latin1[uc]);
131 /* Some characters match above-Latin1 ones under /i. This
132 * is true of EXACTFL ones when the locale is UTF-8 */
133 if (HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(uc)
134 && (! isASCII(uc) || ! inRANGE(OP(node), EXACTFAA,
137 add_above_Latin1_folds(pRExC_state, (U8) uc, &invlist);
141 else { /* Pattern is UTF-8 */
142 U8 folded[UTF8_MAX_FOLD_CHAR_EXPAND * UTF8_MAXBYTES_CASE + 1] = { '\0' };
143 const U8* e = s + bytelen;
146 fc = uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
148 /* The only code points that aren't folded in a UTF EXACTFish
149 * node are the problematic ones in EXACTFL nodes */
150 if (OP(node) == EXACTFL && is_PROBLEMATIC_LOCALE_FOLDEDS_START_cp(uc)) {
151 /* We need to check for the possibility that this EXACTFL
152 * node begins with a multi-char fold. Therefore we fold
153 * the first few characters of it so that we can make that
159 for (i = 0; i < UTF8_MAX_FOLD_CHAR_EXPAND && s < e; i++) {
161 *(d++) = (U8) toFOLD(*s);
162 if (fc < 0) { /* Save the first fold */
169 UV fold = toFOLD_utf8_safe(s, e, d, &len);
170 if (fc < 0) { /* Save the first fold */
178 /* And set up so the code below that looks in this folded
179 * buffer instead of the node's string */
184 /* When we reach here 's' points to the fold of the first
185 * character(s) of the node; and 'e' points to far enough along
186 * the folded string to be just past any possible multi-char
189 * Like the non-UTF case above, we punt if the node begins with a
192 if (is_MULTI_CHAR_FOLD_utf8_safe(s, e)) {
193 invlist = _add_range_to_invlist(invlist, 0, UV_MAX);
195 else { /* Single char fold */
198 const U32 * remaining_folds;
201 /* It matches itself */
202 invlist = add_cp_to_invlist(invlist, fc);
204 /* ... plus all the things that fold to it, which are found in
205 * PL_utf8_foldclosures */
206 folds_count = _inverse_folds(fc, &first_fold,
208 for (k = 0; k < folds_count; k++) {
209 UV c = (k == 0) ? first_fold : remaining_folds[k-1];
211 /* /aa doesn't allow folds between ASCII and non- */
212 if ( inRANGE(OP(node), EXACTFAA, EXACTFAA_NO_TRIE)
213 && isASCII(c) != isASCII(fc))
218 invlist = add_cp_to_invlist(invlist, c);
221 if (OP(node) == EXACTFL) {
223 /* If either [iI] are present in an EXACTFL node the above code
224 * should have added its normal case pair, but under a Turkish
225 * locale they could match instead the case pairs from it. Add
226 * those as potential matches as well */
227 if (isALPHA_FOLD_EQ(fc, 'I')) {
228 invlist = add_cp_to_invlist(invlist,
229 LATIN_SMALL_LETTER_DOTLESS_I);
230 invlist = add_cp_to_invlist(invlist,
231 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
233 else if (fc == LATIN_SMALL_LETTER_DOTLESS_I) {
234 invlist = add_cp_to_invlist(invlist, 'I');
236 else if (fc == LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE) {
237 invlist = add_cp_to_invlist(invlist, 'i');
247 /* Mark that we cannot extend a found fixed substring at this point.
248 Update the longest found anchored substring or the longest found
249 floating substrings if needed. */
252 Perl_scan_commit(pTHX_ const RExC_state_t *pRExC_state, scan_data_t *data,
253 SSize_t *minlenp, int is_inf)
255 const STRLEN l = CHR_SVLEN(data->last_found);
256 SV * const longest_sv = data->substrs[data->cur_is_floating].str;
257 const STRLEN old_l = CHR_SVLEN(longest_sv);
258 DECLARE_AND_GET_RE_DEBUG_FLAGS;
260 PERL_ARGS_ASSERT_SCAN_COMMIT;
262 if ((l >= old_l) && ((l > old_l) || (data->flags & SF_BEFORE_EOL))) {
263 const U8 i = data->cur_is_floating;
264 SvSetMagicSV(longest_sv, data->last_found);
265 data->substrs[i].min_offset = l ? data->last_start_min : data->pos_min;
268 data->substrs[0].max_offset = data->substrs[0].min_offset;
270 data->substrs[1].max_offset =
274 ? data->last_start_max
275 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min
277 : data->pos_min + data->pos_delta));
280 data->substrs[i].flags &= ~SF_BEFORE_EOL;
281 data->substrs[i].flags |= data->flags & SF_BEFORE_EOL;
282 data->substrs[i].minlenp = minlenp;
283 data->substrs[i].lookbehind = 0;
286 SvCUR_set(data->last_found, 0);
288 SV * const sv = data->last_found;
289 if (SvUTF8(sv) && SvMAGICAL(sv)) {
290 MAGIC * const mg = mg_find(sv, PERL_MAGIC_utf8);
296 data->flags &= ~SF_BEFORE_EOL;
297 DEBUG_STUDYDATA("commit", data, 0, is_inf, -1, -1, -1);
300 /* An SSC is just a regnode_charclass_posix with an extra field: the inversion
301 * list that describes which code points it matches */
304 S_ssc_anything(pTHX_ regnode_ssc *ssc)
306 /* Set the SSC 'ssc' to match an empty string or any code point */
308 PERL_ARGS_ASSERT_SSC_ANYTHING;
310 assert(is_ANYOF_SYNTHETIC(ssc));
312 /* mortalize so won't leak */
313 ssc->invlist = sv_2mortal(_add_range_to_invlist(NULL, 0, UV_MAX));
314 ANYOF_FLAGS(ssc) |= SSC_MATCHES_EMPTY_STRING; /* Plus matches empty */
318 S_ssc_is_anything(const regnode_ssc *ssc)
320 /* Returns TRUE if the SSC 'ssc' can match the empty string and any code
321 * point; FALSE otherwise. Thus, this is used to see if using 'ssc' buys
322 * us anything: if the function returns TRUE, 'ssc' hasn't been restricted
323 * in any way, so there's no point in using it */
325 UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */
328 PERL_ARGS_ASSERT_SSC_IS_ANYTHING;
330 assert(is_ANYOF_SYNTHETIC(ssc));
332 if (! (ANYOF_FLAGS(ssc) & SSC_MATCHES_EMPTY_STRING)) {
336 /* See if the list consists solely of the range 0 - Infinity */
337 invlist_iterinit(ssc->invlist);
338 ret = invlist_iternext(ssc->invlist, &start, &end)
342 invlist_iterfinish(ssc->invlist);
348 /* If e.g., both \w and \W are set, matches everything */
349 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
351 for (i = 0; i < ANYOF_POSIXL_MAX; i += 2) {
352 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i+1)) {
362 Perl_ssc_init(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc)
364 /* Initializes the SSC 'ssc'. This includes setting it to match an empty
365 * string, any code point, or any posix class under locale */
367 PERL_ARGS_ASSERT_SSC_INIT;
369 Zero(ssc, 1, regnode_ssc);
370 set_ANYOF_SYNTHETIC(ssc);
371 ARG_SET(ssc, ANYOF_MATCHES_ALL_OUTSIDE_BITMAP_VALUE);
374 /* If any portion of the regex is to operate under locale rules that aren't
375 * fully known at compile time, initialization includes it. The reason
376 * this isn't done for all regexes is that the optimizer was written under
377 * the assumption that locale was all-or-nothing. Given the complexity and
378 * lack of documentation in the optimizer, and that there are inadequate
379 * test cases for locale, many parts of it may not work properly, it is
380 * safest to avoid locale unless necessary. */
381 if (RExC_contains_locale) {
382 ANYOF_POSIXL_SETALL(ssc);
385 ANYOF_POSIXL_ZERO(ssc);
390 S_ssc_is_cp_posixl_init(const RExC_state_t *pRExC_state,
391 const regnode_ssc *ssc)
393 /* Returns TRUE if the SSC 'ssc' is in its initial state with regard only
394 * to the list of code points matched, and locale posix classes; hence does
395 * not check its flags) */
397 UV start = 0, end = 0; /* Initialize due to messages from dumb compiler */
400 PERL_ARGS_ASSERT_SSC_IS_CP_POSIXL_INIT;
402 assert(is_ANYOF_SYNTHETIC(ssc));
404 invlist_iterinit(ssc->invlist);
405 ret = invlist_iternext(ssc->invlist, &start, &end)
409 invlist_iterfinish(ssc->invlist);
415 if (RExC_contains_locale && ! ANYOF_POSIXL_SSC_TEST_ALL_SET(ssc)) {
424 S_get_ANYOF_cp_list_for_ssc(pTHX_ const RExC_state_t *pRExC_state,
425 const regnode_charclass* const node)
427 /* Returns a mortal inversion list defining which code points are matched
428 * by 'node', which is of ANYOF-ish type . Handles complementing the
429 * result if appropriate. If some code points aren't knowable at this
430 * time, the returned list must, and will, contain every code point that is
434 SV* only_utf8_locale_invlist = NULL;
435 bool new_node_has_latin1 = FALSE;
436 const U8 flags = (REGNODE_TYPE(OP(node)) == ANYOF)
440 PERL_ARGS_ASSERT_GET_ANYOF_CP_LIST_FOR_SSC;
442 /* Look at the data structure created by S_set_ANYOF_arg() */
443 if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) {
444 invlist = sv_2mortal(_new_invlist(1));
445 invlist = _add_range_to_invlist(invlist, NUM_ANYOF_CODE_POINTS, UV_MAX);
447 else if (ANYOF_HAS_AUX(node)) {
448 const U32 n = ARG(node);
449 SV * const rv = MUTABLE_SV(RExC_rxi->data->data[n]);
450 AV * const av = MUTABLE_AV(SvRV(rv));
451 SV **const ary = AvARRAY(av);
453 if (av_tindex_skip_len_mg(av) >= DEFERRED_USER_DEFINED_INDEX) {
455 /* Here there are things that won't be known until runtime -- we
456 * have to assume it could be anything */
457 invlist = sv_2mortal(_new_invlist(1));
458 return _add_range_to_invlist(invlist, 0, UV_MAX);
460 else if (ary[INVLIST_INDEX]) {
462 /* Use the node's inversion list */
463 invlist = sv_2mortal(invlist_clone(ary[INVLIST_INDEX], NULL));
466 /* Get the code points valid only under UTF-8 locales */
467 if ( (flags & ANYOFL_FOLD)
468 && av_tindex_skip_len_mg(av) >= ONLY_LOCALE_MATCHES_INDEX)
470 only_utf8_locale_invlist = ary[ONLY_LOCALE_MATCHES_INDEX];
475 invlist = sv_2mortal(_new_invlist(0));
478 /* An ANYOF node contains a bitmap for the first NUM_ANYOF_CODE_POINTS
479 * code points, and an inversion list for the others, but if there are code
480 * points that should match only conditionally on the target string being
481 * UTF-8, those are placed in the inversion list, and not the bitmap.
482 * Since there are circumstances under which they could match, they are
483 * included in the SSC. But if the ANYOF node is to be inverted, we have
484 * to exclude them here, so that when we invert below, the end result
485 * actually does include them. (Think about "\xe0" =~ /[^\xc0]/di;). We
486 * have to do this here before we add the unconditionally matched code
488 if (flags & ANYOF_INVERT) {
489 _invlist_intersection_complement_2nd(invlist,
494 /* Add in the points from the bit map */
495 if (REGNODE_TYPE(OP(node)) == ANYOF){
496 for (unsigned i = 0; i < NUM_ANYOF_CODE_POINTS; i++) {
497 if (ANYOF_BITMAP_TEST(node, i)) {
498 unsigned int start = i++;
500 for (; i < NUM_ANYOF_CODE_POINTS
501 && ANYOF_BITMAP_TEST(node, i); ++i)
505 invlist = _add_range_to_invlist(invlist, start, i-1);
506 new_node_has_latin1 = TRUE;
511 /* If this can match all upper Latin1 code points, have to add them
512 * as well. But don't add them if inverting, as when that gets done below,
513 * it would exclude all these characters, including the ones it shouldn't
514 * that were added just above */
515 if ( ! (flags & ANYOF_INVERT)
516 && OP(node) == ANYOFD
517 && (flags & ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared))
519 _invlist_union(invlist, PL_UpperLatin1, &invlist);
522 /* Similarly for these */
523 if (ANYOF_MATCHES_ALL_OUTSIDE_BITMAP(node)) {
524 _invlist_union_complement_2nd(invlist, PL_InBitmap, &invlist);
527 if (flags & ANYOF_INVERT) {
528 _invlist_invert(invlist);
530 else if (flags & ANYOFL_FOLD) {
531 if (new_node_has_latin1) {
533 /* These folds are potential in Turkic locales */
534 if (_invlist_contains_cp(invlist, 'i')) {
535 invlist = add_cp_to_invlist(invlist,
536 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE);
538 if (_invlist_contains_cp(invlist, 'I')) {
539 invlist = add_cp_to_invlist(invlist,
540 LATIN_SMALL_LETTER_DOTLESS_I);
543 /* Under /li, any 0-255 could fold to any other 0-255, depending on
544 * the locale. We can skip this if there are no 0-255 at all. */
545 _invlist_union(invlist, PL_Latin1, &invlist);
548 if (_invlist_contains_cp(invlist, LATIN_SMALL_LETTER_DOTLESS_I)) {
549 invlist = add_cp_to_invlist(invlist, 'I');
551 if (_invlist_contains_cp(invlist,
552 LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE))
554 invlist = add_cp_to_invlist(invlist, 'i');
559 /* Similarly add the UTF-8 locale possible matches. These have to be
560 * deferred until after the non-UTF-8 locale ones are taken care of just
561 * above, or it leads to wrong results under ANYOF_INVERT */
562 if (only_utf8_locale_invlist) {
563 _invlist_union_maybe_complement_2nd(invlist,
564 only_utf8_locale_invlist,
565 flags & ANYOF_INVERT,
572 /* 'AND' a given class with another one. Can create false positives. 'ssc'
573 * should not be inverted. */
576 S_ssc_and(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
577 const regnode_charclass *and_with)
579 /* Accumulate into SSC 'ssc' its 'AND' with 'and_with', which is either
580 * another SSC or a regular ANYOF class. Can create false positives. */
583 U8 and_with_flags = (REGNODE_TYPE(OP(and_with)) == ANYOF)
584 ? ANYOF_FLAGS(and_with)
588 PERL_ARGS_ASSERT_SSC_AND;
590 assert(is_ANYOF_SYNTHETIC(ssc));
592 /* 'and_with' is used as-is if it too is an SSC; otherwise have to extract
593 * the code point inversion list and just the relevant flags */
594 if (is_ANYOF_SYNTHETIC(and_with)) {
595 anded_cp_list = ((regnode_ssc *)and_with)->invlist;
596 anded_flags = and_with_flags;
598 /* XXX This is a kludge around what appears to be deficiencies in the
599 * optimizer. If we make S_ssc_anything() add in the WARN_SUPER flag,
600 * there are paths through the optimizer where it doesn't get weeded
601 * out when it should. And if we don't make some extra provision for
602 * it like the code just below, it doesn't get added when it should.
603 * This solution is to add it only when AND'ing, which is here, and
604 * only when what is being AND'ed is the pristine, original node
605 * matching anything. Thus it is like adding it to ssc_anything() but
606 * only when the result is to be AND'ed. Probably the same solution
607 * could be adopted for the same problem we have with /l matching,
608 * which is solved differently in S_ssc_init(), and that would lead to
609 * fewer false positives than that solution has. But if this solution
610 * creates bugs, the consequences are only that a warning isn't raised
611 * that should be; while the consequences for having /l bugs is
612 * incorrect matches */
613 if (ssc_is_anything((regnode_ssc *)and_with)) {
614 anded_flags |= ANYOF_WARN_SUPER__shared;
618 anded_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, and_with);
619 if (OP(and_with) == ANYOFD) {
620 anded_flags = and_with_flags & ANYOF_COMMON_FLAGS;
623 anded_flags = and_with_flags
624 & ( ANYOF_COMMON_FLAGS
625 |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
626 |ANYOF_HAS_EXTRA_RUNTIME_MATCHES);
627 if (and_with_flags & ANYOFL_UTF8_LOCALE_REQD) {
628 anded_flags &= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
633 ANYOF_FLAGS(ssc) &= anded_flags;
635 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
636 * C2 is the list of code points in 'and-with'; P2, its posix classes.
637 * 'and_with' may be inverted. When not inverted, we have the situation of
639 * (C1 | P1) & (C2 | P2)
640 * = (C1 & (C2 | P2)) | (P1 & (C2 | P2))
641 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
642 * <= ((C1 & C2) | P2)) | ( P1 | (P1 & P2))
643 * <= ((C1 & C2) | P1 | P2)
644 * Alternatively, the last few steps could be:
645 * = ((C1 & C2) | (C1 & P2)) | ((P1 & C2) | (P1 & P2))
646 * <= ((C1 & C2) | C1 ) | ( C2 | (P1 & P2))
647 * <= (C1 | C2 | (P1 & P2))
648 * We favor the second approach if either P1 or P2 is non-empty. This is
649 * because these components are a barrier to doing optimizations, as what
650 * they match cannot be known until the moment of matching as they are
651 * dependent on the current locale, 'AND"ing them likely will reduce or
653 * But we can do better if we know that C1,P1 are in their initial state (a
654 * frequent occurrence), each matching everything:
655 * (<everything>) & (C2 | P2) = C2 | P2
656 * Similarly, if C2,P2 are in their initial state (again a frequent
657 * occurrence), the result is a no-op
658 * (C1 | P1) & (<everything>) = C1 | P1
661 * (C1 | P1) & ~(C2 | P2) = (C1 | P1) & (~C2 & ~P2)
662 * = (C1 & (~C2 & ~P2)) | (P1 & (~C2 & ~P2))
663 * <= (C1 & ~C2) | (P1 & ~P2)
666 if ((and_with_flags & ANYOF_INVERT)
667 && ! is_ANYOF_SYNTHETIC(and_with))
671 ssc_intersection(ssc,
673 FALSE /* Has already been inverted */
676 /* If either P1 or P2 is empty, the intersection will be also; can skip
678 if (! (and_with_flags & ANYOF_MATCHES_POSIXL)) {
679 ANYOF_POSIXL_ZERO(ssc);
681 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
683 /* Note that the Posix class component P from 'and_with' actually
685 * P = Pa | Pb | ... | Pn
686 * where each component is one posix class, such as in [\w\s].
688 * ~P = ~(Pa | Pb | ... | Pn)
689 * = ~Pa & ~Pb & ... & ~Pn
690 * <= ~Pa | ~Pb | ... | ~Pn
691 * The last is something we can easily calculate, but unfortunately
692 * is likely to have many false positives. We could do better
693 * in some (but certainly not all) instances if two classes in
694 * P have known relationships. For example
695 * :lower: <= :alpha: <= :alnum: <= \w <= :graph: <= :print:
697 * :lower: & :print: = :lower:
698 * And similarly for classes that must be disjoint. For example,
699 * since \s and \w can have no elements in common based on rules in
700 * the POSIX standard,
702 * Unfortunately, some vendor locales do not meet the Posix
703 * standard, in particular almost everything by Microsoft.
704 * The loop below just changes e.g., \w into \W and vice versa */
706 regnode_charclass_posixl temp;
707 int add = 1; /* To calculate the index of the complement */
709 Zero(&temp, 1, regnode_charclass_posixl);
710 ANYOF_POSIXL_ZERO(&temp);
711 for (i = 0; i < ANYOF_MAX; i++) {
713 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)
714 || ! ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i + 1));
716 if (ANYOF_POSIXL_TEST((regnode_charclass_posixl*) and_with, i)) {
717 ANYOF_POSIXL_SET(&temp, i + add);
719 add = 0 - add; /* 1 goes to -1; -1 goes to 1 */
721 ANYOF_POSIXL_AND(&temp, ssc);
723 } /* else ssc already has no posixes */
724 } /* else: Not inverted. This routine is a no-op if 'and_with' is an SSC
725 in its initial state */
726 else if (! is_ANYOF_SYNTHETIC(and_with)
727 || ! ssc_is_cp_posixl_init(pRExC_state, (regnode_ssc *)and_with))
729 /* But if 'ssc' is in its initial state, the result is just 'and_with';
730 * copy it over 'ssc' */
731 if (ssc_is_cp_posixl_init(pRExC_state, ssc)) {
732 if (is_ANYOF_SYNTHETIC(and_with)) {
733 StructCopy(and_with, ssc, regnode_ssc);
736 ssc->invlist = anded_cp_list;
737 ANYOF_POSIXL_ZERO(ssc);
738 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
739 ANYOF_POSIXL_OR((regnode_charclass_posixl*) and_with, ssc);
743 else if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)
744 || (and_with_flags & ANYOF_MATCHES_POSIXL))
746 /* One or the other of P1, P2 is non-empty. */
747 if (and_with_flags & ANYOF_MATCHES_POSIXL) {
748 ANYOF_POSIXL_AND((regnode_charclass_posixl*) and_with, ssc);
750 ssc_union(ssc, anded_cp_list, FALSE);
752 else { /* P1 = P2 = empty */
753 ssc_intersection(ssc, anded_cp_list, FALSE);
759 S_ssc_or(pTHX_ const RExC_state_t *pRExC_state, regnode_ssc *ssc,
760 const regnode_charclass *or_with)
762 /* Accumulate into SSC 'ssc' its 'OR' with 'or_with', which is either
763 * another SSC or a regular ANYOF class. Can create false positives if
764 * 'or_with' is to be inverted. */
768 U8 or_with_flags = (REGNODE_TYPE(OP(or_with)) == ANYOF)
769 ? ANYOF_FLAGS(or_with)
772 PERL_ARGS_ASSERT_SSC_OR;
774 assert(is_ANYOF_SYNTHETIC(ssc));
776 /* 'or_with' is used as-is if it too is an SSC; otherwise have to extract
777 * the code point inversion list and just the relevant flags */
778 if (is_ANYOF_SYNTHETIC(or_with)) {
779 ored_cp_list = ((regnode_ssc*) or_with)->invlist;
780 ored_flags = or_with_flags;
783 ored_cp_list = get_ANYOF_cp_list_for_ssc(pRExC_state, or_with);
784 ored_flags = or_with_flags & ANYOF_COMMON_FLAGS;
785 if (OP(or_with) != ANYOFD) {
787 or_with_flags & ( ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
788 |ANYOF_HAS_EXTRA_RUNTIME_MATCHES);
789 if (or_with_flags & ANYOFL_UTF8_LOCALE_REQD) {
790 ored_flags |= ANYOF_HAS_EXTRA_RUNTIME_MATCHES;
795 ANYOF_FLAGS(ssc) |= ored_flags;
797 /* Below, C1 is the list of code points in 'ssc'; P1, its posix classes.
798 * C2 is the list of code points in 'or-with'; P2, its posix classes.
799 * 'or_with' may be inverted. When not inverted, we have the simple
800 * situation of computing:
801 * (C1 | P1) | (C2 | P2) = (C1 | C2) | (P1 | P2)
802 * If P1|P2 yields a situation with both a class and its complement are
803 * set, like having both \w and \W, this matches all code points, and we
804 * can delete these from the P component of the ssc going forward. XXX We
805 * might be able to delete all the P components, but I (khw) am not certain
806 * about this, and it is better to be safe.
809 * (C1 | P1) | ~(C2 | P2) = (C1 | P1) | (~C2 & ~P2)
812 * (which results in actually simpler code than the non-inverted case)
815 if ((or_with_flags & ANYOF_INVERT)
816 && ! is_ANYOF_SYNTHETIC(or_with))
818 /* We ignore P2, leaving P1 going forward */
819 } /* else Not inverted */
820 else if (or_with_flags & ANYOF_MATCHES_POSIXL) {
821 ANYOF_POSIXL_OR((regnode_charclass_posixl*)or_with, ssc);
822 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
824 for (i = 0; i < ANYOF_MAX; i += 2) {
825 if (ANYOF_POSIXL_TEST(ssc, i) && ANYOF_POSIXL_TEST(ssc, i + 1))
827 ssc_match_all_cp(ssc);
828 ANYOF_POSIXL_CLEAR(ssc, i);
829 ANYOF_POSIXL_CLEAR(ssc, i+1);
837 FALSE /* Already has been inverted */
842 S_ssc_union(pTHX_ regnode_ssc *ssc, SV* const invlist, const bool invert2nd)
844 PERL_ARGS_ASSERT_SSC_UNION;
846 assert(is_ANYOF_SYNTHETIC(ssc));
848 _invlist_union_maybe_complement_2nd(ssc->invlist,
855 S_ssc_intersection(pTHX_ regnode_ssc *ssc,
857 const bool invert2nd)
859 PERL_ARGS_ASSERT_SSC_INTERSECTION;
861 assert(is_ANYOF_SYNTHETIC(ssc));
863 _invlist_intersection_maybe_complement_2nd(ssc->invlist,
870 S_ssc_add_range(pTHX_ regnode_ssc *ssc, const UV start, const UV end)
872 PERL_ARGS_ASSERT_SSC_ADD_RANGE;
874 assert(is_ANYOF_SYNTHETIC(ssc));
876 ssc->invlist = _add_range_to_invlist(ssc->invlist, start, end);
880 S_ssc_cp_and(pTHX_ regnode_ssc *ssc, const UV cp)
882 /* AND just the single code point 'cp' into the SSC 'ssc' */
884 SV* cp_list = _new_invlist(2);
886 PERL_ARGS_ASSERT_SSC_CP_AND;
888 assert(is_ANYOF_SYNTHETIC(ssc));
890 cp_list = add_cp_to_invlist(cp_list, cp);
891 ssc_intersection(ssc, cp_list,
892 FALSE /* Not inverted */
894 SvREFCNT_dec_NN(cp_list);
898 S_ssc_clear_locale(regnode_ssc *ssc)
900 /* Set the SSC 'ssc' to not match any locale things */
901 PERL_ARGS_ASSERT_SSC_CLEAR_LOCALE;
903 assert(is_ANYOF_SYNTHETIC(ssc));
905 ANYOF_POSIXL_ZERO(ssc);
906 ANYOF_FLAGS(ssc) &= ~ANYOF_LOCALE_FLAGS;
910 Perl_is_ssc_worth_it(const RExC_state_t * pRExC_state, const regnode_ssc * ssc)
912 /* The synthetic start class is used to hopefully quickly winnow down
913 * places where a pattern could start a match in the target string. If it
914 * doesn't really narrow things down that much, there isn't much point to
915 * having the overhead of using it. This function uses some very crude
916 * heuristics to decide if to use the ssc or not.
918 * It returns TRUE if 'ssc' rules out more than half what it considers to
919 * be the "likely" possible matches, but of course it doesn't know what the
920 * actual things being matched are going to be; these are only guesses
922 * For /l matches, it assumes that the only likely matches are going to be
923 * in the 0-255 range, uniformly distributed, so half of that is 127
924 * For /a and /d matches, it assumes that the likely matches will be just
925 * the ASCII range, so half of that is 63
926 * For /u and there isn't anything matching above the Latin1 range, it
927 * assumes that that is the only range likely to be matched, and uses
928 * half that as the cut-off: 127. If anything matches above Latin1,
929 * it assumes that all of Unicode could match (uniformly), except for
930 * non-Unicode code points and things in the General Category "Other"
931 * (unassigned, private use, surrogates, controls and formats). This
932 * is a much large number. */
934 U32 count = 0; /* Running total of number of code points matched by
936 UV start, end; /* Start and end points of current range in inversion
937 XXX outdated. UTF-8 locales are common, what about invert? list */
938 const U32 max_code_points = (LOC)
941 || invlist_highest(ssc->invlist) < 256)
944 const U32 max_match = max_code_points / 2;
946 PERL_ARGS_ASSERT_IS_SSC_WORTH_IT;
948 invlist_iterinit(ssc->invlist);
949 while (invlist_iternext(ssc->invlist, &start, &end)) {
950 if (start >= max_code_points) {
953 end = MIN(end, max_code_points - 1);
954 count += end - start + 1;
955 if (count >= max_match) {
956 invlist_iterfinish(ssc->invlist);
966 Perl_ssc_finalize(pTHX_ RExC_state_t *pRExC_state, regnode_ssc *ssc)
968 /* The inversion list in the SSC is marked mortal; now we need a more
969 * permanent copy, which is stored the same way that is done in a regular
970 * ANYOF node, with the first NUM_ANYOF_CODE_POINTS code points in a bit
973 SV* invlist = invlist_clone(ssc->invlist, NULL);
975 PERL_ARGS_ASSERT_SSC_FINALIZE;
977 assert(is_ANYOF_SYNTHETIC(ssc));
979 /* The code in this file assumes that all but these flags aren't relevant
980 * to the SSC, except SSC_MATCHES_EMPTY_STRING, which should be cleared
981 * by the time we reach here */
982 assert(! (ANYOF_FLAGS(ssc)
983 & ~( ANYOF_COMMON_FLAGS
984 |ANYOFD_NON_UTF8_MATCHES_ALL_NON_ASCII__shared
985 |ANYOF_HAS_EXTRA_RUNTIME_MATCHES)));
987 populate_anyof_bitmap_from_invlist( (regnode *) ssc, &invlist);
989 set_ANYOF_arg(pRExC_state, (regnode *) ssc, invlist, NULL, NULL);
990 SvREFCNT_dec(invlist);
992 /* Make sure is clone-safe */
995 if (ANYOF_POSIXL_SSC_TEST_ANY_SET(ssc)) {
996 ANYOF_FLAGS(ssc) |= ANYOF_MATCHES_POSIXL;
997 OP(ssc) = ANYOFPOSIXL;
999 else if (RExC_contains_locale) {
1003 assert(! (ANYOF_FLAGS(ssc) & ANYOF_LOCALE_FLAGS) || RExC_contains_locale);
1006 /* The below joins as many adjacent EXACTish nodes as possible into a single
1007 * one. The regop may be changed if the node(s) contain certain sequences that
1008 * require special handling. The joining is only done if:
1009 * 1) there is room in the current conglomerated node to entirely contain the
1011 * 2) they are compatible node types
1013 * The adjacent nodes actually may be separated by NOTHING-kind nodes, and
1014 * these get optimized out
1016 * XXX khw thinks this should be enhanced to fill EXACT (at least) nodes as full
1017 * as possible, even if that means splitting an existing node so that its first
1018 * part is moved to the preceding node. This would maximise the efficiency of
1019 * memEQ during matching.
1021 * If a node is to match under /i (folded), the number of characters it matches
1022 * can be different than its character length if it contains a multi-character
1023 * fold. *min_subtract is set to the total delta number of characters of the
1026 * And *unfolded_multi_char is set to indicate whether or not the node contains
1027 * an unfolded multi-char fold. This happens when it won't be known until
1028 * runtime whether the fold is valid or not; namely
1029 * 1) for EXACTF nodes that contain LATIN SMALL LETTER SHARP S, as only if the
1030 * target string being matched against turns out to be UTF-8 is that fold
1032 * 2) for EXACTFL nodes whose folding rules depend on the locale in force at
1034 * (Multi-char folds whose components are all above the Latin1 range are not
1035 * run-time locale dependent, and have already been folded by the time this
1036 * function is called.)
1038 * This is as good a place as any to discuss the design of handling these
1039 * multi-character fold sequences. It's been wrong in Perl for a very long
1040 * time. There are three code points in Unicode whose multi-character folds
1041 * were long ago discovered to mess things up. The previous designs for
1042 * dealing with these involved assigning a special node for them. This
1043 * approach doesn't always work, as evidenced by this example:
1044 * "\xDFs" =~ /s\xDF/ui # Used to fail before these patches
1045 * Both sides fold to "sss", but if the pattern is parsed to create a node that
1046 * would match just the \xDF, it won't be able to handle the case where a
1047 * successful match would have to cross the node's boundary. The new approach
1048 * that hopefully generally solves the problem generates an EXACTFUP node
1049 * that is "sss" in this case.
1051 * It turns out that there are problems with all multi-character folds, and not
1052 * just these three. Now the code is general, for all such cases. The
1053 * approach taken is:
1054 * 1) This routine examines each EXACTFish node that could contain multi-
1055 * character folded sequences. Since a single character can fold into
1056 * such a sequence, the minimum match length for this node is less than
1057 * the number of characters in the node. This routine returns in
1058 * *min_subtract how many characters to subtract from the actual
1059 * length of the string to get a real minimum match length; it is 0 if
1060 * there are no multi-char foldeds. This delta is used by the caller to
1061 * adjust the min length of the match, and the delta between min and max,
1062 * so that the optimizer doesn't reject these possibilities based on size
1065 * 2) For the sequence involving the LATIN SMALL LETTER SHARP S (U+00DF)
1066 * under /u, we fold it to 'ss' in regatom(), and in this routine, after
1067 * joining, we scan for occurrences of the sequence 'ss' in non-UTF-8
1068 * EXACTFU nodes. The node type of such nodes is then changed to
1069 * EXACTFUP, indicating it is problematic, and needs careful handling.
1070 * (The procedures in step 1) above are sufficient to handle this case in
1071 * UTF-8 encoded nodes.) The reason this is problematic is that this is
1072 * the only case where there is a possible fold length change in non-UTF-8
1073 * patterns. By reserving a special node type for problematic cases, the
1074 * far more common regular EXACTFU nodes can be processed faster.
1075 * regexec.c takes advantage of this.
1077 * EXACTFUP has been created as a grab-bag for (hopefully uncommon)
1078 * problematic cases. These all only occur when the pattern is not
1079 * UTF-8. In addition to the 'ss' sequence where there is a possible fold
1080 * length change, it handles the situation where the string cannot be
1081 * entirely folded. The strings in an EXACTFish node are folded as much
1082 * as possible during compilation in regcomp.c. This saves effort in
1083 * regex matching. By using an EXACTFUP node when it is not possible to
1084 * fully fold at compile time, regexec.c can know that everything in an
1085 * EXACTFU node is folded, so folding can be skipped at runtime. The only
1086 * case where folding in EXACTFU nodes can't be done at compile time is
1087 * the presumably uncommon MICRO SIGN, when the pattern isn't UTF-8. This
1088 * is because its fold requires UTF-8 to represent. Thus EXACTFUP nodes
1089 * handle two very different cases. Alternatively, there could have been
1090 * a node type where there are length changes, one for unfolded, and one
1091 * for both. If yet another special case needed to be created, the number
1092 * of required node types would have to go to 7. khw figures that even
1093 * though there are plenty of node types to spare, that the maintenance
1094 * cost wasn't worth the small speedup of doing it that way, especially
1095 * since he thinks the MICRO SIGN is rarely encountered in practice.
1097 * There are other cases where folding isn't done at compile time, but
1098 * none of them are under /u, and hence not for EXACTFU nodes. The folds
1099 * in EXACTFL nodes aren't known until runtime, and vary as the locale
1100 * changes. Some folds in EXACTF depend on if the runtime target string
1101 * is UTF-8 or not. (regatom() will create an EXACTFU node even under /di
1102 * when no fold in it depends on the UTF-8ness of the target string.)
1104 * 3) A problem remains for unfolded multi-char folds. (These occur when the
1105 * validity of the fold won't be known until runtime, and so must remain
1106 * unfolded for now. This happens for the sharp s in EXACTF and EXACTFAA
1107 * nodes when the pattern isn't in UTF-8. (Note, BTW, that there cannot
1108 * be an EXACTF node with a UTF-8 pattern.) They also occur for various
1109 * folds in EXACTFL nodes, regardless of the UTF-ness of the pattern.)
1110 * The reason this is a problem is that the optimizer part of regexec.c
1111 * (probably unwittingly, in Perl_regexec_flags()) makes an assumption
1112 * that a character in the pattern corresponds to at most a single
1113 * character in the target string. (And I do mean character, and not byte
1114 * here, unlike other parts of the documentation that have never been
1115 * updated to account for multibyte Unicode.) Sharp s in EXACTF and
1116 * EXACTFL nodes can match the two character string 'ss'; in EXACTFAA
1117 * nodes it can match "\x{17F}\x{17F}". These, along with other ones in
1118 * EXACTFL nodes, violate the assumption, and they are the only instances
1119 * where it is violated. I'm reluctant to try to change the assumption,
1120 * as the code involved is impenetrable to me (khw), so instead the code
1121 * here punts. This routine examines EXACTFL nodes, and (when the pattern
1122 * isn't UTF-8) EXACTF and EXACTFAA for such unfolded folds, and returns a
1123 * boolean indicating whether or not the node contains such a fold. When
1124 * it is true, the caller sets a flag that later causes the optimizer in
1125 * this file to not set values for the floating and fixed string lengths,
1126 * and thus avoids the optimizer code in regexec.c that makes the invalid
1127 * assumption. Thus, there is no optimization based on string lengths for
1128 * EXACTFL nodes that contain these few folds, nor for non-UTF8-pattern
1129 * EXACTF and EXACTFAA nodes that contain the sharp s. (The reason the
1130 * assumption is wrong only in these cases is that all other non-UTF-8
1131 * folds are 1-1; and, for UTF-8 patterns, we pre-fold all other folds to
1132 * their expanded versions. (Again, we can't prefold sharp s to 'ss' in
1133 * EXACTF nodes because we don't know at compile time if it actually
1134 * matches 'ss' or not. For EXACTF nodes it will match iff the target
1135 * string is in UTF-8. This is in contrast to EXACTFU nodes, where it
1136 * always matches; and EXACTFAA where it never does. In an EXACTFAA node
1137 * in a UTF-8 pattern, sharp s is folded to "\x{17F}\x{17F}, avoiding the
1138 * problem; but in a non-UTF8 pattern, folding it to that above-Latin1
1139 * string would require the pattern to be forced into UTF-8, the overhead
1140 * of which we want to avoid. Similarly the unfolded multi-char folds in
1141 * EXACTFL nodes will match iff the locale at the time of match is a UTF-8
1144 * Similarly, the code that generates tries doesn't currently handle
1145 * not-already-folded multi-char folds, and it looks like a pain to change
1146 * that. Therefore, trie generation of EXACTFAA nodes with the sharp s
1147 * doesn't work. Instead, such an EXACTFAA is turned into a new regnode,
1148 * EXACTFAA_NO_TRIE, which the trie code knows not to handle. Most people
1149 * using /iaa matching will be doing so almost entirely with ASCII
1150 * strings, so this should rarely be encountered in practice */
1153 Perl_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
1154 UV *min_subtract, bool *unfolded_multi_char,
1155 U32 flags, regnode *val, U32 depth)
1157 /* Merge several consecutive EXACTish nodes into one. */
1159 regnode *n = regnext(scan);
1161 regnode *next = REGNODE_AFTER_varies(scan);
1165 regnode *stop = scan;
1166 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1168 PERL_UNUSED_ARG(depth);
1171 PERL_ARGS_ASSERT_JOIN_EXACT;
1172 #ifndef EXPERIMENTAL_INPLACESCAN
1173 PERL_UNUSED_ARG(flags);
1174 PERL_UNUSED_ARG(val);
1176 DEBUG_PEEP("join", scan, depth, 0);
1178 assert(REGNODE_TYPE(OP(scan)) == EXACT);
1180 /* Look through the subsequent nodes in the chain. Skip NOTHING, merge
1181 * EXACT ones that are mergeable to the current one. */
1183 && ( REGNODE_TYPE(OP(n)) == NOTHING
1184 || (stringok && REGNODE_TYPE(OP(n)) == EXACT))
1186 && NEXT_OFF(scan) + NEXT_OFF(n) < I16_MAX)
1189 if (OP(n) == TAIL || n > next)
1191 if (REGNODE_TYPE(OP(n)) == NOTHING) {
1192 DEBUG_PEEP("skip:", n, depth, 0);
1193 NEXT_OFF(scan) += NEXT_OFF(n);
1194 next = n + NODE_STEP_REGNODE;
1201 else if (stringok) {
1202 const unsigned int oldl = STR_LEN(scan);
1203 regnode * const nnext = regnext(n);
1205 /* XXX I (khw) kind of doubt that this works on platforms (should
1206 * Perl ever run on one) where U8_MAX is above 255 because of lots
1207 * of other assumptions */
1208 /* Don't join if the sum can't fit into a single node */
1209 if (oldl + STR_LEN(n) > U8_MAX)
1212 /* Joining something that requires UTF-8 with something that
1213 * doesn't, means the result requires UTF-8. */
1214 if (OP(scan) == EXACT && (OP(n) == EXACT_REQ8)) {
1215 OP(scan) = EXACT_REQ8;
1217 else if (OP(scan) == EXACT_REQ8 && (OP(n) == EXACT)) {
1218 ; /* join is compatible, no need to change OP */
1220 else if ((OP(scan) == EXACTFU) && (OP(n) == EXACTFU_REQ8)) {
1221 OP(scan) = EXACTFU_REQ8;
1223 else if ((OP(scan) == EXACTFU_REQ8) && (OP(n) == EXACTFU)) {
1224 ; /* join is compatible, no need to change OP */
1226 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU) {
1227 ; /* join is compatible, no need to change OP */
1229 else if (OP(scan) == EXACTFU && OP(n) == EXACTFU_S_EDGE) {
1231 /* Under /di, temporary EXACTFU_S_EDGE nodes are generated,
1232 * which can join with EXACTFU ones. We check for this case
1233 * here. These need to be resolved to either EXACTFU or
1234 * EXACTF at joining time. They have nothing in them that
1235 * would forbid them from being the more desirable EXACTFU
1236 * nodes except that they begin and/or end with a single [Ss].
1237 * The reason this is problematic is because they could be
1238 * joined in this loop with an adjacent node that ends and/or
1239 * begins with [Ss] which would then form the sequence 'ss',
1240 * which matches differently under /di than /ui, in which case
1241 * EXACTFU can't be used. If the 'ss' sequence doesn't get
1242 * formed, the nodes get absorbed into any adjacent EXACTFU
1243 * node. And if the only adjacent node is EXACTF, they get
1244 * absorbed into that, under the theory that a longer node is
1245 * better than two shorter ones, even if one is EXACTFU. Note
1246 * that EXACTFU_REQ8 is generated only for UTF-8 patterns,
1247 * and the EXACTFU_S_EDGE ones only for non-UTF-8. */
1249 if (STRING(n)[STR_LEN(n)-1] == 's') {
1251 /* Here the joined node would end with 's'. If the node
1252 * following the combination is an EXACTF one, it's better to
1253 * join this trailing edge 's' node with that one, leaving the
1254 * current one in 'scan' be the more desirable EXACTFU */
1255 if (OP(nnext) == EXACTF) {
1259 OP(scan) = EXACTFU_S_EDGE;
1261 } /* Otherwise, the beginning 's' of the 2nd node just
1262 becomes an interior 's' in 'scan' */
1264 else if (OP(scan) == EXACTF && OP(n) == EXACTF) {
1265 ; /* join is compatible, no need to change OP */
1267 else if (OP(scan) == EXACTF && OP(n) == EXACTFU_S_EDGE) {
1269 /* EXACTF nodes are compatible for joining with EXACTFU_S_EDGE
1270 * nodes. But the latter nodes can be also joined with EXACTFU
1271 * ones, and that is a better outcome, so if the node following
1272 * 'n' is EXACTFU, quit now so that those two can be joined
1274 if (OP(nnext) == EXACTFU) {
1278 /* The join is compatible, and the combined node will be
1279 * EXACTF. (These don't care if they begin or end with 's' */
1281 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU_S_EDGE) {
1282 if ( STRING(scan)[STR_LEN(scan)-1] == 's'
1283 && STRING(n)[0] == 's')
1285 /* When combined, we have the sequence 'ss', which means we
1286 * have to remain /di */
1290 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTFU) {
1291 if (STRING(n)[0] == 's') {
1292 ; /* Here the join is compatible and the combined node
1293 starts with 's', no need to change OP */
1295 else { /* Now the trailing 's' is in the interior */
1299 else if (OP(scan) == EXACTFU_S_EDGE && OP(n) == EXACTF) {
1301 /* The join is compatible, and the combined node will be
1302 * EXACTF. (These don't care if they begin or end with 's' */
1305 else if (OP(scan) != OP(n)) {
1307 /* The only other compatible joinings are the same node type */
1311 DEBUG_PEEP("merg", n, depth, 0);
1314 next = REGNODE_AFTER_varies(n);
1315 NEXT_OFF(scan) += NEXT_OFF(n);
1316 assert( ( STR_LEN(scan) + STR_LEN(n) ) < 256 );
1317 setSTR_LEN(scan, (U8)(STR_LEN(scan) + STR_LEN(n)));
1318 /* Now we can overwrite *n : */
1319 Move(STRING(n), STRING(scan) + oldl, STR_LEN(n), char);
1327 #ifdef EXPERIMENTAL_INPLACESCAN
1328 if (flags && !NEXT_OFF(n)) {
1329 DEBUG_PEEP("atch", val, depth, 0);
1330 if (REGNODE_OFF_BY_ARG(OP(n))) {
1331 ARG_SET(n, val - n);
1334 NEXT_OFF(n) = val - n;
1341 /* This temporary node can now be turned into EXACTFU, and must, as
1342 * regexec.c doesn't handle it */
1343 if (OP(scan) == EXACTFU_S_EDGE) {
1348 *unfolded_multi_char = FALSE;
1350 /* Here, all the adjacent mergeable EXACTish nodes have been merged. We
1351 * can now analyze for sequences of problematic code points. (Prior to
1352 * this final joining, sequences could have been split over boundaries, and
1353 * hence missed). The sequences only happen in folding, hence for any
1354 * non-EXACT EXACTish node */
1355 if (OP(scan) != EXACT && OP(scan) != EXACT_REQ8 && OP(scan) != EXACTL) {
1356 U8* s0 = (U8*) STRING(scan);
1358 U8* s_end = s0 + STR_LEN(scan);
1360 int total_count_delta = 0; /* Total delta number of characters that
1361 multi-char folds expand to */
1363 /* One pass is made over the node's string looking for all the
1364 * possibilities. To avoid some tests in the loop, there are two main
1365 * cases, for UTF-8 patterns (which can't have EXACTF nodes) and
1370 if (OP(scan) == EXACTFL) {
1373 /* An EXACTFL node would already have been changed to another
1374 * node type unless there is at least one character in it that
1375 * is problematic; likely a character whose fold definition
1376 * won't be known until runtime, and so has yet to be folded.
1377 * For all but the UTF-8 locale, folds are 1-1 in length, but
1378 * to handle the UTF-8 case, we need to create a temporary
1379 * folded copy using UTF-8 locale rules in order to analyze it.
1380 * This is because our macros that look to see if a sequence is
1381 * a multi-char fold assume everything is folded (otherwise the
1382 * tests in those macros would be too complicated and slow).
1383 * Note that here, the non-problematic folds will have already
1384 * been done, so we can just copy such characters. We actually
1385 * don't completely fold the EXACTFL string. We skip the
1386 * unfolded multi-char folds, as that would just create work
1387 * below to figure out the size they already are */
1389 Newx(folded, UTF8_MAX_FOLD_CHAR_EXPAND * STR_LEN(scan) + 1, U8);
1392 STRLEN s_len = UTF8SKIP(s);
1393 if (! is_PROBLEMATIC_LOCALE_FOLD_utf8(s)) {
1394 Copy(s, d, s_len, U8);
1397 else if (is_FOLDS_TO_MULTI_utf8(s)) {
1398 *unfolded_multi_char = TRUE;
1399 Copy(s, d, s_len, U8);
1402 else if (isASCII(*s)) {
1403 *(d++) = toFOLD(*s);
1407 _toFOLD_utf8_flags(s, s_end, d, &len, FOLD_FLAGS_FULL);
1413 /* Point the remainder of the routine to look at our temporary
1417 } /* End of creating folded copy of EXACTFL string */
1419 /* Examine the string for a multi-character fold sequence. UTF-8
1420 * patterns have all characters pre-folded by the time this code is
1422 while (s < s_end - 1) /* Can stop 1 before the end, as minimum
1423 length sequence we are looking for is 2 */
1425 int count = 0; /* How many characters in a multi-char fold */
1426 int len = is_MULTI_CHAR_FOLD_utf8_safe(s, s_end);
1427 if (! len) { /* Not a multi-char fold: get next char */
1432 { /* Here is a generic multi-char fold. */
1433 U8* multi_end = s + len;
1435 /* Count how many characters are in it. In the case of
1436 * /aa, no folds which contain ASCII code points are
1437 * allowed, so check for those, and skip if found. */
1438 if (OP(scan) != EXACTFAA && OP(scan) != EXACTFAA_NO_TRIE) {
1439 count = utf8_length(s, multi_end);
1443 while (s < multi_end) {
1446 goto next_iteration;
1456 /* The delta is how long the sequence is minus 1 (1 is how long
1457 * the character that folds to the sequence is) */
1458 total_count_delta += count - 1;
1462 /* We created a temporary folded copy of the string in EXACTFL
1463 * nodes. Therefore we need to be sure it doesn't go below zero,
1464 * as the real string could be shorter */
1465 if (OP(scan) == EXACTFL) {
1466 int total_chars = utf8_length((U8*) STRING(scan),
1467 (U8*) STRING(scan) + STR_LEN(scan));
1468 if (total_count_delta > total_chars) {
1469 total_count_delta = total_chars;
1473 *min_subtract += total_count_delta;
1476 else if (OP(scan) == EXACTFAA) {
1478 /* Non-UTF-8 pattern, EXACTFAA node. There can't be a multi-char
1479 * fold to the ASCII range (and there are no existing ones in the
1480 * upper latin1 range). But, as outlined in the comments preceding
1481 * this function, we need to flag any occurrences of the sharp s.
1482 * This character forbids trie formation (because of added
1484 #if UNICODE_MAJOR_VERSION > 3 /* no multifolds in early Unicode */ \
1485 || (UNICODE_MAJOR_VERSION == 3 && ( UNICODE_DOT_VERSION > 0) \
1486 || UNICODE_DOT_DOT_VERSION > 0)
1488 if (*s == LATIN_SMALL_LETTER_SHARP_S) {
1489 OP(scan) = EXACTFAA_NO_TRIE;
1490 *unfolded_multi_char = TRUE;
1496 else if (OP(scan) != EXACTFAA_NO_TRIE) {
1498 /* Non-UTF-8 pattern, not EXACTFAA node. Look for the multi-char
1499 * folds that are all Latin1. As explained in the comments
1500 * preceding this function, we look also for the sharp s in EXACTF
1501 * and EXACTFL nodes; it can be in the final position. Otherwise
1502 * we can stop looking 1 byte earlier because have to find at least
1503 * two characters for a multi-fold */
1504 const U8* upper = (OP(scan) == EXACTF || OP(scan) == EXACTFL)
1509 int len = is_MULTI_CHAR_FOLD_latin1_safe(s, s_end);
1510 if (! len) { /* Not a multi-char fold. */
1511 if (*s == LATIN_SMALL_LETTER_SHARP_S
1512 && (OP(scan) == EXACTF || OP(scan) == EXACTFL))
1514 *unfolded_multi_char = TRUE;
1521 && isALPHA_FOLD_EQ(*s, 's')
1522 && isALPHA_FOLD_EQ(*(s+1), 's'))
1525 /* EXACTF nodes need to know that the minimum length
1526 * changed so that a sharp s in the string can match this
1527 * ss in the pattern, but they remain EXACTF nodes, as they
1528 * won't match this unless the target string is in UTF-8,
1529 * which we don't know until runtime. EXACTFL nodes can't
1530 * transform into EXACTFU nodes */
1531 if (OP(scan) != EXACTF && OP(scan) != EXACTFL) {
1532 OP(scan) = EXACTFUP;
1536 *min_subtract += len - 1;
1544 /* Allow dumping but overwriting the collection of skipped
1545 * ops and/or strings with fake optimized ops */
1546 n = REGNODE_AFTER_varies(scan);
1554 DEBUG_OPTIMISE_r(if (merged){DEBUG_PEEP("finl", scan, depth, 0);});
1558 /* REx optimizer. Converts nodes into quicker variants "in place".
1559 Finds fixed substrings. */
1562 /* Stops at toplevel WHILEM as well as at "last". At end *scanp is set
1563 to the position after last scanned or to NULL. */
1565 /* the return from this sub is the minimum length that could possibly match */
1567 Perl_study_chunk(pTHX_
1568 RExC_state_t *pRExC_state,
1569 regnode **scanp, /* Start here (read-write). */
1570 SSize_t *minlenp, /* used for the minlen of substrings? */
1571 SSize_t *deltap, /* Write maxlen-minlen here. */
1572 regnode *last, /* Stop before this one. */
1573 scan_data_t *data, /* string data about the pattern */
1574 I32 stopparen, /* treat CLOSE-N as END, see GOSUB */
1575 U32 recursed_depth, /* how deep have we recursed via GOSUB */
1576 regnode_ssc *and_withp, /* Valid if flags & SCF_DO_STCLASS_OR */
1577 U32 flags, /* flags controlling this call, see SCF_ flags */
1578 U32 depth, /* how deep have we recursed period */
1579 bool was_mutate_ok /* TRUE if in-place optimizations are allowed.
1580 FALSE only if the caller (recursively) was
1581 prohibited from modifying the regops, because
1582 a higher caller is holding a ptr to them. */
1585 /* vars about the regnodes we are working with */
1586 regnode *scan = *scanp; /* the current opcode we are inspecting */
1587 regnode *next = NULL; /* the next opcode beyond scan, tmp var */
1588 regnode *first_non_open = scan; /* FIXME: should this init to NULL?
1589 the first non open regop, if the init
1590 val IS an OPEN then we will skip past
1591 it just after the var decls section */
1592 I32 code = 0; /* temp var used to hold the optype of a regop */
1594 /* vars about the min and max length of the pattern */
1595 SSize_t min = 0; /* min length of this part of the pattern */
1596 SSize_t stopmin = OPTIMIZE_INFTY; /* min length accounting for ACCEPT
1597 this is adjusted down if we find
1599 SSize_t delta = 0; /* difference between min and max length
1600 (not accounting for stopmin) */
1602 /* vars about capture buffers in the pattern */
1603 I32 pars = 0; /* count of OPEN opcodes */
1604 I32 is_par = OP(scan) == OPEN ? PARNO(scan) : 0; /* is this op an OPEN? */
1606 /* vars about whether this pattern contains something that can match
1607 * infinitely long strings, eg, X* or X+ */
1608 int is_inf = (flags & SCF_DO_SUBSTR) && (data->flags & SF_IS_INF);
1609 int is_inf_internal = 0; /* The studied chunk is infinite */
1611 /* scan_data_t (struct) is used to hold information about the substrings
1612 * and start class we have extracted from the string */
1613 scan_data_t data_fake; /* temp var used for recursing in some cases */
1615 SV *re_trie_maxbuff = NULL; /* temp var used to hold whether we can do
1616 trie optimizations */
1618 scan_frame *frame = NULL; /* used as part of fake recursion */
1620 DECLARE_AND_GET_RE_DEBUG_FLAGS;
1622 PERL_ARGS_ASSERT_STUDY_CHUNK;
1623 RExC_study_started= 1;
1625 Zero(&data_fake, 1, scan_data_t);
1628 while (first_non_open && OP(first_non_open) == OPEN)
1629 first_non_open=regnext(first_non_open);
1634 RExC_study_chunk_recursed_count++;
1636 DEBUG_OPTIMISE_MORE_r(
1638 Perl_re_indentf( aTHX_ "study_chunk stopparen=%ld recursed_count=%lu depth=%lu recursed_depth=%lu scan=%p last=%p",
1639 depth, (long)stopparen,
1640 (unsigned long)RExC_study_chunk_recursed_count,
1641 (unsigned long)depth, (unsigned long)recursed_depth,
1644 if (recursed_depth) {
1647 for ( j = 0 ; j < recursed_depth ; j++ ) {
1648 for ( i = 0 ; i < (U32)RExC_total_parens ; i++ ) {
1649 if (PAREN_TEST(j, i) && (!j || !PAREN_TEST(j - 1, i))) {
1650 Perl_re_printf( aTHX_ " %d",(int)i);
1654 if ( j + 1 < recursed_depth ) {
1655 Perl_re_printf( aTHX_ ",");
1659 Perl_re_printf( aTHX_ "\n");
1662 while ( scan && OP(scan) != END && scan < last ){
1663 UV min_subtract = 0; /* How mmany chars to subtract from the minimum
1664 node length to get a real minimum (because
1665 the folded version may be shorter) */
1666 bool unfolded_multi_char = FALSE;
1667 /* avoid mutating ops if we are anywhere within the recursed or
1668 * enframed handling for a GOSUB: the outermost level will handle it.
1670 bool mutate_ok = was_mutate_ok && !(frame && frame->in_gosub);
1671 /* Peephole optimizer: */
1672 DEBUG_STUDYDATA("Peep", data, depth, is_inf, min, stopmin, delta);
1673 DEBUG_PEEP("Peep", scan, depth, flags);
1676 /* The reason we do this here is that we need to deal with things like
1677 * /(?:f)(?:o)(?:o)/ which cant be dealt with by the normal EXACT
1678 * parsing code, as each (?:..) is handled by a different invocation of
1681 if (REGNODE_TYPE(OP(scan)) == EXACT
1682 && OP(scan) != LEXACT
1683 && OP(scan) != LEXACT_REQ8
1686 join_exact(pRExC_state, scan, &min_subtract, &unfolded_multi_char,
1687 0, NULL, depth + 1);
1690 /* Follow the next-chain of the current node and optimize
1691 away all the NOTHINGs from it.
1693 rck_elide_nothing(scan);
1695 /* The principal pseudo-switch. Cannot be a switch, since we look into
1696 * several different things. */
1697 if ( OP(scan) == DEFINEP ) {
1699 SSize_t deltanext = 0;
1700 SSize_t fake_last_close = 0;
1701 regnode *fake_last_close_op = NULL;
1702 U32 f = SCF_IN_DEFINE | (flags & SCF_TRIE_DOING_RESTUDY);
1704 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
1705 scan = regnext(scan);
1706 assert( OP(scan) == IFTHEN );
1707 DEBUG_PEEP("expect IFTHEN", scan, depth, flags);
1709 data_fake.last_closep= &fake_last_close;
1710 data_fake.last_close_opp= &fake_last_close_op;
1712 next = regnext(scan);
1713 scan = REGNODE_AFTER_type(scan,tregnode_IFTHEN);
1714 DEBUG_PEEP("scan", scan, depth, flags);
1715 DEBUG_PEEP("next", next, depth, flags);
1717 /* we suppose the run is continuous, last=next...
1718 * NOTE we dont use the return here! */
1719 /* DEFINEP study_chunk() recursion */
1720 (void)study_chunk(pRExC_state, &scan, &minlen,
1721 &deltanext, next, &data_fake, stopparen,
1722 recursed_depth, NULL, f, depth+1, mutate_ok);
1727 OP(scan) == BRANCH ||
1728 OP(scan) == BRANCHJ ||
1731 next = regnext(scan);
1734 /* The op(next)==code check below is to see if we
1735 * have "BRANCH-BRANCH", "BRANCHJ-BRANCHJ", "IFTHEN-IFTHEN"
1736 * IFTHEN is special as it might not appear in pairs.
1737 * Not sure whether BRANCH-BRANCHJ is possible, regardless
1738 * we dont handle it cleanly. */
1739 if (OP(next) == code || code == IFTHEN) {
1740 /* NOTE - There is similar code to this block below for
1741 * handling TRIE nodes on a re-study. If you change stuff here
1742 * check there too. */
1743 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY, num = 0;
1745 regnode * const startbranch=scan;
1747 if (flags & SCF_DO_SUBSTR) {
1748 /* Cannot merge strings after this. */
1749 scan_commit(pRExC_state, data, minlenp, is_inf);
1752 if (flags & SCF_DO_STCLASS)
1753 ssc_init_zero(pRExC_state, &accum);
1755 while (OP(scan) == code) {
1756 SSize_t deltanext, minnext, fake_last_close = 0;
1757 regnode *fake_last_close_op = NULL;
1758 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
1759 regnode_ssc this_class;
1761 DEBUG_PEEP("Branch", scan, depth, flags);
1764 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
1766 data_fake.whilem_c = data->whilem_c;
1767 data_fake.last_closep = data->last_closep;
1768 data_fake.last_close_opp = data->last_close_opp;
1771 data_fake.last_closep = &fake_last_close;
1772 data_fake.last_close_opp = &fake_last_close_op;
1775 data_fake.pos_delta = delta;
1776 next = regnext(scan);
1778 scan = REGNODE_AFTER_opcode(scan, code);
1780 if (flags & SCF_DO_STCLASS) {
1781 ssc_init(pRExC_state, &this_class);
1782 data_fake.start_class = &this_class;
1783 f |= SCF_DO_STCLASS_AND;
1785 if (flags & SCF_WHILEM_VISITED_POS)
1786 f |= SCF_WHILEM_VISITED_POS;
1788 /* we suppose the run is continuous, last=next...*/
1789 /* recurse study_chunk() for each BRANCH in an alternation */
1790 minnext = study_chunk(pRExC_state, &scan, minlenp,
1791 &deltanext, next, &data_fake, stopparen,
1792 recursed_depth, NULL, f, depth+1,
1797 if (deltanext == OPTIMIZE_INFTY) {
1798 is_inf = is_inf_internal = 1;
1799 max1 = OPTIMIZE_INFTY;
1800 } else if (max1 < minnext + deltanext)
1801 max1 = minnext + deltanext;
1803 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
1805 if (data_fake.flags & SCF_SEEN_ACCEPT) {
1806 if ( stopmin > minnext)
1807 stopmin = min + min1;
1808 flags &= ~SCF_DO_SUBSTR;
1810 data->flags |= SCF_SEEN_ACCEPT;
1813 if (data_fake.flags & SF_HAS_EVAL)
1814 data->flags |= SF_HAS_EVAL;
1815 data->whilem_c = data_fake.whilem_c;
1817 if (flags & SCF_DO_STCLASS)
1818 ssc_or(pRExC_state, &accum, (regnode_charclass*)&this_class);
1819 DEBUG_STUDYDATA("end BRANCH", data, depth, is_inf, min, stopmin, delta);
1821 if (code == IFTHEN && num < 2) /* Empty ELSE branch */
1823 if (flags & SCF_DO_SUBSTR) {
1824 data->pos_min += min1;
1825 if (data->pos_delta >= OPTIMIZE_INFTY - (max1 - min1))
1826 data->pos_delta = OPTIMIZE_INFTY;
1828 data->pos_delta += max1 - min1;
1829 if (max1 != min1 || is_inf)
1830 data->cur_is_floating = 1;
1833 if (delta == OPTIMIZE_INFTY
1834 || OPTIMIZE_INFTY - delta - (max1 - min1) < 0)
1835 delta = OPTIMIZE_INFTY;
1837 delta += max1 - min1;
1838 if (flags & SCF_DO_STCLASS_OR) {
1839 ssc_or(pRExC_state, data->start_class, (regnode_charclass*) &accum);
1841 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
1842 flags &= ~SCF_DO_STCLASS;
1845 else if (flags & SCF_DO_STCLASS_AND) {
1847 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
1848 flags &= ~SCF_DO_STCLASS;
1851 /* Switch to OR mode: cache the old value of
1852 * data->start_class */
1854 StructCopy(data->start_class, and_withp, regnode_ssc);
1855 flags &= ~SCF_DO_STCLASS_AND;
1856 StructCopy(&accum, data->start_class, regnode_ssc);
1857 flags |= SCF_DO_STCLASS_OR;
1860 DEBUG_STUDYDATA("pre TRIE", data, depth, is_inf, min, stopmin, delta);
1862 if (PERL_ENABLE_TRIE_OPTIMISATION
1863 && OP(startbranch) == BRANCH
1868 Assuming this was/is a branch we are dealing with: 'scan'
1869 now points at the item that follows the branch sequence,
1870 whatever it is. We now start at the beginning of the
1871 sequence and look for subsequences of
1877 which would be constructed from a pattern like
1880 If we can find such a subsequence we need to turn the first
1881 element into a trie and then add the subsequent branch exact
1882 strings to the trie.
1886 1. patterns where the whole set of branches can be
1889 2. patterns where only a subset can be converted.
1891 In case 1 we can replace the whole set with a single regop
1892 for the trie. In case 2 we need to keep the start and end
1895 'BRANCH EXACT; BRANCH EXACT; BRANCH X'
1896 becomes BRANCH TRIE; BRANCH X;
1898 There is an additional case, that being where there is a
1899 common prefix, which gets split out into an EXACT like node
1900 preceding the TRIE node.
1902 If X(1..n)==tail then we can do a simple trie, if not we make
1903 a "jump" trie, such that when we match the appropriate word
1904 we "jump" to the appropriate tail node. Essentially we turn
1905 a nested if into a case structure of sorts.
1910 if (!re_trie_maxbuff) {
1911 re_trie_maxbuff = get_sv(RE_TRIE_MAXBUF_NAME, 1);
1912 if (!SvIOK(re_trie_maxbuff))
1913 sv_setiv(re_trie_maxbuff, RE_TRIE_MAXBUF_INIT);
1915 if ( SvIV(re_trie_maxbuff)>=0 ) {
1917 regnode *first = (regnode *)NULL;
1918 regnode *prev = (regnode *)NULL;
1919 regnode *tail = scan;
1923 /* var tail is used because there may be a TAIL
1924 regop in the way. Ie, the exacts will point to the
1925 thing following the TAIL, but the last branch will
1926 point at the TAIL. So we advance tail. If we
1927 have nested (?:) we may have to move through several
1931 while ( OP( tail ) == TAIL ) {
1932 /* this is the TAIL generated by (?:) */
1933 tail = regnext( tail );
1937 DEBUG_TRIE_COMPILE_r({
1938 regprop(RExC_rx, RExC_mysv, tail, NULL, pRExC_state);
1939 Perl_re_indentf( aTHX_ "%s %" UVuf ":%s\n",
1941 "Looking for TRIE'able sequences. Tail node is ",
1942 (UV) REGNODE_OFFSET(tail),
1943 SvPV_nolen_const( RExC_mysv )
1949 Step through the branches
1950 cur represents each branch,
1951 noper is the first thing to be matched as part
1953 noper_next is the regnext() of that node.
1955 We normally handle a case like this
1956 /FOO[xyz]|BAR[pqr]/ via a "jump trie" but we also
1957 support building with NOJUMPTRIE, which restricts
1958 the trie logic to structures like /FOO|BAR/.
1960 If noper is a trieable nodetype then the branch is
1961 a possible optimization target. If we are building
1962 under NOJUMPTRIE then we require that noper_next is
1963 the same as scan (our current position in the regex
1966 Once we have two or more consecutive such branches
1967 we can create a trie of the EXACT's contents and
1968 stitch it in place into the program.
1970 If the sequence represents all of the branches in
1971 the alternation we replace the entire thing with a
1974 Otherwise when it is a subsequence we need to
1975 stitch it in place and replace only the relevant
1976 branches. This means the first branch has to remain
1977 as it is used by the alternation logic, and its
1978 next pointer, and needs to be repointed at the item
1979 on the branch chain following the last branch we
1980 have optimized away.
1982 This could be either a BRANCH, in which case the
1983 subsequence is internal, or it could be the item
1984 following the branch sequence in which case the
1985 subsequence is at the end (which does not
1986 necessarily mean the first node is the start of the
1989 TRIE_TYPE(X) is a define which maps the optype to a
1993 ----------------+-----------
1998 EXACTFU_REQ8 | EXACTFU
2002 EXACTFLU8 | EXACTFLU8
2006 #define TRIE_TYPE(X) ( ( NOTHING == (X) ) \
2008 : ( EXACT == (X) || EXACT_REQ8 == (X) ) \
2010 : ( EXACTFU == (X) \
2011 || EXACTFU_REQ8 == (X) \
2012 || EXACTFUP == (X) ) \
2014 : ( EXACTFAA == (X) ) \
2016 : ( EXACTL == (X) ) \
2018 : ( EXACTFLU8 == (X) ) \
2022 /* dont use tail as the end marker for this traverse */
2023 for ( cur = startbranch ; cur != scan ; cur = regnext( cur ) ) {
2024 regnode * const noper = REGNODE_AFTER( cur );
2025 U8 noper_type = OP( noper );
2026 U8 noper_trietype = TRIE_TYPE( noper_type );
2027 #if defined(DEBUGGING) || defined(NOJUMPTRIE)
2028 regnode * const noper_next = regnext( noper );
2029 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
2030 U8 noper_next_trietype = (noper_next && noper_next < tail) ? TRIE_TYPE( noper_next_type ) :0;
2033 DEBUG_TRIE_COMPILE_r({
2034 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
2035 Perl_re_indentf( aTHX_ "- %d:%s (%d)",
2037 REG_NODE_NUM(cur), SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur) );
2039 regprop(RExC_rx, RExC_mysv, noper, NULL, pRExC_state);
2040 Perl_re_printf( aTHX_ " -> %d:%s",
2041 REG_NODE_NUM(noper), SvPV_nolen_const(RExC_mysv));
2044 regprop(RExC_rx, RExC_mysv, noper_next, NULL, pRExC_state);
2045 Perl_re_printf( aTHX_ "\t=> %d:%s\t",
2046 REG_NODE_NUM(noper_next), SvPV_nolen_const(RExC_mysv));
2048 Perl_re_printf( aTHX_ "(First==%d,Last==%d,Cur==%d,tt==%s,ntt==%s,nntt==%s)\n",
2049 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
2050 REGNODE_NAME(trietype), REGNODE_NAME(noper_trietype), REGNODE_NAME(noper_next_trietype)
2054 /* Is noper a trieable nodetype that can be merged
2055 * with the current trie (if there is one)? */
2059 ( noper_trietype == NOTHING )
2060 || ( trietype == NOTHING )
2061 || ( trietype == noper_trietype )
2064 && noper_next >= tail
2068 /* Handle mergable triable node Either we are
2069 * the first node in a new trieable sequence,
2070 * in which case we do some bookkeeping,
2071 * otherwise we update the end pointer. */
2074 if ( noper_trietype == NOTHING ) {
2075 #if !defined(DEBUGGING) && !defined(NOJUMPTRIE)
2076 regnode * const noper_next = regnext( noper );
2077 U8 noper_next_type = (noper_next && noper_next < tail) ? OP(noper_next) : 0;
2078 U8 noper_next_trietype = noper_next_type ? TRIE_TYPE( noper_next_type ) :0;
2081 if ( noper_next_trietype ) {
2082 trietype = noper_next_trietype;
2083 } else if (noper_next_type) {
2084 /* a NOTHING regop is 1 regop wide.
2085 * We need at least two for a trie
2086 * so we can't merge this in */
2090 trietype = noper_trietype;
2093 if ( trietype == NOTHING )
2094 trietype = noper_trietype;
2099 } /* end handle mergable triable node */
2101 /* handle unmergable node -
2102 * noper may either be a triable node which can
2103 * not be tried together with the current trie,
2104 * or a non triable node */
2106 /* If last is set and trietype is not
2107 * NOTHING then we have found at least two
2108 * triable branch sequences in a row of a
2109 * similar trietype so we can turn them
2110 * into a trie. If/when we allow NOTHING to
2111 * start a trie sequence this condition
2112 * will be required, and it isn't expensive
2113 * so we leave it in for now. */
2114 if ( trietype && trietype != NOTHING )
2115 make_trie( pRExC_state,
2116 startbranch, first, cur, tail,
2117 count, trietype, depth+1 );
2118 prev = NULL; /* note: we clear/update
2119 first, trietype etc below,
2120 so we dont do it here */
2124 && noper_next >= tail
2127 /* noper is triable, so we can start a new
2131 trietype = noper_trietype;
2133 /* if we already saw a first but the
2134 * current node is not triable then we have
2135 * to reset the first information. */
2140 } /* end handle unmergable node */
2141 } /* loop over branches */
2142 DEBUG_TRIE_COMPILE_r({
2143 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
2144 Perl_re_indentf( aTHX_ "- %s (%d) <SCAN FINISHED> ",
2145 depth+1, SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
2146 Perl_re_printf( aTHX_ "(First==%d, Last==%d, Cur==%d, tt==%s)\n",
2147 REG_NODE_NUM(first), REG_NODE_NUM(prev), REG_NODE_NUM(cur),
2148 REGNODE_NAME(trietype)
2152 if ( prev && trietype ) {
2153 if ( trietype != NOTHING ) {
2154 /* the last branch of the sequence was part of
2155 * a trie, so we have to construct it here
2156 * outside of the loop */
2157 made= make_trie( pRExC_state, startbranch,
2158 first, scan, tail, count,
2159 trietype, depth+1 );
2160 #ifdef TRIE_STUDY_OPT
2161 if ( ((made == MADE_EXACT_TRIE &&
2162 startbranch == first)
2163 || ( first_non_open == first )) &&
2165 flags |= SCF_TRIE_RESTUDY;
2166 if ( startbranch == first
2169 RExC_seen &=~REG_TOP_LEVEL_BRANCHES_SEEN;
2174 /* at this point we know whatever we have is a
2175 * NOTHING sequence/branch AND if 'startbranch'
2176 * is 'first' then we can turn the whole thing
2179 if ( startbranch == first ) {
2181 /* the entire thing is a NOTHING sequence,
2182 * something like this: (?:|) So we can
2183 * turn it into a plain NOTHING op. */
2184 DEBUG_TRIE_COMPILE_r({
2185 regprop(RExC_rx, RExC_mysv, cur, NULL, pRExC_state);
2186 Perl_re_indentf( aTHX_ "- %s (%d) <NOTHING BRANCH SEQUENCE>\n",
2188 SvPV_nolen_const( RExC_mysv ), REG_NODE_NUM(cur));
2191 OP(startbranch)= NOTHING;
2192 NEXT_OFF(startbranch)= tail - startbranch;
2193 for ( opt= startbranch + 1; opt < tail ; opt++ )
2197 } /* end if ( prev) */
2198 } /* TRIE_MAXBUF is non zero */
2200 DEBUG_STUDYDATA("after TRIE", data, depth, is_inf, min, stopmin, delta);
2203 scan = REGNODE_AFTER_opcode(scan,code);
2205 } else if (OP(scan) == SUSPEND || OP(scan) == GOSUB) {
2207 regnode *start = NULL;
2208 regnode *end = NULL;
2209 U32 my_recursed_depth= recursed_depth;
2211 if (OP(scan) != SUSPEND) { /* GOSUB */
2212 /* Do setup, note this code has side effects beyond
2213 * the rest of this block. Specifically setting
2214 * RExC_recurse[] must happen at least once during
2217 RExC_recurse[ARG2L(scan)] = scan;
2218 start = REGNODE_p(RExC_open_parens[paren]);
2219 end = REGNODE_p(RExC_close_parens[paren]);
2221 /* NOTE we MUST always execute the above code, even
2222 * if we do nothing with a GOSUB */
2224 ( flags & SCF_IN_DEFINE )
2227 (is_inf_internal || is_inf || (data && data->flags & SF_IS_INF))
2229 ( (flags & (SCF_DO_STCLASS | SCF_DO_SUBSTR)) == 0 )
2232 /* no need to do anything here if we are in a define. */
2233 /* or we are after some kind of infinite construct
2234 * so we can skip recursing into this item.
2235 * Since it is infinite we will not change the maxlen
2236 * or delta, and if we miss something that might raise
2237 * the minlen it will merely pessimise a little.
2239 * Iow /(?(DEFINE)(?<foo>foo|food))a+(?&foo)/
2240 * might result in a minlen of 1 and not of 4,
2241 * but this doesn't make us mismatch, just try a bit
2242 * harder than we should.
2244 * However we must assume this GOSUB is infinite, to
2245 * avoid wrongly applying other optimizations in the
2246 * enclosing scope - see GH 18096, for example.
2248 is_inf = is_inf_internal = 1;
2249 scan= regnext(scan);
2255 || !PAREN_TEST(recursed_depth - 1, paren)
2257 /* it is quite possible that there are more efficient ways
2258 * to do this. We maintain a bitmap per level of recursion
2259 * of which patterns we have entered so we can detect if a
2260 * pattern creates a possible infinite loop. When we
2261 * recurse down a level we copy the previous levels bitmap
2262 * down. When we are at recursion level 0 we zero the top
2263 * level bitmap. It would be nice to implement a different
2264 * more efficient way of doing this. In particular the top
2265 * level bitmap may be unnecessary.
2267 if (!recursed_depth) {
2268 Zero(RExC_study_chunk_recursed, RExC_study_chunk_recursed_bytes, U8);
2270 Copy(PAREN_OFFSET(recursed_depth - 1),
2271 PAREN_OFFSET(recursed_depth),
2272 RExC_study_chunk_recursed_bytes, U8);
2274 /* we havent recursed into this paren yet, so recurse into it */
2275 DEBUG_STUDYDATA("gosub-set", data, depth, is_inf, min, stopmin, delta);
2276 PAREN_SET(recursed_depth, paren);
2277 my_recursed_depth= recursed_depth + 1;
2279 DEBUG_STUDYDATA("gosub-inf", data, depth, is_inf, min, stopmin, delta);
2280 /* some form of infinite recursion, assume infinite length
2282 if (flags & SCF_DO_SUBSTR) {
2283 scan_commit(pRExC_state, data, minlenp, is_inf);
2284 data->cur_is_floating = 1;
2286 is_inf = is_inf_internal = 1;
2287 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
2288 ssc_anything(data->start_class);
2289 flags &= ~SCF_DO_STCLASS;
2291 start= NULL; /* reset start so we dont recurse later on. */
2296 end = regnext(scan);
2299 scan_frame *newframe;
2301 if (!RExC_frame_last) {
2302 Newxz(newframe, 1, scan_frame);
2303 SAVEDESTRUCTOR_X(S_unwind_scan_frames, newframe);
2304 RExC_frame_head= newframe;
2306 } else if (!RExC_frame_last->next_frame) {
2307 Newxz(newframe, 1, scan_frame);
2308 RExC_frame_last->next_frame= newframe;
2309 newframe->prev_frame= RExC_frame_last;
2312 newframe= RExC_frame_last->next_frame;
2314 RExC_frame_last= newframe;
2316 newframe->next_regnode = regnext(scan);
2317 newframe->last_regnode = last;
2318 newframe->stopparen = stopparen;
2319 newframe->prev_recursed_depth = recursed_depth;
2320 newframe->this_prev_frame= frame;
2321 newframe->in_gosub = (
2322 (frame && frame->in_gosub) || OP(scan) == GOSUB
2325 DEBUG_STUDYDATA("frame-new", data, depth, is_inf, min, stopmin, delta);
2326 DEBUG_PEEP("fnew", scan, depth, flags);
2333 recursed_depth= my_recursed_depth;
2338 else if (REGNODE_TYPE(OP(scan)) == EXACT && ! isEXACTFish(OP(scan))) {
2339 SSize_t bytelen = STR_LEN(scan), charlen;
2343 const U8 * const s = (U8*)STRING(scan);
2344 uc = utf8_to_uvchr_buf(s, s + bytelen, NULL);
2345 charlen = utf8_length(s, s + bytelen);
2347 uc = *((U8*)STRING(scan));
2351 if (flags & SCF_DO_SUBSTR) { /* Update longest substr. */
2352 /* The code below prefers earlier match for fixed
2353 offset, later match for variable offset. */
2354 if (data->last_end == -1) { /* Update the start info. */
2355 data->last_start_min = data->pos_min;
2356 data->last_start_max =
2357 is_inf ? OPTIMIZE_INFTY
2358 : (data->pos_delta > OPTIMIZE_INFTY - data->pos_min)
2359 ? OPTIMIZE_INFTY : data->pos_min + data->pos_delta;
2361 sv_catpvn(data->last_found, STRING(scan), bytelen);
2363 SvUTF8_on(data->last_found);
2365 SV * const sv = data->last_found;
2366 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2367 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2368 if (mg && mg->mg_len >= 0)
2369 mg->mg_len += charlen;
2371 data->last_end = data->pos_min + charlen;
2372 data->pos_min += charlen; /* As in the first entry. */
2373 data->flags &= ~SF_BEFORE_EOL;
2376 /* ANDing the code point leaves at most it, and not in locale, and
2377 * can't match null string */
2378 if (flags & SCF_DO_STCLASS_AND) {
2379 ssc_cp_and(data->start_class, uc);
2380 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2381 ssc_clear_locale(data->start_class);
2383 else if (flags & SCF_DO_STCLASS_OR) {
2384 ssc_add_cp(data->start_class, uc);
2385 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2387 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
2388 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2390 flags &= ~SCF_DO_STCLASS;
2391 DEBUG_STUDYDATA("end EXACT", data, depth, is_inf, min, stopmin, delta);
2393 else if (REGNODE_TYPE(OP(scan)) == EXACT) {
2394 /* But OP != EXACT!, so is EXACTFish */
2395 SSize_t bytelen = STR_LEN(scan), charlen;
2396 const U8 * s = (U8*)STRING(scan);
2398 /* Replace a length 1 ASCII fold pair node with an ANYOFM node,
2399 * with the mask set to the complement of the bit that differs
2400 * between upper and lower case, and the lowest code point of the
2401 * pair (which the '&' forces) */
2404 && ( OP(scan) == EXACTFAA
2405 || ( OP(scan) == EXACTFU
2406 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(*s)))
2409 U8 mask = ~ ('A' ^ 'a'); /* These differ in just one bit */
2412 ARG_SET(scan, *s & mask);
2414 /* We're not EXACTFish any more, so restudy.
2415 * Search for "restudy" in this file to find
2416 * a comment with details. */
2420 /* Search for fixed substrings supports EXACT only. */
2421 if (flags & SCF_DO_SUBSTR) {
2423 scan_commit(pRExC_state, data, minlenp, is_inf);
2425 charlen = UTF ? (SSize_t) utf8_length(s, s + bytelen) : bytelen;
2426 if (unfolded_multi_char) {
2427 RExC_seen |= REG_UNFOLDED_MULTI_SEEN;
2429 min += charlen - min_subtract;
2431 if ((SSize_t)min_subtract < OPTIMIZE_INFTY
2432 && delta < OPTIMIZE_INFTY - (SSize_t)min_subtract
2434 delta += min_subtract;
2436 delta = OPTIMIZE_INFTY;
2438 if (flags & SCF_DO_SUBSTR) {
2439 data->pos_min += charlen - min_subtract;
2440 if (data->pos_min < 0) {
2443 if ((SSize_t)min_subtract < OPTIMIZE_INFTY
2444 && data->pos_delta < OPTIMIZE_INFTY - (SSize_t)min_subtract
2446 data->pos_delta += min_subtract;
2448 data->pos_delta = OPTIMIZE_INFTY;
2451 data->cur_is_floating = 1; /* float */
2455 if (flags & SCF_DO_STCLASS) {
2456 SV* EXACTF_invlist = make_exactf_invlist(pRExC_state, scan);
2458 assert(EXACTF_invlist);
2459 if (flags & SCF_DO_STCLASS_AND) {
2460 if (OP(scan) != EXACTFL)
2461 ssc_clear_locale(data->start_class);
2462 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2463 ANYOF_POSIXL_ZERO(data->start_class);
2464 ssc_intersection(data->start_class, EXACTF_invlist, FALSE);
2466 else { /* SCF_DO_STCLASS_OR */
2467 ssc_union(data->start_class, EXACTF_invlist, FALSE);
2468 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2470 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
2471 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
2473 flags &= ~SCF_DO_STCLASS;
2474 SvREFCNT_dec(EXACTF_invlist);
2476 DEBUG_STUDYDATA("end EXACTish", data, depth, is_inf, min, stopmin, delta);
2478 else if (REGNODE_VARIES(OP(scan))) {
2479 SSize_t mincount, maxcount, minnext, deltanext, pos_before = 0;
2482 regnode * const oscan = scan;
2483 regnode_ssc this_class;
2484 regnode_ssc *oclass = NULL;
2485 I32 next_is_eval = 0;
2487 switch (REGNODE_TYPE(OP(scan))) {
2488 case WHILEM: /* End of (?:...)* . */
2489 scan = REGNODE_AFTER(scan);
2492 if (flags & (SCF_DO_SUBSTR | SCF_DO_STCLASS)) {
2493 next = REGNODE_AFTER(scan);
2494 if ( ( REGNODE_TYPE(OP(next)) == EXACT
2495 && ! isEXACTFish(OP(next)))
2496 || (flags & SCF_DO_STCLASS))
2499 maxcount = REG_INFTY;
2500 next = regnext(scan);
2501 scan = REGNODE_AFTER(scan);
2505 if (flags & SCF_DO_SUBSTR)
2507 /* This will bypass the formal 'min += minnext * mincount'
2508 * calculation in the do_curly path, so assumes min width
2509 * of the PLUS payload is exactly one. */
2513 next = REGNODE_AFTER(scan);
2515 /* This temporary node can now be turned into EXACTFU, and
2516 * must, as regexec.c doesn't handle it */
2517 if (OP(next) == EXACTFU_S_EDGE && mutate_ok) {
2521 if ( STR_LEN(next) == 1
2522 && isALPHA_A(* STRING(next))
2523 && ( OP(next) == EXACTFAA
2524 || ( OP(next) == EXACTFU
2525 && ! HAS_NONLATIN1_SIMPLE_FOLD_CLOSURE(* STRING(next))))
2528 /* These differ in just one bit */
2529 U8 mask = ~ ('A' ^ 'a');
2531 assert(isALPHA_A(* STRING(next)));
2533 /* Then replace it by an ANYOFM node, with
2534 * the mask set to the complement of the
2535 * bit that differs between upper and lower
2536 * case, and the lowest code point of the
2537 * pair (which the '&' forces) */
2539 ARG_SET(next, *STRING(next) & mask);
2543 if (flags & SCF_DO_STCLASS) {
2545 maxcount = REG_INFTY;
2546 next = regnext(scan);
2547 scan = REGNODE_AFTER(scan);
2550 if (flags & SCF_DO_SUBSTR) {
2551 scan_commit(pRExC_state, data, minlenp, is_inf);
2552 /* Cannot extend fixed substrings */
2553 data->cur_is_floating = 1; /* float */
2555 is_inf = is_inf_internal = 1;
2556 scan = regnext(scan);
2557 goto optimize_curly_tail;
2559 if (stopparen>0 && (OP(scan)==CURLYN || OP(scan)==CURLYM)
2560 && (scan->flags == stopparen))
2565 mincount = ARG1(scan);
2566 maxcount = ARG2(scan);
2568 next = regnext(scan);
2569 if (OP(scan) == CURLYX) {
2570 I32 lp = (data ? *(data->last_closep) : 0);
2571 scan->flags = ((lp <= (I32)U8_MAX) ? (U8)lp : U8_MAX);
2573 scan = REGNODE_AFTER(scan);
2574 next_is_eval = (OP(scan) == EVAL);
2576 if (flags & SCF_DO_SUBSTR) {
2578 scan_commit(pRExC_state, data, minlenp, is_inf);
2579 /* Cannot extend fixed substrings */
2580 pos_before = data->pos_min;
2584 data->flags &= ~(SF_HAS_PAR|SF_IN_PAR|SF_HAS_EVAL);
2586 data->flags |= SF_IS_INF;
2588 if (flags & SCF_DO_STCLASS) {
2589 ssc_init(pRExC_state, &this_class);
2590 oclass = data->start_class;
2591 data->start_class = &this_class;
2592 f |= SCF_DO_STCLASS_AND;
2593 f &= ~SCF_DO_STCLASS_OR;
2595 /* Exclude from super-linear cache processing any {n,m}
2596 regops for which the combination of input pos and regex
2597 pos is not enough information to determine if a match
2600 For example, in the regex /foo(bar\s*){4,8}baz/ with the
2601 regex pos at the \s*, the prospects for a match depend not
2602 only on the input position but also on how many (bar\s*)
2603 repeats into the {4,8} we are. */
2604 if ((mincount > 1) || (maxcount > 1 && maxcount != REG_INFTY))
2605 f &= ~SCF_WHILEM_VISITED_POS;
2607 /* This will finish on WHILEM, setting scan, or on NULL: */
2608 /* recurse study_chunk() on loop bodies */
2609 minnext = study_chunk(pRExC_state, &scan, minlenp, &deltanext,
2610 last, data, stopparen, recursed_depth, NULL,
2612 ? (f & ~SCF_DO_SUBSTR)
2614 , depth+1, mutate_ok);
2616 if (data && data->flags & SCF_SEEN_ACCEPT) {
2621 if (flags & SCF_DO_STCLASS)
2622 data->start_class = oclass;
2623 if (mincount == 0 || minnext == 0) {
2624 if (flags & SCF_DO_STCLASS_OR) {
2625 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
2627 else if (flags & SCF_DO_STCLASS_AND) {
2628 /* Switch to OR mode: cache the old value of
2629 * data->start_class */
2631 StructCopy(data->start_class, and_withp, regnode_ssc);
2632 flags &= ~SCF_DO_STCLASS_AND;
2633 StructCopy(&this_class, data->start_class, regnode_ssc);
2634 flags |= SCF_DO_STCLASS_OR;
2635 ANYOF_FLAGS(data->start_class)
2636 |= SSC_MATCHES_EMPTY_STRING;
2638 } else { /* Non-zero len */
2639 if (flags & SCF_DO_STCLASS_OR) {
2640 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
2641 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2643 else if (flags & SCF_DO_STCLASS_AND)
2644 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &this_class);
2645 flags &= ~SCF_DO_STCLASS;
2647 if (!scan) /* It was not CURLYX, but CURLY. */
2649 if (((flags & (SCF_TRIE_DOING_RESTUDY|SCF_DO_SUBSTR))==SCF_DO_SUBSTR)
2650 /* ? quantifier ok, except for (?{ ... }) */
2651 && (next_is_eval || !(mincount == 0 && maxcount == 1))
2652 && (minnext == 0) && (deltanext == 0)
2653 && data && !(data->flags & (SF_HAS_PAR|SF_IN_PAR))
2654 && maxcount <= REG_INFTY/3) /* Complement check for big
2657 _WARN_HELPER(RExC_precomp_end, packWARN(WARN_REGEXP),
2658 Perl_ck_warner(aTHX_ packWARN(WARN_REGEXP),
2659 "Quantifier unexpected on zero-length expression "
2660 "in regex m/%" UTF8f "/",
2661 UTF8fARG(UTF, RExC_precomp_end - RExC_precomp,
2665 if ( ( minnext > 0 && mincount >= SSize_t_MAX / minnext )
2666 || min >= SSize_t_MAX - minnext * mincount )
2668 FAIL("Regexp out of space");
2671 min += minnext * mincount;
2672 is_inf_internal |= deltanext == OPTIMIZE_INFTY
2673 || (maxcount == REG_INFTY && minnext + deltanext > 0);
2674 is_inf |= is_inf_internal;
2676 delta = OPTIMIZE_INFTY;
2678 delta += (minnext + deltanext) * maxcount
2679 - minnext * mincount;
2682 if (data && data->flags & SCF_SEEN_ACCEPT) {
2683 if (flags & SCF_DO_SUBSTR) {
2684 scan_commit(pRExC_state, data, minlenp, is_inf);
2685 flags &= ~SCF_DO_SUBSTR;
2689 DEBUG_STUDYDATA("after-whilem accept", data, depth, is_inf, min, stopmin, delta);
2691 DEBUG_STUDYDATA("PRE CURLYX_TO_CURLYN", data, depth, is_inf, min, stopmin, delta);
2692 /* Try powerful optimization CURLYX => CURLYN. */
2693 if ( RE_OPTIMIZE_CURLYX_TO_CURLYN
2694 && OP(oscan) == CURLYX
2696 && !(RExC_seen & REG_PESSIMIZE_SEEN) /* XXX: for now disable whenever a
2697 non optimistic eval is seen
2699 && ( data->flags & SF_IN_PAR ) /* has parens */
2704 DEBUG_STUDYDATA("CURLYX_TO_CURLYN", data, depth, is_inf, min, stopmin, delta);
2705 /* Try to optimize to CURLYN. */
2706 regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX);
2707 regnode * const nxt1 = nxt;
2713 if (!REGNODE_SIMPLE(OP(nxt))
2714 && !(REGNODE_TYPE(OP(nxt)) == EXACT
2715 && STR_LEN(nxt) == 1))
2721 if (OP(nxt) != CLOSE)
2723 if (RExC_open_parens) {
2726 RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan);
2729 RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt) + 2;
2731 /* Now we know that nxt2 is the only contents: */
2732 oscan->flags = (U8)PARNO(nxt);
2734 OP(nxt1) = NOTHING; /* was OPEN. */
2737 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2738 NEXT_OFF(nxt1+ 1) = 0; /* just for consistency. */
2739 NEXT_OFF(nxt2) = 0; /* just for consistency with CURLY. */
2740 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2741 OP(nxt + 1) = OPTIMIZED; /* was count. */
2742 NEXT_OFF(nxt+ 1) = 0; /* just for consistency. */
2747 DEBUG_STUDYDATA("PRE CURLYX_TO_CURLYM", data, depth, is_inf, min, stopmin, delta);
2749 /* Try optimization CURLYX => CURLYM. */
2750 if ( RE_OPTIMIZE_CURLYX_TO_CURLYM
2751 && OP(oscan) == CURLYX
2753 && !(RExC_seen & REG_PESSIMIZE_SEEN) /* XXX: for now disable whenever a
2754 non optimistic eval is seen
2756 && !(data->flags & SF_HAS_PAR) /* no parens! */
2757 && !deltanext /* atom is fixed width */
2758 && minnext != 0 /* CURLYM can't handle zero width */
2759 /* Nor characters whose fold at run-time may be
2760 * multi-character */
2761 && !(RExC_seen & REG_UNFOLDED_MULTI_SEEN)
2764 DEBUG_STUDYDATA("CURLYX_TO_CURLYM", data, depth, is_inf, min, stopmin, delta);
2765 /* XXXX How to optimize if data == 0? */
2766 /* Optimize to a simpler form. */
2767 regnode *nxt = REGNODE_AFTER_type(oscan, tregnode_CURLYX); /* OPEN */
2771 while ( (nxt2 = regnext(nxt)) /* skip over embedded stuff*/
2772 && (OP(nxt2) != WHILEM))
2774 OP(nxt2) = SUCCEED; /* Whas WHILEM */
2775 /* Need to optimize away parenths. */
2776 if ((data->flags & SF_IN_PAR) && OP(nxt) == CLOSE) {
2777 /* Set the parenth number. */
2778 /* note that we have changed the type of oscan to CURLYM here */
2779 regnode *nxt1 = REGNODE_AFTER_type(oscan, tregnode_CURLYM); /* OPEN*/
2781 oscan->flags = (U8)PARNO(nxt);
2782 if (RExC_open_parens) {
2784 RExC_open_parens[PARNO(nxt1)] = REGNODE_OFFSET(oscan);
2787 RExC_close_parens[PARNO(nxt1)] = REGNODE_OFFSET(nxt2)
2790 OP(nxt1) = OPTIMIZED; /* was OPEN. */
2791 OP(nxt) = OPTIMIZED; /* was CLOSE. */
2794 OP(nxt1 + 1) = OPTIMIZED; /* was count. */
2795 OP(nxt + 1) = OPTIMIZED; /* was count. */
2796 NEXT_OFF(nxt1 + 1) = 0; /* just for consistency. */
2797 NEXT_OFF(nxt + 1) = 0; /* just for consistency. */
2800 while ( nxt1 && (OP(nxt1) != WHILEM)) {
2801 regnode *nnxt = regnext(nxt1);
2803 if (REGNODE_OFF_BY_ARG(OP(nxt1)))
2804 ARG_SET(nxt1, nxt2 - nxt1);
2805 else if (nxt2 - nxt1 < U16_MAX)
2806 NEXT_OFF(nxt1) = nxt2 - nxt1;
2808 OP(nxt) = NOTHING; /* Cannot beautify */
2813 /* Optimize again: */
2814 /* recurse study_chunk() on optimised CURLYX => CURLYM */
2815 study_chunk(pRExC_state, &nxt1, minlenp, &deltanext, nxt,
2816 NULL, stopparen, recursed_depth, NULL, 0,
2817 depth+1, mutate_ok);
2822 else if ((OP(oscan) == CURLYX)
2823 && (flags & SCF_WHILEM_VISITED_POS)
2824 /* See the comment on a similar expression above.
2825 However, this time it's not a subexpression
2826 we care about, but the expression itself. */
2827 && (maxcount == REG_INFTY)
2829 /* This stays as CURLYX, we can put the count/of pair. */
2830 /* Find WHILEM (as in regexec.c) */
2831 regnode *nxt = oscan + NEXT_OFF(oscan);
2833 if (OP(REGNODE_BEFORE(nxt)) == NOTHING) /* LONGJMP */
2835 nxt = REGNODE_BEFORE(nxt);
2836 if (nxt->flags & 0xf) {
2837 /* we've already set whilem count on this node */
2838 } else if (++data->whilem_c < 16) {
2839 assert(data->whilem_c <= RExC_whilem_seen);
2840 nxt->flags = (U8)(data->whilem_c
2841 | (RExC_whilem_seen << 4)); /* On WHILEM */
2844 if (data && fl & (SF_HAS_PAR|SF_IN_PAR))
2846 if (flags & SCF_DO_SUBSTR) {
2847 SV *last_str = NULL;
2848 STRLEN last_chrs = 0;
2849 int counted = mincount != 0;
2851 if (data->last_end > 0 && mincount != 0) { /* Ends with a
2853 SSize_t b = pos_before >= data->last_start_min
2854 ? pos_before : data->last_start_min;
2856 const char * const s = SvPV_const(data->last_found, l);
2857 SSize_t old = b - data->last_start_min;
2861 old = utf8_hop_forward((U8*)s, old,
2862 (U8 *) SvEND(data->last_found))
2865 /* Get the added string: */
2866 last_str = newSVpvn_utf8(s + old, l, UTF);
2867 last_chrs = UTF ? utf8_length((U8*)(s + old),
2868 (U8*)(s + old + l)) : l;
2869 if (deltanext == 0 && pos_before == b) {
2870 /* What was added is a constant string */
2873 SvGROW(last_str, (mincount * l) + 1);
2874 repeatcpy(SvPVX(last_str) + l,
2875 SvPVX_const(last_str), l,
2877 SvCUR_set(last_str, SvCUR(last_str) * mincount);
2878 /* Add additional parts. */
2879 SvCUR_set(data->last_found,
2880 SvCUR(data->last_found) - l);
2881 sv_catsv(data->last_found, last_str);
2883 SV * sv = data->last_found;
2885 SvUTF8(sv) && SvMAGICAL(sv) ?
2886 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2887 if (mg && mg->mg_len >= 0)
2888 mg->mg_len += last_chrs * (mincount-1);
2890 last_chrs *= mincount;
2891 data->last_end += l * (mincount - 1);
2894 /* start offset must point into the last copy */
2895 data->last_start_min += minnext * (mincount - 1);
2896 data->last_start_max =
2899 : data->last_start_max +
2900 (maxcount - 1) * (minnext + data->pos_delta);
2903 /* It is counted once already... */
2904 data->pos_min += minnext * (mincount - counted);
2906 Perl_re_printf( aTHX_ "counted=%" UVuf " deltanext=%" UVuf
2907 " OPTIMIZE_INFTY=%" UVuf " minnext=%" UVuf
2908 " maxcount=%" UVuf " mincount=%" UVuf
2909 " data->pos_delta=%" UVuf "\n",
2910 (UV)counted, (UV)deltanext, (UV)OPTIMIZE_INFTY, (UV)minnext,
2911 (UV)maxcount, (UV)mincount, (UV)data->pos_delta);
2912 if (deltanext != OPTIMIZE_INFTY)
2913 Perl_re_printf( aTHX_ "LHS=%" UVuf " RHS=%" UVuf "\n",
2914 (UV)(-counted * deltanext + (minnext + deltanext) * maxcount
2915 - minnext * mincount), (UV)(OPTIMIZE_INFTY - data->pos_delta));
2917 if (deltanext == OPTIMIZE_INFTY
2918 || data->pos_delta == OPTIMIZE_INFTY
2919 || -counted * deltanext + (minnext + deltanext) * maxcount - minnext * mincount >= OPTIMIZE_INFTY - data->pos_delta)
2920 data->pos_delta = OPTIMIZE_INFTY;
2922 data->pos_delta += - counted * deltanext +
2923 (minnext + deltanext) * maxcount - minnext * mincount;
2924 if (mincount != maxcount) {
2925 /* Cannot extend fixed substrings found inside
2927 scan_commit(pRExC_state, data, minlenp, is_inf);
2928 if (mincount && last_str) {
2929 SV * const sv = data->last_found;
2930 MAGIC * const mg = SvUTF8(sv) && SvMAGICAL(sv) ?
2931 mg_find(sv, PERL_MAGIC_utf8) : NULL;
2935 sv_setsv(sv, last_str);
2936 data->last_end = data->pos_min;
2937 data->last_start_min = data->pos_min - last_chrs;
2938 data->last_start_max = is_inf
2940 : data->pos_min + data->pos_delta - last_chrs;
2942 data->cur_is_floating = 1; /* float */
2944 SvREFCNT_dec(last_str);
2946 if (data && (fl & SF_HAS_EVAL))
2947 data->flags |= SF_HAS_EVAL;
2948 optimize_curly_tail:
2949 rck_elide_nothing(oscan);
2953 Perl_croak(aTHX_ "panic: unexpected varying REx opcode %d",
2957 if (flags & SCF_DO_SUBSTR) {
2958 /* Cannot expect anything... */
2959 scan_commit(pRExC_state, data, minlenp, is_inf);
2960 data->cur_is_floating = 1; /* float */
2962 is_inf = is_inf_internal = 1;
2963 if (flags & SCF_DO_STCLASS_OR) {
2964 if (OP(scan) == CLUMP) {
2965 /* Actually is any start char, but very few code points
2966 * aren't start characters */
2967 ssc_match_all_cp(data->start_class);
2970 ssc_anything(data->start_class);
2973 flags &= ~SCF_DO_STCLASS;
2977 else if (OP(scan) == LNBREAK) {
2978 if (flags & SCF_DO_STCLASS) {
2979 if (flags & SCF_DO_STCLASS_AND) {
2980 ssc_intersection(data->start_class,
2981 PL_XPosix_ptrs[CC_VERTSPACE_], FALSE);
2982 ssc_clear_locale(data->start_class);
2983 ANYOF_FLAGS(data->start_class)
2984 &= ~SSC_MATCHES_EMPTY_STRING;
2986 else if (flags & SCF_DO_STCLASS_OR) {
2987 ssc_union(data->start_class,
2988 PL_XPosix_ptrs[CC_VERTSPACE_],
2990 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
2992 /* See commit msg for
2993 * 749e076fceedeb708a624933726e7989f2302f6a */
2994 ANYOF_FLAGS(data->start_class)
2995 &= ~SSC_MATCHES_EMPTY_STRING;
2997 flags &= ~SCF_DO_STCLASS;
3000 if (delta != OPTIMIZE_INFTY)
3001 delta++; /* Because of the 2 char string cr-lf */
3002 if (flags & SCF_DO_SUBSTR) {
3003 /* Cannot expect anything... */
3004 scan_commit(pRExC_state, data, minlenp, is_inf);
3006 if (data->pos_delta != OPTIMIZE_INFTY) {
3007 data->pos_delta += 1;
3009 data->cur_is_floating = 1; /* float */
3012 else if (REGNODE_SIMPLE(OP(scan))) {
3014 if (flags & SCF_DO_SUBSTR) {
3015 scan_commit(pRExC_state, data, minlenp, is_inf);
3019 if (flags & SCF_DO_STCLASS) {
3021 SV* my_invlist = NULL;
3024 /* See commit msg 749e076fceedeb708a624933726e7989f2302f6a */
3025 ANYOF_FLAGS(data->start_class) &= ~SSC_MATCHES_EMPTY_STRING;
3027 /* Some of the logic below assumes that switching
3028 locale on will only add false positives. */
3033 Perl_croak(aTHX_ "panic: unexpected simple REx opcode %d",
3037 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3038 ssc_match_all_cp(data->start_class);
3043 SV* REG_ANY_invlist = _new_invlist(2);
3044 REG_ANY_invlist = add_cp_to_invlist(REG_ANY_invlist,
3046 if (flags & SCF_DO_STCLASS_OR) {
3047 ssc_union(data->start_class,
3049 TRUE /* TRUE => invert, hence all but \n
3053 else if (flags & SCF_DO_STCLASS_AND) {
3054 ssc_intersection(data->start_class,
3056 TRUE /* TRUE => invert */
3058 ssc_clear_locale(data->start_class);
3060 SvREFCNT_dec_NN(REG_ANY_invlist);
3072 if (flags & SCF_DO_STCLASS_AND)
3073 ssc_and(pRExC_state, data->start_class,
3074 (regnode_charclass *) scan);
3076 ssc_or(pRExC_state, data->start_class,
3077 (regnode_charclass *) scan);
3082 SV* cp_list = get_ANYOFHbbm_contents(scan);
3084 if (flags & SCF_DO_STCLASS_OR) {
3085 ssc_union(data->start_class, cp_list, invert);
3087 else if (flags & SCF_DO_STCLASS_AND) {
3088 ssc_intersection(data->start_class, cp_list, invert);
3091 SvREFCNT_dec_NN(cp_list);
3095 case NANYOFM: /* NANYOFM already contains the inversion of the
3096 input ANYOF data, so, unlike things like
3097 NPOSIXA, don't change 'invert' to TRUE */
3101 SV* cp_list = get_ANYOFM_contents(scan);
3103 if (flags & SCF_DO_STCLASS_OR) {
3104 ssc_union(data->start_class, cp_list, invert);
3106 else if (flags & SCF_DO_STCLASS_AND) {
3107 ssc_intersection(data->start_class, cp_list, invert);
3110 SvREFCNT_dec_NN(cp_list);
3119 cp_list = _add_range_to_invlist(cp_list,
3121 ANYOFRbase(scan) + ANYOFRdelta(scan));
3123 if (flags & SCF_DO_STCLASS_OR) {
3124 ssc_union(data->start_class, cp_list, invert);
3126 else if (flags & SCF_DO_STCLASS_AND) {
3127 ssc_intersection(data->start_class, cp_list, invert);
3130 SvREFCNT_dec_NN(cp_list);
3139 namedclass = classnum_to_namedclass(FLAGS(scan)) + invert;
3140 if (flags & SCF_DO_STCLASS_AND) {
3141 bool was_there = cBOOL(
3142 ANYOF_POSIXL_TEST(data->start_class,
3144 ANYOF_POSIXL_ZERO(data->start_class);
3145 if (was_there) { /* Do an AND */
3146 ANYOF_POSIXL_SET(data->start_class, namedclass);
3148 /* No individual code points can now match */
3149 data->start_class->invlist
3150 = sv_2mortal(_new_invlist(0));
3153 int complement = namedclass + ((invert) ? -1 : 1);
3155 assert(flags & SCF_DO_STCLASS_OR);
3157 /* If the complement of this class was already there,
3158 * the result is that they match all code points,
3159 * (\d + \D == everything). Remove the classes from
3160 * future consideration. Locale is not relevant in
3162 if (ANYOF_POSIXL_TEST(data->start_class, complement)) {
3163 ssc_match_all_cp(data->start_class);
3164 ANYOF_POSIXL_CLEAR(data->start_class, namedclass);
3165 ANYOF_POSIXL_CLEAR(data->start_class, complement);
3167 else { /* The usual case; just add this class to the
3169 ANYOF_POSIXL_SET(data->start_class, namedclass);
3174 case NPOSIXA: /* For these, we always know the exact set of
3179 my_invlist = invlist_clone(PL_Posix_ptrs[FLAGS(scan)], NULL);
3180 goto join_posix_and_ascii;
3188 my_invlist = invlist_clone(PL_XPosix_ptrs[FLAGS(scan)], NULL);
3190 /* NPOSIXD matches all upper Latin1 code points unless the
3191 * target string being matched is UTF-8, which is
3192 * unknowable until match time. Since we are going to
3193 * invert, we want to get rid of all of them so that the
3194 * inversion will match all */
3195 if (OP(scan) == NPOSIXD) {
3196 _invlist_subtract(my_invlist, PL_UpperLatin1,
3200 join_posix_and_ascii:
3202 if (flags & SCF_DO_STCLASS_AND) {
3203 ssc_intersection(data->start_class, my_invlist, invert);
3204 ssc_clear_locale(data->start_class);
3207 assert(flags & SCF_DO_STCLASS_OR);
3208 ssc_union(data->start_class, my_invlist, invert);
3210 SvREFCNT_dec(my_invlist);
3212 if (flags & SCF_DO_STCLASS_OR)
3213 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3214 flags &= ~SCF_DO_STCLASS;
3217 else if (REGNODE_TYPE(OP(scan)) == EOL && flags & SCF_DO_SUBSTR) {
3218 data->flags |= (OP(scan) == MEOL
3221 scan_commit(pRExC_state, data, minlenp, is_inf);
3224 else if ( REGNODE_TYPE(OP(scan)) == BRANCHJ
3225 /* Lookbehind, or need to calculate parens/evals/stclass: */
3226 && (scan->flags || data || (flags & SCF_DO_STCLASS))
3227 && (OP(scan) == IFMATCH || OP(scan) == UNLESSM))
3229 if ( !PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3230 || OP(scan) == UNLESSM )
3232 /* Negative Lookahead/lookbehind
3233 In this case we can't do fixed string optimisation.
3236 bool is_positive = OP(scan) == IFMATCH ? 1 : 0;
3237 SSize_t deltanext, minnext;
3238 SSize_t fake_last_close = 0;
3239 regnode *fake_last_close_op = NULL;
3240 regnode *cur_last_close_op;
3243 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
3245 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3247 data_fake.whilem_c = data->whilem_c;
3248 data_fake.last_closep = data->last_closep;
3249 data_fake.last_close_opp = data->last_close_opp;
3252 data_fake.last_closep = &fake_last_close;
3253 data_fake.last_close_opp = &fake_last_close_op;
3256 /* remember the last_close_op we saw so we can see if
3257 * we are dealing with variable length lookbehind that
3258 * contains capturing buffers, which are considered
3260 cur_last_close_op= *(data_fake.last_close_opp);
3262 data_fake.pos_delta = delta;
3263 if ( flags & SCF_DO_STCLASS && !scan->flags
3264 && OP(scan) == IFMATCH ) { /* Lookahead */
3265 ssc_init(pRExC_state, &intrnl);
3266 data_fake.start_class = &intrnl;
3267 f |= SCF_DO_STCLASS_AND;
3269 if (flags & SCF_WHILEM_VISITED_POS)
3270 f |= SCF_WHILEM_VISITED_POS;
3271 next = regnext(scan);
3272 nscan = REGNODE_AFTER(scan);
3274 /* recurse study_chunk() for lookahead body */
3275 minnext = study_chunk(pRExC_state, &nscan, minlenp, &deltanext,
3276 last, &data_fake, stopparen,
3277 recursed_depth, NULL, f, depth+1,
3282 || deltanext > (I32) U8_MAX
3283 || minnext > (I32)U8_MAX
3284 || minnext + deltanext > (I32)U8_MAX)
3286 FAIL2("Lookbehind longer than %" UVuf " not implemented",
3290 /* The 'next_off' field has been repurposed to count the
3291 * additional starting positions to try beyond the initial
3292 * one. (This leaves it at 0 for non-variable length
3293 * matches to avoid breakage for those not using this
3296 scan->next_off = deltanext;
3298 /* See a CLOSE op inside this lookbehind? */
3299 cur_last_close_op != *(data_fake.last_close_opp)
3300 /* and not doing restudy. see: restudied */
3301 && !(flags & SCF_TRIE_DOING_RESTUDY)
3303 /* this is positive variable length lookbehind with
3304 * capture buffers inside of it */
3305 ckWARNexperimental_with_arg(RExC_parse,
3306 WARN_EXPERIMENTAL__VLB,
3307 "Variable length %s lookbehind with capturing is experimental",
3308 is_positive ? "positive" : "negative");
3311 scan->flags = (U8)minnext + deltanext;
3314 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3316 if (data_fake.flags & SF_HAS_EVAL)
3317 data->flags |= SF_HAS_EVAL;
3318 data->whilem_c = data_fake.whilem_c;
3320 if (f & SCF_DO_STCLASS_AND) {
3321 if (flags & SCF_DO_STCLASS_OR) {
3322 /* OR before, AND after: ideally we would recurse with
3323 * data_fake to get the AND applied by study of the
3324 * remainder of the pattern, and then derecurse;
3325 * *** HACK *** for now just treat as "no information".
3326 * See [perl #56690].
3328 ssc_init(pRExC_state, data->start_class);
3330 /* AND before and after: combine and continue. These
3331 * assertions are zero-length, so can match an EMPTY
3333 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
3334 ANYOF_FLAGS(data->start_class)
3335 |= SSC_MATCHES_EMPTY_STRING;
3338 DEBUG_STUDYDATA("end LOOKAROUND", data, depth, is_inf, min, stopmin, delta);
3340 #if PERL_ENABLE_POSITIVE_ASSERTION_STUDY
3342 /* Positive Lookahead/lookbehind
3343 In this case we can do fixed string optimisation,
3344 but we must be careful about it. Note in the case of
3345 lookbehind the positions will be offset by the minimum
3346 length of the pattern, something we won't know about
3347 until after the recurse.
3349 SSize_t deltanext, fake_last_close = 0;
3350 regnode *last_close_op = NULL;
3353 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
3354 /* We use SAVEFREEPV so that when the full compile
3355 is finished perl will clean up the allocated
3356 minlens when it's all done. This way we don't
3357 have to worry about freeing them when we know
3358 they wont be used, which would be a pain.
3361 Newx( minnextp, 1, SSize_t );
3362 SAVEFREEPV(minnextp);
3365 StructCopy(data, &data_fake, scan_data_t);
3366 if ((flags & SCF_DO_SUBSTR) && data->last_found) {
3369 scan_commit(pRExC_state, &data_fake, minlenp, is_inf);
3370 data_fake.last_found=newSVsv(data->last_found);
3374 data_fake.last_closep = &fake_last_close;
3375 data_fake.last_close_opp = &fake_last_close_opp;
3377 data_fake.flags = 0;
3378 data_fake.substrs[0].flags = 0;
3379 data_fake.substrs[1].flags = 0;
3380 data_fake.pos_delta = delta;
3382 data_fake.flags |= SF_IS_INF;
3383 if ( flags & SCF_DO_STCLASS && !scan->flags
3384 && OP(scan) == IFMATCH ) { /* Lookahead */
3385 ssc_init(pRExC_state, &intrnl);
3386 data_fake.start_class = &intrnl;
3387 f |= SCF_DO_STCLASS_AND;
3389 if (flags & SCF_WHILEM_VISITED_POS)
3390 f |= SCF_WHILEM_VISITED_POS;
3391 next = regnext(scan);
3392 nscan = REGNODE_AFTER(scan);
3394 /* positive lookahead study_chunk() recursion */
3395 *minnextp = study_chunk(pRExC_state, &nscan, minnextp,
3396 &deltanext, last, &data_fake,
3397 stopparen, recursed_depth, NULL,
3398 f, depth+1, mutate_ok);
3400 assert(0); /* This code has never been tested since this
3401 is normally not compiled */
3403 || deltanext > (I32) U8_MAX
3404 || *minnextp > (I32)U8_MAX
3405 || *minnextp + deltanext > (I32)U8_MAX)
3407 FAIL2("Lookbehind longer than %" UVuf " not implemented",
3412 scan->next_off = deltanext;
3414 scan->flags = (U8)*minnextp + deltanext;
3419 if (f & SCF_DO_STCLASS_AND) {
3420 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &intrnl);
3421 ANYOF_FLAGS(data->start_class) |= SSC_MATCHES_EMPTY_STRING;
3424 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3426 if (data_fake.flags & SF_HAS_EVAL)
3427 data->flags |= SF_HAS_EVAL;
3428 data->whilem_c = data_fake.whilem_c;
3429 if ((flags & SCF_DO_SUBSTR) && data_fake.last_found) {
3431 if (RExC_rx->minlen < *minnextp)
3432 RExC_rx->minlen = *minnextp;
3433 scan_commit(pRExC_state, &data_fake, minnextp, is_inf);
3434 SvREFCNT_dec_NN(data_fake.last_found);
3436 for (i = 0; i < 2; i++) {
3437 if (data_fake.substrs[i].minlenp != minlenp) {
3438 data->substrs[i].min_offset =
3439 data_fake.substrs[i].min_offset;
3440 data->substrs[i].max_offset =
3441 data_fake.substrs[i].max_offset;
3442 data->substrs[i].minlenp =
3443 data_fake.substrs[i].minlenp;
3444 data->substrs[i].lookbehind += scan->flags;
3452 else if (OP(scan) == OPEN) {
3453 if (stopparen != (I32)PARNO(scan))
3456 else if (OP(scan) == CLOSE) {
3457 if (stopparen == (I32)PARNO(scan)) {
3460 if ((I32)PARNO(scan) == is_par) {
3461 next = regnext(scan);
3463 if ( next && (OP(next) != WHILEM) && next < last)
3464 is_par = 0; /* Disable optimization */
3467 *(data->last_closep) = PARNO(scan);
3468 *(data->last_close_opp) = scan;
3471 else if (OP(scan) == EVAL) {
3472 if (data && !(scan->flags & EVAL_OPTIMISTIC_FLAG) )
3473 data->flags |= SF_HAS_EVAL;
3475 else if ( REGNODE_TYPE(OP(scan)) == ENDLIKE ) {
3476 if (flags & SCF_DO_SUBSTR) {
3477 scan_commit(pRExC_state, data, minlenp, is_inf);
3478 flags &= ~SCF_DO_SUBSTR;
3480 if (OP(scan)==ACCEPT) {
3481 /* m{(*ACCEPT)x} does not have to start with 'x' */
3482 flags &= ~SCF_DO_STCLASS;
3484 data->flags |= SCF_SEEN_ACCEPT;
3489 else if (OP(scan) == COMMIT) {
3490 /* gh18770: m{abc(*COMMIT)xyz} must fail on "abc abcxyz", so we
3491 * must not end up with "abcxyz" as a fixed substring else we'll
3492 * skip straight to attempting to match at offset 4.
3494 if (flags & SCF_DO_SUBSTR) {
3495 scan_commit(pRExC_state, data, minlenp, is_inf);
3496 flags &= ~SCF_DO_SUBSTR;
3499 else if (OP(scan) == LOGICAL && scan->flags == 2) /* Embedded follows */
3501 if (flags & SCF_DO_SUBSTR) {
3502 scan_commit(pRExC_state, data, minlenp, is_inf);
3503 data->cur_is_floating = 1; /* float */
3505 is_inf = is_inf_internal = 1;
3506 if (flags & SCF_DO_STCLASS_OR) /* Allow everything */
3507 ssc_anything(data->start_class);
3508 flags &= ~SCF_DO_STCLASS;
3510 else if (OP(scan) == GPOS) {
3511 if (!(RExC_rx->intflags & PREGf_GPOS_FLOAT) &&
3512 !(delta || is_inf || (data && data->pos_delta)))
3514 if (!(RExC_rx->intflags & PREGf_ANCH) && (flags & SCF_DO_SUBSTR))
3515 RExC_rx->intflags |= PREGf_ANCH_GPOS;
3516 if (RExC_rx->gofs < (STRLEN)min)
3517 RExC_rx->gofs = min;
3519 RExC_rx->intflags |= PREGf_GPOS_FLOAT;
3523 #ifdef TRIE_STUDY_OPT
3524 #ifdef FULL_TRIE_STUDY
3525 else if (REGNODE_TYPE(OP(scan)) == TRIE) {
3526 /* NOTE - There is similar code to this block above for handling
3527 BRANCH nodes on the initial study. If you change stuff here
3529 regnode *trie_node= scan;
3530 regnode *tail= regnext(scan);
3531 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3532 SSize_t max1 = 0, min1 = OPTIMIZE_INFTY;
3535 if (flags & SCF_DO_SUBSTR) { /* XXXX Add !SUSPEND? */
3536 /* Cannot merge strings after this. */
3537 scan_commit(pRExC_state, data, minlenp, is_inf);
3539 if (flags & SCF_DO_STCLASS)
3540 ssc_init_zero(pRExC_state, &accum);
3546 const regnode *nextbranch= NULL;
3549 for ( word=1 ; word <= trie->wordcount ; word++)
3551 SSize_t deltanext = 0, minnext = 0;
3552 U32 f = (flags & SCF_TRIE_DOING_RESTUDY);
3553 SSize_t fake_last_close = 0;
3554 regnode *fake_last_close_op = NULL;
3555 regnode_ssc this_class;
3557 StructCopy(&zero_scan_data, &data_fake, scan_data_t);
3559 data_fake.whilem_c = data->whilem_c;
3560 data_fake.last_closep = data->last_closep;
3561 data_fake.last_close_opp = data->last_close_opp;
3564 data_fake.last_closep = &fake_last_close;
3565 data_fake.last_close_opp = &fake_last_close_op;
3567 data_fake.pos_delta = delta;
3568 if (flags & SCF_DO_STCLASS) {
3569 ssc_init(pRExC_state, &this_class);
3570 data_fake.start_class = &this_class;
3571 f |= SCF_DO_STCLASS_AND;
3573 if (flags & SCF_WHILEM_VISITED_POS)
3574 f |= SCF_WHILEM_VISITED_POS;
3576 if (trie->jump[word]) {
3578 nextbranch = trie_node + trie->jump[0];
3579 scan= trie_node + trie->jump[word];
3580 /* We go from the jump point to the branch that follows
3581 it. Note this means we need the vestigal unused
3582 branches even though they arent otherwise used. */
3583 /* optimise study_chunk() for TRIE */
3584 minnext = study_chunk(pRExC_state, &scan, minlenp,
3585 &deltanext, (regnode *)nextbranch, &data_fake,
3586 stopparen, recursed_depth, NULL, f, depth+1,
3589 if (nextbranch && REGNODE_TYPE(OP(nextbranch))==BRANCH)
3590 nextbranch= regnext((regnode*)nextbranch);
3592 if (min1 > (SSize_t)(minnext + trie->minlen))
3593 min1 = minnext + trie->minlen;
3594 if (deltanext == OPTIMIZE_INFTY) {
3595 is_inf = is_inf_internal = 1;
3596 max1 = OPTIMIZE_INFTY;
3597 } else if (max1 < (SSize_t)(minnext + deltanext + trie->maxlen))
3598 max1 = minnext + deltanext + trie->maxlen;
3600 if (data_fake.flags & (SF_HAS_PAR|SF_IN_PAR))
3602 if (data_fake.flags & SCF_SEEN_ACCEPT) {
3603 if ( stopmin > min + min1)
3604 stopmin = min + min1;
3605 flags &= ~SCF_DO_SUBSTR;
3607 data->flags |= SCF_SEEN_ACCEPT;
3610 if (data_fake.flags & SF_HAS_EVAL)
3611 data->flags |= SF_HAS_EVAL;
3612 data->whilem_c = data_fake.whilem_c;
3614 if (flags & SCF_DO_STCLASS)
3615 ssc_or(pRExC_state, &accum, (regnode_charclass *) &this_class);
3617 DEBUG_STUDYDATA("after JUMPTRIE", data, depth, is_inf, min, stopmin, delta);
3619 if (flags & SCF_DO_SUBSTR) {
3620 data->pos_min += min1;
3621 data->pos_delta += max1 - min1;
3622 if (max1 != min1 || is_inf)
3623 data->cur_is_floating = 1; /* float */
3626 if (delta != OPTIMIZE_INFTY) {
3627 if (OPTIMIZE_INFTY - (max1 - min1) >= delta)
3628 delta += max1 - min1;
3630 delta = OPTIMIZE_INFTY;
3632 if (flags & SCF_DO_STCLASS_OR) {
3633 ssc_or(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3635 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3636 flags &= ~SCF_DO_STCLASS;
3639 else if (flags & SCF_DO_STCLASS_AND) {
3641 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) &accum);
3642 flags &= ~SCF_DO_STCLASS;
3645 /* Switch to OR mode: cache the old value of
3646 * data->start_class */
3648 StructCopy(data->start_class, and_withp, regnode_ssc);
3649 flags &= ~SCF_DO_STCLASS_AND;
3650 StructCopy(&accum, data->start_class, regnode_ssc);
3651 flags |= SCF_DO_STCLASS_OR;
3655 DEBUG_STUDYDATA("after TRIE study", data, depth, is_inf, min, stopmin, delta);
3659 else if (REGNODE_TYPE(OP(scan)) == TRIE) {
3660 reg_trie_data *trie = (reg_trie_data*)RExC_rxi->data->data[ ARG(scan) ];
3663 min += trie->minlen;
3664 delta += (trie->maxlen - trie->minlen);
3665 flags &= ~SCF_DO_STCLASS; /* xxx */
3666 if (flags & SCF_DO_SUBSTR) {
3667 /* Cannot expect anything... */
3668 scan_commit(pRExC_state, data, minlenp, is_inf);
3669 data->pos_min += trie->minlen;
3670 data->pos_delta += (trie->maxlen - trie->minlen);
3671 if (trie->maxlen != trie->minlen)
3672 data->cur_is_floating = 1; /* float */
3674 if (trie->jump) /* no more substrings -- for now /grr*/
3675 flags &= ~SCF_DO_SUBSTR;
3678 #endif /* old or new */
3679 #endif /* TRIE_STUDY_OPT */
3681 else if (OP(scan) == REGEX_SET) {
3682 Perl_croak(aTHX_ "panic: %s regnode should be resolved"
3683 " before optimization", REGNODE_NAME(REGEX_SET));
3686 /* Else: zero-length, ignore. */
3687 scan = regnext(scan);
3692 /* we need to unwind recursion. */
3695 DEBUG_STUDYDATA("frame-end", data, depth, is_inf, min, stopmin, delta);
3696 DEBUG_PEEP("fend", scan, depth, flags);
3698 /* restore previous context */
3699 last = frame->last_regnode;
3700 scan = frame->next_regnode;
3701 stopparen = frame->stopparen;
3702 recursed_depth = frame->prev_recursed_depth;
3704 RExC_frame_last = frame->prev_frame;
3705 frame = frame->this_prev_frame;
3706 goto fake_study_recurse;
3710 DEBUG_STUDYDATA("pre-fin", data, depth, is_inf, min, stopmin, delta);
3712 /* is this pattern infinite? Eg, consider /(a|b+)/ */
3713 if (is_inf_internal)
3714 delta = OPTIMIZE_INFTY;
3716 /* deal with (*ACCEPT), Eg, consider /(foo(*ACCEPT)|bop)bar/ */
3717 if (min > stopmin) {
3719 At this point 'min' represents the minimum length string we can
3720 match while *ignoring* the implication of ACCEPT, and 'delta'
3721 represents the difference between the minimum length and maximum
3722 length, and if the pattern matches an infinitely long string
3723 (consider the + and * quantifiers) then we use the special delta
3724 value of OPTIMIZE_INFTY to represent it. 'stopmin' is the
3725 minimum length that can be matched *and* accepted.
3727 A pattern is accepted when matching was successful *and*
3728 complete, and thus there is no further matching needing to be
3729 done, no backtracking to occur, etc. Prior to the introduction
3730 of ACCEPT the only opcode that signaled acceptance was the END
3731 opcode, which is always the very last opcode in a regex program.
3732 ACCEPT is thus conceptually an early successful return out of
3733 the matching process. stopmin starts out as OPTIMIZE_INFTY to
3734 represent "the entire pattern", and is ratched down to the
3735 "current min" if necessary when an ACCEPT opcode is encountered.
3737 Thus stopmin might be smaller than min if we saw an (*ACCEPT),
3738 and we now need to account for it in both min and delta.
3739 Consider that in a pattern /AB/ normally the min length it can
3740 match can be computed as min(A)+min(B). But (*ACCEPT) means
3741 that it might be something else, not even neccesarily min(A) at
3744 A = /(foo(*ACCEPT)|x+)/
3746 AB = /(foo(*ACCEPT)|x+)whop/
3748 The min for A is 1 for "x" and the delta for A is OPTIMIZE_INFTY
3749 for "xxxxx...", its stopmin is 3 for "foo". The min for B is 4 for
3750 "whop", and the delta of 0 as the pattern is of fixed length, the
3751 stopmin would be OPTIMIZE_INFTY as it does not contain an ACCEPT.
3752 When handling AB we expect to see a min of 5 for "xwhop", and a
3753 delta of OPTIMIZE_INFTY for "xxxxx...whop", and a stopmin of 3
3754 for "foo". This should result in a final min of 3 for "foo", and
3755 a final delta of OPTIMIZE_INFTY for "xxxxx...whop".
3757 In something like /(dude(*ACCEPT)|irk)x{3,7}/ we would have a
3758 min of 6 for "irkxxx" and a delta of 4 for "irkxxxxxxx", and the
3759 stop min would be 4 for "dude". This should result in a final
3760 min of 4 for "dude", and a final delta of 6, for "irkxxxxxxx".
3762 When min is smaller than stopmin then we can ignore it. In the
3763 fragment /(x{10,20}(*ACCEPT)|a)b+/, we would have a min of 2,
3764 and a delta of OPTIMIZE_INFTY, and a stopmin of 10. Obviously
3765 the ACCEPT doesn't reduce the minimum length of the string that
3766 might be matched, nor affect the maximum length.
3768 In something like /foo(*ACCEPT)ba?r/ we would have a min of 5
3769 for "foobr", a delta of 1 for "foobar", and a stopmin of 3 for
3770 "foo". We currently turn this into a min of 3 for "foo" and a
3771 delta of 3 for "foobar" even though technically "foobar" isn't
3772 possible. ACCEPT affects some aspects of the optimizer, like
3773 length computations and mandatory substring optimizations, but
3774 there are other optimzations this routine perfoms that are not
3775 affected and this compromise simplifies implementation.
3777 It might be helpful to consider that this C function is called
3778 recursively on the pattern in a bottom up fashion, and that the
3779 min returned by a nested call may be marked as coming from an
3780 ACCEPT, causing its callers to treat the returned min as a
3781 stopmin as the recursion unwinds. Thus a single ACCEPT can affect
3782 multiple calls into this function in different ways.
3785 if (OPTIMIZE_INFTY - delta >= min - stopmin)
3786 delta += min - stopmin;
3788 delta = OPTIMIZE_INFTY;
3795 if (flags & SCF_DO_SUBSTR && is_inf)
3796 data->pos_delta = OPTIMIZE_INFTY - data->pos_min;
3797 if (is_par > (I32)U8_MAX)
3799 if (is_par && pars==1 && data) {
3800 data->flags |= SF_IN_PAR;
3801 data->flags &= ~SF_HAS_PAR;
3803 else if (pars && data) {
3804 data->flags |= SF_HAS_PAR;
3805 data->flags &= ~SF_IN_PAR;
3807 if (flags & SCF_DO_STCLASS_OR)
3808 ssc_and(pRExC_state, data->start_class, (regnode_charclass *) and_withp);
3809 if (flags & SCF_TRIE_RESTUDY)
3810 data->flags |= SCF_TRIE_RESTUDY;
3813 if (!(RExC_seen & REG_UNBOUNDED_QUANTIFIER_SEEN)) {
3814 if (min > OPTIMIZE_INFTY - delta)
3815 RExC_maxlen = OPTIMIZE_INFTY;
3816 else if (RExC_maxlen < min + delta)
3817 RExC_maxlen = min + delta;
3819 DEBUG_STUDYDATA("post-fin", data, depth, is_inf, min, stopmin, delta);