This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: Change handling of filled EXACT nodes
authorKarl Williamson <khw@cpan.org>
Sun, 22 Sep 2019 22:12:07 +0000 (16:12 -0600)
committerKarl Williamson <khw@cpan.org>
Sun, 29 Sep 2019 17:46:26 +0000 (11:46 -0600)
This changes the detection mechanism to check just before writing to see
if it would be out of bounds, and if so, instead break out of the loop,
and go close out the node.  Prior to this commit space for a worst-case
scenario was reserved, and we didn't start a new character if we were in
that danger zone.  This left nodes less fully packed than they could
have been.

Thus this improves the packing of nodes, especially under /i, from the
previous mechanism.  But more importantly, it sets things up so that we
can potentially increase the node size as we go along.

This also changes the handling of avoiding splitting a multi-character
fold across nodes under /i.  For example, take the sequence 'ffi'.  We
wouldn't want to end a node with 'ff', when the first character in the
next node is an 'i', as U+FB03 folds to that sequence, and the code that
does pattern matching can't currently match across node boundaries.
Previously we backed off filling the node until the final character
wasn't one that could potentially cause such a break.  That is, we didn't
look at the next character and see if it was an 'i' (or some other
potential multi-char fold.)  Now we do look at that next
character(s), and only back off if this actually would split a real
multi-char fold.

regcomp.c
t/re/pat.t

index 9f1a837..0564fcc 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -13922,13 +13922,14 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
            UV ender = 0;
            char *p;
            char *s;
-
-/* This allows us to fill a node with just enough spare so that if the final
- * character folds, its expansion is guaranteed to fit */
-#define MAX_FOLDED_NODE_STRING_SIZE (255-UTF8_MAXBYTES_CASE)
-
            char *s0;
-           U8 upper_fill = MAX_FOLDED_NODE_STRING_SIZE;
+            U32 max_string_len = 255;
+
+            /* We may have to reparse the node, artificially stopping filling
+             * it early, based on info gleaned in the first parse.  This
+             * variable gives where we stop.  Make it above the normal stopping
+             * place first time through. */
+           U32 upper_fill = max_string_len + 1;
 
             /* We start out as an EXACT node, even if under /i, until we find a
              * character which is in a fold.  The algorithm now segregates into
@@ -13944,7 +13945,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             /* Assume the node will be fully used; the excess is given back at
              * the end.  We can't make any other length assumptions, as a byte
              * input sequence could shrink down. */
-            Ptrdiff_t current_string_nodes = STR_SZ(256);
+            Ptrdiff_t current_string_nodes = STR_SZ(max_string_len);
 
             bool next_is_quantifier;
             char * oldp = NULL;
@@ -13975,6 +13976,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
             /* So is the MICRO SIGN */
             bool has_micro_sign = FALSE;
 
+            /* Set when we fill up the current node and there is still more
+             * text to process */
+            bool overflowed;
+
             /* Allocate an EXACT node.  The node_type may change below to
              * another EXACTish node, but since the size of the node doesn't
              * change, it works */
@@ -13989,6 +13994,10 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
          reparse:
 
+            p = RExC_parse;
+            len = 0;
+            s = s0;
+
             /* This breaks under rare circumstances.  If folding, we do not
              * want to split a node at a character that is a non-final in a
              * multi-char fold, as an input string could just happen to want to
@@ -14003,12 +14012,14 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                    || UTF8_IS_INVARIANT(UCHARAT(RExC_parse))
                    || UTF8_IS_START(UCHARAT(RExC_parse)));
 
+            overflowed = FALSE;
+
             /* Here, we have a literal character.  Find the maximal string of
              * them in the input that we can fit into a single EXACTish node.
              * We quit at the first non-literal or when the node gets full, or
              * under /i the categorization of folding/non-folding character
              * changes */
-            for (p = RExC_parse; len < upper_fill && p < RExC_end; ) {
+            while (p < RExC_end && len < upper_fill) {
 
                 /* In most cases each iteration adds one byte to the output.
                  * The exceptions override this */
@@ -14346,8 +14357,17 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                 /* Ready to add 'ender' to the node */
 
                 if (! FOLD) {  /* The simple case, just append the literal */
-
                   not_fold_common:
+
+                    /* Don't output if it would overflow */
+                    if (UNLIKELY(len > max_string_len - ((UTF)
+                                                         ? UVCHR_SKIP(ender)
+                                                         : 1)))
+                    {
+                        overflowed = TRUE;
+                        break;
+                    }
+
                     if (UVCHR_IS_INVARIANT(ender) || ! UTF) {
                         *(s++) = (char) ender;
                     }
@@ -14425,20 +14445,33 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
 
                     if (UTF) {  /* Use the folded value */
                         if (UVCHR_IS_INVARIANT(ender)) {
+                            if (UNLIKELY(len + 1 > max_string_len)) {
+                                overflowed = TRUE;
+                                break;
+                            }
+
                             *(s)++ = (U8) toFOLD(ender);
                         }
                         else {
-                            ender = _to_uni_fold_flags(
+                            U8 temp[UTF8_MAXBYTES_CASE+1];
+
+                            UV folded = _to_uni_fold_flags(
                                     ender,
-                                    (U8 *) s,
+                                    temp,
                                     &added_len,
                                     FOLD_FLAGS_FULL | ((ASCII_FOLD_RESTRICTED)
                                                     ? FOLD_FLAGS_NOMIX_ASCII
                                                     : 0));
+                            if (UNLIKELY(len + added_len > max_string_len)) {
+                                overflowed = TRUE;
+                                break;
+                            }
+
+                            Copy(temp, s, added_len, char);
                             s += added_len;
 
-                            if (   ender > 255
-                                && LIKELY(ender != GREEK_SMALL_LETTER_MU))
+                            if (   folded > 255
+                                && LIKELY(folded != GREEK_SMALL_LETTER_MU))
                             {
                                 /* U+B5 folds to the MU, so its possible for a
                                  * non-UTF-8 target to match it */
@@ -14490,6 +14523,12 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                             if (UNLIKELY(ender == LATIN_SMALL_LETTER_SHARP_S)) {
                                 maybe_SIMPLE = 0;
                                 if (node_type == EXACTFU) {
+
+                                    if (UNLIKELY(len + 2 > max_string_len)) {
+                                        overflowed = TRUE;
+                                        break;
+                                    }
+
                                     *(s++) = 's';
 
                                     /* Let the code below add in the extra 's'
@@ -14505,6 +14544,11 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                             has_micro_sign = TRUE;
                         }
 
+                        if (UNLIKELY(len + 1 > max_string_len)) {
+                            overflowed = TRUE;
+                            break;
+                        }
+
                         *(s++) = (DEPENDS_SEMANTICS)
                                  ? (char) toFOLD(ender)
 
@@ -14530,160 +14574,206 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
            } /* End of loop through literal characters */
 
             /* Here we have either exhausted the input or run out of room in
-             * the node.  (If we encountered a character that can't be in the
-             * node, transfer is made directly to <loopdone>, and so we
-             * wouldn't have fallen off the end of the loop.)  In the latter
-             * case, we artificially have to split the node into two, because
-             * we just don't have enough space to hold everything.  This
-             * creates a problem if the final character participates in a
-             * multi-character fold in the non-final position, as a match that
-             * should have occurred won't, due to the way nodes are matched,
-             * and our artificial boundary.  So back off until we find a non-
-             * problematic character -- one that isn't at the beginning or
-             * middle of such a fold.  (Either it doesn't participate in any
-             * folds, or appears only in the final position of all the folds it
-             * does participate in.)  A better solution with far fewer false
-             * positives, and that would fill the nodes more completely, would
-             * be to actually have available all the multi-character folds to
-             * test against, and to back-off only far enough to be sure that
-             * this node isn't ending with a partial one.  <upper_fill> is set
-             * further below (if we need to reparse the node) to include just
-             * up through that final non-problematic character that this code
-             * identifies, so when it is set to less than the full node, we can
-             * skip the rest of this */
-            if (FOLD && p < RExC_end && upper_fill == MAX_FOLDED_NODE_STRING_SIZE) {
-                PERL_UINT_FAST8_T backup_count = 0;
-
-                const STRLEN full_len = len;
-
-               assert(len >= MAX_FOLDED_NODE_STRING_SIZE);
-
-                /* Here, <s> points to just beyond where we have output the
-                 * final character of the node.  Look backwards through the
-                 * string until find a non- problematic character */
-
-               if (! UTF) {
-
-                    /* This has no multi-char folds to non-UTF characters */
-                    if (ASCII_FOLD_RESTRICTED) {
-                        goto loopdone;
-                    }
-
-                    while (--s >= s0 && IS_NON_FINAL_FOLD(*s)) {
-                        backup_count++;
-                    }
-                    len = s - s0 + 1;
-               }
-                else {
+             * the node.  If the former, we are done.  (If we encountered a
+             * character that can't be in the node, transfer is made directly
+             * to <loopdone>, and so we wouldn't have fallen off the end of the
+             * loop.)  */
+            if (LIKELY(! overflowed)) {
+                goto loopdone;
+            }
+
+            /* Here we have run out of room.  We artificially have to split the
+             * node into two, because we just don't have enough space to hold
+             * everything. */
+
+            if (FOLD) {
+
+                /* Running out of room creates a problem if we are
+                 * folding, and the split happens in the middle of a
+                 * multi-character fold, as a match that should have occurred,
+                 * won't, due to the way nodes are matched, and our artificial
+                 * boundary.  So back off until we aren't splitting such a
+                 * fold.  If there is no such place to back off to, we end up
+                 * taking the entire node as-is.  This can happen if the node
+                 * consists entirely of 'f' or entirely of 's' characters (or
+                 * things that fold to them) as 'ff' and 'ss' are
+                 * multi-character folds.
+                 *
+                 * At this point:
+                 *  oldp        points to the beginning in the input of the
+                 *              final character in the node.
+                 *  p           points to the beginning in the input of the
+                 *              next character in the input, the one that won't
+                 *              fit in the node.
+                 *
+                 * We aren't in the middle of a multi-char fold unless the
+                 * final character in the node can appear in a non-final
+                 * position in such a fold.  Very few characters actually
+                 * participate in multi-character folds, and fewer still can be
+                 * in the non-final position.  But it's complicated to know
+                 * here if that final character is folded or not, so skip this
+                 * check */
+
+                           /* Make sure enough space for final char of node,
+                            * first char of following node, and the fold of the
+                            * following char (so we don't have to worry about
+                            * that fold running off the end) */
+                U8 foldbuf[UTF8_MAXBYTES_CASE * 5 + 1];
+                STRLEN fold_len;
+                UV folded;
 
-                    /* Point to the first byte of the final character */
-                    s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
 
-                    while (s >= s0) {   /* Search backwards until find
-                                           a non-problematic char */
-                        if (UTF8_IS_INVARIANT(*s)) {
+                /* The Unicode standard says that multi character folds consist
+                 * of either two or three characters.  So we create a buffer
+                 * containing a window of three.  The first is the final
+                 * character in the node (folded), and then the two that begin
+                 * the following node.   But if the first character of the
+                 * following node can't be in a non-final fold position, there
+                 * is no need to look at its successor character.  The macros
+                 * used below to check for multi character folds require folded
+                 * inputs, so we have to fold these.  (The fold of p was likely
+                 * calculated in the loop above, but it hasn't been saved, and
+                 * khw thinks it would be too entangled to change to do so) */
+
+                if (UTF || LIKELY(UCHARAT(p) != MICRO_SIGN)) {
+                    folded = _to_uni_fold_flags(ender,
+                                                foldbuf,
+                                                &fold_len,
+                                                FOLD_FLAGS_FULL);
+                }
+                else {
+                    foldbuf[0] = folded = MICRO_SIGN;
+                    fold_len = 1;
+                }
+
+                /* Here, foldbuf contains the fold of the first character in
+                 * the next node.  We may also need the next one (if there is
+                 * one) to get our third, but if the first character folded to
+                 * more than one, those extra one(s) will serve as the third.
+                 * Also, we don't need a third unless the previous one can
+                 * appear in a non-final position in a fold */
+                if (  ((RExC_end - p) > ((UTF) ? UVCHR_SKIP(ender) : 1))
+                    && (fold_len == 1 || (   UTF
+                                          && UVCHR_SKIP(folded) == fold_len))
+                    &&  UNLIKELY(_invlist_contains_cp(PL_NonFinalFold, folded)))
+                {
+                    if (UTF) {
+                        STRLEN next_fold_len;
 
-                            /* There are no ascii characters that participate
-                             * in multi-char folds under /aa.  In EBCDIC, the
-                             * non-ascii invariants are all control characters,
-                             * so don't ever participate in any folds. */
-                            if (ASCII_FOLD_RESTRICTED
-                                || ! IS_NON_FINAL_FOLD(*s))
-                            {
-                                break;
-                            }
-                        }
-                        else if (UTF8_IS_DOWNGRADEABLE_START(*s)) {
-                            if (! IS_NON_FINAL_FOLD(EIGHT_BIT_UTF8_TO_NATIVE(
-                                                                  *s, *(s+1))))
-                            {
-                                break;
-                            }
+                        toFOLD_utf8_safe((U8*) p + UTF8SKIP(p),
+                                         (U8*) RExC_end, foldbuf + fold_len,
+                                         &next_fold_len);
+                        fold_len += next_fold_len;
+                    }
+                    else {
+                        if (UNLIKELY(p[1] == LATIN_SMALL_LETTER_SHARP_S)) {
+                            foldbuf[fold_len] = 's';
                         }
-                        else if (! _invlist_contains_cp(
-                                        PL_NonFinalFold,
-                                        valid_utf8_to_uvchr((U8 *) s, NULL)))
-                        {
-                            break;
+                        else {
+                            foldbuf[fold_len] = toLOWER_L1(p[1]);
                         }
+                        fold_len++;
+                    }
+                }
 
-                        /* Here, the current character is problematic in that
-                         * it does occur in the non-final position of some
-                         * fold, so try the character before it, but have to
-                         * special case the very first byte in the string, so
-                         * we don't read outside the string */
-                        s = (s == s0) ? s -1 : (char *) utf8_hop((U8 *) s, -1);
-                        backup_count++;
-                    } /* End of loop backwards through the string */
-
-                    /* If there were only problematic characters in the string,
-                     * <s> will point to before s0, in which case the length
-                     * should be 0, otherwise include the length of the
-                     * non-problematic character just found */
-                    len = (s < s0) ? 0 : s - s0 + UTF8SKIP(s);
-               }
+                /* Here foldbuf contains the fold of p, and if appropriate
+                 * that of the character following p in the input. */
 
-                /* Here, have found the final character, if any, that is
-                 * non-problematic as far as ending the node without splitting
-                 * it across a potential multi-char fold.  <len> contains the
-                 * number of bytes in the node up-to and including that
-                 * character, or is 0 if there is no such character, meaning
-                 * the whole node contains only problematic characters.  In
-                 * this case, give up and just take the node as-is.  We can't
-                 * do any better */
-                if (len == 0) {
-                    len = full_len;
+                /* Search backwards until find a place that doesn't split a
+                 * multi-char fold */
+                while (1) {
+                    STRLEN s_len;
+                    char s_fold_buf[UTF8_MAXBYTES_CASE];
+                    char * s_fold = s_fold_buf;
 
-                } else {
+                    if (s <= s0) {
 
-                    /* Here, the node does contain some characters that aren't
-                     * problematic.  If we didn't have to backup any, then the
-                     * final character in the node is non-problematic, and we
-                     * can take the node as-is */
-                    if (backup_count == 0) {
-                        goto loopdone;
+                        /* There's no safe place in the node to split.  Quit so
+                         * will take the whole node */
+                        break;
                     }
-                    else if (backup_count == 1) {
 
-                        /* If the final character is problematic, but the
-                         * penultimate is not, back-off that last character to
-                         * later start a new node with it */
-                        p = oldp;
-                        goto loopdone;
+                    /* Backup 1 character.  The first time through this moves s
+                     * to point to the final character in the node */
+                    if (UTF) {
+                        s = (char *) utf8_hop_back((U8 *) s, -1, (U8 *) s0);
+                    }
+                    else {
+                        s--;
                     }
 
-                    /* Here, the final non-problematic character is earlier
-                     * in the input than the penultimate character.  What we do
-                     * is reparse from the beginning, going up only as far as
-                     * this final ok one, thus guaranteeing that the node ends
-                     * in an acceptable character.  The reason we reparse is
-                     * that we know how far in the character is, but we don't
-                     * know how to correlate its position with the input parse.
-                     * An alternate implementation would be to build that
-                     * correlation as we go along during the original parse,
-                     * but that would entail extra work for every node, whereas
-                     * this code gets executed only when the string is too
-                     * large for the node, and the final two characters are
-                     * problematic, an infrequent occurrence.  Yet another
-                     * possible strategy would be to save the tail of the
-                     * string, and the next time regatom is called, initialize
-                     * with that.  The problem with this is that unless you
-                     * back off one more character, you won't be guaranteed
-                     * regatom will get called again, unless regbranch,
-                     * regpiece ... are also changed.  If you do back off that
-                     * extra character, so that there is input guaranteed to
-                     * force calling regatom, you can't handle the case where
-                     * just the first character in the node is acceptable.  I
-                     * (khw) decided to try this method which doesn't have that
-                     * pitfall; if performance issues are found, we can do a
-                     * combination of the current approach plus that one */
-                    upper_fill = len;
-                    len = 0;
-                    s = s0;
-                    goto reparse;
+                    /* 's' may or may not be folded; so make sure it is, and
+                     * use just the final character in its fold (should there
+                     * be more than one) */
+                    if (UTF) {
+                        toFOLD_utf8_safe((U8*) s,
+                                         (U8*) s + UTF8SKIP(s),
+                                         (U8 *) s_fold_buf, &s_len);
+                        while (s_fold + UTF8SKIP(s_fold) < s_fold_buf + s_len)
+                        {
+                            s_fold += UTF8SKIP(s_fold);
+                        }
+                        s_len = UTF8SKIP(s_fold);
+                    }
+                    else {
+                        if (UNLIKELY(UCHARAT(s) == LATIN_SMALL_LETTER_SHARP_S))
+                        {
+                            s_fold_buf[0] = 's';
+                        }
+                        else {  /* This works for all other non-UTF-8 folds
+                                 */
+                            s_fold_buf[0] = toLOWER_L1(UCHARAT(s));
+                        }
+                        s_len = 1;
+                    }
+
+                    /* Unshift this character to the beginning of the buffer,
+                     * No longer needed trailing characters are overwritten.
+                     * */
+                    Move(foldbuf, foldbuf + s_len, sizeof(foldbuf) - s_len, U8);
+                    Copy(s_fold, foldbuf, s_len, U8);
+
+                    /* If this isn't a multi-character fold, we have found a
+                     * splittable place.  If this is the final character in the
+                     * node, that means the node is valid as-is, and can quit.
+                     * Otherwise, we note how much we can fill the node before
+                     * coming to a non-splittable position, and go parse it
+                     * again, stopping there. This is done because we know
+                     * where in the output to stop, but we don't have a map to
+                     * where that is in the input.  One could be created, but
+                     * it seems like overkill for such a rare event as we are
+                     * dealing with here */
+                    if (UTF) {
+                        if (! is_MULTI_CHAR_FOLD_utf8_safe(foldbuf,
+                                                foldbuf + UTF8_MAXBYTES_CASE))
+                        {
+                            upper_fill = s + UTF8SKIP(s) - s0;
+                            if (LIKELY(upper_fill == 255)) {
+                                break;
+                            }
+                            goto reparse;
+                        }
+                    }
+                    else if (! is_MULTI_CHAR_FOLD_latin1_safe(foldbuf,
+                                                foldbuf + UTF8_MAXBYTES_CASE))
+                    {
+                        upper_fill = s + 1 - s0;
+                        if (LIKELY(upper_fill == 255)) {
+                            break;
+                        }
+                        goto reparse;
+                    }
                 }
+
+                /* Here the node consists entirely of non-final multi-char
+                 * folds.  (Likely it is all 'f's or all 's's.)  There's no
+                 * decent place to split it, so give up and just take the whole
+                 * thing */
+
            }   /* End of verifying node ends with an appropriate char */
 
+            p = oldp;
+
           loopdone:   /* Jumped to when encounters something that shouldn't be
                          in the node */
 
@@ -14766,7 +14856,7 @@ S_regatom(pTHX_ RExC_state_t *pRExC_state, I32 *flagp, U32 depth)
                 RExC_emit += STR_SZ(len);
 
                 /* If the node isn't a single character, it can't be SIMPLE */
-                if (len > (Size_t) ((UTF) ? UVCHR_SKIP(ender) : 1)) {
+                if (len > (Size_t) ((UTF) ? UTF8SKIP(STRING(REGNODE_p(ret))) : 1)) {
                     maybe_SIMPLE = 0;
                 }
 
index 67ad0f4..b7c645e 100644 (file)
@@ -25,7 +25,7 @@ BEGIN {
 skip_all('no re module') unless defined &DynaLoader::boot_DynaLoader;
 skip_all_without_unicode_tables();
 
-plan tests => 864;  # Update this when adding/deleting tests.
+plan tests => 960;  # Update this when adding/deleting tests.
 
 run_tests() unless caller;
 
@@ -1421,6 +1421,55 @@ EOP
         ok("\x{017F}\x{017F}" =~ qr/^[$sharp_s]?$/i, "[] to EXACTish optimization");
     }
 
+    {   # Test that it avoids spllitting a multi-char fold across nodes
+        my $utf8_locale = find_utf8_ctype_locale();
+        for my $char('F', $sharp_s, "\x{FB00}") {
+            my $length = 260;    # Long enough to overflow an EXACTFish regnode
+            my $p = $char x $length;
+            my $s = ($char eq $sharp_s) ? 'ss' : 'ff';
+            $s = $s x $length;
+            for my $charset (qw(u d l aa)) {
+                for my $utf8 (0..1) {
+                  SKIP:
+                    for my $locale ('C', $utf8_locale) {
+                        skip "test skipped for non-C locales", 2
+                                    if $charset ne 'l'
+                                    && (! defined $locale || $locale ne 'C');
+                        if ($charset eq 'l') {
+                            if (! defined $locale) {
+                                skip "No UTF-8 locale", 2;
+                            }
+
+                            use POSIX;
+                            POSIX::setlocale(&LC_CTYPE, $locale);
+                        }
+
+                        my $pat = $p;
+                        utf8::upgrade($pat) if $utf8;
+                        my $should_pass =
+                            (    $charset eq 'u'
+                             || ($charset eq 'd' && $utf8)
+                             || ($charset eq 'd' && (   $char =~ /[[:ascii:]]/
+                                                     || ord $char > 255))
+                             || ($charset eq 'aa' && $char =~ /[[:ascii:]]/)
+                             || ($charset eq 'l' && $locale ne 'C')
+                             || ($charset eq 'l' && $char =~ /[[:ascii:]]/)
+                            );
+                        my $name = "(?i$charset), utf8=$utf8, locale=$locale,"
+                                 . " char=" . sprintf "%x", ord $char;
+                        no warnings 'locale';
+                        is (eval " '$s' =~ qr/(?i$charset)$pat/;",
+                            $should_pass, $name);
+                        fail "$name: $@" if $@;
+                        is (eval " 'a$s' =~ qr/(?i$charset)a$pat/;",
+                            $should_pass, "extra a, $name");
+                        fail "$name: $@" if $@;
+                    }
+                }
+            }
+        }
+    }
+
     {
         for my $char (":", uni_to_native("\x{f7}"), "\x{2010}") {
             my $utf8_char = $char;