regexec.c: Stop looking for match sooner

author Karl Williamson <public@khwilliamson.com>

Sun, 16 Oct 2011 17:43:08 +0000 (11:43 -0600)

committer Karl Williamson <public@khwilliamson.com>

Mon, 17 Oct 2011 23:04:28 +0000 (17:04 -0600)
author Karl Williamson <public@khwilliamson.com>
Sun, 16 Oct 2011 17:43:08 +0000 (11:43 -0600)
committer Karl Williamson <public@khwilliamson.com>
Mon, 17 Oct 2011 23:04:28 +0000 (17:04 -0600)
diff --git a/regexec.c b/regexec.c

index 4abb712..6100fce 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -1528,6 +1528,9 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
             break;
  
         do_exactf_utf8:
+       {
+           unsigned expansion;
+
  
             /* If one of the operands is in utf8, we can't use the simpler
              * folding above, due to the fact that many different characters
@@ -1540,13 +1543,33 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                     ? utf8_length((U8 *) pat_string, (U8 *) pat_end)
                     : ln;
  
-           /* Set the end position to the final character available */
-           e = HOP3c(strend, -1, s);
+           /* We have 'lnc' characters to match in the pattern, but because of
+            * multi-character folding, each character in the target can match
+            * up to 3 characters (Unicode guarantees it will never exceed
+            * this) if it is utf8-encoded; and up to 2 if not (based on the
+            * fact that the Latin 1 folds are already determined, and the
+            * only multi-char fold in that range is the sharp-s folding to
+            * 'ss'.  Thus, a pattern character can match as little as 1/3 of a
+            * string character.  Adjust lnc accordingly, always matching at
+            * least 1 */
+           expansion = (utf8_target) ? UTF8_MAX_FOLD_CHAR_EXPAND : 2;
+           lnc = (lnc < expansion) ? 1 : lnc / expansion;
+
+           /* As in the non-UTF8 case, if we have to match 3 characters, and
+            * only 2 are left, it's guaranteed to fail, so don't start a
+            * match that would require us to go beyond the end of the string
+            */
+           e = HOP3c(strend, -((I32)lnc), s);
  
             if (!reginfo && e < s) {
                 e = s;                  /* Due to minlen logic of intuit() */
             }
  
+           /* XXX Note that we could recalculate e every so-often through the
+            * loop to stop earlier, as the worst case expansion above will
+            * rarely be met, and as we go along we would usually find that e
+            * moves further to the left.  Unclear if worth the expense */
+
             while (s <= e) {
                 char *my_strend= (char *)strend;
                 if (foldEQ_utf8_flags(s, &my_strend, 0,  utf8_target,
@@ -1558,6 +1581,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
                 s += UTF8SKIP(s);
             }
             break;
+       }
         case BOUNDL:
             PL_reg_flags |= RF_tainted;
             FBC_BOUND(isALNUM_LC,
diff --git a/t/re/re_tests b/t/re/re_tests

index 9b65f55..7b303c8 100644 (file)
--- a/t/re/re_tests
+++ b/t/re/re_tests
@@ -1546,4 +1546,9 @@ abc\N{def -       c       -       \\N{NAME} must be resolved by the lexer
  /fi/i  \x{FB01}\x{FB00}        y       $&      \x{FB01}
  /fi/i  \x{FB00}\x{FB01}        y       $&      \x{FB01}
  
+# These test that doesn't cut-off matching too soon in the string for
+# multi-char folds
+/ffiffl/i      abcdef\x{FB03}\x{FB04}  y       $&      \x{FB03}\x{FB04}
+/\xdf\xdf/ui   abcdefssss      y       $&      ssss
+
  # vim: softtabstop=0 noexpandtab
author	Karl Williamson <public@khwilliamson.com>
	Sun, 16 Oct 2011 17:43:08 +0000 (11:43 -0600)
committer	Karl Williamson <public@khwilliamson.com>
	Mon, 17 Oct 2011 23:04:28 +0000 (17:04 -0600)
regexec.c		patch \| blob \| blame \| history
t/re/re_tests		patch \| blob \| blame \| history