regcomp.c: Generate EXACTFU_SS only for non-UTF8

author Karl Williamson <khw@cpan.org>

Sun, 16 Dec 2018 04:10:44 +0000 (21:10 -0700)

committer Karl Williamson <khw@cpan.org>

Wed, 26 Dec 2018 19:50:37 +0000 (12:50 -0700)
author Karl Williamson <khw@cpan.org>
Sun, 16 Dec 2018 04:10:44 +0000 (21:10 -0700)
committer Karl Williamson <khw@cpan.org>
Wed, 26 Dec 2018 19:50:37 +0000 (12:50 -0700)
diff --git a/pod/perldebguts.pod b/pod/perldebguts.pod

index b076bcf..f670bb2 100644 (file)
--- a/pod/perldebguts.pod
+++ b/pod/perldebguts.pod
@@ -667,8 +667,8 @@ will be lost.
                               folded length <= unfolded).
  
   EXACTFU_SS       str        Match this string using /iu rules (w/len);
                               folded length <= unfolded).
  
   EXACTFU_SS       str        Match this string using /iu rules (w/len);
-                             (string folded iff in UTF-8; non-UTF8
-                             folded length > unfolded).
+                             (string not UTF-8, only portions guaranteed
+                             to be folded; folded length > unfolded).
   EXACTFLU8        str        Like EXACTFU, but use /il, UTF-8, folded,
                               and everything in it is above 255.
   EXACTFAA_NO_TRIE str        Match this string using /iaa rules (w/len)
   EXACTFLU8        str        Like EXACTFU, but use /il, UTF-8, folded,
                               and everything in it is above 255.
   EXACTFAA_NO_TRIE str        Match this string using /iaa rules (w/len)
diff --git a/regcomp.c b/regcomp.c

index 6f02661..11a155d 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -3856,16 +3856,18 @@ S_construct_ahocorasick_from_trie(pTHX_ RExC_state_t *pRExC_state, regnode *sour
   *      so that the optimizer doesn't reject these possibilities based on size
   *      constraints.
   * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
   *      so that the optimizer doesn't reject these possibilities based on size
   *      constraints.
   * 2)   For the sequence involving the Sharp s (\xDF), the node type EXACTFU_SS
- *      is used for an EXACTFU node that contains at least one "ss" sequence in
- *      it.  For non-UTF-8 patterns and strings, this is the only case where
- *      there is a possible fold length change.  That means that a regular
- *      EXACTFU node without UTF-8 involvement doesn't have to concern itself
- *      with length changes, and so can be processed faster.  regexec.c takes
- *      advantage of this.  Generally, an EXACTFish node that is in UTF-8 is
- *      pre-folded by regcomp.c (except EXACTFL, some of whose folds aren't
- *      known until runtime).  This saves effort in regex matching.  However,
- *      the pre-folding isn't done for non-UTF8 patterns because the fold of
- *      the MICRO SIGN requires UTF-8, and we don't want to slow things down by
+ *      is used in non-UTF-8 patterns for an EXACTFU node that contains at
+ *      least one "ss" sequence in it.  For UTF-8 patterns, the procedures in
+ *      step 1) above are sufficient to handle these, but for non-UTF-8
+ *      patterns and strings, this is the only case where there is a possible
+ *      fold length change.  That means that a regular EXACTFU node without
+ *      UTF-8 involvement doesn't have to concern itself with length changes,
+ *      and so can be processed faster.  regexec.c takes advantage of this.
+ *      Generally, an EXACTFish node that is in UTF-8 is pre-folded by
+ *      regcomp.c (except EXACTFL, some of whose folds aren't known until
+ *      runtime).  This saves effort in regex matching.  However, the
+ *      pre-folding isn't done for non-UTF8 patterns because the fold of the
+ *      MICRO SIGN requires UTF-8, and we don't want to slow things down by
   *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
   *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
   *      possibilities for the non-UTF8 patterns are quite simple, except for
   *      forcing the pattern into UTF8 unless necessary.  Also what EXACTF (and,
   *      again, EXACTFL) nodes fold to isn't known until runtime.  The fold
   *      possibilities for the non-UTF8 patterns are quite simple, except for
@@ -4232,19 +4234,7 @@ S_join_exact(pTHX_ RExC_state_t *pRExC_state, regnode *scan,
                      continue;
                  }
  
                      continue;
                  }
  
-                /* Nodes with 'ss' require special handling, except for
-                 * EXACTFAA-ish for which there is no multi-char fold to this */
-                if (len == 2 && *s == 's' && *(s+1) == 's'
-                    && OP(scan) != EXACTFAA
-                    && OP(scan) != EXACTFAA_NO_TRIE)
-                {
-                    count = 2;
-                    if (OP(scan) != EXACTFL) {
-                        OP(scan) = EXACTFU_SS;
-                    }
-                    s += 2;
-                }
-                else { /* Here is a generic multi-char fold. */
+                { /* Here is a generic multi-char fold. */
                      U8* multi_end  = s + len;
  
                      /* Count how many characters are in it.  In the case of
                      U8* multi_end  = s + len;
  
                      /* Count how many characters are in it.  In the case of
diff --git a/regcomp.sym b/regcomp.sym

index ddf5ba8..235305d 100644 (file)
--- a/regcomp.sym
+++ b/regcomp.sym
@@ -107,7 +107,7 @@ EXACTFAA    EXACT,      str   ; Match this string using /iaa rules (w/len) (stri
  
  # End of important relative ordering.
  
  
  # End of important relative ordering.
  
-EXACTFU_SS  EXACT,      str      ; Match this string using /iu rules (w/len); (string folded iff in UTF-8; non-UTF8 folded length > unfolded).
+EXACTFU_SS  EXACT,      str      ; Match this string using /iu rules (w/len); (string not UTF-8, only portions guaranteed to be folded; folded length > unfolded).
  EXACTFLU8   EXACT,      str      ; Like EXACTFU, but use /il, UTF-8, folded, and everything in it is above 255.
  EXACTFAA_NO_TRIE  EXACT, str     ; Match this string using /iaa rules (w/len) (string not UTF-8, not guaranteed to be folded, not currently trie-able).
  
  EXACTFLU8   EXACT,      str      ; Like EXACTFU, but use /il, UTF-8, folded, and everything in it is above 255.
  EXACTFAA_NO_TRIE  EXACT, str     ; Match this string using /iaa rules (w/len) (string not UTF-8, not guaranteed to be folded, not currently trie-able).
  
diff --git a/regexec.c b/regexec.c

index f7da372..f025965 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -2278,9 +2278,7 @@ S_find_byclass(pTHX_ regexp * prog, const regnode *c, char *s,
          goto do_exactf_non_utf8;
  
      case EXACTFU_SS:
          goto do_exactf_non_utf8;
  
      case EXACTFU_SS:
-        if (is_utf8_pat) {
-            utf8_fold_flags = FOLDEQ_S2_ALREADY_FOLDED;
-        }
+        assert(! is_utf8_pat);
          goto do_exactf_utf8;
  
      case EXACTFLU8:
          goto do_exactf_utf8;
  
      case EXACTFLU8:
@@ -6378,6 +6376,8 @@ S_regmatch(pTHX_ regmatch_info *reginfo, char *startpos, regnode *prog)
             goto do_exactf;
  
         case EXACTFU_SS:         /*  /\x{df}/iu   */
             goto do_exactf;
  
         case EXACTFU_SS:         /*  /\x{df}/iu   */
+            assert(! is_utf8_pat);
+            /* FALLTHROUGH */
         case EXACTFU:            /*  /abc/iu      */
             folder = foldEQ_latin1;
             fold_array = PL_fold_latin1;
         case EXACTFU:            /*  /abc/iu      */
             folder = foldEQ_latin1;
             fold_array = PL_fold_latin1;
@@ -9131,7 +9131,7 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
      I32 hardcount = 0;  /* How many matches so far */
      bool utf8_target = reginfo->is_utf8_target;
      unsigned int to_complement = 0;  /* Invert the result? */
      I32 hardcount = 0;  /* How many matches so far */
      bool utf8_target = reginfo->is_utf8_target;
      unsigned int to_complement = 0;  /* Invert the result? */
-    UV utf8_flags;
+    UV utf8_flags = 0;
      _char_class_number classnum;
  
      PERL_ARGS_ASSERT_REGREPEAT;
      _char_class_number classnum;
  
      PERL_ARGS_ASSERT_REGREPEAT;
@@ -9277,7 +9277,6 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
  
      case EXACTF:   /* This node only generated for non-utf8 patterns */
          assert(! reginfo->is_utf8_pat);
  
      case EXACTF:   /* This node only generated for non-utf8 patterns */
          assert(! reginfo->is_utf8_pat);
-        utf8_flags = 0;
          goto do_exactf;
  
      case EXACTFLU8:
          goto do_exactf;
  
      case EXACTFLU8:
@@ -9296,9 +9295,13 @@ S_regrepeat(pTHX_ regexp *prog, char **startposp, const regnode *p,
         utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
          goto do_exactf;
  
         utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
          goto do_exactf;
  
-    case EXACTFU_SS:
      case EXACTFU:
      case EXACTFU:
-       utf8_flags = reginfo->is_utf8_pat ? FOLDEQ_S2_ALREADY_FOLDED : 0;
+       if (reginfo->is_utf8_pat) {
+            utf8_flags = FOLDEQ_S2_ALREADY_FOLDED;
+        }
+        /* FALLTHROUGH */
+
+    case EXACTFU_SS:
  
        do_exactf: {
          int c1, c2;
  
        do_exactf: {
          int c1, c2;
diff --git a/regnodes.h b/regnodes.h

index 94b4443..9fd01a7 100644 (file)
--- a/regnodes.h
+++ b/regnodes.h
@@ -53,7 +53,7 @@
  #define        EXACTFL                 39      /* 0x27 Match this string using /il rules (w/len); (string not guaranteed to be folded). */
  #define        EXACTFU                 40      /* 0x28 Match this string using /iu rules (w/len); (string folded iff in UTF-8; non-UTF8 folded length <= unfolded). */
  #define        EXACTFAA                41      /* 0x29 Match this string using /iaa rules (w/len) (string folded iff in UTF-8; non-UTF8 folded length <= unfolded). */
  #define        EXACTFL                 39      /* 0x27 Match this string using /il rules (w/len); (string not guaranteed to be folded). */
  #define        EXACTFU                 40      /* 0x28 Match this string using /iu rules (w/len); (string folded iff in UTF-8; non-UTF8 folded length <= unfolded). */
  #define        EXACTFAA                41      /* 0x29 Match this string using /iaa rules (w/len) (string folded iff in UTF-8; non-UTF8 folded length <= unfolded). */
-#define        EXACTFU_SS              42      /* 0x2a Match this string using /iu rules (w/len); (string folded iff in UTF-8; non-UTF8 folded length > unfolded). */
+#define        EXACTFU_SS              42      /* 0x2a Match this string using /iu rules (w/len); (string not UTF-8, only portions guaranteed to be folded; folded length > unfolded). */
  #define        EXACTFLU8               43      /* 0x2b Like EXACTFU, but use /il, UTF-8, folded, and everything in it is above 255. */
  #define        EXACTFAA_NO_TRIE        44      /* 0x2c Match this string using /iaa rules (w/len) (string not UTF-8, not guaranteed to be folded, not currently trie-able). */
  #define        EXACT_ONLY8             45      /* 0x2d Like EXACT, but only UTF-8 encoded targets can match */
  #define        EXACTFLU8               43      /* 0x2b Like EXACTFU, but use /il, UTF-8, folded, and everything in it is above 255. */
  #define        EXACTFAA_NO_TRIE        44      /* 0x2c Match this string using /iaa rules (w/len) (string not UTF-8, not guaranteed to be folded, not currently trie-able). */
  #define        EXACT_ONLY8             45      /* 0x2d Like EXACT, but only UTF-8 encoded targets can match */
author	Karl Williamson <khw@cpan.org>
	Sun, 16 Dec 2018 04:10:44 +0000 (21:10 -0700)
committer	Karl Williamson <khw@cpan.org>
	Wed, 26 Dec 2018 19:50:37 +0000 (12:50 -0700)
pod/perldebguts.pod		patch | blob | blame | history
regcomp.c		patch | blob | blame | history
regcomp.sym		patch | blob | blame | history
regexec.c		patch | blob | blame | history
regnodes.h		patch | blob | blame | history