This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regex: Multi-char /i shouldn't match single char []
author Karl Williamson <public@khwilliamson.com>
Wed, 15 Dec 2010 22:22:38 +0000 (15:22 -0700)
committer Karl Williamson <public@khwilliamson.com>
Wed, 15 Dec 2010 23:24:37 +0000 (16:24 -0700)
":\N{LATIN SMALL LIGATURE ST}:" !~ /:[_st]:/i

because its fold is two characters, but previously it did match.  The cause of
this is that the code was using swash_fetch() to test if the fold
matches.  But swash_fetch() only checks if the first character matches,
not all characters, so it was falsely returning true.

This is an intermediate commit, with some cleanup of blocks, comments,
and accompanying indentation of regexec.c to follow immediately.

regexec.c
t/uni/fold.t

index f2723e4..cc31b87 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -6444,30 +6444,8 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
                        /* See if the folded version matches */
                        STRLEN foldlen;
                        to_utf8_fold(utf8_p, folded, &foldlen);
-                       if (swash_fetch(sw, folded, 1)) {   /* 1 => is utf8 */
-                           match = TRUE;
-                       }
-                       else {
-                           /* The fold in a few cases  of an above Latin1 char
-                            * is in the Latin1 range, and hence may be in the
-                            * bitmap */
-                           if (UTF8_IS_INVARIANT(*folded)
-                               && ANYOF_BITMAP_TEST(n, UNI_TO_NATIVE(*folded)))
                            {
-                               match = TRUE;
-                           }
-                           else if (UTF8_IS_DOWNGRADEABLE_START(*folded)
-                                    && ANYOF_BITMAP_TEST(n,
-                                         UNI_TO_NATIVE(
-                                            TWO_BYTE_UTF8_TO_UNI(folded[0],
-                                                                  folded[1]))))
-                           { /* Since the fold comes from internally
-                              * generated data, we can safely assume it is
-                              * valid utf8 in the test above */
 
-                               match = TRUE;
-                           }
-                            if (! match) {
                                SV** listp;
 
                                /* Consider "k" =~ /[K]/i.  The line above
@@ -6510,6 +6488,10 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
                                         * nulls since NULL isn't folded or
                                         * foldable */
                                        try_c = SvPVX(*try_p);
+
+                                       /* The fold in a few cases  of an above Latin1 char
+                                        * is in the Latin1 range, and hence may be in the
+                                        * bitmap */
                                        if (UTF8_IS_INVARIANT(*try_c)
                                            && ANYOF_BITMAP_TEST(n,
                                                            UNI_TO_NATIVE(*try_c)))
@@ -6524,6 +6506,9 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
                                                TWO_BYTE_UTF8_TO_UNI(try_c[0],
                                                                     try_c[1]))))
                                        {
+                                          /* Since the fold comes from internally
+                                           * generated data, we can safely assume it is
+                                           * valid utf8 in the test above */
                                            match = TRUE;
                                            break;
                                        } else if (swash_fetch(sw,
@@ -6534,7 +6519,6 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
                                        }
                                    }
                                }
-                           }
                         }
                    }
                }
index c841614..52417c1 100644 (file)
@@ -97,13 +97,13 @@ foreach my $test_ref (@CF) {
         }
         my $test;
 
+        # A multi-char fold should not match just one char;
+        # e.g., ":ß:" !~ /:[_s]:/i
+        $test = qq[":$c:" !~ /:[_$f]:/i];
+        ok eval $test, "$code - $name - $mapping - $type - $test";
+
         local $TODO = 'Multi-char fold in [character class]';
 
-        TODO: { # e.g., ":ß:" !~ /:[_s]:/i  # A multi-char fold should not
-                                            # match just one char
-            $test = qq[":$c:" !~ /:[_$f]:/i];
-            ok eval $test, "$code - $name - $mapping - $type - $test";
-        }
         TODO: { # e.g., ":ß:" =~ /:[_s]{2}:/i
             $test = qq[":$c:" =~ /:[_$f]{$f_length}:/i];
             ok eval $test, "$code - $name - $mapping - $type - $test";