This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regexec.c: Don't give up on fold matching early
authorKarl Williamson <public@khwilliamson.com>
Sun, 7 Nov 2010 22:25:31 +0000 (15:25 -0700)
committerFather Chrysostomos <sprout@cpan.org>
Mon, 8 Nov 2010 05:42:42 +0000 (21:42 -0800)
As noted in the comments of the code, "a" =~ /[A]/i doesn't work currently
(except that regcomp.c knows about the ASCII characters and corrects for
it, but not always, for example in cases like "a" =~ /\p{Upper}/i.  This
patch catches all those).

It works by computing a list of all characters that (singly) fold to
another one, and then checking each of those.  The maximum length of
the list is 3 in the current Unicode standard.

I believe that a better long-term solution is to do this at compile
rather than execution time, by generating a closure of everything
matched.  But this can't be done now because the data structure would
need to be extensively revamped to list all non-byte characters, and
user-defined \p{} matches are not known at compile-time.

And it doesn't handle the multi-char folds.  There is a separate ticket
for those.

embedvar.h
intrpvar.h
perl.c
regexec.c
sv.c
t/re/reg_fold.t

index 87099c1..36f7575 100644 (file)
 #define PL_utf8_ascii          (vTHX->Iutf8_ascii)
 #define PL_utf8_cntrl          (vTHX->Iutf8_cntrl)
 #define PL_utf8_digit          (vTHX->Iutf8_digit)
+#define PL_utf8_foldclosures   (vTHX->Iutf8_foldclosures)
 #define PL_utf8_graph          (vTHX->Iutf8_graph)
 #define PL_utf8_idcont         (vTHX->Iutf8_idcont)
 #define PL_utf8_idstart                (vTHX->Iutf8_idstart)
 #define PL_Iutf8_ascii         PL_utf8_ascii
 #define PL_Iutf8_cntrl         PL_utf8_cntrl
 #define PL_Iutf8_digit         PL_utf8_digit
+#define PL_Iutf8_foldclosures  PL_utf8_foldclosures
 #define PL_Iutf8_graph         PL_utf8_graph
 #define PL_Iutf8_idcont                PL_utf8_idcont
 #define PL_Iutf8_idstart       PL_utf8_idstart
index d919e1d..1ab1495 100644 (file)
@@ -765,6 +765,10 @@ PERLVAR(Iregistered_mros, HV *)
 /* Compile-time block start/end hooks */
 PERLVAR(Iblockhooks, AV *)
 
+
+/* Everything that folds to a character, for case insensitivity regex matching */
+PERLVARI(Iutf8_foldclosures,   HV *, NULL)
+
 /* If you are adding a U8 or U16, check to see if there are 'Space' comments
  * above on where there are gaps which currently will be structure padding.  */
 
diff --git a/perl.c b/perl.c
index 157cd6b..ed99612 100644 (file)
--- a/perl.c
+++ b/perl.c
@@ -1003,6 +1003,7 @@ perl_destruct(pTHXx)
     SvREFCNT_dec(PL_utf8_tofold);
     SvREFCNT_dec(PL_utf8_idstart);
     SvREFCNT_dec(PL_utf8_idcont);
+    SvREFCNT_dec(PL_utf8_foldclosures);
     PL_utf8_alnum      = NULL;
     PL_utf8_ascii      = NULL;
     PL_utf8_alpha      = NULL;
@@ -1022,6 +1023,7 @@ perl_destruct(pTHXx)
     PL_utf8_tofold     = NULL;
     PL_utf8_idstart    = NULL;
     PL_utf8_idcont     = NULL;
+    PL_utf8_foldclosures = NULL;
 
     if (!specialWARN(PL_compiling.cop_warnings))
        PerlMemShared_free(PL_compiling.cop_warnings);
index 433bbeb..a6da6ce 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -6343,6 +6343,51 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
                        if (swash_fetch(sw, folded, 1)) {   /* 1 => is utf8 */
                            match = TRUE;
                        }
+                       else {
+                           SV** listp;
+
+                            /* Consider "k" =~ /[K]/i.  The line above would
+                             * have just folded the 'k' to itself, and that
+                             * isn't going to match 'K'.  So we look through
+                             * the closure of everything that folds to 'k'.
+                             * That will find the 'K'.  Initialize the list, if
+                             * necessary */
+                           if (! PL_utf8_foldclosures) {
+
+                               /* If the folds haven't been read in, call a fold
+                            * function to force that */
+                               if (! PL_utf8_tofold) {
+                                   U8 dummy[UTF8_MAXBYTES+1];
+                                   STRLEN dummy_len;
+                                   to_utf8_fold((U8*) "A", dummy, &dummy_len);
+                               }
+                               PL_utf8_foldclosures =
+                                       _swash_inversion_hash(PL_utf8_tofold);
+                           }
+
+                            /* The data structure is a hash with the keys every
+                             * character that is folded to, like 'k', and the
+                             * values each an array of everything that folds to
+                             * its key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
+                           if ((listp = hv_fetch(PL_utf8_foldclosures,
+                                           (char *) folded, foldlen, FALSE)))
+                           {
+                               AV* list = (AV*) *listp;
+                               IV i;
+                               for (i = 0; i <= av_len(list); i++) {
+                                   SV** try_p = av_fetch(list, i, FALSE);
+                                   if (try_p == NULL) {
+                                       Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+                                   }
+                                   /* Don't have to worry about embeded nulls
+                                    * since NULL isn't folded or foldable */
+                                   if (swash_fetch(sw, (U8*) SvPVX(*try_p),1)) {
+                                       match = TRUE;
+                                       break;
+                                   }
+                               }
+                           }
+                       }
                    }
                }
 
diff --git a/sv.c b/sv.c
index f3010af..e2d498d 100644 (file)
--- a/sv.c
+++ b/sv.c
@@ -13157,6 +13157,7 @@ perl_clone_using(PerlInterpreter *proto_perl, UV flags,
 
     PL_registered_mros  = hv_dup_inc(proto_perl->Iregistered_mros, param);
     PL_blockhooks      = av_dup_inc(proto_perl->Iblockhooks, param);
+    PL_utf8_foldclosures = hv_dup_inc(proto_perl->Iutf8_foldclosures, param);
 
     /* Call the ->CLONE method, if it exists, for each of the stashes
        identified by sv_dup() above.
index 1c7dfe8..af5ba28 100644 (file)
@@ -72,6 +72,12 @@ while (<$fh>) {
         }
     }
 }
+
+push @tests, qq[like chr(0x0430), qr/[=\x{0410}-\x{0411}]/i, 'Bug #71752 Unicode /i char in a range'];
+$count++;
+push @tests, qq[like 'a', qr/\\p{Upper}/i, "'a' =~ /\\\\p{Upper}/i"];
+$count++;
+
 eval join ";\n","plan tests=>".($count-1),@tests,"1"
     or die $@;
 __DATA__