This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: tell regexec more about multi-char folds
author: Karl Williamson <public@khwilliamson.com>
Mon, 7 Feb 2011 18:11:16 +0000 (11:11 -0700)
committer: Karl Williamson <public@khwilliamson.com>
Mon, 14 Feb 2011 15:41:36 +0000 (08:41 -0700)
A multi-char fold that matches in the Latin1 range needs to have that
fact communicated to regexec.

regcomp.c

index 50b8877..49db5ea 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -9790,8 +9790,31 @@ parseit:
                         * these multicharacter foldings, to be later saved as
                         * part of the additional "s" data. */
                        if (! RExC_in_lookbehind) {
-                           /* XXX Discard this fold if any are latin1 and LOC */
                            SV *sv;
+                           U8* loc = foldbuf;
+                           U8* e = foldbuf + foldlen;
+
+                           /* If any of the folded characters of this are in
+                            * the Latin1 range, tell the regex engine that
+                            * this can match a non-utf8 target string.  The
+                            * multi-byte fold whose source is in the
+                            * Latin1 range (U+00DF) applies only when the
+                            * target string is utf8, or under unicode rules */
+                           if (j > 255 || AT_LEAST_UNI_SEMANTICS) {
+                               while (loc < e) {
+                                   /* XXX Discard this fold if any are latin1
+                                    * and LOC */
+                                   if (UTF8_IS_INVARIANT(*loc)
+                                       || UTF8_IS_DOWNGRADEABLE_START(*loc))
+                                   {
+                                       ANYOF_FLAGS(ret)
+                                               |= ANYOF_NONBITMAP_NON_UTF8;
+                                       break;
+                                   }
+                                   loc += UTF8SKIP(loc);
+                               }
+                           }
+                           ANYOF_FLAGS(ret) |= ANYOF_UTF8;
 
                            if (!unicode_alternate) {
                                unicode_alternate = newAV();
@@ -9801,7 +9824,6 @@ parseit:
 
                            /* This node is variable length */
                            OP(ret) = ANYOFV;
-                           ANYOF_FLAGS(ret) |= ANYOF_UTF8;
                        }
                    }
                    else { /* Single character fold */