This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Move ANYOF folding from regexec to regcomp
authorKarl Williamson <public@khwilliamson.com>
Wed, 2 Feb 2011 19:01:34 +0000 (12:01 -0700)
committerKarl Williamson <public@khwilliamson.com>
Wed, 2 Feb 2011 23:31:23 +0000 (16:31 -0700)
This is for security as well as performance.  It allows Unicode properties to
not be matched case sensitively.  As a result the swash inversion hash is
converted from having utf8 keys to numeric, code point, keys.

It also for the first time fixes the bug where /i doesn't work for a code point
not at the end of a range in a bracketed character class has a multi-character
fold

lib/unicore/mktables
pod/perldelta.pod
pod/perlrecharclass.pod
pod/perlunicode.pod
regcomp.c
regexec.c
utf8.c

index 5e3f0c5..ab90ba3 100644 (file)
@@ -12395,6 +12395,7 @@ sub make_table_pod_entries($) {
     my $string_count = clarify_number($count);
     my $status = $input_table->status;
     my $status_info = $input_table->status_info;
     my $string_count = clarify_number($count);
     my $status = $input_table->status;
     my $status_info = $input_table->status_info;
+    my $caseless_equivalent = $input_table->caseless_equivalent;
 
     my $entry_for_first_table; # The entry for the first table output.
                            # Almost certainly, it is the parent.
 
     my $entry_for_first_table; # The entry for the first table output.
                            # Almost certainly, it is the parent.
@@ -12618,6 +12619,11 @@ sub make_table_pod_entries($) {
                 }
             }
 
                 }
             }
 
+            if ($caseless_equivalent != 0) {
+                $parenthesized .=  '; ' if $parenthesized ne "";
+                $parenthesized .= "/i= " . $caseless_equivalent->complete_name;
+            }
+
 
             # Warn if this property isn't the same as one that a
             # semi-casual user might expect.  The other components of this
 
             # Warn if this property isn't the same as one that a
             # semi-casual user might expect.  The other components of this
@@ -13060,6 +13066,15 @@ Numbers in (parentheses) indicate the total number of code points matched by
 the property.  For emphasis, those properties that match no code points at all
 are listed as well in a separate section following the table.
 
 the property.  For emphasis, those properties that match no code points at all
 are listed as well in a separate section following the table.
 
+Most properties match the same code points regardless of whether C<"/i">
+case-insensitive matching is specified or not.  But a few properties are
+affected.  These are shown with the notation
+
+ (/i= other_property)
+
+in the second column.  Under case-insensitive matching they match the
+same code pode points as the property "other_property".
+
 There is no description given for most non-Perl defined properties (See
 $unicode_reference_url for that).
 
 There is no description given for most non-Perl defined properties (See
 $unicode_reference_url for that).
 
index 9d6e258..3f0d2d8 100644 (file)
@@ -51,6 +51,30 @@ XXX For a release on a stable branch, this section aspires to be:
 
 [ List each incompatible change as a =head2 entry ]
 
 
 [ List each incompatible change as a =head2 entry ]
 
+=head2 Most C<\p{}> properties are now immune from case-insensitive matching
+
+For most Unicode properties, it doesn't make sense to have them match
+differently under C</i> case-insensitive matching than not.  And doing
+so leads to unexpected results and potential security holes.  For
+example
+
+ m/\p{ASCII_Hex_Digit}+/i
+
+could previously match non-ASCII characters because of the Unicode
+matching rules.  There were a number of bugs in this feature until an
+earlier release in the 5.13 series.  Now this release reverts, and
+removes the feature completely except for the few properties where
+people have come to expect it, namely the ones where casing is an
+integral part of their functionality, such as C<m/\p{Uppercase}/i> and
+C<m/\p{Lowercase}/i>, both of which match the exact same code points,
+namely those matched by C<m/\p{Cased}/i>.  Details are in
+L<perlrecharclass/Unicode Properties>.
+
+User-defined property handlers that need to match differently under
+C</i> must change to read the new boolean parameter passed it which is
+non-zero if case-insensitive matching is in effect; 0 if not.  See
+L<perluniprops/User-Defined Character Properties>.
+
 =head1 Deprecations
 
 XXX Any deprecated features, syntax, modules etc. should be listed here.
 =head1 Deprecations
 
 XXX Any deprecated features, syntax, modules etc. should be listed here.
index e7f6f8a..de2256f 100644 (file)
@@ -273,13 +273,32 @@ C</\pLl/> is valid, but means something different.
 It matches a two character string: a letter (Unicode property C<\pL>),
 followed by a lowercase C<l>.
 
 It matches a two character string: a letter (Unicode property C<\pL>),
 followed by a lowercase C<l>.
 
-For more details, see L<perlunicode/Unicode Character Properties>; for a
+Note that almost all properties are immune to case-insensitive matching.
+That is, adding a C</i> regular expression modifier does not change what
+they match.  There are two sets that are affected.  The first set is
+C<Uppercase_Letter>,
+C<Lowercase_Letter>,
+and C<Titlecase_Letter>,
+all of which match C<Cased_Letter> under C</i> matching.
+And the second set is
+C<Uppercase>,
+C<Lowercase>,
+and C<Titlecase>,
+all of which match C<Cased> under C</i> matching.
+(The difference between these sets is that some things, such as Roman
+Numerals come in both upper and lower case so they are C<Cased>, but
+aren't considered to be letters, so they aren't C<Cased_Letter>s.)
+This set also includes its subsets C<PosixUpper> and C<PosixLower> both
+of which under C</i> matching match C<PosixAlpha>.
+
+For more details on Unicode properties, see L<perlunicode/Unicode
+Character Properties>; for a
 complete list of possible properties, see
 complete list of possible properties, see
-L<perluniprops/Properties accessible through \p{} and \P{}>.
+L<perluniprops/Properties accessible through \p{} and \P{}>,
+which notes all forms that have C</i> differences.
 It is also possible to define your own properties. This is discussed in
 L<perlunicode/User-Defined Character Properties>.
 
 It is also possible to define your own properties. This is discussed in
 L<perlunicode/User-Defined Character Properties>.
 
-
 =head4 Examples
 
  "a"  =~  /\w/      # Match, "a" is a 'word' character.
 =head4 Examples
 
  "a"  =~  /\w/      # Match, "a" is a 'word' character.
index 360af1d..7f3a795 100644 (file)
@@ -349,6 +349,27 @@ You can also use negation in both C<\p{}> and C<\P{}> by introducing a caret
 (^) between the first brace and the property name: C<\p{^Tamil}> is
 equal to C<\P{Tamil}>.
 
 (^) between the first brace and the property name: C<\p{^Tamil}> is
 equal to C<\P{Tamil}>.
 
+Almost all properties are immune to case-insensitive matching.  That is,
+adding a C</i> regular expression modifier does not change what they
+match.  There are two sets that are affected.
+The first set is
+C<Uppercase_Letter>,
+C<Lowercase_Letter>,
+and C<Titlecase_Letter>,
+all of which match C<Cased_Letter> under C</i> matching.
+And the second set is
+C<Uppercase>,
+C<Lowercase>,
+and C<Titlecase>,
+all of which match C<Cased> under C</i> matching.
+This set also includes its subsets C<PosixUpper> and C<PosixLower> both
+of which under C</i> matching match C<PosixAlpha>.
+(The difference between these sets is that some things, such as Roman
+Numerals come in both upper and lower case so they are C<Cased>, but aren't considered to be
+letters, so they aren't C<Cased_Letter>s.)
+L<perluniprops> includes a notation for all forms that have C</i>
+differences.
+
 =head3 B<General_Category>
 
 Every Unicode character is assigned a general category, which is the "most
 =head3 B<General_Category>
 
 Every Unicode character is assigned a general category, which is the "most
@@ -773,6 +794,12 @@ C<\p> or C<\P> construct.
 
 
 Note that the effect is compile-time and immutable once defined.
 
 
 Note that the effect is compile-time and immutable once defined.
+However the subroutines are passed a single parameter which is 0 if
+case-sensitive matching is in effect, and non-zero if caseless matching
+is in effect.  The subroutine may return different values depending on
+the value of the flag, and one set of values will immutably be in effect
+for all case-sensitive matches; the other set for all case-insensitive
+matches.
 
 The subroutines must return a specially-formatted string, with one
 or more newline-separated lines.  Each line must be one of the following:
 
 The subroutines must return a specially-formatted string, with one
 or more newline-separated lines.  Each line must be one of the following:
index 0c5ad08..8007dd3 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -8872,14 +8872,14 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
 ANYOF_##NAME:                                                                  \
        for (value = 0; value < 256; value++)                                  \
            if (TEST)                                                          \
 ANYOF_##NAME:                                                                  \
        for (value = 0; value < 256; value++)                                  \
            if (TEST)                                                          \
-           stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);  \
+           stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);  \
     yesno = '+';                                                               \
     what = WORD;                                                               \
     break;                                                                     \
 case ANYOF_N##NAME:                                                            \
        for (value = 0; value < 256; value++)                                  \
            if (!TEST)                                                         \
     yesno = '+';                                                               \
     what = WORD;                                                               \
     break;                                                                     \
 case ANYOF_N##NAME:                                                            \
        for (value = 0; value < 256; value++)                                  \
            if (!TEST)                                                         \
-           stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);  \
+           stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);  \
     yesno = '!';                                                               \
     what = WORD;                                                               \
     break
     yesno = '!';                                                               \
     what = WORD;                                                               \
     break
@@ -8894,14 +8894,14 @@ ANYOF_##NAME:                                                                  \
     else if (UNI_SEMANTICS) {                                                  \
         for (value = 0; value < 256; value++) {                                \
             if (TEST_8(value)) stored +=                                       \
     else if (UNI_SEMANTICS) {                                                  \
         for (value = 0; value < 256; value++) {                                \
             if (TEST_8(value)) stored +=                                       \
-                      S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);  \
+                      S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);  \
         }                                                                      \
     }                                                                          \
     else {                                                                     \
         for (value = 0; value < 128; value++) {                                \
             if (TEST_7(UNI_TO_NATIVE(value))) stored +=                        \
                S_set_regclass_bit(aTHX_ pRExC_state, ret,                     \
         }                                                                      \
     }                                                                          \
     else {                                                                     \
         for (value = 0; value < 128; value++) {                                \
             if (TEST_7(UNI_TO_NATIVE(value))) stored +=                        \
                S_set_regclass_bit(aTHX_ pRExC_state, ret,                     \
-                                  (U8) UNI_TO_NATIVE(value));                 \
+                                  (U8) UNI_TO_NATIVE(value), &nonbitmap);                 \
         }                                                                      \
     }                                                                          \
     yesno = '+';                                                               \
         }                                                                      \
     }                                                                          \
     yesno = '+';                                                               \
@@ -8912,18 +8912,18 @@ case ANYOF_N##NAME:                                                            \
     else if (UNI_SEMANTICS) {                                                  \
         for (value = 0; value < 256; value++) {                                \
             if (! TEST_8(value)) stored +=                                     \
     else if (UNI_SEMANTICS) {                                                  \
         for (value = 0; value < 256; value++) {                                \
             if (! TEST_8(value)) stored +=                                     \
-                   S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);    \
+                   S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);    \
         }                                                                      \
     }                                                                          \
     else {                                                                     \
         for (value = 0; value < 128; value++) {                                \
             if (! TEST_7(UNI_TO_NATIVE(value))) stored += S_set_regclass_bit(  \
         }                                                                      \
     }                                                                          \
     else {                                                                     \
         for (value = 0; value < 128; value++) {                                \
             if (! TEST_7(UNI_TO_NATIVE(value))) stored += S_set_regclass_bit(  \
-                       aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value));    \
+                       aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap);    \
         }                                                                      \
        if (ASCII_RESTRICTED) {                                                \
            for (value = 128; value < 256; value++) {                          \
              stored += S_set_regclass_bit(                                     \
         }                                                                      \
        if (ASCII_RESTRICTED) {                                                \
            for (value = 128; value < 256; value++) {                          \
              stored += S_set_regclass_bit(                                     \
-                          aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value)); \
+                          aTHX_ pRExC_state, ret, (U8) UNI_TO_NATIVE(value), &nonbitmap); \
            }                                                                  \
            ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL|ANYOF_UTF8;                  \
        }                                                                      \
            }                                                                  \
            ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL|ANYOF_UTF8;                  \
        }                                                                      \
@@ -8957,7 +8957,7 @@ case ANYOF_N##NAME:                                                            \
 #endif
 
 STATIC U8
 #endif
 
 STATIC U8
-S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
+S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr)
 {
 
     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
 {
 
     /* Handle the setting of folds in the bitmap for non-locale ANYOF nodes.
@@ -8991,6 +8991,10 @@ S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8
            don't have unicode semantics for the above ASCII Latin-1 characters,
            and they have a fold, they should match if the target is utf8, and
            not otherwise */
            don't have unicode semantics for the above ASCII Latin-1 characters,
            and they have a fold, they should match if the target is utf8, and
            not otherwise */
+       if (! *nonbitmap_ptr) {
+           *nonbitmap_ptr = _new_invlist(2);
+       }
+       *nonbitmap_ptr = add_range_to_invlist(*nonbitmap_ptr, value, value);
        ANYOF_FLAGS(node) |= ANYOF_UTF8;
     }
 
        ANYOF_FLAGS(node) |= ANYOF_UTF8;
     }
 
@@ -8999,7 +9003,7 @@ S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8
 
 
 PERL_STATIC_INLINE U8
 
 
 PERL_STATIC_INLINE U8
-S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
+S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value, HV** nonbitmap_ptr)
 {
     /* This inline function sets a bit in the bitmap if not already set, and if
      * appropriate, its fold, returning the number of bits that actually
 {
     /* This inline function sets a bit in the bitmap if not already set, and if
      * appropriate, its fold, returning the number of bits that actually
@@ -9015,7 +9019,7 @@ S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 valu
     stored = 1;
 
     if (FOLD && ! LOC) {       /* Locale folds aren't known until runtime */
     stored = 1;
 
     if (FOLD && ! LOC) {       /* Locale folds aren't known until runtime */
-       stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value);
+       stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value, nonbitmap_ptr);
     }
 
     return stored;
     }
 
     return stored;
@@ -9043,6 +9047,7 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
     bool need_class = 0;
     SV *listsv = NULL;
     UV n;
     bool need_class = 0;
     SV *listsv = NULL;
     UV n;
+    HV* nonbitmap = NULL;
     AV* unicode_alternate  = NULL;
 #ifdef EBCDIC
     UV literal_endpoint = 0;
     AV* unicode_alternate  = NULL;
 #ifdef EBCDIC
     UV literal_endpoint = 0;
@@ -9064,8 +9069,10 @@ S_regclass(pTHX_ RExC_state_t *pRExC_state, U32 depth)
     /* Assume we are going to generate an ANYOF node. */
     ret = reganode(pRExC_state, ANYOF, 0);
 
     /* Assume we are going to generate an ANYOF node. */
     ret = reganode(pRExC_state, ANYOF, 0);
 
-    if (!SIZE_ONLY)
+
+    if (!SIZE_ONLY) {
        ANYOF_FLAGS(ret) = 0;
        ANYOF_FLAGS(ret) = 0;
+    }
 
     if (UCHARAT(RExC_parse) == '^') {  /* Complement of range. */
        RExC_naughty++;
 
     if (UCHARAT(RExC_parse) == '^') {  /* Complement of range. */
        RExC_naughty++;
@@ -9350,9 +9357,9 @@ parseit:
 
                    if (prevvalue < 256) {
                        stored +=
 
                    if (prevvalue < 256) {
                        stored +=
-                         S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue);
+                         S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) prevvalue, &nonbitmap);
                        stored +=
                        stored +=
-                         S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
+                         S_set_regclass_bit(aTHX_ pRExC_state, ret, '-', &nonbitmap);
                    }
                    else {
                        ANYOF_FLAGS(ret) |= ANYOF_UTF8;
                    }
                    else {
                        ANYOF_FLAGS(ret) |= ANYOF_UTF8;
@@ -9404,7 +9411,7 @@ parseit:
                    else {
                        for (value = 0; value < 128; value++)
                            stored +=
                    else {
                        for (value = 0; value < 128; value++)
                            stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap);
                    }
                    yesno = '+';
                    what = NULL;        /* Doesn't match outside ascii, so
                    }
                    yesno = '+';
                    what = NULL;        /* Doesn't match outside ascii, so
@@ -9416,7 +9423,7 @@ parseit:
                    else {
                        for (value = 128; value < 256; value++)
                            stored +=
                    else {
                        for (value = 128; value < 256; value++)
                            stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value));
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) ASCII_TO_NATIVE(value), &nonbitmap);
                    }
                    ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
                    yesno = '!';
                    }
                    ANYOF_FLAGS(ret) |= ANYOF_UNICODE_ALL;
                    yesno = '!';
@@ -9429,7 +9436,7 @@ parseit:
                        /* consecutive digits assumed */
                        for (value = '0'; value <= '9'; value++)
                            stored +=
                        /* consecutive digits assumed */
                        for (value = '0'; value <= '9'; value++)
                            stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);
                    }
                    yesno = '+';
                    what = POSIX_CC_UNI_NAME("Digit");
                    }
                    yesno = '+';
                    what = POSIX_CC_UNI_NAME("Digit");
@@ -9441,10 +9448,10 @@ parseit:
                        /* consecutive digits assumed */
                        for (value = 0; value < '0'; value++)
                            stored +=
                        /* consecutive digits assumed */
                        for (value = 0; value < '0'; value++)
                            stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);
                        for (value = '9' + 1; value < 256; value++)
                            stored +=
                        for (value = '9' + 1; value < 256; value++)
                            stored +=
-                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value);
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) value, &nonbitmap);
                    }
                    yesno = '!';
                    what = POSIX_CC_UNI_NAME("Digit");
                    }
                    yesno = '!';
                    what = POSIX_CC_UNI_NAME("Digit");
@@ -9494,13 +9501,17 @@ parseit:
                    }
                    if (!SIZE_ONLY)
                        stored +=
                    }
                    if (!SIZE_ONLY)
                        stored +=
-                            S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
+                            S_set_regclass_bit(aTHX_ pRExC_state, ret, '-', &nonbitmap);
                } else
                    range = 1;  /* yeah, it's a range! */
                continue;       /* but do it the next time */
            }
        }
 
                } else
                    range = 1;  /* yeah, it's a range! */
                continue;       /* but do it the next time */
            }
        }
 
+       if (value > 255) {
+           RExC_uni_semantics = 1;
+       }
+
        /* now is the next time */
        if (!SIZE_ONLY) {
            if (prevvalue < 256) {
        /* now is the next time */
        if (!SIZE_ONLY) {
            if (prevvalue < 256) {
@@ -9517,25 +9528,32 @@ parseit:
                        for (i = prevvalue; i <= ceilvalue; i++)
                            if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
                                stored +=
                        for (i = prevvalue; i <= ceilvalue; i++)
                            if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
                                stored +=
-                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
+                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i, &nonbitmap);
                            }
                    } else {
                        for (i = prevvalue; i <= ceilvalue; i++)
                            if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
                                stored +=
                            }
                    } else {
                        for (i = prevvalue; i <= ceilvalue; i++)
                            if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
                                stored +=
-                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
+                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i, &nonbitmap);
                            }
                    }
                }
                else
 #endif
                      for (i = prevvalue; i <= ceilvalue; i++) {
                            }
                    }
                }
                else
 #endif
                      for (i = prevvalue; i <= ceilvalue; i++) {
-                       stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i);
+                       stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) i, &nonbitmap);
                      }
          }
                      }
          }
-         if (value > 255 || UTF) {
-               const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
-               const UV natvalue      = NATIVE_TO_UNI(value);
+         if (value > 255) {
+           const UV prevnatvalue  = NATIVE_TO_UNI(prevvalue);
+           const UV natvalue      = NATIVE_TO_UNI(value);
+           if (! nonbitmap) {
+               nonbitmap = _new_invlist(2);
+           }
+           nonbitmap = add_range_to_invlist(nonbitmap, prevnatvalue, natvalue);
+           ANYOF_FLAGS(ret) |= ANYOF_UTF8;
+       }
+#if 0
 
                /* If the code point requires utf8 to represent, and we are not
                 * folding, it can't match unless the target is in utf8.  Only
 
                /* If the code point requires utf8 to represent, and we are not
                 * folding, it can't match unless the target is in utf8.  Only
@@ -9632,6 +9650,7 @@ parseit:
                    }
                }
            }
                    }
                }
            }
+#endif
 #ifdef EBCDIC
            literal_endpoint = 0;
 #endif
 #ifdef EBCDIC
            literal_endpoint = 0;
 #endif
@@ -9646,6 +9665,167 @@ parseit:
         return ret;
     /****** !SIZE_ONLY AFTER HERE *********/
 
         return ret;
     /****** !SIZE_ONLY AFTER HERE *********/
 
+    /* Finish up the non-bitmap entries */
+    if (nonbitmap) {
+       UV* nonbitmap_array;
+       UV i;
+
+       /* If folding, we add to the list all characters that could fold to or
+        * from the ones already on the list */
+       if (FOLD) {
+           HV* fold_intersection;
+           UV* fold_list;
+
+           /* This is a list of all the characters that participate in folds
+            * (except marks, etc in multi-char folds */
+           if (! PL_utf8_foldable) {
+               SV* swash = swash_init("utf8", "Cased", &PL_sv_undef, 1, 0);
+               PL_utf8_foldable = _swash_to_invlist(swash);
+           }
+
+           /* This is a hash that for a particular fold gives all characters
+            * that are involved in it */
+           if (! PL_utf8_foldclosures) {
+
+               /* If the folds haven't been read in, call a fold
+                   * function to force that */
+               if (! PL_utf8_tofold) {
+                   U8 dummy[UTF8_MAXBYTES+1];
+                   STRLEN dummy_len;
+                   to_utf8_fold((U8*) "A", dummy, &dummy_len);
+               }
+               PL_utf8_foldclosures = _swash_inversion_hash(PL_utf8_tofold);
+           }
+
+           /* Only the characters in this class that participate in folds need
+            * be checked.  Get the intersection of this class and all the
+            * possible characters that are foldable.  This can quickly narrow
+            * down a large class */
+           fold_intersection = invlist_intersection(PL_utf8_foldable, nonbitmap);
+
+           /* Now look at the foldable characters in this class individually */
+           fold_list = invlist_array(fold_intersection);
+           for (i = 0; i < invlist_len(fold_intersection); i++) {
+               UV j;
+
+               /* The next entry is the beginning of the range that is in the
+                * class */
+               UV start = fold_list[i++];
+
+
+               /* The next entry is the beginning of the next range, which
+                * isn't in the class, so the end of the current range is one
+                * less than that */
+               UV end = fold_list[i] - 1;
+
+               /* Look at every character in the range */
+               for (j = start; j <= end; j++) {
+
+                   /* Get its fold */
+                   U8 foldbuf[UTF8_MAXBYTES_CASE+1];
+                   STRLEN foldlen;
+                   const UV f = to_uni_fold(j, foldbuf, &foldlen);
+
+                   if (foldlen > (STRLEN)UNISKIP(f)) {
+
+                       /* Any multicharacter foldings (disallowed in
+                        * lookbehind patterns) require the following
+                        * transform: [ABCDEF] -> (?:[ABCabcDEFd]|pq|rst) where
+                        * E folds into "pq" and F folds into "rst", all other
+                        * characters fold to single characters.  We save away
+                        * these multicharacter foldings, to be later saved as
+                        * part of the additional "s" data. */
+                       if (! RExC_in_lookbehind) {
+                           /* XXX Discard this fold if any are latin1 and LOC */
+                           SV *sv;
+
+                           if (!unicode_alternate) {
+                               unicode_alternate = newAV();
+                           }
+                           sv = newSVpvn_utf8((char*)foldbuf, foldlen, TRUE);
+                           av_push(unicode_alternate, sv);
+
+                           /* This node is variable length */
+                           OP(ret) = ANYOFV;
+                           ANYOF_FLAGS(ret) |= ANYOF_UNICODE;
+                       }
+                   }
+                   else { /* Single character fold */
+                       SV** listp;
+
+                       /* Consider "k" =~ /[K]/i.  The line above would have
+                        * just folded the 'k' to itself, and that isn't going
+                        * to match 'K'.  So we look through the closure of
+                        * everything that folds to 'k'.  That will find the
+                        * 'K'.  Initialize the list, if necessary */
+
+                       /* The data structure is a hash with the keys every
+                        * character that is folded to, like 'k', and the
+                        * values each an array of everything that folds to its
+                        * key.  e.g. [ 'k', 'K', KELVIN_SIGN ] */
+                       if ((listp = hv_fetch(PL_utf8_foldclosures,
+                                     (char *) foldbuf, foldlen, FALSE)))
+                       {
+                           AV* list = (AV*) *listp;
+                           IV k;
+                           for (k = 0; k <= av_len(list); k++) {
+                               SV** c_p = av_fetch(list, k, FALSE);
+                               UV c;
+                               if (c_p == NULL) {
+                                   Perl_croak(aTHX_ "panic: invalid PL_utf8_foldclosures structure");
+                               }
+                               c = SvUV(*c_p);
+
+                               if (c < 256 && AT_LEAST_UNI_SEMANTICS) {
+                                   stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, (U8) c, &nonbitmap);
+                               }
+                                   /* It may be that the code point is already
+                                    * in this range or already in the bitmap,
+                                    * XXX THink about LOC
+                                    * in which case we need do nothing */
+                               else if ((c < start || c > end)
+                                        && (c > 255
+                                            || ! ANYOF_BITMAP_TEST(ret, c)))
+                               {
+                                   nonbitmap = add_range_to_invlist(nonbitmap, c, c);
+                               }
+                           }
+                       }
+                   }
+               }
+           }
+           invlist_destroy(fold_intersection);
+       } /* End of processing all the folds */
+
+       /*  Here have the full list of items to match that aren't in the
+        *  bitmap.  Convert to the structure that the rest of the code is
+        *  expecting.   XXX That rest of the code should convert to this
+        *  structure */
+       nonbitmap_array = invlist_array(nonbitmap);
+       for (i = 0; i < invlist_len(nonbitmap); i++) {
+
+           /* The next entry is the beginning of the range that is in the
+            * class */
+           UV start = nonbitmap_array[i++];
+
+           /* The next entry is the beginning of the next range, which isn't
+            * in the class, so the end of the current range is one less than
+            * that */
+           UV end = nonbitmap_array[i] - 1;
+
+           if (start == end) {
+               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\n", start);
+           }
+           else {
+               /* The \t sets the whole range */
+               Perl_sv_catpvf(aTHX_ listsv, "%04"UVxf"\t%04"UVxf"\n",
+                       /* XXX EBCDIC */
+                                  start, end);
+           }
+       }
+       invlist_destroy(nonbitmap);
+    }
+
     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
      * set the FOLD flag yet, so this this does optimize those.  It doesn't
      * optimize locale.  Doing so perhaps could be done as long as there is
     /* Optimize inverted simple patterns (e.g. [^a-z]).  Note that we haven't
      * set the FOLD flag yet, so this this does optimize those.  It doesn't
      * optimize locale.  Doing so perhaps could be done as long as there is
index b59b8bf..476a966 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -6813,6 +6813,7 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
                            }
                        }
                    }
                            }
                        }
                    }
+#if 0
                    if (!match) { /* See if the folded version matches */
                        SV** listp;
 
                    if (!match) { /* See if the folded version matches */
                        SV** listp;
 
@@ -6881,6 +6882,7 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
                            }
                        }
                    }
                            }
                        }
                    }
+#endif
                }
 
                /* If we allocated a string above, free it */
                }
 
                /* If we allocated a string above, free it */
diff --git a/utf8.c b/utf8.c
index 18ff1d8..6053465 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -2657,10 +2657,6 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
            char* key_end = (char *) uvuni_to_utf8((U8*) key, val);
            STRLEN key_len = key_end - key;
 
            char* key_end = (char *) uvuni_to_utf8((U8*) key, val);
            STRLEN key_len = key_end - key;
 
-           /* And the value is what the forward mapping is from. */
-           char utf8_inverse[UTF8_MAXBYTES+1];
-           char *utf8_inverse_end = (char *) uvuni_to_utf8((U8*) utf8_inverse, inverse);
-
            /* Get the list for the map */
            if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
                list = (AV*) *listp;
            /* Get the list for the map */
            if ((listp = hv_fetch(ret, key, key_len, FALSE))) {
                list = (AV*) *listp;
@@ -2679,22 +2675,21 @@ Perl__swash_inversion_hash(pTHX_ SV* const swash)
                    Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
                }
                entry = *entryp;
                    Perl_croak(aTHX_ "panic: av_fetch() unexpectedly failed");
                }
                entry = *entryp;
-               if (SvCUR(entry) != key_len) {
-                   continue;
-               }
-               if (memEQ(key, SvPVX(entry), key_len)) {
+               if (SvUV(entry) == val) {
                    found_key = TRUE;
                    break;
                }
            }
                    found_key = TRUE;
                    break;
                }
            }
+
+           /* Make sure there is a mapping to itself on the list */
            if (! found_key) {
            if (! found_key) {
-               element = newSVpvn_flags(key, key_len, SVf_UTF8);
+               element = newSVuv(val);
                av_push(list, element);
            }
 
 
            /* Simply add the value to the list */
                av_push(list, element);
            }
 
 
            /* Simply add the value to the list */
-           element = newSVpvn_flags(utf8_inverse, utf8_inverse_end - utf8_inverse, SVf_UTF8);
+           element = newSVuv(inverse);
            av_push(list, element);
 
            /* swash_get() increments the value of val for each element in the
            av_push(list, element);
 
            /* swash_get() increments the value of val for each element in the