This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regexec.c: Latin1 chars can fold match UTF8_ALL
author Karl Williamson <public@khwilliamson.com>
Sat, 27 Nov 2010 17:26:01 +0000 (10:26 -0700)
committer Father Chrysostomos <sprout@cpan.org>
Sun, 28 Nov 2010 12:49:14 +0000 (04:49 -0800)
Some ANYOF regnodes have the ANYOF_UNICODE_ALL flag set, which means
they match any non-Latin1 character.  Under /i (with a utf8 target
string), these should also match any ASCII or Latin1 character that
folds outside the Latin1 range.

As part of this patch, an internal-only macro is renamed to account for its
new use in regexec.c.  The cumbersome name is meant to discourage others from
using it until the final semantics have been settled on.

handy.h
regcomp.c
regexec.c

diff --git a/handy.h b/handy.h
index 391156a..216d0ea 100644 (file)
--- a/handy.h
+++ b/handy.h
@@ -609,7 +609,7 @@ patched there.  The file as of this writing is cpan/Devel-PPPort/parts/inc/misc
 #   define isUPPER_A(c)  cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_UPPER_A))
 #   define isWORDCHAR_A(c) cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_WORDCHAR_A))
 #   define isXDIGIT_A(c)  cBOOL(FITS_IN_8_BITS(c) && (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_XDIGIT_A))
-#   define _NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_NONLATIN1_FOLD))
+#   define _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c) ((! cBOOL(FITS_IN_8_BITS(c))) || (PL_charclass[(U8) NATIVE_TO_UNI(c)] & _CC_NONLATIN1_FOLD))
 #else   /* No perl.h. */
 #   define isOCTAL_A(c)  ((c) >= '0' && (c) <= '9')
 #   ifdef EBCDIC
index 07834a0..23824ac 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -8184,7 +8184,7 @@ S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8
     if (UNI_SEMANTICS && value == LATIN_SMALL_LETTER_SHARP_S) {
        ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
     }
-    else if (_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C(value)
+    else if (_HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(value)
             || (! UNI_SEMANTICS
                  && ! isASCII(value)
                  && PL_fold_latin1[value] != value))
index 375d4fd..874dce3 100644 (file)
--- a/regexec.c
+++ b/regexec.c
@@ -6300,11 +6300,17 @@ S_reginclass(pTHX_ const regexp * const prog, register const regnode * const n,
     /* If the bitmap didn't (or couldn't) match, and something outside the
      * bitmap could match, try that */
     if (!match) {
-       if (utf8_target && (flags & ANYOF_UNICODE_ALL) && c >= 256) {
-           match = TRUE;
+       if (utf8_target && (flags & ANYOF_UNICODE_ALL)) {
+           if (c >= 256
+               || ((flags & ANYOF_FOLD) /* Latin1 1 that has a non-Latin1 fold
+                                           should match */
+                   && _HAS_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C_AND_REGEXEC_DOT_C(c)))
+           {
+               match = TRUE;
+           }
        }
-       else if ((flags & ANYOF_NONBITMAP_NON_UTF8)
-                || (utf8_target && flags & ANYOF_UTF8))
+       if (!match && ((flags & ANYOF_NONBITMAP_NON_UTF8)
+                      || (utf8_target && flags & ANYOF_UTF8)))
        {
            AV *av;
            SV * const sw = regclass_swash(prog, n, TRUE, 0, (SV**)&av);