This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
[bracketed char class] fixes
author Karl Williamson <public@khwilliamson.com>
Mon, 15 Nov 2010 20:51:24 +0000 (13:51 -0700)
committer Father Chrysostomos <sprout@cpan.org>
Mon, 22 Nov 2010 21:32:57 +0000 (13:32 -0800)
This patch adds two functions for setting the ANYOF node bitmaps.  The
one for dealing with folds has intelligence as to what to do if unicode
semantics is in effect.

Together with previous commits, this fixes the unicode bug for bracketed
character classes, as far as known bugs go, so pods are updated as well.

pod/perldelta.pod
pod/perlunicode.pod
regcomp.c

index 38750fa..544a813 100644 (file)
@@ -324,7 +324,9 @@ from either 5.XXX.XXX or 5.XXX.XXX.
 
 =item *
 
-XXX
+A number of bugs with regular expression bracketed character classes
+have been fixed, mostly having to do with matching characters in the
+non-ASCII Latin-1 range.
 
 =back
 
index 978bede..b950f7b 100644 (file)
@@ -1515,10 +1515,31 @@ support seamlessly.  The result wasn't seamless: these characters were
 orphaned.
 
 Work is being done to correct this, but only some of it is complete.
-What has been finished is the matching of C<\b>, C<\s>, C<\w> and the Posix
-character classes and their complements in regular expressions, and the
-important part of the case
-changing component.  Due to concerns, and some evidence, that older code might
+What has been finished is:
+
+=over
+
+=item *
+
+the matching of C<\b>, C<\s>, C<\w> and the Posix
+character classes and their complements in regular expressions
+
+=item *
+
+case changing (but not user-defined casing)
+
+=item *
+
+case-insensitive (C</i>) regular expression matching for [bracketed
+character classes] only, except for some bugs with C<LATIN SMALL
+LETTER SHARP S> (which is supposed to match the two-character sequence
+"ss" (or "Ss" or "sS" or "SS"), but Perl has a number of bugs for all
+such multi-character case insensitive characters, of which this is just
+one example).
+
+=back
+
+Due to concerns, and some evidence, that older code might
 have come to rely on the existing behavior, the new behavior must be explicitly
 enabled by the feature C<unicode_strings> in the L<feature> pragma, even though
 no new syntax is involved.
index ec4c792..5437864 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -8083,14 +8083,14 @@ S_checkposixcc(pTHX_ RExC_state_t *pRExC_state)
 ANYOF_##NAME:                                           \
        for (value = 0; value < 256; value++)           \
            if (TEST)                                   \
-               ANYOF_BITMAP_SET(ret, value);           \
+               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
     yesno = '+';                                        \
     what = WORD;                                        \
     break;                                              \
 case ANYOF_N##NAME:                                     \
        for (value = 0; value < 256; value++)           \
            if (!TEST)                                  \
-               ANYOF_BITMAP_SET(ret, value);           \
+               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
     yesno = '!';                                        \
     what = WORD;                                        \
     break
@@ -8104,12 +8104,14 @@ ANYOF_##NAME:                                           \
     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_##NAME);        \
     else if (UNI_SEMANTICS) {                           \
         for (value = 0; value < 256; value++) {         \
-            if (TEST_8) ANYOF_BITMAP_SET(ret, value);   \
+            if (TEST_8) stored +=                       \
+                      S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
         }                                               \
     }                                                   \
     else {                                              \
         for (value = 0; value < 256; value++) {         \
-            if (TEST_7) ANYOF_BITMAP_SET(ret, value);   \
+            if (TEST_7) stored +=                       \
+                       S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
         }                                               \
     }                                                   \
     yesno = '+';                                        \
@@ -8119,12 +8121,14 @@ case ANYOF_N##NAME:                                     \
     if (LOC) ANYOF_CLASS_SET(ret, ANYOF_N##NAME);       \
     else if (UNI_SEMANTICS) {                           \
         for (value = 0; value < 256; value++) {         \
-            if (! TEST_8) ANYOF_BITMAP_SET(ret, value); \
+            if (! TEST_8) stored +=                     \
+                        S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
         }                                               \
     }                                                   \
     else {                                              \
         for (value = 0; value < 256; value++) {         \
-            if (! TEST_7) ANYOF_BITMAP_SET(ret, value); \
+            if (! TEST_7) stored +=                     \
+                        S_set_regclass_bit(aTHX_ pRExC_state, ret, value); \
         }                                               \
     }                                                   \
     yesno = '!';                                        \
@@ -8147,6 +8151,79 @@ case ANYOF_N##NAME:                                     \
 #define POSIX_CC_UNI_NAME(CCNAME) "Posix" CCNAME
 #endif
 
+STATIC U8
+S_set_regclass_bit_fold(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U8 value)
+{
+
+    /* Set the bit for the case fold of 'value' in the bitmap of the non-locale
+     * ANYOF node 'node'.  Locale folding is done at run-time, so this function
+     * should not be called for nodes that are for locales.
+     *
+     * The fold is looked up in PL_fold_latin1 under UNI_SEMANTICS, otherwise
+     * in PL_fold (the fold of 'f' is 'F', and the fold of 'F' is 'f').  The
+     * caller is expected to have set the bit for 'value' itself beforehand.
+     *
+     * It also ORs ANYOF_NONBITMAP_NON_UTF8 or ANYOF_UTF8 into the node's flags
+     * where needed, and returns how many bits went from 0 to 1 (0 or 1) */
+
+    U8 stored = 0;
+    SV *sv;  /* NOTE(review): appears unused in this function -- confirm */
+    U8 fold;
+
+    fold = (UNI_SEMANTICS) ? PL_fold_latin1[value]
+                           : PL_fold[value];
+
+    /* The bit for 'value' is assumed set already; only add a distinct fold */
+    if (fold != value && ! ANYOF_BITMAP_TEST(node, fold)) {
+        ANYOF_BITMAP_SET(node, fold);
+        stored++;
+    }
+
+    /* The fold of the German sharp s is two ASCII characters, so isn't in the
+     * bitmap and doesn't have to be in utf8, but we only process it if unicode
+     * semantics are called for */
+    if (UNI_SEMANTICS && value == LATIN_SMALL_LETTER_SHARP_S) {
+       ANYOF_FLAGS(node) |= ANYOF_NONBITMAP_NON_UTF8;
+    }
+    else if (_NONLATIN1_FOLD_CLOSURE_ONLY_FOR_USE_BY_REGCOMP_DOT_C(value)
+            || (! UNI_SEMANTICS
+                 && ! isASCII(value)
+                 && PL_fold_latin1[value] != value))
+    {   /* A character that has a fold outside of Latin1 matches outside the
+           bitmap, but only when the target string is utf8.  Similarly, when
+           we don't have unicode semantics, the above-ASCII Latin-1 characters
+           that have a fold should match if the target is utf8, and not
+           otherwise */
+       ANYOF_FLAGS(node) |= ANYOF_UTF8;
+    }
+
+    return stored;
+}
+
+
+PERL_STATIC_INLINE U8
+S_set_regclass_bit(pTHX_ RExC_state_t *pRExC_state, regnode* node, const U32 value)
+{
+    /* Set the bit for 'value' in node's bitmap and, when FOLD && ! LOC, that of
+     * its fold; return how many bits went 0 -> 1 (0 at once if already set).
+     * NOTE(review): 'value' is U32 but narrows to U8 in the fold call below */
+
+    U8 stored;
+
+    if (ANYOF_BITMAP_TEST(node, value)) {   /* Already set: nothing changes */
+       return 0;
+    }
+
+    ANYOF_BITMAP_SET(node, value);
+    stored = 1;
+
+    if (FOLD && ! LOC) {       /* Locale folds aren't known until runtime */
+       stored += S_set_regclass_bit_fold(aTHX_ pRExC_state, node, value);
+    }
+
+    return stored;
+}
+
 /*
    parse a class specification and produce either an ANYOF node that
    matches the pattern or if the pattern matches a single char only and
@@ -8444,8 +8521,10 @@ parseit:
                               w, w, rangebegin);
 
                    if (prevvalue < 256) {
-                       ANYOF_BITMAP_SET(ret, prevvalue);
-                       ANYOF_BITMAP_SET(ret, '-');
+                       stored +=
+                         S_set_regclass_bit(aTHX_ pRExC_state, ret, prevvalue);
+                       stored +=
+                         S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
                    }
                    else {
                        ANYOF_FLAGS(ret) |= ANYOF_UTF8;
@@ -8499,11 +8578,12 @@ parseit:
                    else {
 #ifndef EBCDIC
                        for (value = 0; value < 128; value++)
-                           ANYOF_BITMAP_SET(ret, value);
+                           stored +=
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
 #else  /* EBCDIC */
                        for (value = 0; value < 256; value++) {
                            if (isASCII(value))
-                               ANYOF_BITMAP_SET(ret, value);
+                               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
                        }
 #endif /* EBCDIC */
                    }
@@ -8516,11 +8596,12 @@ parseit:
                    else {
 #ifndef EBCDIC
                        for (value = 128; value < 256; value++)
-                           ANYOF_BITMAP_SET(ret, value);
+                           stored +=
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
 #else  /* EBCDIC */
                        for (value = 0; value < 256; value++) {
                            if (!isASCII(value))
-                               ANYOF_BITMAP_SET(ret, value);
+                               stored += S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
                        }
 #endif /* EBCDIC */
                    }
@@ -8533,7 +8614,8 @@ parseit:
                    else {
                        /* consecutive digits assumed */
                        for (value = '0'; value <= '9'; value++)
-                           ANYOF_BITMAP_SET(ret, value);
+                           stored +=
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
                    }
                    yesno = '+';
                    what = POSIX_CC_UNI_NAME("Digit");
@@ -8544,9 +8626,11 @@ parseit:
                    else {
                        /* consecutive digits assumed */
                        for (value = 0; value < '0'; value++)
-                           ANYOF_BITMAP_SET(ret, value);
+                           stored +=
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
                        for (value = '9' + 1; value < 256; value++)
-                           ANYOF_BITMAP_SET(ret, value);
+                           stored +=
+                              S_set_regclass_bit(aTHX_ pRExC_state, ret, value);
                    }
                    yesno = '!';
                    what = POSIX_CC_UNI_NAME("Digit");
@@ -8597,7 +8681,8 @@ parseit:
                               w, w, rangebegin);
                    }
                    if (!SIZE_ONLY)
-                       ANYOF_BITMAP_SET(ret, '-');
+                       stored +=
+                            S_set_regclass_bit(aTHX_ pRExC_state, ret, '-');
                } else
                    range = 1;  /* yeah, it's a range! */
                continue;       /* but do it the next time */
@@ -8620,14 +8705,14 @@ parseit:
                    if (isLOWER(prevvalue)) {
                        for (i = prevvalue; i <= ceilvalue; i++)
                            if (isLOWER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
-                               stored++;
-                               ANYOF_BITMAP_SET(ret, i);
+                               stored +=
+                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
                            }
                    } else {
                        for (i = prevvalue; i <= ceilvalue; i++)
                            if (isUPPER(i) && !ANYOF_BITMAP_TEST(ret,i)) {
-                               stored++;
-                               ANYOF_BITMAP_SET(ret, i);
+                               stored +=
+                                  S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
                            }
                    }
                }
@@ -8635,8 +8720,8 @@ parseit:
 #endif
                      for (i = prevvalue; i <= ceilvalue; i++) {
                        if (!ANYOF_BITMAP_TEST(ret,i)) {
-                           stored++;  
-                           ANYOF_BITMAP_SET(ret, i);
+                           stored +=
+                                S_set_regclass_bit(aTHX_ pRExC_state, ret, i);
                        }
                      }
          }