Fix regex /il and /iaa failures for single element [] class
author Karl Williamson <public@khwilliamson.com>
Thu, 9 May 2013 05:06:17 +0000 (23:06 -0600)
committer Karl Williamson <public@khwilliamson.com>
Thu, 9 May 2013 16:15:13 +0000 (10:15 -0600)
This was a regression introduced in the v5.17 series.  It only affected
UTF-8 encoded patterns.  Basically, the code here should have
corresponded to the similar logic located after the defchar: label in
this file, but it didn't.  That logic is executed for the general case
(not stemming from a single element [bracketed] character class node).

We don't fold code points 0-255 under locale, as their folds aren't
known until run time.  Similarly, we don't allow folds that cross the
255/256 boundary, as those aren't well-defined; and under /aa we don't
allow folds that cross the 127/128 boundary.

regcomp.c
t/re/fold_grind.t

index de17958..bc0c0ef 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -10131,8 +10131,9 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32
      * additionally will populate the node's STRING with <code_point>, if <len>
      * is 0.  In both cases <*flagp> is appropriately set
      *
-     * It knows that under FOLD, UTF characters and the Latin Sharp S must be
-     * folded (the latter only when the rules indicate it can match 'ss') */
+     * It knows that under FOLD, the Latin Sharp S and UTF characters above
+     * 255, must be folded (the former only when the rules indicate it can
+     * match 'ss') */
 
     bool len_passed_in = cBOOL(len != 0);
     U8 character[UTF8_MAXBYTES_CASE+1];
@@ -10141,8 +10142,15 @@ S_alloc_maybe_populate_EXACT(pTHX_ RExC_state_t *pRExC_state, regnode *node, I32
 
     if (! len_passed_in) {
         if (UTF) {
-            if (FOLD) {
-                to_uni_fold(NATIVE_TO_UNI(code_point), character, &len);
+            if (FOLD && (! LOC || code_point > 255)) {
+                _to_uni_fold_flags(NATIVE_TO_UNI(code_point),
+                                   character,
+                                   &len,
+                                   FOLD_FLAGS_FULL | ((LOC)
+                                                     ? FOLD_FLAGS_LOCALE
+                                                     : (ASCII_FOLD_RESTRICTED)
+                                                       ? FOLD_FLAGS_NOMIX_ASCII
+                                                       : 0));
             }
             else {
                 uvchr_to_utf8( character, code_point);
index 3267336..bb45a69 100644 (file)
@@ -666,6 +666,8 @@ foreach my $test (sort { numerically } keys %tests) {
           foreach my $bracketed (0, 1) {   # Put rhs in [...], or not
             next if $bracketed && @pattern != 1;    # bracketed makes these
                                                     # or's instead of a sequence
+            foreach my $optimize_bracketed (0, 1) {
+                next if $optimize_bracketed && ! $bracketed;
             foreach my $inverted (0,1) {
                 next if $inverted && ! $bracketed;  # inversion only valid in [^...]
                 next if $inverted && @target != 1;  # [perl #89750] multi-char
@@ -687,8 +689,9 @@ foreach my $test (sort { numerically } keys %tests) {
                       $rhs .=  $rhs_char;
 
                       # Add a character to the class, so class doesn't get
-                      # optimized out
-                      $rhs .= '_]' if $bracketed;
+                      # optimized out, unless we are testing that optimization
+                      $rhs .= '_' if $optimize_bracketed;
+                      $rhs .= ']' if $bracketed;
                   }
 
                   # Add one of: no capturing parens
@@ -812,6 +815,7 @@ foreach my $test (sort { numerically } keys %tests) {
               }
             }
           }
+          }
         }
       }
       unless($list_all_tests) {