regcomp.c: Debug output clearer ranges
author Karl Williamson <public@khwilliamson.com>
Sun, 28 Jul 2013 00:45:18 +0000 (18:45 -0600)
committer Karl Williamson <public@khwilliamson.com>
Tue, 30 Jul 2013 18:05:47 +0000 (12:05 -0600)
It's not immediately obvious what the character class [!-~] matches.
Better is its equivalent: [\x21-\x7e].  This commit changes the debug
output to be the latter for character class matches, while retaining the
current behavior where it is clear what the range matches, in, e.g.,
[J-R].  Ranges like [A-z] include more than just alphabetics, so they
are now output as [\x41-\x7a].  (Debug output is done, for example, when
the command line option -Dr is specified.)

regcomp.c

index 72dd943..b3c66b8 100644 (file)
--- a/regcomp.c
+++ b/regcomp.c
@@ -15526,13 +15526,37 @@ S_put_latin1_charclass_innards(pTHX_ SV *sv, char *bitmap)
             if (rangestart == -1)
                 rangestart = i;
         } else if (rangestart != -1) {
-            if (i <= rangestart + 3)
+            int j = i - 1;
+            if (i <= rangestart + 3) {  /* Individual chars in short ranges */
                 for (; rangestart < i; rangestart++)
                     put_byte(sv, rangestart);
-            else {
+            }
+            else if (   j > 255
+                     || ! isALPHANUMERIC(rangestart)
+                     || ! isALPHANUMERIC(j)
+                     || isDIGIT(rangestart) != isDIGIT(j)
+                     || isUPPER(rangestart) != isUPPER(j)
+                     || isLOWER(rangestart) != isLOWER(j)
+
+                        /* This final test should get optimized out except
+                         * on EBCDIC platforms, where it causes ranges that
+                         * cross discontinuities like i/j to be shown as hex
+                         * instead of the misleading, e.g. H-K (since that
+                         * range includes more than H, I, J, K). */
+                     || (j - rangestart)
+                         != NATIVE_TO_ASCII(j) - NATIVE_TO_ASCII(rangestart))
+            {
+                Perl_sv_catpvf(aTHX_ sv, "\\x{%02x}-\\x{%02x}",
+                               rangestart,
+                               (j < 256) ? j : 255);
+            }
+            else { /* Here, the ends of the range are both digits, or both
+                      uppercase, or both lowercase; and there's no
+                      discontinuity in the range (which could happen on EBCDIC
+                      platforms) */
                 put_byte(sv, rangestart);
                 sv_catpvs(sv, "-");
-                put_byte(sv, i - 1);
+                put_byte(sv, j);
             }
             rangestart = -1;
             has_output_anything = TRUE;