This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
ext/XS-APItest/t/utf8_warn_base.pl: Fix for EBCDIC
authorKarl Williamson <khw@cpan.org>
Sat, 7 Dec 2019 18:31:48 +0000 (11:31 -0700)
committerKarl Williamson <khw@cpan.org>
Sat, 7 Dec 2019 22:50:38 +0000 (15:50 -0700)
There were some flaws here that showed up in EBCDIC testing.

ext/XS-APItest/t/utf8_warn_base.pl

index cae1679..34e8221 100644 (file)
@@ -703,15 +703,16 @@ my $min_cont = $::lowest_continuation;
 my $continuation_shift = (isASCII) ? 6 : 5;
 my $continuation_mask = (1 << $continuation_shift) - 1;
 
-sub isUTF8_CHAR($$) {   # Uses first principals to determine if this is legal
-                        # (Doesn't work if overflows)
-    my ($string, $length) = @_;
+sub isUTF8_CHAR($$) {   # Uses first principals to determine if this I8 input
+                        # is legal.  (Doesn't work if overflows)
+    my ($native, $length) = @_;
+    my $i8 = native_to_I8($native);
 
-    # Uses first principals to calculate if $string is legal
+    # Uses first principals to calculate if $i8 is legal
 
     return 0 if $length <= 0;
 
-    my $first = ord substr($string, 0, 1);
+    my $first = ord substr($i8, 0, 1);
 
     # Invariant
     return 1 if $length == 1 && $first < $min_cont;
@@ -733,7 +734,7 @@ sub isUTF8_CHAR($$) {   # Uses first principals to determine if this is legal
     my $cp = $bits >> $utf8skip;
 
     for my $i (1 .. $length - 1) {
-        my $ord = ord substr($string, $i, 1);
+        my $ord = ord substr($i8, $i, 1);
 
         # Wrong if not a continuation
         return 0 if $ord < $min_cont || $ord >= 0xC0;
@@ -745,12 +746,18 @@ sub isUTF8_CHAR($$) {   # Uses first principals to determine if this is legal
     # If the calculated value can be expressed in fewer bytes than were passed
     # in, is an illegal overlong.  XXX if 'chr' is not working properly, this
     # may not be right
-    my $chr = chr $cp;
+    my $chr = uni_to_native(chr $cp);
     utf8::upgrade($chr);
 
     use bytes;
     return 0 if length $chr < $length;
 
+    # Also, its possible on EBCDIC platforms that have more illegal start
+    # bytes than ASCII ones (like C3, C4) for something to have the same
+    # length but still be overlong.  We make sure the first byte isn't smaller
+    # than the first byte of the real representation.
+    return 0 if substr($native, 0, 1) lt substr($chr, 0, 1);
+
     return 1;
 }
 
@@ -786,9 +793,11 @@ if ($::TEST_CHUNK == 0
 && $ENV{PERL_DEBUG_FULL_TEST}
 && $ENV{PERL_DEBUG_FULL_TEST} == 97)
 {
+    # We construct UTF-8 (I8 on EBCDIC platforms converted later to native)
+
     my $min_cont_mask = $min_cont | 0xF;
     my @bytes = (   0,  # Placeholder to signify to use an empty string ""
-                ord 'A',# We assume that all the invariant characters are
+                 0x41,  # We assume that all the invariant characters are
                         # properly in the same class, so this is an exemplar
                         # character
                 $min_cont .. 0xFF   # But test every non-invariant individually
@@ -875,33 +884,31 @@ if ($::TEST_CHUNK == 0
 
                             my $should_be_string;
                             if ($length == 1) {
-                                $should_be_string = chr $cp;
+                                $should_be_string = native_to_I8(chr $cp);
                             }
                             else {
 
                                 # Starting with the code point, use first
                                 # principals to find the equivalent I8 string
                                 my @bytes;
-                                my $uv = $cp;
+                                my $uv = ord native_to_uni(chr $cp);
                                 for (my $i = $length - 1; $i > 0; $i--) {
-                                    $bytes[$i] = chr I8_to_native(($uv & $mask)
-                                                                  | $mark);
+                                    $bytes[$i] = chr (($uv & $mask) | $mark);
                                     $uv >>= $continuation_shift;
                                 }
-                                $bytes[0] = chr I8_to_native((   $uv
-                                                        & start_mask($length))
+                                $bytes[0] = chr ($uv & start_mask($length)
                                             | start_mark($length));
                                 $should_be_string = join "", @bytes;
                             }
 
                             # If the original string and the inverse are the
                             # same, it worked.
-                            if (is($native, $should_be_string,
-                                    "utf8n_to_uvchr_msgs("
-                                 .  display_bytes($native)
-                                 . ") returns correct uv=0x"
-                                 . sprintf ("%x", $cp)))
-                            {
+                            my $test_name = "utf8n_to_uvchr_msgs("
+                                          . display_bytes($native)
+                                          . ") yields "
+                                          . sprintf ("0x%x", $cp)
+                                          . "; does its I8 eq original";
+                            if (is($should_be_string, $string, $test_name)) {
                                 my $is_surrogate = $cp >= 0xD800
                                                 && $cp <= 0xDFFF;
                                 my $got_surrogate