From 79bcf1318d6280d492396ce01959aed43fee8707 Mon Sep 17 00:00:00 2001 From: Karl Williamson Date: Sat, 7 Dec 2019 11:31:48 -0700 Subject: [PATCH] ext/XS-APItest/t/utf8_warn_base.pl: Fix for EBCDIC There were some flaws here that showed up in EBCDIC testing. --- ext/XS-APItest/t/utf8_warn_base.pl | 47 ++++++++++++++++++++++---------------- 1 file changed, 27 insertions(+), 20 deletions(-) diff --git a/ext/XS-APItest/t/utf8_warn_base.pl b/ext/XS-APItest/t/utf8_warn_base.pl index cae1679..34e8221 100644 --- a/ext/XS-APItest/t/utf8_warn_base.pl +++ b/ext/XS-APItest/t/utf8_warn_base.pl @@ -703,15 +703,16 @@ my $min_cont = $::lowest_continuation; my $continuation_shift = (isASCII) ? 6 : 5; my $continuation_mask = (1 << $continuation_shift) - 1; -sub isUTF8_CHAR($$) { # Uses first principals to determine if this is legal - # (Doesn't work if overflows) - my ($string, $length) = @_; +sub isUTF8_CHAR($$) { # Uses first principals to determine if this I8 input + # is legal. (Doesn't work if overflows) + my ($native, $length) = @_; + my $i8 = native_to_I8($native); - # Uses first principals to calculate if $string is legal + # Uses first principals to calculate if $i8 is legal return 0 if $length <= 0; - my $first = ord substr($string, 0, 1); + my $first = ord substr($i8, 0, 1); # Invariant return 1 if $length == 1 && $first < $min_cont; @@ -733,7 +734,7 @@ sub isUTF8_CHAR($$) { # Uses first principals to determine if this is legal my $cp = $bits >> $utf8skip; for my $i (1 .. $length - 1) { - my $ord = ord substr($string, $i, 1); + my $ord = ord substr($i8, $i, 1); # Wrong if not a continuation return 0 if $ord < $min_cont || $ord >= 0xC0; @@ -745,12 +746,18 @@ sub isUTF8_CHAR($$) { # Uses first principals to determine if this is legal # If the calculated value can be expressed in fewer bytes than were passed # in, is an illegal overlong. XXX if 'chr' is not working properly, this # may not be right - my $chr = chr $cp; + my $chr = uni_to_native(chr $cp); utf8::upgrade($chr); use bytes; return 0 if length $chr < $length; + # Also, its possible on EBCDIC platforms that have more illegal start + # bytes than ASCII ones (like C3, C4) for something to have the same + # length but still be overlong. We make sure the first byte isn't smaller + # than the first byte of the real representation. + return 0 if substr($native, 0, 1) lt substr($chr, 0, 1); + return 1; } @@ -786,9 +793,11 @@ if ($::TEST_CHUNK == 0 && $ENV{PERL_DEBUG_FULL_TEST} && $ENV{PERL_DEBUG_FULL_TEST} == 97) { + # We construct UTF-8 (I8 on EBCDIC platforms converted later to native) + my $min_cont_mask = $min_cont | 0xF; my @bytes = ( 0, # Placeholder to signify to use an empty string "" - ord 'A',# We assume that all the invariant characters are + 0x41, # We assume that all the invariant characters are # properly in the same class, so this is an exemplar # character $min_cont .. 0xFF # But test every non-invariant individually @@ -875,33 +884,31 @@ if ($::TEST_CHUNK == 0 my $should_be_string; if ($length == 1) { - $should_be_string = chr $cp; + $should_be_string = native_to_I8(chr $cp); } else { # Starting with the code point, use first # principals to find the equivalent I8 string my @bytes; - my $uv = $cp; + my $uv = ord native_to_uni(chr $cp); for (my $i = $length - 1; $i > 0; $i--) { - $bytes[$i] = chr I8_to_native(($uv & $mask) - | $mark); + $bytes[$i] = chr (($uv & $mask) | $mark); $uv >>= $continuation_shift; } - $bytes[0] = chr I8_to_native(( $uv - & start_mask($length)) + $bytes[0] = chr ($uv & start_mask($length) | start_mark($length)); $should_be_string = join "", @bytes; } # If the original string and the inverse are the # same, it worked. - if (is($native, $should_be_string, - "utf8n_to_uvchr_msgs(" - . display_bytes($native) - . ") returns correct uv=0x" - . sprintf ("%x", $cp))) - { + my $test_name = "utf8n_to_uvchr_msgs(" + . display_bytes($native) + . ") yields " + . sprintf ("0x%x", $cp) + . "; does its I8 eq original"; + if (is($should_be_string, $string, $test_name)) { my $is_surrogate = $cp >= 0xD800 && $cp <= 0xDFFF; my $got_surrogate -- 1.8.3.1