my $continuation_shift = (isASCII) ? 6 : 5;
my $continuation_mask = (1 << $continuation_shift) - 1;
-sub isUTF8_CHAR($$) { # Uses first principals to determine if this is legal
- # (Doesn't work if overflows)
- my ($string, $length) = @_;
+sub isUTF8_CHAR($$) { # Uses first principals to determine if this I8 input
+ # is legal. (Doesn't work if overflows)
+ my ($native, $length) = @_;
+ my $i8 = native_to_I8($native);
- # Uses first principals to calculate if $string is legal
+ # Uses first principals to calculate if $i8 is legal
return 0 if $length <= 0;
- my $first = ord substr($string, 0, 1);
+ my $first = ord substr($i8, 0, 1);
# Invariant
return 1 if $length == 1 && $first < $min_cont;
my $cp = $bits >> $utf8skip;
for my $i (1 .. $length - 1) {
- my $ord = ord substr($string, $i, 1);
+ my $ord = ord substr($i8, $i, 1);
# Wrong if not a continuation
return 0 if $ord < $min_cont || $ord >= 0xC0;
# If the calculated value can be expressed in fewer bytes than were passed
# in, is an illegal overlong. XXX if 'chr' is not working properly, this
# may not be right
- my $chr = chr $cp;
+ my $chr = uni_to_native(chr $cp);
utf8::upgrade($chr);
use bytes;
return 0 if length $chr < $length;
+ # Also, its possible on EBCDIC platforms that have more illegal start
+ # bytes than ASCII ones (like C3, C4) for something to have the same
+ # length but still be overlong. We make sure the first byte isn't smaller
+ # than the first byte of the real representation.
+ return 0 if substr($native, 0, 1) lt substr($chr, 0, 1);
+
return 1;
}
&& $ENV{PERL_DEBUG_FULL_TEST}
&& $ENV{PERL_DEBUG_FULL_TEST} == 97)
{
+ # We construct UTF-8 (I8 on EBCDIC platforms converted later to native)
+
my $min_cont_mask = $min_cont | 0xF;
my @bytes = ( 0, # Placeholder to signify to use an empty string ""
- ord 'A',# We assume that all the invariant characters are
+ 0x41, # We assume that all the invariant characters are
# properly in the same class, so this is an exemplar
# character
$min_cont .. 0xFF # But test every non-invariant individually
my $should_be_string;
if ($length == 1) {
- $should_be_string = chr $cp;
+ $should_be_string = native_to_I8(chr $cp);
}
else {
# Starting with the code point, use first
# principals to find the equivalent I8 string
my @bytes;
- my $uv = $cp;
+ my $uv = ord native_to_uni(chr $cp);
for (my $i = $length - 1; $i > 0; $i--) {
- $bytes[$i] = chr I8_to_native(($uv & $mask)
- | $mark);
+ $bytes[$i] = chr (($uv & $mask) | $mark);
$uv >>= $continuation_shift;
}
- $bytes[0] = chr I8_to_native(( $uv
- & start_mask($length))
+ $bytes[0] = chr ($uv & start_mask($length)
| start_mark($length));
$should_be_string = join "", @bytes;
}
# If the original string and the inverse are the
# same, it worked.
- if (is($native, $should_be_string,
- "utf8n_to_uvchr_msgs("
- . display_bytes($native)
- . ") returns correct uv=0x"
- . sprintf ("%x", $cp)))
- {
+ my $test_name = "utf8n_to_uvchr_msgs("
+ . display_bytes($native)
+ . ") yields "
+ . sprintf ("0x%x", $cp)
+ . "; does its I8 eq original";
+ if (is($should_be_string, $string, $test_name)) {
my $is_surrogate = $cp >= 0xD800
&& $cp <= 0xDFFF;
my $got_surrogate