#!perl -w
# This is a base file to be used by various .t's in its directory
-# It tests various code points that are "problematic", and verifies that the
-# correct warnings/flags etc are generated when using them. It also takes the
-# UTF-8 for some of them and perturbs it to be malformed in various ways, and
-# tests that this gets appropriately detected.
+# It tests various malformed UTF-8 sequences and some code points that are
+# "problematic", and verifies that the correct warnings/flags etc are
+# generated when using them. For the code points, it also takes the UTF-8 and
+# perturbs it to be malformed in various ways, and tests that this gets
+# appropriately detected.
use strict;
use Test::More;
return shift > $highest_non_extended_utf8_cp;
}
+sub is_extended_utf8($) {
+
+ # Returns a boolean as to whether or not the input UTF-8 sequence uses
+ # Perl extended UTF-8.
+
+ my $byte = substr(shift, 0, 1);
+ return ord $byte >= 0xFE if isASCII;
+ return $byte == I8_to_native("\xFF");
+}
+
sub overflow_discern_len($) {
- # Returns how many bytes are needed to tell if a UTF-8 sequence is for a
- # code point that won't fit in the platform's word size. Only the length
- # of the sequence representing a single code point is needed.
+ # Returns how many bytes are needed to tell if a non-overlong UTF-8
+ # sequence is for a code point that won't fit in the platform's word size.
+ # Only the length of the sequence representing a single code point is
+ # needed.
if (isASCII) {
return ($::is64bit) ? 3 : ((shift == $::max_bytes)
return ($::is64bit) ? 2 : 8;
}
+sub overlong_discern_len($) {
+
+ # Returns how many bytes are needed to tell if the input UTF-8 sequence
+ # for a code point is overlong
+
+ my $string = shift;
+ my $length = length $string;
+ my $byte = ord native_to_I8(substr($string, 0, 1));
+ if (isASCII) {
+ return ($length == $::max_bytes)
+ # This is constrained to 1 on 32-bit machines, as it
+ # overflows there
+ ? (($::is64bit) ? 7 : 1)
+ : (($length == 2) ? 1 : 2);
+ }
+
+ return ($length == $::max_bytes) ? 8 : (($length <= 3) ? 1 : 2);
+}
+
my @tests;
{
no warnings qw(portable overflow);
# like being a surrogate; 0 indicates we need
# the whole string. Some categories have a
# default that is used if this is omitted.
+ [ "overlong malformation, lowest 2-byte",
+ (isASCII) ? "\xc0\x80" : I8_to_native("\xc0\xa0"),
+ 0, # NUL
+ ],
+ [ "overlong malformation, highest 2-byte",
+ (isASCII) ? "\xc1\xbf" : I8_to_native("\xc4\xbf"),
+ (isASCII) ? 0x7F : utf8::unicode_to_native(0x9F),
+ ],
+ [ "overlong malformation, lowest 3-byte",
+ (isASCII) ? "\xe0\x80\x80" : I8_to_native("\xe0\xa0\xa0"),
+ 0, # NUL
+ ],
+ [ "overlong malformation, highest 3-byte",
+ (isASCII) ? "\xe0\x9f\xbf" : I8_to_native("\xe0\xbf\xbf"),
+ (isASCII) ? 0x7FF : 0x3FF,
+ ],
[ "lowest surrogate",
(isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
0xD800,
(isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
0xFFFF,
],
+ [ "overlong malformation, lowest 4-byte",
+ (isASCII) ? "\xf0\x80\x80\x80" : I8_to_native("\xf0\xa0\xa0\xa0"),
+ 0, # NUL
+ ],
+ [ "overlong malformation, highest 4-byte",
+ (isASCII) ? "\xf0\x8F\xbf\xbf" : I8_to_native("\xf0\xaf\xbf\xbf"),
+ (isASCII) ? 0xFFFF : 0x3FFF,
+ ],
[ "non-character code point U+1FFFE",
(isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
0x1FFFE,
(isASCII) ? 0x140000 : 0x200000,
1,
],
+ [ "overlong malformation, lowest 5-byte",
+ (isASCII)
+ ? "\xf8\x80\x80\x80\x80"
+ : I8_to_native("\xf8\xa0\xa0\xa0\xa0"),
+ 0, # NUL
+ ],
+ [ "overlong malformation, highest 5-byte",
+ (isASCII)
+ ? "\xf8\x87\xbf\xbf\xbf"
+ : I8_to_native("\xf8\xa7\xbf\xbf\xbf"),
+ (isASCII) ? 0x1FFFFF : 0x3FFFF,
+ ],
+ [ "overlong malformation, lowest 6-byte",
+ (isASCII)
+ ? "\xfc\x80\x80\x80\x80\x80"
+ : I8_to_native("\xfc\xa0\xa0\xa0\xa0\xa0"),
+ 0, # NUL
+ ],
+ [ "overlong malformation, highest 6-byte",
+ (isASCII)
+ ? "\xfc\x83\xbf\xbf\xbf\xbf"
+ : I8_to_native("\xfc\xa3\xbf\xbf\xbf\xbf"),
+ (isASCII) ? 0x3FFFFFF : 0x3FFFFF,
+ ],
+ [ "overlong malformation, lowest 7-byte",
+ (isASCII)
+ ? "\xfe\x80\x80\x80\x80\x80\x80"
+ : I8_to_native("\xfe\xa0\xa0\xa0\xa0\xa0\xa0"),
+ 0, # NUL
+ ],
+ [ "overlong malformation, highest 7-byte",
+ (isASCII)
+ ? "\xfe\x81\xbf\xbf\xbf\xbf\xbf"
+ : I8_to_native("\xfe\xa1\xbf\xbf\xbf\xbf\xbf"),
+ (isASCII) ? 0x7FFFFFFF : 0x3FFFFFF,
+ ],
[ "lowest 32 bit code point",
(isASCII)
? "\xfe\x82\x80\x80\x80\x80\x80"
if (! $::is64bit) {
if (isASCII) {
push @tests,
+ [ "overlong malformation, but naively looks like overflow",
+ "\xff\x80\x80\x80\x80\x80\x80\x83\xbf\xbf\xbf\xbf\xbf",
+ 0xFFFFFFFF,
+ ],
[ "overflow that old algorithm failed to detect",
"\xfe\x86\x80\x80\x80\x80\x80",
-1,
}
}
+ push @tests,
+ [ "overlong malformation, lowest max-byte",
+ (isASCII)
+ ? "\xff\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
+ : I8_to_native(
+ "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
+ 0, # NUL
+ ],
+ [ "overlong malformation, highest max-byte",
+ (isASCII) # 2**36-1 on ASCII; 2**30-1 on EBCDIC
+ ? "\xff\x80\x80\x80\x80\x80\x80\xbf\xbf\xbf\xbf\xbf\xbf"
+ : I8_to_native(
+ "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xbf\xbf\xbf\xbf\xbf\xbf"),
+ (isASCII) ? (($::is64bit) ? 0xFFFFFFFFF : -1) : 0x3FFFFFFF,
+ ];
+
+ if (isASCII) {
+ push @tests,
+ [ "Lowest code point requiring 13 bytes to represent", # 2**36
+ "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
+ ($::is64bit) ? 0x1000000000 : -1, # overflows on 32bit
+ ],
+ };
+
if ($::is64bit) {
push @tests,
[ "highest 64 bit code point",
];
if (isASCII) {
push @tests,
- [ "Lowest code point requiring 13 bytes to represent",
- "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
- 0x1000000000,
- ],
[ "overflow that old algorithm failed to detect",
"\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
-1,
my ($testname, $bytes, $allowed_uv, $needed_to_discern_len) = @$test;
my $length = length $bytes;
+ my $initially_overlong = $testname =~ /overlong/;
my $will_overflow = $allowed_uv < 0;
my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv);
my $extended_non_cp_trailing_text
= "is a Perl extension, and so is not portable";
+ # What bytes should have been used to specify a code point that has been
+ # specified as an overlong.
+ my $correct_bytes_for_overlong;
+
# Is this test malformed from the beginning? If so, we know to generally
# expect that the tests will show it isn't valid.
my $initially_malformed = 0;
- if ($will_overflow || $allowed_uv > 0x10FFFF) {
+ if ($initially_overlong) {
+ $non_cp_trailing_text = "if you see this, there is an error";
+ $cp_message_qr = qr/\Q$non_cp_trailing_text\E/;
+ if (! defined $needed_to_discern_len) {
+ $needed_to_discern_len = overlong_discern_len($bytes);
+ }
+ $initially_malformed = 1;
+ $correct_bytes_for_overlong = display_bytes_no_quotes(chr $allowed_uv);
+ $utf8n_flag_to_warn = 0;
+ $utf8n_flag_to_disallow = 0;
+
+ $utf8n_flag_to_warn_complement = $::UTF8_WARN_SURROGATE;
+ $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE;
+ if (! $will_overflow && $allowed_uv <= 0x10FFFF) {
+ $utf8n_flag_to_warn_complement |= $::UTF8_WARN_SUPER;
+ $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_SUPER;
+ if (($allowed_uv & 0xFFFF) != 0xFFFF) {
+ $utf8n_flag_to_warn_complement |= $::UTF8_WARN_NONCHAR;
+ $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_NONCHAR;
+ }
+ }
+ if (! is_extended_utf8($bytes)) {
+ $utf8n_flag_to_warn_complement |= $::UTF8_WARN_PERL_EXTENDED;
+ $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_PERL_EXTENDED;
+ }
+ $controlling_warning_category = 'utf8';
+ }
+ elsif($will_overflow || $allowed_uv > 0x10FFFF) {
# Set the SUPER flags; later, we test for PERL_EXTENDED as well.
$utf8n_flag_to_warn = $::UTF8_WARN_SUPER;
die 'Didn\'t set $needed_to_discern_len for ' . $testname
unless defined $needed_to_discern_len;
- { # First test the isFOO calls
- use warnings; no warnings 'deprecated'; # Make sure these don't raise warnings
- undef @warnings_gotten;
-
- my $ret = test_isUTF8_CHAR($bytes, $length);
- my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
- if ($initially_malformed) {
- is($ret, 0, "For $testname: isUTF8_CHAR() returns 0");
- is($ret_flags, 0, " And isUTF8_CHAR_flags() returns 0");
- }
- else {
- is($ret, $length,
- "For $testname: isUTF8_CHAR() returns expected length: $length");
- is($ret_flags, $length, " And isUTF8_CHAR_flags(...,0)"
- . " returns expected length: $length");
- }
- is(scalar @warnings_gotten, 0,
- " And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags generated"
- . " any warnings")
- or output_warnings(@warnings_gotten);
-
- undef @warnings_gotten;
- $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
- if ($initially_malformed) {
- is($ret, 0, " And isSTRICT_UTF8_CHAR() returns 0");
- }
- else {
- my $expected_ret = ( $testname =~ /surrogate|non-character/
- || $allowed_uv > 0x10FFFF)
- ? 0
- : $length;
- is($ret, $expected_ret, " And isSTRICT_UTF8_CHAR() returns"
- . " expected length: $expected_ret");
- $ret = test_isUTF8_CHAR_flags($bytes, $length,
- $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
- is($ret, $expected_ret,
- " And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')"
- . " acts like isSTRICT_UTF8_CHAR");
- }
- is(scalar @warnings_gotten, 0,
- " And neither isSTRICT_UTF8_CHAR() nor isUTF8_CHAR_flags"
- . " generated any warnings")
- or output_warnings(@warnings_gotten);
-
- undef @warnings_gotten;
- $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
- if ($initially_malformed) {
- is($ret, 0, " And isC9_STRICT_UTF8_CHAR() returns 0");
- }
- else {
- my $expected_ret = ( $testname =~ /surrogate/
- || $allowed_uv > 0x10FFFF)
- ? 0
- : $length;
- is($ret, $expected_ret, " And isC9_STRICT_UTF8_CHAR()"
- ." returns expected length: $expected_ret");
- $ret = test_isUTF8_CHAR_flags($bytes, $length,
- $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
- is($ret, $expected_ret,
- " And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')"
- . " acts like isC9_STRICT_UTF8_CHAR");
- }
- is(scalar @warnings_gotten, 0,
- " And neither isC9_STRICT_UTF8_CHAR() nor isUTF8_CHAR_flags"
- . " generated any warnings")
- or output_warnings(@warnings_gotten);
-
- foreach my $disallow_type (0..2) {
- # 0 is don't disallow this type of code point
- # 1 is do disallow
- # 2 is do disallow, but only code points requiring
- # perl-extended-UTF8
-
- my $disallow_flags;
- my $expected_ret;
-
- if ($initially_malformed) {
-
- # Malformations are by default disallowed, so testing with
- # $disallow_type equal to 0 is sufficicient.
- next if $disallow_type;
-
- $disallow_flags = 0;
- $expected_ret = 0;
- }
- elsif ($disallow_type == 1) {
- $disallow_flags = $utf8n_flag_to_disallow;
- $expected_ret = 0;
- }
- elsif ($disallow_type == 2) {
- next if ! requires_extended_utf8($allowed_uv);
- $disallow_flags = $::UTF8_DISALLOW_PERL_EXTENDED;
- $expected_ret = 0;
- }
- else { # type is 0
- $disallow_flags = $utf8n_flag_to_disallow_complement;
- $expected_ret = $length;
- }
-
- $ret = test_isUTF8_CHAR_flags($bytes, $length, $disallow_flags);
- is($ret, $expected_ret, " And isUTF8_CHAR_flags("
- . "$display_bytes, $disallow_flags) returns "
- . $expected_ret)
- or diag "The flags mean "
- . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
-
- is(scalar @warnings_gotten, 0,
- " And isUTF8_CHAR_flags(...) generated no warnings")
- or output_warnings(@warnings_gotten);
-
- # Test partial character handling, for each byte not a full character
- my $did_test_partial = 0;
- for (my $j = 1; $j < $length - 1; $j++) {
- $did_test_partial = 1;
- my $partial = substr($bytes, 0, $j);
- my $ret_should_be;
- my $comment;
- if ($disallow_type || $initially_malformed) {
- $ret_should_be = 0;
- $comment = "disallowed";
- if ($j < $needed_to_discern_len) {
- $ret_should_be = 1;
- $comment .= ", but need $needed_to_discern_len"
- . " bytes to discern:";
- }
- }
- else {
- $ret_should_be = 1;
- $comment = "allowed";
- }
-
- undef @warnings_gotten;
-
- $ret = test_is_utf8_valid_partial_char_flags($partial, $j,
- $disallow_flags);
- is($ret, $ret_should_be,
- " And is_utf8_valid_partial_char_flags("
- . display_bytes($partial)
- . ", $disallow_flags), $comment: returns $ret_should_be")
- or diag "The flags mean "
- . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
- }
-
- if ($did_test_partial) {
- is(scalar @warnings_gotten, 0,
- " And is_utf8_valid_partial_char_flags()"
- . " generated no warnings for any of the lengths")
- or output_warnings(@warnings_gotten);
- }
- }
- }
-
- # Now test the to/from UTF-8 calls
- # This is more complicated than the malformations tested in other files in
- # this directory, as there are several orthogonal variables involved. We
- # test most possible combinations
-
# We try various combinations of malformations that can occur
foreach my $short (0, 1) {
next if $skip_most_tests && $short;
next if $skip_most_tests && $unexpected_noncont;
foreach my $overlong (0, 1) {
next if $overlong && $skip_most_tests;
+ next if $initially_overlong && ! $overlong;
# If we're creating an overlong, it can't be longer than the
# maximum length, so skip if we're already at that length.
- next if $overlong && $length >= $::max_bytes;
+ next if (! $initially_overlong && $overlong)
+ && $length >= $::max_bytes;
my $this_cp_message_qr = $cp_message_qr;
my $this_non_cp_trailing_text = $non_cp_trailing_text;
my $dont_use_overlong_cp = 0;
if ($overlong) {
+ if (! $initially_overlong) {
my $new_expected_len;
# To force this malformation, we convert the original start
- ( $this_expected_len
- $this_needed_to_discern_len);
$this_expected_len = $new_expected_len;
- push @expected_malformation_return_flags, $::UTF8_GOT_LONG;
- push @malformation_names, 'overlong';
-
- if ($malformed_allow_type == 2) {
- $allow_flags |= $::UTF8_ALLOW_LONG_AND_ITS_VALUE;
- }
- elsif ($malformed_allow_type) {
- $allow_flags |= $::UTF8_ALLOW_LONG;
}
}
if ($short) {
- push @malformation_names, 'short';
# To force this malformation, just tell the test to not look
# as far as it should into the input.
$this_length--;
$this_expected_len--;
- push @expected_malformation_return_flags, $::UTF8_GOT_SHORT;
$allow_flags |= $::UTF8_ALLOW_SHORT if $malformed_allow_type;
}
if ($unexpected_noncont) {
- push @malformation_names, 'unexpected non-continuation';
# To force this malformation, change the final continuation
# byte into a non continuation.
my $pos = ($short) ? -2 : -1;
substr($this_bytes, $pos, 1) = '?';
$this_expected_len--;
- push @expected_malformation_return_flags,
- $::UTF8_GOT_NON_CONTINUATION;
- $allow_flags |= $::UTF8_ALLOW_NON_CONTINUATION
- if $malformed_allow_type;
}
+ # The whole point of a test that is malformed from the beginning
+ # is to test for that malformation. If we've modified things so
+ # much that we don't have enough information to detect that
+ # malformation, there's no point in testing.
+ next if $initially_malformed
+ && $this_expected_len < $this_needed_to_discern_len;
+
# Here, we've transformed the input with all of the desired
# non-overflow malformations. We are now in a position to
# construct any potential warnings for those malformations. But
# point.
if ($overlong) {
+ push @malformation_names, 'overlong';
+ push @expected_malformation_return_flags, $::UTF8_GOT_LONG;
# If one of the other malformation types is also in effect, we
# don't know what the intended code point was.
else {
my $wrong_bytes = display_bytes_no_quotes(
substr($this_bytes, 0, $this_length));
- my $correct_bytes = display_bytes_no_quotes($bytes);
+ if (! defined $correct_bytes_for_overlong) {
+ $correct_bytes_for_overlong
+ = display_bytes_no_quotes($bytes);
+ }
my $prefix = ($allowed_uv > 0x10FFFF) ? "0x" : "U+";
push @expected_malformation_warnings,
qr/\QMalformed UTF-8 character: $wrong_bytes\E
\Q (overlong; instead use\E
- \Q $correct_bytes to\E
+ \Q $correct_bytes_for_overlong to\E
\Q represent $prefix$uv_string)/x;
}
+
+ if ($malformed_allow_type == 2) {
+ $allow_flags |= $::UTF8_ALLOW_LONG_AND_ITS_VALUE;
+ }
+ elsif ($malformed_allow_type) {
+ $allow_flags |= $::UTF8_ALLOW_LONG;
+ }
}
if ($short) {
+ push @malformation_names, 'short';
+ push @expected_malformation_return_flags, $::UTF8_GOT_SHORT;
push @expected_malformation_warnings, qr/too short/;
}
if ($unexpected_noncont) {
+ push @malformation_names, 'unexpected non-continuation';
+ push @expected_malformation_return_flags,
+ $::UTF8_GOT_NON_CONTINUATION;
+ $allow_flags |= $::UTF8_ALLOW_NON_CONTINUATION
+ if $malformed_allow_type;
push @expected_malformation_warnings,
qr/unexpected non-continuation byte/;
}
# Done setting up the malformation related stuff
+ { # First test the isFOO calls
+ use warnings; # XXX no warnings 'deprecated'; # Make sure these don't raise warnings
+ undef @warnings_gotten;
+
+ my $ret = test_isUTF8_CHAR($this_bytes, $this_length);
+ my $ret_flags
+ = test_isUTF8_CHAR_flags($this_bytes, $this_length, 0);
+ if ($malformations_name) {
+ is($ret, 0, "For $testname$malformations_name: isUTF8_CHAR() returns 0");
+ is($ret_flags, 0, " And isUTF8_CHAR_flags() returns 0");
+ }
+ else {
+ is($ret, $this_length, "For $testname: isUTF8_CHAR() returns"
+ . " expected length: $this_length");
+ is($ret_flags, $this_length,
+ " And isUTF8_CHAR_flags(...,0) returns expected"
+ . " length: $this_length");
+ }
+ is(scalar @warnings_gotten, 0,
+ " And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags"
+ . " generated any warnings")
+ or output_warnings(@warnings_gotten);
+
+ undef @warnings_gotten;
+ $ret = test_isSTRICT_UTF8_CHAR($this_bytes, $this_length);
+ if ($malformations_name) {
+ is($ret, 0, " And isSTRICT_UTF8_CHAR() returns 0");
+ }
+ else {
+ my $expected_ret
+ = ( $testname =~ /surrogate|non-character/
+ || $allowed_uv > 0x10FFFF)
+ ? 0
+ : $this_length;
+ is($ret, $expected_ret,
+ " And isSTRICT_UTF8_CHAR() returns expected"
+ . " length: $expected_ret");
+ $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length,
+ $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
+ is($ret, $expected_ret,
+ " And isUTF8_CHAR_flags('"
+ . "DISALLOW_ILLEGAL_INTERCHANGE') acts like"
+ . " isSTRICT_UTF8_CHAR");
+ }
+ is(scalar @warnings_gotten, 0,
+ " And neither isSTRICT_UTF8_CHAR() nor"
+ . " isUTF8_CHAR_flags generated any warnings")
+ or output_warnings(@warnings_gotten);
+
+ undef @warnings_gotten;
+ $ret = test_isC9_STRICT_UTF8_CHAR($this_bytes, $this_length);
+ if ($malformations_name) {
+ is($ret, 0, " And isC9_STRICT_UTF8_CHAR() returns 0");
+ }
+ else {
+ my $expected_ret = ( $testname =~ /surrogate/
+ || $allowed_uv > 0x10FFFF)
+ ? 0
+ : $this_expected_len;
+ is($ret, $expected_ret, " And isC9_STRICT_UTF8_CHAR()"
+ . " returns expected length:"
+ . " $expected_ret");
+ $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length,
+ $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
+ is($ret, $expected_ret,
+ " And isUTF8_CHAR_flags('"
+ . "DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like"
+ . " isC9_STRICT_UTF8_CHAR");
+ }
+ is(scalar @warnings_gotten, 0,
+ " And neither isC9_STRICT_UTF8_CHAR() nor"
+ . " isUTF8_CHAR_flags generated any warnings")
+ or output_warnings(@warnings_gotten);
+
+ foreach my $disallow_type (0..2) {
+ # 0 is don't disallow this type of code point
+ # 1 is do disallow
+ # 2 is do disallow, but only code points requiring
+ # perl-extended-UTF8
+
+ my $disallow_flags;
+ my $expected_ret;
+
+ if ($malformations_name) {
+
+ # Malformations are by default disallowed, so testing
+ # with $disallow_type equal to 0 is sufficicient.
+ next if $disallow_type;
+
+ $disallow_flags = 0;
+ $expected_ret = 0;
+ }
+ elsif ($disallow_type == 1) {
+ $disallow_flags = $utf8n_flag_to_disallow;
+ $expected_ret = 0;
+ }
+ elsif ($disallow_type == 2) {
+ next if ! requires_extended_utf8($allowed_uv);
+ $disallow_flags = $::UTF8_DISALLOW_PERL_EXTENDED;
+ $expected_ret = 0;
+ }
+ else { # type is 0
+ $disallow_flags = $utf8n_flag_to_disallow_complement;
+ $expected_ret = $this_length;
+ }
+
+ $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length,
+ $disallow_flags);
+ is($ret, $expected_ret,
+ " And isUTF8_CHAR_flags($display_bytes,"
+ . " $disallow_flags) returns $expected_ret")
+ or diag "The flags mean "
+ . flags_to_text($disallow_flags,
+ \@utf8n_flags_to_text);
+ is(scalar @warnings_gotten, 0,
+ " And isUTF8_CHAR_flags(...) generated"
+ . " no warnings")
+ or output_warnings(@warnings_gotten);
+
+ # Test partial character handling, for each byte not a
+ # full character
+ my $did_test_partial = 0;
+ for (my $j = 1; $j < $this_length - 1; $j++) {
+ $did_test_partial = 1;
+ my $partial = substr($this_bytes, 0, $j);
+ my $ret_should_be;
+ my $comment;
+ if ($disallow_type || $malformations_name) {
+ $ret_should_be = 0;
+ $comment = "disallowed";
+
+ # The number of bytes required to tell if a
+ # sequence has something wrong is the smallest of
+ # all the things wrong with it. We start with the
+ # number for this type of code point, if that is
+ # disallowed; or the whole length if not. The
+ # latter is what a couple of the malformations
+ # require.
+ my $needed_to_tell = ($disallow_type)
+ ? $this_needed_to_discern_len
+ : $this_expected_len;
+
+ # Then we see if the malformations that are
+ # detectable early in the string are present.
+ if ($overlong) {
+ my $dl = overlong_discern_len($this_bytes);
+ $needed_to_tell = $dl if $dl < $needed_to_tell;
+ }
+ if ($will_overflow) {
+ my $dl = overflow_discern_len($length);
+ $needed_to_tell = $dl if $dl < $needed_to_tell;
+ }
+
+ if ($j < $needed_to_tell) {
+ $ret_should_be = 1;
+ $comment .= ", but need $needed_to_tell"
+ . " bytes to discern:";
+ }
+ }
+ else {
+ $ret_should_be = 1;
+ $comment = "allowed";
+ }
+
+ undef @warnings_gotten;
+
+ $ret = test_is_utf8_valid_partial_char_flags($partial,
+ $j, $disallow_flags);
+ is($ret, $ret_should_be,
+ " And is_utf8_valid_partial_char_flags("
+ . display_bytes($partial)
+ . ", $disallow_flags), $comment: returns"
+ . " $ret_should_be")
+ or diag "The flags mean "
+ . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
+ }
+
+ if ($did_test_partial) {
+ is(scalar @warnings_gotten, 0,
+ " And is_utf8_valid_partial_char_flags()"
+ . " generated no warnings for any of the lengths")
+ or output_warnings(@warnings_gotten);
+ }
+ }
+ }
+
+ # Now test the to/from UTF-8 calls. There are several orthogonal
+ # variables involved. We test most possible combinations
+
foreach my $do_disallow (0, 1) {
- next if $skip_most_tests && ! $do_disallow;
+ if ($do_disallow) {
+ next if $initially_overlong;
+ }
+ else {
+ next if $skip_most_tests;
+ }
# We classify the warnings into certain "interesting" types,
# described later
foreach my $warning_type (0..4) {
next if $skip_most_tests && $warning_type != 1;
foreach my $use_warn_flag (0, 1) {
- next if $skip_most_tests && ! $use_warn_flag;
+ if ($use_warn_flag) {
+ next if $initially_overlong;
+ }
+ else {
+ next if $skip_most_tests;
+ }
# Finally, here is the inner loop
}
}
- my $this_name = "utf8n_to_uvchr_error() $testname: "
- . (($disallowed)
- ? 'disallowed'
- : 'allowed');
- $this_name .= ", $eval_warn";
+ my $this_name = "utf8n_to_uvchr_error() $testname: ";
+ if (! $initially_malformed) {
+ $this_name .= ($disallowed)
+ ? 'disallowed, '
+ : 'allowed, ';
+ }
+ $this_name .= "$eval_warn";
$this_name .= ", " . (( $this_warning_flags
& $this_utf8n_flag_to_warn)
? 'with flag for raising warnings'