no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
# machines, and that is tested elsewhere
-
use XS::APItest;
-use Data::Dumper;
my @warnings_gotten;
local $SIG{__WARN__} = sub { my @copy = @_;
push @warnings_gotten, map { chomp; $_ } @copy;
};
+
+my $highest_non_extended_utf8_cp = (isASCII) ? 0x7FFFFFFF : 0x3FFFFFFF;
+my $native_lowest_continuation_chr = I8_to_native(chr $::lowest_continuation);
+
+sub requires_extended_utf8($) {
+
+ # Returns a boolean as to whether or not the code point parameter fits
+ # into 31 bits, subject to the convention that a negative code point
+ # stands for one that overflows the word size, so won't fit in 31 bits.
+
+ return shift > $highest_non_extended_utf8_cp;
+}
+
+sub overflow_discern_len($) {
+
+ # Returns how many bytes are needed to tell if a UTF-8 sequence is for a
+ # code point that won't fit in the platform's word size. Only the length
+ # of the sequence representing a single code point is needed.
+
+ if (isASCII) {
+ return ($::is64bit) ? 3 : ((shift == $::max_bytes)
+ ? 1
+ : 2);
+ }
+
+ return ($::is64bit) ? 2 : 8;
+}
+
my @tests;
{
no warnings qw(portable overflow);
: I8_to_native(
"\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
0x80000000,
- (isASCII) ? 1 : 8,
],
[ "highest 32 bit code point",
(isASCII)
: I8_to_native(
"\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
0xFFFFFFFF,
- (isASCII) ? 1 : 8,
],
[ "Lowest 33 bit code point",
(isASCII)
: I8_to_native(
"\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
($::is64bit) ? 0x100000000 : -1, # Overflows on 32-bit systems
- (isASCII && ! $::is64bit) ? 2 : 1,
],
);
[ "overflow that old algorithm failed to detect",
"\xfe\x86\x80\x80\x80\x80\x80",
-1,
- 2,
];
}
}
: I8_to_native(
"\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
-1,
- (isASCII) ? 3 : 2,
];
if (isASCII) {
push @tests,
[ "Lowest code point requiring 13 bytes to represent",
"\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
0x1000000000,
- 1,
],
[ "overflow that old algorithm failed to detect",
"\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
-1,
- 3,
];
}
else {
I8_to_native(
"\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
0x800000000,
- 7,
],
[ "requires at least 32 bits",
I8_to_native(
"\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
0x10000000000,
- 6,
],
[ "requires at least 32 bits",
I8_to_native(
"\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
0x200000000000,
- 5,
],
[ "requires at least 32 bits",
I8_to_native(
"\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
0x4000000000000,
- 4,
],
[ "requires at least 32 bits",
I8_to_native(
"\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
0x80000000000000,
- 3,
],
[ "requires at least 32 bits",
I8_to_native(
"\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
0x1000000000000000,
- 2,
];
}
}
W_NONCHAR
D_SUPER
W_SUPER
- D_ABOVE_31_BIT
- W_ABOVE_31_BIT
+ D_PERL_EXTENDED
+ W_PERL_EXTENDED
CHECK_ONLY
NO_CONFIDENCE_IN_CURLEN_
) );
W_SURROGATE
W_NONCHAR
W_SUPER
- W_ABOVE_31_BIT
+ W_PERL_EXTENDED
D_SURROGATE
D_NONCHAR
D_SUPER
- D_ABOVE_31_BIT
+ D_PERL_EXTENDED
) );
$_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x;
my $will_overflow = $allowed_uv < 0;
my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv);
+ my $display_bytes = display_bytes($bytes);
my $controlling_warning_category;
my $utf8n_flag_to_warn;
# contain a code point. (This is a result of
# some sort of malformation that means we
# can't get an exact code poin
+ my $extended_cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
+ \Q requires a Perl extension, and so is not\E
+ \Q portable\E/x;
+ my $extended_non_cp_trailing_text
+ = "is a Perl extension, and so is not portable";
+
+ # Is this test malformed from the beginning? If so, we know to generally
+ # expect that the tests will show it isn't valid.
+ my $initially_malformed = 0;
if ($will_overflow || $allowed_uv > 0x10FFFF) {
- # Set the SUPER flags; later, we test for ABOVE_31_BIT as well.
+ # Set the SUPER flags; later, we test for PERL_EXTENDED as well.
$utf8n_flag_to_warn = $::UTF8_WARN_SUPER;
$utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER;
$uvchr_flag_to_warn = $::UNICODE_WARN_SUPER;
$uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;;
- # Below, we add the flags for non-above-31 bit to the code points that
- # don't fit that category. Special tests are done for this category
- # in the inner loop.
+ # Below, we add the flags for non-perl_extended to the code points
+ # that don't fit that category. Special tests are done for this
+ # category in the inner loop.
$utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR
|$::UTF8_WARN_SURROGATE;
$utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
if ($will_overflow) { # This is realy a malformation
$non_cp_trailing_text = "if you see this, there is an error";
$cp_message_qr = qr/\Q$non_cp_trailing_text\E/;
+ $initially_malformed = 1;
+ if (! defined $needed_to_discern_len) {
+ $needed_to_discern_len = overflow_discern_len($length);
+ }
}
- elsif ($allowed_uv > 0x7FFFFFFF) {
- $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
- \Q and not portable\E/x;
- $non_cp_trailing_text = "is for a non-Unicode code point, and is not portable";
+ elsif (requires_extended_utf8($allowed_uv)) {
+ $cp_message_qr = $extended_cp_message_qr;
+ $non_cp_trailing_text = $extended_non_cp_trailing_text;
+ $needed_to_discern_len = 1 unless defined $needed_to_discern_len;
}
else {
$cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
\Q may not be portable\E/x;
$non_cp_trailing_text = "is for a non-Unicode code point, may not"
. " be portable";
- $utf8n_flag_to_warn_complement |= $::UTF8_WARN_ABOVE_31_BIT;
- $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_ABOVE_31_BIT;
- $uvchr_flag_to_warn_complement |= $::UNICODE_WARN_ABOVE_31_BIT;
+ $utf8n_flag_to_warn_complement |= $::UTF8_WARN_PERL_EXTENDED;
+ $utf8n_flag_to_disallow_complement
+ |= $::UTF8_DISALLOW_PERL_EXTENDED;
+ $uvchr_flag_to_warn_complement |= $::UNICODE_WARN_PERL_EXTENDED;
$uvchr_flag_to_disallow_complement
- |= $::UNICODE_DISALLOW_ABOVE_31_BIT;
+ |= $::UNICODE_DISALLOW_PERL_EXTENDED;
}
}
elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
$utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR
|$::UTF8_WARN_SUPER
- |$::UTF8_WARN_ABOVE_31_BIT;
+ |$::UTF8_WARN_PERL_EXTENDED;
$utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
|$::UTF8_DISALLOW_SUPER
- |$::UTF8_DISALLOW_ABOVE_31_BIT;
+ |$::UTF8_DISALLOW_PERL_EXTENDED;
$uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR
|$::UNICODE_WARN_SUPER
- |$::UNICODE_WARN_ABOVE_31_BIT;
+ |$::UNICODE_WARN_PERL_EXTENDED;
$uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
|$::UNICODE_DISALLOW_SUPER
- |$::UNICODE_DISALLOW_ABOVE_31_BIT;
+ |$::UNICODE_DISALLOW_PERL_EXTENDED;
$controlling_warning_category = 'surrogate';
}
elsif ( ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
$utf8n_flag_to_warn_complement = $::UTF8_WARN_SURROGATE
|$::UTF8_WARN_SUPER
- |$::UTF8_WARN_ABOVE_31_BIT;
+ |$::UTF8_WARN_PERL_EXTENDED;
$utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE
|$::UTF8_DISALLOW_SUPER
- |$::UTF8_DISALLOW_ABOVE_31_BIT;
+ |$::UTF8_DISALLOW_PERL_EXTENDED;
$uvchr_flag_to_warn_complement = $::UNICODE_WARN_SURROGATE
|$::UNICODE_WARN_SUPER
- |$::UNICODE_WARN_ABOVE_31_BIT;
+ |$::UNICODE_WARN_PERL_EXTENDED;
$uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_SURROGATE
|$::UNICODE_DISALLOW_SUPER
- |$::UNICODE_DISALLOW_ABOVE_31_BIT;
+ |$::UNICODE_DISALLOW_PERL_EXTENDED;
$controlling_warning_category = 'nonchar';
}
die 'Didn\'t set $needed_to_discern_len for ' . $testname
unless defined $needed_to_discern_len;
+
{ # First test the isFOO calls
- use warnings; # Make sure these don't raise warnings
+ use warnings; no warnings 'deprecated'; # Make sure these don't raise warnings
undef @warnings_gotten;
my $ret = test_isUTF8_CHAR($bytes, $length);
my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
- if ($will_overflow) {
+ if ($initially_malformed) {
is($ret, 0, "For $testname: isUTF8_CHAR() returns 0");
is($ret_flags, 0, " And isUTF8_CHAR_flags() returns 0");
}
undef @warnings_gotten;
$ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
- if ($will_overflow) {
+ if ($initially_malformed) {
is($ret, 0, " And isSTRICT_UTF8_CHAR() returns 0");
}
else {
undef @warnings_gotten;
$ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
- if ($will_overflow) {
+ if ($initially_malformed) {
is($ret, 0, " And isC9_STRICT_UTF8_CHAR() returns 0");
}
else {
. " generated any warnings")
or output_warnings(@warnings_gotten);
- # Test partial character handling, for each byte not a full character
- for my $j (1.. $length - 1) {
+ foreach my $disallow_type (0..2) {
+ # 0 is don't disallow this type of code point
+ # 1 is do disallow
+ # 2 is do disallow, but only code points requiring
+ # perl-extended-UTF8
- # Skip the test for the interaction between overflow and above-31
- # bit. It is really testing other things than the partial
- # character tests, for which other tests in this file are
- # sufficient
- last if $will_overflow;
+ my $disallow_flags;
+ my $expected_ret;
- foreach my $disallow_flag (0, $utf8n_flag_to_disallow) {
+ if ($initially_malformed) {
+
+ # Malformations are by default disallowed, so testing with
+ # $disallow_type equal to 0 is sufficicient.
+ next if $disallow_type;
+
+ $disallow_flags = 0;
+ $expected_ret = 0;
+ }
+ elsif ($disallow_type == 1) {
+ $disallow_flags = $utf8n_flag_to_disallow;
+ $expected_ret = 0;
+ }
+ elsif ($disallow_type == 2) {
+ next if ! requires_extended_utf8($allowed_uv);
+ $disallow_flags = $::UTF8_DISALLOW_PERL_EXTENDED;
+ $expected_ret = 0;
+ }
+ else { # type is 0
+ $disallow_flags = $utf8n_flag_to_disallow_complement;
+ $expected_ret = $length;
+ }
+
+ $ret = test_isUTF8_CHAR_flags($bytes, $length, $disallow_flags);
+ is($ret, $expected_ret, " And isUTF8_CHAR_flags("
+ . "$display_bytes, $disallow_flags) returns "
+ . $expected_ret)
+ or diag "The flags mean "
+ . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
+
+ is(scalar @warnings_gotten, 0,
+ " And isUTF8_CHAR_flags(...) generated no warnings")
+ or output_warnings(@warnings_gotten);
+
+ # Test partial character handling, for each byte not a full character
+ my $did_test_partial = 0;
+ for (my $j = 1; $j < $length - 1; $j++) {
+ $did_test_partial = 1;
my $partial = substr($bytes, 0, $j);
my $ret_should_be;
my $comment;
- if ($disallow_flag) {
+ if ($disallow_type || $initially_malformed) {
$ret_should_be = 0;
$comment = "disallowed";
if ($j < $needed_to_discern_len) {
$ret_should_be = 1;
- $comment .= ", but need $needed_to_discern_len bytes"
- . " to discern:";
+ $comment .= ", but need $needed_to_discern_len"
+ . " bytes to discern:";
}
}
else {
undef @warnings_gotten;
$ret = test_is_utf8_valid_partial_char_flags($partial, $j,
- $disallow_flag);
+ $disallow_flags);
is($ret, $ret_should_be,
- " And is_utf8_valid_partial_char_flags("
- . display_bytes($partial)
- . "), $comment: returns $ret_should_be");
+ " And is_utf8_valid_partial_char_flags("
+ . display_bytes($partial)
+ . ", $disallow_flags), $comment: returns $ret_should_be")
+ or diag "The flags mean "
+ . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
+ }
+
+ if ($did_test_partial) {
is(scalar @warnings_gotten, 0,
" And is_utf8_valid_partial_char_flags()"
- . " generated no warnings")
- or output_warnings(@warnings_gotten);
+ . " generated no warnings for any of the lengths")
+ or output_warnings(@warnings_gotten);
}
}
}
# This is more complicated than the malformations tested in other files in
# this directory, as there are several orthogonal variables involved. We
# test most possible combinations
- foreach my $do_disallow (0, 1) {
- next if $skip_most_tests && ! $do_disallow;
- # We try various combinations of malformations that can occur
- foreach my $short ("", "short") {
- next if $skip_most_tests && $short;
- foreach my $unexpected_noncont ("", "unexpected non-continuation") {
- next if $skip_most_tests && $unexpected_noncont;
- foreach my $overlong ("", "overlong") {
- next if $overlong && $skip_most_tests;
+ # We try various combinations of malformations that can occur
+ foreach my $short (0, 1) {
+ next if $skip_most_tests && $short;
+ foreach my $unexpected_noncont (0, 1) {
+ next if $skip_most_tests && $unexpected_noncont;
+ foreach my $overlong (0, 1) {
+ next if $overlong && $skip_most_tests;
+
+ # If we're creating an overlong, it can't be longer than the
+ # maximum length, so skip if we're already at that length.
+ next if $overlong && $length >= $::max_bytes;
+
+ my $this_cp_message_qr = $cp_message_qr;
+ my $this_non_cp_trailing_text = $non_cp_trailing_text;
+
+ foreach my $malformed_allow_type (0..2) {
+ # 0 don't allow this malformation; ignored if no malformation
+ # 1 allow, with REPLACEMENT CHARACTER returned
+ # 2 allow, with intended code point returned. All malformations
+ # other than overlong can't determine the intended code point,
+ # so this isn't valid for them.
+ next if $malformed_allow_type == 2
+ && ($will_overflow || $short || $unexpected_noncont);
+ next if $skip_most_tests && $malformed_allow_type;
+
+ # Here we are in the innermost loop for malformations. So we
+ # know which ones are in effect. Can now change the input to be
+ # appropriately malformed. We also can set up certain other
+ # things now, like whether we expect a return flag from this
+ # malformation, and which flag.
+
+ my $this_bytes = $bytes;
+ my $this_length = $length;
+ my $this_expected_len = $length;
+ my $this_needed_to_discern_len = $needed_to_discern_len;
+
+ my @malformation_names;
+ my @expected_malformation_warnings;
+ my @expected_malformation_return_flags;
+
+ # Contains the flags for any allowed malformations. Currently no
+ # combinations of on/off are tested for. It's either all are
+ # allowed, or none are.
+ my $allow_flags = 0;
+ my $overlong_is_in_perl_extended_utf8 = 0;
+ my $dont_use_overlong_cp = 0;
+
+ if ($overlong) {
+ my $new_expected_len;
+
+ # To force this malformation, we convert the original start
+ # byte into a continuation byte with the same data bits as
+ # originally. ...
+ my $start_byte = substr($this_bytes, 0, 1);
+ my $converted_to_continuation_byte
+ = start_byte_to_cont($start_byte);
+
+ # ... Then we prepend it with a known overlong sequence. This
+ # should evaluate to the exact same code point as the
+ # original. We try to avoid an overlong using Perl extended
+ # UTF-8. The code points are the highest representable as
+ # overlongs on the respective platform without using extended
+ # UTF-8.
+ if (native_to_I8($start_byte) lt "\xFC") {
+ $start_byte = I8_to_native("\xFC");
+ $new_expected_len = 6;
+ }
+ elsif (! isASCII && native_to_I8($start_byte) lt "\xFE") {
+
+ # FE is not extended UTF-8 on EBCDIC
+ $start_byte = I8_to_native("\xFE");
+ $new_expected_len = 7;
+ }
+ else { # Must use extended UTF-8. On ASCII platforms, we
+ # could express some overlongs here starting with
+ # \xFE, but there's no real reason to do so.
+ $overlong_is_in_perl_extended_utf8 = 1;
+ $start_byte = I8_to_native("\xFF");
+ $new_expected_len = $::max_bytes;
+ $this_cp_message_qr = $extended_cp_message_qr;
+
+ # The warning that gets raised doesn't include the code
+ # point in the message if the code point can be expressed
+ # without using extended UTF-8, but the particular
+ # overlong sequence used is in extended UTF-8. To do
+ # otherwise would be confusing to the user, as it would
+ # claim the code point requires extended, when it doesn't.
+ $dont_use_overlong_cp = 1
+ unless requires_extended_utf8($allowed_uv);
+ $this_non_cp_trailing_text = $extended_non_cp_trailing_text;
+ }
+
+ # Splice in the revise continuation byte, preceded by the
+ # start byte and the proper number of the lowest continuation
+ # bytes.
+ $this_bytes = $start_byte
+ . ($native_lowest_continuation_chr
+ x ( $new_expected_len - 1 - length($this_bytes)))
+ . $converted_to_continuation_byte
+ . substr($this_bytes, 1);
+ $this_length = length($this_bytes);
+ $this_needed_to_discern_len = $new_expected_len
+ - ( $this_expected_len
+ - $this_needed_to_discern_len);
+ $this_expected_len = $new_expected_len;
+ push @expected_malformation_return_flags, $::UTF8_GOT_LONG;
+ push @malformation_names, 'overlong';
+
+ if ($malformed_allow_type == 2) {
+ $allow_flags |= $::UTF8_ALLOW_LONG_AND_ITS_VALUE;
+ }
+ elsif ($malformed_allow_type) {
+ $allow_flags |= $::UTF8_ALLOW_LONG;
+ }
+ }
+
+ if ($short) {
+ push @malformation_names, 'short';
+
+ # To force this malformation, just tell the test to not look
+ # as far as it should into the input.
+ $this_length--;
+ $this_expected_len--;
+ push @expected_malformation_return_flags, $::UTF8_GOT_SHORT;
+
+ $allow_flags |= $::UTF8_ALLOW_SHORT if $malformed_allow_type;
+ }
+
+ if ($unexpected_noncont) {
+ push @malformation_names, 'unexpected non-continuation';
+
+ # To force this malformation, change the final continuation
+ # byte into a non continuation.
+ my $pos = ($short) ? -2 : -1;
+ substr($this_bytes, $pos, 1) = '?';
+ $this_expected_len--;
+ push @expected_malformation_return_flags,
+ $::UTF8_GOT_NON_CONTINUATION;
+ $allow_flags |= $::UTF8_ALLOW_NON_CONTINUATION
+ if $malformed_allow_type;
+ }
+
+ # The whole point of a test that is malformed from the beginning
+ # is to test for that malformation. If we've modified things so
+ # much that we don't have enough information to detect that
+ # malformation, there's no point in testing.
+ next if $initially_malformed
+ && $this_expected_len < $this_needed_to_discern_len;
+
+ # Here, we've transformed the input with all of the desired
+ # non-overflow malformations. We are now in a position to
+ # construct any potential warnings for those malformations. But
+ # it's a pain to get the detailed messages exactly right, so for
+ # now XXX, only do so for those that return an explicit code
+ # point.
+
+ if ($overlong) {
+
+ # If one of the other malformation types is also in effect, we
+ # don't know what the intended code point was.
+ if ($short || $unexpected_noncont || $will_overflow) {
+ push @expected_malformation_warnings, qr/overlong/;
+ }
+ else {
+ my $wrong_bytes = display_bytes_no_quotes(
+ substr($this_bytes, 0, $this_length));
+ my $correct_bytes = display_bytes_no_quotes($bytes);
+ my $prefix = ($allowed_uv > 0x10FFFF) ? "0x" : "U+";
+ push @expected_malformation_warnings,
+ qr/\QMalformed UTF-8 character: $wrong_bytes\E
+ \Q (overlong; instead use\E
+ \Q $correct_bytes to\E
+ \Q represent $prefix$uv_string)/x;
+ }
+ }
+ if ($short) {
+ push @expected_malformation_warnings, qr/too short/;
+ }
+ if ($unexpected_noncont) {
+ push @expected_malformation_warnings,
+ qr/unexpected non-continuation byte/;
+ }
+
+ # The overflow malformation is done differently than other
+ # malformations. It comes from manually typed tests in the test
+ # array. We now make it be treated like one of the other
+ # malformations. But some has to be deferred until the inner loop
+ my $overflow_msg_pattern;
+ if ($will_overflow) {
+ push @malformation_names, 'overflow';
+
+ $overflow_msg_pattern = display_bytes_no_quotes(
+ substr($this_bytes, 0, $this_expected_len));
+ $overflow_msg_pattern = qr/\QMalformed UTF-8 character:\E
+ \Q $overflow_msg_pattern\E
+ \Q (overflows)\E/x;
+ push @expected_malformation_return_flags, $::UTF8_GOT_OVERFLOW;
+ $allow_flags |= $::UTF8_ALLOW_OVERFLOW if $malformed_allow_type;
+ }
+
+ # And we can create the malformation-related text for the the test
+ # names we eventually will generate.
+ my $malformations_name = "";
+ if (@malformation_names) {
+ $malformations_name .= "dis" unless $malformed_allow_type;
+ $malformations_name .= "allowed ";
+ $malformations_name .= "malformation";
+ $malformations_name .= "s" if @malformation_names > 1;
+ $malformations_name .= ": ";
+ $malformations_name .= join "/", @malformation_names;
+ $malformations_name = " ($malformations_name)";
+ }
+
+ # Done setting up the malformation related stuff
- # If we're creating an overlong, it can't be longer than the
- # maximum length, so skip if we're already at that length.
- next if $overlong && $length >= $::max_bytes;
+ foreach my $do_disallow (0, 1) {
+ next if $skip_most_tests && ! $do_disallow;
# We classify the warnings into certain "interesting" types,
# described later
if ($warning_type == 0) {
$eval_warn = "use warnings; no warnings 'deprecated'";
$expect_regular_warnings = $use_warn_flag;
+
+ # We ordinarily expect overflow warnings here. But it
+ # is somewhat more complicated, and the final
+ # determination is deferred to one place in the filw
+ # where we handle overflow.
$expect_warnings_for_overflow = 1;
- $expect_warnings_for_malformed = 1;
+
+ # We would ordinarily expect malformed warnings in
+ # this case, but not if malformations are allowed.
+ $expect_warnings_for_malformed
+ = $malformed_allow_type == 0;
}
elsif ($warning_type == 1) {
$eval_warn = "no warnings";
$eval_warn = "no warnings; use warnings 'utf8'";
$expect_regular_warnings = $use_warn_flag;
$expect_warnings_for_overflow = 1;
- $expect_warnings_for_malformed = 1;
+ $expect_warnings_for_malformed
+ = $malformed_allow_type == 0;
}
elsif ($warning_type == 3) {
$eval_warn = "no warnings; use warnings"
$expect_warnings_for_malformed = 0;
}
elsif ($warning_type == 4) { # Like type 3, but uses the
- # above-31-bit flags
+ # PERL_EXTENDED flags
# The complement flags were set up so that the
- # above-31-bit flags have been tested that they don't
+ # PERL_EXTENDED flags have been tested that they don't
# trigger wrongly for too small code points. And the
# flags have been set up so that those small code
# points are tested for being above Unicode. What's
# left to test is that the large code points do
- # trigger the above-31-bit flags.
- next if ! $will_overflow && $allowed_uv < 0x80000000;
+ # trigger the PERL_EXTENDED flags.
+ next if ! requires_extended_utf8($allowed_uv);
next if $controlling_warning_category ne 'non_unicode';
$eval_warn = "no warnings; use warnings 'non_unicode'";
$expect_regular_warnings = 1;
$expect_warnings_for_overflow = 1;
$expect_warnings_for_malformed = 0;
- $this_utf8n_flag_to_warn = $::UTF8_WARN_ABOVE_31_BIT;
+ $this_utf8n_flag_to_warn = $::UTF8_WARN_PERL_EXTENDED;
$this_utf8n_flag_to_disallow
- = $::UTF8_DISALLOW_ABOVE_31_BIT;
- $this_uvchr_flag_to_warn = $::UNICODE_WARN_ABOVE_31_BIT;
+ = $::UTF8_DISALLOW_PERL_EXTENDED;
+ $this_uvchr_flag_to_warn
+ = $::UNICODE_WARN_PERL_EXTENDED;
$this_uvchr_flag_to_disallow
- = $::UNICODE_DISALLOW_ABOVE_31_BIT;
+ = $::UNICODE_DISALLOW_PERL_EXTENDED;
}
else {
die "Unexpected warning type '$warning_type'";
# on all the other flags. That makes sure that they all
# are independent of this flag, and so we don't need to
# test them individually.
- my $this_warning_flags = ($use_warn_flag)
- ? $this_utf8n_flag_to_warn
- : $utf8n_flag_to_warn_complement;
- my $this_disallow_flags = ($do_disallow)
- ? $this_utf8n_flag_to_disallow
- : $utf8n_flag_to_disallow_complement;
- my $this_bytes = $bytes;
- my $this_length = $length;
+ my $this_warning_flags
+ = ($use_warn_flag)
+ ? $this_utf8n_flag_to_warn
+ : ($overlong_is_in_perl_extended_utf8
+ ? ($utf8n_flag_to_warn_complement
+ & ~$::UTF8_WARN_PERL_EXTENDED)
+ : $utf8n_flag_to_warn_complement);
+ my $this_disallow_flags
+ = ($do_disallow)
+ ? $this_utf8n_flag_to_disallow
+ : ($overlong_is_in_perl_extended_utf8
+ ? ($utf8n_flag_to_disallow_complement
+ & ~$::UTF8_DISALLOW_PERL_EXTENDED)
+ : $utf8n_flag_to_disallow_complement);
my $expected_uv = $allowed_uv;
- my $this_expected_len = $length;
- my $this_needed_to_discern_len = $needed_to_discern_len;
+ my $this_uv_string = $uv_string;
- my @malformation_names;
+ my @expected_return_flags
+ = @expected_malformation_return_flags;
my @expected_warnings;
- my @expected_return_flags;
-
- # Now go through the possible malformations wanted, and
- # change the input accordingly. We also can set up
- # certain other things now, like whether we expect a
- # return flag from this malformation and which flag.
- if ($overlong) {
-
- # To force this malformation, we convert the original
- # start byte into a continuation byte with the same
- # data bits as originally. ...
- substr($this_bytes, 0, 1)
- = start_byte_to_cont(substr($this_bytes,
- 0, 1));
-
- # ... Then we prepend it with a known overlong
- # sequence. This should evaluate to the exact same
- # code point as the original.
- $this_bytes
- = I8_to_native("\xff")
- . (I8_to_native(chr $::lowest_continuation)
- x ( $::max_bytes - 1 - length($this_bytes)))
- . $this_bytes;
- $this_length = length($this_bytes);
- $this_needed_to_discern_len
- = $::max_bytes - ($this_expected_len
- - $this_needed_to_discern_len);
- $this_expected_len = $::max_bytes;
- push @expected_return_flags, $::UTF8_GOT_LONG;
- push @malformation_names, $overlong;
- if ($expect_warnings_for_malformed) {
- if ( ! $short
- && ! $unexpected_noncont
- && ! $will_overflow)
- {
- my $overlong_bytes
- = display_bytes_no_quotes($this_bytes);
- my $correct_bytes
- = display_bytes_no_quotes($bytes);
- my $prefix = ($allowed_uv > 0x10FFFF)
- ? "0x"
- : "U+";
- push @expected_warnings,
- qr/\QMalformed UTF-8 character:\E
- \Q $overlong_bytes (overlong;\E
- \Q instead use $correct_bytes to\E
- \Q represent $prefix$uv_string)/x;
- }
- else {
- push @expected_warnings, qr/overlong/;
- }
- }
- }
-
- if ($short) {
- push @malformation_names, $short;
- push @expected_warnings, qr/short/
- if $expect_warnings_for_malformed;
-
- # To force this malformation, just tell the test to
- # not look as far as it should into the input.
- $this_length--;
- $this_expected_len--;
- push @expected_return_flags, $::UTF8_GOT_SHORT;
- }
-
- if ($unexpected_noncont) {
- push @malformation_names, $unexpected_noncont;
- push @expected_warnings, qr/$unexpected_noncont/
+ push @expected_warnings, @expected_malformation_warnings
if $expect_warnings_for_malformed;
- # To force this malformation, change the final
- # continuation byte into a non continuation.
- my $pos = ($short) ? -2 : -1;
- substr($this_bytes, $pos, 1) = '?';
- $this_expected_len--;
- push @expected_return_flags,
- $::UTF8_GOT_NON_CONTINUATION;
- }
-
# The overflow malformation is done differently than other
# malformations. It comes from manually typed tests in
# the test array, but it also is above Unicode and uses
# everything else.
$expect_regular_warnings = 0;
- push @malformation_names, 'overflow';
- if ($expect_warnings_for_overflow) {
- my $qr = display_bytes_no_quotes(
- substr($this_bytes, 0, $this_expected_len));
- $qr = qr/\QMalformed UTF-8 character: \E
- \Q$qr (overflows)\E/x;
- push @expected_warnings, $qr;
+ # Earlier, we tentatively calculated whether this
+ # should emit a message or not. It's tentative
+ # because, even if we ordinarily would output it, we
+ # don't if malformations are allowed -- except an
+ # overflow is also a SUPER and PERL_EXTENDED, and if
+ # warnings for those are enabled, the overflow
+ # warning does get raised.
+ if ( $expect_warnings_for_overflow
+ && ( $malformed_allow_type == 0
+ || ( $this_warning_flags
+ & ($::UTF8_WARN_SUPER
+ |$::UTF8_WARN_PERL_EXTENDED))))
+ {
+ push @expected_warnings, $overflow_msg_pattern;
}
- push @expected_return_flags, $::UTF8_GOT_OVERFLOW;
- }
-
- # Here, we've set things up based on the malformations.
- # Now generate the text for them for the test name.
- my $malformations_name = "";
- if (@malformation_names) {
- $malformations_name .= "malformation";
- $malformations_name .= "s" if @malformation_names > 1;
- $malformations_name .= ": ";
- $malformations_name .= join "/", @malformation_names;
- $malformations_name = " ($malformations_name)";
}
# It may be that the malformations have shortened the
# So far the array contains warnings generated by
# malformations. Add the expected regular one.
- unshift @expected_warnings, $cp_message_qr;
+ unshift @expected_warnings, $this_cp_message_qr;
# But it may need to be modified, because either of
# these malformations means we can't determine the
# expected code point.
- if ($short || $unexpected_noncont) {
+ if ( $short || $unexpected_noncont
+ || $dont_use_overlong_cp)
+ {
my $first_byte = substr($this_bytes, 0, 1);
$expected_warnings[0] = display_bytes(
substr($this_bytes, 0, $this_expected_len));
$expected_warnings[0]
= qr/[Aa]\Qny UTF-8 sequence that starts with\E
\Q $expected_warnings[0]\E
- \Q $non_cp_trailing_text\E/x;
+ \Q $this_non_cp_trailing_text\E/x;
}
}
- # Is effectively disallowed if we've set up a
- # malformation, even if the flag indicates it is
- # allowed. Fix up test name to indicate this as
- # well
- my $disallowed = ( $this_disallow_flags
- & $this_utf8n_flag_to_disallow)
- || $malformations_name;
- my $this_name = "utf8n_to_uvchr_error() $testname: "
- . (($disallowed)
- ? 'disallowed'
- : 'allowed');
- $this_name .= ", $eval_warn";
+ # Is effectively disallowed if we've set up a malformation
+ # (unless malformations are allowed), even if the flag
+ # indicates it is allowed. Fix up test name to indicate
+ # this as well
+ my $disallowed = 0;
+ if ( $this_disallow_flags & $this_utf8n_flag_to_disallow
+ && $this_expected_len >= $this_needed_to_discern_len)
+ {
+ $disallowed = 1;
+ }
+ if ($malformations_name) {
+ if ($malformed_allow_type == 0) {
+ $disallowed = 1;
+ }
+ elsif ($malformed_allow_type == 1) {
+
+ # Even if allowed, the malformation returns the
+ # REPLACEMENT CHARACTER.
+ $expected_uv = 0xFFFD;
+ $this_uv_string = "0xFFFD"
+ }
+ }
+
+ my $this_name = "utf8n_to_uvchr_error() $testname: ";
+ if (! $initially_malformed) {
+ $this_name .= ($disallowed)
+ ? 'disallowed, '
+ : 'allowed, ';
+ }
+ $this_name .= "$eval_warn";
$this_name .= ", " . (( $this_warning_flags
& $this_utf8n_flag_to_warn)
? 'with flag for raising warnings'
# Do the actual test using an eval
undef @warnings_gotten;
my $ret_ref;
- my $this_flags = $this_warning_flags|$this_disallow_flags;
+ my $this_flags
+ = $allow_flags|$this_warning_flags|$this_disallow_flags;
my $eval_text = "$eval_warn; \$ret_ref"
. " = test_utf8n_to_uvchr_error("
. "'$this_bytes', $this_length, $this_flags)";
else {
is($ret_ref->[0], $expected_uv,
" And returns expected uv: "
- . $uv_string)
+ . $this_uv_string)
or diag "Call was: " . utf8n_display_call($eval_text);
}
is($ret_ref->[1], $this_expected_len,
for (my $i = @expected_return_flags - 1; $i >= 0; $i--) {
if ($expected_return_flags[$i] & $returned_flags) {
if ($expected_return_flags[$i]
- == $::UTF8_DISALLOW_ABOVE_31_BIT)
+ == $::UTF8_GOT_PERL_EXTENDED)
{
pass(" Expected and got return flag for"
- . " above_31_bit");
+ . " PERL_EXTENDED");
}
# The first entries in this are
# malformations
or diag "Call was: " . uvchr_display_call($eval_text);
}
}
+ }
}
}
}