#!perl -w
# This is a base file to be used by various .t's in its directory
+# It tests various code points that are "problematic", and verifies that the
+# correct warnings/flags etc are generated when using them. It also takes the
+# UTF-8 for some of them and perturbs it to be malformed in various ways, and
+# tests that this gets appropriately detected.
use strict;
use Test::More;
no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
# machines, and that is tested elsewhere
-
use XS::APItest;
-use Data::Dumper;
my @warnings_gotten;
use warnings 'utf8';
-local $SIG{__WARN__} = sub { push @warnings_gotten, @_ };
+local $SIG{__WARN__} = sub { my @copy = @_;
+ push @warnings_gotten, map { chomp; $_ } @copy;
+ };
-sub nonportable_regex ($) {
+my $highest_non_extended_utf8_cp = (isASCII) ? 0x7FFFFFFF : 0x3FFFFFFF;
+my $native_lowest_continuation_chr = I8_to_native(chr $::lowest_continuation);
- # Returns a pattern that matches the non-portable message raised either
- # for the specific input code point, or the one generated when there
- # is some malformation that precludes the message containing the specific
- # code point
+sub requires_extended_utf8($) {
- my $code_point = shift;
+ # Returns a boolean as to whether or not the code point parameter fits
+ # into 31 bits, subject to the convention that a negative code point
+ # stands for one that overflows the word size, so won't fit in 31 bits.
- my $string = sprintf '(Code point 0x%X is not Unicode, and'
- . '|Any UTF-8 sequence that starts with'
- . ' "(\\\x[[:xdigit:]]{2})+" is for a'
- . ' non-Unicode code point, and is) not portable',
- $code_point;
- return qr/$string/;
+ return shift > $highest_non_extended_utf8_cp;
}
-# Now test the cases where a legal code point is generated, but may or may not
-# be allowed/warned on.
-my @tests = (
- # ($testname, $bytes, $disallow_flags, $controlling_warning_category,
- # $allowed_uv, $needed_to_discern_len )
- [ "lowest surrogate",
- (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
- $::UTF8_DISALLOW_SURROGATE,
- 'surrogate', 0xD800,
- ],
- [ "a middle surrogate",
- (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
- $::UTF8_DISALLOW_SURROGATE,
- 'surrogate', 0xD90D,
- ],
- [ "highest surrogate",
- (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
- $::UTF8_DISALLOW_SURROGATE,
- 'surrogate', 0xDFFF,
- ],
- [ "first non_unicode",
- (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
- $::UTF8_DISALLOW_SUPER,
- 'non_unicode', 0x110000,
- 2,
- ],
- [ "non_unicode whose first byte tells that",
- (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
- $::UTF8_DISALLOW_SUPER,
- 'non_unicode',
- (isASCII) ? 0x140000 : 0x200000,
- 1,
- ],
- [ "first of 32 consecutive non-character code points",
- (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0xFDD0,
- ],
- [ "a mid non-character code point of the 32 consecutive ones",
- (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0xFDE0,
- ],
- [ "final of 32 consecutive non-character code points",
- (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0xFDEF,
- ],
- [ "non-character code point U+FFFE",
- (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0xFFFE,
- ],
- [ "non-character code point U+FFFF",
- (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0xFFFF,
- ],
- [ "non-character code point U+1FFFE",
- (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0x1FFFE,
- ],
- [ "non-character code point U+1FFFF",
- (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0x1FFFF,
- ],
- [ "non-character code point U+2FFFE",
- (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0x2FFFE,
- ],
- [ "non-character code point U+2FFFF",
- (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0x2FFFF,
- ],
- [ "non-character code point U+3FFFE",
- (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0x3FFFE,
- ],
- [ "non-character code point U+3FFFF",
- (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0x3FFFF,
- ],
- [ "non-character code point U+4FFFE",
- (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0x4FFFE,
- ],
- [ "non-character code point U+4FFFF",
- (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0x4FFFF,
- ],
- [ "non-character code point U+5FFFE",
- (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0x5FFFE,
- ],
- [ "non-character code point U+5FFFF",
- (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0x5FFFF,
- ],
- [ "non-character code point U+6FFFE",
- (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0x6FFFE,
- ],
- [ "non-character code point U+6FFFF",
- (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0x6FFFF,
- ],
- [ "non-character code point U+7FFFE",
- (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0x7FFFE,
- ],
- [ "non-character code point U+7FFFF",
- (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0x7FFFF,
- ],
- [ "non-character code point U+8FFFE",
- (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0x8FFFE,
- ],
- [ "non-character code point U+8FFFF",
- (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0x8FFFF,
- ],
- [ "non-character code point U+9FFFE",
- (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0x9FFFE,
- ],
- [ "non-character code point U+9FFFF",
- (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0x9FFFF,
- ],
- [ "non-character code point U+AFFFE",
- (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0xAFFFE,
- ],
- [ "non-character code point U+AFFFF",
- (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0xAFFFF,
- ],
- [ "non-character code point U+BFFFE",
- (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0xBFFFE,
- ],
- [ "non-character code point U+BFFFF",
- (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0xBFFFF,
- ],
- [ "non-character code point U+CFFFE",
- (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0xCFFFE,
- ],
- [ "non-character code point U+CFFFF",
- (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0xCFFFF,
- ],
- [ "non-character code point U+DFFFE",
- (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0xDFFFE,
- ],
- [ "non-character code point U+DFFFF",
- (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0xDFFFF,
- ],
- [ "non-character code point U+EFFFE",
- (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0xEFFFE,
- ],
- [ "non-character code point U+EFFFF",
- (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0xEFFFF,
- ],
- [ "non-character code point U+FFFFE",
- (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0xFFFFE,
- ],
- [ "non-character code point U+FFFFF",
- (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0xFFFFF,
- ],
- [ "non-character code point U+10FFFE",
- (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0x10FFFE,
- ],
- [ "non-character code point U+10FFFF",
- (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
- $::UTF8_DISALLOW_NONCHAR,
- 'nonchar', 0x10FFFF,
- ],
- [ "requires at least 32 bits",
- (isASCII)
- ? "\xfe\x82\x80\x80\x80\x80\x80"
- : I8_to_native(
- "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
- # This code point is chosen so that it is representable in a UV on
- # 32-bit machines
- $::UTF8_DISALLOW_ABOVE_31_BIT,
- 'utf8', 0x80000000,
- (isASCII) ? 1 : 8,
- ],
- [ "highest 32 bit code point",
- (isASCII)
- ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
- : I8_to_native(
- "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
- $::UTF8_DISALLOW_ABOVE_31_BIT,
- 'utf8', 0xFFFFFFFF,
- (isASCII) ? 1 : 8,
- ],
- [ "requires at least 32 bits, and use SUPER-type flags, instead of"
- . " ABOVE_31_BIT",
- (isASCII)
- ? "\xfe\x82\x80\x80\x80\x80\x80"
- : I8_to_native(
- "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
- $::UTF8_DISALLOW_SUPER,
- 'utf8', 0x80000000,
- 1,
- ],
- [ "overflow with warnings/disallow for more than 31 bits",
- # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT
- # with overflow. The overflow malformation is never allowed, so
- # preventing it takes precedence if the ABOVE_31_BIT options would
- # otherwise allow in an overflowing value. The ASCII code points (1
- # for 32-bits; 1 for 64) were chosen because the old overflow
- # detection algorithm did not catch them; this means this test also
- # checks for that fix. The EBCDIC are arbitrary overflowing ones
- # since we have no reports of failures with it.
- (($::is64bit)
- ? ((isASCII)
- ? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
- : I8_to_native(
- "\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"))
- : ((isASCII)
- ? "\xfe\x86\x80\x80\x80\x80\x80"
- : I8_to_native(
- "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))),
- $::UTF8_DISALLOW_ABOVE_31_BIT,
- 'utf8', -1,
- (isASCII || $::is64bit) ? 2 : 8,
- ],
-);
-
-if (! $::is64bit) {
+sub overflow_discern_len($) {
+
+ # Returns how many bytes are needed to tell if a UTF-8 sequence is for a
+ # code point that won't fit in the platform's word size. Only the length
+ # of the sequence representing a single code point is needed.
+
if (isASCII) {
- no warnings qw{portable overflow};
- push @tests,
- [ "Lowest 33 bit code point: overflow",
- "\xFE\x84\x80\x80\x80\x80\x80",
- $::UTF8_DISALLOW_ABOVE_31_BIT,
- 'utf8', -1,
- 1,
- ];
+ return ($::is64bit) ? 3 : ((shift == $::max_bytes)
+ ? 1
+ : 2);
}
+
+ return ($::is64bit) ? 2 : 8;
}
-else {
- no warnings qw{portable overflow};
- push @tests,
- [ "More than 32 bits",
+
+my @tests;
+{
+ no warnings qw(portable overflow);
+ @tests = (
+ # $testname,
+ # $bytes, UTF-8 string
+ # $allowed_uv, code point $bytes evaluates to; -1 if
+ # overflows
+ # $needed_to_discern_len optional, how long an initial substring do
+ # we need to tell that the string must be for
+ # a code point in the category it falls in,
+ # like being a surrogate; 0 indicates we need
+ # the whole string. Some categories have a
+ # default that is used if this is omitted.
+ [ "lowest surrogate",
+ (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
+ 0xD800,
+ ],
+ [ "a middle surrogate",
+ (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
+ 0xD90D,
+ ],
+ [ "highest surrogate",
+ (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
+ 0xDFFF,
+ ],
+ [ "first of 32 consecutive non-character code points",
+ (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
+ 0xFDD0,
+ ],
+ [ "a mid non-character code point of the 32 consecutive ones",
+ (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
+ 0xFDE0,
+ ],
+ [ "final of 32 consecutive non-character code points",
+ (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
+ 0xFDEF,
+ ],
+ [ "non-character code point U+FFFE",
+ (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
+ 0xFFFE,
+ ],
+ [ "non-character code point U+FFFF",
+ (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
+ 0xFFFF,
+ ],
+ [ "non-character code point U+1FFFE",
+ (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
+ 0x1FFFE,
+ ],
+ [ "non-character code point U+1FFFF",
+ (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
+ 0x1FFFF,
+ ],
+ [ "non-character code point U+2FFFE",
+ (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
+ 0x2FFFE,
+ ],
+ [ "non-character code point U+2FFFF",
+ (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
+ 0x2FFFF,
+ ],
+ [ "non-character code point U+3FFFE",
+ (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
+ 0x3FFFE,
+ ],
+ [ "non-character code point U+3FFFF",
+ (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
+ 0x3FFFF,
+ ],
+ [ "non-character code point U+4FFFE",
+ (isASCII)
+ ? "\xf1\x8f\xbf\xbe"
+ : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
+ 0x4FFFE,
+ ],
+ [ "non-character code point U+4FFFF",
+ (isASCII)
+ ? "\xf1\x8f\xbf\xbf"
+ : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
+ 0x4FFFF,
+ ],
+ [ "non-character code point U+5FFFE",
+ (isASCII)
+ ? "\xf1\x9f\xbf\xbe"
+ : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
+ 0x5FFFE,
+ ],
+ [ "non-character code point U+5FFFF",
+ (isASCII)
+ ? "\xf1\x9f\xbf\xbf"
+ : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
+ 0x5FFFF,
+ ],
+ [ "non-character code point U+6FFFE",
+ (isASCII)
+ ? "\xf1\xaf\xbf\xbe"
+ : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
+ 0x6FFFE,
+ ],
+ [ "non-character code point U+6FFFF",
+ (isASCII)
+ ? "\xf1\xaf\xbf\xbf"
+ : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
+ 0x6FFFF,
+ ],
+ [ "non-character code point U+7FFFE",
+ (isASCII)
+ ? "\xf1\xbf\xbf\xbe"
+ : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
+ 0x7FFFE,
+ ],
+ [ "non-character code point U+7FFFF",
+ (isASCII)
+ ? "\xf1\xbf\xbf\xbf"
+ : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
+ 0x7FFFF,
+ ],
+ [ "non-character code point U+8FFFE",
+ (isASCII)
+ ? "\xf2\x8f\xbf\xbe"
+ : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
+ 0x8FFFE,
+ ],
+ [ "non-character code point U+8FFFF",
+ (isASCII)
+ ? "\xf2\x8f\xbf\xbf"
+ : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
+ 0x8FFFF,
+ ],
+ [ "non-character code point U+9FFFE",
+ (isASCII)
+ ? "\xf2\x9f\xbf\xbe"
+ : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
+ 0x9FFFE,
+ ],
+ [ "non-character code point U+9FFFF",
+ (isASCII)
+ ? "\xf2\x9f\xbf\xbf"
+ : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
+ 0x9FFFF,
+ ],
+ [ "non-character code point U+AFFFE",
+ (isASCII)
+ ? "\xf2\xaf\xbf\xbe"
+ : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
+ 0xAFFFE,
+ ],
+ [ "non-character code point U+AFFFF",
+ (isASCII)
+ ? "\xf2\xaf\xbf\xbf"
+ : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
+ 0xAFFFF,
+ ],
+ [ "non-character code point U+BFFFE",
+ (isASCII)
+ ? "\xf2\xbf\xbf\xbe"
+ : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
+ 0xBFFFE,
+ ],
+ [ "non-character code point U+BFFFF",
+ (isASCII)
+ ? "\xf2\xbf\xbf\xbf"
+ : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
+ 0xBFFFF,
+ ],
+ [ "non-character code point U+CFFFE",
+ (isASCII)
+ ? "\xf3\x8f\xbf\xbe"
+ : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
+ 0xCFFFE,
+ ],
+ [ "non-character code point U+CFFFF",
+ (isASCII)
+ ? "\xf3\x8f\xbf\xbf"
+ : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
+ 0xCFFFF,
+ ],
+ [ "non-character code point U+DFFFE",
+ (isASCII)
+ ? "\xf3\x9f\xbf\xbe"
+ : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
+ 0xDFFFE,
+ ],
+ [ "non-character code point U+DFFFF",
+ (isASCII)
+ ? "\xf3\x9f\xbf\xbf"
+ : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
+ 0xDFFFF,
+ ],
+ [ "non-character code point U+EFFFE",
+ (isASCII)
+ ? "\xf3\xaf\xbf\xbe"
+ : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
+ 0xEFFFE,
+ ],
+ [ "non-character code point U+EFFFF",
(isASCII)
- ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
+ ? "\xf3\xaf\xbf\xbf"
+ : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
+ 0xEFFFF,
+ ],
+ [ "non-character code point U+FFFFE",
+ (isASCII)
+ ? "\xf3\xbf\xbf\xbe"
+ : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
+ 0xFFFFE,
+ ],
+ [ "non-character code point U+FFFFF",
+ (isASCII)
+ ? "\xf3\xbf\xbf\xbf"
+ : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
+ 0xFFFFF,
+ ],
+ [ "non-character code point U+10FFFE",
+ (isASCII)
+ ? "\xf4\x8f\xbf\xbe"
+ : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
+ 0x10FFFE,
+ ],
+ [ "non-character code point U+10FFFF",
+ (isASCII)
+ ? "\xf4\x8f\xbf\xbf"
+ : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
+ 0x10FFFF,
+ ],
+ [ "first non_unicode",
+ (isASCII)
+ ? "\xf4\x90\x80\x80"
+ : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
+ 0x110000,
+ 2,
+ ],
+ [ "non_unicode whose first byte tells that",
+ (isASCII)
+ ? "\xf5\x80\x80\x80"
+ : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
+ (isASCII) ? 0x140000 : 0x200000,
+ 1,
+ ],
+ [ "lowest 32 bit code point",
+ (isASCII)
+ ? "\xfe\x82\x80\x80\x80\x80\x80"
: I8_to_native(
- "\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
- $::UTF8_DISALLOW_ABOVE_31_BIT,
- 'utf8', 0x1000000000,
- (isASCII) ? 1 : 7,
- ];
- if (! isASCII) {
- push @tests, # These could falsely show wrongly in a naive
- # implementation
- [ "requires at least 32 bits",
- I8_to_native(
- "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
- $::UTF8_DISALLOW_ABOVE_31_BIT,
- 'utf8', 0x800000000,
- 7,
+ "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
+ 0x80000000,
+ ],
+ [ "highest 32 bit code point",
+ (isASCII)
+ ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
+ : I8_to_native(
+ "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
+ 0xFFFFFFFF,
+ ],
+ [ "Lowest 33 bit code point",
+ (isASCII)
+ ? "\xfe\x84\x80\x80\x80\x80\x80"
+ : I8_to_native(
+ "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
+ ($::is64bit) ? 0x100000000 : -1, # Overflows on 32-bit systems
+ ],
+ );
+
+ if (! $::is64bit) {
+ if (isASCII) {
+ push @tests,
+ [ "overflow that old algorithm failed to detect",
+ "\xfe\x86\x80\x80\x80\x80\x80",
+ -1,
+ ];
+ }
+ }
+
+ if ($::is64bit) {
+ push @tests,
+ [ "highest 64 bit code point",
+ (isASCII)
+ ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
+ : I8_to_native(
+ "\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"),
+ 0xFFFFFFFFFFFFFFFF,
+ (isASCII) ? 1 : 2,
],
- [ "requires at least 32 bits",
- I8_to_native(
+ [ "first 65 bit code point",
+ (isASCII)
+ ? "\xff\x80\x9f\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
+ : I8_to_native(
+ "\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
+ -1,
+ ];
+ if (isASCII) {
+ push @tests,
+ [ "Lowest code point requiring 13 bytes to represent",
+ "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
+ 0x1000000000,
+ ],
+ [ "overflow that old algorithm failed to detect",
+ "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
+ -1,
+ ];
+ }
+ else {
+ push @tests, # These could falsely show wrongly in a naive
+ # implementation
+ [ "requires at least 32 bits",
+ I8_to_native(
+ "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
+ 0x800000000,
+ ],
+ [ "requires at least 32 bits",
+ I8_to_native(
"\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
- $::UTF8_DISALLOW_ABOVE_31_BIT,
- 'utf8', 0x10000000000,
- 6,
- ],
- [ "requires at least 32 bits",
- I8_to_native(
+ 0x10000000000,
+ ],
+ [ "requires at least 32 bits",
+ I8_to_native(
"\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
- $::UTF8_DISALLOW_ABOVE_31_BIT,
- 'utf8', 0x200000000000,
- 5,
- ],
- [ "requires at least 32 bits",
- I8_to_native(
+ 0x200000000000,
+ ],
+ [ "requires at least 32 bits",
+ I8_to_native(
"\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
- $::UTF8_DISALLOW_ABOVE_31_BIT,
- 'utf8', 0x4000000000000,
- 4,
- ],
- [ "requires at least 32 bits",
- I8_to_native(
+ 0x4000000000000,
+ ],
+ [ "requires at least 32 bits",
+ I8_to_native(
"\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
- $::UTF8_DISALLOW_ABOVE_31_BIT,
- 'utf8', 0x80000000000000,
- 3,
- ],
- [ "requires at least 32 bits",
- I8_to_native(
+ 0x80000000000000,
+ ],
+ [ "requires at least 32 bits",
+ I8_to_native(
"\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
- $::UTF8_DISALLOW_ABOVE_31_BIT,
- 'utf8', 0x1000000000000000,
- 2,
- ];
+ 0x1000000000000000,
+ ];
+ }
}
}
W_NONCHAR
D_SUPER
W_SUPER
- D_ABOVE_31_BIT
- W_ABOVE_31_BIT
+ D_PERL_EXTENDED
+ W_PERL_EXTENDED
CHECK_ONLY
NO_CONFIDENCE_IN_CURLEN_
) );
+sub utf8n_display_call($)
+{
+ # Converts an eval string that calls test_utf8n_to_uvchr into a more human
+ # readable form, and returns it. Doesn't work if the byte string contains
+ # an apostrophe. The return will look something like:
+ # test_utf8n_to_uvchr_error('$bytes', $length, $flags)
+ #diag $_[0];
+
+ $_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x;
+ my $text1 = $1; # Everything before the byte string
+ my $bytes = $2;
+ my $text2 = $3; # Includes the length
+ my $flags = $4;
+
+ return $text1
+ . display_bytes($bytes)
+ . $text2
+ . flags_to_text($flags, \@utf8n_flags_to_text)
+ . ')';
+}
+
+sub uvchr_display_call($)
+{
+ # Converts an eval string that calls test_uvchr_to_utf8 into a more human
+ # readable form, and returns it. The return will look something like:
+ # test_uvchr_to_utf8n_flags($uv, $flags)
+ #diag $_[0];
+
+ my @flags_to_text = ( qw(
+ W_SURROGATE
+ W_NONCHAR
+ W_SUPER
+ W_PERL_EXTENDED
+ D_SURROGATE
+ D_NONCHAR
+ D_SUPER
+ D_PERL_EXTENDED
+ ) );
+
+ $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x;
+ my $text = $1;
+ my $cp = sprintf "%X", $2;
+ my $flags = $3;
+
+ return "${text}0x$cp, " . flags_to_text($flags, \@flags_to_text) . ')';
+}
+
+sub do_warnings_test(@)
+{
+ my @expected_warnings = @_;
+
+ # Compares the input expected warnings array with @warnings_gotten,
+ # generating a pass for each found, removing it from @warnings_gotten.
+ # Any discrepancies generate test failures. Returns TRUE if no
+ # discrepcancies; otherwise FALSE.
+
+ my $succeeded = 1;
+
+ if (@expected_warnings == 0) {
+ if (! is(@warnings_gotten, 0, " Expected and got no warnings")) {
+ output_warnings(@warnings_gotten);
+ $succeeded = 0;
+ }
+ return $succeeded;
+ }
+
+ # Check that we got all the expected warnings,
+ # removing each one found
+ WARNING:
+ foreach my $expected (@expected_warnings) {
+ foreach (my $i = 0; $i < @warnings_gotten; $i++) {
+ if ($warnings_gotten[$i] =~ $expected) {
+ pass(" Expected and got warning: "
+ . " $warnings_gotten[$i]");
+ splice @warnings_gotten, $i, 1;
+ next WARNING;
+ }
+ }
+ fail(" Expected a warning that matches "
+ . $expected . " but didn't get it");
+ $succeeded = 0;
+ }
+
+ if (! is(@warnings_gotten, 0, " Got no unexpected warnings")) {
+ output_warnings(@warnings_gotten);
+ $succeeded = 0;
+ }
+
+ return $succeeded;
+}
+
# This test is split into this number of files.
my $num_test_files = $ENV{TEST_JOBS} || 1;
$num_test_files = 10 if $num_test_files > 10;
$test_count++;
next if $test_count % $num_test_files != $::TEST_CHUNK;
- my ($testname, $bytes, $disallow_flags,
- $controlling_warning_category, $allowed_uv, $needed_to_discern_len
- ) = @$test;
+ my ($testname, $bytes, $allowed_uv, $needed_to_discern_len) = @$test;
my $length = length $bytes;
my $will_overflow = $allowed_uv < 0;
- # The convention is that the got flag is the same value as the disallow
- # one, and the warn flag is the next bit over. If this were violated, the
- # tests here should start failing. We could do an eval under no strict to
- # be sure.
- my $expected_error_flags = $disallow_flags;
- my $warn_flags = $disallow_flags << 1;
-
- my $message;
- if ($allowed_uv > 0x7FFFFFFF) {
- $message = nonportable_regex($allowed_uv);
- }
- elsif ($allowed_uv > 0x10FFFF) {
- $message = qr/(not Unicode|for a non-Unicode code point).* may not be portable/;
+ my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv);
+ my $display_bytes = display_bytes($bytes);
+
+ my $controlling_warning_category;
+ my $utf8n_flag_to_warn;
+ my $utf8n_flag_to_disallow;
+ my $uvchr_flag_to_warn;
+ my $uvchr_flag_to_disallow;
+
+ # We want to test that the independent flags are actually independent.
+ # For example, that a surrogate doesn't trigger a non-character warning,
+ # and conversely, turning off an above-Unicode flag doesn't suppress a
+ # surrogate warning. Earlier versions of this file used nested loops to
+ # test all possible combinations. But that creates lots of tests, making
+ # this run too long. What is now done instead is to use the complement of
+ # the category we are testing to greatly reduce the combinatorial
+ # explosion. For example, if we have a surrogate and we aren't expecting
+ # a warning about it, we set all the flags for non-surrogates to raise
+ # warnings. If one shows up, it indicates the flags aren't independent.
+ my $utf8n_flag_to_warn_complement;
+ my $utf8n_flag_to_disallow_complement;
+ my $uvchr_flag_to_warn_complement;
+ my $uvchr_flag_to_disallow_complement;
+
+ # Many of the code points being tested are middling in that if code point
+ # edge cases work, these are very likely to as well. Because this test
+ # file takes a while to execute, we skip testing the edge effects of code
+ # points deemed middling, while testing their basics and continuing to
+ # fully test the non-middling code points.
+ my $skip_most_tests = 0;
+
+ my $cp_message_qr; # Pattern that matches the message raised when
+ # that message contains the problematic code
+ # point. The message is the same (currently) both
+ # when going from/to utf8.
+ my $non_cp_trailing_text; # The suffix text when the message doesn't
+ # contain a code point. (This is a result of
+ # some sort of malformation that means we
+ # can't get an exact code poin
+ my $extended_cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
+ \Q requires a Perl extension, and so is not\E
+ \Q portable\E/x;
+ my $extended_non_cp_trailing_text
+ = "is a Perl extension, and so is not portable";
+
+ # Is this test malformed from the beginning? If so, we know to generally
+ # expect that the tests will show it isn't valid.
+ my $initially_malformed = 0;
+
+ if ($will_overflow || $allowed_uv > 0x10FFFF) {
+
+ # Set the SUPER flags; later, we test for PERL_EXTENDED as well.
+ $utf8n_flag_to_warn = $::UTF8_WARN_SUPER;
+ $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER;
+ $uvchr_flag_to_warn = $::UNICODE_WARN_SUPER;
+ $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;;
+
+ # Below, we add the flags for non-perl_extended to the code points
+ # that don't fit that category. Special tests are done for this
+ # category in the inner loop.
+ $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR
+ |$::UTF8_WARN_SURROGATE;
+ $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
+ |$::UTF8_DISALLOW_SURROGATE;
+ $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR
+ |$::UNICODE_WARN_SURROGATE;
+ $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
+ |$::UNICODE_DISALLOW_SURROGATE;
+ $controlling_warning_category = 'non_unicode';
+
+ if ($will_overflow) { # This is realy a malformation
+ $non_cp_trailing_text = "if you see this, there is an error";
+ $cp_message_qr = qr/\Q$non_cp_trailing_text\E/;
+ $initially_malformed = 1;
+ if (! defined $needed_to_discern_len) {
+ $needed_to_discern_len = overflow_discern_len($length);
+ }
+ }
+ elsif (requires_extended_utf8($allowed_uv)) {
+ $cp_message_qr = $extended_cp_message_qr;
+ $non_cp_trailing_text = $extended_non_cp_trailing_text;
+ $needed_to_discern_len = 1 unless defined $needed_to_discern_len;
+ }
+ else {
+ $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
+ \Q may not be portable\E/x;
+ $non_cp_trailing_text = "is for a non-Unicode code point, may not"
+ . " be portable";
+ $utf8n_flag_to_warn_complement |= $::UTF8_WARN_PERL_EXTENDED;
+ $utf8n_flag_to_disallow_complement
+ |= $::UTF8_DISALLOW_PERL_EXTENDED;
+ $uvchr_flag_to_warn_complement |= $::UNICODE_WARN_PERL_EXTENDED;
+ $uvchr_flag_to_disallow_complement
+ |= $::UNICODE_DISALLOW_PERL_EXTENDED;
+ }
}
elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
- $message = qr/surrogate/;
+ $cp_message_qr = qr/UTF-16 surrogate U\+$uv_string/;
+ $non_cp_trailing_text = "is for a surrogate";
$needed_to_discern_len = 2 unless defined $needed_to_discern_len;
+ $skip_most_tests = 1 if $allowed_uv > 0xD800 && $allowed_uv < 0xDFFF;
+
+ $utf8n_flag_to_warn = $::UTF8_WARN_SURROGATE;
+ $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SURROGATE;
+ $uvchr_flag_to_warn = $::UNICODE_WARN_SURROGATE;
+ $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SURROGATE;;
+
+ $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR
+ |$::UTF8_WARN_SUPER
+ |$::UTF8_WARN_PERL_EXTENDED;
+ $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
+ |$::UTF8_DISALLOW_SUPER
+ |$::UTF8_DISALLOW_PERL_EXTENDED;
+ $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR
+ |$::UNICODE_WARN_SUPER
+ |$::UNICODE_WARN_PERL_EXTENDED;
+ $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
+ |$::UNICODE_DISALLOW_SUPER
+ |$::UNICODE_DISALLOW_PERL_EXTENDED;
+ $controlling_warning_category = 'surrogate';
}
elsif ( ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
|| ($allowed_uv & 0xFFFE) == 0xFFFE)
{
- $message = qr/Unicode non-character.*is not recommended for open interchange/;
+ $cp_message_qr = qr/\QUnicode non-character U+$uv_string\E
+ \Q is not recommended for open interchange\E/x;
+ $non_cp_trailing_text = "if you see this, there is an error";
$needed_to_discern_len = $length unless defined $needed_to_discern_len;
- }
- elsif ($will_overflow) {
- $message = qr/overflows/;
+ if ( ($allowed_uv > 0xFDD0 && $allowed_uv < 0xFDEF)
+ || ($allowed_uv > 0xFFFF && $allowed_uv < 0x10FFFE))
+ {
+ $skip_most_tests = 1;
+ }
+
+ $utf8n_flag_to_warn = $::UTF8_WARN_NONCHAR;
+ $utf8n_flag_to_disallow = $::UTF8_DISALLOW_NONCHAR;
+ $uvchr_flag_to_warn = $::UNICODE_WARN_NONCHAR;
+ $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_NONCHAR;;
+
+ $utf8n_flag_to_warn_complement = $::UTF8_WARN_SURROGATE
+ |$::UTF8_WARN_SUPER
+ |$::UTF8_WARN_PERL_EXTENDED;
+ $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE
+ |$::UTF8_DISALLOW_SUPER
+ |$::UTF8_DISALLOW_PERL_EXTENDED;
+ $uvchr_flag_to_warn_complement = $::UNICODE_WARN_SURROGATE
+ |$::UNICODE_WARN_SUPER
+ |$::UNICODE_WARN_PERL_EXTENDED;
+ $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_SURROGATE
+ |$::UNICODE_DISALLOW_SUPER
+ |$::UNICODE_DISALLOW_PERL_EXTENDED;
+
+ $controlling_warning_category = 'nonchar';
}
else {
die "Can't figure out what type of warning to test for $testname"
die 'Didn\'t set $needed_to_discern_len for ' . $testname
unless defined $needed_to_discern_len;
- {
- use warnings;
+ { # First test the isFOO calls
+ use warnings; no warnings 'deprecated'; # Make sure these don't raise warnings
undef @warnings_gotten;
+
my $ret = test_isUTF8_CHAR($bytes, $length);
my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
- if ($will_overflow) {
- is($ret, 0, "isUTF8_CHAR() $testname: returns 0");
- is($ret_flags, 0, "isUTF8_CHAR_flags() $testname: returns 0");
+ if ($initially_malformed) {
+ is($ret, 0, "For $testname: isUTF8_CHAR() returns 0");
+ is($ret_flags, 0, " And isUTF8_CHAR_flags() returns 0");
}
else {
is($ret, $length,
- "isUTF8_CHAR() $testname: returns expected length: $length");
- is($ret_flags, $length, "isUTF8_CHAR_flags(...,0) $testname:"
+ "For $testname: isUTF8_CHAR() returns expected length: $length");
+ is($ret_flags, $length, " And isUTF8_CHAR_flags(...,0)"
. " returns expected length: $length");
}
is(scalar @warnings_gotten, 0,
- "isUTF8_CHAR() and isUTF8_CHAR()_flags $testname: generated"
- . " no warnings")
+ " And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags generated"
+ . " any warnings")
or output_warnings(@warnings_gotten);
undef @warnings_gotten;
$ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
- if ($will_overflow) {
- is($ret, 0, "isSTRICT_UTF8_CHAR() $testname: returns 0");
+ if ($initially_malformed) {
+ is($ret, 0, " And isSTRICT_UTF8_CHAR() returns 0");
}
else {
my $expected_ret = ( $testname =~ /surrogate|non-character/
|| $allowed_uv > 0x10FFFF)
? 0
: $length;
- is($ret, $expected_ret, "isSTRICT_UTF8_CHAR() $testname: returns"
+ is($ret, $expected_ret, " And isSTRICT_UTF8_CHAR() returns"
. " expected length: $expected_ret");
$ret = test_isUTF8_CHAR_flags($bytes, $length,
$::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
is($ret, $expected_ret,
- "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')"
- . " acts like isSTRICT_UTF8_CHAR");
+ " And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')"
+ . " acts like isSTRICT_UTF8_CHAR");
}
is(scalar @warnings_gotten, 0,
- "isSTRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:"
- . " generated no warnings")
+ " And neither isSTRICT_UTF8_CHAR() nor isUTF8_CHAR_flags"
+ . " generated any warnings")
or output_warnings(@warnings_gotten);
undef @warnings_gotten;
$ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
- if ($will_overflow) {
- is($ret, 0, "isC9_STRICT_UTF8_CHAR() $testname: returns 0");
+ if ($initially_malformed) {
+ is($ret, 0, " And isC9_STRICT_UTF8_CHAR() returns 0");
}
else {
my $expected_ret = ( $testname =~ /surrogate/
|| $allowed_uv > 0x10FFFF)
? 0
: $length;
- is($ret, $expected_ret, "isC9_STRICT_UTF8_CHAR() $testname:"
+ is($ret, $expected_ret, " And isC9_STRICT_UTF8_CHAR()"
." returns expected length: $expected_ret");
$ret = test_isUTF8_CHAR_flags($bytes, $length,
- $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
+ $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
is($ret, $expected_ret,
- "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')"
- . " acts like isC9_STRICT_UTF8_CHAR");
+ " And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')"
+ . " acts like isC9_STRICT_UTF8_CHAR");
}
is(scalar @warnings_gotten, 0,
- "isC9_STRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:"
- . " generated no warnings")
+ " And neither isC9_STRICT_UTF8_CHAR() nor isUTF8_CHAR_flags"
+ . " generated any warnings")
or output_warnings(@warnings_gotten);
- # Test partial character handling, for each byte not a full character
- for my $j (1.. $length - 1) {
+ foreach my $disallow_type (0..2) {
+ # 0 is don't disallow this type of code point
+ # 1 is do disallow
+ # 2 is do disallow, but only code points requiring
+ # perl-extended-UTF8
+
+ my $disallow_flags;
+ my $expected_ret;
+
+ if ($initially_malformed) {
+
+ # Malformations are by default disallowed, so testing with
+ # $disallow_type equal to 0 is sufficicient.
+ next if $disallow_type;
- # Skip the test for the interaction between overflow and above-31
- # bit. It is really testing other things than the partial
- # character tests, for which other tests in this file are
- # sufficient
- last if $will_overflow;
+ $disallow_flags = 0;
+ $expected_ret = 0;
+ }
+ elsif ($disallow_type == 1) {
+ $disallow_flags = $utf8n_flag_to_disallow;
+ $expected_ret = 0;
+ }
+ elsif ($disallow_type == 2) {
+ next if ! requires_extended_utf8($allowed_uv);
+ $disallow_flags = $::UTF8_DISALLOW_PERL_EXTENDED;
+ $expected_ret = 0;
+ }
+ else { # type is 0
+ $disallow_flags = $utf8n_flag_to_disallow_complement;
+ $expected_ret = $length;
+ }
- foreach my $disallow_flag (0, $disallow_flags) {
+ $ret = test_isUTF8_CHAR_flags($bytes, $length, $disallow_flags);
+ is($ret, $expected_ret, " And isUTF8_CHAR_flags("
+ . "$display_bytes, $disallow_flags) returns "
+ . $expected_ret)
+ or diag "The flags mean "
+ . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
+
+ is(scalar @warnings_gotten, 0,
+ " And isUTF8_CHAR_flags(...) generated no warnings")
+ or output_warnings(@warnings_gotten);
+
+ # Test partial character handling, for each byte not a full character
+ my $did_test_partial = 0;
+ for (my $j = 1; $j < $length - 1; $j++) {
+ $did_test_partial = 1;
my $partial = substr($bytes, 0, $j);
my $ret_should_be;
my $comment;
- if ($disallow_flag) {
+ if ($disallow_type || $initially_malformed) {
$ret_should_be = 0;
$comment = "disallowed";
if ($j < $needed_to_discern_len) {
$ret_should_be = 1;
- $comment .= ", but need $needed_to_discern_len bytes"
- . " to discern:";
+ $comment .= ", but need $needed_to_discern_len"
+ . " bytes to discern:";
}
}
else {
undef @warnings_gotten;
$ret = test_is_utf8_valid_partial_char_flags($partial, $j,
- $disallow_flag);
+ $disallow_flags);
is($ret, $ret_should_be,
- "$testname: is_utf8_valid_partial_char_flags("
- . display_bytes($partial)
- . "), $comment: returns $ret_should_be");
+ " And is_utf8_valid_partial_char_flags("
+ . display_bytes($partial)
+ . ", $disallow_flags), $comment: returns $ret_should_be")
+ or diag "The flags mean "
+ . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
+ }
+
+ if ($did_test_partial) {
is(scalar @warnings_gotten, 0,
- "$testname: is_utf8_valid_partial_char_flags()"
- . " generated no warnings")
- or output_warnings(@warnings_gotten);
+ " And is_utf8_valid_partial_char_flags()"
+ . " generated no warnings for any of the lengths")
+ or output_warnings(@warnings_gotten);
}
}
}
- # This is more complicated than the malformations tested earlier, as there
- # are several orthogonal variables involved. We test all the subclasses
- # of utf8 warnings to verify they work with and without the utf8 class,
- # and don't have effects on other sublass warnings
- foreach my $trial_warning_category ('utf8', 'surrogate', 'nonchar', 'non_unicode') {
- foreach my $warn_flag (0, $warn_flags) {
- foreach my $disallow_flag (0, $disallow_flags) {
- foreach my $do_warning (0, 1) {
-
- # We try each of the above with various combinations of
- # malformations that can occur on the same input sequence.
- foreach my $short ("", "short") {
- foreach my $unexpected_noncont ("",
- "unexpected non-continuation")
- {
- foreach my $overlong ("", "overlong") {
-
- # If we're creating an overlong, it can't be longer than
- # the maximum length, so skip if we're already at that
- # length.
- next if $overlong && $length >= $::max_bytes;
-
- my @malformations;
- my @expected_return_flags;
- push @malformations, $short if $short;
- push @malformations, $unexpected_noncont
- if $unexpected_noncont;
- push @malformations, $overlong if $overlong;
-
- # The overflow malformation test in the input
- # array is coerced into being treated like one of
- # the others.
- if ($will_overflow) {
- push @malformations, 'overflow';
- push @expected_return_flags, $::UTF8_GOT_OVERFLOW;
+ # Now test the to/from UTF-8 calls
+ # This is more complicated than the malformations tested in other files in
+ # this directory, as there are several orthogonal variables involved. We
+ # test most possible combinations
+
+ # We try various combinations of malformations that can occur
+ foreach my $short (0, 1) {
+ next if $skip_most_tests && $short;
+ foreach my $unexpected_noncont (0, 1) {
+ next if $skip_most_tests && $unexpected_noncont;
+ foreach my $overlong (0, 1) {
+ next if $overlong && $skip_most_tests;
+
+ # If we're creating an overlong, it can't be longer than the
+ # maximum length, so skip if we're already at that length.
+ next if $overlong && $length >= $::max_bytes;
+
+ my $this_cp_message_qr = $cp_message_qr;
+ my $this_non_cp_trailing_text = $non_cp_trailing_text;
+
+ foreach my $malformed_allow_type (0..2) {
+ # 0 don't allow this malformation; ignored if no malformation
+ # 1 allow, with REPLACEMENT CHARACTER returned
+ # 2 allow, with intended code point returned. All malformations
+ # other than overlong can't determine the intended code point,
+ # so this isn't valid for them.
+ next if $malformed_allow_type == 2
+ && ($will_overflow || $short || $unexpected_noncont);
+ next if $skip_most_tests && $malformed_allow_type;
+
+ # Here we are in the innermost loop for malformations. So we
+ # know which ones are in effect. Can now change the input to be
+ # appropriately malformed. We also can set up certain other
+ # things now, like whether we expect a return flag from this
+ # malformation, and which flag.
+
+ my $this_bytes = $bytes;
+ my $this_length = $length;
+ my $this_expected_len = $length;
+ my $this_needed_to_discern_len = $needed_to_discern_len;
+
+ my @malformation_names;
+ my @expected_malformation_warnings;
+ my @expected_malformation_return_flags;
+
+ # Contains the flags for any allowed malformations. Currently no
+ # combinations of on/off are tested for. It's either all are
+ # allowed, or none are.
+ my $allow_flags = 0;
+ my $overlong_is_in_perl_extended_utf8 = 0;
+ my $dont_use_overlong_cp = 0;
+
+ if ($overlong) {
+ my $new_expected_len;
+
+ # To force this malformation, we convert the original start
+ # byte into a continuation byte with the same data bits as
+ # originally. ...
+ my $start_byte = substr($this_bytes, 0, 1);
+ my $converted_to_continuation_byte
+ = start_byte_to_cont($start_byte);
+
+ # ... Then we prepend it with a known overlong sequence. This
+ # should evaluate to the exact same code point as the
+ # original. We try to avoid an overlong using Perl extended
+ # UTF-8. The code points are the highest representable as
+ # overlongs on the respective platform without using extended
+ # UTF-8.
+ if (native_to_I8($start_byte) lt "\xFC") {
+ $start_byte = I8_to_native("\xFC");
+ $new_expected_len = 6;
+ }
+ elsif (! isASCII && native_to_I8($start_byte) lt "\xFE") {
+
+ # FE is not extended UTF-8 on EBCDIC
+ $start_byte = I8_to_native("\xFE");
+ $new_expected_len = 7;
+ }
+ else { # Must use extended UTF-8. On ASCII platforms, we
+ # could express some overlongs here starting with
+ # \xFE, but there's no real reason to do so.
+ $overlong_is_in_perl_extended_utf8 = 1;
+ $start_byte = I8_to_native("\xFF");
+ $new_expected_len = $::max_bytes;
+ $this_cp_message_qr = $extended_cp_message_qr;
+
+ # The warning that gets raised doesn't include the code
+ # point in the message if the code point can be expressed
+ # without using extended UTF-8, but the particular
+ # overlong sequence used is in extended UTF-8. To do
+ # otherwise would be confusing to the user, as it would
+ # claim the code point requires extended, when it doesn't.
+ $dont_use_overlong_cp = 1
+ unless requires_extended_utf8($allowed_uv);
+ $this_non_cp_trailing_text = $extended_non_cp_trailing_text;
+ }
+
+ # Splice in the revise continuation byte, preceded by the
+ # start byte and the proper number of the lowest continuation
+ # bytes.
+ $this_bytes = $start_byte
+ . ($native_lowest_continuation_chr
+ x ( $new_expected_len - 1 - length($this_bytes)))
+ . $converted_to_continuation_byte
+ . substr($this_bytes, 1);
+ $this_length = length($this_bytes);
+ $this_needed_to_discern_len = $new_expected_len
+ - ( $this_expected_len
+ - $this_needed_to_discern_len);
+ $this_expected_len = $new_expected_len;
+ push @expected_malformation_return_flags, $::UTF8_GOT_LONG;
+ push @malformation_names, 'overlong';
+
+ if ($malformed_allow_type == 2) {
+ $allow_flags |= $::UTF8_ALLOW_LONG_AND_ITS_VALUE;
+ }
+ elsif ($malformed_allow_type) {
+ $allow_flags |= $::UTF8_ALLOW_LONG;
+ }
+ }
+
+ if ($short) {
+ push @malformation_names, 'short';
+
+ # To force this malformation, just tell the test to not look
+ # as far as it should into the input.
+ $this_length--;
+ $this_expected_len--;
+ push @expected_malformation_return_flags, $::UTF8_GOT_SHORT;
+
+ $allow_flags |= $::UTF8_ALLOW_SHORT if $malformed_allow_type;
+ }
+
+ if ($unexpected_noncont) {
+ push @malformation_names, 'unexpected non-continuation';
+
+ # To force this malformation, change the final continuation
+ # byte into a non continuation.
+ my $pos = ($short) ? -2 : -1;
+ substr($this_bytes, $pos, 1) = '?';
+ $this_expected_len--;
+ push @expected_malformation_return_flags,
+ $::UTF8_GOT_NON_CONTINUATION;
+ $allow_flags |= $::UTF8_ALLOW_NON_CONTINUATION
+ if $malformed_allow_type;
+ }
+
+ # The whole point of a test that is malformed from the beginning
+ # is to test for that malformation. If we've modified things so
+ # much that we don't have enough information to detect that
+ # malformation, there's no point in testing.
+ next if $initially_malformed
+ && $this_expected_len < $this_needed_to_discern_len;
+
+ # Here, we've transformed the input with all of the desired
+ # non-overflow malformations. We are now in a position to
+ # construct any potential warnings for those malformations. But
+ # it's a pain to get the detailed messages exactly right, so for
+ # now XXX, only do so for those that return an explicit code
+ # point.
+
+ if ($overlong) {
+
+ # If one of the other malformation types is also in effect, we
+ # don't know what the intended code point was.
+ if ($short || $unexpected_noncont || $will_overflow) {
+ push @expected_malformation_warnings, qr/overlong/;
+ }
+ else {
+ my $wrong_bytes = display_bytes_no_quotes(
+ substr($this_bytes, 0, $this_length));
+ my $correct_bytes = display_bytes_no_quotes($bytes);
+ my $prefix = ($allowed_uv > 0x10FFFF) ? "0x" : "U+";
+ push @expected_malformation_warnings,
+ qr/\QMalformed UTF-8 character: $wrong_bytes\E
+ \Q (overlong; instead use\E
+ \Q $correct_bytes to\E
+ \Q represent $prefix$uv_string)/x;
+ }
+ }
+ if ($short) {
+ push @expected_malformation_warnings, qr/too short/;
+ }
+ if ($unexpected_noncont) {
+ push @expected_malformation_warnings,
+ qr/unexpected non-continuation byte/;
+ }
+
+ # The overflow malformation is done differently than other
+ # malformations. It comes from manually typed tests in the test
+ # array. We now make it be treated like one of the other
+ # malformations. But some has to be deferred until the inner loop
+ my $overflow_msg_pattern;
+ if ($will_overflow) {
+ push @malformation_names, 'overflow';
+
+ $overflow_msg_pattern = display_bytes_no_quotes(
+ substr($this_bytes, 0, $this_expected_len));
+ $overflow_msg_pattern = qr/\QMalformed UTF-8 character:\E
+ \Q $overflow_msg_pattern\E
+ \Q (overflows)\E/x;
+ push @expected_malformation_return_flags, $::UTF8_GOT_OVERFLOW;
+ $allow_flags |= $::UTF8_ALLOW_OVERFLOW if $malformed_allow_type;
+ }
+
+ # And we can create the malformation-related text for the the test
+ # names we eventually will generate.
+ my $malformations_name = "";
+ if (@malformation_names) {
+ $malformations_name .= "dis" unless $malformed_allow_type;
+ $malformations_name .= "allowed ";
+ $malformations_name .= "malformation";
+ $malformations_name .= "s" if @malformation_names > 1;
+ $malformations_name .= ": ";
+ $malformations_name .= join "/", @malformation_names;
+ $malformations_name = " ($malformations_name)";
+ }
+
+ # Done setting up the malformation related stuff
+
+ foreach my $do_disallow (0, 1) {
+ next if $skip_most_tests && ! $do_disallow;
+
+ # We classify the warnings into certain "interesting" types,
+ # described later
+ foreach my $warning_type (0..4) {
+ next if $skip_most_tests && $warning_type != 1;
+ foreach my $use_warn_flag (0, 1) {
+ next if $skip_most_tests && ! $use_warn_flag;
+
+ # Finally, here is the inner loop
+
+ my $this_utf8n_flag_to_warn = $utf8n_flag_to_warn;
+ my $this_utf8n_flag_to_disallow = $utf8n_flag_to_disallow;
+ my $this_uvchr_flag_to_warn = $uvchr_flag_to_warn;
+ my $this_uvchr_flag_to_disallow = $uvchr_flag_to_disallow;
+
+ my $eval_warn;
+ my $expect_regular_warnings;
+ my $expect_warnings_for_malformed;
+ my $expect_warnings_for_overflow;
+
+ if ($warning_type == 0) {
+ $eval_warn = "use warnings; no warnings 'deprecated'";
+ $expect_regular_warnings = $use_warn_flag;
+
+ # We ordinarily expect overflow warnings here. But it
+ # is somewhat more complicated, and the final
+ # determination is deferred to one place in the filw
+ # where we handle overflow.
+ $expect_warnings_for_overflow = 1;
+
+ # We would ordinarily expect malformed warnings in
+ # this case, but not if malformations are allowed.
+ $expect_warnings_for_malformed
+ = $malformed_allow_type == 0;
+ }
+ elsif ($warning_type == 1) {
+ $eval_warn = "no warnings";
+ $expect_regular_warnings = 0;
+ $expect_warnings_for_overflow = 0;
+ $expect_warnings_for_malformed = 0;
+ }
+ elsif ($warning_type == 2) {
+ $eval_warn = "no warnings; use warnings 'utf8'";
+ $expect_regular_warnings = $use_warn_flag;
+ $expect_warnings_for_overflow = 1;
+ $expect_warnings_for_malformed
+ = $malformed_allow_type == 0;
+ }
+ elsif ($warning_type == 3) {
+ $eval_warn = "no warnings; use warnings"
+ . " '$controlling_warning_category'";
+ $expect_regular_warnings = $use_warn_flag;
+ $expect_warnings_for_overflow
+ = $controlling_warning_category eq 'non_unicode';
+ $expect_warnings_for_malformed = 0;
+ }
+ elsif ($warning_type == 4) { # Like type 3, but uses the
+ # PERL_EXTENDED flags
+ # The complement flags were set up so that the
+ # PERL_EXTENDED flags have been tested that they don't
+ # trigger wrongly for too small code points. And the
+ # flags have been set up so that those small code
+ # points are tested for being above Unicode. What's
+ # left to test is that the large code points do
+ # trigger the PERL_EXTENDED flags.
+ next if ! requires_extended_utf8($allowed_uv);
+ next if $controlling_warning_category ne 'non_unicode';
+ $eval_warn = "no warnings; use warnings 'non_unicode'";
+ $expect_regular_warnings = 1;
+ $expect_warnings_for_overflow = 1;
+ $expect_warnings_for_malformed = 0;
+ $this_utf8n_flag_to_warn = $::UTF8_WARN_PERL_EXTENDED;
+ $this_utf8n_flag_to_disallow
+ = $::UTF8_DISALLOW_PERL_EXTENDED;
+ $this_uvchr_flag_to_warn
+ = $::UNICODE_WARN_PERL_EXTENDED;
+ $this_uvchr_flag_to_disallow
+ = $::UNICODE_DISALLOW_PERL_EXTENDED;
+ }
+ else {
+ die "Unexpected warning type '$warning_type'";
}
- my $malformations_name = join "/", @malformations;
- $malformations_name .= " malformation"
- if $malformations_name;
- $malformations_name .= "s" if @malformations > 1;
- my $this_bytes = $bytes;
- my $this_length = $length;
+ # We only need to test the case where all warnings are
+ # enabled (type 0) to see if turning off the warning flag
+ # causes things to not be output. If those pass, then
+ # turning on some sub-category of warnings, or turning off
+ # warnings altogether are extremely likely to not output
+ # warnings either, given how the warnings subsystem is
+ # supposed to work, and this file assumes it does work.
+ next if $warning_type != 0 && ! $use_warn_flag;
+
+ # The convention is that the 'got' flag is the same value
+ # as the disallow one. If this were violated, the tests
+ # here should start failing.
+ my $return_flag = $this_utf8n_flag_to_disallow;
+
+ # If we aren't expecting warnings/disallow for this, turn
+ # on all the other flags. That makes sure that they all
+ # are independent of this flag, and so we don't need to
+ # test them individually.
+ my $this_warning_flags
+ = ($use_warn_flag)
+ ? $this_utf8n_flag_to_warn
+ : ($overlong_is_in_perl_extended_utf8
+ ? ($utf8n_flag_to_warn_complement
+ & ~$::UTF8_WARN_PERL_EXTENDED)
+ : $utf8n_flag_to_warn_complement);
+ my $this_disallow_flags
+ = ($do_disallow)
+ ? $this_utf8n_flag_to_disallow
+ : ($overlong_is_in_perl_extended_utf8
+ ? ($utf8n_flag_to_disallow_complement
+ & ~$::UTF8_DISALLOW_PERL_EXTENDED)
+ : $utf8n_flag_to_disallow_complement);
my $expected_uv = $allowed_uv;
- my $this_expected_len = $length;
- my $this_needed_to_discern_len = $needed_to_discern_len;
- if ($malformations_name) {
- $expected_uv = 0;
-
- # Coerce the input into the desired
- # malformation
- if ($malformations_name =~ /overlong/) {
-
- # For an overlong, we convert the original
- # start byte into a continuation byte with
- # the same data bits as originally. ...
- substr($this_bytes, 0, 1)
- = start_byte_to_cont(substr($this_bytes,
- 0, 1));
-
- # ... Then we prepend it with a known
- # overlong sequence. This should evaluate
- # to the exact same code point as the
- # original.
- $this_bytes
- = I8_to_native("\xff")
- . (I8_to_native(chr $::lowest_continuation)
- x ( $::max_bytes - 1 - length($this_bytes)))
- . $this_bytes;
- $this_length = length($this_bytes);
- $this_needed_to_discern_len
- = $::max_bytes - ($this_expected_len
- - $this_needed_to_discern_len);
- $this_expected_len = $::max_bytes;
- push @expected_return_flags, $::UTF8_GOT_LONG;
- }
- if ($malformations_name =~ /short/) {
+ my $this_uv_string = $uv_string;
+
+ my @expected_return_flags
+ = @expected_malformation_return_flags;
+ my @expected_warnings;
+ push @expected_warnings, @expected_malformation_warnings
+ if $expect_warnings_for_malformed;
+
+ # The overflow malformation is done differently than other
+ # malformations. It comes from manually typed tests in
+ # the test array, but it also is above Unicode and uses
+ # Perl extended UTF-8, so affects some of the flags being
+ # tested. We now make it be treated like one of the other
+ # generated malformations.
+ if ($will_overflow) {
- # Just tell the test to not look far
- # enough into the input.
- $this_length--;
- $this_expected_len--;
- push @expected_return_flags, $::UTF8_GOT_SHORT;
+ # An overflow is (way) above Unicode, and overrides
+ # everything else.
+ $expect_regular_warnings = 0;
+
+ # Earlier, we tentatively calculated whether this
+ # should emit a message or not. It's tentative
+ # because, even if we ordinarily would output it, we
+ # don't if malformations are allowed -- except an
+ # overflow is also a SUPER and PERL_EXTENDED, and if
+ # warnings for those are enabled, the overflow
+ # warning does get raised.
+ if ( $expect_warnings_for_overflow
+ && ( $malformed_allow_type == 0
+ || ( $this_warning_flags
+ & ($::UTF8_WARN_SUPER
+ |$::UTF8_WARN_PERL_EXTENDED))))
+ {
+ push @expected_warnings, $overflow_msg_pattern;
}
- if ($malformations_name
- =~ /non-continuation/)
+ }
+
+ # It may be that the malformations have shortened the
+ # amount of input we look at so much that we can't tell
+ # what the category the code point was in. Otherwise, set
+ # up the expected return flags based on the warnings and
+ # disallowments.
+ if ($this_expected_len < $this_needed_to_discern_len) {
+ $expect_regular_warnings = 0;
+ }
+ elsif ( ($this_warning_flags & $this_utf8n_flag_to_warn)
+ || ( $this_disallow_flags
+ & $this_utf8n_flag_to_disallow))
+ {
+ push @expected_return_flags, $return_flag;
+ }
+
+ # Finish setting up the expected warning.
+ if ($expect_regular_warnings) {
+
+ # So far the array contains warnings generated by
+ # malformations. Add the expected regular one.
+ unshift @expected_warnings, $this_cp_message_qr;
+
+ # But it may need to be modified, because either of
+ # these malformations means we can't determine the
+ # expected code point.
+ if ( $short || $unexpected_noncont
+ || $dont_use_overlong_cp)
{
- # Change the final continuation byte into
- # a non one.
- my $pos = ($short) ? -2 : -1;
- substr($this_bytes, $pos, 1) = '?';
- $this_expected_len--;
- push @expected_return_flags,
- $::UTF8_GOT_NON_CONTINUATION;
+ my $first_byte = substr($this_bytes, 0, 1);
+ $expected_warnings[0] = display_bytes(
+ substr($this_bytes, 0, $this_expected_len));
+ $expected_warnings[0]
+ = qr/[Aa]\Qny UTF-8 sequence that starts with\E
+ \Q $expected_warnings[0]\E
+ \Q $this_non_cp_trailing_text\E/x;
}
}
- my $eval_warn = $do_warning
- ? "use warnings '$trial_warning_category'"
- : $trial_warning_category eq "utf8"
- ? "no warnings 'utf8'"
- : ( "use warnings 'utf8';"
- . " no warnings '$trial_warning_category'");
-
- # Is effectively disallowed if we've set up a
- # malformation, even if the flag indicates it is
- # allowed. Fix up test name to indicate this as
- # well
- my $disallowed = $disallow_flag
- || $malformations_name;
- my $this_name = "utf8n_to_uvchr_error() $testname: "
- . (($disallow_flag)
- ? 'disallowed'
- : $disallowed
- ? $disallowed
- : 'allowed');
- $this_name .= ", $eval_warn";
- $this_name .= ", " . (($warn_flag)
- ? 'with warning flag'
- : 'no warning flag');
+ # Is effectively disallowed if we've set up a malformation
+ # (unless malformations are allowed), even if the flag
+ # indicates it is allowed. Fix up test name to indicate
+ # this as well
+ my $disallowed = 0;
+ if ( $this_disallow_flags & $this_utf8n_flag_to_disallow
+ && $this_expected_len >= $this_needed_to_discern_len)
+ {
+ $disallowed = 1;
+ }
+ if ($malformations_name) {
+ if ($malformed_allow_type == 0) {
+ $disallowed = 1;
+ }
+ elsif ($malformed_allow_type == 1) {
+
+ # Even if allowed, the malformation returns the
+ # REPLACEMENT CHARACTER.
+ $expected_uv = 0xFFFD;
+ $this_uv_string = "0xFFFD"
+ }
+ }
+ my $this_name = "utf8n_to_uvchr_error() $testname: ";
+ if (! $initially_malformed) {
+ $this_name .= ($disallowed)
+ ? 'disallowed, '
+ : 'allowed, ';
+ }
+ $this_name .= "$eval_warn";
+ $this_name .= ", " . (( $this_warning_flags
+ & $this_utf8n_flag_to_warn)
+ ? 'with flag for raising warnings'
+ : 'no flag for raising warnings');
+ $this_name .= $malformations_name;
+
+ # Do the actual test using an eval
undef @warnings_gotten;
my $ret_ref;
- my $display_bytes = display_bytes($this_bytes);
- my $call = " Call was: $eval_warn; \$ret_ref"
- . " = test_utf8n_to_uvchr_error("
- . "'$display_bytes', $this_length,"
- . "$warn_flag"
- . "|$disallow_flag)";
+ my $this_flags
+ = $allow_flags|$this_warning_flags|$this_disallow_flags;
my $eval_text = "$eval_warn; \$ret_ref"
. " = test_utf8n_to_uvchr_error("
- . "'$this_bytes',"
- . " $this_length, $warn_flag"
- . "|$disallow_flag)";
+ . "'$this_bytes', $this_length, $this_flags)";
eval "$eval_text";
- if (! ok ("$@ eq ''",
- "$this_name: eval succeeded"))
+ if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
{
- diag "\$!='$!'; eval'd=\"$call\"";
+ diag "\$@='$@'; call was: "
+ . utf8n_display_call($eval_text);
next;
}
if ($disallowed) {
- is($ret_ref->[0], 0, "$this_name: Returns 0")
- or diag $call;
+ is($ret_ref->[0], 0, " And returns 0")
+ or diag "Call was: " . utf8n_display_call($eval_text);
}
else {
is($ret_ref->[0], $expected_uv,
- "$this_name: Returns expected uv: "
- . sprintf("0x%04X", $expected_uv))
- or diag $call;
+ " And returns expected uv: "
+ . $this_uv_string)
+ or diag "Call was: " . utf8n_display_call($eval_text);
}
is($ret_ref->[1], $this_expected_len,
- "$this_name: Returns expected length:"
+ " And returns expected length:"
. " $this_expected_len")
- or diag $call;
+ or diag "Call was: " . utf8n_display_call($eval_text);
my $returned_flags = $ret_ref->[2];
for (my $i = @expected_return_flags - 1; $i >= 0; $i--) {
- if (ok($expected_return_flags[$i] & $returned_flags,
- "Expected and got error bit return"
- . " for $malformations[$i] malformation"))
- {
+ if ($expected_return_flags[$i] & $returned_flags) {
+ if ($expected_return_flags[$i]
+ == $::UTF8_GOT_PERL_EXTENDED)
+ {
+ pass(" Expected and got return flag for"
+ . " PERL_EXTENDED");
+ }
+ # The first entries in this are
+ # malformations
+ elsif ($i > @malformation_names - 1) {
+ pass(" Expected and got return flag"
+ . " for " . $controlling_warning_category);
+ }
+ else {
+ pass(" Expected and got return flag for "
+ . $malformation_names[$i]
+ . " malformation");
+ }
$returned_flags &= ~$expected_return_flags[$i];
+ splice @expected_return_flags, $i, 1;
}
- splice @expected_return_flags, $i, 1;
- }
- is(scalar @expected_return_flags, 0,
- "Got all the expected malformation errors")
- or diag Dumper \@expected_return_flags;
-
- if ( $this_expected_len >= $this_needed_to_discern_len
- && ($warn_flag || $disallow_flag))
- {
- is($returned_flags, $expected_error_flags,
- "Got the correct error flag")
- or diag $call;
}
- else {
- is($returned_flags, 0, "Got no other error flag")
- or
- # We strip off any prefixes from the flag names
- diag "The unexpected flags were: "
+ is($returned_flags, 0,
+ " Got no unexpected return flags")
+ or diag "The unexpected flags gotten were: "
. (flags_to_text($returned_flags,
\@utf8n_flags_to_text)
+ # We strip off any prefixes from the flag
+ # names
+ =~ s/ \b [A-Z] _ //xgr);
+ is (scalar @expected_return_flags, 0,
+ " Got all expected return flags")
+ or diag "The expected flags not gotten were: "
+ . (flags_to_text(eval join("|",
+ @expected_return_flags),
+ \@utf8n_flags_to_text)
+ # We strip off any prefixes from the flag
+ # names
=~ s/ \b [A-Z] _ //xgr);
- }
-
- if (@malformations) {
- if (! $do_warning && $trial_warning_category eq 'utf8') {
- goto no_warnings_expected;
- }
-
- # Check that each malformation generates a
- # warning, removing that warning if found
- MALFORMATION:
- foreach my $malformation (@malformations) {
- foreach (my $i = 0; $i < @warnings_gotten; $i++) {
- if ($warnings_gotten[$i] =~ /$malformation/) {
- pass("Expected and got"
- . "'$malformation' warning");
- splice @warnings_gotten, $i, 1;
- next MALFORMATION;
- }
- }
- fail("Expected '$malformation' warning"
- . " but didn't get it");
-
- }
- }
-
- # Any overflow will override any super or above-31
- # warnings.
- goto no_warnings_expected
- if $will_overflow || $this_expected_len
- < $this_needed_to_discern_len;
-
- if ( ! $do_warning
- && ( $trial_warning_category eq 'utf8'
- || $trial_warning_category eq $controlling_warning_category))
- {
- goto no_warnings_expected;
- }
- elsif ($warn_flag) {
- if (is(scalar @warnings_gotten, 1,
- "$this_name: Got a single warning "))
- {
- like($warnings_gotten[0], $message,
- "$this_name: Got expected warning")
- or diag $call;
- }
- else {
- diag $call;
- if (scalar @warnings_gotten) {
- output_warnings(@warnings_gotten);
- }
- }
- }
- else {
- no_warnings_expected:
- unless (is(scalar @warnings_gotten, 0,
- "$this_name: Got no warnings"))
- {
- diag $call;
- output_warnings(@warnings_gotten);
- }
- }
+ do_warnings_test(@expected_warnings)
+ or diag "Call was: " . utf8n_display_call($eval_text);
+ undef @warnings_gotten;
# Check CHECK_ONLY results when the input is
# disallowed. Do this when actually disallowed,
- # not just when the $disallow_flag is set
+ # not just when the $this_disallow_flags is set
if ($disallowed) {
- undef @warnings_gotten;
- $ret_ref = test_utf8n_to_uvchr_error(
- $this_bytes, $this_length,
- $disallow_flag|$::UTF8_CHECK_ONLY);
- is($ret_ref->[0], 0,
- "$this_name, CHECK_ONLY: Returns 0")
- or diag $call;
+ my $this_flags = $this_disallow_flags|$::UTF8_CHECK_ONLY;
+ my $eval_text = "use warnings; no warnings 'deprecated'; \$ret_ref ="
+ . " test_utf8n_to_uvchr_error('"
+ . "$this_bytes', $this_length,"
+ . " $this_flags)";
+ eval $eval_text;
+ if (! ok ("$@ eq ''",
+ " And eval succeeded with CHECK_ONLY"))
+ {
+ diag "\$@='$@'; Call was: "
+ . utf8n_display_call($eval_text);
+ next;
+ }
+ is($ret_ref->[0], 0, " CHECK_ONLY: Returns 0")
+ or diag "Call was: " . utf8n_display_call($eval_text);
is($ret_ref->[1], -1,
- "$this_name: CHECK_ONLY: returns -1 for length")
- or diag $call;
+ " CHECK_ONLY: returns -1 for length")
+ or diag "Call was: " . utf8n_display_call($eval_text);
if (! is(scalar @warnings_gotten, 0,
- "$this_name, CHECK_ONLY: no warnings"
- . " generated"))
+ " CHECK_ONLY: no warnings generated"))
{
- diag $call;
+ diag "Call was: " . utf8n_display_call($eval_text);
output_warnings(@warnings_gotten);
}
}
# Now repeat some of the above, but for
# uvchr_to_utf8_flags(). Since this comes from an
- # existing code point, it hasn't overflowed, and
- # isn't malformed.
- next if @malformations;
-
- # The warning and disallow flags passed in are for
- # utf8n_to_uvchr_error(). Convert them for
- # uvchr_to_utf8_flags().
- my $uvchr_warn_flag = 0;
- my $uvchr_disallow_flag = 0;
- if ($warn_flag) {
- if ($warn_flag == $::UTF8_WARN_SURROGATE) {
- $uvchr_warn_flag = $::UNICODE_WARN_SURROGATE
- }
- elsif ($warn_flag == $::UTF8_WARN_NONCHAR) {
- $uvchr_warn_flag = $::UNICODE_WARN_NONCHAR
- }
- elsif ($warn_flag == $::UTF8_WARN_SUPER) {
- $uvchr_warn_flag = $::UNICODE_WARN_SUPER
- }
- elsif ($warn_flag == $::UTF8_WARN_ABOVE_31_BIT) {
- $uvchr_warn_flag
- = $::UNICODE_WARN_ABOVE_31_BIT;
- }
- else {
- fail(sprintf "Unexpected warn flag: %x",
- $warn_flag);
- next;
- }
- }
- if ($disallow_flag) {
- if ($disallow_flag == $::UTF8_DISALLOW_SURROGATE)
- {
- $uvchr_disallow_flag
- = $::UNICODE_DISALLOW_SURROGATE;
- }
- elsif ($disallow_flag == $::UTF8_DISALLOW_NONCHAR)
- {
- $uvchr_disallow_flag
- = $::UNICODE_DISALLOW_NONCHAR;
- }
- elsif ($disallow_flag == $::UTF8_DISALLOW_SUPER) {
- $uvchr_disallow_flag
- = $::UNICODE_DISALLOW_SUPER;
- }
- elsif ($disallow_flag
- == $::UTF8_DISALLOW_ABOVE_31_BIT)
- {
- $uvchr_disallow_flag =
- $::UNICODE_DISALLOW_ABOVE_31_BIT;
- }
- else {
- fail(sprintf "Unexpected disallow flag: %x",
- $disallow_flag);
- next;
- }
- }
-
- $disallowed = $uvchr_disallow_flag;
+ # existing code point, it hasn't overflowed, and isn't
+ # malformed.
+ next if @malformation_names;
+
+ $this_warning_flags = ($use_warn_flag)
+ ? $this_uvchr_flag_to_warn
+ : 0;
+ $this_disallow_flags = ($do_disallow)
+ ? $this_uvchr_flag_to_disallow
+ : 0;
+
+ $disallowed = $this_disallow_flags
+ & $this_uvchr_flag_to_disallow;
+ $this_name .= ", " . (( $this_warning_flags
+ & $this_utf8n_flag_to_warn)
+ ? 'with flag for raising warnings'
+ : 'no flag for raising warnings');
$this_name = "uvchr_to_utf8_flags() $testname: "
- . (($uvchr_disallow_flag)
+ . (($disallowed)
? 'disallowed'
- : ($disallowed)
- ? 'ABOVE_31_BIT allowed'
: 'allowed');
$this_name .= ", $eval_warn";
- $this_name .= ", " . (($uvchr_warn_flag)
+ $this_name .= ", " . (( $this_warning_flags
+ & $this_uvchr_flag_to_warn)
? 'with warning flag'
: 'no warning flag');
undef @warnings_gotten;
my $ret;
- my $warn_flag = sprintf "0x%x", $uvchr_warn_flag;
- my $disallow_flag = sprintf "0x%x",
- $uvchr_disallow_flag;
- $call = sprintf(" Call was: $eval_warn; \$ret"
- . " = test_uvchr_to_utf8_flags("
- . " 0x%x, $warn_flag|$disallow_flag)",
- $allowed_uv);
+ $this_flags = $this_warning_flags|$this_disallow_flags;
$eval_text = "$eval_warn; \$ret ="
. " test_uvchr_to_utf8_flags("
- . "$allowed_uv, $warn_flag|"
- . "$disallow_flag)";
+ . "$allowed_uv, $this_flags)";
eval "$eval_text";
if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
{
- diag "\$!='$!'; eval'd=\"$eval_text\"";
+ diag "\$@='$@'; call was: "
+ . uvchr_display_call($eval_text);
next;
}
if ($disallowed) {
- is($ret, undef, "$this_name: Returns undef")
- or diag $call;
+ is($ret, undef, " And returns undef")
+ or diag "Call was: " . uvchr_display_call($eval_text);
}
else {
- is($ret, $bytes, "$this_name: Returns expected string")
- or diag $call;
- }
- if (! $do_warning
- && ($trial_warning_category eq 'utf8' || $trial_warning_category eq $controlling_warning_category))
- {
- if (!is(scalar @warnings_gotten, 0,
- "$this_name: No warnings generated"))
- {
- diag $call;
- output_warnings(@warnings_gotten);
- }
- }
- elsif ( $uvchr_warn_flag
- && ( $trial_warning_category eq 'utf8'
- || $trial_warning_category eq $controlling_warning_category))
- {
- if (is(scalar @warnings_gotten, 1,
- "$this_name: Got a single warning "))
- {
- like($warnings_gotten[0], $message,
- "$this_name: Got expected warning")
- or diag $call;
- }
- else {
- diag $call;
- output_warnings(@warnings_gotten)
- if scalar @warnings_gotten;
- }
+ is($ret, $this_bytes, " And returns expected string")
+ or diag "Call was: " . uvchr_display_call($eval_text);
}
+
+ do_warnings_test(@expected_warnings)
+ or diag "Call was: " . uvchr_display_call($eval_text);
}
}
}