X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/c0236afee0c5845d3823612c5cd34eccc4d29321..dbb8d79823035c58d8e52ea1c4bc1e5d16d1c2de:/ext/XS-APItest/t/utf8.t diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t index 61a3ff8..1c759cd 100644 --- a/ext/XS-APItest/t/utf8.t +++ b/ext/XS-APItest/t/utf8.t @@ -2,107 +2,57 @@ use strict; use Test::More; + +# This file tests various functions and macros in the API related to UTF-8. + +BEGIN { + use_ok('XS::APItest'); + require 'charset_tools.pl'; + require './t/utf8_setup.pl'; +}; + $|=1; +no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit + # machines, and that is tested elsewhere + use XS::APItest; my $pound_sign = chr utf8::unicode_to_native(163); -sub isASCII { ord "A" == 65 } - -sub display_bytes { - my $string = shift; - return '"' - . join("", map { sprintf("\\x%02x", ord $_) } split "", $string) - . '"'; -} - # This test file can't use byte_utf8a_to_utf8n() from t/charset_tools.pl # because that uses the same functions we are testing here. So UTF-EBCDIC -# strings are hard-coded as I8 strings in this file instead, and we use array -# lookup to translate into the appropriate code page. - -my @i8_to_native = ( # Only code page 1047 so far. -# _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F -0x00,0x01,0x02,0x03,0x37,0x2D,0x2E,0x2F,0x16,0x05,0x15,0x0B,0x0C,0x0D,0x0E,0x0F, -0x10,0x11,0x12,0x13,0x3C,0x3D,0x32,0x26,0x18,0x19,0x3F,0x27,0x1C,0x1D,0x1E,0x1F, -0x40,0x5A,0x7F,0x7B,0x5B,0x6C,0x50,0x7D,0x4D,0x5D,0x5C,0x4E,0x6B,0x60,0x4B,0x61, -0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0x7A,0x5E,0x4C,0x7E,0x6E,0x6F, -0x7C,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6, -0xD7,0xD8,0xD9,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xAD,0xE0,0xBD,0x5F,0x6D, -0x79,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x91,0x92,0x93,0x94,0x95,0x96, -0x97,0x98,0x99,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,0xA8,0xA9,0xC0,0x4F,0xD0,0xA1,0x07, -0x20,0x21,0x22,0x23,0x24,0x25,0x06,0x17,0x28,0x29,0x2A,0x2B,0x2C,0x09,0x0A,0x1B, -0x30,0x31,0x1A,0x33,0x34,0x35,0x36,0x08,0x38,0x39,0x3A,0x3B,0x04,0x14,0x3E,0xFF, -0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x51,0x52,0x53,0x54,0x55,0x56, -0x57,0x58,0x59,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x70,0x71,0x72,0x73, -0x74,0x75,0x76,0x77,0x78,0x80,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,0x90,0x9A,0x9B,0x9C, -0x9D,0x9E,0x9F,0xA0,0xAA,0xAB,0xAC,0xAE,0xAF,0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6, -0xB7,0xB8,0xB9,0xBA,0xBB,0xBC,0xBE,0xBF,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,0xDA,0xDB, -0xDC,0xDD,0xDE,0xDF,0xE1,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,0xFA,0xFB,0xFC,0xFD,0xFE, -); - -*I8_to_native = (isASCII) - ? sub { return shift } - : sub { return join "", map { chr $i8_to_native[ord $_] } - split "", shift }; - -my $is64bit = length sprintf("%x", ~0) > 8; - - -# Test utf8n_to_uvchr(). These provide essentially complete code coverage. -# Copied from utf8.h -my $UTF8_ALLOW_EMPTY = 0x0001; -my $UTF8_ALLOW_CONTINUATION = 0x0002; -my $UTF8_ALLOW_NON_CONTINUATION = 0x0004; -my $UTF8_ALLOW_SHORT = 0x0008; -my $UTF8_ALLOW_LONG = 0x0010; -my $UTF8_DISALLOW_SURROGATE = 0x0020; -my $UTF8_WARN_SURROGATE = 0x0040; -my $UTF8_DISALLOW_NONCHAR = 0x0080; -my $UTF8_WARN_NONCHAR = 0x0100; -my $UTF8_DISALLOW_SUPER = 0x0200; -my $UTF8_WARN_SUPER = 0x0400; -my $UTF8_DISALLOW_ABOVE_31_BIT = 0x0800; -my $UTF8_WARN_ABOVE_31_BIT = 0x1000; -my $UTF8_CHECK_ONLY = 0x2000; - -# Test uvchr_to_utf8(). -my $UNICODE_WARN_SURROGATE = 0x0001; -my $UNICODE_WARN_NONCHAR = 0x0002; -my $UNICODE_WARN_SUPER = 0x0004; -my $UNICODE_WARN_ABOVE_31_BIT = 0x0008; -my $UNICODE_DISALLOW_SURROGATE = 0x0010; -my $UNICODE_DISALLOW_NONCHAR = 0x0020; -my $UNICODE_DISALLOW_SUPER = 0x0040; -my $UNICODE_DISALLOW_ABOVE_31_BIT = 0x0080; +# strings are hard-coded as I8 strings in this file instead, and we use the +# translation functions to/from I8 from that file instead. my $look_for_everything_utf8n_to - = $UTF8_DISALLOW_SURROGATE - | $UTF8_WARN_SURROGATE - | $UTF8_DISALLOW_NONCHAR - | $UTF8_WARN_NONCHAR - | $UTF8_DISALLOW_SUPER - | $UTF8_WARN_SUPER - | $UTF8_DISALLOW_ABOVE_31_BIT - | $UTF8_WARN_ABOVE_31_BIT; + = $::UTF8_DISALLOW_SURROGATE + | $::UTF8_WARN_SURROGATE + | $::UTF8_DISALLOW_NONCHAR + | $::UTF8_WARN_NONCHAR + | $::UTF8_DISALLOW_SUPER + | $::UTF8_WARN_SUPER + | $::UTF8_DISALLOW_ABOVE_31_BIT + | $::UTF8_WARN_ABOVE_31_BIT; my $look_for_everything_uvchr_to - = $UNICODE_DISALLOW_SURROGATE - | $UNICODE_WARN_SURROGATE - | $UNICODE_DISALLOW_NONCHAR - | $UNICODE_WARN_NONCHAR - | $UNICODE_DISALLOW_SUPER - | $UNICODE_WARN_SUPER - | $UNICODE_DISALLOW_ABOVE_31_BIT - | $UNICODE_WARN_ABOVE_31_BIT; + = $::UNICODE_DISALLOW_SURROGATE + | $::UNICODE_WARN_SURROGATE + | $::UNICODE_DISALLOW_NONCHAR + | $::UNICODE_WARN_NONCHAR + | $::UNICODE_DISALLOW_SUPER + | $::UNICODE_WARN_SUPER + | $::UNICODE_DISALLOW_ABOVE_31_BIT + | $::UNICODE_WARN_ABOVE_31_BIT; foreach ([0, '', '', 'empty'], [0, 'N', 'N', '1 char'], [1, 'NN', 'N', '1 char substring'], [-2, 'Perl', 'Rules', 'different'], [0, $pound_sign, $pound_sign, 'pound sign'], - [1, $pound_sign . 10, $pound_sign . 1, '10 pounds is more than 1 pound'], - [1, $pound_sign . $pound_sign, $pound_sign, '2 pound signs are more than 1'], + [1, $pound_sign . 10, $pound_sign . 1, + '10 pounds is more than 1 pound'], + [1, $pound_sign . $pound_sign, $pound_sign, + '2 pound signs are more than 1'], [-2, ' $!', " \x{1F42B}!", 'Camels are worth more than 1 dollar'], [-1, '!', "!\x{1F42A}", 'Initial substrings match'], ) { @@ -141,81 +91,345 @@ my %code_points = ( # as of this writing, considers potentially problematic on ASCII 0xD000 => (isASCII) ? "\xed\x80\x80" : I8_to_native("\xf1\xb4\xa0\xa0"), - # Bracket the surrogates - 0xD7FF => (isASCII) ? "\xed\x9f\xbf" : I8_to_native("\xf1\xb5\xbf\xbf"), - 0xE000 => (isASCII) ? "\xee\x80\x80" : I8_to_native("\xf1\xb8\xa0\xa0"), - - # Bracket the 32 contiguous non characters - 0xFDCF => (isASCII) ? "\xef\xb7\x8f" : I8_to_native("\xf1\xbf\xae\xaf"), - 0xFDF0 => (isASCII) ? "\xef\xb7\xb0" : I8_to_native("\xf1\xbf\xaf\xb0"), - - # Mostly bracket non-characters, but some are transitions to longer - # strings - 0xFFFD => (isASCII) ? "\xef\xbf\xbd" : I8_to_native("\xf1\xbf\xbf\xbd"), - 0x10000 - 1 => (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"), - 0x10000 => (isASCII) ? "\xf0\x90\x80\x80" : I8_to_native("\xf2\xa0\xa0\xa0"), - 0x1FFFD => (isASCII) ? "\xf0\x9f\xbf\xbd" : I8_to_native("\xf3\xbf\xbf\xbd"), - 0x20000 => (isASCII) ? "\xf0\xa0\x80\x80" : I8_to_native("\xf4\xa0\xa0\xa0"), - 0x2FFFD => (isASCII) ? "\xf0\xaf\xbf\xbd" : I8_to_native("\xf5\xbf\xbf\xbd"), - 0x30000 => (isASCII) ? "\xf0\xb0\x80\x80" : I8_to_native("\xf6\xa0\xa0\xa0"), - 0x3FFFD => (isASCII) ? "\xf0\xbf\xbf\xbd" : I8_to_native("\xf7\xbf\xbf\xbd"), - 0x40000 - 1 => (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"), - 0x40000 => (isASCII) ? "\xf1\x80\x80\x80" : I8_to_native("\xf8\xa8\xa0\xa0\xa0"), - 0x4FFFD => (isASCII) ? "\xf1\x8f\xbf\xbd" : I8_to_native("\xf8\xa9\xbf\xbf\xbd"), - 0x50000 => (isASCII) ? "\xf1\x90\x80\x80" : I8_to_native("\xf8\xaa\xa0\xa0\xa0"), - 0x5FFFD => (isASCII) ? "\xf1\x9f\xbf\xbd" : I8_to_native("\xf8\xab\xbf\xbf\xbd"), - 0x60000 => (isASCII) ? "\xf1\xa0\x80\x80" : I8_to_native("\xf8\xac\xa0\xa0\xa0"), - 0x6FFFD => (isASCII) ? "\xf1\xaf\xbf\xbd" : I8_to_native("\xf8\xad\xbf\xbf\xbd"), - 0x70000 => (isASCII) ? "\xf1\xb0\x80\x80" : I8_to_native("\xf8\xae\xa0\xa0\xa0"), - 0x7FFFD => (isASCII) ? "\xf1\xbf\xbf\xbd" : I8_to_native("\xf8\xaf\xbf\xbf\xbd"), - 0x80000 => (isASCII) ? "\xf2\x80\x80\x80" : I8_to_native("\xf8\xb0\xa0\xa0\xa0"), - 0x8FFFD => (isASCII) ? "\xf2\x8f\xbf\xbd" : I8_to_native("\xf8\xb1\xbf\xbf\xbd"), - 0x90000 => (isASCII) ? "\xf2\x90\x80\x80" : I8_to_native("\xf8\xb2\xa0\xa0\xa0"), - 0x9FFFD => (isASCII) ? "\xf2\x9f\xbf\xbd" : I8_to_native("\xf8\xb3\xbf\xbf\xbd"), - 0xA0000 => (isASCII) ? "\xf2\xa0\x80\x80" : I8_to_native("\xf8\xb4\xa0\xa0\xa0"), - 0xAFFFD => (isASCII) ? "\xf2\xaf\xbf\xbd" : I8_to_native("\xf8\xb5\xbf\xbf\xbd"), - 0xB0000 => (isASCII) ? "\xf2\xb0\x80\x80" : I8_to_native("\xf8\xb6\xa0\xa0\xa0"), - 0xBFFFD => (isASCII) ? "\xf2\xbf\xbf\xbd" : I8_to_native("\xf8\xb7\xbf\xbf\xbd"), - 0xC0000 => (isASCII) ? "\xf3\x80\x80\x80" : I8_to_native("\xf8\xb8\xa0\xa0\xa0"), - 0xCFFFD => (isASCII) ? "\xf3\x8f\xbf\xbd" : I8_to_native("\xf8\xb9\xbf\xbf\xbd"), - 0xD0000 => (isASCII) ? "\xf3\x90\x80\x80" : I8_to_native("\xf8\xba\xa0\xa0\xa0"), - 0xDFFFD => (isASCII) ? "\xf3\x9f\xbf\xbd" : I8_to_native("\xf8\xbb\xbf\xbf\xbd"), - 0xE0000 => (isASCII) ? "\xf3\xa0\x80\x80" : I8_to_native("\xf8\xbc\xa0\xa0\xa0"), - 0xEFFFD => (isASCII) ? "\xf3\xaf\xbf\xbd" : I8_to_native("\xf8\xbd\xbf\xbf\xbd"), - 0xF0000 => (isASCII) ? "\xf3\xb0\x80\x80" : I8_to_native("\xf8\xbe\xa0\xa0\xa0"), - 0xFFFFD => (isASCII) ? "\xf3\xbf\xbf\xbd" : I8_to_native("\xf8\xbf\xbf\xbf\xbd"), - 0x100000 => (isASCII) ? "\xf4\x80\x80\x80" : I8_to_native("\xf9\xa0\xa0\xa0\xa0"), - 0x10FFFD => (isASCII) ? "\xf4\x8f\xbf\xbd" : I8_to_native("\xf9\xa1\xbf\xbf\xbd"), - 0x110000 => (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"), + # Bracket the surrogates, and include several surrogates + 0xD7FF => (isASCII) ? "\xed\x9f\xbf" : I8_to_native("\xf1\xb5\xbf\xbf"), + 0xD800 => (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"), + 0xDC00 => (isASCII) ? "\xed\xb0\x80" : I8_to_native("\xf1\xb7\xa0\xa0"), + 0xDFFF => (isASCII) ? "\xee\x80\x80" : I8_to_native("\xf1\xb8\xa0\xa0"), + 0xDFFF => (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"), + 0xE000 => (isASCII) ? "\xee\x80\x80" : I8_to_native("\xf1\xb8\xa0\xa0"), + + # Include the 32 contiguous non characters, and surrounding code points + 0xFDCF => (isASCII) ? "\xef\xb7\x8f" : I8_to_native("\xf1\xbf\xae\xaf"), + 0xFDD0 => (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"), + 0xFDD1 => (isASCII) ? "\xef\xb7\x91" : I8_to_native("\xf1\xbf\xae\xb1"), + 0xFDD2 => (isASCII) ? "\xef\xb7\x92" : I8_to_native("\xf1\xbf\xae\xb2"), + 0xFDD3 => (isASCII) ? "\xef\xb7\x93" : I8_to_native("\xf1\xbf\xae\xb3"), + 0xFDD4 => (isASCII) ? "\xef\xb7\x94" : I8_to_native("\xf1\xbf\xae\xb4"), + 0xFDD5 => (isASCII) ? "\xef\xb7\x95" : I8_to_native("\xf1\xbf\xae\xb5"), + 0xFDD6 => (isASCII) ? "\xef\xb7\x96" : I8_to_native("\xf1\xbf\xae\xb6"), + 0xFDD7 => (isASCII) ? "\xef\xb7\x97" : I8_to_native("\xf1\xbf\xae\xb7"), + 0xFDD8 => (isASCII) ? "\xef\xb7\x98" : I8_to_native("\xf1\xbf\xae\xb8"), + 0xFDD9 => (isASCII) ? "\xef\xb7\x99" : I8_to_native("\xf1\xbf\xae\xb9"), + 0xFDDA => (isASCII) ? "\xef\xb7\x9a" : I8_to_native("\xf1\xbf\xae\xba"), + 0xFDDB => (isASCII) ? "\xef\xb7\x9b" : I8_to_native("\xf1\xbf\xae\xbb"), + 0xFDDC => (isASCII) ? "\xef\xb7\x9c" : I8_to_native("\xf1\xbf\xae\xbc"), + 0xFDDD => (isASCII) ? "\xef\xb7\x9d" : I8_to_native("\xf1\xbf\xae\xbd"), + 0xFDDE => (isASCII) ? "\xef\xb7\x9e" : I8_to_native("\xf1\xbf\xae\xbe"), + 0xFDDF => (isASCII) ? "\xef\xb7\x9f" : I8_to_native("\xf1\xbf\xae\xbf"), + 0xFDE0 => (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"), + 0xFDE1 => (isASCII) ? "\xef\xb7\xa1" : I8_to_native("\xf1\xbf\xaf\xa1"), + 0xFDE2 => (isASCII) ? "\xef\xb7\xa2" : I8_to_native("\xf1\xbf\xaf\xa2"), + 0xFDE3 => (isASCII) ? "\xef\xb7\xa3" : I8_to_native("\xf1\xbf\xaf\xa3"), + 0xFDE4 => (isASCII) ? "\xef\xb7\xa4" : I8_to_native("\xf1\xbf\xaf\xa4"), + 0xFDE5 => (isASCII) ? "\xef\xb7\xa5" : I8_to_native("\xf1\xbf\xaf\xa5"), + 0xFDE6 => (isASCII) ? "\xef\xb7\xa6" : I8_to_native("\xf1\xbf\xaf\xa6"), + 0xFDE7 => (isASCII) ? "\xef\xb7\xa7" : I8_to_native("\xf1\xbf\xaf\xa7"), + 0xFDE8 => (isASCII) ? "\xef\xb7\xa8" : I8_to_native("\xf1\xbf\xaf\xa8"), + 0xFDEa => (isASCII) ? "\xef\xb7\x99" : I8_to_native("\xf1\xbf\xaf\xa9"), + 0xFDEA => (isASCII) ? "\xef\xb7\xaa" : I8_to_native("\xf1\xbf\xaf\xaa"), + 0xFDEB => (isASCII) ? "\xef\xb7\xab" : I8_to_native("\xf1\xbf\xaf\xab"), + 0xFDEC => (isASCII) ? "\xef\xb7\xac" : I8_to_native("\xf1\xbf\xaf\xac"), + 0xFDED => (isASCII) ? "\xef\xb7\xad" : I8_to_native("\xf1\xbf\xaf\xad"), + 0xFDEE => (isASCII) ? "\xef\xb7\xae" : I8_to_native("\xf1\xbf\xaf\xae"), + 0xFDEF => (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"), + 0xFDF0 => (isASCII) ? "\xef\xb7\xb0" : I8_to_native("\xf1\xbf\xaf\xb0"), + + # Mostly around non-characters, but some are transitions to longer strings + 0xFFFD => (isASCII) ? "\xef\xbf\xbd" : I8_to_native("\xf1\xbf\xbf\xbd"), + 0x10000 - 1 => (isASCII) + ? "\xef\xbf\xbf" + : I8_to_native("\xf1\xbf\xbf\xbf"), + 0x10000 => (isASCII) + ? "\xf0\x90\x80\x80" + : I8_to_native("\xf2\xa0\xa0\xa0"), + 0x1FFFD => (isASCII) + ? "\xf0\x9f\xbf\xbd" + : I8_to_native("\xf3\xbf\xbf\xbd"), + 0x1FFFE => (isASCII) + ? "\xf0\x9f\xbf\xbe" + : I8_to_native("\xf3\xbf\xbf\xbe"), + 0x1FFFF => (isASCII) + ? "\xf0\x9f\xbf\xbf" + : I8_to_native("\xf3\xbf\xbf\xbf"), + 0x20000 => (isASCII) + ? "\xf0\xa0\x80\x80" + : I8_to_native("\xf4\xa0\xa0\xa0"), + 0x2FFFD => (isASCII) + ? "\xf0\xaf\xbf\xbd" + : I8_to_native("\xf5\xbf\xbf\xbd"), + 0x2FFFE => (isASCII) + ? "\xf0\xaf\xbf\xbe" + : I8_to_native("\xf5\xbf\xbf\xbe"), + 0x2FFFF => (isASCII) + ? "\xf0\xaf\xbf\xbf" + : I8_to_native("\xf5\xbf\xbf\xbf"), + 0x30000 => (isASCII) + ? "\xf0\xb0\x80\x80" + : I8_to_native("\xf6\xa0\xa0\xa0"), + 0x3FFFD => (isASCII) + ? "\xf0\xbf\xbf\xbd" + : I8_to_native("\xf7\xbf\xbf\xbd"), + 0x3FFFE => (isASCII) + ? "\xf0\xbf\xbf\xbe" + : I8_to_native("\xf7\xbf\xbf\xbe"), + 0x40000 - 1 => (isASCII) + ? "\xf0\xbf\xbf\xbf" + : I8_to_native("\xf7\xbf\xbf\xbf"), + 0x40000 => (isASCII) + ? "\xf1\x80\x80\x80" + : I8_to_native("\xf8\xa8\xa0\xa0\xa0"), + 0x4FFFD => (isASCII) + ? "\xf1\x8f\xbf\xbd" + : I8_to_native("\xf8\xa9\xbf\xbf\xbd"), + 0x4FFFE => (isASCII) + ? "\xf1\x8f\xbf\xbe" + : I8_to_native("\xf8\xa9\xbf\xbf\xbe"), + 0x4FFFF => (isASCII) + ? "\xf1\x8f\xbf\xbf" + : I8_to_native("\xf8\xa9\xbf\xbf\xbf"), + 0x50000 => (isASCII) + ? "\xf1\x90\x80\x80" + : I8_to_native("\xf8\xaa\xa0\xa0\xa0"), + 0x5FFFD => (isASCII) + ? "\xf1\x9f\xbf\xbd" + : I8_to_native("\xf8\xab\xbf\xbf\xbd"), + 0x5FFFE => (isASCII) + ? "\xf1\x9f\xbf\xbe" + : I8_to_native("\xf8\xab\xbf\xbf\xbe"), + 0x5FFFF => (isASCII) + ? "\xf1\x9f\xbf\xbf" + : I8_to_native("\xf8\xab\xbf\xbf\xbf"), + 0x60000 => (isASCII) + ? "\xf1\xa0\x80\x80" + : I8_to_native("\xf8\xac\xa0\xa0\xa0"), + 0x6FFFD => (isASCII) + ? "\xf1\xaf\xbf\xbd" + : I8_to_native("\xf8\xad\xbf\xbf\xbd"), + 0x6FFFE => (isASCII) + ? "\xf1\xaf\xbf\xbe" + : I8_to_native("\xf8\xad\xbf\xbf\xbe"), + 0x6FFFF => (isASCII) + ? "\xf1\xaf\xbf\xbf" + : I8_to_native("\xf8\xad\xbf\xbf\xbf"), + 0x70000 => (isASCII) + ? "\xf1\xb0\x80\x80" + : I8_to_native("\xf8\xae\xa0\xa0\xa0"), + 0x7FFFD => (isASCII) + ? "\xf1\xbf\xbf\xbd" + : I8_to_native("\xf8\xaf\xbf\xbf\xbd"), + 0x7FFFE => (isASCII) + ? "\xf1\xbf\xbf\xbe" + : I8_to_native("\xf8\xaf\xbf\xbf\xbe"), + 0x7FFFF => (isASCII) + ? "\xf1\xbf\xbf\xbf" + : I8_to_native("\xf8\xaf\xbf\xbf\xbf"), + 0x80000 => (isASCII) + ? "\xf2\x80\x80\x80" + : I8_to_native("\xf8\xb0\xa0\xa0\xa0"), + 0x8FFFD => (isASCII) + ? "\xf2\x8f\xbf\xbd" + : I8_to_native("\xf8\xb1\xbf\xbf\xbd"), + 0x8FFFE => (isASCII) + ? "\xf2\x8f\xbf\xbe" + : I8_to_native("\xf8\xb1\xbf\xbf\xbe"), + 0x8FFFF => (isASCII) + ? "\xf2\x8f\xbf\xbf" + : I8_to_native("\xf8\xb1\xbf\xbf\xbf"), + 0x90000 => (isASCII) + ? "\xf2\x90\x80\x80" + : I8_to_native("\xf8\xb2\xa0\xa0\xa0"), + 0x9FFFD => (isASCII) + ? "\xf2\x9f\xbf\xbd" + : I8_to_native("\xf8\xb3\xbf\xbf\xbd"), + 0x9FFFE => (isASCII) + ? "\xf2\x9f\xbf\xbe" + : I8_to_native("\xf8\xb3\xbf\xbf\xbe"), + 0x9FFFF => (isASCII) + ? "\xf2\x9f\xbf\xbf" + : I8_to_native("\xf8\xb3\xbf\xbf\xbf"), + 0xA0000 => (isASCII) + ? "\xf2\xa0\x80\x80" + : I8_to_native("\xf8\xb4\xa0\xa0\xa0"), + 0xAFFFD => (isASCII) + ? "\xf2\xaf\xbf\xbd" + : I8_to_native("\xf8\xb5\xbf\xbf\xbd"), + 0xAFFFE => (isASCII) + ? "\xf2\xaf\xbf\xbe" + : I8_to_native("\xf8\xb5\xbf\xbf\xbe"), + 0xAFFFF => (isASCII) + ? "\xf2\xaf\xbf\xbf" + : I8_to_native("\xf8\xb5\xbf\xbf\xbf"), + 0xB0000 => (isASCII) + ? "\xf2\xb0\x80\x80" + : I8_to_native("\xf8\xb6\xa0\xa0\xa0"), + 0xBFFFD => (isASCII) + ? "\xf2\xbf\xbf\xbd" + : I8_to_native("\xf8\xb7\xbf\xbf\xbd"), + 0xBFFFE => (isASCII) + ? "\xf2\xbf\xbf\xbe" + : I8_to_native("\xf8\xb7\xbf\xbf\xbe"), + 0xBFFFF => (isASCII) + ? "\xf2\xbf\xbf\xbf" + : I8_to_native("\xf8\xb7\xbf\xbf\xbf"), + 0xC0000 => (isASCII) + ? "\xf3\x80\x80\x80" + : I8_to_native("\xf8\xb8\xa0\xa0\xa0"), + 0xCFFFD => (isASCII) + ? "\xf3\x8f\xbf\xbd" + : I8_to_native("\xf8\xb9\xbf\xbf\xbd"), + 0xCFFFE => (isASCII) + ? "\xf3\x8f\xbf\xbe" + : I8_to_native("\xf8\xb9\xbf\xbf\xbe"), + 0xCFFFF => (isASCII) + ? "\xf3\x8f\xbf\xbf" + : I8_to_native("\xf8\xb9\xbf\xbf\xbf"), + 0xD0000 => (isASCII) + ? "\xf3\x90\x80\x80" + : I8_to_native("\xf8\xba\xa0\xa0\xa0"), + 0xDFFFD => (isASCII) + ? "\xf3\x9f\xbf\xbd" + : I8_to_native("\xf8\xbb\xbf\xbf\xbd"), + 0xDFFFE => (isASCII) + ? "\xf3\x9f\xbf\xbe" + : I8_to_native("\xf8\xbb\xbf\xbf\xbe"), + 0xDFFFF => (isASCII) + ? "\xf3\x9f\xbf\xbf" + : I8_to_native("\xf8\xbb\xbf\xbf\xbf"), + 0xE0000 => (isASCII) + ? "\xf3\xa0\x80\x80" + : I8_to_native("\xf8\xbc\xa0\xa0\xa0"), + 0xEFFFD => (isASCII) + ? "\xf3\xaf\xbf\xbd" + : I8_to_native("\xf8\xbd\xbf\xbf\xbd"), + 0xEFFFE => (isASCII) + ? "\xf3\xaf\xbf\xbe" + : I8_to_native("\xf8\xbd\xbf\xbf\xbe"), + 0xEFFFF => (isASCII) + ? "\xf3\xaf\xbf\xbf" + : I8_to_native("\xf8\xbd\xbf\xbf\xbf"), + 0xF0000 => (isASCII) + ? "\xf3\xb0\x80\x80" + : I8_to_native("\xf8\xbe\xa0\xa0\xa0"), + 0xFFFFD => (isASCII) + ? "\xf3\xbf\xbf\xbd" + : I8_to_native("\xf8\xbf\xbf\xbf\xbd"), + 0xFFFFE => (isASCII) + ? "\xf3\xbf\xbf\xbe" + : I8_to_native("\xf8\xbf\xbf\xbf\xbe"), + 0xFFFFF => (isASCII) + ? "\xf3\xbf\xbf\xbf" + : I8_to_native("\xf8\xbf\xbf\xbf\xbf"), + 0x100000 => (isASCII) + ? "\xf4\x80\x80\x80" + : I8_to_native("\xf9\xa0\xa0\xa0\xa0"), + 0x10FFFD => (isASCII) + ? "\xf4\x8f\xbf\xbd" + : I8_to_native("\xf9\xa1\xbf\xbf\xbd"), + 0x10FFFE => (isASCII) + ? "\xf4\x8f\xbf\xbe" + : I8_to_native("\xf9\xa1\xbf\xbf\xbe"), + 0x10FFFF => (isASCII) + ? "\xf4\x8f\xbf\xbf" + : I8_to_native("\xf9\xa1\xbf\xbf\xbf"), + 0x110000 => (isASCII) + ? "\xf4\x90\x80\x80" + : I8_to_native("\xf9\xa2\xa0\xa0\xa0"), # Things that would be noncharacters if they were in Unicode, and might be # mistaken, if the C code is bad, to be nonchars - 0x11FFFE => (isASCII) ? "\xf4\x9f\xbf\xbe" : I8_to_native("\xf9\xa3\xbf\xbf\xbe"), - 0x11FFFF => (isASCII) ? "\xf4\x9f\xbf\xbf" : I8_to_native("\xf9\xa3\xbf\xbf\xbf"), - 0x20FFFE => (isASCII) ? "\xf8\x88\x8f\xbf\xbe" : I8_to_native("\xfa\xa1\xbf\xbf\xbe"), - 0x20FFFF => (isASCII) ? "\xf8\x88\x8f\xbf\xbf" : I8_to_native("\xfa\xa1\xbf\xbf\xbf"), - - 0x200000 - 1 => (isASCII) ? "\xf7\xbf\xbf\xbf" : I8_to_native("\xf9\xbf\xbf\xbf\xbf"), - 0x200000 => (isASCII) ? "\xf8\x88\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"), - 0x400000 - 1 => (isASCII) ? "\xf8\x8f\xbf\xbf\xbf" : I8_to_native("\xfb\xbf\xbf\xbf\xbf"), - 0x400000 => (isASCII) ? "\xf8\x90\x80\x80\x80" : I8_to_native("\xfc\xa4\xa0\xa0\xa0\xa0"), - 0x4000000 - 1 => (isASCII) ? "\xfb\xbf\xbf\xbf\xbf" : I8_to_native("\xfd\xbf\xbf\xbf\xbf\xbf"), - 0x4000000 => (isASCII) ? "\xfc\x84\x80\x80\x80\x80" : I8_to_native("\xfe\xa2\xa0\xa0\xa0\xa0\xa0"), - 0x4000000 - 1 => (isASCII) ? "\xfb\xbf\xbf\xbf\xbf" : I8_to_native("\xfd\xbf\xbf\xbf\xbf\xbf"), - 0x4000000 => (isASCII) ? "\xfc\x84\x80\x80\x80\x80" : I8_to_native("\xfe\xa2\xa0\xa0\xa0\xa0\xa0"), - 0x40000000 - 1 => (isASCII) ? "\xfc\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xfe\xbf\xbf\xbf\xbf\xbf\xbf"), - 0x40000000 => (isASCII) ? "\xfd\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0"), - 0x80000000 - 1 => (isASCII) ? "\xfd\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf"), - 0x80000000 => (isASCII) ? "\xfe\x82\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"), - 0xFFFFFFFF => (isASCII) ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"), + 0x11FFFE => (isASCII) + ? "\xf4\x9f\xbf\xbe" + : I8_to_native("\xf9\xa3\xbf\xbf\xbe"), + 0x11FFFF => (isASCII) + ? "\xf4\x9f\xbf\xbf" + : I8_to_native("\xf9\xa3\xbf\xbf\xbf"), + 0x20FFFE => (isASCII) + ? "\xf8\x88\x8f\xbf\xbe" + : I8_to_native("\xfa\xa1\xbf\xbf\xbe"), + 0x20FFFF => (isASCII) + ? "\xf8\x88\x8f\xbf\xbf" + : I8_to_native("\xfa\xa1\xbf\xbf\xbf"), + + 0x200000 - 1 => (isASCII) + ? "\xf7\xbf\xbf\xbf" + : I8_to_native("\xf9\xbf\xbf\xbf\xbf"), + 0x200000 => (isASCII) + ? "\xf8\x88\x80\x80\x80" + : I8_to_native("\xfa\xa0\xa0\xa0\xa0"), + 0x400000 - 1 => (isASCII) + ? "\xf8\x8f\xbf\xbf\xbf" + : I8_to_native("\xfb\xbf\xbf\xbf\xbf"), + 0x400000 => (isASCII) + ? "\xf8\x90\x80\x80\x80" + : I8_to_native("\xfc\xa4\xa0\xa0\xa0\xa0"), + 0x4000000 - 1 => (isASCII) + ? "\xfb\xbf\xbf\xbf\xbf" + : I8_to_native("\xfd\xbf\xbf\xbf\xbf\xbf"), + 0x4000000 => (isASCII) + ? "\xfc\x84\x80\x80\x80\x80" + : I8_to_native("\xfe\xa2\xa0\xa0\xa0\xa0\xa0"), + 0x4000000 - 1 => (isASCII) + ? "\xfb\xbf\xbf\xbf\xbf" + : I8_to_native("\xfd\xbf\xbf\xbf\xbf\xbf"), + 0x4000000 => (isASCII) + ? "\xfc\x84\x80\x80\x80\x80" + : I8_to_native("\xfe\xa2\xa0\xa0\xa0\xa0\xa0"), + 0x40000000 - 1 => (isASCII) + ? "\xfc\xbf\xbf\xbf\xbf\xbf" + : I8_to_native("\xfe\xbf\xbf\xbf\xbf\xbf\xbf"), + 0x40000000 => + (isASCII) ? "\xfd\x80\x80\x80\x80\x80" + : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0"), + 0x80000000 - 1 => + (isASCII) ? "\xfd\xbf\xbf\xbf\xbf\xbf" + : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf"), + 0x80000000 => + (isASCII) ? "\xfe\x82\x80\x80\x80\x80\x80" + : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"), + 0xFFFFFFFF => + (isASCII) ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf" + : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"), ); -if ($is64bit) { +if ($::is64bit) { no warnings qw(overflow portable); - $code_points{0x100000000} = (isASCII) ? "\xfe\x84\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"); - $code_points{0x1000000000 - 1} = (isASCII) ? "\xfe\xbf\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf\xbf"); - $code_points{0x1000000000} = (isASCII) ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"); - $code_points{0xFFFFFFFFFFFFFFFF} = (isASCII) ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"); + $code_points{0x100000000} + = (isASCII) + ? "\xfe\x84\x80\x80\x80\x80\x80" + : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"); + $code_points{0x1000000000 - 1} + = (isASCII) + ? "\xfe\xbf\xbf\xbf\xbf\xbf\xbf" + : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf\xbf"); + $code_points{0x1000000000} + = (isASCII) + ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80" + : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"); + $code_points{0xFFFFFFFFFFFFFFFF} + = (isASCII) + ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf" + : I8_to_native("\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"); + if (isASCII) { # These could falsely show as overlongs in a naive + # implementation + $code_points{0x40000000000} + = "\xff\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80\x80"; + $code_points{0x1000000000000} + = "\xff\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80\x80\x80"; + $code_points{0x40000000000000} + = "\xff\x80\x80\x81\x80\x80\x80\x80\x80\x80\x80\x80\x80"; + $code_points{0x1000000000000000} + = "\xff\x80\x81\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"; + # overflows + #$code_points{0xfoo} + # = "\xff\x81\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"; + } +} +elsif (! isASCII) { # 32-bit EBCDIC. 64-bit is clearer to handle, so doesn't + # need this test case + no warnings qw(overflow portable); + $code_points{0x40000000} = I8_to_native( + "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0"); } # Now add in entries for each of code points 0-255, which require special @@ -235,18 +449,17 @@ while ($cp < ((isASCII) ? 128 : 160)) { # This is from the definition of # continuation bytes can be in, and what the lowest start byte can be. So we # cycle through them. -my $first_continuation = (isASCII) ? 0x80 : 0xA0; -my $final_continuation = 0xBF; +my $highest_continuation = 0xBF; my $start = (isASCII) ? 0xC2 : 0xC5; -my $continuation = $first_continuation - 1; +my $continuation = $::lowest_continuation - 1; while ($cp < 255) { - if (++$continuation > $final_continuation) { + if (++$continuation > $highest_continuation) { # Wrap to the next start byte when we reach the final continuation # byte possible - $continuation = $first_continuation; + $continuation = $::lowest_continuation; $start++; } $code_points{$cp} = I8_to_native(chr($start) . chr($continuation)); @@ -259,9 +472,16 @@ my @warnings; use warnings 'utf8'; local $SIG{__WARN__} = sub { push @warnings, @_ }; -# This set of tests looks for basic sanity, and lastly tests the bottom level -# decode routine for the given code point. If the earlier tests for that code -# point fail, that one probably will too. Malformations are tested in later +my %restriction_types; + +$restriction_types{""}{'valid_strings'} = ""; +$restriction_types{"c9strict"}{'valid_strings'} = ""; +$restriction_types{"strict"}{'valid_strings'} = ""; +$restriction_types{"fits_in_31_bits"}{'valid_strings'} = ""; + +# This set of tests looks for basic sanity, and lastly tests various routines +# for the given code point. If the earlier tests for that code point fail, +# the later ones probably will too. Malformations are tested in later # segments of code. for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) } keys %code_points) @@ -280,8 +500,8 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) } $u < 0x10000 ? 3 : $u < 0x200000 ? 4 : $u < 0x4000000 ? 5 : - $u < 0x80000000 ? 6 : (($is64bit) - ? ($u < 0x1000000000 ? 7 : 13) + $u < 0x80000000 ? 6 : (($::is64bit) + ? ($u < 0x1000000000 ? 7 : $::max_bytes) : 7) ) : ($u < 0xA0 ? 1 : @@ -290,7 +510,7 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) } $u < 0x40000 ? 4 : $u < 0x400000 ? 5 : $u < 0x4000000 ? 6 : - $u < 0x40000000 ? 7 : 14 ); + $u < 0x40000000 ? 7 : $::max_bytes ); } # If this test fails, subsequent ones are meaningless. @@ -314,7 +534,30 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) } "Verify UTF8_SKIP(chr $hex_n) is $uvchr_skip_should_be"); use bytes; - for (my $j = 0; $j < length $n_chr; $j++) { + my $byte_length = length $n_chr; + for (my $j = 0; $j < $byte_length; $j++) { + undef @warnings; + + if ($j == $byte_length - 1) { + my $ret + = test_is_utf8_valid_partial_char_flags($n_chr, $byte_length, 0); + is($ret, 0, " Verify is_utf8_valid_partial_char_flags(" + . display_bytes($n_chr) + . ") returns 0 for full character"); + } + else { + my $bytes_so_far = substr($n_chr, 0, $j + 1); + my $ret + = test_is_utf8_valid_partial_char_flags($bytes_so_far, $j + 1, 0); + is($ret, 1, " Verify is_utf8_valid_partial_char_flags(" + . display_bytes($bytes_so_far) + . ") returns 1"); + } + + is(scalar @warnings, 0, " Verify is_utf8_valid_partial_char_flags" + . " generated no warnings") + or output_warnings(@warnings); + my $b = substr($n_chr, $j, 1); my $hex_b = sprintf("\"\\x%02x\"", ord $b); @@ -374,760 +617,463 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) } # later section of the code tests for these kinds of things. my $this_utf8_flags = $look_for_everything_utf8n_to; my $len = length $bytes; - if ($n > 2 ** 31 - 1) { - $this_utf8_flags &= - ~($UTF8_DISALLOW_ABOVE_31_BIT|$UTF8_WARN_ABOVE_31_BIT); - } + + my $valid_under_strict = 1; + my $valid_under_c9strict = 1; + my $valid_for_fits_in_31_bits = 1; if ($n > 0x10FFFF) { - $this_utf8_flags &= ~($UTF8_DISALLOW_SUPER|$UTF8_WARN_SUPER); + $this_utf8_flags &= ~($::UTF8_DISALLOW_SUPER|$::UTF8_WARN_SUPER); + $valid_under_strict = 0; + $valid_under_c9strict = 0; + if ($n > 2 ** 31 - 1) { + $this_utf8_flags &= + ~($::UTF8_DISALLOW_ABOVE_31_BIT|$::UTF8_WARN_ABOVE_31_BIT); + $valid_for_fits_in_31_bits = 0; + } } - elsif (($n & 0xFFFE) == 0xFFFE) { - $this_utf8_flags &= ~($UTF8_DISALLOW_NONCHAR|$UTF8_WARN_NONCHAR); + elsif (($n >= 0xFDD0 && $n <= 0xFDEF) || ($n & 0xFFFE) == 0xFFFE) { + $this_utf8_flags &= ~($::UTF8_DISALLOW_NONCHAR|$::UTF8_WARN_NONCHAR); + $valid_under_strict = 0; + } + elsif ($n >= 0xD800 && $n <= 0xDFFF) { + $this_utf8_flags &= ~($::UTF8_DISALLOW_SURROGATE|$::UTF8_WARN_SURROGATE); + $valid_under_c9strict = 0; + $valid_under_strict = 0; } undef @warnings; my $display_flags = sprintf "0x%x", $this_utf8_flags; - my $ret_ref = test_utf8n_to_uvchr($bytes, $len, $this_utf8_flags); my $display_bytes = display_bytes($bytes); - is($ret_ref->[0], $n, "Verify utf8n_to_uvchr($display_bytes, $display_flags) returns $hex_n"); - is($ret_ref->[1], $len, "Verify utf8n_to_uvchr() for $hex_n returns expected length"); + my $ret_ref = test_utf8n_to_uvchr_error($bytes, $len, $this_utf8_flags); + + # Rest of tests likely meaningless if it gets the wrong code point. + next unless is($ret_ref->[0], $n, + "Verify utf8n_to_uvchr_error($display_bytes, $display_flags)" + . "returns $hex_n"); + is($ret_ref->[1], $len, + "Verify utf8n_to_uvchr_error() for $hex_n returns expected length:" + . " $len"); unless (is(scalar @warnings, 0, - "Verify utf8n_to_uvchr() for $hex_n generated no warnings")) + "Verify utf8n_to_uvchr_error() for $hex_n generated no warnings")) { - diag "The warnings were: " . join(", ", @warnings); + output_warnings(@warnings); } - - # Similarly for uvchr_to_utf8 - my $this_uvchr_flags = $look_for_everything_uvchr_to; - if ($n > 2 ** 31 - 1) { - $this_uvchr_flags &= - ~($UNICODE_DISALLOW_ABOVE_31_BIT|$UNICODE_WARN_ABOVE_31_BIT); - } - if ($n > 0x10FFFF) { - $this_uvchr_flags &= ~($UNICODE_DISALLOW_SUPER|$UNICODE_WARN_SUPER); - } - elsif (($n & 0xFFFE) == 0xFFFE) { - $this_uvchr_flags &= ~($UNICODE_DISALLOW_NONCHAR|$UNICODE_WARN_NONCHAR); - } - $display_flags = sprintf "0x%x", $this_uvchr_flags; + is($ret_ref->[2], 0, + "Verify utf8n_to_uvchr_error() returned no error bits"); undef @warnings; - my $ret = test_uvchr_to_utf8_flags($n, $this_uvchr_flags); - ok(defined $ret, "Verify uvchr_to_utf8_flags($hex_n, $display_flags) returned success"); - is($ret, $bytes, "Verify uvchr_to_utf8_flags($hex_n, $display_flags) returns correct bytes"); + my $ret = test_isUTF8_CHAR($bytes, $len); + is($ret, $len, + "Verify isUTF8_CHAR($display_bytes) returns expected length: $len"); unless (is(scalar @warnings, 0, - "Verify uvchr_to_utf8_flags($hex_n, $display_flags) for $hex_n generated no warnings")) + "Verify isUTF8_CHAR() for $hex_n generated no warnings")) { - diag "The warnings were: " . join(", ", @warnings); + output_warnings(@warnings); } -} -my $REPLACEMENT = 0xFFFD; - -# Now test the malformations. All these raise category utf8 warnings. -my $c = (isASCII) ? "\x80" : "\xa0"; # A continuation byte -my @malformations = ( - [ "zero length string malformation", "", 0, - $UTF8_ALLOW_EMPTY, 0, 0, - qr/empty string/ - ], - [ "orphan continuation byte malformation", I8_to_native("${c}a"), - 2, - $UTF8_ALLOW_CONTINUATION, $REPLACEMENT, 1, - qr/unexpected continuation byte/ - ], - [ "premature next character malformation (immediate)", - (isASCII) ? "\xc2a" : I8_to_native("\xc5") ."a", - 2, - $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 1, - qr/unexpected non-continuation byte.*immediately after start byte/ - ], - [ "premature next character malformation (non-immediate)", - I8_to_native("\xf0${c}a"), - 3, - $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 2, - qr/unexpected non-continuation byte .* 2 bytes after start byte/ - ], - [ "too short malformation", I8_to_native("\xf0${c}a"), 2, - # Having the 'a' after this, but saying there are only 2 bytes also - # tests that we pay attention to the passed in length - $UTF8_ALLOW_SHORT, $REPLACEMENT, 2, - qr/2 bytes, need 4/ - ], - [ "overlong malformation", I8_to_native("\xc0$c"), 2, - $UTF8_ALLOW_LONG, - 0, # NUL - 2, - qr/2 bytes, need 1/ - ], - [ "overflow malformation", - # These are the smallest overflowing on 64 byte machines: - # 2**64 - (isASCII) ? "\xff\x80\x90\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0" - : I8_to_native("\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), - (isASCII) ? 13 : 14, - 0, # There is no way to allow this malformation - $REPLACEMENT, - (isASCII) ? 13 : 14, - qr/overflow/ - ], -); + undef @warnings; -foreach my $test (@malformations) { - my ($testname, $bytes, $length, $allow_flags, $allowed_uv, $expected_len, $message ) = @$test; + $ret = test_isUTF8_CHAR($bytes, $len - 1); + is($ret, 0, + "Verify isUTF8_CHAR() with too short length parameter returns 0"); - next if ! ok(length($bytes) >= $length, "$testname: Make sure won't read beyond buffer: " . length($bytes) . " >= $length"); + is(scalar @warnings, 0, "Verify isUTF8_CHAR() generated no warnings") + or output_warnings(@warnings); - # Test what happens when this malformation is not allowed undef @warnings; - my $ret_ref = test_utf8n_to_uvchr($bytes, $length, 0); - is($ret_ref->[0], 0, "$testname: disallowed: Returns 0"); - is($ret_ref->[1], $expected_len, "$testname: disallowed: Returns expected length"); - if (is(scalar @warnings, 1, "$testname: disallowed: Got a single warning ")) { - like($warnings[0], $message, "$testname: disallowed: Got expected warning"); + + $ret = test_isUTF8_CHAR_flags($bytes, $len, 0); + is($ret, $len, "Verify isUTF8_CHAR_flags($display_bytes, 0)" + . " returns expected length: $len"); + + is(scalar @warnings, 0, + "Verify isUTF8_CHAR_flags() for $hex_n generated no warnings") + or output_warnings(@warnings); + + undef @warnings; + + $ret = test_isUTF8_CHAR_flags($bytes, $len - 1, 0); + is($ret, 0, + "Verify isUTF8_CHAR_flags() with too short length parameter returns 0"); + + is(scalar @warnings, 0, "Verify isUTF8_CHAR_flags() generated no warnings") + or output_warnings(@warnings); + + undef @warnings; + + $ret = test_isSTRICT_UTF8_CHAR($bytes, $len); + my $expected_len = ($valid_under_strict) ? $len : 0; + is($ret, $expected_len, "Verify isSTRICT_UTF8_CHAR($display_bytes)" + . " returns expected length: $expected_len"); + + is(scalar @warnings, 0, + "Verify isSTRICT_UTF8_CHAR() for $hex_n generated no warnings") + or output_warnings(@warnings); + + undef @warnings; + + $ret = test_isSTRICT_UTF8_CHAR($bytes, $len - 1); + is($ret, 0, + "Verify isSTRICT_UTF8_CHAR() with too short length parameter returns 0"); + + is(scalar @warnings, 0, "Verify isSTRICT_UTF8_CHAR() generated no warnings") + or output_warnings(@warnings); + + undef @warnings; + + $ret = test_isUTF8_CHAR_flags($bytes, $len, + $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE); + is($ret, $expected_len, + "Verify isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')" + . " acts like isSTRICT_UTF8_CHAR"); + + is(scalar @warnings, 0, + "Verify isUTF8_CHAR() for $hex_n generated no warnings") + or output_warnings(@warnings); + + undef @warnings; + + $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $len); + $expected_len = ($valid_under_c9strict) ? $len : 0; + is($ret, $expected_len, "Verify isC9_STRICT_UTF8_CHAR($display_bytes)" + . " returns expected length: $len"); + + is(scalar @warnings, 0, + "Verify isC9_STRICT_UTF8_CHAR() for $hex_n generated no warnings") + or output_warnings(@warnings); + + undef @warnings; + + $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $len - 1); + is($ret, 0, + "Verify isC9_STRICT_UTF8_CHAR() with too short length parameter returns 0"); + + is(scalar @warnings, 0, + "Verify isC9_STRICT_UTF8_CHAR() generated no warnings") + or output_warnings(@warnings); + + undef @warnings; + + $ret = test_isUTF8_CHAR_flags($bytes, $len, + $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE); + is($ret, $expected_len, + "Verify isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')" + ." acts like isC9_STRICT_UTF8_CHAR"); + + is(scalar @warnings, 0, + "Verify isUTF8_CHAR() for $hex_n generated no warnings") + or output_warnings(@warnings); + + undef @warnings; + + $ret_ref = test_valid_utf8_to_uvchr($bytes); + is($ret_ref->[0], $n, + "Verify valid_utf8_to_uvchr($display_bytes) returns $hex_n"); + is($ret_ref->[1], $len, + "Verify valid_utf8_to_uvchr() for $hex_n returns expected length: $len"); + + is(scalar @warnings, 0, + "Verify valid_utf8_to_uvchr() for $hex_n generated no warnings") + or output_warnings(@warnings); + + # Similarly for uvchr_to_utf8 + my $this_uvchr_flags = $look_for_everything_uvchr_to; + if ($n > 2 ** 31 - 1) { + $this_uvchr_flags &= + ~($::UNICODE_DISALLOW_ABOVE_31_BIT|$::UNICODE_WARN_ABOVE_31_BIT); } - else { - if (scalar @warnings) { - diag "The warnings were: " . join(", ", @warnings); - } + if ($n > 0x10FFFF) { + $this_uvchr_flags &= ~($::UNICODE_DISALLOW_SUPER|$::UNICODE_WARN_SUPER); } - - { # Next test when disallowed, and warnings are off. - undef @warnings; - no warnings 'utf8'; - my $ret_ref = test_utf8n_to_uvchr($bytes, $length, 0); - is($ret_ref->[0], 0, "$testname: disallowed: no warnings 'utf8': Returns 0"); - is($ret_ref->[1], $expected_len, "$testname: disallowed: no warnings 'utf8': Returns expected length"); - if (!is(scalar @warnings, 0, "$testname: disallowed: no warnings 'utf8': no warnings generated")) { - diag "The warnings were: " . join(", ", @warnings); - } + elsif (($n >= 0xFDD0 && $n <= 0xFDEF) || ($n & 0xFFFE) == 0xFFFE) { + $this_uvchr_flags &= ~($::UNICODE_DISALLOW_NONCHAR|$::UNICODE_WARN_NONCHAR); } + elsif ($n >= 0xD800 && $n <= 0xDFFF) { + $this_uvchr_flags + &= ~($::UNICODE_DISALLOW_SURROGATE|$::UNICODE_WARN_SURROGATE); + } + $display_flags = sprintf "0x%x", $this_uvchr_flags; - # Test with CHECK_ONLY undef @warnings; - $ret_ref = test_utf8n_to_uvchr($bytes, $length, $UTF8_CHECK_ONLY); - is($ret_ref->[0], 0, "$testname: CHECK_ONLY: Returns 0"); - is($ret_ref->[1], -1, "$testname: CHECK_ONLY: returns expected length"); - if (! is(scalar @warnings, 0, "$testname: CHECK_ONLY: no warnings generated")) { - diag "The warnings were: " . join(", ", @warnings); + + $ret = test_uvchr_to_utf8_flags($n, $this_uvchr_flags); + ok(defined $ret, + "Verify uvchr_to_utf8_flags($hex_n, $display_flags) returned success"); + is($ret, $bytes, + "Verify uvchr_to_utf8_flags($hex_n, $display_flags) returns correct bytes"); + + is(scalar @warnings, 0, + "Verify uvchr_to_utf8_flags($hex_n, $display_flags) for $hex_n" + . " generated no warnings") + or output_warnings(@warnings); + + # Now append this code point to a string that we will test various + # versions of is_foo_utf8_string_bar on, and keep a count of how many code + # points are in it. All the code points in this loop are valid in Perl's + # extended UTF-8, but some are not valid under various restrictions. A + # string and count is kept separately that is entirely valid for each + # restriction. And, for each restriction, we note the first occurrence in + # the unrestricted string where we find something not in the restricted + # string. + $restriction_types{""}{'valid_strings'} .= $bytes; + $restriction_types{""}{'valid_counts'}++; + + if ($valid_under_c9strict) { + $restriction_types{"c9strict"}{'valid_strings'} .= $bytes; + $restriction_types{"c9strict"}{'valid_counts'}++; + } + elsif (! exists $restriction_types{"c9strict"}{'first_invalid_offset'}) { + $restriction_types{"c9strict"}{'first_invalid_offset'} + = length $restriction_types{"c9strict"}{'valid_strings'}; + $restriction_types{"c9strict"}{'first_invalid_count'} + = $restriction_types{"c9strict"}{'valid_counts'}; } - next if $allow_flags == 0; # Skip if can't allow this malformation + if ($valid_under_strict) { + $restriction_types{"strict"}{'valid_strings'} .= $bytes; + $restriction_types{"strict"}{'valid_counts'}++; + } + elsif (! exists $restriction_types{"strict"}{'first_invalid_offset'}) { + $restriction_types{"strict"}{'first_invalid_offset'} + = length $restriction_types{"strict"}{'valid_strings'}; + $restriction_types{"strict"}{'first_invalid_count'} + = $restriction_types{"strict"}{'valid_counts'}; + } - # Test when the malformation is allowed - undef @warnings; - $ret_ref = test_utf8n_to_uvchr($bytes, $length, $allow_flags); - is($ret_ref->[0], $allowed_uv, "$testname: allowed: Returns expected uv"); - is($ret_ref->[1], $expected_len, "$testname: allowed: Returns expected length"); - if (!is(scalar @warnings, 0, "$testname: allowed: no warnings generated")) + if ($valid_for_fits_in_31_bits) { + $restriction_types{"fits_in_31_bits"}{'valid_strings'} .= $bytes; + $restriction_types{"fits_in_31_bits"}{'valid_counts'}++; + } + elsif (! exists + $restriction_types{"fits_in_31_bits"}{'first_invalid_offset'}) { - diag "The warnings were: " . join(", ", @warnings); + $restriction_types{"fits_in_31_bits"}{'first_invalid_offset'} + = length $restriction_types{"fits_in_31_bits"}{'valid_strings'}; + $restriction_types{"fits_in_31_bits"}{'first_invalid_count'} + = $restriction_types{"fits_in_31_bits"}{'valid_counts'}; } } -# Now test the cases where a legal code point is generated, but may or may not -# be allowed/warned on. -my @tests = ( - [ "lowest surrogate", - (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"), - $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE, - 'surrogate', 0xD800, - (isASCII) ? 3 : 4, - qr/surrogate/ - ], - [ "a middle surrogate", - (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"), - $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE, - 'surrogate', 0xD90D, - (isASCII) ? 3 : 4, - qr/surrogate/ - ], - [ "highest surrogate", - (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"), - $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE, - 'surrogate', 0xDFFF, - (isASCII) ? 3 : 4, - qr/surrogate/ - ], - [ "first non_unicode", - (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"), - $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER, - 'non_unicode', 0x110000, - (isASCII) ? 4 : 5, - qr/not Unicode.* may not be portable/ - ], - [ "first of 32 consecutive non-character code points", - (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0xFDD0, - (isASCII) ? 3 : 4, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "a mid non-character code point of the 32 consecutive ones", - (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0xFDE0, - (isASCII) ? 3 : 4, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "final of 32 consecutive non-character code points", - (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0xFDEF, - (isASCII) ? 3 : 4, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+FFFE", - (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0xFFFE, - (isASCII) ? 3 : 4, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+FFFF", - (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0xFFFF, - (isASCII) ? 3 : 4, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+1FFFE", - (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0x1FFFE, 4, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+1FFFF", - (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0x1FFFF, 4, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+2FFFE", - (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0x2FFFE, 4, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+2FFFF", - (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0x2FFFF, 4, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+3FFFE", - (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0x3FFFE, 4, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+3FFFF", - (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0x3FFFF, 4, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+4FFFE", - (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0x4FFFE, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+4FFFF", - (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0x4FFFF, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+5FFFE", - (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0x5FFFE, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+5FFFF", - (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0x5FFFF, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+6FFFE", - (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0x6FFFE, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+6FFFF", - (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0x6FFFF, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+7FFFE", - (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0x7FFFE, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+7FFFF", - (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0x7FFFF, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+8FFFE", - (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0x8FFFE, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+8FFFF", - (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0x8FFFF, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+9FFFE", - (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0x9FFFE, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+9FFFF", - (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0x9FFFF, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+AFFFE", - (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0xAFFFE, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+AFFFF", - (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0xAFFFF, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+BFFFE", - (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0xBFFFE, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+BFFFF", - (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0xBFFFF, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+CFFFE", - (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0xCFFFE, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+CFFFF", - (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0xCFFFF, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+DFFFE", - (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0xDFFFE, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+DFFFF", - (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0xDFFFF, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+EFFFE", - (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0xEFFFE, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+EFFFF", - (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0xEFFFF, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+FFFFE", - (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0xFFFFE, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+FFFFF", - (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0xFFFFF, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+10FFFE", - (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0x10FFFE, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "non-character code point U+10FFFF", - (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"), - $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, - 'nonchar', 0x10FFFF, - (isASCII) ? 4 : 5, - qr/Unicode non-character.*is not recommended for open interchange/ - ], - [ "requires at least 32 bits", - (isASCII) - ? "\xfe\x82\x80\x80\x80\x80\x80" - : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"), - # This code point is chosen so that it is representable in a UV on - # 32-bit machines - $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT, - 'utf8', 0x80000000, (isASCII) ? 7 :14, - qr/Code point 0x80000000 is not Unicode, and not portable/ - ], - [ "requires at least 32 bits, and use SUPER-type flags, instead of ABOVE_31_BIT", - (isASCII) - ? "\xfe\x82\x80\x80\x80\x80\x80" - : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"), - $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER, - 'utf8', 0x80000000, (isASCII) ? 7 :14, - qr/Code point 0x80000000 is not Unicode, and not portable/ - ], - [ "overflow with warnings/disallow for more than 31 bits", - # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT - # with overflow. The overflow malformation is never allowed, so - # preventing it takes precedence if the ABOVE_31_BIT options would - # otherwise allow in an overflowing value. The ASCII code points (1 - # for 32-bits; 1 for 64) were chosen because the old overflow - # detection algorithm did not catch them; this means this test also - # checks for that fix. The EBCDIC are arbitrary overflowing ones - # since we have no reports of failures with it. - (($is64bit) - ? ((isASCII) - ? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf" - : I8_to_native("\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0")) - : ((isASCII) - ? "\xfe\x86\x80\x80\x80\x80\x80" - : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))), - - # We include both warning categories to make sure the ABOVE_31_BIT one - # has precedence - "$UTF8_WARN_ABOVE_31_BIT|$UTF8_WARN_SUPER", - "$UTF8_DISALLOW_ABOVE_31_BIT", - 'utf8', 0, - (! isASCII) ? 14 : ($is64bit) ? 13 : 7, - qr/overflow at byte .*, after start byte 0xf/ - ], -); +my $I8c = (isASCII) ? "\x80" : "\xa0"; # A continuation byte +my $cont_byte = I8_to_native($I8c); +my $p = (isASCII) ? "\xe1\x80" : I8_to_native("\xE4\xA0"); # partial -if ($is64bit) { - no warnings qw{portable overflow}; - push @tests, - [ "More than 32 bits", - (isASCII) - ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80" - : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), - $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT, - 'utf8', 0x1000000000, (isASCII) ? 13 : 14, - qr/Code point 0x.* is not Unicode, and not portable/ - ]; -} +# The loop above tested the single or partial character functions/macros, +# while building up strings to test the string functions, which we do now. -foreach my $test (@tests) { - my ($testname, $bytes, $warn_flags, $disallow_flags, $category, $allowed_uv, $expected_len, $message ) = @$test; - - my $length = length $bytes; - my $will_overflow = $testname =~ /overflow/; - - # This is more complicated than the malformations tested earlier, as there - # are several orthogonal variables involved. We test all the subclasses - # of utf8 warnings to verify they work with and without the utf8 class, - # and don't have effects on other sublass warnings - foreach my $warning ('utf8', 'surrogate', 'nonchar', 'non_unicode') { - foreach my $warn_flag (0, $warn_flags) { - foreach my $disallow_flag (0, $disallow_flags) { - foreach my $do_warning (0, 1) { - - my $eval_warn = $do_warning - ? "use warnings '$warning'" - : $warning eq "utf8" - ? "no warnings 'utf8'" - : "use warnings 'utf8'; no warnings '$warning'"; - - # is effectively disallowed if will overflow, even if the - # flag indicates it is allowed, fix up test name to - # indicate this as well - my $disallowed = $disallow_flag || $will_overflow; - - my $this_name = "utf8n_to_uvchr() $testname: " . (($disallow_flag) - ? 'disallowed' - : ($disallowed) - ? 'ABOVE_31_BIT allowed' - : 'allowed'); - $this_name .= ", $eval_warn"; - $this_name .= ", " . (($warn_flag) - ? 'with warning flag' - : 'no warning flag'); - - undef @warnings; - my $ret_ref; - my $display_bytes = display_bytes($bytes); - my $call = "Call was: $eval_warn; \$ret_ref = test_utf8n_to_uvchr('$display_bytes', $length, $warn_flag|$disallow_flag)"; - my $eval_text = "$eval_warn; \$ret_ref = test_utf8n_to_uvchr('$bytes', $length, $warn_flag|$disallow_flag)"; - eval "$eval_text"; - if (! ok ("$@ eq ''", "$this_name: eval succeeded")) { - diag "\$!='$!'; eval'd=\"$call\""; - next; - } - if ($disallowed) { - unless (is($ret_ref->[0], 0, "$this_name: Returns 0")) - { - diag $call; - } - } - else { - unless (is($ret_ref->[0], $allowed_uv, - "$this_name: Returns expected uv")) - { - diag $call; - } - } - unless (is($ret_ref->[1], $expected_len, - "$this_name: Returns expected length")) - { - diag $call; - } +for my $restriction (sort keys %restriction_types) { + use bytes; - if (! $do_warning - && ($warning eq 'utf8' || $warning eq $category)) - { - if (!is(scalar @warnings, 0, - "$this_name: No warnings generated")) - { - diag $call; - diag "The warnings were: " . join(", ", @warnings); - } - } - elsif ($will_overflow - && ! $disallow_flag - && $warning eq 'utf8') - { - - # Will get the overflow message instead of the expected - # message under these circumstances, as they would - # otherwise accept an overflowed value, which the code - # should not allow, so falls back to overflow. - if (is(scalar @warnings, 1, - "$this_name: Got a single warning ")) + for my $use_flags ("", "_flags") { + + # For each restriction, we test it in both the is_foo_flags functions + # and the specially named foo function. But not if there isn't such a + # specially named function. Currently, this is the only tested + # restriction that doesn't have a specially named function + next if $use_flags eq "" && $restriction eq "fits_in_31_bits"; + + # Start building up the name of the function we will test. + my $base_name = "is_"; + + if (! $use_flags && $restriction ne "") { + $base_name .= $restriction . "_"; + } + + # We test both "is_utf8_string_foo" and "is_fixed_width_buf" functions + foreach my $operand ('string', 'fixed_width_buf') { + + # Currently, the only fixed_width_buf functions have the '_flags' + # suffix. + next if $operand eq 'fixed_width_buf' && $use_flags eq ""; + + my $name = "${base_name}utf8_$operand"; + + # We test each version of the function + for my $function ("_loclen", "_loc", "") { + + # We test each function against + # a) valid input + # b) invalid input created by appending an out-of-place + # continuation character to the valid string + # c) input created by appending a partial character. This + # is valid in the 'fixed_width' functions, but invalid in + # the 'string' ones + # d) invalid input created by calling a function that is + # expecting a restricted form of the input using the string + # that's valid when unrestricted + for my $error_type (0, $cont_byte, $p, $restriction) { + #diag "restriction=$restriction, use_flags=$use_flags, function=$function, error_type=" . display_bytes($error_type); + + # If there is no restriction, the error type will be "", + # which is redundant with 0. + next if $error_type eq ""; + + my $this_name = "$name$function$use_flags"; + my $bytes + = $restriction_types{$restriction}{'valid_strings'}; + my $expected_offset = length $bytes; + my $expected_count + = $restriction_types{$restriction}{'valid_counts'}; + my $test_name_suffix = ""; + + my $this_error_type = $error_type; + if ($this_error_type) { + + # Appending a bare continuation byte or a partial + # character doesn't change the character count or + # offset. But in the other cases, we have saved where + # the failures should occur, so use those. Appending + # a continuation byte makes it invalid; appending a + # partial character makes the 'string' form invalid, + # but not the 'fixed_width_buf' form. + if ( $this_error_type eq $cont_byte + || $this_error_type eq $p) { - unless (like($warnings[0], qr/overflow/, - "$this_name: Got overflow warning")) - { - diag $call; + $bytes .= $this_error_type; + if ($this_error_type eq $cont_byte) { + $test_name_suffix + = " for an unexpected continuation"; + } + else { + $test_name_suffix + = " if ends with a partial character"; + $this_error_type + = 0 if $operand eq "fixed_width_buf"; } } else { - diag $call; - if (scalar @warnings) { - diag "The warnings were: " - . join(", ", @warnings); + $test_name_suffix + = " if contains forbidden code points"; + if ($this_error_type eq "c9strict") { + $bytes = $restriction_types{""}{'valid_strings'}; + $expected_offset + = $restriction_types{"c9strict"} + {'first_invalid_offset'}; + $expected_count + = $restriction_types{"c9strict"} + {'first_invalid_count'}; } - } - } - elsif ($warn_flag - && ($warning eq 'utf8' || $warning eq $category)) - { - if (is(scalar @warnings, 1, - "$this_name: Got a single warning ")) - { - unless (like($warnings[0], $message, - "$this_name: Got expected warning")) - { - diag $call; + elsif ($this_error_type eq "strict") { + $bytes = $restriction_types{""}{'valid_strings'}; + $expected_offset + = $restriction_types{"strict"} + {'first_invalid_offset'}; + $expected_count + = $restriction_types{"strict"} + {'first_invalid_count'}; + } - } - else { - diag $call; - if (scalar @warnings) { - diag "The warnings were: " - . join(", ", @warnings); + elsif ($this_error_type eq "fits_in_31_bits") { + $bytes = $restriction_types{""}{'valid_strings'}; + $expected_offset + = $restriction_types{"fits_in_31_bits"} + {'first_invalid_offset'}; + $expected_count + = $restriction_types{"fits_in_31_bits"} + {'first_invalid_count'}; + } + else { + fail("Internal test error: Unknown error type " + . "'$this_error_type'"); + next; } } } - # Check CHECK_ONLY results when the input is disallowed. Do - # this when actually disallowed, not just when the - # $disallow_flag is set - if ($disallowed) { - undef @warnings; - $ret_ref = test_utf8n_to_uvchr($bytes, $length, - $disallow_flag|$UTF8_CHECK_ONLY); - unless (is($ret_ref->[0], 0, "$this_name, CHECK_ONLY: Returns 0")) { - diag $call; - } - unless (is($ret_ref->[1], -1, - "$this_name: CHECK_ONLY: returns expected length")) - { - diag $call; - } - if (! is(scalar @warnings, 0, - "$this_name, CHECK_ONLY: no warnings generated")) - { - diag $call; - diag "The warnings were: " . join(", ", @warnings); - } - } + my $length = length $bytes; + my $ret_ref; - # Now repeat some of the above, but for - # uvchr_to_utf8_flags(). Since this comes from an - # existing code point, it hasn't overflowed. - next if $will_overflow; - - # The warning and disallow flags passed in are for - # utf8n_to_uvchr(). Convert them for - # uvchr_to_utf8_flags(). - my $uvchr_warn_flag = 0; - my $uvchr_disallow_flag = 0; - if ($warn_flag) { - if ($warn_flag == $UTF8_WARN_SURROGATE) { - $uvchr_warn_flag = $UNICODE_WARN_SURROGATE - } - elsif ($warn_flag == $UTF8_WARN_NONCHAR) { - $uvchr_warn_flag = $UNICODE_WARN_NONCHAR - } - elsif ($warn_flag == $UTF8_WARN_SUPER) { - $uvchr_warn_flag = $UNICODE_WARN_SUPER - } - elsif ($warn_flag == $UTF8_WARN_ABOVE_31_BIT) { - $uvchr_warn_flag = $UNICODE_WARN_ABOVE_31_BIT; - } - else { - fail(sprintf "Unexpected warn flag: %x", - $warn_flag); - next; - } - } - if ($disallow_flag) { - if ($disallow_flag == $UTF8_DISALLOW_SURROGATE) { - $uvchr_disallow_flag = $UNICODE_DISALLOW_SURROGATE - } - elsif ($disallow_flag == $UTF8_DISALLOW_NONCHAR) { - $uvchr_disallow_flag = $UNICODE_DISALLOW_NONCHAR - } - elsif ($disallow_flag == $UTF8_DISALLOW_SUPER) { - $uvchr_disallow_flag = $UNICODE_DISALLOW_SUPER - } - elsif ($disallow_flag == $UTF8_DISALLOW_ABOVE_31_BIT) { - $uvchr_disallow_flag = - $UNICODE_DISALLOW_ABOVE_31_BIT; + my $test = "\$ret_ref = test_$this_name(\$bytes, $length"; + + # If using the _flags functions, we have to figure out what + # flags to pass. This is done to match the restriction. + if ($use_flags eq "_flags") { + if (! $restriction) { + $test .= ", 0"; # The flag + + # Indicate the kind of flag in the test name. + $this_name .= "(0)"; } else { - fail(sprintf "Unexpected disallow flag: %x", - $disallow_flag); - next; + $this_name .= "($restriction)"; + if ($restriction eq "c9strict") { + $test + .= ", $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE"; + } + elsif ($restriction eq "strict") { + $test .= ", $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE"; + } + elsif ($restriction eq "fits_in_31_bits") { + $test .= ", $::UTF8_DISALLOW_ABOVE_31_BIT"; + } + else { + fail("Internal test error: Unknown restriction " + . "'$restriction'"); + next; + } } } + $test .= ")"; - $disallowed = $uvchr_disallow_flag; - - $this_name = "uvchr_to_utf8_flags() $testname: " - . (($uvchr_disallow_flag) - ? 'disallowed' - : ($disallowed) - ? 'ABOVE_31_BIT allowed' - : 'allowed'); - $this_name .= ", $eval_warn"; - $this_name .= ", " . (($uvchr_warn_flag) - ? 'with warning flag' - : 'no warning flag'); + # Actually run the test + eval $test; + if ($@) { + fail($test); + diag $@; + next; + } - undef @warnings; my $ret; - my $warn_flag = sprintf "0x%x", $uvchr_warn_flag; - my $disallow_flag = sprintf "0x%x", $uvchr_disallow_flag; - $call = sprintf "call was: $eval_warn; \$ret = test_uvchr_to_utf8_flags(0x%x, $warn_flag|$disallow_flag)", $allowed_uv; - $eval_text = "$eval_warn; \$ret = test_uvchr_to_utf8_flags($allowed_uv, $warn_flag|$disallow_flag)"; - eval "$eval_text"; - if (! ok ("$@ eq ''", "$this_name: eval succeeded")) { - diag "\$!='$!'; eval'd=\"$eval_text\""; - next; + my $error_offset; + my $cp_count; + + if ($function eq "") { + $ret = $ret_ref; # For plain function, there's only a + # single return value } - if ($disallowed) { - unless (is($ret, undef, "$this_name: Returns undef")) { - diag $call; - } + else { # Otherwise, the multiple values come in an array. + $ret = shift @$ret_ref ; + $error_offset = shift @$ret_ref; + $cp_count = shift@$ret_ref if $function eq "_loclen"; } - else { - unless (is($ret, $bytes, "$this_name: Returns expected string")) { - diag $call; - } + + if ($this_error_type) { + is($ret, 0, + "Verify $this_name is FALSE$test_name_suffix"); } - if (! $do_warning - && ($warning eq 'utf8' || $warning eq $category)) - { - if (!is(scalar @warnings, 0, - "$this_name: No warnings generated")) + else { + unless(is($ret, 1, + "Verify $this_name is TRUE for valid input" + . "$test_name_suffix")) { - diag $call; - diag "The warnings were: " . join(", ", @warnings); + diag(" The bytes starting at offset" + . " $error_offset are" + . display_bytes(substr( + $restriction_types{$restriction} + {'valid_strings'}, + $error_offset))); + next; } } - elsif ($uvchr_warn_flag - && ($warning eq 'utf8' || $warning eq $category)) - { - if (is(scalar @warnings, 1, - "$this_name: Got a single warning ")) + + if ($function ne "") { + unless (is($error_offset, $expected_offset, + "\tAnd returns the correct offset")) { - unless (like($warnings[0], $message, - "$this_name: Got expected warning")) - { - diag $call; - } + my $min = ($error_offset < $expected_offset) + ? $error_offset + : $expected_offset; + diag(" The bytes starting at offset" . $min + . " are " . display_bytes(substr($bytes, $min))); } - else { - diag $call; - if (scalar @warnings) { - diag "The warnings were: " - . join(", ", @warnings); - } + + if ($function eq '_loclen') { + is($cp_count, $expected_count, + "\tAnd returns the correct character count"); } } } @@ -1136,4 +1082,50 @@ foreach my $test (@tests) { } } +SKIP: +{ + isASCII + or skip "These tests probably break on non-ASCII", 1; + my $simple = join "", "A" .. "J"; + my $utf_ch = "\x{7fffffff}"; + utf8::encode($utf_ch); + my $utf_ch_len = length $utf_ch; + note "utf_ch_len $utf_ch_len"; + my $utf = $utf_ch x 10; + my $bad_start = substr($utf, 1); + # $bad_end ends with a start byte and a single continuation + my $bad_end = substr($utf, 0, length($utf)-$utf_ch_len+2); + + # WARNING: all offsets are *byte* offsets + my @hop_tests = + ( + # string s off expected name + [ $simple, 0, 5, 5, "simple in range, forward" ], + [ $simple, 10, -5, 5, "simple in range, backward" ], + [ $simple, 5, 10, 10, "simple out of range, forward" ], + [ $simple, 5, -10, 0, "simple out of range, backward" ], + [ $utf, $utf_ch_len * 5, 5, length($utf), "utf in range, forward" ], + [ $utf, $utf_ch_len * 5, -5, 0, "utf in range, backward" ], + [ $utf, $utf_ch_len * 5, 4, $utf_ch_len * 9, "utf in range b, forward" ], + [ $utf, $utf_ch_len * 5, -4, $utf_ch_len, "utf in range b, backward" ], + [ $utf, $utf_ch_len * 5, 6, length($utf), "utf out of range, forward" ], + [ $utf, $utf_ch_len * 5, -6, 0, "utf out of range, backward" ], + [ $bad_start, 0, 1, 1, "bad start, forward 1 from 0" ], + [ $bad_start, 0, $utf_ch_len-1, $utf_ch_len-1, "bad start, forward ch_len-1 from 0" ], + [ $bad_start, 0, $utf_ch_len, $utf_ch_len*2-1, "bad start, forward ch_len from 0" ], + [ $bad_start, $utf_ch_len-1, -1, 0, "bad start, back 1 from first start byte" ], + [ $bad_start, $utf_ch_len-2, -1, 0, "bad start, back 1 from before first start byte" ], + [ $bad_start, 0, -1, 0, "bad start, back 1 from 0" ], + [ $bad_start, length $bad_start, -10, 0, "bad start, back 10 from end" ], + [ $bad_end, 0, 10, length $bad_end, "bad end, forward 10 from 0" ], + [ $bad_end, length($bad_end)-1, 10, length $bad_end, "bad end, forward 1 from end-1" ], + ); + + for my $test (@hop_tests) { + my ($str, $s_off, $off, $want, $name) = @$test; + my $result = test_utf8_hop_safe($str, $s_off, $off); + is($result, $want, "utf8_hop_safe: $name"); + } +} + done_testing;