This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
XS-APItest/t/utf8.t: Fix failing EBCDIC test
[perl5.git] / ext / XS-APItest / t / utf8.t
1 #!perl -w
2
3 use strict;
4 use Test::More;
5 $|=1;
6
7 no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
8                           # machines, and that is tested elsewhere
9
10 use XS::APItest;
11
12 my $pound_sign = chr utf8::unicode_to_native(163);
13
14 sub isASCII { ord "A" == 65 }
15
16 sub display_bytes {
17     use bytes;
18     my $string = shift;
19     return   '"'
20            . join("", map { sprintf("\\x%02x", ord $_) } split "", $string)
21            . '"';
22 }
23
24 # This  test file can't use byte_utf8a_to_utf8n() from t/charset_tools.pl
25 # because that uses the same functions we are testing here.  So UTF-EBCDIC
26 # strings are hard-coded as I8 strings in this file instead, and we use array
27 # lookup to translate into the appropriate code page.
28
29 my @i8_to_native = (    # Only code page 1047 so far.
30 # _0   _1   _2   _3   _4   _5   _6   _7   _8   _9   _A   _B   _C   _D   _E   _F
31 0x00,0x01,0x02,0x03,0x37,0x2D,0x2E,0x2F,0x16,0x05,0x15,0x0B,0x0C,0x0D,0x0E,0x0F,
32 0x10,0x11,0x12,0x13,0x3C,0x3D,0x32,0x26,0x18,0x19,0x3F,0x27,0x1C,0x1D,0x1E,0x1F,
33 0x40,0x5A,0x7F,0x7B,0x5B,0x6C,0x50,0x7D,0x4D,0x5D,0x5C,0x4E,0x6B,0x60,0x4B,0x61,
34 0xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0x7A,0x5E,0x4C,0x7E,0x6E,0x6F,
35 0x7C,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,
36 0xD7,0xD8,0xD9,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xAD,0xE0,0xBD,0x5F,0x6D,
37 0x79,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x91,0x92,0x93,0x94,0x95,0x96,
38 0x97,0x98,0x99,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,0xA8,0xA9,0xC0,0x4F,0xD0,0xA1,0x07,
39 0x20,0x21,0x22,0x23,0x24,0x25,0x06,0x17,0x28,0x29,0x2A,0x2B,0x2C,0x09,0x0A,0x1B,
40 0x30,0x31,0x1A,0x33,0x34,0x35,0x36,0x08,0x38,0x39,0x3A,0x3B,0x04,0x14,0x3E,0xFF,
41 0x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x51,0x52,0x53,0x54,0x55,0x56,
42 0x57,0x58,0x59,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x70,0x71,0x72,0x73,
43 0x74,0x75,0x76,0x77,0x78,0x80,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,0x90,0x9A,0x9B,0x9C,
44 0x9D,0x9E,0x9F,0xA0,0xAA,0xAB,0xAC,0xAE,0xAF,0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,
45 0xB7,0xB8,0xB9,0xBA,0xBB,0xBC,0xBE,0xBF,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,0xDA,0xDB,
46 0xDC,0xDD,0xDE,0xDF,0xE1,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,0xFA,0xFB,0xFC,0xFD,0xFE,
47 );
48
49 my @native_to_i8;
50 for (my $i = 0; $i < 256; $i++) {
51     $native_to_i8[$i8_to_native[$i]] = $i;
52 }
53
54 *I8_to_native = (isASCII)
55                     ? sub { return shift }
56                     : sub { return join "", map { chr $i8_to_native[ord $_] }
57                                             split "", shift };
58 *native_to_I8 = (isASCII)
59                     ? sub { return shift }
60                     : sub { return join "", map { chr $native_to_i8[ord $_] }
61                                             split "", shift };
62
63 my $is64bit = length sprintf("%x", ~0) > 8;
64
65
66 # Test utf8n_to_uvchr().  These provide essentially complete code coverage.
67 # Copied from utf8.h
68 my $UTF8_ALLOW_EMPTY            = 0x0001;
69 my $UTF8_ALLOW_CONTINUATION     = 0x0002;
70 my $UTF8_ALLOW_NON_CONTINUATION = 0x0004;
71 my $UTF8_ALLOW_SHORT            = 0x0008;
72 my $UTF8_ALLOW_LONG             = 0x0010;
73 my $UTF8_DISALLOW_SURROGATE     = 0x0020;
74 my $UTF8_WARN_SURROGATE         = 0x0040;
75 my $UTF8_DISALLOW_NONCHAR       = 0x0080;
76 my $UTF8_WARN_NONCHAR           = 0x0100;
77 my $UTF8_DISALLOW_SUPER         = 0x0200;
78 my $UTF8_WARN_SUPER             = 0x0400;
79 my $UTF8_DISALLOW_ABOVE_31_BIT  = 0x0800;
80 my $UTF8_WARN_ABOVE_31_BIT      = 0x1000;
81 my $UTF8_CHECK_ONLY             = 0x2000;
82 my $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
83                              = $UTF8_DISALLOW_SUPER|$UTF8_DISALLOW_SURROGATE;
84 my $UTF8_DISALLOW_ILLEGAL_INTERCHANGE
85               = $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE|$UTF8_DISALLOW_NONCHAR;
86 my $UTF8_WARN_ILLEGAL_C9_INTERCHANGE
87                              = $UTF8_WARN_SUPER|$UTF8_WARN_SURROGATE;
88 my $UTF8_WARN_ILLEGAL_INTERCHANGE
89               = $UTF8_WARN_ILLEGAL_C9_INTERCHANGE|$UTF8_WARN_NONCHAR;
90
91 # Test uvchr_to_utf8().
92 my $UNICODE_WARN_SURROGATE        = 0x0001;
93 my $UNICODE_WARN_NONCHAR          = 0x0002;
94 my $UNICODE_WARN_SUPER            = 0x0004;
95 my $UNICODE_WARN_ABOVE_31_BIT     = 0x0008;
96 my $UNICODE_DISALLOW_SURROGATE    = 0x0010;
97 my $UNICODE_DISALLOW_NONCHAR      = 0x0020;
98 my $UNICODE_DISALLOW_SUPER        = 0x0040;
99 my $UNICODE_DISALLOW_ABOVE_31_BIT = 0x0080;
100
101 my $look_for_everything_utf8n_to
102                         = $UTF8_DISALLOW_SURROGATE
103                         | $UTF8_WARN_SURROGATE
104                         | $UTF8_DISALLOW_NONCHAR
105                         | $UTF8_WARN_NONCHAR
106                         | $UTF8_DISALLOW_SUPER
107                         | $UTF8_WARN_SUPER
108                         | $UTF8_DISALLOW_ABOVE_31_BIT
109                         | $UTF8_WARN_ABOVE_31_BIT;
110 my $look_for_everything_uvchr_to
111                         = $UNICODE_DISALLOW_SURROGATE
112                         | $UNICODE_WARN_SURROGATE
113                         | $UNICODE_DISALLOW_NONCHAR
114                         | $UNICODE_WARN_NONCHAR
115                         | $UNICODE_DISALLOW_SUPER
116                         | $UNICODE_WARN_SUPER
117                         | $UNICODE_DISALLOW_ABOVE_31_BIT
118                         | $UNICODE_WARN_ABOVE_31_BIT;
119
120 foreach ([0, '', '', 'empty'],
121          [0, 'N', 'N', '1 char'],
122          [1, 'NN', 'N', '1 char substring'],
123          [-2, 'Perl', 'Rules', 'different'],
124          [0, $pound_sign, $pound_sign, 'pound sign'],
125          [1, $pound_sign . 10, $pound_sign . 1, '10 pounds is more than 1 pound'],
126          [1, $pound_sign . $pound_sign, $pound_sign, '2 pound signs are more than 1'],
127          [-2, ' $!', " \x{1F42B}!", 'Camels are worth more than 1 dollar'],
128          [-1, '!', "!\x{1F42A}", 'Initial substrings match'],
129         ) {
130     my ($expect, $left, $right, $desc) = @$_;
131     my $copy = $right;
132     utf8::encode($copy);
133     is(bytes_cmp_utf8($left, $copy), $expect, $desc);
134     next if $right =~ tr/\0-\377//c;
135     utf8::encode($left);
136     is(bytes_cmp_utf8($right, $left), -$expect, "$desc reversed");
137 }
138
139 # The keys to this hash are Unicode code points, their values are the native
140 # UTF-8 representations of them.  The code points are chosen because they are
141 # "interesting" on either or both ASCII and EBCDIC platforms.  First we add
142 # boundaries where the number of bytes required to represent them increase, or
143 # are adjacent to problematic code points, so we want to make sure they aren't
144 # considered problematic.
145 my %code_points = (
146     0x0100     => (isASCII) ? "\xc4\x80" : I8_to_native("\xc8\xa0"),
147     0x0400 - 1 => (isASCII) ? "\xcf\xbf" : I8_to_native("\xdf\xbf"),
148     0x0400     => (isASCII) ? "\xd0\x80" : I8_to_native("\xe1\xa0\xa0"),
149     0x0800 - 1 => (isASCII) ? "\xdf\xbf"     : I8_to_native("\xe1\xbf\xbf"),
150     0x0800     => (isASCII) ? "\xe0\xa0\x80" : I8_to_native("\xe2\xa0\xa0"),
151     0x4000 - 1 => (isASCII) ? "\xe3\xbf\xbf" : I8_to_native("\xef\xbf\xbf"),
152     0x4000     => (isASCII) ? "\xe4\x80\x80" : I8_to_native("\xf0\xb0\xa0\xa0"),
153     0x8000 - 1 => (isASCII) ? "\xe7\xbf\xbf" : I8_to_native("\xf0\xbf\xbf\xbf"),
154
155     # First code point that the implementation of isUTF8_POSSIBLY_PROBLEMATIC,
156     # as of this writing, considers potentially problematic on EBCDIC
157     0x8000     => (isASCII) ? "\xe8\x80\x80" : I8_to_native("\xf1\xa0\xa0\xa0"),
158
159     0xD000 - 1 => (isASCII) ? "\xec\xbf\xbf" : I8_to_native("\xf1\xb3\xbf\xbf"),
160
161     # First code point that the implementation of isUTF8_POSSIBLY_PROBLEMATIC,
162     # as of this writing, considers potentially problematic on ASCII
163     0xD000     => (isASCII) ? "\xed\x80\x80" : I8_to_native("\xf1\xb4\xa0\xa0"),
164
165     # Bracket the surrogates, and include several surrogates
166     0xD7FF      => (isASCII) ? "\xed\x9f\xbf" : I8_to_native("\xf1\xb5\xbf\xbf"),
167     0xD800      => (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
168     0xDC00      => (isASCII) ? "\xed\xb0\x80" : I8_to_native("\xf1\xb7\xa0\xa0"),
169     0xDFFF      => (isASCII) ? "\xee\x80\x80" : I8_to_native("\xf1\xb8\xa0\xa0"),
170     0xDFFF      => (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
171     0xE000      => (isASCII) ? "\xee\x80\x80" : I8_to_native("\xf1\xb8\xa0\xa0"),
172
173     # Include the 32 contiguous non characters, and surrounding code points
174     0xFDCF      => (isASCII) ? "\xef\xb7\x8f" : I8_to_native("\xf1\xbf\xae\xaf"),
175     0xFDD0      => (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
176     0xFDD1      => (isASCII) ? "\xef\xb7\x91" : I8_to_native("\xf1\xbf\xae\xb1"),
177     0xFDD2      => (isASCII) ? "\xef\xb7\x92" : I8_to_native("\xf1\xbf\xae\xb2"),
178     0xFDD3      => (isASCII) ? "\xef\xb7\x93" : I8_to_native("\xf1\xbf\xae\xb3"),
179     0xFDD4      => (isASCII) ? "\xef\xb7\x94" : I8_to_native("\xf1\xbf\xae\xb4"),
180     0xFDD5      => (isASCII) ? "\xef\xb7\x95" : I8_to_native("\xf1\xbf\xae\xb5"),
181     0xFDD6      => (isASCII) ? "\xef\xb7\x96" : I8_to_native("\xf1\xbf\xae\xb6"),
182     0xFDD7      => (isASCII) ? "\xef\xb7\x97" : I8_to_native("\xf1\xbf\xae\xb7"),
183     0xFDD8      => (isASCII) ? "\xef\xb7\x98" : I8_to_native("\xf1\xbf\xae\xb8"),
184     0xFDD9      => (isASCII) ? "\xef\xb7\x99" : I8_to_native("\xf1\xbf\xae\xb9"),
185     0xFDDA      => (isASCII) ? "\xef\xb7\x9a" : I8_to_native("\xf1\xbf\xae\xba"),
186     0xFDDB      => (isASCII) ? "\xef\xb7\x9b" : I8_to_native("\xf1\xbf\xae\xbb"),
187     0xFDDC      => (isASCII) ? "\xef\xb7\x9c" : I8_to_native("\xf1\xbf\xae\xbc"),
188     0xFDDD      => (isASCII) ? "\xef\xb7\x9d" : I8_to_native("\xf1\xbf\xae\xbd"),
189     0xFDDE      => (isASCII) ? "\xef\xb7\x9e" : I8_to_native("\xf1\xbf\xae\xbe"),
190     0xFDDF      => (isASCII) ? "\xef\xb7\x9f" : I8_to_native("\xf1\xbf\xae\xbf"),
191     0xFDE0      => (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
192     0xFDE1      => (isASCII) ? "\xef\xb7\xa1" : I8_to_native("\xf1\xbf\xaf\xa1"),
193     0xFDE2      => (isASCII) ? "\xef\xb7\xa2" : I8_to_native("\xf1\xbf\xaf\xa2"),
194     0xFDE3      => (isASCII) ? "\xef\xb7\xa3" : I8_to_native("\xf1\xbf\xaf\xa3"),
195     0xFDE4      => (isASCII) ? "\xef\xb7\xa4" : I8_to_native("\xf1\xbf\xaf\xa4"),
196     0xFDE5      => (isASCII) ? "\xef\xb7\xa5" : I8_to_native("\xf1\xbf\xaf\xa5"),
197     0xFDE6      => (isASCII) ? "\xef\xb7\xa6" : I8_to_native("\xf1\xbf\xaf\xa6"),
198     0xFDE7      => (isASCII) ? "\xef\xb7\xa7" : I8_to_native("\xf1\xbf\xaf\xa7"),
199     0xFDE8      => (isASCII) ? "\xef\xb7\xa8" : I8_to_native("\xf1\xbf\xaf\xa8"),
200     0xFDEa      => (isASCII) ? "\xef\xb7\x99" : I8_to_native("\xf1\xbf\xaf\xa9"),
201     0xFDEA      => (isASCII) ? "\xef\xb7\xaa" : I8_to_native("\xf1\xbf\xaf\xaa"),
202     0xFDEB      => (isASCII) ? "\xef\xb7\xab" : I8_to_native("\xf1\xbf\xaf\xab"),
203     0xFDEC      => (isASCII) ? "\xef\xb7\xac" : I8_to_native("\xf1\xbf\xaf\xac"),
204     0xFDED      => (isASCII) ? "\xef\xb7\xad" : I8_to_native("\xf1\xbf\xaf\xad"),
205     0xFDEE      => (isASCII) ? "\xef\xb7\xae" : I8_to_native("\xf1\xbf\xaf\xae"),
206     0xFDEF      => (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
207     0xFDF0      => (isASCII) ? "\xef\xb7\xb0" : I8_to_native("\xf1\xbf\xaf\xb0"),
208
209     # Mostly around non-characters, but some are transitions to longer strings
210     0xFFFD      => (isASCII) ? "\xef\xbf\xbd" : I8_to_native("\xf1\xbf\xbf\xbd"),
211     0x10000 - 1 => (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
212     0x10000     => (isASCII) ? "\xf0\x90\x80\x80" : I8_to_native("\xf2\xa0\xa0\xa0"),
213     0x1FFFD     => (isASCII) ? "\xf0\x9f\xbf\xbd" : I8_to_native("\xf3\xbf\xbf\xbd"),
214     0x1FFFE     => (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
215     0x1FFFF     => (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
216     0x20000     => (isASCII) ? "\xf0\xa0\x80\x80" : I8_to_native("\xf4\xa0\xa0\xa0"),
217     0x2FFFD     => (isASCII) ? "\xf0\xaf\xbf\xbd" : I8_to_native("\xf5\xbf\xbf\xbd"),
218     0x2FFFE     => (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
219     0x2FFFF     => (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
220     0x30000     => (isASCII) ? "\xf0\xb0\x80\x80" : I8_to_native("\xf6\xa0\xa0\xa0"),
221     0x3FFFD     => (isASCII) ? "\xf0\xbf\xbf\xbd" : I8_to_native("\xf7\xbf\xbf\xbd"),
222     0x3FFFE     => (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
223     0x40000 - 1 => (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
224     0x40000     => (isASCII) ? "\xf1\x80\x80\x80" : I8_to_native("\xf8\xa8\xa0\xa0\xa0"),
225     0x4FFFD     => (isASCII) ? "\xf1\x8f\xbf\xbd" : I8_to_native("\xf8\xa9\xbf\xbf\xbd"),
226     0x4FFFE     => (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
227     0x4FFFF     => (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
228     0x50000     => (isASCII) ? "\xf1\x90\x80\x80" : I8_to_native("\xf8\xaa\xa0\xa0\xa0"),
229     0x5FFFD     => (isASCII) ? "\xf1\x9f\xbf\xbd" : I8_to_native("\xf8\xab\xbf\xbf\xbd"),
230     0x5FFFE     => (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
231     0x5FFFF     => (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
232     0x60000     => (isASCII) ? "\xf1\xa0\x80\x80" : I8_to_native("\xf8\xac\xa0\xa0\xa0"),
233     0x6FFFD     => (isASCII) ? "\xf1\xaf\xbf\xbd" : I8_to_native("\xf8\xad\xbf\xbf\xbd"),
234     0x6FFFE     => (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
235     0x6FFFF     => (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
236     0x70000     => (isASCII) ? "\xf1\xb0\x80\x80" : I8_to_native("\xf8\xae\xa0\xa0\xa0"),
237     0x7FFFD     => (isASCII) ? "\xf1\xbf\xbf\xbd" : I8_to_native("\xf8\xaf\xbf\xbf\xbd"),
238     0x7FFFE     => (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
239     0x7FFFF     => (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
240     0x80000     => (isASCII) ? "\xf2\x80\x80\x80" : I8_to_native("\xf8\xb0\xa0\xa0\xa0"),
241     0x8FFFD     => (isASCII) ? "\xf2\x8f\xbf\xbd" : I8_to_native("\xf8\xb1\xbf\xbf\xbd"),
242     0x8FFFE     => (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
243     0x8FFFF     => (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
244     0x90000     => (isASCII) ? "\xf2\x90\x80\x80" : I8_to_native("\xf8\xb2\xa0\xa0\xa0"),
245     0x9FFFD     => (isASCII) ? "\xf2\x9f\xbf\xbd" : I8_to_native("\xf8\xb3\xbf\xbf\xbd"),
246     0x9FFFE     => (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
247     0x9FFFF     => (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
248     0xA0000     => (isASCII) ? "\xf2\xa0\x80\x80" : I8_to_native("\xf8\xb4\xa0\xa0\xa0"),
249     0xAFFFD     => (isASCII) ? "\xf2\xaf\xbf\xbd" : I8_to_native("\xf8\xb5\xbf\xbf\xbd"),
250     0xAFFFE     => (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
251     0xAFFFF     => (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
252     0xB0000     => (isASCII) ? "\xf2\xb0\x80\x80" : I8_to_native("\xf8\xb6\xa0\xa0\xa0"),
253     0xBFFFD     => (isASCII) ? "\xf2\xbf\xbf\xbd" : I8_to_native("\xf8\xb7\xbf\xbf\xbd"),
254     0xBFFFE     => (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
255     0xBFFFF     => (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
256     0xC0000     => (isASCII) ? "\xf3\x80\x80\x80" : I8_to_native("\xf8\xb8\xa0\xa0\xa0"),
257     0xCFFFD     => (isASCII) ? "\xf3\x8f\xbf\xbd" : I8_to_native("\xf8\xb9\xbf\xbf\xbd"),
258     0xCFFFE     => (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
259     0xCFFFF     => (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
260     0xD0000     => (isASCII) ? "\xf3\x90\x80\x80" : I8_to_native("\xf8\xba\xa0\xa0\xa0"),
261     0xDFFFD     => (isASCII) ? "\xf3\x9f\xbf\xbd" : I8_to_native("\xf8\xbb\xbf\xbf\xbd"),
262     0xDFFFE     => (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
263     0xDFFFF     => (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
264     0xE0000     => (isASCII) ? "\xf3\xa0\x80\x80" : I8_to_native("\xf8\xbc\xa0\xa0\xa0"),
265     0xEFFFD     => (isASCII) ? "\xf3\xaf\xbf\xbd" : I8_to_native("\xf8\xbd\xbf\xbf\xbd"),
266     0xEFFFE     => (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
267     0xEFFFF     => (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
268     0xF0000     => (isASCII) ? "\xf3\xb0\x80\x80" : I8_to_native("\xf8\xbe\xa0\xa0\xa0"),
269     0xFFFFD     => (isASCII) ? "\xf3\xbf\xbf\xbd" : I8_to_native("\xf8\xbf\xbf\xbf\xbd"),
270     0xFFFFE     => (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
271     0xFFFFF     => (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
272     0x100000    => (isASCII) ? "\xf4\x80\x80\x80" : I8_to_native("\xf9\xa0\xa0\xa0\xa0"),
273     0x10FFFD    => (isASCII) ? "\xf4\x8f\xbf\xbd" : I8_to_native("\xf9\xa1\xbf\xbf\xbd"),
274     0x10FFFE    => (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
275     0x10FFFF    => (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
276     0x110000    => (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
277
278     # Things that would be noncharacters if they were in Unicode, and might be
279     # mistaken, if the C code is bad, to be nonchars
280     0x11FFFE    => (isASCII) ? "\xf4\x9f\xbf\xbe" : I8_to_native("\xf9\xa3\xbf\xbf\xbe"),
281     0x11FFFF    => (isASCII) ? "\xf4\x9f\xbf\xbf" : I8_to_native("\xf9\xa3\xbf\xbf\xbf"),
282     0x20FFFE    => (isASCII) ? "\xf8\x88\x8f\xbf\xbe" : I8_to_native("\xfa\xa1\xbf\xbf\xbe"),
283     0x20FFFF    => (isASCII) ? "\xf8\x88\x8f\xbf\xbf" : I8_to_native("\xfa\xa1\xbf\xbf\xbf"),
284
285     0x200000 - 1 => (isASCII) ? "\xf7\xbf\xbf\xbf" : I8_to_native("\xf9\xbf\xbf\xbf\xbf"),
286     0x200000     => (isASCII) ? "\xf8\x88\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
287     0x400000 - 1 => (isASCII) ? "\xf8\x8f\xbf\xbf\xbf" : I8_to_native("\xfb\xbf\xbf\xbf\xbf"),
288     0x400000     => (isASCII) ? "\xf8\x90\x80\x80\x80" : I8_to_native("\xfc\xa4\xa0\xa0\xa0\xa0"),
289     0x4000000 - 1 => (isASCII) ? "\xfb\xbf\xbf\xbf\xbf" : I8_to_native("\xfd\xbf\xbf\xbf\xbf\xbf"),
290     0x4000000     => (isASCII) ? "\xfc\x84\x80\x80\x80\x80" : I8_to_native("\xfe\xa2\xa0\xa0\xa0\xa0\xa0"),
291     0x4000000 - 1 => (isASCII) ? "\xfb\xbf\xbf\xbf\xbf" : I8_to_native("\xfd\xbf\xbf\xbf\xbf\xbf"),
292     0x4000000     => (isASCII) ? "\xfc\x84\x80\x80\x80\x80" : I8_to_native("\xfe\xa2\xa0\xa0\xa0\xa0\xa0"),
293     0x40000000 - 1 => (isASCII) ? "\xfc\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xfe\xbf\xbf\xbf\xbf\xbf\xbf"),
294     0x40000000     => (isASCII) ? "\xfd\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0"),
295     0x80000000 - 1 => (isASCII) ? "\xfd\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf"),
296     0x80000000     => (isASCII) ? "\xfe\x82\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
297     0xFFFFFFFF     => (isASCII) ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
298 );
299
300 if ($is64bit) {
301     no warnings qw(overflow portable);
302     $code_points{0x100000000}        = (isASCII)
303                                         ?              "\xfe\x84\x80\x80\x80\x80\x80"
304                                         : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0");
305     $code_points{0x1000000000 - 1}   = (isASCII)
306                                         ?              "\xfe\xbf\xbf\xbf\xbf\xbf\xbf"
307                                         : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf\xbf");
308     $code_points{0x1000000000}       = (isASCII)
309                                         ?              "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
310                                         : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0");
311     $code_points{0xFFFFFFFFFFFFFFFF} = (isASCII)
312                                         ?              "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
313                                         : I8_to_native("\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf");
314     if (isASCII) {  # These could falsely show as overlongs in a naive implementation
315         $code_points{0x40000000000}  = "\xff\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80\x80";
316         $code_points{0x1000000000000} = "\xff\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80\x80\x80";
317         $code_points{0x40000000000000} = "\xff\x80\x80\x81\x80\x80\x80\x80\x80\x80\x80\x80\x80";
318         $code_points{0x1000000000000000} = "\xff\x80\x81\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80";
319         # overflows
320         #$code_points{0xfoo}     = "\xff\x81\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80";
321     }
322 }
323 elsif (! isASCII) { # 32-bit EBCDIC.  64-bit is clearer to handle, so doesn't need this test case
324     no warnings qw(overflow portable);
325     $code_points{0x40000000} = I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0");
326 }
327
328 # Now add in entries for each of code points 0-255, which require special
329 # handling on EBCDIC.  Remember the keys are Unicode values, and the values
330 # are the native UTF-8.  For invariants, the bytes are just the native chr.
331
332 my $cp = 0;
333 while ($cp < ((isASCII) ? 128 : 160)) {   # This is from the definition of
334                                         # invariant
335     $code_points{$cp} = chr utf8::unicode_to_native($cp);
336     $cp++;
337 }
338
339 # Done with the invariants.  Now do the variants.  All in this range are 2
340 # byte.  Again, we can't use the internal functions to generate UTF-8, as
341 # those are what we are trying to test.  In the loop, we know what range the
342 # continuation bytes can be in, and what the lowest start byte can be.  So we
343 # cycle through them.
344
345 my $first_continuation = (isASCII) ? 0x80 : 0xA0;
346 my $final_continuation = 0xBF;
347 my $start = (isASCII) ? 0xC2 : 0xC5;
348
349 my $continuation = $first_continuation - 1;
350
351 while ($cp < 255) {
352     if (++$continuation > $final_continuation) {
353
354         # Wrap to the next start byte when we reach the final continuation
355         # byte possible
356         $continuation = $first_continuation;
357         $start++;
358     }
359     $code_points{$cp} = I8_to_native(chr($start) . chr($continuation));
360
361     $cp++;
362 }
363
364 my @warnings;
365
366 use warnings 'utf8';
367 local $SIG{__WARN__} = sub { push @warnings, @_ };
368
369 my %restriction_types;
370
371 $restriction_types{""}{'valid_strings'} = "";
372 $restriction_types{"c9strict"}{'valid_strings'} = "";
373 $restriction_types{"strict"}{'valid_strings'} = "";
374 $restriction_types{"fits_in_31_bits"}{'valid_strings'} = "";
375
376 # This set of tests looks for basic sanity, and lastly tests various routines
377 # for the given code point.  If the earlier tests for that code point fail,
378 # the later ones probably will too.  Malformations are tested in later
379 # segments of code.
380 for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
381           keys %code_points)
382 {
383     my $hex_u = sprintf("0x%02X", $u);
384     my $n = utf8::unicode_to_native($u);
385     my $hex_n = sprintf("0x%02X", $n);
386     my $bytes = $code_points{$u};
387
388     my $offskip_should_be;
389     {
390         no warnings qw(overflow portable);
391         $offskip_should_be = (isASCII)
392             ? ( $u < 0x80           ? 1 :
393                 $u < 0x800          ? 2 :
394                 $u < 0x10000        ? 3 :
395                 $u < 0x200000       ? 4 :
396                 $u < 0x4000000      ? 5 :
397                 $u < 0x80000000     ? 6 : (($is64bit)
398                                         ? ($u < 0x1000000000 ? 7 : 13)
399                                         : 7)
400               )
401             : ($u < 0xA0        ? 1 :
402                $u < 0x400       ? 2 :
403                $u < 0x4000      ? 3 :
404                $u < 0x40000     ? 4 :
405                $u < 0x400000    ? 5 :
406                $u < 0x4000000   ? 6 :
407                $u < 0x40000000  ? 7 : 14 );
408     }
409
410     # If this test fails, subsequent ones are meaningless.
411     next unless is(test_OFFUNISKIP($u), $offskip_should_be,
412                    "Verify OFFUNISKIP($hex_u) is $offskip_should_be");
413     my $invariant = $offskip_should_be == 1;
414     my $display_invariant = $invariant || 0;
415     is(test_OFFUNI_IS_INVARIANT($u), $invariant,
416        "Verify OFFUNI_IS_INVARIANT($hex_u) is $display_invariant");
417
418     my $uvchr_skip_should_be = $offskip_should_be;
419     next unless is(test_UVCHR_SKIP($n), $uvchr_skip_should_be,
420                    "Verify UVCHR_SKIP($hex_n) is $uvchr_skip_should_be");
421     is(test_UVCHR_IS_INVARIANT($n), $offskip_should_be == 1,
422        "Verify UVCHR_IS_INVARIANT($hex_n) is $display_invariant");
423
424     my $n_chr = chr $n;
425     utf8::upgrade $n_chr;
426
427     is(test_UTF8_SKIP($n_chr), $uvchr_skip_should_be,
428         "Verify UTF8_SKIP(chr $hex_n) is $uvchr_skip_should_be");
429
430     use bytes;
431     my $byte_length = length $n_chr;
432     for (my $j = 0; $j < $byte_length; $j++) {
433         undef @warnings;
434
435         if ($j == $byte_length - 1) {
436             my $ret = test_is_utf8_valid_partial_char_flags($n_chr, $byte_length, 0);
437             is($ret, 0, "   Verify is_utf8_valid_partial_char_flags(" . display_bytes($n_chr) . ") returns 0 for full character");
438         }
439         else {
440             my $bytes_so_far = substr($n_chr, 0, $j + 1);
441             my $ret = test_is_utf8_valid_partial_char_flags($bytes_so_far, $j + 1, 0);
442             is($ret, 1, "   Verify is_utf8_valid_partial_char_flags(" . display_bytes($bytes_so_far) . ") returns 1");
443         }
444
445         unless (is(scalar @warnings, 0,
446                 "   Verify is_utf8_valid_partial_char_flags generated no warnings"))
447         {
448             diag "The warnings were: " . join(", ", @warnings);
449         }
450
451         my $b = substr($n_chr, $j, 1);
452         my $hex_b = sprintf("\"\\x%02x\"", ord $b);
453
454         my $byte_invariant = $j == 0 && $uvchr_skip_should_be == 1;
455         my $display_byte_invariant = $byte_invariant || 0;
456         next unless is(test_UTF8_IS_INVARIANT($b), $byte_invariant,
457                        "   Verify UTF8_IS_INVARIANT($hex_b) for byte $j "
458                      . "is $display_byte_invariant");
459
460         my $is_start = $j == 0 && $uvchr_skip_should_be > 1;
461         my $display_is_start = $is_start || 0;
462         next unless is(test_UTF8_IS_START($b), $is_start,
463                     "      Verify UTF8_IS_START($hex_b) is $display_is_start");
464
465         my $is_continuation = $j != 0 && $uvchr_skip_should_be > 1;
466         my $display_is_continuation = $is_continuation || 0;
467         next unless is(test_UTF8_IS_CONTINUATION($b), $is_continuation,
468                        "      Verify UTF8_IS_CONTINUATION($hex_b) is "
469                      . "$display_is_continuation");
470
471         my $is_continued = $uvchr_skip_should_be > 1;
472         my $display_is_continued = $is_continued || 0;
473         next unless is(test_UTF8_IS_CONTINUED($b), $is_continued,
474                        "      Verify UTF8_IS_CONTINUED($hex_b) is "
475                      . "$display_is_continued");
476
477         my $is_downgradeable_start =    $n < 256
478                                      && $uvchr_skip_should_be > 1
479                                      && $j == 0;
480         my $display_is_downgradeable_start = $is_downgradeable_start || 0;
481         next unless is(test_UTF8_IS_DOWNGRADEABLE_START($b),
482                        $is_downgradeable_start,
483                        "      Verify UTF8_IS_DOWNGRADEABLE_START($hex_b) is "
484                      . "$display_is_downgradeable_start");
485
486         my $is_above_latin1 =  $n > 255 && $j == 0;
487         my $display_is_above_latin1 = $is_above_latin1 || 0;
488         next unless is(test_UTF8_IS_ABOVE_LATIN1($b),
489                        $is_above_latin1,
490                        "      Verify UTF8_IS_ABOVE_LATIN1($hex_b) is "
491                      . "$display_is_above_latin1");
492
493         my $is_possibly_problematic =  $j == 0
494                                     && $n >= ((isASCII)
495                                               ? 0xD000
496                                               : 0x8000);
497         my $display_is_possibly_problematic = $is_possibly_problematic || 0;
498         next unless is(test_isUTF8_POSSIBLY_PROBLEMATIC($b),
499                        $is_possibly_problematic,
500                        "      Verify isUTF8_POSSIBLY_PROBLEMATIC($hex_b) is "
501                      . "$display_is_above_latin1");
502     }
503
504     # We are not trying to look for warnings, etc, so if they should occur, it
505     # is an error.  But some of the code points here do cause warnings, so we
506     # check here and turn off the ones that apply to such code points.  A
507     # later section of the code tests for these kinds of things.
508     my $this_utf8_flags = $look_for_everything_utf8n_to;
509     my $len = length $bytes;
510
511     my $valid_under_strict = 1;
512     my $valid_under_c9strict = 1;
513     my $valid_for_fits_in_31_bits = 1;
514     if ($n > 0x10FFFF) {
515         $this_utf8_flags &= ~($UTF8_DISALLOW_SUPER|$UTF8_WARN_SUPER);
516         $valid_under_strict = 0;
517         $valid_under_c9strict = 0;
518         if ($n > 2 ** 31 - 1) {
519             $this_utf8_flags &=
520                             ~($UTF8_DISALLOW_ABOVE_31_BIT|$UTF8_WARN_ABOVE_31_BIT);
521             $valid_for_fits_in_31_bits = 0;
522         }
523     }
524     elsif (($n >= 0xFDD0 && $n <= 0xFDEF) || ($n & 0xFFFE) == 0xFFFE) {
525         $this_utf8_flags &= ~($UTF8_DISALLOW_NONCHAR|$UTF8_WARN_NONCHAR);
526         $valid_under_strict = 0;
527     }
528     elsif ($n >= 0xD800 && $n <= 0xDFFF) {
529         $this_utf8_flags &= ~($UTF8_DISALLOW_SURROGATE|$UTF8_WARN_SURROGATE);
530         $valid_under_c9strict = 0;
531         $valid_under_strict = 0;
532     }
533
534     undef @warnings;
535
536     my $display_flags = sprintf "0x%x", $this_utf8_flags;
537     my $display_bytes = display_bytes($bytes);
538     my $ret_ref = test_utf8n_to_uvchr($bytes, $len, $this_utf8_flags);
539
540     # Rest of tests likely meaningless if it gets the wrong code point.
541     next unless is($ret_ref->[0], $n,
542        "Verify utf8n_to_uvchr($display_bytes, $display_flags) returns $hex_n");
543     is($ret_ref->[1], $len,
544        "Verify utf8n_to_uvchr() for $hex_n returns expected length: $len");
545
546     unless (is(scalar @warnings, 0,
547                "Verify utf8n_to_uvchr() for $hex_n generated no warnings"))
548     {
549         diag "The warnings were: " . join(", ", @warnings);
550     }
551
552     undef @warnings;
553
554     my $ret = test_isUTF8_CHAR($bytes, $len);
555     is($ret, $len, "Verify isUTF8_CHAR($display_bytes) returns expected length: $len");
556
557     unless (is(scalar @warnings, 0,
558                "Verify isUTF8_CHAR() for $hex_n generated no warnings"))
559     {
560         diag "The warnings were: " . join(", ", @warnings);
561     }
562
563     undef @warnings;
564
565     $ret = test_isUTF8_CHAR($bytes, $len - 1);
566     is($ret, 0, "Verify isUTF8_CHAR() with too short length parameter returns 0");
567
568     unless (is(scalar @warnings, 0,
569                "Verify isUTF8_CHAR() generated no warnings"))
570     {
571         diag "The warnings were: " . join(", ", @warnings);
572     }
573
574     undef @warnings;
575
576     $ret = test_isUTF8_CHAR_flags($bytes, $len, 0);
577     is($ret, $len, "Verify isUTF8_CHAR_flags($display_bytes, 0) returns expected length: $len");
578
579     unless (is(scalar @warnings, 0,
580                "Verify isUTF8_CHAR_flags() for $hex_n generated no warnings"))
581     {
582         diag "The warnings were: " . join(", ", @warnings);
583     }
584
585     undef @warnings;
586
587     $ret = test_isUTF8_CHAR_flags($bytes, $len - 1, 0);
588     is($ret, 0, "Verify isUTF8_CHAR_flags() with too short length parameter returns 0");
589
590     unless (is(scalar @warnings, 0,
591                "Verify isUTF8_CHAR_flags() generated no warnings"))
592     {
593         diag "The warnings were: " . join(", ", @warnings);
594     }
595
596     undef @warnings;
597
598     $ret = test_isSTRICT_UTF8_CHAR($bytes, $len);
599     my $expected_len = ($valid_under_strict) ? $len : 0;
600     is($ret, $expected_len, "Verify isSTRICT_UTF8_CHAR($display_bytes) returns expected length: $expected_len");
601
602     unless (is(scalar @warnings, 0,
603                "Verify isSTRICT_UTF8_CHAR() for $hex_n generated no warnings"))
604     {
605         diag "The warnings were: " . join(", ", @warnings);
606     }
607
608     undef @warnings;
609
610     $ret = test_isSTRICT_UTF8_CHAR($bytes, $len - 1);
611     is($ret, 0, "Verify isSTRICT_UTF8_CHAR() with too short length parameter returns 0");
612
613     unless (is(scalar @warnings, 0,
614                "Verify isSTRICT_UTF8_CHAR() generated no warnings"))
615     {
616         diag "The warnings were: " . join(", ", @warnings);
617     }
618
619     undef @warnings;
620
621     $ret = test_isUTF8_CHAR_flags($bytes, $len, $UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
622     is($ret, $expected_len, "Verify isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE') acts like isSTRICT_UTF8_CHAR");
623
624     unless (is(scalar @warnings, 0,
625                "Verify isUTF8_CHAR() for $hex_n generated no warnings"))
626     {
627         diag "The warnings were: " . join(", ", @warnings);
628     }
629
630     undef @warnings;
631
632     $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $len);
633     $expected_len = ($valid_under_c9strict) ? $len : 0;
634     is($ret, $expected_len, "Verify isC9_STRICT_UTF8_CHAR($display_bytes) returns expected length: $len");
635
636     unless (is(scalar @warnings, 0,
637                "Verify isC9_STRICT_UTF8_CHAR() for $hex_n generated no warnings"))
638     {
639         diag "The warnings were: " . join(", ", @warnings);
640     }
641
642     undef @warnings;
643
644     $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $len - 1);
645     is($ret, 0, "Verify isC9_STRICT_UTF8_CHAR() with too short length parameter returns 0");
646
647     unless (is(scalar @warnings, 0,
648                "Verify isC9_STRICT_UTF8_CHAR() generated no warnings"))
649     {
650         diag "The warnings were: " . join(", ", @warnings);
651     }
652
653     undef @warnings;
654
655     $ret = test_isUTF8_CHAR_flags($bytes, $len, $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
656     is($ret, $expected_len, "Verify isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like isC9_STRICT_UTF8_CHAR");
657
658     unless (is(scalar @warnings, 0,
659                "Verify isUTF8_CHAR() for $hex_n generated no warnings"))
660     {
661         diag "The warnings were: " . join(", ", @warnings);
662     }
663
664     undef @warnings;
665
666     $ret_ref = test_valid_utf8_to_uvchr($bytes);
667     is($ret_ref->[0], $n, "Verify valid_utf8_to_uvchr($display_bytes) returns $hex_n");
668     is($ret_ref->[1], $len, "Verify valid_utf8_to_uvchr() for $hex_n returns expected length: $len");
669
670     unless (is(scalar @warnings, 0,
671                "Verify valid_utf8_to_uvchr() for $hex_n generated no warnings"))
672     {
673         diag "The warnings were: " . join(", ", @warnings);
674     }
675
676     # Similarly for uvchr_to_utf8
677     my $this_uvchr_flags = $look_for_everything_uvchr_to;
678     if ($n > 2 ** 31 - 1) {
679         $this_uvchr_flags &=
680                 ~($UNICODE_DISALLOW_ABOVE_31_BIT|$UNICODE_WARN_ABOVE_31_BIT);
681     }
682     if ($n > 0x10FFFF) {
683         $this_uvchr_flags &= ~($UNICODE_DISALLOW_SUPER|$UNICODE_WARN_SUPER);
684     }
685     elsif (($n >= 0xFDD0 && $n <= 0xFDEF) || ($n & 0xFFFE) == 0xFFFE) {
686         $this_uvchr_flags &= ~($UNICODE_DISALLOW_NONCHAR|$UNICODE_WARN_NONCHAR);
687     }
688     elsif ($n >= 0xD800 && $n <= 0xDFFF) {
689         $this_uvchr_flags &= ~($UNICODE_DISALLOW_SURROGATE|$UNICODE_WARN_SURROGATE);
690     }
691     $display_flags = sprintf "0x%x", $this_uvchr_flags;
692
693     undef @warnings;
694
695     $ret = test_uvchr_to_utf8_flags($n, $this_uvchr_flags);
696     ok(defined $ret, "Verify uvchr_to_utf8_flags($hex_n, $display_flags) returned success");
697     is($ret, $bytes, "Verify uvchr_to_utf8_flags($hex_n, $display_flags) returns correct bytes");
698
699     unless (is(scalar @warnings, 0,
700         "Verify uvchr_to_utf8_flags($hex_n, $display_flags) for $hex_n generated no warnings"))
701     {
702         diag "The warnings were: " . join(", ", @warnings);
703     }
704
705     # Now append this code point to a string that we will test various
706     # versions of is_foo_utf8_string_bar on, and keep a count of how many code
707     # points are in it.  All the code points in this loop are valid in Perl's
708     # extended UTF-8, but some are not valid under various restrictions.  A
709     # string and count is kept separately that is entirely valid for each
710     # restriction.  And, for each restriction, we note the first occurrence in
711     # the unrestricted string where we find something not in the restricted
712     # string.
713     $restriction_types{""}{'valid_strings'} .= $bytes;
714     $restriction_types{""}{'valid_counts'}++;
715
716     if ($valid_under_c9strict) {
717         $restriction_types{"c9strict"}{'valid_strings'} .= $bytes;
718         $restriction_types{"c9strict"}{'valid_counts'}++;
719     }
720     elsif (! exists $restriction_types{"c9strict"}{'first_invalid_offset'}) {
721         $restriction_types{"c9strict"}{'first_invalid_offset'}
722                     = length $restriction_types{"c9strict"}{'valid_strings'};
723         $restriction_types{"c9strict"}{'first_invalid_count'}
724                             = $restriction_types{"c9strict"}{'valid_counts'};
725     }
726
727     if ($valid_under_strict) {
728         $restriction_types{"strict"}{'valid_strings'} .= $bytes;
729         $restriction_types{"strict"}{'valid_counts'}++;
730     }
731     elsif (! exists $restriction_types{"strict"}{'first_invalid_offset'}) {
732         $restriction_types{"strict"}{'first_invalid_offset'}
733                         = length $restriction_types{"strict"}{'valid_strings'};
734         $restriction_types{"strict"}{'first_invalid_count'}
735                                 = $restriction_types{"strict"}{'valid_counts'};
736     }
737
738     if ($valid_for_fits_in_31_bits) {
739         $restriction_types{"fits_in_31_bits"}{'valid_strings'} .= $bytes;
740         $restriction_types{"fits_in_31_bits"}{'valid_counts'}++;
741     }
742     elsif (! exists
743                 $restriction_types{"fits_in_31_bits"}{'first_invalid_offset'})
744     {
745         $restriction_types{"fits_in_31_bits"}{'first_invalid_offset'}
746                 = length $restriction_types{"fits_in_31_bits"}{'valid_strings'};
747         $restriction_types{"fits_in_31_bits"}{'first_invalid_count'}
748                         = $restriction_types{"fits_in_31_bits"}{'valid_counts'};
749     }
750 }
751
752 my $I8c = (isASCII) ? "\x80" : "\xa0";    # A continuation byte
753 my $cont_byte = I8_to_native($I8c);
754 my $p = (isASCII) ? "\xe1\x80" : I8_to_native("\xE4\xA0");  # partial
755
756 # The loop above tested the single or partial character functions/macros,
757 # while building up strings to test the string functions, which we do now.
758
759 for my $restriction (sort keys %restriction_types) {
760     use bytes;
761
762     for my $use_flags ("", "_flags") {
763
764         # For each restriction, we test it in both the is_foo_flags functions
765         # and the specially named foo function.  But not if there isn't such a
766         # specially named function.  Currently, this is the only tested
767         # restriction that doesn't have a specially named function
768         next if $use_flags eq "" && $restriction eq "fits_in_31_bits";
769
770         # Start building up the name of the function we will test.
771         my $base_name = "is_";
772
773         if (! $use_flags  && $restriction ne "") {
774             $base_name .= $restriction . "_";
775         }
776
777         # We test both "is_utf8_string_foo" and "is_fixed_width_buf" functions
778         foreach my $operand ('string', 'fixed_width_buf') {
779
780             # Currently, the only fixed_width_buf functions have the '_flags'
781             # suffix.
782             next if $operand eq 'fixed_width_buf' && $use_flags eq "";
783
784             my $name = "${base_name}utf8_$operand";
785
786             # We test each version of the function
787             for my $function ("_loclen", "_loc", "") {
788
789                 # We test each function against
790                 #   a) valid input
791                 #   b) invalid input created by appending an out-of-place
792                 #      continuation character to the valid string
793                 #   c) input created by appending a partial character.  This
794                 #      is valid in the 'fixed_width' functions, but invalid in
795                 #   the 'string' ones
796                 #   d) invalid input created by calling a function that is
797                 #      expecting a restricted form of the input using the string
798                 #      that's valid when unrestricted
799                 for my $error_type (0, $cont_byte, $p, $restriction) {
800                     #diag "restriction=$restriction, use_flags=$use_flags, function=$function, error_type=" . display_bytes($error_type);
801
802                     # If there is no restriction, the error type will be "",
803                     # which is redundant with 0.
804                     next if $error_type eq "";
805
806                     my $this_name = "$name$function$use_flags";
807                     my $bytes
808                             = $restriction_types{$restriction}{'valid_strings'};
809                     my $expected_offset = length $bytes;
810                     my $expected_count
811                             = $restriction_types{$restriction}{'valid_counts'};
812                     my $test_name_suffix = "";
813
814                     my $this_error_type = $error_type;
815                     if ($this_error_type) {
816
817                         # Appending a bare continuation byte or a partial
818                         # character doesn't change the character count or
819                         # offset.  But in the other cases, we have saved where
820                         # the failures should occur, so use those.  Appending
821                         # a continuation byte makes it invalid; appending a
822                         # partial character makes the 'string' form invalid,
823                         # but not the 'fixed_width_buf' form.
824                         if ($this_error_type eq $cont_byte || $this_error_type eq $p) {
825                             $bytes .= $this_error_type;
826                             if ($this_error_type eq $cont_byte) {
827                                 $test_name_suffix
828                                             = " for an unexpected continuation";
829                             }
830                             else {
831                                 $test_name_suffix
832                                         = " if ends with a partial character";
833                                 $this_error_type
834                                         = 0 if $operand eq "fixed_width_buf";
835                             }
836                         }
837                         else {
838                             $test_name_suffix
839                                         = " if contains forbidden code points";
840                             if ($this_error_type eq "c9strict") {
841                                 $bytes = $restriction_types{""}{'valid_strings'};
842                                 $expected_offset
843                                  = $restriction_types{"c9strict"}
844                                                      {'first_invalid_offset'};
845                                 $expected_count
846                                   = $restriction_types{"c9strict"}
847                                                       {'first_invalid_count'};
848                             }
849                             elsif ($this_error_type eq "strict") {
850                                 $bytes = $restriction_types{""}{'valid_strings'};
851                                 $expected_offset
852                                   = $restriction_types{"strict"}
853                                                       {'first_invalid_offset'};
854                                 $expected_count
855                                   = $restriction_types{"strict"}
856                                                       {'first_invalid_count'};
857
858                             }
859                             elsif ($this_error_type eq "fits_in_31_bits") {
860                                 $bytes = $restriction_types{""}{'valid_strings'};
861                                 $expected_offset
862                                   = $restriction_types{"fits_in_31_bits"}
863                                                       {'first_invalid_offset'};
864                                 $expected_count
865                                     = $restriction_types{"fits_in_31_bits"}
866                                                         {'first_invalid_count'};
867                             }
868                             else {
869                                 fail("Internal test error: Unknown error type "
870                                 . "'$this_error_type'");
871                                 next;
872                             }
873                         }
874                     }
875
876                     my $length = length $bytes;
877                     my $ret_ref;
878
879                     my $test = "\$ret_ref = test_$this_name(\$bytes, $length";
880
881                     # If using the _flags functions, we have to figure out what
882                     # flags to pass.  This is done to match the restriction.
883                     if ($use_flags eq "_flags") {
884                         if (! $restriction) {
885                             $test .= ", 0";     # The flag
886
887                             # Indicate the kind of flag in the test name.
888                             $this_name .= "(0)";
889                         }
890                         else {
891                             $this_name .= "($restriction)";
892                             if ($restriction eq "c9strict") {
893                                 $test
894                                   .= ", $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE";
895                             }
896                             elsif ($restriction eq "strict") {
897                                 $test .= ", $UTF8_DISALLOW_ILLEGAL_INTERCHANGE";
898                             }
899                             elsif ($restriction eq "fits_in_31_bits") {
900                                 $test .= ", $UTF8_DISALLOW_ABOVE_31_BIT";
901                             }
902                             else {
903                                 fail("Internal test error: Unknown restriction "
904                                 . "'$restriction'");
905                                 next;
906                             }
907                         }
908                     }
909                     $test .= ")";
910
911                     # Actually run the test
912                     eval $test;
913                     if ($@) {
914                         fail($test);
915                         diag $@;
916                         next;
917                     }
918
919                     my $ret;
920                     my $error_offset;
921                     my $cp_count;
922
923                     if ($function eq "") {
924                         $ret = $ret_ref;    # For plain function, there's only a
925                                             # single return value
926                     }
927                     else {  # Otherwise, the multiple values come in an array.
928                         $ret = shift @$ret_ref ;
929                         $error_offset = shift @$ret_ref;
930                         $cp_count = shift@$ret_ref if $function eq "_loclen";
931                     }
932
933                     if ($this_error_type) {
934                         is($ret, 0,
935                            "Verify $this_name is FALSE$test_name_suffix");
936                     }
937                     else {
938                         unless(is($ret, 1,
939                                   "Verify $this_name is TRUE for valid input"
940                                 . "$test_name_suffix"))
941                         {
942                             diag("The bytes starting at offset"
943                                . " $error_offset are"
944                                . display_bytes(substr(
945                                           $restriction_types{$restriction}
946                                                             {'valid_strings'},
947                                           $error_offset)));
948                             next;
949                         }
950                     }
951
952                     if ($function ne "") {
953                         unless (is($error_offset, $expected_offset,
954                                    "\tAnd returns the correct offset"))
955                         {
956                             my $min = ($error_offset < $expected_offset)
957                                     ? $error_offset
958                                     : $expected_offset;
959                             diag display_bytes(substr($bytes, $min));
960                         }
961
962                         if ($function eq '_loclen') {
963                             is($cp_count, $expected_count,
964                                "\tAnd returns the correct character count");
965                         }
966                     }
967                 }
968             }
969         }
970     }
971 }
972
973 my $REPLACEMENT = 0xFFFD;
974
975 # Now test the malformations.  All these raise category utf8 warnings.
976 my @malformations = (
977     [ "zero length string malformation", "", 0,
978         $UTF8_ALLOW_EMPTY, 0, 0,
979         qr/empty string/
980     ],
981     [ "orphan continuation byte malformation", I8_to_native("${I8c}a"),
982         2,
983         $UTF8_ALLOW_CONTINUATION, $REPLACEMENT, 1,
984         qr/unexpected continuation byte/
985     ],
986     [ "premature next character malformation (immediate)",
987         (isASCII) ? "\xc2\xc2\x80" : I8_to_native("\xc5\xc5\xa0"),
988         3,
989         $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 1,
990         qr/unexpected non-continuation byte.*immediately after start byte/
991     ],
992     [ "premature next character malformation (non-immediate)",
993         I8_to_native("\xf0${I8c}a"),
994         3,
995         $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 2,
996         qr/unexpected non-continuation byte .* 2 bytes after start byte/
997     ],
998     [ "too short malformation", I8_to_native("\xf0${I8c}a"), 2,
999         # Having the 'a' after this, but saying there are only 2 bytes also
1000         # tests that we pay attention to the passed in length
1001         $UTF8_ALLOW_SHORT, $REPLACEMENT, 2,
1002         qr/2 bytes, need 4/
1003     ],
1004     [ "overlong malformation, lowest 2-byte",
1005         (isASCII) ? "\xc0\x80" : I8_to_native("\xc0\xa0"),
1006         2,
1007         $UTF8_ALLOW_LONG,
1008         0,   # NUL
1009         2,
1010         qr/2 bytes, need 1/
1011     ],
1012     [ "overlong malformation, highest 2-byte",
1013         (isASCII) ? "\xc1\xbf" : I8_to_native("\xc4\xbf"),
1014         2,
1015         $UTF8_ALLOW_LONG,
1016         (isASCII) ? 0x7F : utf8::unicode_to_native(0xBF),
1017         2,
1018         qr/2 bytes, need 1/
1019     ],
1020     [ "overlong malformation, lowest 3-byte",
1021         (isASCII) ? "\xe0\x80\x80" : I8_to_native("\xe0\xa0\xa0"),
1022         3,
1023         $UTF8_ALLOW_LONG,
1024         0,   # NUL
1025         3,
1026         qr/3 bytes, need 1/
1027     ],
1028     [ "overlong malformation, highest 3-byte",
1029         (isASCII) ? "\xe0\x9f\xbf" : I8_to_native("\xe0\xbf\xbf"),
1030         3,
1031         $UTF8_ALLOW_LONG,
1032         (isASCII) ? 0x7FF : 0x3FF,
1033         3,
1034         qr/3 bytes, need 2/
1035     ],
1036     [ "overlong malformation, lowest 4-byte",
1037         (isASCII) ? "\xf0\x80\x80\x80" : I8_to_native("\xf0\xa0\xa0\xa0"),
1038         4,
1039         $UTF8_ALLOW_LONG,
1040         0,   # NUL
1041         4,
1042         qr/4 bytes, need 1/
1043     ],
1044     [ "overlong malformation, highest 4-byte",
1045         (isASCII) ? "\xf0\x8F\xbf\xbf" : I8_to_native("\xf0\xaf\xbf\xbf"),
1046         4,
1047         $UTF8_ALLOW_LONG,
1048         (isASCII) ? 0xFFFF : 0x3FFF,
1049         4,
1050         qr/4 bytes, need 3/
1051     ],
1052     [ "overlong malformation, lowest 5-byte",
1053         (isASCII)
1054          ?              "\xf8\x80\x80\x80\x80"
1055          : I8_to_native("\xf8\xa0\xa0\xa0\xa0"),
1056         5,
1057         $UTF8_ALLOW_LONG,
1058         0,   # NUL
1059         5,
1060         qr/5 bytes, need 1/
1061     ],
1062     [ "overlong malformation, highest 5-byte",
1063         (isASCII)
1064          ?              "\xf8\x87\xbf\xbf\xbf"
1065          : I8_to_native("\xf8\xa7\xbf\xbf\xbf"),
1066         5,
1067         $UTF8_ALLOW_LONG,
1068         (isASCII) ? 0x1FFFFF : 0x3FFFF,
1069         5,
1070         qr/5 bytes, need 4/
1071     ],
1072     [ "overlong malformation, lowest 6-byte",
1073         (isASCII)
1074          ?              "\xfc\x80\x80\x80\x80\x80"
1075          : I8_to_native("\xfc\xa0\xa0\xa0\xa0\xa0"),
1076         6,
1077         $UTF8_ALLOW_LONG,
1078         0,   # NUL
1079         6,
1080         qr/6 bytes, need 1/
1081     ],
1082     [ "overlong malformation, highest 6-byte",
1083         (isASCII)
1084          ?              "\xfc\x83\xbf\xbf\xbf\xbf"
1085          : I8_to_native("\xfc\xa3\xbf\xbf\xbf\xbf"),
1086         6,
1087         $UTF8_ALLOW_LONG,
1088         (isASCII) ? 0x3FFFFFF : 0x3FFFFF,
1089         6,
1090         qr/6 bytes, need 5/
1091     ],
1092     [ "overlong malformation, lowest 7-byte",
1093         (isASCII)
1094          ?              "\xfe\x80\x80\x80\x80\x80\x80"
1095          : I8_to_native("\xfe\xa0\xa0\xa0\xa0\xa0\xa0"),
1096         7,
1097         $UTF8_ALLOW_LONG,
1098         0,   # NUL
1099         7,
1100         qr/7 bytes, need 1/
1101     ],
1102     [ "overlong malformation, highest 7-byte",
1103         (isASCII)
1104          ?              "\xfe\x81\xbf\xbf\xbf\xbf\xbf"
1105          : I8_to_native("\xfe\xa1\xbf\xbf\xbf\xbf\xbf"),
1106         7,
1107         $UTF8_ALLOW_LONG,
1108         (isASCII) ? 0x7FFFFFFF : 0x3FFFFFF,
1109         7,
1110         qr/7 bytes, need 6/
1111     ],
1112 );
1113
1114 if (isASCII && ! $is64bit) {    # 32-bit ASCII platform
1115     no warnings 'portable';
1116     push @malformations,
1117         [ "overflow malformation",
1118             "\xfe\x84\x80\x80\x80\x80\x80",  # Represents 2**32
1119             7,
1120             0,  # There is no way to allow this malformation
1121             $REPLACEMENT,
1122             7,
1123             qr/overflow/
1124         ],
1125         [ "overflow malformation, can tell on first byte",
1126             "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
1127             13,
1128             0,  # There is no way to allow this malformation
1129             $REPLACEMENT,
1130             13,
1131             qr/overflow/
1132         ];
1133 }
1134 else {
1135     # On EBCDIC platforms, another overlong test is needed even on 32-bit
1136     # systems, whereas it doesn't happen on ASCII except on 64-bit ones.
1137
1138     no warnings 'portable';
1139     no warnings 'overflow'; # Doesn't run on 32-bit systems, but compiles
1140     push @malformations,
1141         [ "overlong malformation, lowest max-byte",
1142             (isASCII)
1143              ?              "\xff\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
1144              : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1145             (isASCII) ? 13 : 14,
1146             $UTF8_ALLOW_LONG,
1147             0,   # NUL
1148             (isASCII) ? 13 : 14,
1149             qr/1[34] bytes, need 1/,    # 1[34] to work on either ASCII or EBCDIC
1150         ],
1151         [ "overlong malformation, highest max-byte",
1152             (isASCII)    # 2**36-1 on ASCII; 2**30-1 on EBCDIC
1153              ?              "\xff\x80\x80\x80\x80\x80\x80\xbf\xbf\xbf\xbf\xbf\xbf"
1154              : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xbf\xbf\xbf\xbf\xbf\xbf"),
1155             (isASCII) ? 13 : 14,
1156             $UTF8_ALLOW_LONG,
1157             (isASCII) ? 0xFFFFFFFFF : 0x3FFFFFFF,
1158             (isASCII) ? 13 : 14,
1159             qr/1[34] bytes, need 7/,
1160         ];
1161
1162     if (! $is64bit) {   # 32-bit EBCDIC
1163         push @malformations,
1164         [ "overflow malformation",
1165             I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
1166             14,
1167             0,  # There is no way to allow this malformation
1168             $REPLACEMENT,
1169             14,
1170             qr/overflow/
1171         ];
1172     }
1173     else {  # 64-bit
1174         push @malformations,
1175             [ "overflow malformation",
1176                (isASCII)
1177                 ?              "\xff\x80\x90\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
1178                 : I8_to_native("\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1179                 (isASCII) ? 13 : 14,
1180                 0,  # There is no way to allow this malformation
1181                 $REPLACEMENT,
1182                 (isASCII) ? 13 : 14,
1183                 qr/overflow/
1184             ];
1185     }
1186 }
1187
1188 foreach my $test (@malformations) {
1189     my ($testname, $bytes, $length, $allow_flags, $allowed_uv, $expected_len, $message ) = @$test;
1190
1191     next if ! ok(length($bytes) >= $length, "$testname: Make sure won't read beyond buffer: " . length($bytes) . " >= $length");
1192
1193     undef @warnings;
1194
1195     my $ret = test_isUTF8_CHAR($bytes, $length);
1196     is($ret, 0, "$testname: isUTF8_CHAR returns 0");
1197     unless (is(scalar @warnings, 0,
1198                "$testname: isUTF8_CHAR() generated no warnings"))
1199     {
1200         diag "The warnings were: " . join(", ", @warnings);
1201     }
1202
1203     undef @warnings;
1204
1205     $ret = test_isUTF8_CHAR_flags($bytes, $length, 0);
1206     is($ret, 0, "$testname: isUTF8_CHAR_flags returns 0");
1207     unless (is(scalar @warnings, 0,
1208                "$testname: isUTF8_CHAR() generated no warnings"))
1209     {
1210         diag "The warnings were: " . join(", ", @warnings);
1211     }
1212
1213     $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
1214     is($ret, 0, "$testname: isSTRICT_UTF8_CHAR returns 0");
1215     unless (is(scalar @warnings, 0,
1216                "$testname: isSTRICT_UTF8_CHAR() generated no warnings"))
1217     {
1218         diag "The warnings were: " . join(", ", @warnings);
1219     }
1220
1221     $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
1222     is($ret, 0, "$testname: isC9_STRICT_UTF8_CHAR returns 0");
1223     unless (is(scalar @warnings, 0,
1224                "$testname: isC9_STRICT_UTF8_CHAR() generated no warnings"))
1225     {
1226         diag "The warnings were: " . join(", ", @warnings);
1227     }
1228
1229     for my $j (1 .. $length - 1) {
1230         my $partial = substr($bytes, 0, $j);
1231
1232         undef @warnings;
1233
1234         $ret = test_is_utf8_valid_partial_char_flags($bytes, $j, 0);
1235         my $ret_should_be = 0;
1236         my $comment = "";
1237         if ($testname =~ /premature|short/ && $j < 2) {
1238             $ret_should_be = 1;
1239             $comment = ", but need 2 bytes to discern:";
1240         }
1241         elsif ($testname =~ /overlong/ && ! isASCII && $length == 3) {
1242             # 3-byte overlongs on EBCDIC are determinable on the first byte
1243         }
1244         elsif ($testname =~ /overlong/ && $length > 2) {
1245             if ($length <= 7 && $j < 2) {
1246                 $ret_should_be = 1;
1247                 $comment = ", but need 2 bytes to discern:";
1248             }
1249             elsif ($length > 7 && $j < 7) {
1250                 $ret_should_be = 1;
1251                 $comment = ", but need 7 bytes to discern:";
1252             }
1253         }
1254         elsif ($testname =~ /overflow/ && $testname !~ /first byte/) {
1255             if (isASCII) {
1256                 if ($j < (($is64bit) ? 3 : 2)) {
1257                     $comment = ", but need $j bytes to discern:";
1258                     $ret_should_be = 1;
1259                 }
1260             }
1261             else {
1262                 if ($j < (($is64bit) ? 2 : 8)) {
1263                     $comment = ", but need $j bytes to discern:";
1264                     $ret_should_be = 1;
1265                 }
1266             }
1267         }
1268         is($ret, $ret_should_be, "$testname: is_utf8_valid_partial_char_flags("
1269                                 . display_bytes($partial)
1270                                 . ")$comment returns $ret_should_be");
1271         unless (is(scalar @warnings, 0,
1272                 "$testname: is_utf8_valid_partial_char_flags() generated no warnings"))
1273         {
1274             diag "The warnings were: " . join(", ", @warnings);
1275         }
1276     }
1277
1278
1279     # Test what happens when this malformation is not allowed
1280     undef @warnings;
1281     my $ret_ref = test_utf8n_to_uvchr($bytes, $length, 0);
1282     is($ret_ref->[0], 0, "$testname: disallowed: Returns 0");
1283     is($ret_ref->[1], $expected_len, "$testname: utf8n_to_uvchr(), disallowed: Returns expected length: $expected_len");
1284     if (is(scalar @warnings, 1, "$testname: disallowed: Got a single warning ")) {
1285         like($warnings[0], $message, "$testname: disallowed: Got expected warning");
1286     }
1287     else {
1288         if (scalar @warnings) {
1289             diag "The warnings were: " . join(", ", @warnings);
1290         }
1291     }
1292
1293     {   # Next test when disallowed, and warnings are off.
1294         undef @warnings;
1295         no warnings 'utf8';
1296         my $ret_ref = test_utf8n_to_uvchr($bytes, $length, 0);
1297         is($ret_ref->[0], 0, "$testname: utf8n_to_uvchr(), disallowed: no warnings 'utf8': Returns 0");
1298         is($ret_ref->[1], $expected_len, "$testname: utf8n_to_uvchr(), disallowed: no warnings 'utf8': Returns expected length: $expected_len");
1299         if (!is(scalar @warnings, 0, "$testname: utf8n_to_uvchr(), disallowed: no warnings 'utf8': no warnings generated")) {
1300             diag "The warnings were: " . join(", ", @warnings);
1301         }
1302     }
1303
1304     # Test with CHECK_ONLY
1305     undef @warnings;
1306     $ret_ref = test_utf8n_to_uvchr($bytes, $length, $UTF8_CHECK_ONLY);
1307     is($ret_ref->[0], 0, "$testname: CHECK_ONLY: Returns 0");
1308     is($ret_ref->[1], -1, "$testname: CHECK_ONLY: returns -1 for length");
1309     if (! is(scalar @warnings, 0, "$testname: CHECK_ONLY: no warnings generated")) {
1310         diag "The warnings were: " . join(", ", @warnings);
1311     }
1312
1313     next if $allow_flags == 0;    # Skip if can't allow this malformation
1314
1315     # Test when the malformation is allowed
1316     undef @warnings;
1317     $ret_ref = test_utf8n_to_uvchr($bytes, $length, $allow_flags);
1318     is($ret_ref->[0], $allowed_uv, "$testname: utf8n_to_uvchr(), allowed: Returns expected uv: " . sprintf("0x%04X", $allowed_uv));
1319     is($ret_ref->[1], $expected_len, "$testname: utf8n_to_uvchr(), allowed: Returns expected length: $expected_len");
1320     if (!is(scalar @warnings, 0, "$testname: utf8n_to_uvchr(), allowed: no warnings generated"))
1321     {
1322         diag "The warnings were: " . join(", ", @warnings);
1323     }
1324 }
1325
1326 # Now test the cases where a legal code point is generated, but may or may not
1327 # be allowed/warned on.
1328 my @tests = (
1329     [ "lowest surrogate",
1330         (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
1331         $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
1332         'surrogate', 0xD800,
1333         (isASCII) ? 3 : 4,
1334         qr/surrogate/
1335     ],
1336     [ "a middle surrogate",
1337         (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
1338         $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
1339         'surrogate', 0xD90D,
1340         (isASCII) ? 3 : 4,
1341         qr/surrogate/
1342     ],
1343     [ "highest surrogate",
1344         (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
1345         $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
1346         'surrogate', 0xDFFF,
1347         (isASCII) ? 3 : 4,
1348         qr/surrogate/
1349     ],
1350     [ "first non_unicode",
1351         (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
1352         $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
1353         'non_unicode', 0x110000,
1354         (isASCII) ? 4 : 5,
1355         qr/not Unicode.* may not be portable/
1356     ],
1357     [ "non_unicode whose first byte tells that",
1358         (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
1359         $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
1360         'non_unicode',
1361         (isASCII) ? 0x140000 : 0x200000,
1362         (isASCII) ? 4 : 5,
1363         qr/not Unicode.* may not be portable/
1364     ],
1365     [ "first of 32 consecutive non-character code points",
1366         (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
1367         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1368         'nonchar', 0xFDD0,
1369         (isASCII) ? 3 : 4,
1370         qr/Unicode non-character.*is not recommended for open interchange/
1371     ],
1372     [ "a mid non-character code point of the 32 consecutive ones",
1373         (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
1374         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1375         'nonchar', 0xFDE0,
1376         (isASCII) ? 3 : 4,
1377         qr/Unicode non-character.*is not recommended for open interchange/
1378     ],
1379     [ "final of 32 consecutive non-character code points",
1380         (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
1381         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1382         'nonchar', 0xFDEF,
1383         (isASCII) ? 3 : 4,
1384         qr/Unicode non-character.*is not recommended for open interchange/
1385     ],
1386     [ "non-character code point U+FFFE",
1387         (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
1388         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1389         'nonchar', 0xFFFE,
1390         (isASCII) ? 3 : 4,
1391         qr/Unicode non-character.*is not recommended for open interchange/
1392     ],
1393     [ "non-character code point U+FFFF",
1394         (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
1395         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1396         'nonchar', 0xFFFF,
1397         (isASCII) ? 3 : 4,
1398         qr/Unicode non-character.*is not recommended for open interchange/
1399     ],
1400     [ "non-character code point U+1FFFE",
1401         (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
1402         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1403         'nonchar', 0x1FFFE, 4,
1404         qr/Unicode non-character.*is not recommended for open interchange/
1405     ],
1406     [ "non-character code point U+1FFFF",
1407         (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
1408         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1409         'nonchar', 0x1FFFF, 4,
1410         qr/Unicode non-character.*is not recommended for open interchange/
1411     ],
1412     [ "non-character code point U+2FFFE",
1413         (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
1414         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1415         'nonchar', 0x2FFFE, 4,
1416         qr/Unicode non-character.*is not recommended for open interchange/
1417     ],
1418     [ "non-character code point U+2FFFF",
1419         (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
1420         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1421         'nonchar', 0x2FFFF, 4,
1422         qr/Unicode non-character.*is not recommended for open interchange/
1423     ],
1424     [ "non-character code point U+3FFFE",
1425         (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
1426         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1427         'nonchar', 0x3FFFE, 4,
1428         qr/Unicode non-character.*is not recommended for open interchange/
1429     ],
1430     [ "non-character code point U+3FFFF",
1431         (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
1432         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1433         'nonchar', 0x3FFFF, 4,
1434         qr/Unicode non-character.*is not recommended for open interchange/
1435     ],
1436     [ "non-character code point U+4FFFE",
1437         (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
1438         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1439         'nonchar', 0x4FFFE,
1440         (isASCII) ? 4 : 5,
1441         qr/Unicode non-character.*is not recommended for open interchange/
1442     ],
1443     [ "non-character code point U+4FFFF",
1444         (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
1445         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1446         'nonchar', 0x4FFFF,
1447         (isASCII) ? 4 : 5,
1448         qr/Unicode non-character.*is not recommended for open interchange/
1449     ],
1450     [ "non-character code point U+5FFFE",
1451         (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
1452         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1453         'nonchar', 0x5FFFE,
1454         (isASCII) ? 4 : 5,
1455         qr/Unicode non-character.*is not recommended for open interchange/
1456     ],
1457     [ "non-character code point U+5FFFF",
1458         (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
1459         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1460         'nonchar', 0x5FFFF,
1461         (isASCII) ? 4 : 5,
1462         qr/Unicode non-character.*is not recommended for open interchange/
1463     ],
1464     [ "non-character code point U+6FFFE",
1465         (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
1466         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1467         'nonchar', 0x6FFFE,
1468         (isASCII) ? 4 : 5,
1469         qr/Unicode non-character.*is not recommended for open interchange/
1470     ],
1471     [ "non-character code point U+6FFFF",
1472         (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
1473         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1474         'nonchar', 0x6FFFF,
1475         (isASCII) ? 4 : 5,
1476         qr/Unicode non-character.*is not recommended for open interchange/
1477     ],
1478     [ "non-character code point U+7FFFE",
1479         (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
1480         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1481         'nonchar', 0x7FFFE,
1482         (isASCII) ? 4 : 5,
1483         qr/Unicode non-character.*is not recommended for open interchange/
1484     ],
1485     [ "non-character code point U+7FFFF",
1486         (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
1487         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1488         'nonchar', 0x7FFFF,
1489         (isASCII) ? 4 : 5,
1490         qr/Unicode non-character.*is not recommended for open interchange/
1491     ],
1492     [ "non-character code point U+8FFFE",
1493         (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
1494         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1495         'nonchar', 0x8FFFE,
1496         (isASCII) ? 4 : 5,
1497         qr/Unicode non-character.*is not recommended for open interchange/
1498     ],
1499     [ "non-character code point U+8FFFF",
1500         (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
1501         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1502         'nonchar', 0x8FFFF,
1503         (isASCII) ? 4 : 5,
1504         qr/Unicode non-character.*is not recommended for open interchange/
1505     ],
1506     [ "non-character code point U+9FFFE",
1507         (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
1508         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1509         'nonchar', 0x9FFFE,
1510         (isASCII) ? 4 : 5,
1511         qr/Unicode non-character.*is not recommended for open interchange/
1512     ],
1513     [ "non-character code point U+9FFFF",
1514         (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
1515         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1516         'nonchar', 0x9FFFF,
1517         (isASCII) ? 4 : 5,
1518         qr/Unicode non-character.*is not recommended for open interchange/
1519     ],
1520     [ "non-character code point U+AFFFE",
1521         (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
1522         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1523         'nonchar', 0xAFFFE,
1524         (isASCII) ? 4 : 5,
1525         qr/Unicode non-character.*is not recommended for open interchange/
1526     ],
1527     [ "non-character code point U+AFFFF",
1528         (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
1529         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1530         'nonchar', 0xAFFFF,
1531         (isASCII) ? 4 : 5,
1532         qr/Unicode non-character.*is not recommended for open interchange/
1533     ],
1534     [ "non-character code point U+BFFFE",
1535         (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
1536         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1537         'nonchar', 0xBFFFE,
1538         (isASCII) ? 4 : 5,
1539         qr/Unicode non-character.*is not recommended for open interchange/
1540     ],
1541     [ "non-character code point U+BFFFF",
1542         (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
1543         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1544         'nonchar', 0xBFFFF,
1545         (isASCII) ? 4 : 5,
1546         qr/Unicode non-character.*is not recommended for open interchange/
1547     ],
1548     [ "non-character code point U+CFFFE",
1549         (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
1550         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1551         'nonchar', 0xCFFFE,
1552         (isASCII) ? 4 : 5,
1553         qr/Unicode non-character.*is not recommended for open interchange/
1554     ],
1555     [ "non-character code point U+CFFFF",
1556         (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
1557         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1558         'nonchar', 0xCFFFF,
1559         (isASCII) ? 4 : 5,
1560         qr/Unicode non-character.*is not recommended for open interchange/
1561     ],
1562     [ "non-character code point U+DFFFE",
1563         (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
1564         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1565         'nonchar', 0xDFFFE,
1566         (isASCII) ? 4 : 5,
1567         qr/Unicode non-character.*is not recommended for open interchange/
1568     ],
1569     [ "non-character code point U+DFFFF",
1570         (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
1571         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1572         'nonchar', 0xDFFFF,
1573         (isASCII) ? 4 : 5,
1574         qr/Unicode non-character.*is not recommended for open interchange/
1575     ],
1576     [ "non-character code point U+EFFFE",
1577         (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
1578         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1579         'nonchar', 0xEFFFE,
1580         (isASCII) ? 4 : 5,
1581         qr/Unicode non-character.*is not recommended for open interchange/
1582     ],
1583     [ "non-character code point U+EFFFF",
1584         (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
1585         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1586         'nonchar', 0xEFFFF,
1587         (isASCII) ? 4 : 5,
1588         qr/Unicode non-character.*is not recommended for open interchange/
1589     ],
1590     [ "non-character code point U+FFFFE",
1591         (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
1592         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1593         'nonchar', 0xFFFFE,
1594         (isASCII) ? 4 : 5,
1595         qr/Unicode non-character.*is not recommended for open interchange/
1596     ],
1597     [ "non-character code point U+FFFFF",
1598         (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
1599         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1600         'nonchar', 0xFFFFF,
1601         (isASCII) ? 4 : 5,
1602         qr/Unicode non-character.*is not recommended for open interchange/
1603     ],
1604     [ "non-character code point U+10FFFE",
1605         (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
1606         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1607         'nonchar', 0x10FFFE,
1608         (isASCII) ? 4 : 5,
1609         qr/Unicode non-character.*is not recommended for open interchange/
1610     ],
1611     [ "non-character code point U+10FFFF",
1612         (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
1613         $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
1614         'nonchar', 0x10FFFF,
1615         (isASCII) ? 4 : 5,
1616         qr/Unicode non-character.*is not recommended for open interchange/
1617     ],
1618     [ "requires at least 32 bits",
1619         (isASCII)
1620          ? "\xfe\x82\x80\x80\x80\x80\x80"
1621          : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
1622         # This code point is chosen so that it is representable in a UV on
1623         # 32-bit machines
1624         $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
1625         'utf8', 0x80000000, (isASCII) ? 7 :14,
1626         qr/Code point 0x80000000 is not Unicode, and not portable/
1627     ],
1628     [ "requires at least 32 bits, and use SUPER-type flags, instead of ABOVE_31_BIT",
1629         (isASCII)
1630          ? "\xfe\x82\x80\x80\x80\x80\x80"
1631          : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
1632         $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
1633         'utf8', 0x80000000, (isASCII) ? 7 :14,
1634         qr/Code point 0x80000000 is not Unicode, and not portable/
1635     ],
1636     [ "overflow with warnings/disallow for more than 31 bits",
1637         # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT
1638         # with overflow.  The overflow malformation is never allowed, so
1639         # preventing it takes precedence if the ABOVE_31_BIT options would
1640         # otherwise allow in an overflowing value.  The ASCII code points (1
1641         # for 32-bits; 1 for 64) were chosen because the old overflow
1642         # detection algorithm did not catch them; this means this test also
1643         # checks for that fix.  The EBCDIC are arbitrary overflowing ones
1644         # since we have no reports of failures with it.
1645        (($is64bit)
1646         ? ((isASCII)
1647            ?              "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
1648            : I8_to_native("\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"))
1649         : ((isASCII)
1650            ?              "\xfe\x86\x80\x80\x80\x80\x80"
1651            : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))),
1652
1653         # We include both warning categories to make sure the ABOVE_31_BIT one
1654         # has precedence
1655         "$UTF8_WARN_ABOVE_31_BIT|$UTF8_WARN_SUPER",
1656         "$UTF8_DISALLOW_ABOVE_31_BIT",
1657         'utf8', 0,
1658         (! isASCII) ? 14 : ($is64bit) ? 13 : 7,
1659         qr/overflow at byte .*, after start byte 0xf/
1660     ],
1661 );
1662
1663 if ($is64bit) {
1664     no warnings qw{portable overflow};
1665     push @tests,
1666         [ "More than 32 bits",
1667             (isASCII)
1668             ?              "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
1669             : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1670             $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
1671             'utf8', 0x1000000000, (isASCII) ? 13 : 14,
1672             qr/Code point 0x.* is not Unicode, and not portable/
1673         ];
1674     if (! isASCII) {
1675         push @tests,   # These could falsely show wrongly in a naive implementation
1676             [ "requires at least 32 bits",
1677                 I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1678                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
1679                 'utf8', 0x800000000, 14,
1680                 qr/Code point 0x800000000 is not Unicode, and not portable/
1681             ],
1682             [ "requires at least 32 bits",
1683                 I8_to_native("\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1684                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
1685                 'utf8', 0x10000000000, 14,
1686                 qr/Code point 0x10000000000 is not Unicode, and not portable/
1687             ],
1688             [ "requires at least 32 bits",
1689                 I8_to_native("\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1690                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
1691                 'utf8', 0x200000000000, 14,
1692                 qr/Code point 0x200000000000 is not Unicode, and not portable/
1693             ],
1694             [ "requires at least 32 bits",
1695                 I8_to_native("\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1696                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
1697                 'utf8', 0x4000000000000, 14,
1698                 qr/Code point 0x4000000000000 is not Unicode, and not portable/
1699             ],
1700             [ "requires at least 32 bits",
1701                 I8_to_native("\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1702                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
1703                 'utf8', 0x80000000000000, 14,
1704                 qr/Code point 0x80000000000000 is not Unicode, and not portable/
1705             ],
1706             [ "requires at least 32 bits",
1707                 I8_to_native("\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1708                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
1709                 'utf8', 0x1000000000000000, 14,
1710                 qr/Code point 0x1000000000000000 is not Unicode, and not portable/
1711             ];
1712     }
1713 }
1714
1715 foreach my $test (@tests) {
1716     my ($testname, $bytes, $warn_flags, $disallow_flags, $category, $allowed_uv, $expected_len, $message ) = @$test;
1717
1718     my $length = length $bytes;
1719     my $will_overflow = $testname =~ /overflow/;
1720
1721     {
1722         use warnings;
1723         undef @warnings;
1724         my $ret = test_isUTF8_CHAR($bytes, $length);
1725         my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
1726         if ($will_overflow) {
1727             is($ret, 0, "isUTF8_CHAR() $testname: returns 0");
1728             is($ret_flags, 0, "isUTF8_CHAR_flags() $testname: returns 0");
1729         }
1730         else {
1731             is($ret, $length,
1732                "isUTF8_CHAR() $testname: returns expected length: $length");
1733             is($ret_flags, $length,
1734                "isUTF8_CHAR_flags(...,0) $testname: returns expected length: $length");
1735         }
1736         unless (is(scalar @warnings, 0,
1737                 "isUTF8_CHAR() and isUTF8_CHAR()_flags $testname: generated no warnings"))
1738         {
1739             diag "The warnings were: " . join(", ", @warnings);
1740         }
1741
1742         undef @warnings;
1743         $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
1744         if ($will_overflow) {
1745             is($ret, 0, "isSTRICT_UTF8_CHAR() $testname: returns 0");
1746         }
1747         else {
1748             my $expected_ret = (   $testname =~ /surrogate|non-character/
1749                                 || $allowed_uv > 0x10FFFF)
1750                                ? 0
1751                                : $length;
1752             is($ret, $expected_ret,
1753                "isSTRICT_UTF8_CHAR() $testname: returns expected length: $expected_ret");
1754             $ret = test_isUTF8_CHAR_flags($bytes, $length,
1755                                           $UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
1756             is($ret, $expected_ret,
1757                "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE') acts like isSTRICT_UTF8_CHAR");
1758         }
1759         unless (is(scalar @warnings, 0,
1760                 "isSTRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname: generated no warnings"))
1761         {
1762             diag "The warnings were: " . join(", ", @warnings);
1763         }
1764
1765         undef @warnings;
1766         $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
1767         if ($will_overflow) {
1768             is($ret, 0, "isC9_STRICT_UTF8_CHAR() $testname: returns 0");
1769         }
1770         else {
1771             my $expected_ret = (   $testname =~ /surrogate/
1772                                 || $allowed_uv > 0x10FFFF)
1773                                ? 0
1774                                : $length;
1775             is($ret, $expected_ret,
1776                "isC9_STRICT_UTF8_CHAR() $testname: returns expected length: $expected_ret");
1777             $ret = test_isUTF8_CHAR_flags($bytes, $length,
1778                                           $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
1779             is($ret, $expected_ret,
1780                "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like isC9_STRICT_UTF8_CHAR");
1781         }
1782         unless (is(scalar @warnings, 0,
1783                 "isC9_STRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname: generated no warnings"))
1784         {
1785             diag "The warnings were: " . join(", ", @warnings);
1786         }
1787
1788         # Test partial character handling, for each byte not a full character
1789         for my $j (1.. $length - 1) {
1790
1791             # Skip the test for the interaction between overflow and above-31
1792             # bit.  It is really testing other things than the partial
1793             # character tests, for which other tests in this file are
1794             # sufficient
1795             last if $testname =~ /overflow/;
1796
1797             foreach my $disallow_flag (0, $disallow_flags) {
1798                 my $partial = substr($bytes, 0, $j);
1799                 my $ret_should_be;
1800                 my $comment;
1801                 if ($disallow_flag) {
1802                     $ret_should_be = 0;
1803                     $comment = "disallowed";
1804                 }
1805                 else {
1806                     $ret_should_be = 1;
1807                     $comment = "allowed";
1808                 }
1809
1810                 if ($disallow_flag) {
1811                     if ($testname =~ /non-character/) {
1812                         $ret_should_be = 1;
1813                         $comment .= ", but but need full char to discern";
1814                     }
1815                     elsif ($testname =~ /surrogate/) {
1816                         if ($j < 2) {
1817                             $ret_should_be = 1;
1818                             $comment .= ", but need 2 bytes to discern";
1819                         }
1820                     }
1821                     elsif (   ($disallow_flags & $UTF8_DISALLOW_SUPER)
1822                            && $j < 2
1823                            && ord(native_to_I8(substr($bytes, 0, 1)))
1824                                lt ((isASCII) ? 0xF5 : 0xFA))
1825                     {
1826                         $ret_should_be = 1;
1827                         $comment .= ", but need 2 bytes to discern";
1828                     }
1829                     elsif (   ! isASCII
1830                            && $testname =~ /requires at least 32 bits/)
1831                     {
1832                         # On EBCDIC, the boundary between 31 and 32 bits is
1833                         # more complicated.
1834                         $ret_should_be = 1 if native_to_I8($partial) le
1835                      "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF";
1836                     }
1837                 }
1838
1839                 undef @warnings;
1840
1841                 $ret = test_is_utf8_valid_partial_char_flags($partial, $j, $disallow_flag);
1842                 is($ret, $ret_should_be, "$testname: is_utf8_valid_partial_char_flags("
1843                                         . display_bytes($partial)
1844                                         . "), $comment: returns $ret_should_be");
1845                 unless (is(scalar @warnings, 0,
1846                         "$testname: is_utf8_valid_partial_char_flags() generated no warnings"))
1847                 {
1848                     diag "The warnings were: " . join(", ", @warnings);
1849                 }
1850             }
1851         }
1852     }
1853
1854     # This is more complicated than the malformations tested earlier, as there
1855     # are several orthogonal variables involved.  We test all the subclasses
1856     # of utf8 warnings to verify they work with and without the utf8 class,
1857     # and don't have effects on other sublass warnings
1858     foreach my $warning ('utf8', 'surrogate', 'nonchar', 'non_unicode') {
1859         foreach my $warn_flag (0, $warn_flags) {
1860             foreach my $disallow_flag (0, $disallow_flags) {
1861                 foreach my $do_warning (0, 1) {
1862
1863                     my $eval_warn = $do_warning
1864                                   ? "use warnings '$warning'"
1865                                   : $warning eq "utf8"
1866                                     ? "no warnings 'utf8'"
1867                                     : "use warnings 'utf8'; no warnings '$warning'";
1868
1869                     # is effectively disallowed if will overflow, even if the
1870                     # flag indicates it is allowed, fix up test name to
1871                     # indicate this as well
1872                     my $disallowed = $disallow_flag || $will_overflow;
1873
1874                     my $this_name = "utf8n_to_uvchr() $testname: " . (($disallow_flag)
1875                                                     ? 'disallowed'
1876                                                     : ($disallowed)
1877                                                         ? 'ABOVE_31_BIT allowed'
1878                                                         : 'allowed');
1879                     $this_name .= ", $eval_warn";
1880                     $this_name .= ", " . (($warn_flag)
1881                                           ? 'with warning flag'
1882                                           : 'no warning flag');
1883
1884                     undef @warnings;
1885                     my $ret_ref;
1886                     my $display_bytes = display_bytes($bytes);
1887                     my $call = "Call was: $eval_warn; \$ret_ref = test_utf8n_to_uvchr('$display_bytes', $length, $warn_flag|$disallow_flag)";
1888                     my $eval_text =      "$eval_warn; \$ret_ref = test_utf8n_to_uvchr('$bytes', $length, $warn_flag|$disallow_flag)";
1889                     eval "$eval_text";
1890                     if (! ok ("$@ eq ''", "$this_name: eval succeeded")) {
1891                         diag "\$!='$!'; eval'd=\"$call\"";
1892                         next;
1893                     }
1894                     if ($disallowed) {
1895                         unless (is($ret_ref->[0], 0, "$this_name: Returns 0"))
1896                         {
1897                             diag $call;
1898                         }
1899                     }
1900                     else {
1901                         unless (is($ret_ref->[0], $allowed_uv,
1902                                    "$this_name: Returns expected uv: "
1903                                  . sprintf("0x%04X", $allowed_uv)))
1904                         {
1905                             diag $call;
1906                         }
1907                     }
1908                     unless (is($ret_ref->[1], $expected_len,
1909                         "$this_name: Returns expected length: $expected_len"))
1910                     {
1911                         diag $call;
1912                     }
1913
1914                     if (! $do_warning
1915                         && ($warning eq 'utf8' || $warning eq $category))
1916                     {
1917                         if (!is(scalar @warnings, 0,
1918                                             "$this_name: No warnings generated"))
1919                         {
1920                             diag $call;
1921                             diag "The warnings were: " . join(", ", @warnings);
1922                         }
1923                     }
1924                     elsif ($will_overflow
1925                            && ! $disallow_flag
1926                            && $warning eq 'utf8')
1927                     {
1928
1929                         # Will get the overflow message instead of the expected
1930                         # message under these circumstances, as they would
1931                         # otherwise accept an overflowed value, which the code
1932                         # should not allow, so falls back to overflow.
1933                         if (is(scalar @warnings, 1,
1934                                "$this_name: Got a single warning "))
1935                         {
1936                             unless (like($warnings[0], qr/overflow/,
1937                                         "$this_name: Got overflow warning"))
1938                             {
1939                                 diag $call;
1940                             }
1941                         }
1942                         else {
1943                             diag $call;
1944                             if (scalar @warnings) {
1945                                 diag "The warnings were: "
1946                                                         . join(", ", @warnings);
1947                             }
1948                         }
1949                     }
1950                     elsif ($warn_flag
1951                            && ($warning eq 'utf8' || $warning eq $category))
1952                     {
1953                         if (is(scalar @warnings, 1,
1954                                "$this_name: Got a single warning "))
1955                         {
1956                             unless (like($warnings[0], $message,
1957                                         "$this_name: Got expected warning"))
1958                             {
1959                                 diag $call;
1960                             }
1961                         }
1962                         else {
1963                             diag $call;
1964                             if (scalar @warnings) {
1965                                 diag "The warnings were: "
1966                                                         . join(", ", @warnings);
1967                             }
1968                         }
1969                     }
1970
1971                     # Check CHECK_ONLY results when the input is disallowed.  Do
1972                     # this when actually disallowed, not just when the
1973                     # $disallow_flag is set
1974                     if ($disallowed) {
1975                         undef @warnings;
1976                         $ret_ref = test_utf8n_to_uvchr($bytes, $length,
1977                                                 $disallow_flag|$UTF8_CHECK_ONLY);
1978                         unless (is($ret_ref->[0], 0, "$this_name, CHECK_ONLY: Returns 0")) {
1979                             diag $call;
1980                         }
1981                         unless (is($ret_ref->[1], -1,
1982                             "$this_name: CHECK_ONLY: returns -1 for length"))
1983                         {
1984                             diag $call;
1985                         }
1986                         if (! is(scalar @warnings, 0,
1987                             "$this_name, CHECK_ONLY: no warnings generated"))
1988                         {
1989                             diag $call;
1990                             diag "The warnings were: " . join(", ", @warnings);
1991                         }
1992                     }
1993
1994                     # Now repeat some of the above, but for
1995                     # uvchr_to_utf8_flags().  Since this comes from an
1996                     # existing code point, it hasn't overflowed.
1997                     next if $will_overflow;
1998
1999                     # The warning and disallow flags passed in are for
2000                     # utf8n_to_uvchr().  Convert them for
2001                     # uvchr_to_utf8_flags().
2002                     my $uvchr_warn_flag = 0;
2003                     my $uvchr_disallow_flag = 0;
2004                     if ($warn_flag) {
2005                         if ($warn_flag == $UTF8_WARN_SURROGATE) {
2006                             $uvchr_warn_flag = $UNICODE_WARN_SURROGATE
2007                         }
2008                         elsif ($warn_flag == $UTF8_WARN_NONCHAR) {
2009                             $uvchr_warn_flag = $UNICODE_WARN_NONCHAR
2010                         }
2011                         elsif ($warn_flag == $UTF8_WARN_SUPER) {
2012                             $uvchr_warn_flag = $UNICODE_WARN_SUPER
2013                         }
2014                         elsif ($warn_flag == $UTF8_WARN_ABOVE_31_BIT) {
2015                             $uvchr_warn_flag = $UNICODE_WARN_ABOVE_31_BIT;
2016                         }
2017                         else {
2018                             fail(sprintf "Unexpected warn flag: %x",
2019                                  $warn_flag);
2020                             next;
2021                         }
2022                     }
2023                     if ($disallow_flag) {
2024                         if ($disallow_flag == $UTF8_DISALLOW_SURROGATE) {
2025                             $uvchr_disallow_flag = $UNICODE_DISALLOW_SURROGATE
2026                         }
2027                         elsif ($disallow_flag == $UTF8_DISALLOW_NONCHAR) {
2028                             $uvchr_disallow_flag = $UNICODE_DISALLOW_NONCHAR
2029                         }
2030                         elsif ($disallow_flag == $UTF8_DISALLOW_SUPER) {
2031                             $uvchr_disallow_flag = $UNICODE_DISALLOW_SUPER
2032                         }
2033                         elsif ($disallow_flag == $UTF8_DISALLOW_ABOVE_31_BIT) {
2034                             $uvchr_disallow_flag =
2035                             $UNICODE_DISALLOW_ABOVE_31_BIT;
2036                         }
2037                         else {
2038                             fail(sprintf "Unexpected disallow flag: %x",
2039                                  $disallow_flag);
2040                             next;
2041                         }
2042                     }
2043
2044                     $disallowed = $uvchr_disallow_flag;
2045
2046                     $this_name = "uvchr_to_utf8_flags() $testname: "
2047                                                   . (($uvchr_disallow_flag)
2048                                                     ? 'disallowed'
2049                                                     : ($disallowed)
2050                                                       ? 'ABOVE_31_BIT allowed'
2051                                                       : 'allowed');
2052                     $this_name .= ", $eval_warn";
2053                     $this_name .= ", " . (($uvchr_warn_flag)
2054                                           ? 'with warning flag'
2055                                           : 'no warning flag');
2056
2057                     undef @warnings;
2058                     my $ret;
2059                     my $warn_flag = sprintf "0x%x", $uvchr_warn_flag;
2060                     my $disallow_flag = sprintf "0x%x", $uvchr_disallow_flag;
2061                     $call = sprintf "call was: $eval_warn; \$ret = test_uvchr_to_utf8_flags(0x%x, $warn_flag|$disallow_flag)", $allowed_uv;
2062                     $eval_text = "$eval_warn; \$ret = test_uvchr_to_utf8_flags($allowed_uv, $warn_flag|$disallow_flag)";
2063                     eval "$eval_text";
2064                     if (! ok ("$@ eq ''", "$this_name: eval succeeded")) {
2065                         diag "\$!='$!'; eval'd=\"$eval_text\"";
2066                         next;
2067                     }
2068                     if ($disallowed) {
2069                         unless (is($ret, undef, "$this_name: Returns undef")) {
2070                             diag $call;
2071                         }
2072                     }
2073                     else {
2074                         unless (is($ret, $bytes, "$this_name: Returns expected string")) {
2075                             diag $call;
2076                         }
2077                     }
2078                     if (! $do_warning
2079                         && ($warning eq 'utf8' || $warning eq $category))
2080                     {
2081                         if (!is(scalar @warnings, 0,
2082                                             "$this_name: No warnings generated"))
2083                         {
2084                             diag $call;
2085                             diag "The warnings were: " . join(", ", @warnings);
2086                         }
2087                     }
2088                     elsif ($uvchr_warn_flag
2089                            && ($warning eq 'utf8' || $warning eq $category))
2090                     {
2091                         if (is(scalar @warnings, 1,
2092                                "$this_name: Got a single warning "))
2093                         {
2094                             unless (like($warnings[0], $message,
2095                                             "$this_name: Got expected warning"))
2096                             {
2097                                 diag $call;
2098                             }
2099                         }
2100                         else {
2101                             diag $call;
2102                             if (scalar @warnings) {
2103                                 diag "The warnings were: "
2104                                                         . join(", ", @warnings);
2105                             }
2106                         }
2107                     }
2108                 }
2109             }
2110         }
2111     }
2112 }
2113
2114 done_testing;