This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
APItest/t/utf8.t: Fix failing EBCDIC tests
[perl5.git] / ext / XS-APItest / t / utf8.t
CommitLineData
fed3ba5d
NC
1#!perl -w
2
3use strict;
4use Test::More;
7dfd8446 5$|=1;
fed3ba5d 6
760c7c2f
KW
7no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
8 # machines, and that is tested elsewhere
9
fed3ba5d 10use XS::APItest;
f9380377 11use Data::Dumper;
4deba822
KW
12my $pound_sign = chr utf8::unicode_to_native(163);
13
7dfd8446
KW
14sub isASCII { ord "A" == 65 }
15
16sub display_bytes {
9d2d0ecd 17 use bytes;
7dfd8446
KW
18 my $string = shift;
19 return '"'
20 . join("", map { sprintf("\\x%02x", ord $_) } split "", $string)
21 . '"';
22}
23
d84e92aa
KW
24sub output_warnings(@) {
25 diag "The warnings were:\n" . join("", @_);
26}
27
7dfd8446
KW
28# This test file can't use byte_utf8a_to_utf8n() from t/charset_tools.pl
29# because that uses the same functions we are testing here. So UTF-EBCDIC
30# strings are hard-coded as I8 strings in this file instead, and we use array
31# lookup to translate into the appropriate code page.
32
33my @i8_to_native = ( # Only code page 1047 so far.
34# _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F
350x00,0x01,0x02,0x03,0x37,0x2D,0x2E,0x2F,0x16,0x05,0x15,0x0B,0x0C,0x0D,0x0E,0x0F,
360x10,0x11,0x12,0x13,0x3C,0x3D,0x32,0x26,0x18,0x19,0x3F,0x27,0x1C,0x1D,0x1E,0x1F,
370x40,0x5A,0x7F,0x7B,0x5B,0x6C,0x50,0x7D,0x4D,0x5D,0x5C,0x4E,0x6B,0x60,0x4B,0x61,
380xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0x7A,0x5E,0x4C,0x7E,0x6E,0x6F,
390x7C,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,
400xD7,0xD8,0xD9,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xAD,0xE0,0xBD,0x5F,0x6D,
410x79,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x91,0x92,0x93,0x94,0x95,0x96,
420x97,0x98,0x99,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,0xA8,0xA9,0xC0,0x4F,0xD0,0xA1,0x07,
430x20,0x21,0x22,0x23,0x24,0x25,0x06,0x17,0x28,0x29,0x2A,0x2B,0x2C,0x09,0x0A,0x1B,
440x30,0x31,0x1A,0x33,0x34,0x35,0x36,0x08,0x38,0x39,0x3A,0x3B,0x04,0x14,0x3E,0xFF,
450x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x51,0x52,0x53,0x54,0x55,0x56,
460x57,0x58,0x59,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x70,0x71,0x72,0x73,
470x74,0x75,0x76,0x77,0x78,0x80,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,0x90,0x9A,0x9B,0x9C,
480x9D,0x9E,0x9F,0xA0,0xAA,0xAB,0xAC,0xAE,0xAF,0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,
490xB7,0xB8,0xB9,0xBA,0xBB,0xBC,0xBE,0xBF,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,0xDA,0xDB,
500xDC,0xDD,0xDE,0xDF,0xE1,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,0xFA,0xFB,0xFC,0xFD,0xFE,
51);
52
418080dc
KW
53my @native_to_i8;
54for (my $i = 0; $i < 256; $i++) {
55 $native_to_i8[$i8_to_native[$i]] = $i;
56}
57
7dfd8446
KW
58*I8_to_native = (isASCII)
59 ? sub { return shift }
60 : sub { return join "", map { chr $i8_to_native[ord $_] }
61 split "", shift };
418080dc
KW
62*native_to_I8 = (isASCII)
63 ? sub { return shift }
64 : sub { return join "", map { chr $native_to_i8[ord $_] }
65 split "", shift };
2b5e7bc2
KW
66sub start_byte_to_cont($) {
67
68 # Extract the code point information from the input UTF-8 start byte, and
69 # return a continuation byte containing the same information. This is
70 # used in constructing an overlong malformation from valid input.
71
72 my $byte = shift;
73 my $len = test_UTF8_SKIP($byte);
74 if ($len < 2) {
921139af 75 die "start_byte_to_cont() is expecting a UTF-8 variant";
2b5e7bc2
KW
76 }
77
78 $byte = ord native_to_I8($byte);
79
80 # Copied from utf8.h. This gets rid of the leading 1 bits.
81 $byte &= ((($len) >= 7) ? 0x00 : (0x1F >> (($len)-2)));
82
8182a17b
KW
83 $byte |= (isASCII) ? 0x80 : 0xA0;
84 return I8_to_native(chr $byte);
2b5e7bc2 85}
7dfd8446
KW
86
87my $is64bit = length sprintf("%x", ~0) > 8;
88
89
f9380377
KW
90# Test utf8n_to_uvchr_error(). These provide essentially complete code
91# coverage. Copied from utf8.h
7dfd8446 92my $UTF8_ALLOW_EMPTY = 0x0001;
f9380377 93my $UTF8_GOT_EMPTY = $UTF8_ALLOW_EMPTY;
7dfd8446 94my $UTF8_ALLOW_CONTINUATION = 0x0002;
f9380377 95my $UTF8_GOT_CONTINUATION = $UTF8_ALLOW_CONTINUATION;
7dfd8446 96my $UTF8_ALLOW_NON_CONTINUATION = 0x0004;
f9380377 97my $UTF8_GOT_NON_CONTINUATION = $UTF8_ALLOW_NON_CONTINUATION;
7dfd8446 98my $UTF8_ALLOW_SHORT = 0x0008;
f9380377 99my $UTF8_GOT_SHORT = $UTF8_ALLOW_SHORT;
7dfd8446 100my $UTF8_ALLOW_LONG = 0x0010;
f9380377
KW
101my $UTF8_GOT_LONG = $UTF8_ALLOW_LONG;
102my $UTF8_GOT_OVERFLOW = 0x0020;
6f89c5a0 103my $UTF8_DISALLOW_SURROGATE = 0x0040;
f9380377 104my $UTF8_GOT_SURROGATE = $UTF8_DISALLOW_SURROGATE;
6f89c5a0
KW
105my $UTF8_WARN_SURROGATE = 0x0080;
106my $UTF8_DISALLOW_NONCHAR = 0x0100;
f9380377 107my $UTF8_GOT_NONCHAR = $UTF8_DISALLOW_NONCHAR;
6f89c5a0
KW
108my $UTF8_WARN_NONCHAR = 0x0200;
109my $UTF8_DISALLOW_SUPER = 0x0400;
f9380377 110my $UTF8_GOT_SUPER = $UTF8_DISALLOW_SUPER;
6f89c5a0
KW
111my $UTF8_WARN_SUPER = 0x0800;
112my $UTF8_DISALLOW_ABOVE_31_BIT = 0x1000;
f9380377 113my $UTF8_GOT_ABOVE_31_BIT = $UTF8_DISALLOW_ABOVE_31_BIT;
6f89c5a0
KW
114my $UTF8_WARN_ABOVE_31_BIT = 0x2000;
115my $UTF8_CHECK_ONLY = 0x4000;
25e3a4e0
KW
116my $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
117 = $UTF8_DISALLOW_SUPER|$UTF8_DISALLOW_SURROGATE;
118my $UTF8_DISALLOW_ILLEGAL_INTERCHANGE
119 = $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE|$UTF8_DISALLOW_NONCHAR;
867a901b
KW
120my $UTF8_WARN_ILLEGAL_C9_INTERCHANGE
121 = $UTF8_WARN_SUPER|$UTF8_WARN_SURROGATE;
122my $UTF8_WARN_ILLEGAL_INTERCHANGE
123 = $UTF8_WARN_ILLEGAL_C9_INTERCHANGE|$UTF8_WARN_NONCHAR;
7dfd8446 124
046d01eb
KW
125# Test uvchr_to_utf8().
126my $UNICODE_WARN_SURROGATE = 0x0001;
127my $UNICODE_WARN_NONCHAR = 0x0002;
128my $UNICODE_WARN_SUPER = 0x0004;
129my $UNICODE_WARN_ABOVE_31_BIT = 0x0008;
130my $UNICODE_DISALLOW_SURROGATE = 0x0010;
131my $UNICODE_DISALLOW_NONCHAR = 0x0020;
132my $UNICODE_DISALLOW_SUPER = 0x0040;
133my $UNICODE_DISALLOW_ABOVE_31_BIT = 0x0080;
134
135my $look_for_everything_utf8n_to
136 = $UTF8_DISALLOW_SURROGATE
7dfd8446
KW
137 | $UTF8_WARN_SURROGATE
138 | $UTF8_DISALLOW_NONCHAR
139 | $UTF8_WARN_NONCHAR
140 | $UTF8_DISALLOW_SUPER
141 | $UTF8_WARN_SUPER
1d1c12d9
KW
142 | $UTF8_DISALLOW_ABOVE_31_BIT
143 | $UTF8_WARN_ABOVE_31_BIT;
046d01eb
KW
144my $look_for_everything_uvchr_to
145 = $UNICODE_DISALLOW_SURROGATE
146 | $UNICODE_WARN_SURROGATE
147 | $UNICODE_DISALLOW_NONCHAR
148 | $UNICODE_WARN_NONCHAR
149 | $UNICODE_DISALLOW_SUPER
150 | $UNICODE_WARN_SUPER
151 | $UNICODE_DISALLOW_ABOVE_31_BIT
152 | $UNICODE_WARN_ABOVE_31_BIT;
7dfd8446 153
fed3ba5d
NC
154foreach ([0, '', '', 'empty'],
155 [0, 'N', 'N', '1 char'],
156 [1, 'NN', 'N', '1 char substring'],
157 [-2, 'Perl', 'Rules', 'different'],
4deba822
KW
158 [0, $pound_sign, $pound_sign, 'pound sign'],
159 [1, $pound_sign . 10, $pound_sign . 1, '10 pounds is more than 1 pound'],
160 [1, $pound_sign . $pound_sign, $pound_sign, '2 pound signs are more than 1'],
fed3ba5d
NC
161 [-2, ' $!', " \x{1F42B}!", 'Camels are worth more than 1 dollar'],
162 [-1, '!', "!\x{1F42A}", 'Initial substrings match'],
163 ) {
164 my ($expect, $left, $right, $desc) = @$_;
165 my $copy = $right;
166 utf8::encode($copy);
167 is(bytes_cmp_utf8($left, $copy), $expect, $desc);
168 next if $right =~ tr/\0-\377//c;
169 utf8::encode($left);
170 is(bytes_cmp_utf8($right, $left), -$expect, "$desc reversed");
171}
172
7dfd8446
KW
173# The keys to this hash are Unicode code points, their values are the native
174# UTF-8 representations of them. The code points are chosen because they are
175# "interesting" on either or both ASCII and EBCDIC platforms. First we add
176# boundaries where the number of bytes required to represent them increase, or
177# are adjacent to problematic code points, so we want to make sure they aren't
178# considered problematic.
179my %code_points = (
180 0x0100 => (isASCII) ? "\xc4\x80" : I8_to_native("\xc8\xa0"),
181 0x0400 - 1 => (isASCII) ? "\xcf\xbf" : I8_to_native("\xdf\xbf"),
182 0x0400 => (isASCII) ? "\xd0\x80" : I8_to_native("\xe1\xa0\xa0"),
183 0x0800 - 1 => (isASCII) ? "\xdf\xbf" : I8_to_native("\xe1\xbf\xbf"),
184 0x0800 => (isASCII) ? "\xe0\xa0\x80" : I8_to_native("\xe2\xa0\xa0"),
185 0x4000 - 1 => (isASCII) ? "\xe3\xbf\xbf" : I8_to_native("\xef\xbf\xbf"),
186 0x4000 => (isASCII) ? "\xe4\x80\x80" : I8_to_native("\xf0\xb0\xa0\xa0"),
187 0x8000 - 1 => (isASCII) ? "\xe7\xbf\xbf" : I8_to_native("\xf0\xbf\xbf\xbf"),
188
189 # First code point that the implementation of isUTF8_POSSIBLY_PROBLEMATIC,
190 # as of this writing, considers potentially problematic on EBCDIC
191 0x8000 => (isASCII) ? "\xe8\x80\x80" : I8_to_native("\xf1\xa0\xa0\xa0"),
192
193 0xD000 - 1 => (isASCII) ? "\xec\xbf\xbf" : I8_to_native("\xf1\xb3\xbf\xbf"),
194
195 # First code point that the implementation of isUTF8_POSSIBLY_PROBLEMATIC,
196 # as of this writing, considers potentially problematic on ASCII
197 0xD000 => (isASCII) ? "\xed\x80\x80" : I8_to_native("\xf1\xb4\xa0\xa0"),
198
5f8a3d1d 199 # Bracket the surrogates, and include several surrogates
7dfd8446 200 0xD7FF => (isASCII) ? "\xed\x9f\xbf" : I8_to_native("\xf1\xb5\xbf\xbf"),
5f8a3d1d
KW
201 0xD800 => (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
202 0xDC00 => (isASCII) ? "\xed\xb0\x80" : I8_to_native("\xf1\xb7\xa0\xa0"),
203 0xDFFF => (isASCII) ? "\xee\x80\x80" : I8_to_native("\xf1\xb8\xa0\xa0"),
204 0xDFFF => (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
7dfd8446
KW
205 0xE000 => (isASCII) ? "\xee\x80\x80" : I8_to_native("\xf1\xb8\xa0\xa0"),
206
5f8a3d1d 207 # Include the 32 contiguous non characters, and surrounding code points
7dfd8446 208 0xFDCF => (isASCII) ? "\xef\xb7\x8f" : I8_to_native("\xf1\xbf\xae\xaf"),
5f8a3d1d
KW
209 0xFDD0 => (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
210 0xFDD1 => (isASCII) ? "\xef\xb7\x91" : I8_to_native("\xf1\xbf\xae\xb1"),
211 0xFDD2 => (isASCII) ? "\xef\xb7\x92" : I8_to_native("\xf1\xbf\xae\xb2"),
212 0xFDD3 => (isASCII) ? "\xef\xb7\x93" : I8_to_native("\xf1\xbf\xae\xb3"),
213 0xFDD4 => (isASCII) ? "\xef\xb7\x94" : I8_to_native("\xf1\xbf\xae\xb4"),
214 0xFDD5 => (isASCII) ? "\xef\xb7\x95" : I8_to_native("\xf1\xbf\xae\xb5"),
215 0xFDD6 => (isASCII) ? "\xef\xb7\x96" : I8_to_native("\xf1\xbf\xae\xb6"),
216 0xFDD7 => (isASCII) ? "\xef\xb7\x97" : I8_to_native("\xf1\xbf\xae\xb7"),
217 0xFDD8 => (isASCII) ? "\xef\xb7\x98" : I8_to_native("\xf1\xbf\xae\xb8"),
218 0xFDD9 => (isASCII) ? "\xef\xb7\x99" : I8_to_native("\xf1\xbf\xae\xb9"),
219 0xFDDA => (isASCII) ? "\xef\xb7\x9a" : I8_to_native("\xf1\xbf\xae\xba"),
220 0xFDDB => (isASCII) ? "\xef\xb7\x9b" : I8_to_native("\xf1\xbf\xae\xbb"),
221 0xFDDC => (isASCII) ? "\xef\xb7\x9c" : I8_to_native("\xf1\xbf\xae\xbc"),
222 0xFDDD => (isASCII) ? "\xef\xb7\x9d" : I8_to_native("\xf1\xbf\xae\xbd"),
223 0xFDDE => (isASCII) ? "\xef\xb7\x9e" : I8_to_native("\xf1\xbf\xae\xbe"),
224 0xFDDF => (isASCII) ? "\xef\xb7\x9f" : I8_to_native("\xf1\xbf\xae\xbf"),
225 0xFDE0 => (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
226 0xFDE1 => (isASCII) ? "\xef\xb7\xa1" : I8_to_native("\xf1\xbf\xaf\xa1"),
227 0xFDE2 => (isASCII) ? "\xef\xb7\xa2" : I8_to_native("\xf1\xbf\xaf\xa2"),
228 0xFDE3 => (isASCII) ? "\xef\xb7\xa3" : I8_to_native("\xf1\xbf\xaf\xa3"),
229 0xFDE4 => (isASCII) ? "\xef\xb7\xa4" : I8_to_native("\xf1\xbf\xaf\xa4"),
230 0xFDE5 => (isASCII) ? "\xef\xb7\xa5" : I8_to_native("\xf1\xbf\xaf\xa5"),
231 0xFDE6 => (isASCII) ? "\xef\xb7\xa6" : I8_to_native("\xf1\xbf\xaf\xa6"),
232 0xFDE7 => (isASCII) ? "\xef\xb7\xa7" : I8_to_native("\xf1\xbf\xaf\xa7"),
233 0xFDE8 => (isASCII) ? "\xef\xb7\xa8" : I8_to_native("\xf1\xbf\xaf\xa8"),
234 0xFDEa => (isASCII) ? "\xef\xb7\x99" : I8_to_native("\xf1\xbf\xaf\xa9"),
235 0xFDEA => (isASCII) ? "\xef\xb7\xaa" : I8_to_native("\xf1\xbf\xaf\xaa"),
236 0xFDEB => (isASCII) ? "\xef\xb7\xab" : I8_to_native("\xf1\xbf\xaf\xab"),
237 0xFDEC => (isASCII) ? "\xef\xb7\xac" : I8_to_native("\xf1\xbf\xaf\xac"),
238 0xFDED => (isASCII) ? "\xef\xb7\xad" : I8_to_native("\xf1\xbf\xaf\xad"),
a9e5eeaa 239 0xFDEE => (isASCII) ? "\xef\xb7\xae" : I8_to_native("\xf1\xbf\xaf\xae"),
5f8a3d1d 240 0xFDEF => (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
7dfd8446
KW
241 0xFDF0 => (isASCII) ? "\xef\xb7\xb0" : I8_to_native("\xf1\xbf\xaf\xb0"),
242
5f8a3d1d 243 # Mostly around non-characters, but some are transitions to longer strings
7dfd8446
KW
244 0xFFFD => (isASCII) ? "\xef\xbf\xbd" : I8_to_native("\xf1\xbf\xbf\xbd"),
245 0x10000 - 1 => (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
246 0x10000 => (isASCII) ? "\xf0\x90\x80\x80" : I8_to_native("\xf2\xa0\xa0\xa0"),
247 0x1FFFD => (isASCII) ? "\xf0\x9f\xbf\xbd" : I8_to_native("\xf3\xbf\xbf\xbd"),
5f8a3d1d
KW
248 0x1FFFE => (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
249 0x1FFFF => (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
7dfd8446
KW
250 0x20000 => (isASCII) ? "\xf0\xa0\x80\x80" : I8_to_native("\xf4\xa0\xa0\xa0"),
251 0x2FFFD => (isASCII) ? "\xf0\xaf\xbf\xbd" : I8_to_native("\xf5\xbf\xbf\xbd"),
5f8a3d1d
KW
252 0x2FFFE => (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
253 0x2FFFF => (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
7dfd8446
KW
254 0x30000 => (isASCII) ? "\xf0\xb0\x80\x80" : I8_to_native("\xf6\xa0\xa0\xa0"),
255 0x3FFFD => (isASCII) ? "\xf0\xbf\xbf\xbd" : I8_to_native("\xf7\xbf\xbf\xbd"),
5f8a3d1d 256 0x3FFFE => (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
7dfd8446
KW
257 0x40000 - 1 => (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
258 0x40000 => (isASCII) ? "\xf1\x80\x80\x80" : I8_to_native("\xf8\xa8\xa0\xa0\xa0"),
259 0x4FFFD => (isASCII) ? "\xf1\x8f\xbf\xbd" : I8_to_native("\xf8\xa9\xbf\xbf\xbd"),
5f8a3d1d
KW
260 0x4FFFE => (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
261 0x4FFFF => (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
7dfd8446
KW
262 0x50000 => (isASCII) ? "\xf1\x90\x80\x80" : I8_to_native("\xf8\xaa\xa0\xa0\xa0"),
263 0x5FFFD => (isASCII) ? "\xf1\x9f\xbf\xbd" : I8_to_native("\xf8\xab\xbf\xbf\xbd"),
5f8a3d1d
KW
264 0x5FFFE => (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
265 0x5FFFF => (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
7dfd8446
KW
266 0x60000 => (isASCII) ? "\xf1\xa0\x80\x80" : I8_to_native("\xf8\xac\xa0\xa0\xa0"),
267 0x6FFFD => (isASCII) ? "\xf1\xaf\xbf\xbd" : I8_to_native("\xf8\xad\xbf\xbf\xbd"),
5f8a3d1d
KW
268 0x6FFFE => (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
269 0x6FFFF => (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
7dfd8446
KW
270 0x70000 => (isASCII) ? "\xf1\xb0\x80\x80" : I8_to_native("\xf8\xae\xa0\xa0\xa0"),
271 0x7FFFD => (isASCII) ? "\xf1\xbf\xbf\xbd" : I8_to_native("\xf8\xaf\xbf\xbf\xbd"),
5f8a3d1d
KW
272 0x7FFFE => (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
273 0x7FFFF => (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
7dfd8446
KW
274 0x80000 => (isASCII) ? "\xf2\x80\x80\x80" : I8_to_native("\xf8\xb0\xa0\xa0\xa0"),
275 0x8FFFD => (isASCII) ? "\xf2\x8f\xbf\xbd" : I8_to_native("\xf8\xb1\xbf\xbf\xbd"),
5f8a3d1d
KW
276 0x8FFFE => (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
277 0x8FFFF => (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
7dfd8446
KW
278 0x90000 => (isASCII) ? "\xf2\x90\x80\x80" : I8_to_native("\xf8\xb2\xa0\xa0\xa0"),
279 0x9FFFD => (isASCII) ? "\xf2\x9f\xbf\xbd" : I8_to_native("\xf8\xb3\xbf\xbf\xbd"),
5f8a3d1d
KW
280 0x9FFFE => (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
281 0x9FFFF => (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
7dfd8446
KW
282 0xA0000 => (isASCII) ? "\xf2\xa0\x80\x80" : I8_to_native("\xf8\xb4\xa0\xa0\xa0"),
283 0xAFFFD => (isASCII) ? "\xf2\xaf\xbf\xbd" : I8_to_native("\xf8\xb5\xbf\xbf\xbd"),
5f8a3d1d
KW
284 0xAFFFE => (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
285 0xAFFFF => (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
7dfd8446
KW
286 0xB0000 => (isASCII) ? "\xf2\xb0\x80\x80" : I8_to_native("\xf8\xb6\xa0\xa0\xa0"),
287 0xBFFFD => (isASCII) ? "\xf2\xbf\xbf\xbd" : I8_to_native("\xf8\xb7\xbf\xbf\xbd"),
5f8a3d1d
KW
288 0xBFFFE => (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
289 0xBFFFF => (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
7dfd8446
KW
290 0xC0000 => (isASCII) ? "\xf3\x80\x80\x80" : I8_to_native("\xf8\xb8\xa0\xa0\xa0"),
291 0xCFFFD => (isASCII) ? "\xf3\x8f\xbf\xbd" : I8_to_native("\xf8\xb9\xbf\xbf\xbd"),
5f8a3d1d
KW
292 0xCFFFE => (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
293 0xCFFFF => (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
7dfd8446
KW
294 0xD0000 => (isASCII) ? "\xf3\x90\x80\x80" : I8_to_native("\xf8\xba\xa0\xa0\xa0"),
295 0xDFFFD => (isASCII) ? "\xf3\x9f\xbf\xbd" : I8_to_native("\xf8\xbb\xbf\xbf\xbd"),
5f8a3d1d
KW
296 0xDFFFE => (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
297 0xDFFFF => (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
7dfd8446
KW
298 0xE0000 => (isASCII) ? "\xf3\xa0\x80\x80" : I8_to_native("\xf8\xbc\xa0\xa0\xa0"),
299 0xEFFFD => (isASCII) ? "\xf3\xaf\xbf\xbd" : I8_to_native("\xf8\xbd\xbf\xbf\xbd"),
5f8a3d1d
KW
300 0xEFFFE => (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
301 0xEFFFF => (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
7dfd8446
KW
302 0xF0000 => (isASCII) ? "\xf3\xb0\x80\x80" : I8_to_native("\xf8\xbe\xa0\xa0\xa0"),
303 0xFFFFD => (isASCII) ? "\xf3\xbf\xbf\xbd" : I8_to_native("\xf8\xbf\xbf\xbf\xbd"),
5f8a3d1d
KW
304 0xFFFFE => (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
305 0xFFFFF => (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
7dfd8446
KW
306 0x100000 => (isASCII) ? "\xf4\x80\x80\x80" : I8_to_native("\xf9\xa0\xa0\xa0\xa0"),
307 0x10FFFD => (isASCII) ? "\xf4\x8f\xbf\xbd" : I8_to_native("\xf9\xa1\xbf\xbf\xbd"),
5f8a3d1d
KW
308 0x10FFFE => (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
309 0x10FFFF => (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
7dfd8446
KW
310 0x110000 => (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
311
312 # Things that would be noncharacters if they were in Unicode, and might be
313 # mistaken, if the C code is bad, to be nonchars
314 0x11FFFE => (isASCII) ? "\xf4\x9f\xbf\xbe" : I8_to_native("\xf9\xa3\xbf\xbf\xbe"),
315 0x11FFFF => (isASCII) ? "\xf4\x9f\xbf\xbf" : I8_to_native("\xf9\xa3\xbf\xbf\xbf"),
316 0x20FFFE => (isASCII) ? "\xf8\x88\x8f\xbf\xbe" : I8_to_native("\xfa\xa1\xbf\xbf\xbe"),
317 0x20FFFF => (isASCII) ? "\xf8\x88\x8f\xbf\xbf" : I8_to_native("\xfa\xa1\xbf\xbf\xbf"),
318
319 0x200000 - 1 => (isASCII) ? "\xf7\xbf\xbf\xbf" : I8_to_native("\xf9\xbf\xbf\xbf\xbf"),
320 0x200000 => (isASCII) ? "\xf8\x88\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
321 0x400000 - 1 => (isASCII) ? "\xf8\x8f\xbf\xbf\xbf" : I8_to_native("\xfb\xbf\xbf\xbf\xbf"),
322 0x400000 => (isASCII) ? "\xf8\x90\x80\x80\x80" : I8_to_native("\xfc\xa4\xa0\xa0\xa0\xa0"),
323 0x4000000 - 1 => (isASCII) ? "\xfb\xbf\xbf\xbf\xbf" : I8_to_native("\xfd\xbf\xbf\xbf\xbf\xbf"),
324 0x4000000 => (isASCII) ? "\xfc\x84\x80\x80\x80\x80" : I8_to_native("\xfe\xa2\xa0\xa0\xa0\xa0\xa0"),
325 0x4000000 - 1 => (isASCII) ? "\xfb\xbf\xbf\xbf\xbf" : I8_to_native("\xfd\xbf\xbf\xbf\xbf\xbf"),
326 0x4000000 => (isASCII) ? "\xfc\x84\x80\x80\x80\x80" : I8_to_native("\xfe\xa2\xa0\xa0\xa0\xa0\xa0"),
327 0x40000000 - 1 => (isASCII) ? "\xfc\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xfe\xbf\xbf\xbf\xbf\xbf\xbf"),
328 0x40000000 => (isASCII) ? "\xfd\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0"),
329 0x80000000 - 1 => (isASCII) ? "\xfd\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf"),
c0236afe
KW
330 0x80000000 => (isASCII) ? "\xfe\x82\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
331 0xFFFFFFFF => (isASCII) ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
7dfd8446 332);
4deba822 333
7dfd8446
KW
334if ($is64bit) {
335 no warnings qw(overflow portable);
9d2d0ecd
KW
336 $code_points{0x100000000} = (isASCII)
337 ? "\xfe\x84\x80\x80\x80\x80\x80"
338 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0");
339 $code_points{0x1000000000 - 1} = (isASCII)
340 ? "\xfe\xbf\xbf\xbf\xbf\xbf\xbf"
341 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf\xbf");
342 $code_points{0x1000000000} = (isASCII)
343 ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
344 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0");
345 $code_points{0xFFFFFFFFFFFFFFFF} = (isASCII)
346 ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
347 : I8_to_native("\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf");
d566bd20
KW
348 if (isASCII) { # These could falsely show as overlongs in a naive implementation
349 $code_points{0x40000000000} = "\xff\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80\x80";
350 $code_points{0x1000000000000} = "\xff\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80\x80\x80";
351 $code_points{0x40000000000000} = "\xff\x80\x80\x81\x80\x80\x80\x80\x80\x80\x80\x80\x80";
352 $code_points{0x1000000000000000} = "\xff\x80\x81\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80";
353 # overflows
354 #$code_points{0xfoo} = "\xff\x81\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80";
355 }
356}
357elsif (! isASCII) { # 32-bit EBCDIC. 64-bit is clearer to handle, so doesn't need this test case
358 no warnings qw(overflow portable);
359 $code_points{0x40000000} = I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0");
7dfd8446 360}
eb83ed87 361
7dfd8446
KW
362# Now add in entries for each of code points 0-255, which require special
363# handling on EBCDIC. Remember the keys are Unicode values, and the values
364# are the native UTF-8. For invariants, the bytes are just the native chr.
365
366my $cp = 0;
367while ($cp < ((isASCII) ? 128 : 160)) { # This is from the definition of
368 # invariant
369 $code_points{$cp} = chr utf8::unicode_to_native($cp);
370 $cp++;
371}
372
373# Done with the invariants. Now do the variants. All in this range are 2
374# byte. Again, we can't use the internal functions to generate UTF-8, as
375# those are what we are trying to test. In the loop, we know what range the
376# continuation bytes can be in, and what the lowest start byte can be. So we
377# cycle through them.
378
379my $first_continuation = (isASCII) ? 0x80 : 0xA0;
380my $final_continuation = 0xBF;
381my $start = (isASCII) ? 0xC2 : 0xC5;
382
78a3c0f8
KW
383my $max_bytes = (isASCII) ? 13 : 14; # Max number of bytes in a UTF-8 sequence
384 # representing a single code point
385
7dfd8446
KW
386my $continuation = $first_continuation - 1;
387
388while ($cp < 255) {
389 if (++$continuation > $final_continuation) {
390
391 # Wrap to the next start byte when we reach the final continuation
392 # byte possible
393 $continuation = $first_continuation;
394 $start++;
395 }
396 $code_points{$cp} = I8_to_native(chr($start) . chr($continuation));
397
398 $cp++;
399}
eb83ed87
KW
400
401my @warnings;
402
403use warnings 'utf8';
404local $SIG{__WARN__} = sub { push @warnings, @_ };
405
9f2abfde
KW
406my %restriction_types;
407
408$restriction_types{""}{'valid_strings'} = "";
409$restriction_types{"c9strict"}{'valid_strings'} = "";
410$restriction_types{"strict"}{'valid_strings'} = "";
411$restriction_types{"fits_in_31_bits"}{'valid_strings'} = "";
412
413# This set of tests looks for basic sanity, and lastly tests various routines
414# for the given code point. If the earlier tests for that code point fail,
415# the later ones probably will too. Malformations are tested in later
7dfd8446
KW
416# segments of code.
417for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
418 keys %code_points)
419{
420 my $hex_u = sprintf("0x%02X", $u);
421 my $n = utf8::unicode_to_native($u);
422 my $hex_n = sprintf("0x%02X", $n);
423 my $bytes = $code_points{$u};
424
425 my $offskip_should_be;
426 {
427 no warnings qw(overflow portable);
428 $offskip_should_be = (isASCII)
429 ? ( $u < 0x80 ? 1 :
430 $u < 0x800 ? 2 :
431 $u < 0x10000 ? 3 :
432 $u < 0x200000 ? 4 :
433 $u < 0x4000000 ? 5 :
434 $u < 0x80000000 ? 6 : (($is64bit)
78a3c0f8 435 ? ($u < 0x1000000000 ? 7 : $max_bytes)
7dfd8446
KW
436 : 7)
437 )
438 : ($u < 0xA0 ? 1 :
439 $u < 0x400 ? 2 :
440 $u < 0x4000 ? 3 :
441 $u < 0x40000 ? 4 :
442 $u < 0x400000 ? 5 :
c0236afe 443 $u < 0x4000000 ? 6 :
78a3c0f8 444 $u < 0x40000000 ? 7 : $max_bytes );
7dfd8446
KW
445 }
446
447 # If this test fails, subsequent ones are meaningless.
448 next unless is(test_OFFUNISKIP($u), $offskip_should_be,
449 "Verify OFFUNISKIP($hex_u) is $offskip_should_be");
450 my $invariant = $offskip_should_be == 1;
451 my $display_invariant = $invariant || 0;
452 is(test_OFFUNI_IS_INVARIANT($u), $invariant,
453 "Verify OFFUNI_IS_INVARIANT($hex_u) is $display_invariant");
454
455 my $uvchr_skip_should_be = $offskip_should_be;
456 next unless is(test_UVCHR_SKIP($n), $uvchr_skip_should_be,
457 "Verify UVCHR_SKIP($hex_n) is $uvchr_skip_should_be");
458 is(test_UVCHR_IS_INVARIANT($n), $offskip_should_be == 1,
459 "Verify UVCHR_IS_INVARIANT($hex_n) is $display_invariant");
460
461 my $n_chr = chr $n;
462 utf8::upgrade $n_chr;
463
464 is(test_UTF8_SKIP($n_chr), $uvchr_skip_should_be,
465 "Verify UTF8_SKIP(chr $hex_n) is $uvchr_skip_should_be");
466
467 use bytes;
3d56ecbe
KW
468 my $byte_length = length $n_chr;
469 for (my $j = 0; $j < $byte_length; $j++) {
470 undef @warnings;
471
472 if ($j == $byte_length - 1) {
473 my $ret = test_is_utf8_valid_partial_char_flags($n_chr, $byte_length, 0);
474 is($ret, 0, " Verify is_utf8_valid_partial_char_flags(" . display_bytes($n_chr) . ") returns 0 for full character");
475 }
476 else {
477 my $bytes_so_far = substr($n_chr, 0, $j + 1);
478 my $ret = test_is_utf8_valid_partial_char_flags($bytes_so_far, $j + 1, 0);
479 is($ret, 1, " Verify is_utf8_valid_partial_char_flags(" . display_bytes($bytes_so_far) . ") returns 1");
480 }
481
482 unless (is(scalar @warnings, 0,
483 " Verify is_utf8_valid_partial_char_flags generated no warnings"))
484 {
d84e92aa 485 output_warnings(@warnings);
3d56ecbe
KW
486 }
487
7dfd8446
KW
488 my $b = substr($n_chr, $j, 1);
489 my $hex_b = sprintf("\"\\x%02x\"", ord $b);
490
491 my $byte_invariant = $j == 0 && $uvchr_skip_should_be == 1;
492 my $display_byte_invariant = $byte_invariant || 0;
493 next unless is(test_UTF8_IS_INVARIANT($b), $byte_invariant,
494 " Verify UTF8_IS_INVARIANT($hex_b) for byte $j "
495 . "is $display_byte_invariant");
496
497 my $is_start = $j == 0 && $uvchr_skip_should_be > 1;
498 my $display_is_start = $is_start || 0;
499 next unless is(test_UTF8_IS_START($b), $is_start,
500 " Verify UTF8_IS_START($hex_b) is $display_is_start");
501
502 my $is_continuation = $j != 0 && $uvchr_skip_should_be > 1;
503 my $display_is_continuation = $is_continuation || 0;
504 next unless is(test_UTF8_IS_CONTINUATION($b), $is_continuation,
505 " Verify UTF8_IS_CONTINUATION($hex_b) is "
506 . "$display_is_continuation");
507
508 my $is_continued = $uvchr_skip_should_be > 1;
509 my $display_is_continued = $is_continued || 0;
510 next unless is(test_UTF8_IS_CONTINUED($b), $is_continued,
511 " Verify UTF8_IS_CONTINUED($hex_b) is "
512 . "$display_is_continued");
513
514 my $is_downgradeable_start = $n < 256
515 && $uvchr_skip_should_be > 1
516 && $j == 0;
517 my $display_is_downgradeable_start = $is_downgradeable_start || 0;
518 next unless is(test_UTF8_IS_DOWNGRADEABLE_START($b),
519 $is_downgradeable_start,
520 " Verify UTF8_IS_DOWNGRADEABLE_START($hex_b) is "
521 . "$display_is_downgradeable_start");
522
523 my $is_above_latin1 = $n > 255 && $j == 0;
524 my $display_is_above_latin1 = $is_above_latin1 || 0;
525 next unless is(test_UTF8_IS_ABOVE_LATIN1($b),
526 $is_above_latin1,
527 " Verify UTF8_IS_ABOVE_LATIN1($hex_b) is "
528 . "$display_is_above_latin1");
529
530 my $is_possibly_problematic = $j == 0
531 && $n >= ((isASCII)
532 ? 0xD000
533 : 0x8000);
534 my $display_is_possibly_problematic = $is_possibly_problematic || 0;
535 next unless is(test_isUTF8_POSSIBLY_PROBLEMATIC($b),
536 $is_possibly_problematic,
537 " Verify isUTF8_POSSIBLY_PROBLEMATIC($hex_b) is "
538 . "$display_is_above_latin1");
539 }
540
541 # We are not trying to look for warnings, etc, so if they should occur, it
542 # is an error. But some of the code points here do cause warnings, so we
543 # check here and turn off the ones that apply to such code points. A
544 # later section of the code tests for these kinds of things.
046d01eb 545 my $this_utf8_flags = $look_for_everything_utf8n_to;
7dfd8446 546 my $len = length $bytes;
e23e8bc1
KW
547
548 my $valid_under_strict = 1;
a82be82b 549 my $valid_under_c9strict = 1;
9f2abfde 550 my $valid_for_fits_in_31_bits = 1;
7dfd8446
KW
551 if ($n > 0x10FFFF) {
552 $this_utf8_flags &= ~($UTF8_DISALLOW_SUPER|$UTF8_WARN_SUPER);
e23e8bc1 553 $valid_under_strict = 0;
a82be82b 554 $valid_under_c9strict = 0;
5f8a3d1d
KW
555 if ($n > 2 ** 31 - 1) {
556 $this_utf8_flags &=
557 ~($UTF8_DISALLOW_ABOVE_31_BIT|$UTF8_WARN_ABOVE_31_BIT);
9f2abfde 558 $valid_for_fits_in_31_bits = 0;
5f8a3d1d 559 }
7dfd8446 560 }
5f8a3d1d 561 elsif (($n >= 0xFDD0 && $n <= 0xFDEF) || ($n & 0xFFFE) == 0xFFFE) {
7dfd8446 562 $this_utf8_flags &= ~($UTF8_DISALLOW_NONCHAR|$UTF8_WARN_NONCHAR);
e23e8bc1 563 $valid_under_strict = 0;
7dfd8446 564 }
5f8a3d1d
KW
565 elsif ($n >= 0xD800 && $n <= 0xDFFF) {
566 $this_utf8_flags &= ~($UTF8_DISALLOW_SURROGATE|$UTF8_WARN_SURROGATE);
567 $valid_under_c9strict = 0;
568 $valid_under_strict = 0;
569 }
7dfd8446
KW
570
571 undef @warnings;
572
573 my $display_flags = sprintf "0x%x", $this_utf8_flags;
7dfd8446 574 my $display_bytes = display_bytes($bytes);
f9380377 575 my $ret_ref = test_utf8n_to_uvchr_error($bytes, $len, $this_utf8_flags);
c0d8738e
KW
576
577 # Rest of tests likely meaningless if it gets the wrong code point.
578 next unless is($ret_ref->[0], $n,
f9380377
KW
579 "Verify utf8n_to_uvchr_error($display_bytes, $display_flags)"
580 . "returns $hex_n");
c0d8738e 581 is($ret_ref->[1], $len,
f9380377
KW
582 "Verify utf8n_to_uvchr_error() for $hex_n returns expected length:"
583 . " $len");
7dfd8446
KW
584
585 unless (is(scalar @warnings, 0,
f9380377 586 "Verify utf8n_to_uvchr_error() for $hex_n generated no warnings"))
7dfd8446 587 {
d84e92aa 588 output_warnings(@warnings);
7dfd8446 589 }
f9380377
KW
590 is($ret_ref->[2], 0,
591 "Verify utf8n_to_uvchr_error() returned no error bits");
046d01eb 592
75ffa578
KW
593 undef @warnings;
594
d7874298 595 my $ret = test_isUTF8_CHAR($bytes, $len);
9d2d0ecd 596 is($ret, $len, "Verify isUTF8_CHAR($display_bytes) returns expected length: $len");
d7874298
KW
597
598 unless (is(scalar @warnings, 0,
599 "Verify isUTF8_CHAR() for $hex_n generated no warnings"))
600 {
d84e92aa 601 output_warnings(@warnings);
d7874298
KW
602 }
603
604 undef @warnings;
605
606 $ret = test_isUTF8_CHAR($bytes, $len - 1);
607 is($ret, 0, "Verify isUTF8_CHAR() with too short length parameter returns 0");
608
609 unless (is(scalar @warnings, 0,
610 "Verify isUTF8_CHAR() generated no warnings"))
611 {
d84e92aa 612 output_warnings(@warnings);
d7874298
KW
613 }
614
615 undef @warnings;
616
25e3a4e0
KW
617 $ret = test_isUTF8_CHAR_flags($bytes, $len, 0);
618 is($ret, $len, "Verify isUTF8_CHAR_flags($display_bytes, 0) returns expected length: $len");
619
620 unless (is(scalar @warnings, 0,
621 "Verify isUTF8_CHAR_flags() for $hex_n generated no warnings"))
622 {
d84e92aa 623 output_warnings(@warnings);
25e3a4e0
KW
624 }
625
626 undef @warnings;
627
628 $ret = test_isUTF8_CHAR_flags($bytes, $len - 1, 0);
629 is($ret, 0, "Verify isUTF8_CHAR_flags() with too short length parameter returns 0");
630
631 unless (is(scalar @warnings, 0,
632 "Verify isUTF8_CHAR_flags() generated no warnings"))
633 {
d84e92aa 634 output_warnings(@warnings);
25e3a4e0
KW
635 }
636
637 undef @warnings;
638
e23e8bc1
KW
639 $ret = test_isSTRICT_UTF8_CHAR($bytes, $len);
640 my $expected_len = ($valid_under_strict) ? $len : 0;
641 is($ret, $expected_len, "Verify isSTRICT_UTF8_CHAR($display_bytes) returns expected length: $expected_len");
642
643 unless (is(scalar @warnings, 0,
644 "Verify isSTRICT_UTF8_CHAR() for $hex_n generated no warnings"))
645 {
d84e92aa 646 output_warnings(@warnings);
e23e8bc1
KW
647 }
648
649 undef @warnings;
650
651 $ret = test_isSTRICT_UTF8_CHAR($bytes, $len - 1);
652 is($ret, 0, "Verify isSTRICT_UTF8_CHAR() with too short length parameter returns 0");
653
654 unless (is(scalar @warnings, 0,
655 "Verify isSTRICT_UTF8_CHAR() generated no warnings"))
656 {
d84e92aa 657 output_warnings(@warnings);
e23e8bc1
KW
658 }
659
25e3a4e0
KW
660 undef @warnings;
661
662 $ret = test_isUTF8_CHAR_flags($bytes, $len, $UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
663 is($ret, $expected_len, "Verify isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE') acts like isSTRICT_UTF8_CHAR");
664
665 unless (is(scalar @warnings, 0,
666 "Verify isUTF8_CHAR() for $hex_n generated no warnings"))
667 {
d84e92aa 668 output_warnings(@warnings);
25e3a4e0
KW
669 }
670
671 undef @warnings;
672
a82be82b
KW
673 $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $len);
674 $expected_len = ($valid_under_c9strict) ? $len : 0;
675 is($ret, $expected_len, "Verify isC9_STRICT_UTF8_CHAR($display_bytes) returns expected length: $len");
676
677 unless (is(scalar @warnings, 0,
678 "Verify isC9_STRICT_UTF8_CHAR() for $hex_n generated no warnings"))
679 {
d84e92aa 680 output_warnings(@warnings);
a82be82b
KW
681 }
682
683 undef @warnings;
684
685 $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $len - 1);
686 is($ret, 0, "Verify isC9_STRICT_UTF8_CHAR() with too short length parameter returns 0");
687
688 unless (is(scalar @warnings, 0,
689 "Verify isC9_STRICT_UTF8_CHAR() generated no warnings"))
690 {
d84e92aa 691 output_warnings(@warnings);
a82be82b
KW
692 }
693
e23e8bc1
KW
694 undef @warnings;
695
25e3a4e0
KW
696 $ret = test_isUTF8_CHAR_flags($bytes, $len, $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
697 is($ret, $expected_len, "Verify isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like isC9_STRICT_UTF8_CHAR");
698
699 unless (is(scalar @warnings, 0,
700 "Verify isUTF8_CHAR() for $hex_n generated no warnings"))
701 {
d84e92aa 702 output_warnings(@warnings);
25e3a4e0
KW
703 }
704
705 undef @warnings;
706
75ffa578
KW
707 $ret_ref = test_valid_utf8_to_uvchr($bytes);
708 is($ret_ref->[0], $n, "Verify valid_utf8_to_uvchr($display_bytes) returns $hex_n");
9d2d0ecd 709 is($ret_ref->[1], $len, "Verify valid_utf8_to_uvchr() for $hex_n returns expected length: $len");
75ffa578
KW
710
711 unless (is(scalar @warnings, 0,
712 "Verify valid_utf8_to_uvchr() for $hex_n generated no warnings"))
713 {
d84e92aa 714 output_warnings(@warnings);
75ffa578
KW
715 }
716
046d01eb
KW
717 # Similarly for uvchr_to_utf8
718 my $this_uvchr_flags = $look_for_everything_uvchr_to;
719 if ($n > 2 ** 31 - 1) {
720 $this_uvchr_flags &=
721 ~($UNICODE_DISALLOW_ABOVE_31_BIT|$UNICODE_WARN_ABOVE_31_BIT);
722 }
723 if ($n > 0x10FFFF) {
724 $this_uvchr_flags &= ~($UNICODE_DISALLOW_SUPER|$UNICODE_WARN_SUPER);
725 }
5f8a3d1d 726 elsif (($n >= 0xFDD0 && $n <= 0xFDEF) || ($n & 0xFFFE) == 0xFFFE) {
046d01eb
KW
727 $this_uvchr_flags &= ~($UNICODE_DISALLOW_NONCHAR|$UNICODE_WARN_NONCHAR);
728 }
5f8a3d1d
KW
729 elsif ($n >= 0xD800 && $n <= 0xDFFF) {
730 $this_uvchr_flags &= ~($UNICODE_DISALLOW_SURROGATE|$UNICODE_WARN_SURROGATE);
731 }
046d01eb
KW
732 $display_flags = sprintf "0x%x", $this_uvchr_flags;
733
734 undef @warnings;
735
d7874298 736 $ret = test_uvchr_to_utf8_flags($n, $this_uvchr_flags);
046d01eb
KW
737 ok(defined $ret, "Verify uvchr_to_utf8_flags($hex_n, $display_flags) returned success");
738 is($ret, $bytes, "Verify uvchr_to_utf8_flags($hex_n, $display_flags) returns correct bytes");
739
740 unless (is(scalar @warnings, 0,
741 "Verify uvchr_to_utf8_flags($hex_n, $display_flags) for $hex_n generated no warnings"))
742 {
d84e92aa 743 output_warnings(@warnings);
046d01eb 744 }
9f2abfde
KW
745
746 # Now append this code point to a string that we will test various
747 # versions of is_foo_utf8_string_bar on, and keep a count of how many code
748 # points are in it. All the code points in this loop are valid in Perl's
749 # extended UTF-8, but some are not valid under various restrictions. A
750 # string and count is kept separately that is entirely valid for each
751 # restriction. And, for each restriction, we note the first occurrence in
752 # the unrestricted string where we find something not in the restricted
753 # string.
754 $restriction_types{""}{'valid_strings'} .= $bytes;
755 $restriction_types{""}{'valid_counts'}++;
756
757 if ($valid_under_c9strict) {
758 $restriction_types{"c9strict"}{'valid_strings'} .= $bytes;
759 $restriction_types{"c9strict"}{'valid_counts'}++;
760 }
761 elsif (! exists $restriction_types{"c9strict"}{'first_invalid_offset'}) {
762 $restriction_types{"c9strict"}{'first_invalid_offset'}
763 = length $restriction_types{"c9strict"}{'valid_strings'};
764 $restriction_types{"c9strict"}{'first_invalid_count'}
765 = $restriction_types{"c9strict"}{'valid_counts'};
766 }
767
768 if ($valid_under_strict) {
769 $restriction_types{"strict"}{'valid_strings'} .= $bytes;
770 $restriction_types{"strict"}{'valid_counts'}++;
771 }
772 elsif (! exists $restriction_types{"strict"}{'first_invalid_offset'}) {
773 $restriction_types{"strict"}{'first_invalid_offset'}
774 = length $restriction_types{"strict"}{'valid_strings'};
775 $restriction_types{"strict"}{'first_invalid_count'}
776 = $restriction_types{"strict"}{'valid_counts'};
777 }
778
779 if ($valid_for_fits_in_31_bits) {
780 $restriction_types{"fits_in_31_bits"}{'valid_strings'} .= $bytes;
781 $restriction_types{"fits_in_31_bits"}{'valid_counts'}++;
782 }
783 elsif (! exists
784 $restriction_types{"fits_in_31_bits"}{'first_invalid_offset'})
785 {
786 $restriction_types{"fits_in_31_bits"}{'first_invalid_offset'}
787 = length $restriction_types{"fits_in_31_bits"}{'valid_strings'};
788 $restriction_types{"fits_in_31_bits"}{'first_invalid_count'}
789 = $restriction_types{"fits_in_31_bits"}{'valid_counts'};
790 }
791}
792
793my $I8c = (isASCII) ? "\x80" : "\xa0"; # A continuation byte
794my $cont_byte = I8_to_native($I8c);
795my $p = (isASCII) ? "\xe1\x80" : I8_to_native("\xE4\xA0"); # partial
796
797# The loop above tested the single or partial character functions/macros,
798# while building up strings to test the string functions, which we do now.
799
800for my $restriction (sort keys %restriction_types) {
801 use bytes;
802
803 for my $use_flags ("", "_flags") {
804
805 # For each restriction, we test it in both the is_foo_flags functions
806 # and the specially named foo function. But not if there isn't such a
807 # specially named function. Currently, this is the only tested
808 # restriction that doesn't have a specially named function
809 next if $use_flags eq "" && $restriction eq "fits_in_31_bits";
810
811 # Start building up the name of the function we will test.
8bc127bf 812 my $base_name = "is_";
9f2abfde
KW
813
814 if (! $use_flags && $restriction ne "") {
8bc127bf 815 $base_name .= $restriction . "_";
9f2abfde 816 }
8bc127bf
KW
817
818 # We test both "is_utf8_string_foo" and "is_fixed_width_buf" functions
819 foreach my $operand ('string', 'fixed_width_buf') {
820
821 # Currently, the only fixed_width_buf functions have the '_flags'
822 # suffix.
823 next if $operand eq 'fixed_width_buf' && $use_flags eq "";
824
825 my $name = "${base_name}utf8_$operand";
9f2abfde
KW
826
827 # We test each version of the function
828 for my $function ("_loclen", "_loc", "") {
829
830 # We test each function against
831 # a) valid input
832 # b) invalid input created by appending an out-of-place
833 # continuation character to the valid string
8bc127bf
KW
834 # c) input created by appending a partial character. This
835 # is valid in the 'fixed_width' functions, but invalid in
836 # the 'string' ones
9f2abfde
KW
837 # d) invalid input created by calling a function that is
838 # expecting a restricted form of the input using the string
839 # that's valid when unrestricted
840 for my $error_type (0, $cont_byte, $p, $restriction) {
841 #diag "restriction=$restriction, use_flags=$use_flags, function=$function, error_type=" . display_bytes($error_type);
842
843 # If there is no restriction, the error type will be "",
844 # which is redundant with 0.
845 next if $error_type eq "";
846
847 my $this_name = "$name$function$use_flags";
848 my $bytes
849 = $restriction_types{$restriction}{'valid_strings'};
850 my $expected_offset = length $bytes;
851 my $expected_count
852 = $restriction_types{$restriction}{'valid_counts'};
853 my $test_name_suffix = "";
854
855 my $this_error_type = $error_type;
856 if ($this_error_type) {
857
858 # Appending a bare continuation byte or a partial
8bc127bf
KW
859 # character doesn't change the character count or
860 # offset. But in the other cases, we have saved where
861 # the failures should occur, so use those. Appending
862 # a continuation byte makes it invalid; appending a
863 # partial character makes the 'string' form invalid,
864 # but not the 'fixed_width_buf' form.
9f2abfde
KW
865 if ($this_error_type eq $cont_byte || $this_error_type eq $p) {
866 $bytes .= $this_error_type;
867 if ($this_error_type eq $cont_byte) {
868 $test_name_suffix
869 = " for an unexpected continuation";
870 }
871 else {
872 $test_name_suffix
873 = " if ends with a partial character";
8bc127bf
KW
874 $this_error_type
875 = 0 if $operand eq "fixed_width_buf";
9f2abfde
KW
876 }
877 }
878 else {
879 $test_name_suffix
880 = " if contains forbidden code points";
881 if ($this_error_type eq "c9strict") {
882 $bytes = $restriction_types{""}{'valid_strings'};
883 $expected_offset
884 = $restriction_types{"c9strict"}
885 {'first_invalid_offset'};
886 $expected_count
887 = $restriction_types{"c9strict"}
888 {'first_invalid_count'};
889 }
890 elsif ($this_error_type eq "strict") {
891 $bytes = $restriction_types{""}{'valid_strings'};
892 $expected_offset
893 = $restriction_types{"strict"}
894 {'first_invalid_offset'};
895 $expected_count
896 = $restriction_types{"strict"}
897 {'first_invalid_count'};
898
899 }
900 elsif ($this_error_type eq "fits_in_31_bits") {
901 $bytes = $restriction_types{""}{'valid_strings'};
902 $expected_offset
903 = $restriction_types{"fits_in_31_bits"}
904 {'first_invalid_offset'};
905 $expected_count
906 = $restriction_types{"fits_in_31_bits"}
907 {'first_invalid_count'};
908 }
909 else {
910 fail("Internal test error: Unknown error type "
911 . "'$this_error_type'");
912 next;
913 }
914 }
915 }
916
917 my $length = length $bytes;
918 my $ret_ref;
919
920 my $test = "\$ret_ref = test_$this_name(\$bytes, $length";
921
922 # If using the _flags functions, we have to figure out what
923 # flags to pass. This is done to match the restriction.
924 if ($use_flags eq "_flags") {
925 if (! $restriction) {
926 $test .= ", 0"; # The flag
927
928 # Indicate the kind of flag in the test name.
929 $this_name .= "(0)";
930 }
931 else {
932 $this_name .= "($restriction)";
933 if ($restriction eq "c9strict") {
934 $test
935 .= ", $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE";
936 }
937 elsif ($restriction eq "strict") {
938 $test .= ", $UTF8_DISALLOW_ILLEGAL_INTERCHANGE";
939 }
940 elsif ($restriction eq "fits_in_31_bits") {
941 $test .= ", $UTF8_DISALLOW_ABOVE_31_BIT";
942 }
943 else {
944 fail("Internal test error: Unknown restriction "
945 . "'$restriction'");
946 next;
947 }
948 }
949 }
950 $test .= ")";
951
952 # Actually run the test
953 eval $test;
954 if ($@) {
955 fail($test);
956 diag $@;
957 next;
958 }
959
960 my $ret;
961 my $error_offset;
962 my $cp_count;
963
964 if ($function eq "") {
965 $ret = $ret_ref; # For plain function, there's only a
966 # single return value
967 }
968 else { # Otherwise, the multiple values come in an array.
969 $ret = shift @$ret_ref ;
970 $error_offset = shift @$ret_ref;
971 $cp_count = shift@$ret_ref if $function eq "_loclen";
972 }
973
974 if ($this_error_type) {
975 is($ret, 0,
976 "Verify $this_name is FALSE$test_name_suffix");
977 }
978 else {
979 unless(is($ret, 1,
980 "Verify $this_name is TRUE for valid input"
981 . "$test_name_suffix"))
982 {
983 diag("The bytes starting at offset"
984 . " $error_offset are"
985 . display_bytes(substr(
986 $restriction_types{$restriction}
987 {'valid_strings'},
988 $error_offset)));
989 next;
990 }
991 }
992
993 if ($function ne "") {
994 unless (is($error_offset, $expected_offset,
995 "\tAnd returns the correct offset"))
996 {
997 my $min = ($error_offset < $expected_offset)
998 ? $error_offset
999 : $expected_offset;
1000 diag display_bytes(substr($bytes, $min));
1001 }
1002
1003 if ($function eq '_loclen') {
1004 is($cp_count, $expected_count,
1005 "\tAnd returns the correct character count");
1006 }
1007 }
1008 }
1009 }
8bc127bf 1010 }
9f2abfde 1011 }
7dfd8446
KW
1012}
1013
1014my $REPLACEMENT = 0xFFFD;
1015
1016# Now test the malformations. All these raise category utf8 warnings.
7dfd8446 1017my @malformations = (
eb83ed87 1018 [ "zero length string malformation", "", 0,
f9380377 1019 $UTF8_ALLOW_EMPTY, $UTF8_GOT_EMPTY, 0, 0,
eb83ed87
KW
1020 qr/empty string/
1021 ],
152c1f4b 1022 [ "orphan continuation byte malformation", I8_to_native("${I8c}a"),
7dfd8446 1023 2,
f9380377 1024 $UTF8_ALLOW_CONTINUATION, $UTF8_GOT_CONTINUATION, $REPLACEMENT, 1,
eb83ed87
KW
1025 qr/unexpected continuation byte/
1026 ],
7dfd8446 1027 [ "premature next character malformation (immediate)",
9d2d0ecd
KW
1028 (isASCII) ? "\xc2\xc2\x80" : I8_to_native("\xc5\xc5\xa0"),
1029 3,
f9380377 1030 $UTF8_ALLOW_NON_CONTINUATION, $UTF8_GOT_NON_CONTINUATION, $REPLACEMENT, 1,
eb83ed87
KW
1031 qr/unexpected non-continuation byte.*immediately after start byte/
1032 ],
7dfd8446 1033 [ "premature next character malformation (non-immediate)",
1b514755 1034 I8_to_native("\xf1${I8c}a"),
7dfd8446 1035 3,
f9380377 1036 $UTF8_ALLOW_NON_CONTINUATION, $UTF8_GOT_NON_CONTINUATION, $REPLACEMENT, 2,
eb83ed87
KW
1037 qr/unexpected non-continuation byte .* 2 bytes after start byte/
1038 ],
1b514755 1039 [ "too short malformation", I8_to_native("\xf1${I8c}a"), 2,
eb83ed87
KW
1040 # Having the 'a' after this, but saying there are only 2 bytes also
1041 # tests that we pay attention to the passed in length
f9380377 1042 $UTF8_ALLOW_SHORT, $UTF8_GOT_SHORT, $REPLACEMENT, 2,
eb83ed87
KW
1043 qr/2 bytes, need 4/
1044 ],
d566bd20
KW
1045 [ "overlong malformation, lowest 2-byte",
1046 (isASCII) ? "\xc0\x80" : I8_to_native("\xc0\xa0"),
1047 2,
f9380377 1048 $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
7dfd8446
KW
1049 0, # NUL
1050 2,
7cf8d05d 1051 qr/overlong/
c0236afe 1052 ],
d566bd20
KW
1053 [ "overlong malformation, highest 2-byte",
1054 (isASCII) ? "\xc1\xbf" : I8_to_native("\xc4\xbf"),
1055 2,
f9380377 1056 $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
f9913875 1057 (isASCII) ? 0x7F : utf8::unicode_to_native(0x9F),
d566bd20 1058 2,
7cf8d05d 1059 qr/overlong/
d566bd20
KW
1060 ],
1061 [ "overlong malformation, lowest 3-byte",
1062 (isASCII) ? "\xe0\x80\x80" : I8_to_native("\xe0\xa0\xa0"),
1063 3,
f9380377 1064 $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
d566bd20
KW
1065 0, # NUL
1066 3,
7cf8d05d 1067 qr/overlong/
d566bd20
KW
1068 ],
1069 [ "overlong malformation, highest 3-byte",
1070 (isASCII) ? "\xe0\x9f\xbf" : I8_to_native("\xe0\xbf\xbf"),
1071 3,
f9380377 1072 $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
d566bd20
KW
1073 (isASCII) ? 0x7FF : 0x3FF,
1074 3,
7cf8d05d 1075 qr/overlong/
d566bd20
KW
1076 ],
1077 [ "overlong malformation, lowest 4-byte",
1078 (isASCII) ? "\xf0\x80\x80\x80" : I8_to_native("\xf0\xa0\xa0\xa0"),
1079 4,
f9380377 1080 $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
d566bd20
KW
1081 0, # NUL
1082 4,
7cf8d05d 1083 qr/overlong/
d566bd20
KW
1084 ],
1085 [ "overlong malformation, highest 4-byte",
1086 (isASCII) ? "\xf0\x8F\xbf\xbf" : I8_to_native("\xf0\xaf\xbf\xbf"),
1087 4,
f9380377 1088 $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
d566bd20
KW
1089 (isASCII) ? 0xFFFF : 0x3FFF,
1090 4,
7cf8d05d 1091 qr/overlong/
d566bd20
KW
1092 ],
1093 [ "overlong malformation, lowest 5-byte",
1094 (isASCII)
1095 ? "\xf8\x80\x80\x80\x80"
1096 : I8_to_native("\xf8\xa0\xa0\xa0\xa0"),
1097 5,
f9380377 1098 $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
d566bd20
KW
1099 0, # NUL
1100 5,
7cf8d05d 1101 qr/overlong/
d566bd20
KW
1102 ],
1103 [ "overlong malformation, highest 5-byte",
1104 (isASCII)
1105 ? "\xf8\x87\xbf\xbf\xbf"
1106 : I8_to_native("\xf8\xa7\xbf\xbf\xbf"),
1107 5,
f9380377 1108 $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
d566bd20
KW
1109 (isASCII) ? 0x1FFFFF : 0x3FFFF,
1110 5,
7cf8d05d 1111 qr/overlong/
d566bd20
KW
1112 ],
1113 [ "overlong malformation, lowest 6-byte",
1114 (isASCII)
1115 ? "\xfc\x80\x80\x80\x80\x80"
1116 : I8_to_native("\xfc\xa0\xa0\xa0\xa0\xa0"),
1117 6,
f9380377 1118 $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
d566bd20
KW
1119 0, # NUL
1120 6,
7cf8d05d 1121 qr/overlong/
d566bd20
KW
1122 ],
1123 [ "overlong malformation, highest 6-byte",
1124 (isASCII)
1125 ? "\xfc\x83\xbf\xbf\xbf\xbf"
1126 : I8_to_native("\xfc\xa3\xbf\xbf\xbf\xbf"),
1127 6,
f9380377 1128 $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
d566bd20
KW
1129 (isASCII) ? 0x3FFFFFF : 0x3FFFFF,
1130 6,
7cf8d05d 1131 qr/overlong/
d566bd20
KW
1132 ],
1133 [ "overlong malformation, lowest 7-byte",
1134 (isASCII)
1135 ? "\xfe\x80\x80\x80\x80\x80\x80"
1136 : I8_to_native("\xfe\xa0\xa0\xa0\xa0\xa0\xa0"),
1137 7,
f9380377 1138 $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
d566bd20
KW
1139 0, # NUL
1140 7,
7cf8d05d 1141 qr/overlong/
d566bd20
KW
1142 ],
1143 [ "overlong malformation, highest 7-byte",
1144 (isASCII)
1145 ? "\xfe\x81\xbf\xbf\xbf\xbf\xbf"
1146 : I8_to_native("\xfe\xa1\xbf\xbf\xbf\xbf\xbf"),
1147 7,
f9380377 1148 $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
d566bd20
KW
1149 (isASCII) ? 0x7FFFFFFF : 0x3FFFFFF,
1150 7,
7cf8d05d 1151 qr/overlong/
c0236afe
KW
1152 ],
1153);
7dfd8446 1154
d566bd20
KW
1155if (isASCII && ! $is64bit) { # 32-bit ASCII platform
1156 no warnings 'portable';
1157 push @malformations,
1158 [ "overflow malformation",
1159 "\xfe\x84\x80\x80\x80\x80\x80", # Represents 2**32
1160 7,
1161 0, # There is no way to allow this malformation
f9380377 1162 $UTF8_GOT_OVERFLOW,
d566bd20
KW
1163 $REPLACEMENT,
1164 7,
7cf8d05d 1165 qr/overflows/
d566bd20
KW
1166 ],
1167 [ "overflow malformation, can tell on first byte",
1168 "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
78a3c0f8 1169 $max_bytes,
d566bd20 1170 0, # There is no way to allow this malformation
f9380377 1171 $UTF8_GOT_OVERFLOW,
d566bd20 1172 $REPLACEMENT,
78a3c0f8 1173 $max_bytes,
7cf8d05d 1174 qr/overflows/
d566bd20
KW
1175 ];
1176}
1177else {
1178 # On EBCDIC platforms, another overlong test is needed even on 32-bit
1179 # systems, whereas it doesn't happen on ASCII except on 64-bit ones.
1180
1181 no warnings 'portable';
1182 no warnings 'overflow'; # Doesn't run on 32-bit systems, but compiles
1183 push @malformations,
1184 [ "overlong malformation, lowest max-byte",
1185 (isASCII)
1186 ? "\xff\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
1187 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
78a3c0f8 1188 $max_bytes,
f9380377 1189 $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
d566bd20 1190 0, # NUL
78a3c0f8 1191 $max_bytes,
7cf8d05d 1192 qr/overlong/,
d566bd20
KW
1193 ],
1194 [ "overlong malformation, highest max-byte",
1195 (isASCII) # 2**36-1 on ASCII; 2**30-1 on EBCDIC
1196 ? "\xff\x80\x80\x80\x80\x80\x80\xbf\xbf\xbf\xbf\xbf\xbf"
1197 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xbf\xbf\xbf\xbf\xbf\xbf"),
78a3c0f8 1198 $max_bytes,
f9380377 1199 $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
d566bd20 1200 (isASCII) ? 0xFFFFFFFFF : 0x3FFFFFFF,
78a3c0f8 1201 $max_bytes,
7cf8d05d 1202 qr/overlong/,
d566bd20
KW
1203 ];
1204
1205 if (! $is64bit) { # 32-bit EBCDIC
1206 push @malformations,
1207 [ "overflow malformation",
1208 I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
78a3c0f8 1209 $max_bytes,
d566bd20 1210 0, # There is no way to allow this malformation
f9380377 1211 $UTF8_GOT_OVERFLOW,
d566bd20 1212 $REPLACEMENT,
78a3c0f8 1213 $max_bytes,
7cf8d05d 1214 qr/overflows/
d566bd20
KW
1215 ];
1216 }
1217 else { # 64-bit
1218 push @malformations,
1219 [ "overflow malformation",
1220 (isASCII)
1221 ? "\xff\x80\x90\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
1222 : I8_to_native("\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
78a3c0f8 1223 $max_bytes,
d566bd20 1224 0, # There is no way to allow this malformation
f9380377 1225 $UTF8_GOT_OVERFLOW,
d566bd20 1226 $REPLACEMENT,
78a3c0f8 1227 $max_bytes,
7cf8d05d 1228 qr/overflows/
d566bd20
KW
1229 ];
1230 }
1231}
1232
7dfd8446 1233foreach my $test (@malformations) {
f9380377
KW
1234 my ($testname, $bytes, $length, $allow_flags, $expected_error_flags,
1235 $allowed_uv, $expected_len, $message ) = @$test;
eb83ed87 1236
96f5e3aa
KW
1237 if (length($bytes) < $length) {
1238 fail("Internal test error: actual buffer length (" . length($bytes)
1239 . ") must be at least as high as how far we are allowed to read"
1240 . " into it ($length)");
1241 diag($testname);
1242 next;
1243 }
eb83ed87 1244
d7874298
KW
1245 undef @warnings;
1246
1247 my $ret = test_isUTF8_CHAR($bytes, $length);
1248 is($ret, 0, "$testname: isUTF8_CHAR returns 0");
1249 unless (is(scalar @warnings, 0,
1250 "$testname: isUTF8_CHAR() generated no warnings"))
1251 {
d84e92aa 1252 output_warnings(@warnings);
d7874298
KW
1253 }
1254
25e3a4e0
KW
1255 undef @warnings;
1256
1257 $ret = test_isUTF8_CHAR_flags($bytes, $length, 0);
1258 is($ret, 0, "$testname: isUTF8_CHAR_flags returns 0");
1259 unless (is(scalar @warnings, 0,
1260 "$testname: isUTF8_CHAR() generated no warnings"))
1261 {
d84e92aa 1262 output_warnings(@warnings);
25e3a4e0
KW
1263 }
1264
e23e8bc1
KW
1265 $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
1266 is($ret, 0, "$testname: isSTRICT_UTF8_CHAR returns 0");
1267 unless (is(scalar @warnings, 0,
1268 "$testname: isSTRICT_UTF8_CHAR() generated no warnings"))
1269 {
d84e92aa 1270 output_warnings(@warnings);
e23e8bc1
KW
1271 }
1272
a82be82b
KW
1273 $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
1274 is($ret, 0, "$testname: isC9_STRICT_UTF8_CHAR returns 0");
1275 unless (is(scalar @warnings, 0,
1276 "$testname: isC9_STRICT_UTF8_CHAR() generated no warnings"))
1277 {
d84e92aa 1278 output_warnings(@warnings);
a82be82b
KW
1279 }
1280
3d56ecbe
KW
1281 for my $j (1 .. $length - 1) {
1282 my $partial = substr($bytes, 0, $j);
1283
1284 undef @warnings;
1285
1286 $ret = test_is_utf8_valid_partial_char_flags($bytes, $j, 0);
1287 my $ret_should_be = 0;
1288 my $comment = "";
1b514755
KW
1289 if ($testname =~ /premature|short/ && $j < 3) {
1290
1291 # The tests are hard-coded so these relationships hold
1292 my $cut_off = 2;
1293 $cut_off = 3 if $testname =~ /non-immediate/;
1294 if ($j < $cut_off) {
1295 $ret_should_be = 1;
1296 $comment = ", but need $cut_off bytes to discern:";
1297 }
3d56ecbe 1298 }
57b7eb53
KW
1299 elsif ($testname =~ /overlong/ && ! isASCII && $length == 3) {
1300 # 3-byte overlongs on EBCDIC are determinable on the first byte
1301 }
3d56ecbe
KW
1302 elsif ($testname =~ /overlong/ && $length > 2) {
1303 if ($length <= 7 && $j < 2) {
1304 $ret_should_be = 1;
1305 $comment = ", but need 2 bytes to discern:";
1306 }
1307 elsif ($length > 7 && $j < 7) {
1308 $ret_should_be = 1;
1309 $comment = ", but need 7 bytes to discern:";
1310 }
1311 }
1312 elsif ($testname =~ /overflow/ && $testname !~ /first byte/) {
1313 if (isASCII) {
1314 if ($j < (($is64bit) ? 3 : 2)) {
1315 $comment = ", but need $j bytes to discern:";
1316 $ret_should_be = 1;
1317 }
1318 }
1319 else {
1320 if ($j < (($is64bit) ? 2 : 8)) {
1321 $comment = ", but need $j bytes to discern:";
1322 $ret_should_be = 1;
1323 }
1324 }
1325 }
1326 is($ret, $ret_should_be, "$testname: is_utf8_valid_partial_char_flags("
1327 . display_bytes($partial)
1328 . ")$comment returns $ret_should_be");
1329 unless (is(scalar @warnings, 0,
1330 "$testname: is_utf8_valid_partial_char_flags() generated no warnings"))
1331 {
d84e92aa 1332 output_warnings(@warnings);
3d56ecbe
KW
1333 }
1334 }
1335
d7874298 1336
eb83ed87
KW
1337 # Test what happens when this malformation is not allowed
1338 undef @warnings;
f9380377 1339 my $ret_ref = test_utf8n_to_uvchr_error($bytes, $length, 0);
eb83ed87 1340 is($ret_ref->[0], 0, "$testname: disallowed: Returns 0");
f9380377
KW
1341 is($ret_ref->[1], $expected_len,
1342 "$testname: utf8n_to_uvchr_error(), disallowed: Returns expected"
1343 . " length: $expected_len");
1344 if (is(scalar @warnings, 1,
1345 "$testname: disallowed: Got a single warning "))
1346 {
1347 like($warnings[0], $message,
1348 "$testname: disallowed: Got expected warning");
eb83ed87
KW
1349 }
1350 else {
1351 if (scalar @warnings) {
d84e92aa 1352 output_warnings(@warnings);
eb83ed87
KW
1353 }
1354 }
f9380377
KW
1355 is($ret_ref->[2], $expected_error_flags,
1356 "$testname: utf8n_to_uvchr_error(), disallowed:"
1357 . " Returns expected error");
eb83ed87
KW
1358
1359 { # Next test when disallowed, and warnings are off.
1360 undef @warnings;
1361 no warnings 'utf8';
f9380377
KW
1362 my $ret_ref = test_utf8n_to_uvchr_error($bytes, $length, 0);
1363 is($ret_ref->[0], 0,
1364 "$testname: utf8n_to_uvchr_error(), disallowed: no warnings 'utf8':"
1365 . " Returns 0");
1366 is($ret_ref->[1], $expected_len,
1367 "$testname: utf8n_to_uvchr_error(), disallowed: no warnings 'utf8':"
1368 . " Returns expected length: $expected_len");
1369 if (!is(scalar @warnings, 0,
1370 "$testname: utf8n_to_uvchr_error(), disallowed: no warnings 'utf8':"
1371 . " no warnings generated"))
1372 {
d84e92aa 1373 output_warnings(@warnings);
eb83ed87 1374 }
f9380377
KW
1375 is($ret_ref->[2], $expected_error_flags,
1376 "$testname: utf8n_to_uvchr_error(), disallowed: Returns"
1377 . " expected error");
eb83ed87
KW
1378 }
1379
1380 # Test with CHECK_ONLY
1381 undef @warnings;
f9380377 1382 $ret_ref = test_utf8n_to_uvchr_error($bytes, $length, $UTF8_CHECK_ONLY);
eb83ed87 1383 is($ret_ref->[0], 0, "$testname: CHECK_ONLY: Returns 0");
9d2d0ecd 1384 is($ret_ref->[1], -1, "$testname: CHECK_ONLY: returns -1 for length");
eb83ed87 1385 if (! is(scalar @warnings, 0, "$testname: CHECK_ONLY: no warnings generated")) {
d84e92aa 1386 output_warnings(@warnings);
eb83ed87 1387 }
f9380377
KW
1388 is($ret_ref->[2], $expected_error_flags,
1389 "$testname: utf8n_to_uvchr_error(), disallowed: Returns expected"
1390 . " error");
eb83ed87
KW
1391
1392 next if $allow_flags == 0; # Skip if can't allow this malformation
1393
1394 # Test when the malformation is allowed
1395 undef @warnings;
f9380377
KW
1396 $ret_ref = test_utf8n_to_uvchr_error($bytes, $length, $allow_flags);
1397 is($ret_ref->[0], $allowed_uv,
1398 "$testname: utf8n_to_uvchr_error(), allowed: Returns expected uv: "
1399 . sprintf("0x%04X", $allowed_uv));
1400 is($ret_ref->[1], $expected_len,
1401 "$testname: utf8n_to_uvchr_error(), allowed: Returns expected length:"
1402 . " $expected_len");
1403 if (!is(scalar @warnings, 0,
1404 "$testname: utf8n_to_uvchr_error(), allowed: no warnings"
1405 . " generated"))
eb83ed87 1406 {
d84e92aa 1407 output_warnings(@warnings);
eb83ed87 1408 }
f9380377
KW
1409 is($ret_ref->[2], $expected_error_flags,
1410 "$testname: utf8n_to_uvchr_error(), disallowed: Returns"
1411 . " expected error");
eb83ed87
KW
1412}
1413
2b5e7bc2
KW
1414sub nonportable_regex ($) {
1415
1416 # Returns a pattern that matches the non-portable message raised either
1417 # for the specific input code point, or the one generated when there
1418 # is some malformation that precludes the message containing the specific
1419 # code point
1420
1421 my $code_point = shift;
1422
22123136 1423 my $string = sprintf '(Code point 0x%X is not Unicode, and'
2b5e7bc2
KW
1424 . '|Any UTF-8 sequence that starts with'
1425 . ' "(\\\x[[:xdigit:]]{2})+" is for a'
1426 . ' non-Unicode code point, and is) not portable',
1427 $code_point;
1428 return qr/$string/;
1429}
1430
eb83ed87
KW
1431# Now test the cases where a legal code point is generated, but may or may not
1432# be allowed/warned on.
2f8f112e 1433my @tests = (
7dfd8446
KW
1434 [ "lowest surrogate",
1435 (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
f9380377 1436 $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE, $UTF8_GOT_SURROGATE,
7dfd8446
KW
1437 'surrogate', 0xD800,
1438 (isASCII) ? 3 : 4,
1439 qr/surrogate/
1440 ],
1441 [ "a middle surrogate",
1442 (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
f9380377 1443 $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE, $UTF8_GOT_SURROGATE,
7dfd8446
KW
1444 'surrogate', 0xD90D,
1445 (isASCII) ? 3 : 4,
eb83ed87
KW
1446 qr/surrogate/
1447 ],
7dfd8446
KW
1448 [ "highest surrogate",
1449 (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
f9380377 1450 $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE, $UTF8_GOT_SURROGATE,
7dfd8446
KW
1451 'surrogate', 0xDFFF,
1452 (isASCII) ? 3 : 4,
1453 qr/surrogate/
1454 ],
1455 [ "first non_unicode",
1456 (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
f9380377 1457 $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER, $UTF8_GOT_SUPER,
7dfd8446
KW
1458 'non_unicode', 0x110000,
1459 (isASCII) ? 4 : 5,
2b5e7bc2 1460 qr/(not Unicode|for a non-Unicode code point).* may not be portable/
7dfd8446 1461 ],
d566bd20
KW
1462 [ "non_unicode whose first byte tells that",
1463 (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
f9380377 1464 $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER, $UTF8_GOT_SUPER,
d566bd20
KW
1465 'non_unicode',
1466 (isASCII) ? 0x140000 : 0x200000,
1467 (isASCII) ? 4 : 5,
2b5e7bc2 1468 qr/(not Unicode|for a non-Unicode code point).* may not be portable/
d566bd20 1469 ],
7dfd8446
KW
1470 [ "first of 32 consecutive non-character code points",
1471 (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
f9380377 1472 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1473 'nonchar', 0xFDD0,
1474 (isASCII) ? 3 : 4,
1475 qr/Unicode non-character.*is not recommended for open interchange/
1476 ],
1477 [ "a mid non-character code point of the 32 consecutive ones",
1478 (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
f9380377 1479 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1480 'nonchar', 0xFDE0,
1481 (isASCII) ? 3 : 4,
1482 qr/Unicode non-character.*is not recommended for open interchange/
1483 ],
1484 [ "final of 32 consecutive non-character code points",
1485 (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
f9380377 1486 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1487 'nonchar', 0xFDEF,
1488 (isASCII) ? 3 : 4,
1489 qr/Unicode non-character.*is not recommended for open interchange/
1490 ],
1491 [ "non-character code point U+FFFE",
1492 (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
f9380377 1493 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1494 'nonchar', 0xFFFE,
1495 (isASCII) ? 3 : 4,
1496 qr/Unicode non-character.*is not recommended for open interchange/
1497 ],
1498 [ "non-character code point U+FFFF",
1499 (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
f9380377 1500 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1501 'nonchar', 0xFFFF,
1502 (isASCII) ? 3 : 4,
1503 qr/Unicode non-character.*is not recommended for open interchange/
1504 ],
1505 [ "non-character code point U+1FFFE",
1506 (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
f9380377 1507 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1508 'nonchar', 0x1FFFE, 4,
1509 qr/Unicode non-character.*is not recommended for open interchange/
1510 ],
1511 [ "non-character code point U+1FFFF",
1512 (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
f9380377 1513 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1514 'nonchar', 0x1FFFF, 4,
1515 qr/Unicode non-character.*is not recommended for open interchange/
1516 ],
1517 [ "non-character code point U+2FFFE",
1518 (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
f9380377 1519 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1520 'nonchar', 0x2FFFE, 4,
1521 qr/Unicode non-character.*is not recommended for open interchange/
1522 ],
1523 [ "non-character code point U+2FFFF",
1524 (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
f9380377 1525 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1526 'nonchar', 0x2FFFF, 4,
1527 qr/Unicode non-character.*is not recommended for open interchange/
1528 ],
1529 [ "non-character code point U+3FFFE",
1530 (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
f9380377 1531 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1532 'nonchar', 0x3FFFE, 4,
1533 qr/Unicode non-character.*is not recommended for open interchange/
1534 ],
1535 [ "non-character code point U+3FFFF",
1536 (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
f9380377 1537 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1538 'nonchar', 0x3FFFF, 4,
1539 qr/Unicode non-character.*is not recommended for open interchange/
1540 ],
1541 [ "non-character code point U+4FFFE",
1542 (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
f9380377 1543 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1544 'nonchar', 0x4FFFE,
1545 (isASCII) ? 4 : 5,
1546 qr/Unicode non-character.*is not recommended for open interchange/
1547 ],
1548 [ "non-character code point U+4FFFF",
1549 (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
f9380377 1550 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1551 'nonchar', 0x4FFFF,
1552 (isASCII) ? 4 : 5,
1553 qr/Unicode non-character.*is not recommended for open interchange/
1554 ],
1555 [ "non-character code point U+5FFFE",
1556 (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
f9380377 1557 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1558 'nonchar', 0x5FFFE,
1559 (isASCII) ? 4 : 5,
1560 qr/Unicode non-character.*is not recommended for open interchange/
1561 ],
1562 [ "non-character code point U+5FFFF",
1563 (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
f9380377 1564 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1565 'nonchar', 0x5FFFF,
1566 (isASCII) ? 4 : 5,
1567 qr/Unicode non-character.*is not recommended for open interchange/
1568 ],
1569 [ "non-character code point U+6FFFE",
1570 (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
f9380377 1571 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1572 'nonchar', 0x6FFFE,
1573 (isASCII) ? 4 : 5,
1574 qr/Unicode non-character.*is not recommended for open interchange/
1575 ],
1576 [ "non-character code point U+6FFFF",
1577 (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
f9380377 1578 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1579 'nonchar', 0x6FFFF,
1580 (isASCII) ? 4 : 5,
1581 qr/Unicode non-character.*is not recommended for open interchange/
1582 ],
1583 [ "non-character code point U+7FFFE",
1584 (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
f9380377 1585 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1586 'nonchar', 0x7FFFE,
1587 (isASCII) ? 4 : 5,
1588 qr/Unicode non-character.*is not recommended for open interchange/
1589 ],
1590 [ "non-character code point U+7FFFF",
1591 (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
f9380377 1592 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1593 'nonchar', 0x7FFFF,
1594 (isASCII) ? 4 : 5,
1595 qr/Unicode non-character.*is not recommended for open interchange/
1596 ],
1597 [ "non-character code point U+8FFFE",
1598 (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
f9380377 1599 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1600 'nonchar', 0x8FFFE,
1601 (isASCII) ? 4 : 5,
1602 qr/Unicode non-character.*is not recommended for open interchange/
1603 ],
1604 [ "non-character code point U+8FFFF",
1605 (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
f9380377 1606 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1607 'nonchar', 0x8FFFF,
1608 (isASCII) ? 4 : 5,
1609 qr/Unicode non-character.*is not recommended for open interchange/
1610 ],
1611 [ "non-character code point U+9FFFE",
1612 (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
f9380377 1613 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1614 'nonchar', 0x9FFFE,
1615 (isASCII) ? 4 : 5,
1616 qr/Unicode non-character.*is not recommended for open interchange/
1617 ],
1618 [ "non-character code point U+9FFFF",
1619 (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
f9380377 1620 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1621 'nonchar', 0x9FFFF,
1622 (isASCII) ? 4 : 5,
1623 qr/Unicode non-character.*is not recommended for open interchange/
1624 ],
1625 [ "non-character code point U+AFFFE",
1626 (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
f9380377 1627 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1628 'nonchar', 0xAFFFE,
1629 (isASCII) ? 4 : 5,
1630 qr/Unicode non-character.*is not recommended for open interchange/
1631 ],
1632 [ "non-character code point U+AFFFF",
1633 (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
f9380377 1634 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1635 'nonchar', 0xAFFFF,
1636 (isASCII) ? 4 : 5,
1637 qr/Unicode non-character.*is not recommended for open interchange/
1638 ],
1639 [ "non-character code point U+BFFFE",
1640 (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
f9380377 1641 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1642 'nonchar', 0xBFFFE,
1643 (isASCII) ? 4 : 5,
1644 qr/Unicode non-character.*is not recommended for open interchange/
1645 ],
1646 [ "non-character code point U+BFFFF",
1647 (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
f9380377 1648 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1649 'nonchar', 0xBFFFF,
1650 (isASCII) ? 4 : 5,
1651 qr/Unicode non-character.*is not recommended for open interchange/
1652 ],
1653 [ "non-character code point U+CFFFE",
1654 (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
f9380377 1655 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1656 'nonchar', 0xCFFFE,
1657 (isASCII) ? 4 : 5,
1658 qr/Unicode non-character.*is not recommended for open interchange/
1659 ],
1660 [ "non-character code point U+CFFFF",
1661 (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
f9380377 1662 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1663 'nonchar', 0xCFFFF,
1664 (isASCII) ? 4 : 5,
1665 qr/Unicode non-character.*is not recommended for open interchange/
1666 ],
1667 [ "non-character code point U+DFFFE",
1668 (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
f9380377 1669 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1670 'nonchar', 0xDFFFE,
1671 (isASCII) ? 4 : 5,
1672 qr/Unicode non-character.*is not recommended for open interchange/
1673 ],
1674 [ "non-character code point U+DFFFF",
1675 (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
f9380377 1676 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1677 'nonchar', 0xDFFFF,
1678 (isASCII) ? 4 : 5,
1679 qr/Unicode non-character.*is not recommended for open interchange/
1680 ],
1681 [ "non-character code point U+EFFFE",
1682 (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
f9380377 1683 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1684 'nonchar', 0xEFFFE,
1685 (isASCII) ? 4 : 5,
1686 qr/Unicode non-character.*is not recommended for open interchange/
1687 ],
1688 [ "non-character code point U+EFFFF",
1689 (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
f9380377 1690 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1691 'nonchar', 0xEFFFF,
1692 (isASCII) ? 4 : 5,
1693 qr/Unicode non-character.*is not recommended for open interchange/
1694 ],
1695 [ "non-character code point U+FFFFE",
1696 (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
f9380377 1697 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1698 'nonchar', 0xFFFFE,
1699 (isASCII) ? 4 : 5,
1700 qr/Unicode non-character.*is not recommended for open interchange/
1701 ],
1702 [ "non-character code point U+FFFFF",
1703 (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
f9380377 1704 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1705 'nonchar', 0xFFFFF,
1706 (isASCII) ? 4 : 5,
1707 qr/Unicode non-character.*is not recommended for open interchange/
1708 ],
1709 [ "non-character code point U+10FFFE",
1710 (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
f9380377 1711 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1712 'nonchar', 0x10FFFE,
1713 (isASCII) ? 4 : 5,
1714 qr/Unicode non-character.*is not recommended for open interchange/
eb83ed87 1715 ],
7dfd8446
KW
1716 [ "non-character code point U+10FFFF",
1717 (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
f9380377 1718 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
7dfd8446
KW
1719 'nonchar', 0x10FFFF,
1720 (isASCII) ? 4 : 5,
ba707cdc 1721 qr/Unicode non-character.*is not recommended for open interchange/
eb83ed87 1722 ],
1d1c12d9 1723 [ "requires at least 32 bits",
c0236afe
KW
1724 (isASCII)
1725 ? "\xfe\x82\x80\x80\x80\x80\x80"
1726 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
eb83ed87 1727 # This code point is chosen so that it is representable in a UV on
2f8f112e 1728 # 32-bit machines
1d1c12d9 1729 $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
f9380377 1730 $UTF8_GOT_ABOVE_31_BIT,
78a3c0f8 1731 'utf8', 0x80000000, (isASCII) ? 7 : $max_bytes,
2b5e7bc2 1732 nonportable_regex(0x80000000)
7dfd8446 1733 ],
b0b342d4
KW
1734 [ "highest 32 bit code point",
1735 (isASCII)
1736 ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
1737 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
1738 $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
1739 $UTF8_GOT_ABOVE_31_BIT,
1740 'utf8', 0xFFFFFFFF, (isASCII) ? 7 : $max_bytes,
1741 nonportable_regex(0xffffffff)
1742 ],
1d1c12d9 1743 [ "requires at least 32 bits, and use SUPER-type flags, instead of ABOVE_31_BIT",
c0236afe
KW
1744 (isASCII)
1745 ? "\xfe\x82\x80\x80\x80\x80\x80"
1746 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
f9380377 1747 $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER, $UTF8_GOT_SUPER,
78a3c0f8 1748 'utf8', 0x80000000, (isASCII) ? 7 : $max_bytes,
2b5e7bc2 1749 nonportable_regex(0x80000000)
eb83ed87 1750 ],
1d1c12d9
KW
1751 [ "overflow with warnings/disallow for more than 31 bits",
1752 # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT
1753 # with overflow. The overflow malformation is never allowed, so
1754 # preventing it takes precedence if the ABOVE_31_BIT options would
c0236afe 1755 # otherwise allow in an overflowing value. The ASCII code points (1
1d1c12d9
KW
1756 # for 32-bits; 1 for 64) were chosen because the old overflow
1757 # detection algorithm did not catch them; this means this test also
c0236afe
KW
1758 # checks for that fix. The EBCDIC are arbitrary overflowing ones
1759 # since we have no reports of failures with it.
1760 (($is64bit)
1761 ? ((isASCII)
9d2d0ecd 1762 ? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
c0236afe
KW
1763 : I8_to_native("\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"))
1764 : ((isASCII)
9d2d0ecd 1765 ? "\xfe\x86\x80\x80\x80\x80\x80"
c0236afe 1766 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))),
f9380377
KW
1767 $UTF8_WARN_ABOVE_31_BIT,
1768 $UTF8_DISALLOW_ABOVE_31_BIT,
1769 $UTF8_GOT_ABOVE_31_BIT,
7dfd8446 1770 'utf8', 0,
78a3c0f8 1771 (! isASCII) ? $max_bytes : ($is64bit) ? $max_bytes : 7, # XXX
7cf8d05d 1772 qr/overflows/
eb83ed87 1773 ],
c0236afe 1774);
2f8f112e 1775
b0b342d4
KW
1776if (! $is64bit) {
1777 if (isASCII) {
1778 no warnings qw{portable overflow};
1779 push @tests,
1780 [ "Lowest 33 bit code point: overflow",
1781 "\xFE\x84\x80\x80\x80\x80\x80",
1782 $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
1783 $UTF8_GOT_ABOVE_31_BIT,
1784 'utf8', 0x100000000, 7,
1785 qr/and( is)? not portable/
1786 ];
1787 }
1788}
1789else {
c0236afe 1790 no warnings qw{portable overflow};
2f8f112e 1791 push @tests,
c0236afe
KW
1792 [ "More than 32 bits",
1793 (isASCII)
9d2d0ecd 1794 ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
c0236afe 1795 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1d1c12d9 1796 $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
f9380377 1797 $UTF8_GOT_ABOVE_31_BIT,
78a3c0f8 1798 'utf8', 0x1000000000, $max_bytes,
2b5e7bc2 1799 qr/and( is)? not portable/
2f8f112e 1800 ];
83dc0f42
KW
1801 if (! isASCII) {
1802 push @tests, # These could falsely show wrongly in a naive implementation
1803 [ "requires at least 32 bits",
1804 I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1805 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
f9380377 1806 $UTF8_GOT_ABOVE_31_BIT,
78a3c0f8 1807 'utf8', 0x800000000, $max_bytes,
2b5e7bc2 1808 nonportable_regex(0x80000000)
83dc0f42
KW
1809 ],
1810 [ "requires at least 32 bits",
1811 I8_to_native("\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1812 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
f9380377 1813 $UTF8_GOT_ABOVE_31_BIT,
78a3c0f8 1814 'utf8', 0x10000000000, $max_bytes,
2b5e7bc2 1815 nonportable_regex(0x10000000000)
83dc0f42
KW
1816 ],
1817 [ "requires at least 32 bits",
1818 I8_to_native("\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1819 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
f9380377 1820 $UTF8_GOT_ABOVE_31_BIT,
78a3c0f8 1821 'utf8', 0x200000000000, $max_bytes,
2b5e7bc2 1822 nonportable_regex(0x20000000000)
83dc0f42
KW
1823 ],
1824 [ "requires at least 32 bits",
1825 I8_to_native("\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1826 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
f9380377 1827 $UTF8_GOT_ABOVE_31_BIT,
78a3c0f8 1828 'utf8', 0x4000000000000, $max_bytes,
2b5e7bc2 1829 nonportable_regex(0x4000000000000)
83dc0f42
KW
1830 ],
1831 [ "requires at least 32 bits",
1832 I8_to_native("\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1833 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
f9380377 1834 $UTF8_GOT_ABOVE_31_BIT,
78a3c0f8 1835 'utf8', 0x80000000000000, $max_bytes,
2b5e7bc2 1836 nonportable_regex(0x80000000000000)
83dc0f42
KW
1837 ],
1838 [ "requires at least 32 bits",
1839 I8_to_native("\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
2b5e7bc2 1840 #IBM-1047 \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
83dc0f42 1841 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
f9380377 1842 $UTF8_GOT_ABOVE_31_BIT,
78a3c0f8 1843 'utf8', 0x1000000000000000, $max_bytes,
2b5e7bc2 1844 nonportable_regex(0x1000000000000000)
83dc0f42
KW
1845 ];
1846 }
2f8f112e
KW
1847}
1848
1849foreach my $test (@tests) {
f9380377
KW
1850 my ($testname, $bytes, $warn_flags, $disallow_flags, $expected_error_flags,
1851 $category, $allowed_uv, $expected_len, $message ) = @$test;
eb83ed87
KW
1852
1853 my $length = length $bytes;
2b5e7bc2 1854 my $will_overflow = $testname =~ /overflow/ ? 'overflow' : "";
eb83ed87 1855
d7874298
KW
1856 {
1857 use warnings;
1858 undef @warnings;
1859 my $ret = test_isUTF8_CHAR($bytes, $length);
25e3a4e0 1860 my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
d7874298
KW
1861 if ($will_overflow) {
1862 is($ret, 0, "isUTF8_CHAR() $testname: returns 0");
25e3a4e0 1863 is($ret_flags, 0, "isUTF8_CHAR_flags() $testname: returns 0");
d7874298
KW
1864 }
1865 else {
1866 is($ret, $length,
1867 "isUTF8_CHAR() $testname: returns expected length: $length");
25e3a4e0
KW
1868 is($ret_flags, $length,
1869 "isUTF8_CHAR_flags(...,0) $testname: returns expected length: $length");
d7874298
KW
1870 }
1871 unless (is(scalar @warnings, 0,
25e3a4e0 1872 "isUTF8_CHAR() and isUTF8_CHAR()_flags $testname: generated no warnings"))
d7874298 1873 {
d84e92aa 1874 output_warnings(@warnings);
d7874298 1875 }
3d56ecbe 1876
e23e8bc1
KW
1877 undef @warnings;
1878 $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
1879 if ($will_overflow) {
1880 is($ret, 0, "isSTRICT_UTF8_CHAR() $testname: returns 0");
1881 }
1882 else {
1883 my $expected_ret = ( $testname =~ /surrogate|non-character/
1884 || $allowed_uv > 0x10FFFF)
1885 ? 0
1886 : $length;
1887 is($ret, $expected_ret,
1888 "isSTRICT_UTF8_CHAR() $testname: returns expected length: $expected_ret");
25e3a4e0
KW
1889 $ret = test_isUTF8_CHAR_flags($bytes, $length,
1890 $UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
1891 is($ret, $expected_ret,
1892 "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE') acts like isSTRICT_UTF8_CHAR");
e23e8bc1
KW
1893 }
1894 unless (is(scalar @warnings, 0,
25e3a4e0 1895 "isSTRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname: generated no warnings"))
e23e8bc1 1896 {
d84e92aa 1897 output_warnings(@warnings);
e23e8bc1
KW
1898 }
1899
a82be82b
KW
1900 undef @warnings;
1901 $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
1902 if ($will_overflow) {
1903 is($ret, 0, "isC9_STRICT_UTF8_CHAR() $testname: returns 0");
1904 }
1905 else {
1906 my $expected_ret = ( $testname =~ /surrogate/
1907 || $allowed_uv > 0x10FFFF)
1908 ? 0
1909 : $length;
1910 is($ret, $expected_ret,
1911 "isC9_STRICT_UTF8_CHAR() $testname: returns expected length: $expected_ret");
25e3a4e0
KW
1912 $ret = test_isUTF8_CHAR_flags($bytes, $length,
1913 $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
1914 is($ret, $expected_ret,
1915 "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like isC9_STRICT_UTF8_CHAR");
a82be82b
KW
1916 }
1917 unless (is(scalar @warnings, 0,
25e3a4e0 1918 "isC9_STRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname: generated no warnings"))
a82be82b 1919 {
d84e92aa 1920 output_warnings(@warnings);
a82be82b
KW
1921 }
1922
3d56ecbe
KW
1923 # Test partial character handling, for each byte not a full character
1924 for my $j (1.. $length - 1) {
1925
1926 # Skip the test for the interaction between overflow and above-31
1927 # bit. It is really testing other things than the partial
1928 # character tests, for which other tests in this file are
1929 # sufficient
1930 last if $testname =~ /overflow/;
1931
1932 foreach my $disallow_flag (0, $disallow_flags) {
1933 my $partial = substr($bytes, 0, $j);
1934 my $ret_should_be;
1935 my $comment;
1936 if ($disallow_flag) {
1937 $ret_should_be = 0;
1938 $comment = "disallowed";
1939 }
1940 else {
1941 $ret_should_be = 1;
1942 $comment = "allowed";
1943 }
1944
1945 if ($disallow_flag) {
1946 if ($testname =~ /non-character/) {
1947 $ret_should_be = 1;
1948 $comment .= ", but but need full char to discern";
1949 }
1950 elsif ($testname =~ /surrogate/) {
1951 if ($j < 2) {
1952 $ret_should_be = 1;
1953 $comment .= ", but need 2 bytes to discern";
1954 }
1955 }
dd0f8ff2
KW
1956 elsif ( ($disallow_flags & $UTF8_DISALLOW_SUPER)
1957 && $j < 2
1958 && ord(native_to_I8(substr($bytes, 0, 1)))
1959 lt ((isASCII) ? 0xF5 : 0xFA))
1960 {
3d56ecbe
KW
1961 $ret_should_be = 1;
1962 $comment .= ", but need 2 bytes to discern";
1963 }
418080dc
KW
1964 elsif ( ! isASCII
1965 && $testname =~ /requires at least 32 bits/)
1966 {
1967 # On EBCDIC, the boundary between 31 and 32 bits is
1968 # more complicated.
1969 $ret_should_be = 1 if native_to_I8($partial) le
1970 "\xFF\xA0\xA0\xA0\xA0\xA0\xA0\xA1\xBF\xBF\xBF\xBF\xBF\xBF";
1971 }
3d56ecbe
KW
1972 }
1973
1974 undef @warnings;
1975
1976 $ret = test_is_utf8_valid_partial_char_flags($partial, $j, $disallow_flag);
1977 is($ret, $ret_should_be, "$testname: is_utf8_valid_partial_char_flags("
1978 . display_bytes($partial)
1979 . "), $comment: returns $ret_should_be");
1980 unless (is(scalar @warnings, 0,
1981 "$testname: is_utf8_valid_partial_char_flags() generated no warnings"))
1982 {
d84e92aa 1983 output_warnings(@warnings);
3d56ecbe
KW
1984 }
1985 }
1986 }
d7874298
KW
1987 }
1988
eb83ed87
KW
1989 # This is more complicated than the malformations tested earlier, as there
1990 # are several orthogonal variables involved. We test all the subclasses
1991 # of utf8 warnings to verify they work with and without the utf8 class,
1992 # and don't have effects on other sublass warnings
54f4afef 1993 foreach my $warning ('utf8', 'surrogate', 'nonchar', 'non_unicode') {
eb83ed87
KW
1994 foreach my $warn_flag (0, $warn_flags) {
1995 foreach my $disallow_flag (0, $disallow_flags) {
54f4afef 1996 foreach my $do_warning (0, 1) {
2b5e7bc2
KW
1997
1998 # We try each of the above with various combinations of
1999 # malformations that can occur on the same input sequence.
2000 foreach my $short ("",
2001 "short",
2002 "unexpected non-continuation")
2003 {
2004 # The non-characters can't be discerned with a short
2005 # malformation
2006 next if $short && $testname =~ /non-character/;
2007
2008 foreach my $overlong ("", "overlong") {
2009
44563783
KW
2010 # If we're already at the longest possible, we
2011 # can't create an overlong (which would be longer)
2b5e7bc2 2012 # can't handle anything larger.
44563783 2013 next if $overlong && $expected_len >= $max_bytes;
2b5e7bc2
KW
2014
2015 my @malformations;
f9380377 2016 my @expected_errors;
2b5e7bc2
KW
2017 push @malformations, $short if $short;
2018 push @malformations, $overlong if $overlong;
2019
2020 # The overflow malformation test in the input
2021 # array is coerced into being treated like one of
2022 # the others.
f9380377
KW
2023 if ($will_overflow) {
2024 push @malformations, 'overflow';
2025 push @expected_errors, $UTF8_GOT_OVERFLOW;
2026 }
2b5e7bc2
KW
2027
2028 my $malformations_name = join "/", @malformations;
2029 $malformations_name .= " malformation"
2030 if $malformations_name;
2031 $malformations_name .= "s" if @malformations > 1;
2032 my $this_bytes = $bytes;
2033 my $this_length = $length;
2034 my $expected_uv = $allowed_uv;
2035 my $this_expected_len = $expected_len;
2036 if ($malformations_name) {
2037 $expected_uv = 0;
2038
2039 # Coerce the input into the desired
2040 # malformation
2041 if ($malformations_name =~ /overlong/) {
2042
2043 # For an overlong, we convert the original
2044 # start byte into a continuation byte with
2045 # the same data bits as originally. ...
2046 substr($this_bytes, 0, 1)
2047 = start_byte_to_cont(substr($this_bytes,
2048 0, 1));
2049
2050 # ... Then we prepend it with a known
2051 # overlong sequence. This should evaluate
2052 # to the exact same code point as the
2053 # original.
1069c57c 2054 $this_bytes
44563783 2055 = I8_to_native("\xff")
f2c1c148 2056 . (I8_to_native(chr $first_continuation)
44563783 2057 x ( $max_bytes - 1 - length($this_bytes)))
f2c1c148 2058 . $this_bytes;
2b5e7bc2 2059 $this_length = length($this_bytes);
44563783 2060 $this_expected_len = $max_bytes;
f9380377 2061 push @expected_errors, $UTF8_GOT_LONG;
2b5e7bc2
KW
2062 }
2063 if ($malformations_name =~ /short/) {
2064
2065 # Just tell the test to not look far
2066 # enough into the input.
2067 $this_length--;
2068 $this_expected_len--;
f9380377 2069 push @expected_errors, $UTF8_GOT_SHORT;
2b5e7bc2
KW
2070 }
2071 elsif ($malformations_name
2072 =~ /non-continuation/)
2073 {
2074 # Change the final continuation byte into
2075 # a non one.
2076 substr($this_bytes, -1, 1) = '?';
2077 $this_expected_len--;
f9380377
KW
2078 push @expected_errors,
2079 $UTF8_GOT_NON_CONTINUATION;
2b5e7bc2
KW
2080 }
2081 }
2082
1980a0f4
KW
2083 my $eval_warn = $do_warning
2084 ? "use warnings '$warning'"
2085 : $warning eq "utf8"
2086 ? "no warnings 'utf8'"
2087 : ( "use warnings 'utf8';"
2088 . " no warnings '$warning'");
2089
2b5e7bc2
KW
2090 # Is effectively disallowed if we've set up a
2091 # malformation, even if the flag indicates it is
2092 # allowed. Fix up test name to indicate this as
2093 # well
2094 my $disallowed = $disallow_flag
2095 || $malformations_name;
f9380377 2096 my $this_name = "utf8n_to_uvchr_error() $testname: "
2b5e7bc2 2097 . (($disallow_flag)
f2c1c148
KW
2098 ? 'disallowed'
2099 : $disallowed
2100 ? $disallowed
2101 : 'allowed');
1980a0f4
KW
2102 $this_name .= ", $eval_warn";
2103 $this_name .= ", " . (($warn_flag)
2104 ? 'with warning flag'
2105 : 'no warning flag');
2106
2107 undef @warnings;
2108 my $ret_ref;
2b5e7bc2 2109 my $display_bytes = display_bytes($this_bytes);
1980a0f4 2110 my $call = "Call was: $eval_warn; \$ret_ref"
f9380377
KW
2111 . " = test_utf8n_to_uvchr_error("
2112 . "'$display_bytes', $this_length,"
2113 . "$warn_flag"
2b5e7bc2 2114 . "|$disallow_flag)";
1980a0f4 2115 my $eval_text = "$eval_warn; \$ret_ref"
f9380377
KW
2116 . " = test_utf8n_to_uvchr_error("
2117 . "'$this_bytes',"
2b5e7bc2
KW
2118 . " $this_length, $warn_flag"
2119 . "|$disallow_flag)";
1980a0f4
KW
2120 eval "$eval_text";
2121 if (! ok ("$@ eq ''",
2122 "$this_name: eval succeeded"))
7dfd8446 2123 {
1980a0f4
KW
2124 diag "\$!='$!'; eval'd=\"$call\"";
2125 next;
7dfd8446 2126 }
1980a0f4
KW
2127 if ($disallowed) {
2128 unless (is($ret_ref->[0], 0,
2129 "$this_name: Returns 0"))
2130 {
2131 diag $call;
2132 }
2133 }
2134 else {
2b5e7bc2 2135 unless (is($ret_ref->[0], $expected_uv,
1980a0f4 2136 "$this_name: Returns expected uv: "
2b5e7bc2 2137 . sprintf("0x%04X", $expected_uv)))
1980a0f4
KW
2138 {
2139 diag $call;
2140 }
2141 }
2b5e7bc2 2142 unless (is($ret_ref->[1], $this_expected_len,
1980a0f4 2143 "$this_name: Returns expected length:"
2b5e7bc2 2144 . " $this_expected_len"))
7dfd8446
KW
2145 {
2146 diag $call;
2147 }
1980a0f4 2148
f9380377
KW
2149 my $errors = $ret_ref->[2];
2150
2151 for (my $i = @expected_errors - 1; $i >= 0; $i--) {
2152 if (ok($expected_errors[$i] & $errors,
2153 "Expected and got error bit return"
2154 . " for $malformations[$i] malformation"))
2155 {
2156 $errors &= ~$expected_errors[$i];
2157 }
2158 splice @expected_errors, $i, 1;
2159 }
2160 unless (is(scalar @expected_errors, 0,
2161 "Got all the expected malformation errors"))
2162 {
2163 diag Dumper \@expected_errors;
2164 }
2165
2166 if ($warn_flag || $disallow_flag) {
2167 is($errors, $expected_error_flags,
2168 "Got the correct error flag");
2169 }
2170 else {
2171 is($errors, 0, "Got no other error flag");
2172 }
2173
2b5e7bc2 2174 if (@malformations) {
1980a0f4
KW
2175 if (! $do_warning && $warning eq 'utf8') {
2176 goto no_warnings_expected;
2177 }
2178
2b5e7bc2
KW
2179 # Check that each malformation generates a
2180 # warning, removing that warning if found
2181 MALFORMATION:
2182 foreach my $malformation (@malformations) {
2183 foreach (my $i = 0; $i < @warnings; $i++) {
2184 if ($warnings[$i] =~ /$malformation/) {
2185 pass("Expected and got"
2186 . "'$malformation' warning");
2187 splice @warnings, $i, 1;
2188 next MALFORMATION;
2189 }
1980a0f4 2190 }
2b5e7bc2 2191 fail("Expected '$malformation' warning"
f2c1c148 2192 . " but didn't get it");
2b5e7bc2 2193
1980a0f4
KW
2194 }
2195 }
2b5e7bc2
KW
2196
2197 # Any overflow will override any super or above-31
2198 # warnings.
2199 goto no_warnings_expected if $will_overflow;
2200
2201 if ( ! $do_warning
2202 && ( $warning eq 'utf8'
2203 || $warning eq $category))
1980a0f4
KW
2204 {
2205 goto no_warnings_expected;
2206 }
2207 elsif ($warn_flag) {
2208 if (is(scalar @warnings, 1,
2209 "$this_name: Got a single warning "))
2210 {
2211 unless (like($warnings[0], $message,
2212 "$this_name: Got expected warning"))
2213 {
2214 diag $call;
2215 }
2216 }
2217 else {
2218 diag $call;
2219 if (scalar @warnings) {
2220 output_warnings(@warnings);
2221 }
2222 }
2223 }
2224 else {
2225 no_warnings_expected:
2226 unless (is(scalar @warnings, 0,
2227 "$this_name: Got no warnings"))
2228 {
2229 diag $call;
2230 output_warnings(@warnings);
2231 }
13d7a909 2232 }
eb83ed87 2233
1980a0f4
KW
2234 # Check CHECK_ONLY results when the input is
2235 # disallowed. Do this when actually disallowed,
2236 # not just when the $disallow_flag is set
2237 if ($disallowed) {
2238 undef @warnings;
f9380377 2239 $ret_ref = test_utf8n_to_uvchr_error(
2b5e7bc2
KW
2240 $this_bytes, $this_length,
2241 $disallow_flag|$UTF8_CHECK_ONLY);
1980a0f4
KW
2242 unless (is($ret_ref->[0], 0,
2243 "$this_name, CHECK_ONLY: Returns 0"))
2244 {
2245 diag $call;
2246 }
2247 unless (is($ret_ref->[1], -1,
2248 "$this_name: CHECK_ONLY: returns -1 for"
2249 . " length"))
2250 {
2251 diag $call;
2252 }
2253 if (! is(scalar @warnings, 0,
2254 "$this_name, CHECK_ONLY: no warnings"
2255 . " generated"))
2256 {
2257 diag $call;
2258 output_warnings(@warnings);
2259 }
2260 }
046d01eb 2261
1980a0f4
KW
2262 # Now repeat some of the above, but for
2263 # uvchr_to_utf8_flags(). Since this comes from an
2b5e7bc2
KW
2264 # existing code point, it hasn't overflowed, and
2265 # isn't malformed.
2266 next if @malformations;
1980a0f4
KW
2267
2268 # The warning and disallow flags passed in are for
f9380377 2269 # utf8n_to_uvchr_error(). Convert them for
1980a0f4
KW
2270 # uvchr_to_utf8_flags().
2271 my $uvchr_warn_flag = 0;
2272 my $uvchr_disallow_flag = 0;
2273 if ($warn_flag) {
2274 if ($warn_flag == $UTF8_WARN_SURROGATE) {
2275 $uvchr_warn_flag = $UNICODE_WARN_SURROGATE
2276 }
2277 elsif ($warn_flag == $UTF8_WARN_NONCHAR) {
2278 $uvchr_warn_flag = $UNICODE_WARN_NONCHAR
2279 }
2280 elsif ($warn_flag == $UTF8_WARN_SUPER) {
2281 $uvchr_warn_flag = $UNICODE_WARN_SUPER
2282 }
2283 elsif ($warn_flag == $UTF8_WARN_ABOVE_31_BIT) {
2284 $uvchr_warn_flag
2285 = $UNICODE_WARN_ABOVE_31_BIT;
2286 }
2287 else {
2288 fail(sprintf "Unexpected warn flag: %x",
2289 $warn_flag);
2290 next;
2291 }
2292 }
2293 if ($disallow_flag) {
2294 if ($disallow_flag == $UTF8_DISALLOW_SURROGATE)
2295 {
2296 $uvchr_disallow_flag
2297 = $UNICODE_DISALLOW_SURROGATE;
2298 }
2299 elsif ($disallow_flag == $UTF8_DISALLOW_NONCHAR)
2300 {
2301 $uvchr_disallow_flag
2302 = $UNICODE_DISALLOW_NONCHAR;
2303 }
2304 elsif ($disallow_flag == $UTF8_DISALLOW_SUPER) {
2305 $uvchr_disallow_flag
2306 = $UNICODE_DISALLOW_SUPER;
2307 }
2308 elsif ($disallow_flag
2309 == $UTF8_DISALLOW_ABOVE_31_BIT)
2310 {
2311 $uvchr_disallow_flag =
2312 $UNICODE_DISALLOW_ABOVE_31_BIT;
2313 }
2314 else {
2315 fail(sprintf "Unexpected disallow flag: %x",
2316 $disallow_flag);
2317 next;
2318 }
2319 }
046d01eb 2320
1980a0f4 2321 $disallowed = $uvchr_disallow_flag;
046d01eb 2322
1980a0f4
KW
2323 $this_name = "uvchr_to_utf8_flags() $testname: "
2324 . (($uvchr_disallow_flag)
2325 ? 'disallowed'
2326 : ($disallowed)
2327 ? 'ABOVE_31_BIT allowed'
2328 : 'allowed');
2329 $this_name .= ", $eval_warn";
2330 $this_name .= ", " . (($uvchr_warn_flag)
2331 ? 'with warning flag'
2332 : 'no warning flag');
2333
2334 undef @warnings;
2335 my $ret;
2336 my $warn_flag = sprintf "0x%x", $uvchr_warn_flag;
2337 my $disallow_flag = sprintf "0x%x",
2338 $uvchr_disallow_flag;
2339 $call = sprintf("call was: $eval_warn; \$ret"
2340 . " = test_uvchr_to_utf8_flags("
2341 . " 0x%x, $warn_flag|$disallow_flag)",
2342 $allowed_uv);
2343 $eval_text = "$eval_warn; \$ret ="
2344 . " test_uvchr_to_utf8_flags("
2345 . "$allowed_uv, $warn_flag|"
2346 . "$disallow_flag)";
2347 eval "$eval_text";
2348 if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
046d01eb 2349 {
1980a0f4
KW
2350 diag "\$!='$!'; eval'd=\"$eval_text\"";
2351 next;
2352 }
2353 if ($disallowed) {
2354 unless (is($ret, undef,
2355 "$this_name: Returns undef"))
2356 {
2357 diag $call;
2358 }
2359 }
2360 else {
2361 unless (is($ret, $bytes,
2362 "$this_name: Returns expected string"))
2363 {
2364 diag $call;
2365 }
2366 }
2367 if (! $do_warning
2368 && ($warning eq 'utf8' || $warning eq $category))
2369 {
2370 if (!is(scalar @warnings, 0,
2371 "$this_name: No warnings generated"))
2372 {
2373 diag $call;
2374 output_warnings(@warnings);
2375 }
2376 }
2377 elsif ( $uvchr_warn_flag
2378 && ( $warning eq 'utf8'
2379 || $warning eq $category))
2380 {
2381 if (is(scalar @warnings, 1,
2382 "$this_name: Got a single warning "))
2383 {
2384 unless (like($warnings[0], $message,
2385 "$this_name: Got expected warning"))
2386 {
2387 diag $call;
2388 }
2389 }
2390 else {
2391 diag $call;
2392 output_warnings(@warnings)
2393 if scalar @warnings;
2394 }
046d01eb 2395 }
2b5e7bc2
KW
2396 }
2397 }
eb83ed87
KW
2398 }
2399 }
2400 }
2401 }
2402}
6e3d6c02 2403
65df57a8
TC
2404SKIP:
2405{
2406 isASCII
2407 or skip "These tests probably break on non-ASCII", 1;
2408 my $simple = join "", "A" .. "J";
2409 my $utf_ch = "\x{7fffffff}";
2410 utf8::encode($utf_ch);
2411 my $utf_ch_len = length $utf_ch;
2412 note "utf_ch_len $utf_ch_len";
2413 my $utf = $utf_ch x 10;
2414 my $bad_start = substr($utf, 1);
2415 # $bad_end ends with a start byte and a single continuation
2416 my $bad_end = substr($utf, 0, length($utf)-$utf_ch_len+2);
2417
2418 # WARNING: all offsets are *byte* offsets
2419 my @hop_tests =
2420 (
2421 # string s off expected name
2422 [ $simple, 0, 5, 5, "simple in range, forward" ],
2423 [ $simple, 10, -5, 5, "simple in range, backward" ],
2424 [ $simple, 5, 10, 10, "simple out of range, forward" ],
2425 [ $simple, 5, -10, 0, "simple out of range, backward" ],
2426 [ $utf, $utf_ch_len * 5, 5, length($utf), "utf in range, forward" ],
2427 [ $utf, $utf_ch_len * 5, -5, 0, "utf in range, backward" ],
2428 [ $utf, $utf_ch_len * 5, 4, $utf_ch_len * 9, "utf in range b, forward" ],
2429 [ $utf, $utf_ch_len * 5, -4, $utf_ch_len, "utf in range b, backward" ],
2430 [ $utf, $utf_ch_len * 5, 6, length($utf), "utf out of range, forward" ],
2431 [ $utf, $utf_ch_len * 5, -6, 0, "utf out of range, backward" ],
2432 [ $bad_start, 0, 1, 1, "bad start, forward 1 from 0" ],
2433 [ $bad_start, 0, $utf_ch_len-1, $utf_ch_len-1, "bad start, forward ch_len-1 from 0" ],
2434 [ $bad_start, 0, $utf_ch_len, $utf_ch_len*2-1, "bad start, forward ch_len from 0" ],
2435 [ $bad_start, $utf_ch_len-1, -1, 0, "bad start, back 1 from first start byte" ],
2436 [ $bad_start, $utf_ch_len-2, -1, 0, "bad start, back 1 from before first start byte" ],
2437 [ $bad_start, 0, -1, 0, "bad start, back 1 from 0" ],
2438 [ $bad_start, length $bad_start, -10, 0, "bad start, back 10 from end" ],
2439 [ $bad_end, 0, 10, length $bad_end, "bad end, forward 10 from 0" ],
2440 [ $bad_end, length($bad_end)-1, 10, length $bad_end, "bad end, forward 1 from end-1" ],
2441 );
2442
2443 for my $test (@hop_tests) {
2444 my ($str, $s_off, $off, $want, $name) = @$test;
2445 my $result = test_utf8_hop_safe($str, $s_off, $off);
2446 is($result, $want, "utf8_hop_safe: $name");
2447 }
2448}
2449
fed3ba5d 2450done_testing;