This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Fix uvoffuni_to_utf8_flags() disallowed input
[perl5.git] / ext / XS-APItest / t / utf8.t
CommitLineData
fed3ba5d
NC
1#!perl -w
2
3use strict;
4use Test::More;
7dfd8446 5$|=1;
fed3ba5d
NC
6
7use XS::APItest;
8
4deba822
KW
9my $pound_sign = chr utf8::unicode_to_native(163);
10
7dfd8446
KW
11sub isASCII { ord "A" == 65 }
12
13sub display_bytes {
14 my $string = shift;
15 return '"'
16 . join("", map { sprintf("\\x%02x", ord $_) } split "", $string)
17 . '"';
18}
19
20# This test file can't use byte_utf8a_to_utf8n() from t/charset_tools.pl
21# because that uses the same functions we are testing here. So UTF-EBCDIC
22# strings are hard-coded as I8 strings in this file instead, and we use array
23# lookup to translate into the appropriate code page.
24
25my @i8_to_native = ( # Only code page 1047 so far.
26# _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F
270x00,0x01,0x02,0x03,0x37,0x2D,0x2E,0x2F,0x16,0x05,0x15,0x0B,0x0C,0x0D,0x0E,0x0F,
280x10,0x11,0x12,0x13,0x3C,0x3D,0x32,0x26,0x18,0x19,0x3F,0x27,0x1C,0x1D,0x1E,0x1F,
290x40,0x5A,0x7F,0x7B,0x5B,0x6C,0x50,0x7D,0x4D,0x5D,0x5C,0x4E,0x6B,0x60,0x4B,0x61,
300xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0x7A,0x5E,0x4C,0x7E,0x6E,0x6F,
310x7C,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,
320xD7,0xD8,0xD9,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xAD,0xE0,0xBD,0x5F,0x6D,
330x79,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x91,0x92,0x93,0x94,0x95,0x96,
340x97,0x98,0x99,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,0xA8,0xA9,0xC0,0x4F,0xD0,0xA1,0x07,
350x20,0x21,0x22,0x23,0x24,0x25,0x06,0x17,0x28,0x29,0x2A,0x2B,0x2C,0x09,0x0A,0x1B,
360x30,0x31,0x1A,0x33,0x34,0x35,0x36,0x08,0x38,0x39,0x3A,0x3B,0x04,0x14,0x3E,0xFF,
370x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x51,0x52,0x53,0x54,0x55,0x56,
380x57,0x58,0x59,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x70,0x71,0x72,0x73,
390x74,0x75,0x76,0x77,0x78,0x80,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,0x90,0x9A,0x9B,0x9C,
400x9D,0x9E,0x9F,0xA0,0xAA,0xAB,0xAC,0xAE,0xAF,0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,
410xB7,0xB8,0xB9,0xBA,0xBB,0xBC,0xBE,0xBF,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,0xDA,0xDB,
420xDC,0xDD,0xDE,0xDF,0xE1,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,0xFA,0xFB,0xFC,0xFD,0xFE,
43);
44
45*I8_to_native = (isASCII)
46 ? sub { return shift }
47 : sub { return join "", map { chr $i8_to_native[ord $_] }
48 split "", shift };
49
50my $is64bit = length sprintf("%x", ~0) > 8;
51
52
53# Test utf8n_to_uvchr(). These provide essentially complete code coverage.
54# Copied from utf8.h
55my $UTF8_ALLOW_EMPTY = 0x0001;
56my $UTF8_ALLOW_CONTINUATION = 0x0002;
57my $UTF8_ALLOW_NON_CONTINUATION = 0x0004;
58my $UTF8_ALLOW_SHORT = 0x0008;
59my $UTF8_ALLOW_LONG = 0x0010;
60my $UTF8_DISALLOW_SURROGATE = 0x0020;
61my $UTF8_WARN_SURROGATE = 0x0040;
62my $UTF8_DISALLOW_NONCHAR = 0x0080;
63my $UTF8_WARN_NONCHAR = 0x0100;
64my $UTF8_DISALLOW_SUPER = 0x0200;
65my $UTF8_WARN_SUPER = 0x0400;
66my $UTF8_DISALLOW_FE_FF = 0x0800;
67my $UTF8_WARN_FE_FF = 0x1000;
68my $UTF8_CHECK_ONLY = 0x2000;
69
70my $look_for_everything = $UTF8_DISALLOW_SURROGATE
71 | $UTF8_WARN_SURROGATE
72 | $UTF8_DISALLOW_NONCHAR
73 | $UTF8_WARN_NONCHAR
74 | $UTF8_DISALLOW_SUPER
75 | $UTF8_WARN_SUPER
76 | $UTF8_DISALLOW_FE_FF
77 | $UTF8_WARN_FE_FF;
78
fed3ba5d
NC
79foreach ([0, '', '', 'empty'],
80 [0, 'N', 'N', '1 char'],
81 [1, 'NN', 'N', '1 char substring'],
82 [-2, 'Perl', 'Rules', 'different'],
4deba822
KW
83 [0, $pound_sign, $pound_sign, 'pound sign'],
84 [1, $pound_sign . 10, $pound_sign . 1, '10 pounds is more than 1 pound'],
85 [1, $pound_sign . $pound_sign, $pound_sign, '2 pound signs are more than 1'],
fed3ba5d
NC
86 [-2, ' $!', " \x{1F42B}!", 'Camels are worth more than 1 dollar'],
87 [-1, '!', "!\x{1F42A}", 'Initial substrings match'],
88 ) {
89 my ($expect, $left, $right, $desc) = @$_;
90 my $copy = $right;
91 utf8::encode($copy);
92 is(bytes_cmp_utf8($left, $copy), $expect, $desc);
93 next if $right =~ tr/\0-\377//c;
94 utf8::encode($left);
95 is(bytes_cmp_utf8($right, $left), -$expect, "$desc reversed");
96}
97
7dfd8446
KW
98# The keys to this hash are Unicode code points, their values are the native
99# UTF-8 representations of them. The code points are chosen because they are
100# "interesting" on either or both ASCII and EBCDIC platforms. First we add
101# boundaries where the number of bytes required to represent them increase, or
102# are adjacent to problematic code points, so we want to make sure they aren't
103# considered problematic.
104my %code_points = (
105 0x0100 => (isASCII) ? "\xc4\x80" : I8_to_native("\xc8\xa0"),
106 0x0400 - 1 => (isASCII) ? "\xcf\xbf" : I8_to_native("\xdf\xbf"),
107 0x0400 => (isASCII) ? "\xd0\x80" : I8_to_native("\xe1\xa0\xa0"),
108 0x0800 - 1 => (isASCII) ? "\xdf\xbf" : I8_to_native("\xe1\xbf\xbf"),
109 0x0800 => (isASCII) ? "\xe0\xa0\x80" : I8_to_native("\xe2\xa0\xa0"),
110 0x4000 - 1 => (isASCII) ? "\xe3\xbf\xbf" : I8_to_native("\xef\xbf\xbf"),
111 0x4000 => (isASCII) ? "\xe4\x80\x80" : I8_to_native("\xf0\xb0\xa0\xa0"),
112 0x8000 - 1 => (isASCII) ? "\xe7\xbf\xbf" : I8_to_native("\xf0\xbf\xbf\xbf"),
113
114 # First code point that the implementation of isUTF8_POSSIBLY_PROBLEMATIC,
115 # as of this writing, considers potentially problematic on EBCDIC
116 0x8000 => (isASCII) ? "\xe8\x80\x80" : I8_to_native("\xf1\xa0\xa0\xa0"),
117
118 0xD000 - 1 => (isASCII) ? "\xec\xbf\xbf" : I8_to_native("\xf1\xb3\xbf\xbf"),
119
120 # First code point that the implementation of isUTF8_POSSIBLY_PROBLEMATIC,
121 # as of this writing, considers potentially problematic on ASCII
122 0xD000 => (isASCII) ? "\xed\x80\x80" : I8_to_native("\xf1\xb4\xa0\xa0"),
123
124 # Bracket the surrogates
125 0xD7FF => (isASCII) ? "\xed\x9f\xbf" : I8_to_native("\xf1\xb5\xbf\xbf"),
126 0xE000 => (isASCII) ? "\xee\x80\x80" : I8_to_native("\xf1\xb8\xa0\xa0"),
127
128 # Bracket the 32 contiguous non characters
129 0xFDCF => (isASCII) ? "\xef\xb7\x8f" : I8_to_native("\xf1\xbf\xae\xaf"),
130 0xFDF0 => (isASCII) ? "\xef\xb7\xb0" : I8_to_native("\xf1\xbf\xaf\xb0"),
131
132 # Mostly bracket non-characters, but some are transitions to longer
133 # strings
134 0xFFFD => (isASCII) ? "\xef\xbf\xbd" : I8_to_native("\xf1\xbf\xbf\xbd"),
135 0x10000 - 1 => (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
136 0x10000 => (isASCII) ? "\xf0\x90\x80\x80" : I8_to_native("\xf2\xa0\xa0\xa0"),
137 0x1FFFD => (isASCII) ? "\xf0\x9f\xbf\xbd" : I8_to_native("\xf3\xbf\xbf\xbd"),
138 0x20000 => (isASCII) ? "\xf0\xa0\x80\x80" : I8_to_native("\xf4\xa0\xa0\xa0"),
139 0x2FFFD => (isASCII) ? "\xf0\xaf\xbf\xbd" : I8_to_native("\xf5\xbf\xbf\xbd"),
140 0x30000 => (isASCII) ? "\xf0\xb0\x80\x80" : I8_to_native("\xf6\xa0\xa0\xa0"),
141 0x3FFFD => (isASCII) ? "\xf0\xbf\xbf\xbd" : I8_to_native("\xf7\xbf\xbf\xbd"),
142 0x40000 - 1 => (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
143 0x40000 => (isASCII) ? "\xf1\x80\x80\x80" : I8_to_native("\xf8\xa8\xa0\xa0\xa0"),
144 0x4FFFD => (isASCII) ? "\xf1\x8f\xbf\xbd" : I8_to_native("\xf8\xa9\xbf\xbf\xbd"),
145 0x50000 => (isASCII) ? "\xf1\x90\x80\x80" : I8_to_native("\xf8\xaa\xa0\xa0\xa0"),
146 0x5FFFD => (isASCII) ? "\xf1\x9f\xbf\xbd" : I8_to_native("\xf8\xab\xbf\xbf\xbd"),
147 0x60000 => (isASCII) ? "\xf1\xa0\x80\x80" : I8_to_native("\xf8\xac\xa0\xa0\xa0"),
148 0x6FFFD => (isASCII) ? "\xf1\xaf\xbf\xbd" : I8_to_native("\xf8\xad\xbf\xbf\xbd"),
149 0x70000 => (isASCII) ? "\xf1\xb0\x80\x80" : I8_to_native("\xf8\xae\xa0\xa0\xa0"),
150 0x7FFFD => (isASCII) ? "\xf1\xbf\xbf\xbd" : I8_to_native("\xf8\xaf\xbf\xbf\xbd"),
151 0x80000 => (isASCII) ? "\xf2\x80\x80\x80" : I8_to_native("\xf8\xb0\xa0\xa0\xa0"),
152 0x8FFFD => (isASCII) ? "\xf2\x8f\xbf\xbd" : I8_to_native("\xf8\xb1\xbf\xbf\xbd"),
153 0x90000 => (isASCII) ? "\xf2\x90\x80\x80" : I8_to_native("\xf8\xb2\xa0\xa0\xa0"),
154 0x9FFFD => (isASCII) ? "\xf2\x9f\xbf\xbd" : I8_to_native("\xf8\xb3\xbf\xbf\xbd"),
155 0xA0000 => (isASCII) ? "\xf2\xa0\x80\x80" : I8_to_native("\xf8\xb4\xa0\xa0\xa0"),
156 0xAFFFD => (isASCII) ? "\xf2\xaf\xbf\xbd" : I8_to_native("\xf8\xb5\xbf\xbf\xbd"),
157 0xB0000 => (isASCII) ? "\xf2\xb0\x80\x80" : I8_to_native("\xf8\xb6\xa0\xa0\xa0"),
158 0xBFFFD => (isASCII) ? "\xf2\xbf\xbf\xbd" : I8_to_native("\xf8\xb7\xbf\xbf\xbd"),
159 0xC0000 => (isASCII) ? "\xf3\x80\x80\x80" : I8_to_native("\xf8\xb8\xa0\xa0\xa0"),
160 0xCFFFD => (isASCII) ? "\xf3\x8f\xbf\xbd" : I8_to_native("\xf8\xb9\xbf\xbf\xbd"),
161 0xD0000 => (isASCII) ? "\xf3\x90\x80\x80" : I8_to_native("\xf8\xba\xa0\xa0\xa0"),
162 0xDFFFD => (isASCII) ? "\xf3\x9f\xbf\xbd" : I8_to_native("\xf8\xbb\xbf\xbf\xbd"),
163 0xE0000 => (isASCII) ? "\xf3\xa0\x80\x80" : I8_to_native("\xf8\xbc\xa0\xa0\xa0"),
164 0xEFFFD => (isASCII) ? "\xf3\xaf\xbf\xbd" : I8_to_native("\xf8\xbd\xbf\xbf\xbd"),
165 0xF0000 => (isASCII) ? "\xf3\xb0\x80\x80" : I8_to_native("\xf8\xbe\xa0\xa0\xa0"),
166 0xFFFFD => (isASCII) ? "\xf3\xbf\xbf\xbd" : I8_to_native("\xf8\xbf\xbf\xbf\xbd"),
167 0x100000 => (isASCII) ? "\xf4\x80\x80\x80" : I8_to_native("\xf9\xa0\xa0\xa0\xa0"),
168 0x10FFFD => (isASCII) ? "\xf4\x8f\xbf\xbd" : I8_to_native("\xf9\xa1\xbf\xbf\xbd"),
169 0x110000 => (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
170
171 # Things that would be noncharacters if they were in Unicode, and might be
172 # mistaken, if the C code is bad, to be nonchars
173 0x11FFFE => (isASCII) ? "\xf4\x9f\xbf\xbe" : I8_to_native("\xf9\xa3\xbf\xbf\xbe"),
174 0x11FFFF => (isASCII) ? "\xf4\x9f\xbf\xbf" : I8_to_native("\xf9\xa3\xbf\xbf\xbf"),
175 0x20FFFE => (isASCII) ? "\xf8\x88\x8f\xbf\xbe" : I8_to_native("\xfa\xa1\xbf\xbf\xbe"),
176 0x20FFFF => (isASCII) ? "\xf8\x88\x8f\xbf\xbf" : I8_to_native("\xfa\xa1\xbf\xbf\xbf"),
177
178 0x200000 - 1 => (isASCII) ? "\xf7\xbf\xbf\xbf" : I8_to_native("\xf9\xbf\xbf\xbf\xbf"),
179 0x200000 => (isASCII) ? "\xf8\x88\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
180 0x400000 - 1 => (isASCII) ? "\xf8\x8f\xbf\xbf\xbf" : I8_to_native("\xfb\xbf\xbf\xbf\xbf"),
181 0x400000 => (isASCII) ? "\xf8\x90\x80\x80\x80" : I8_to_native("\xfc\xa4\xa0\xa0\xa0\xa0"),
182 0x4000000 - 1 => (isASCII) ? "\xfb\xbf\xbf\xbf\xbf" : I8_to_native("\xfd\xbf\xbf\xbf\xbf\xbf"),
183 0x4000000 => (isASCII) ? "\xfc\x84\x80\x80\x80\x80" : I8_to_native("\xfe\xa2\xa0\xa0\xa0\xa0\xa0"),
184 0x4000000 - 1 => (isASCII) ? "\xfb\xbf\xbf\xbf\xbf" : I8_to_native("\xfd\xbf\xbf\xbf\xbf\xbf"),
185 0x4000000 => (isASCII) ? "\xfc\x84\x80\x80\x80\x80" : I8_to_native("\xfe\xa2\xa0\xa0\xa0\xa0\xa0"),
186 0x40000000 - 1 => (isASCII) ? "\xfc\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xfe\xbf\xbf\xbf\xbf\xbf\xbf"),
187 0x40000000 => (isASCII) ? "\xfd\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0"),
188 0x80000000 - 1 => (isASCII) ? "\xfd\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf"),
189);
4deba822 190
7dfd8446
KW
191if (isASCII) {
192 $code_points{0x80000000} = "\xfe\x82\x80\x80\x80\x80\x80";
193 $code_points{0xFFFFFFFF} = "\xfe\x83\xbf\xbf\xbf\xbf\xbf";
194}
eb83ed87 195
7dfd8446
KW
196if ($is64bit) {
197 no warnings qw(overflow portable);
198 $code_points{0x100000000} = (isASCII) ? "\xfe\x84\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0");
199 $code_points{0x1000000000 - 1} = (isASCII) ? "\xfe\xbf\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf\xbf");
200 $code_points{0x1000000000} = (isASCII) ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0");
201 $code_points{0xFFFFFFFFFFFFFFFF} = (isASCII) ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf");
202}
eb83ed87 203
7dfd8446
KW
204# Now add in entries for each of code points 0-255, which require special
205# handling on EBCDIC. Remember the keys are Unicode values, and the values
206# are the native UTF-8. For invariants, the bytes are just the native chr.
207
208my $cp = 0;
209while ($cp < ((isASCII) ? 128 : 160)) { # This is from the definition of
210 # invariant
211 $code_points{$cp} = chr utf8::unicode_to_native($cp);
212 $cp++;
213}
214
215# Done with the invariants. Now do the variants. All in this range are 2
216# byte. Again, we can't use the internal functions to generate UTF-8, as
217# those are what we are trying to test. In the loop, we know what range the
218# continuation bytes can be in, and what the lowest start byte can be. So we
219# cycle through them.
220
221my $first_continuation = (isASCII) ? 0x80 : 0xA0;
222my $final_continuation = 0xBF;
223my $start = (isASCII) ? 0xC2 : 0xC5;
224
225my $continuation = $first_continuation - 1;
226
227while ($cp < 255) {
228 if (++$continuation > $final_continuation) {
229
230 # Wrap to the next start byte when we reach the final continuation
231 # byte possible
232 $continuation = $first_continuation;
233 $start++;
234 }
235 $code_points{$cp} = I8_to_native(chr($start) . chr($continuation));
236
237 $cp++;
238}
eb83ed87
KW
239
240my @warnings;
241
242use warnings 'utf8';
243local $SIG{__WARN__} = sub { push @warnings, @_ };
244
7dfd8446
KW
245# This set of tests looks for basic sanity, and lastly tests the bottom level
246# decode routine for the given code point. If the earlier tests for that code
247# point fail, that one probably will too. Malformations are tested in later
248# segments of code.
249for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
250 keys %code_points)
251{
252 my $hex_u = sprintf("0x%02X", $u);
253 my $n = utf8::unicode_to_native($u);
254 my $hex_n = sprintf("0x%02X", $n);
255 my $bytes = $code_points{$u};
256
257 my $offskip_should_be;
258 {
259 no warnings qw(overflow portable);
260 $offskip_should_be = (isASCII)
261 ? ( $u < 0x80 ? 1 :
262 $u < 0x800 ? 2 :
263 $u < 0x10000 ? 3 :
264 $u < 0x200000 ? 4 :
265 $u < 0x4000000 ? 5 :
266 $u < 0x80000000 ? 6 : (($is64bit)
267 ? ($u < 0x1000000000 ? 7 : 13)
268 : 7)
269 )
270 : ($u < 0xA0 ? 1 :
271 $u < 0x400 ? 2 :
272 $u < 0x4000 ? 3 :
273 $u < 0x40000 ? 4 :
274 $u < 0x400000 ? 5 :
275 $u < 0x4000000 ? 6 : 7
276 );
277 }
278
279 # If this test fails, subsequent ones are meaningless.
280 next unless is(test_OFFUNISKIP($u), $offskip_should_be,
281 "Verify OFFUNISKIP($hex_u) is $offskip_should_be");
282 my $invariant = $offskip_should_be == 1;
283 my $display_invariant = $invariant || 0;
284 is(test_OFFUNI_IS_INVARIANT($u), $invariant,
285 "Verify OFFUNI_IS_INVARIANT($hex_u) is $display_invariant");
286
287 my $uvchr_skip_should_be = $offskip_should_be;
288 next unless is(test_UVCHR_SKIP($n), $uvchr_skip_should_be,
289 "Verify UVCHR_SKIP($hex_n) is $uvchr_skip_should_be");
290 is(test_UVCHR_IS_INVARIANT($n), $offskip_should_be == 1,
291 "Verify UVCHR_IS_INVARIANT($hex_n) is $display_invariant");
292
293 my $n_chr = chr $n;
294 utf8::upgrade $n_chr;
295
296 is(test_UTF8_SKIP($n_chr), $uvchr_skip_should_be,
297 "Verify UTF8_SKIP(chr $hex_n) is $uvchr_skip_should_be");
298
299 use bytes;
300 for (my $j = 0; $j < length $n_chr; $j++) {
301 my $b = substr($n_chr, $j, 1);
302 my $hex_b = sprintf("\"\\x%02x\"", ord $b);
303
304 my $byte_invariant = $j == 0 && $uvchr_skip_should_be == 1;
305 my $display_byte_invariant = $byte_invariant || 0;
306 next unless is(test_UTF8_IS_INVARIANT($b), $byte_invariant,
307 " Verify UTF8_IS_INVARIANT($hex_b) for byte $j "
308 . "is $display_byte_invariant");
309
310 my $is_start = $j == 0 && $uvchr_skip_should_be > 1;
311 my $display_is_start = $is_start || 0;
312 next unless is(test_UTF8_IS_START($b), $is_start,
313 " Verify UTF8_IS_START($hex_b) is $display_is_start");
314
315 my $is_continuation = $j != 0 && $uvchr_skip_should_be > 1;
316 my $display_is_continuation = $is_continuation || 0;
317 next unless is(test_UTF8_IS_CONTINUATION($b), $is_continuation,
318 " Verify UTF8_IS_CONTINUATION($hex_b) is "
319 . "$display_is_continuation");
320
321 my $is_continued = $uvchr_skip_should_be > 1;
322 my $display_is_continued = $is_continued || 0;
323 next unless is(test_UTF8_IS_CONTINUED($b), $is_continued,
324 " Verify UTF8_IS_CONTINUED($hex_b) is "
325 . "$display_is_continued");
326
327 my $is_downgradeable_start = $n < 256
328 && $uvchr_skip_should_be > 1
329 && $j == 0;
330 my $display_is_downgradeable_start = $is_downgradeable_start || 0;
331 next unless is(test_UTF8_IS_DOWNGRADEABLE_START($b),
332 $is_downgradeable_start,
333 " Verify UTF8_IS_DOWNGRADEABLE_START($hex_b) is "
334 . "$display_is_downgradeable_start");
335
336 my $is_above_latin1 = $n > 255 && $j == 0;
337 my $display_is_above_latin1 = $is_above_latin1 || 0;
338 next unless is(test_UTF8_IS_ABOVE_LATIN1($b),
339 $is_above_latin1,
340 " Verify UTF8_IS_ABOVE_LATIN1($hex_b) is "
341 . "$display_is_above_latin1");
342
343 my $is_possibly_problematic = $j == 0
344 && $n >= ((isASCII)
345 ? 0xD000
346 : 0x8000);
347 my $display_is_possibly_problematic = $is_possibly_problematic || 0;
348 next unless is(test_isUTF8_POSSIBLY_PROBLEMATIC($b),
349 $is_possibly_problematic,
350 " Verify isUTF8_POSSIBLY_PROBLEMATIC($hex_b) is "
351 . "$display_is_above_latin1");
352 }
353
354 # We are not trying to look for warnings, etc, so if they should occur, it
355 # is an error. But some of the code points here do cause warnings, so we
356 # check here and turn off the ones that apply to such code points. A
357 # later section of the code tests for these kinds of things.
358 my $this_utf8_flags = $look_for_everything;
359 my $len = length $bytes;
360 if ($n > 2 ** 31 - 1) {
361 $this_utf8_flags &= ~($UTF8_DISALLOW_FE_FF|$UTF8_WARN_FE_FF);
362 }
363 if ($n > 0x10FFFF) {
364 $this_utf8_flags &= ~($UTF8_DISALLOW_SUPER|$UTF8_WARN_SUPER);
365 }
366 elsif (($n & 0xFFFE) == 0xFFFE) {
367 $this_utf8_flags &= ~($UTF8_DISALLOW_NONCHAR|$UTF8_WARN_NONCHAR);
368 }
369
370 undef @warnings;
371
372 my $display_flags = sprintf "0x%x", $this_utf8_flags;
373 my $ret_ref = test_utf8n_to_uvchr($bytes, $len, $this_utf8_flags);
374 my $display_bytes = display_bytes($bytes);
375 is($ret_ref->[0], $n, "Verify utf8n_to_uvchr($display_bytes, $display_flags) returns $hex_n");
376 is($ret_ref->[1], $len, "Verify utf8n_to_uvchr() for $hex_n returns expected length");
377
378 unless (is(scalar @warnings, 0,
379 "Verify utf8n_to_uvchr() for $hex_n generated no warnings"))
380 {
381 diag "The warnings were: " . join(", ", @warnings);
382 }
383}
384
385my $REPLACEMENT = 0xFFFD;
386
387# Now test the malformations. All these raise category utf8 warnings.
388my $c = (isASCII) ? "\x80" : "\xa0"; # A continuation byte
389my @malformations = (
eb83ed87
KW
390 [ "zero length string malformation", "", 0,
391 $UTF8_ALLOW_EMPTY, 0, 0,
392 qr/empty string/
393 ],
7dfd8446
KW
394 [ "orphan continuation byte malformation", I8_to_native("${c}a"),
395 2,
eb83ed87
KW
396 $UTF8_ALLOW_CONTINUATION, $REPLACEMENT, 1,
397 qr/unexpected continuation byte/
398 ],
7dfd8446
KW
399 [ "premature next character malformation (immediate)",
400 (isASCII) ? "\xc2a" : I8_to_native("\xc5") ."a",
401 2,
eb83ed87
KW
402 $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 1,
403 qr/unexpected non-continuation byte.*immediately after start byte/
404 ],
7dfd8446
KW
405 [ "premature next character malformation (non-immediate)",
406 I8_to_native("\xf0${c}a"),
407 3,
eb83ed87
KW
408 $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 2,
409 qr/unexpected non-continuation byte .* 2 bytes after start byte/
410 ],
7dfd8446 411 [ "too short malformation", I8_to_native("\xf0${c}a"), 2,
eb83ed87
KW
412 # Having the 'a' after this, but saying there are only 2 bytes also
413 # tests that we pay attention to the passed in length
414 $UTF8_ALLOW_SHORT, $REPLACEMENT, 2,
415 qr/2 bytes, need 4/
416 ],
7dfd8446
KW
417 [ "overlong malformation", I8_to_native("\xc0$c"), 2,
418 $UTF8_ALLOW_LONG,
419 0, # NUL
420 2,
eb83ed87 421 qr/2 bytes, need 1/
7dfd8446
KW
422 ]
423);
424
425if (isASCII) {
426 push @malformations,
427 [ "overflow malformation",
428 # These are the smallest overflowing on 64 byte machines:
429 # 2**64
430 "\xff\x80\x90\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0",
431 13,
eb83ed87 432 0, # There is no way to allow this malformation
7dfd8446
KW
433 $REPLACEMENT,
434 13,
eb83ed87 435 qr/overflow/
7dfd8446
KW
436 ];
437}
438
439foreach my $test (@malformations) {
eb83ed87
KW
440 my ($testname, $bytes, $length, $allow_flags, $allowed_uv, $expected_len, $message ) = @$test;
441
442 next if ! ok(length($bytes) >= $length, "$testname: Make sure won't read beyond buffer: " . length($bytes) . " >= $length");
443
444 # Test what happens when this malformation is not allowed
445 undef @warnings;
bd70aaaf 446 my $ret_ref = test_utf8n_to_uvchr($bytes, $length, 0);
eb83ed87
KW
447 is($ret_ref->[0], 0, "$testname: disallowed: Returns 0");
448 is($ret_ref->[1], $expected_len, "$testname: disallowed: Returns expected length");
449 if (is(scalar @warnings, 1, "$testname: disallowed: Got a single warning ")) {
450 like($warnings[0], $message, "$testname: disallowed: Got expected warning");
451 }
452 else {
453 if (scalar @warnings) {
7dfd8446 454 diag "The warnings were: " . join(", ", @warnings);
eb83ed87
KW
455 }
456 }
457
458 { # Next test when disallowed, and warnings are off.
459 undef @warnings;
460 no warnings 'utf8';
bd70aaaf 461 my $ret_ref = test_utf8n_to_uvchr($bytes, $length, 0);
eb83ed87
KW
462 is($ret_ref->[0], 0, "$testname: disallowed: no warnings 'utf8': Returns 0");
463 is($ret_ref->[1], $expected_len, "$testname: disallowed: no warnings 'utf8': Returns expected length");
464 if (!is(scalar @warnings, 0, "$testname: disallowed: no warnings 'utf8': no warnings generated")) {
7dfd8446 465 diag "The warnings were: " . join(", ", @warnings);
eb83ed87
KW
466 }
467 }
468
469 # Test with CHECK_ONLY
470 undef @warnings;
bd70aaaf 471 $ret_ref = test_utf8n_to_uvchr($bytes, $length, $UTF8_CHECK_ONLY);
eb83ed87
KW
472 is($ret_ref->[0], 0, "$testname: CHECK_ONLY: Returns 0");
473 is($ret_ref->[1], -1, "$testname: CHECK_ONLY: returns expected length");
474 if (! is(scalar @warnings, 0, "$testname: CHECK_ONLY: no warnings generated")) {
7dfd8446 475 diag "The warnings were: " . join(", ", @warnings);
eb83ed87
KW
476 }
477
478 next if $allow_flags == 0; # Skip if can't allow this malformation
479
480 # Test when the malformation is allowed
481 undef @warnings;
bd70aaaf 482 $ret_ref = test_utf8n_to_uvchr($bytes, $length, $allow_flags);
eb83ed87
KW
483 is($ret_ref->[0], $allowed_uv, "$testname: allowed: Returns expected uv");
484 is($ret_ref->[1], $expected_len, "$testname: allowed: Returns expected length");
485 if (!is(scalar @warnings, 0, "$testname: allowed: no warnings generated"))
486 {
7dfd8446 487 diag "The warnings were: " . join(", ", @warnings);
eb83ed87
KW
488 }
489}
490
491my $FF_ret;
492
7dfd8446 493if ($is64bit) {
eb83ed87
KW
494 no warnings qw{portable overflow};
495 $FF_ret = 0x1000000000;
496}
497else { # The above overflows unless a quad platform
498 $FF_ret = 0;
499}
500
501# Now test the cases where a legal code point is generated, but may or may not
502# be allowed/warned on.
2f8f112e 503my @tests = (
7dfd8446
KW
504 [ "lowest surrogate",
505 (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
506 $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
507 'surrogate', 0xD800,
508 (isASCII) ? 3 : 4,
509 qr/surrogate/
510 ],
511 [ "a middle surrogate",
512 (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
513 $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
514 'surrogate', 0xD90D,
515 (isASCII) ? 3 : 4,
eb83ed87
KW
516 qr/surrogate/
517 ],
7dfd8446
KW
518 [ "highest surrogate",
519 (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
520 $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
521 'surrogate', 0xDFFF,
522 (isASCII) ? 3 : 4,
523 qr/surrogate/
524 ],
525 [ "first non_unicode",
526 (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
527 $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
528 'non_unicode', 0x110000,
529 (isASCII) ? 4 : 5,
530 qr/not Unicode.* may not be portable/
531 ],
532 [ "first of 32 consecutive non-character code points",
533 (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
534 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
535 'nonchar', 0xFDD0,
536 (isASCII) ? 3 : 4,
537 qr/Unicode non-character.*is not recommended for open interchange/
538 ],
539 [ "a mid non-character code point of the 32 consecutive ones",
540 (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
541 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
542 'nonchar', 0xFDE0,
543 (isASCII) ? 3 : 4,
544 qr/Unicode non-character.*is not recommended for open interchange/
545 ],
546 [ "final of 32 consecutive non-character code points",
547 (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
548 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
549 'nonchar', 0xFDEF,
550 (isASCII) ? 3 : 4,
551 qr/Unicode non-character.*is not recommended for open interchange/
552 ],
553 [ "non-character code point U+FFFE",
554 (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
555 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
556 'nonchar', 0xFFFE,
557 (isASCII) ? 3 : 4,
558 qr/Unicode non-character.*is not recommended for open interchange/
559 ],
560 [ "non-character code point U+FFFF",
561 (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
562 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
563 'nonchar', 0xFFFF,
564 (isASCII) ? 3 : 4,
565 qr/Unicode non-character.*is not recommended for open interchange/
566 ],
567 [ "non-character code point U+1FFFE",
568 (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
569 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
570 'nonchar', 0x1FFFE, 4,
571 qr/Unicode non-character.*is not recommended for open interchange/
572 ],
573 [ "non-character code point U+1FFFF",
574 (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
575 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
576 'nonchar', 0x1FFFF, 4,
577 qr/Unicode non-character.*is not recommended for open interchange/
578 ],
579 [ "non-character code point U+2FFFE",
580 (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
581 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
582 'nonchar', 0x2FFFE, 4,
583 qr/Unicode non-character.*is not recommended for open interchange/
584 ],
585 [ "non-character code point U+2FFFF",
586 (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
587 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
588 'nonchar', 0x2FFFF, 4,
589 qr/Unicode non-character.*is not recommended for open interchange/
590 ],
591 [ "non-character code point U+3FFFE",
592 (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
593 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
594 'nonchar', 0x3FFFE, 4,
595 qr/Unicode non-character.*is not recommended for open interchange/
596 ],
597 [ "non-character code point U+3FFFF",
598 (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
599 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
600 'nonchar', 0x3FFFF, 4,
601 qr/Unicode non-character.*is not recommended for open interchange/
602 ],
603 [ "non-character code point U+4FFFE",
604 (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
605 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
606 'nonchar', 0x4FFFE,
607 (isASCII) ? 4 : 5,
608 qr/Unicode non-character.*is not recommended for open interchange/
609 ],
610 [ "non-character code point U+4FFFF",
611 (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
612 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
613 'nonchar', 0x4FFFF,
614 (isASCII) ? 4 : 5,
615 qr/Unicode non-character.*is not recommended for open interchange/
616 ],
617 [ "non-character code point U+5FFFE",
618 (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
619 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
620 'nonchar', 0x5FFFE,
621 (isASCII) ? 4 : 5,
622 qr/Unicode non-character.*is not recommended for open interchange/
623 ],
624 [ "non-character code point U+5FFFF",
625 (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
626 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
627 'nonchar', 0x5FFFF,
628 (isASCII) ? 4 : 5,
629 qr/Unicode non-character.*is not recommended for open interchange/
630 ],
631 [ "non-character code point U+6FFFE",
632 (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
633 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
634 'nonchar', 0x6FFFE,
635 (isASCII) ? 4 : 5,
636 qr/Unicode non-character.*is not recommended for open interchange/
637 ],
638 [ "non-character code point U+6FFFF",
639 (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
640 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
641 'nonchar', 0x6FFFF,
642 (isASCII) ? 4 : 5,
643 qr/Unicode non-character.*is not recommended for open interchange/
644 ],
645 [ "non-character code point U+7FFFE",
646 (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
647 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
648 'nonchar', 0x7FFFE,
649 (isASCII) ? 4 : 5,
650 qr/Unicode non-character.*is not recommended for open interchange/
651 ],
652 [ "non-character code point U+7FFFF",
653 (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
654 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
655 'nonchar', 0x7FFFF,
656 (isASCII) ? 4 : 5,
657 qr/Unicode non-character.*is not recommended for open interchange/
658 ],
659 [ "non-character code point U+8FFFE",
660 (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
661 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
662 'nonchar', 0x8FFFE,
663 (isASCII) ? 4 : 5,
664 qr/Unicode non-character.*is not recommended for open interchange/
665 ],
666 [ "non-character code point U+8FFFF",
667 (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
668 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
669 'nonchar', 0x8FFFF,
670 (isASCII) ? 4 : 5,
671 qr/Unicode non-character.*is not recommended for open interchange/
672 ],
673 [ "non-character code point U+9FFFE",
674 (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
675 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
676 'nonchar', 0x9FFFE,
677 (isASCII) ? 4 : 5,
678 qr/Unicode non-character.*is not recommended for open interchange/
679 ],
680 [ "non-character code point U+9FFFF",
681 (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
682 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
683 'nonchar', 0x9FFFF,
684 (isASCII) ? 4 : 5,
685 qr/Unicode non-character.*is not recommended for open interchange/
686 ],
687 [ "non-character code point U+AFFFE",
688 (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
689 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
690 'nonchar', 0xAFFFE,
691 (isASCII) ? 4 : 5,
692 qr/Unicode non-character.*is not recommended for open interchange/
693 ],
694 [ "non-character code point U+AFFFF",
695 (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
696 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
697 'nonchar', 0xAFFFF,
698 (isASCII) ? 4 : 5,
699 qr/Unicode non-character.*is not recommended for open interchange/
700 ],
701 [ "non-character code point U+BFFFE",
702 (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
703 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
704 'nonchar', 0xBFFFE,
705 (isASCII) ? 4 : 5,
706 qr/Unicode non-character.*is not recommended for open interchange/
707 ],
708 [ "non-character code point U+BFFFF",
709 (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
710 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
711 'nonchar', 0xBFFFF,
712 (isASCII) ? 4 : 5,
713 qr/Unicode non-character.*is not recommended for open interchange/
714 ],
715 [ "non-character code point U+CFFFE",
716 (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
717 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
718 'nonchar', 0xCFFFE,
719 (isASCII) ? 4 : 5,
720 qr/Unicode non-character.*is not recommended for open interchange/
721 ],
722 [ "non-character code point U+CFFFF",
723 (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
724 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
725 'nonchar', 0xCFFFF,
726 (isASCII) ? 4 : 5,
727 qr/Unicode non-character.*is not recommended for open interchange/
728 ],
729 [ "non-character code point U+DFFFE",
730 (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
731 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
732 'nonchar', 0xDFFFE,
733 (isASCII) ? 4 : 5,
734 qr/Unicode non-character.*is not recommended for open interchange/
735 ],
736 [ "non-character code point U+DFFFF",
737 (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
738 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
739 'nonchar', 0xDFFFF,
740 (isASCII) ? 4 : 5,
741 qr/Unicode non-character.*is not recommended for open interchange/
742 ],
743 [ "non-character code point U+EFFFE",
744 (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
745 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
746 'nonchar', 0xEFFFE,
747 (isASCII) ? 4 : 5,
748 qr/Unicode non-character.*is not recommended for open interchange/
749 ],
750 [ "non-character code point U+EFFFF",
751 (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
752 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
753 'nonchar', 0xEFFFF,
754 (isASCII) ? 4 : 5,
755 qr/Unicode non-character.*is not recommended for open interchange/
756 ],
757 [ "non-character code point U+FFFFE",
758 (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
759 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
760 'nonchar', 0xFFFFE,
761 (isASCII) ? 4 : 5,
762 qr/Unicode non-character.*is not recommended for open interchange/
763 ],
764 [ "non-character code point U+FFFFF",
765 (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
766 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
767 'nonchar', 0xFFFFF,
768 (isASCII) ? 4 : 5,
769 qr/Unicode non-character.*is not recommended for open interchange/
770 ],
771 [ "non-character code point U+10FFFE",
772 (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
773 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
774 'nonchar', 0x10FFFE,
775 (isASCII) ? 4 : 5,
776 qr/Unicode non-character.*is not recommended for open interchange/
eb83ed87 777 ],
7dfd8446
KW
778 [ "non-character code point U+10FFFF",
779 (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
780 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
781 'nonchar', 0x10FFFF,
782 (isASCII) ? 4 : 5,
ba707cdc 783 qr/Unicode non-character.*is not recommended for open interchange/
eb83ed87 784 ],
7dfd8446
KW
785);
786
787
788if (isASCII) {
789 push @tests,
790 [ "is at least 2*31",
791 "\xfe\x82\x80\x80\x80\x80\x80",
eb83ed87
KW
792
793 # This code point is chosen so that it is representable in a UV on
2f8f112e 794 # 32-bit machines
7dfd8446
KW
795 $UTF8_WARN_FE_FF, $UTF8_DISALLOW_FE_FF,
796 'utf8', 0x80000000, 7,
797 qr/Code point 0x80000000 is not Unicode, and not portable/
798 ],
799 [ "is at least 2*31, and use SUPER-type flags, instead of FE_FF",
800 "\xfe\x82\x80\x80\x80\x80\x80",
801 $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
802 'utf8', 0x80000000, 7,
ea5ced44 803 qr/Code point 0x80000000 is not Unicode, and not portable/
eb83ed87 804 ],
2f8f112e
KW
805 [ "overflow with FE/FF",
806 # This tests the interaction of WARN_FE_FF/DISALLOW_FE_FF with
807 # overflow. The overflow malformation is never allowed, so preventing
808 # it takes precedence if the FE_FF options would otherwise allow in an
809 # overflowing value. These two code points (1 for 32-bits; 1 for 64)
810 # were chosen because the old overflow detection algorithm did not
811 # catch them; this means this test also checks for that fix.
7dfd8446 812 ($is64bit)
2f8f112e
KW
813 ? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
814 : "\xfe\x86\x80\x80\x80\x80\x80",
ea5ced44
KW
815
816 # We include both warning categories to make sure the FE_FF one has
817 # precedence
7dfd8446
KW
818 "$UTF8_WARN_FE_FF|$UTF8_WARN_SUPER", "$UTF8_DISALLOW_FE_FF",
819 'utf8', 0,
820 ($is64bit) ? 13 : 7,
ea5ced44 821 qr/overflow at byte .*, after start byte 0xf/
eb83ed87 822 ],
7dfd8446
KW
823 ;
824}
2f8f112e 825
7dfd8446 826if ($is64bit) { # All FF's will overflow on 32 bit
2f8f112e
KW
827 push @tests,
828 [ "begins with FF", "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
7dfd8446
KW
829 $UTF8_WARN_FE_FF, $UTF8_DISALLOW_FE_FF,
830 'utf8', $FF_ret, 13,
ea5ced44 831 qr/Code point 0x.* is not Unicode, and not portable/
2f8f112e
KW
832 ];
833}
834
835foreach my $test (@tests) {
eb83ed87
KW
836 my ($testname, $bytes, $warn_flags, $disallow_flags, $category, $allowed_uv, $expected_len, $message ) = @$test;
837
838 my $length = length $bytes;
2f8f112e 839 my $will_overflow = $testname =~ /overflow/;
eb83ed87
KW
840
841 # This is more complicated than the malformations tested earlier, as there
842 # are several orthogonal variables involved. We test all the subclasses
843 # of utf8 warnings to verify they work with and without the utf8 class,
844 # and don't have effects on other sublass warnings
54f4afef 845 foreach my $warning ('utf8', 'surrogate', 'nonchar', 'non_unicode') {
eb83ed87
KW
846 foreach my $warn_flag (0, $warn_flags) {
847 foreach my $disallow_flag (0, $disallow_flags) {
54f4afef 848 foreach my $do_warning (0, 1) {
eb83ed87 849
13d7a909
KW
850 my $eval_warn = $do_warning
851 ? "use warnings '$warning'"
852 : $warning eq "utf8"
7dfd8446
KW
853 ? "no warnings 'utf8'"
854 : "use warnings 'utf8'; no warnings '$warning'";
2f8f112e 855
13d7a909
KW
856 # is effectively disallowed if will overflow, even if the
857 # flag indicates it is allowed, fix up test name to
858 # indicate this as well
859 my $disallowed = $disallow_flag || $will_overflow;
2f8f112e 860
13d7a909
KW
861 my $this_name = "$testname: " . (($disallow_flag)
862 ? 'disallowed'
863 : ($disallowed)
864 ? 'FE_FF allowed'
865 : 'allowed');
866 $this_name .= ", $eval_warn";
867 $this_name .= ", " . (($warn_flag)
868 ? 'with warning flag'
869 : 'no warning flag');
eb83ed87 870
13d7a909
KW
871 undef @warnings;
872 my $ret_ref;
7dfd8446
KW
873 my $display_bytes = display_bytes($bytes);
874 my $call = "Call was: $eval_warn; \$ret_ref = test_utf8n_to_uvchr('$display_bytes', $length, $warn_flag|$disallow_flag)";
875 my $eval_text = "$eval_warn; \$ret_ref = test_utf8n_to_uvchr('$bytes', $length, $warn_flag|$disallow_flag)";
13d7a909
KW
876 eval "$eval_text";
877 if (! ok ("$@ eq ''", "$this_name: eval succeeded")) {
7dfd8446 878 diag "\$!='$!'; eval'd=\"$call\"";
13d7a909 879 next;
54f4afef 880 }
13d7a909 881 if ($disallowed) {
7dfd8446
KW
882 unless (is($ret_ref->[0], 0, "$this_name: Returns 0"))
883 {
884 diag $call;
885 }
2f8f112e
KW
886 }
887 else {
7dfd8446
KW
888 unless (is($ret_ref->[0], $allowed_uv,
889 "$this_name: Returns expected uv"))
890 {
891 diag $call;
892 }
893 }
894 unless (is($ret_ref->[1], $expected_len,
895 "$this_name: Returns expected length"))
896 {
897 diag $call;
13d7a909 898 }
13d7a909
KW
899
900 if (! $do_warning
901 && ($warning eq 'utf8' || $warning eq $category))
902 {
903 if (!is(scalar @warnings, 0,
904 "$this_name: No warnings generated"))
905 {
7dfd8446
KW
906 diag $call;
907 diag "The warnings were: " . join(", ", @warnings);
2f8f112e
KW
908 }
909 }
13d7a909
KW
910 elsif ($will_overflow
911 && ! $disallow_flag
912 && $warning eq 'utf8')
913 {
914
915 # Will get the overflow message instead of the expected
916 # message under these circumstances, as they would
917 # otherwise accept an overflowed value, which the code
918 # should not allow, so falls back to overflow.
919 if (is(scalar @warnings, 1,
920 "$this_name: Got a single warning "))
921 {
7dfd8446
KW
922 unless (like($warnings[0], qr/overflow/,
923 "$this_name: Got overflow warning"))
924 {
925 diag $call;
926 }
13d7a909
KW
927 }
928 else {
7dfd8446 929 diag $call;
13d7a909 930 if (scalar @warnings) {
7dfd8446 931 diag "The warnings were: "
13d7a909
KW
932 . join(", ", @warnings);
933 }
934 }
eb83ed87 935 }
13d7a909
KW
936 elsif ($warn_flag
937 && ($warning eq 'utf8' || $warning eq $category))
938 {
939 if (is(scalar @warnings, 1,
940 "$this_name: Got a single warning "))
941 {
7dfd8446
KW
942 TODO: {
943 local $TODO = "Doesn't give dire warning for code points 2**31" if $allowed_uv >= 2**31 && $warnings[0] =~ /may not be portable/;
944 unless (like($warnings[0], $message,
945 "$this_name: Got expected warning"))
946 {
947 diag $call;
948 }
949 }
13d7a909
KW
950 }
951 else {
7dfd8446 952 diag $call;
13d7a909 953 if (scalar @warnings) {
7dfd8446 954 diag "The warnings were: "
13d7a909
KW
955 . join(", ", @warnings);
956 }
eb83ed87
KW
957 }
958 }
eb83ed87 959
13d7a909
KW
960 # Check CHECK_ONLY results when the input is disallowed. Do
961 # this when actually disallowed, not just when the
962 # $disallow_flag is set
963 if ($disallowed) {
964 undef @warnings;
965 $ret_ref = test_utf8n_to_uvchr($bytes, $length,
966 $disallow_flag|$UTF8_CHECK_ONLY);
7dfd8446
KW
967 unless (is($ret_ref->[0], 0, "$this_name, CHECK_ONLY: Returns 0")) {
968 diag $call;
969 }
970 unless (is($ret_ref->[1], -1,
971 "$this_name: CHECK_ONLY: returns expected length"))
972 {
973 diag $call;
974 }
13d7a909
KW
975 if (! is(scalar @warnings, 0,
976 "$this_name, CHECK_ONLY: no warnings generated"))
977 {
7dfd8446
KW
978 diag $call;
979 diag "The warnings were: " . join(", ", @warnings);
13d7a909 980 }
eb83ed87
KW
981 }
982 }
983 }
984 }
985 }
986}
6e3d6c02 987
fed3ba5d 988done_testing;