This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
APItest/t/utf8_warn_base.pl: Remove hash elements
[perl5.git] / ext / XS-APItest / t / utf8_warn_base.pl
CommitLineData
6aa905cf
KW
1#!perl -w
2
3# This is a base file to be used by various .t's in its directory
4
5use strict;
6use Test::More;
7
8BEGIN {
9 use_ok('XS::APItest');
10 require 'charset_tools.pl';
11 require './t/utf8_setup.pl';
12};
13
14$|=1;
15
16no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
17 # machines, and that is tested elsewhere
18
19use XS::APItest;
20use Data::Dumper;
21
22my @warnings;
23
24use warnings 'utf8';
25local $SIG{__WARN__} = sub { push @warnings, @_ };
26
27sub nonportable_regex ($) {
28
29 # Returns a pattern that matches the non-portable message raised either
30 # for the specific input code point, or the one generated when there
31 # is some malformation that precludes the message containing the specific
32 # code point
33
34 my $code_point = shift;
35
36 my $string = sprintf '(Code point 0x%X is not Unicode, and'
37 . '|Any UTF-8 sequence that starts with'
38 . ' "(\\\x[[:xdigit:]]{2})+" is for a'
39 . ' non-Unicode code point, and is) not portable',
40 $code_point;
41 return qr/$string/;
42}
43
44# Now test the cases where a legal code point is generated, but may or may not
45# be allowed/warned on.
46my @tests = (
b7e1f4b2
KW
47 # ($testname, $bytes, $disallow_flags, $category,
48 # $allowed_uv, $needed_to_discern_len )
6aa905cf
KW
49 [ "lowest surrogate",
50 (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
b7e1f4b2 51 $::UTF8_DISALLOW_SURROGATE,
6aa905cf 52 'surrogate', 0xD800,
6aa905cf 53 2,
6aa905cf
KW
54 ],
55 [ "a middle surrogate",
56 (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
b7e1f4b2 57 $::UTF8_DISALLOW_SURROGATE,
6aa905cf 58 'surrogate', 0xD90D,
6aa905cf 59 2,
6aa905cf
KW
60 ],
61 [ "highest surrogate",
62 (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
b7e1f4b2 63 $::UTF8_DISALLOW_SURROGATE,
6aa905cf 64 'surrogate', 0xDFFF,
6aa905cf 65 2,
6aa905cf
KW
66 ],
67 [ "first non_unicode",
68 (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
b7e1f4b2 69 $::UTF8_DISALLOW_SUPER,
6aa905cf 70 'non_unicode', 0x110000,
6aa905cf 71 2,
6aa905cf
KW
72 ],
73 [ "non_unicode whose first byte tells that",
74 (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
b7e1f4b2 75 $::UTF8_DISALLOW_SUPER,
6aa905cf
KW
76 'non_unicode',
77 (isASCII) ? 0x140000 : 0x200000,
6aa905cf 78 1,
6aa905cf
KW
79 ],
80 [ "first of 32 consecutive non-character code points",
81 (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
b7e1f4b2 82 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
83 'nonchar', 0xFDD0,
84 (isASCII) ? 3 : 4,
6aa905cf
KW
85 ],
86 [ "a mid non-character code point of the 32 consecutive ones",
87 (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
b7e1f4b2 88 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
89 'nonchar', 0xFDE0,
90 (isASCII) ? 3 : 4,
6aa905cf
KW
91 ],
92 [ "final of 32 consecutive non-character code points",
93 (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
b7e1f4b2 94 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
95 'nonchar', 0xFDEF,
96 (isASCII) ? 3 : 4,
6aa905cf
KW
97 ],
98 [ "non-character code point U+FFFE",
99 (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
b7e1f4b2 100 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
101 'nonchar', 0xFFFE,
102 (isASCII) ? 3 : 4,
6aa905cf
KW
103 ],
104 [ "non-character code point U+FFFF",
105 (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
b7e1f4b2 106 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
107 'nonchar', 0xFFFF,
108 (isASCII) ? 3 : 4,
6aa905cf
KW
109 ],
110 [ "non-character code point U+1FFFE",
111 (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
b7e1f4b2 112 $::UTF8_DISALLOW_NONCHAR,
6aa905cf 113 'nonchar', 0x1FFFE,
b3169593 114 4,
6aa905cf
KW
115 ],
116 [ "non-character code point U+1FFFF",
117 (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
b7e1f4b2 118 $::UTF8_DISALLOW_NONCHAR,
6aa905cf 119 'nonchar', 0x1FFFF,
b3169593 120 4,
6aa905cf
KW
121 ],
122 [ "non-character code point U+2FFFE",
123 (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
b7e1f4b2 124 $::UTF8_DISALLOW_NONCHAR,
6aa905cf 125 'nonchar', 0x2FFFE,
b3169593 126 4,
6aa905cf
KW
127 ],
128 [ "non-character code point U+2FFFF",
129 (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
b7e1f4b2 130 $::UTF8_DISALLOW_NONCHAR,
6aa905cf 131 'nonchar', 0x2FFFF,
b3169593 132 4,
6aa905cf
KW
133 ],
134 [ "non-character code point U+3FFFE",
135 (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
b7e1f4b2 136 $::UTF8_DISALLOW_NONCHAR,
6aa905cf 137 'nonchar', 0x3FFFE,
b3169593 138 4,
6aa905cf
KW
139 ],
140 [ "non-character code point U+3FFFF",
141 (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
b7e1f4b2 142 $::UTF8_DISALLOW_NONCHAR,
6aa905cf 143 'nonchar', 0x3FFFF,
b3169593 144 4,
6aa905cf
KW
145 ],
146 [ "non-character code point U+4FFFE",
147 (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
b7e1f4b2 148 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
149 'nonchar', 0x4FFFE,
150 (isASCII) ? 4 : 5,
6aa905cf
KW
151 ],
152 [ "non-character code point U+4FFFF",
153 (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
b7e1f4b2 154 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
155 'nonchar', 0x4FFFF,
156 (isASCII) ? 4 : 5,
6aa905cf
KW
157 ],
158 [ "non-character code point U+5FFFE",
159 (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
b7e1f4b2 160 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
161 'nonchar', 0x5FFFE,
162 (isASCII) ? 4 : 5,
6aa905cf
KW
163 ],
164 [ "non-character code point U+5FFFF",
165 (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
b7e1f4b2 166 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
167 'nonchar', 0x5FFFF,
168 (isASCII) ? 4 : 5,
6aa905cf
KW
169 ],
170 [ "non-character code point U+6FFFE",
171 (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
b7e1f4b2 172 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
173 'nonchar', 0x6FFFE,
174 (isASCII) ? 4 : 5,
6aa905cf
KW
175 ],
176 [ "non-character code point U+6FFFF",
177 (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
b7e1f4b2 178 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
179 'nonchar', 0x6FFFF,
180 (isASCII) ? 4 : 5,
6aa905cf
KW
181 ],
182 [ "non-character code point U+7FFFE",
183 (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
b7e1f4b2 184 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
185 'nonchar', 0x7FFFE,
186 (isASCII) ? 4 : 5,
6aa905cf
KW
187 ],
188 [ "non-character code point U+7FFFF",
189 (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
b7e1f4b2 190 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
191 'nonchar', 0x7FFFF,
192 (isASCII) ? 4 : 5,
6aa905cf
KW
193 ],
194 [ "non-character code point U+8FFFE",
195 (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
b7e1f4b2 196 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
197 'nonchar', 0x8FFFE,
198 (isASCII) ? 4 : 5,
6aa905cf
KW
199 ],
200 [ "non-character code point U+8FFFF",
201 (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
b7e1f4b2 202 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
203 'nonchar', 0x8FFFF,
204 (isASCII) ? 4 : 5,
6aa905cf
KW
205 ],
206 [ "non-character code point U+9FFFE",
207 (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
b7e1f4b2 208 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
209 'nonchar', 0x9FFFE,
210 (isASCII) ? 4 : 5,
6aa905cf
KW
211 ],
212 [ "non-character code point U+9FFFF",
213 (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
b7e1f4b2 214 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
215 'nonchar', 0x9FFFF,
216 (isASCII) ? 4 : 5,
6aa905cf
KW
217 ],
218 [ "non-character code point U+AFFFE",
219 (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
b7e1f4b2 220 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
221 'nonchar', 0xAFFFE,
222 (isASCII) ? 4 : 5,
6aa905cf
KW
223 ],
224 [ "non-character code point U+AFFFF",
225 (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
b7e1f4b2 226 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
227 'nonchar', 0xAFFFF,
228 (isASCII) ? 4 : 5,
6aa905cf
KW
229 ],
230 [ "non-character code point U+BFFFE",
231 (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
b7e1f4b2 232 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
233 'nonchar', 0xBFFFE,
234 (isASCII) ? 4 : 5,
6aa905cf
KW
235 ],
236 [ "non-character code point U+BFFFF",
237 (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
b7e1f4b2 238 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
239 'nonchar', 0xBFFFF,
240 (isASCII) ? 4 : 5,
6aa905cf
KW
241 ],
242 [ "non-character code point U+CFFFE",
243 (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
b7e1f4b2 244 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
245 'nonchar', 0xCFFFE,
246 (isASCII) ? 4 : 5,
6aa905cf
KW
247 ],
248 [ "non-character code point U+CFFFF",
249 (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
b7e1f4b2 250 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
251 'nonchar', 0xCFFFF,
252 (isASCII) ? 4 : 5,
6aa905cf
KW
253 ],
254 [ "non-character code point U+DFFFE",
255 (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
b7e1f4b2 256 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
257 'nonchar', 0xDFFFE,
258 (isASCII) ? 4 : 5,
6aa905cf
KW
259 ],
260 [ "non-character code point U+DFFFF",
261 (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
b7e1f4b2 262 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
263 'nonchar', 0xDFFFF,
264 (isASCII) ? 4 : 5,
6aa905cf
KW
265 ],
266 [ "non-character code point U+EFFFE",
267 (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
b7e1f4b2 268 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
269 'nonchar', 0xEFFFE,
270 (isASCII) ? 4 : 5,
6aa905cf
KW
271 ],
272 [ "non-character code point U+EFFFF",
273 (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
b7e1f4b2 274 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
275 'nonchar', 0xEFFFF,
276 (isASCII) ? 4 : 5,
6aa905cf
KW
277 ],
278 [ "non-character code point U+FFFFE",
279 (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
b7e1f4b2 280 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
281 'nonchar', 0xFFFFE,
282 (isASCII) ? 4 : 5,
6aa905cf
KW
283 ],
284 [ "non-character code point U+FFFFF",
285 (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
b7e1f4b2 286 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
287 'nonchar', 0xFFFFF,
288 (isASCII) ? 4 : 5,
6aa905cf
KW
289 ],
290 [ "non-character code point U+10FFFE",
291 (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
b7e1f4b2 292 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
293 'nonchar', 0x10FFFE,
294 (isASCII) ? 4 : 5,
6aa905cf
KW
295 ],
296 [ "non-character code point U+10FFFF",
297 (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
b7e1f4b2 298 $::UTF8_DISALLOW_NONCHAR,
6aa905cf
KW
299 'nonchar', 0x10FFFF,
300 (isASCII) ? 4 : 5,
6aa905cf 301 ],
76513bdc
KW
302 [ "requires at least 32 bits",
303 (isASCII)
304 ? "\xfe\x82\x80\x80\x80\x80\x80"
305 : I8_to_native(
306 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
307 # This code point is chosen so that it is representable in a UV on
308 # 32-bit machines
b7e1f4b2 309 $::UTF8_DISALLOW_ABOVE_31_BIT,
76513bdc 310 'utf8', 0x80000000,
76513bdc 311 (isASCII) ? 1 : 8,
76513bdc
KW
312 ],
313 [ "highest 32 bit code point",
314 (isASCII)
315 ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
316 : I8_to_native(
317 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
b7e1f4b2 318 $::UTF8_DISALLOW_ABOVE_31_BIT,
76513bdc 319 'utf8', 0xFFFFFFFF,
76513bdc 320 (isASCII) ? 1 : 8,
76513bdc
KW
321 ],
322 [ "requires at least 32 bits, and use SUPER-type flags, instead of"
323 . " ABOVE_31_BIT",
324 (isASCII)
325 ? "\xfe\x82\x80\x80\x80\x80\x80"
326 : I8_to_native(
327 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
b7e1f4b2 328 $::UTF8_DISALLOW_SUPER,
76513bdc 329 'utf8', 0x80000000,
76513bdc 330 1,
76513bdc
KW
331 ],
332 [ "overflow with warnings/disallow for more than 31 bits",
333 # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT
334 # with overflow. The overflow malformation is never allowed, so
335 # preventing it takes precedence if the ABOVE_31_BIT options would
336 # otherwise allow in an overflowing value. The ASCII code points (1
337 # for 32-bits; 1 for 64) were chosen because the old overflow
338 # detection algorithm did not catch them; this means this test also
339 # checks for that fix. The EBCDIC are arbitrary overflowing ones
340 # since we have no reports of failures with it.
341 (($::is64bit)
342 ? ((isASCII)
343 ? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
344 : I8_to_native(
345 "\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"))
346 : ((isASCII)
347 ? "\xfe\x86\x80\x80\x80\x80\x80"
348 : I8_to_native(
349 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))),
76513bdc 350 $::UTF8_DISALLOW_ABOVE_31_BIT,
04f42bf6 351 'utf8', -1,
76513bdc 352 (isASCII || $::is64bit) ? 2 : 8,
76513bdc 353 ],
6aa905cf
KW
354);
355
356if (! $::is64bit) {
357 if (isASCII) {
358 no warnings qw{portable overflow};
359 push @tests,
360 [ "Lowest 33 bit code point: overflow",
361 "\xFE\x84\x80\x80\x80\x80\x80",
b7e1f4b2 362 $::UTF8_DISALLOW_ABOVE_31_BIT,
04f42bf6 363 'utf8', -1,
b3169593 364 1,
6aa905cf
KW
365 ];
366 }
367}
368else {
369 no warnings qw{portable overflow};
370 push @tests,
371 [ "More than 32 bits",
372 (isASCII)
373 ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
374 : I8_to_native(
375 "\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
b7e1f4b2 376 $::UTF8_DISALLOW_ABOVE_31_BIT,
6aa905cf 377 'utf8', 0x1000000000,
b3169593 378 (isASCII) ? 1 : 7,
6aa905cf
KW
379 ];
380 if (! isASCII) {
381 push @tests, # These could falsely show wrongly in a naive
382 # implementation
383 [ "requires at least 32 bits",
384 I8_to_native(
385 "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
b7e1f4b2 386 $::UTF8_DISALLOW_ABOVE_31_BIT,
6aa905cf 387 'utf8', 0x800000000,
b3169593 388 7,
6aa905cf
KW
389 ],
390 [ "requires at least 32 bits",
391 I8_to_native(
392 "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
b7e1f4b2 393 $::UTF8_DISALLOW_ABOVE_31_BIT,
6aa905cf 394 'utf8', 0x10000000000,
b3169593 395 6,
6aa905cf
KW
396 ],
397 [ "requires at least 32 bits",
398 I8_to_native(
399 "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
b7e1f4b2 400 $::UTF8_DISALLOW_ABOVE_31_BIT,
6aa905cf 401 'utf8', 0x200000000000,
b3169593 402 5,
6aa905cf
KW
403 ],
404 [ "requires at least 32 bits",
405 I8_to_native(
406 "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
b7e1f4b2 407 $::UTF8_DISALLOW_ABOVE_31_BIT,
6aa905cf 408 'utf8', 0x4000000000000,
b3169593 409 4,
6aa905cf
KW
410 ],
411 [ "requires at least 32 bits",
412 I8_to_native(
413 "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
b7e1f4b2 414 $::UTF8_DISALLOW_ABOVE_31_BIT,
6aa905cf 415 'utf8', 0x80000000000000,
b3169593 416 3,
6aa905cf
KW
417 ],
418 [ "requires at least 32 bits",
419 I8_to_native(
420 "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
b7e1f4b2 421 $::UTF8_DISALLOW_ABOVE_31_BIT,
6aa905cf 422 'utf8', 0x1000000000000000,
b3169593 423 2,
6aa905cf
KW
424 ];
425 }
426}
427
428# This test is split into this number of files.
429my $num_test_files = $ENV{TEST_JOBS} || 1;
430$num_test_files = 10 if $num_test_files > 10;
431
432my $test_count = -1;
433foreach my $test (@tests) {
434 $test_count++;
435 next if $test_count % $num_test_files != $::TEST_CHUNK;
436
b7e1f4b2 437 my ($testname, $bytes, $disallow_flags,
3022ad00 438 $category, $allowed_uv, $needed_to_discern_len
6aa905cf
KW
439 ) = @$test;
440
441 my $length = length $bytes;
04f42bf6 442 my $will_overflow = $allowed_uv < 0;
6aa905cf 443
b7e1f4b2
KW
444 # The convention is that the got flag is the same value as the disallow
445 # one, and the warn flag is the next bit over. If this were violated, the
446 # tests here should start failing. We could do an eval under no strict to
447 # be sure.
448 my $expected_error_flags = $disallow_flags;
449 my $warn_flags = $disallow_flags << 1;
450
3022ad00
KW
451 my $message;
452 if ($allowed_uv > 0x7FFFFFFF) {
453 $message = nonportable_regex($allowed_uv);
454 }
455 elsif ($allowed_uv > 0x10FFFF) {
456 $message = qr/(not Unicode|for a non-Unicode code point).* may not be portable/;
457 }
458 elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
459 $message = qr/surrogate/;
460 }
461 elsif ( ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
462 || ($allowed_uv & 0xFFFE) == 0xFFFE)
463 {
464 $message = qr/Unicode non-character.*is not recommended for open interchange/;
465 }
466 elsif ($will_overflow) {
467 $message = qr/overflows/;
468 }
469 else {
470 die "Can't figure out what type of warning to test for $testname"
471 }
472
6aa905cf
KW
473 {
474 use warnings;
475 undef @warnings;
476 my $ret = test_isUTF8_CHAR($bytes, $length);
477 my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
478 if ($will_overflow) {
479 is($ret, 0, "isUTF8_CHAR() $testname: returns 0");
480 is($ret_flags, 0, "isUTF8_CHAR_flags() $testname: returns 0");
481 }
482 else {
483 is($ret, $length,
484 "isUTF8_CHAR() $testname: returns expected length: $length");
485 is($ret_flags, $length, "isUTF8_CHAR_flags(...,0) $testname:"
486 . " returns expected length: $length");
487 }
488 is(scalar @warnings, 0,
489 "isUTF8_CHAR() and isUTF8_CHAR()_flags $testname: generated"
490 . " no warnings")
491 or output_warnings(@warnings);
492
493 undef @warnings;
494 $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
495 if ($will_overflow) {
496 is($ret, 0, "isSTRICT_UTF8_CHAR() $testname: returns 0");
497 }
498 else {
499 my $expected_ret = ( $testname =~ /surrogate|non-character/
500 || $allowed_uv > 0x10FFFF)
501 ? 0
502 : $length;
503 is($ret, $expected_ret, "isSTRICT_UTF8_CHAR() $testname: returns"
504 . " expected length: $expected_ret");
505 $ret = test_isUTF8_CHAR_flags($bytes, $length,
506 $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
507 is($ret, $expected_ret,
508 "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')"
509 . " acts like isSTRICT_UTF8_CHAR");
510 }
511 is(scalar @warnings, 0,
512 "isSTRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:"
513 . " generated no warnings")
514 or output_warnings(@warnings);
515
516 undef @warnings;
517 $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
518 if ($will_overflow) {
519 is($ret, 0, "isC9_STRICT_UTF8_CHAR() $testname: returns 0");
520 }
521 else {
522 my $expected_ret = ( $testname =~ /surrogate/
523 || $allowed_uv > 0x10FFFF)
524 ? 0
525 : $length;
526 is($ret, $expected_ret, "isC9_STRICT_UTF8_CHAR() $testname:"
527 ." returns expected length: $expected_ret");
528 $ret = test_isUTF8_CHAR_flags($bytes, $length,
529 $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
530 is($ret, $expected_ret,
531 "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')"
532 . " acts like isC9_STRICT_UTF8_CHAR");
533 }
534 is(scalar @warnings, 0,
535 "isC9_STRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:"
536 . " generated no warnings")
537 or output_warnings(@warnings);
538
539 # Test partial character handling, for each byte not a full character
540 for my $j (1.. $length - 1) {
541
542 # Skip the test for the interaction between overflow and above-31
543 # bit. It is really testing other things than the partial
544 # character tests, for which other tests in this file are
545 # sufficient
04f42bf6 546 last if $will_overflow;
6aa905cf
KW
547
548 foreach my $disallow_flag (0, $disallow_flags) {
549 my $partial = substr($bytes, 0, $j);
550 my $ret_should_be;
551 my $comment;
552 if ($disallow_flag) {
553 $ret_should_be = 0;
554 $comment = "disallowed";
555 if ($j < $needed_to_discern_len) {
556 $ret_should_be = 1;
557 $comment .= ", but need $needed_to_discern_len bytes"
558 . " to discern:";
559 }
560 }
561 else {
562 $ret_should_be = 1;
563 $comment = "allowed";
564 }
565
566 undef @warnings;
567
568 $ret = test_is_utf8_valid_partial_char_flags($partial, $j,
569 $disallow_flag);
570 is($ret, $ret_should_be,
571 "$testname: is_utf8_valid_partial_char_flags("
572 . display_bytes($partial)
573 . "), $comment: returns $ret_should_be");
574 is(scalar @warnings, 0,
575 "$testname: is_utf8_valid_partial_char_flags()"
576 . " generated no warnings")
577 or output_warnings(@warnings);
578 }
579 }
580 }
581
582 # This is more complicated than the malformations tested earlier, as there
583 # are several orthogonal variables involved. We test all the subclasses
584 # of utf8 warnings to verify they work with and without the utf8 class,
585 # and don't have effects on other sublass warnings
586 foreach my $warning ('utf8', 'surrogate', 'nonchar', 'non_unicode') {
587 foreach my $warn_flag (0, $warn_flags) {
588 foreach my $disallow_flag (0, $disallow_flags) {
589 foreach my $do_warning (0, 1) {
590
591 # We try each of the above with various combinations of
592 # malformations that can occur on the same input sequence.
593 foreach my $short ("", "short") {
594 foreach my $unexpected_noncont ("",
595 "unexpected non-continuation")
596 {
597 foreach my $overlong ("", "overlong") {
598
b3169593
KW
599 # If we're creating an overlong, it can't be longer than
600 # the maximum length, so skip if we're already at that
601 # length.
602 next if $overlong && $length >= $::max_bytes;
6aa905cf
KW
603
604 my @malformations;
605 my @expected_errors;
606 push @malformations, $short if $short;
607 push @malformations, $unexpected_noncont
608 if $unexpected_noncont;
609 push @malformations, $overlong if $overlong;
610
611 # The overflow malformation test in the input
612 # array is coerced into being treated like one of
613 # the others.
614 if ($will_overflow) {
615 push @malformations, 'overflow';
616 push @expected_errors, $::UTF8_GOT_OVERFLOW;
617 }
618
619 my $malformations_name = join "/", @malformations;
620 $malformations_name .= " malformation"
621 if $malformations_name;
622 $malformations_name .= "s" if @malformations > 1;
623 my $this_bytes = $bytes;
624 my $this_length = $length;
625 my $expected_uv = $allowed_uv;
b3169593 626 my $this_expected_len = $length;
6aa905cf
KW
627 my $this_needed_to_discern_len = $needed_to_discern_len;
628 if ($malformations_name) {
629 $expected_uv = 0;
630
631 # Coerce the input into the desired
632 # malformation
633 if ($malformations_name =~ /overlong/) {
634
635 # For an overlong, we convert the original
636 # start byte into a continuation byte with
637 # the same data bits as originally. ...
638 substr($this_bytes, 0, 1)
639 = start_byte_to_cont(substr($this_bytes,
640 0, 1));
641
642 # ... Then we prepend it with a known
643 # overlong sequence. This should evaluate
644 # to the exact same code point as the
645 # original.
646 $this_bytes
647 = I8_to_native("\xff")
dbb8d798 648 . (I8_to_native(chr $::lowest_continuation)
6aa905cf
KW
649 x ( $::max_bytes - 1 - length($this_bytes)))
650 . $this_bytes;
651 $this_length = length($this_bytes);
652 $this_needed_to_discern_len
653 = $::max_bytes - ($this_expected_len
654 - $this_needed_to_discern_len);
655 $this_expected_len = $::max_bytes;
656 push @expected_errors, $::UTF8_GOT_LONG;
657 }
658 if ($malformations_name =~ /short/) {
659
660 # Just tell the test to not look far
661 # enough into the input.
662 $this_length--;
663 $this_expected_len--;
664 push @expected_errors, $::UTF8_GOT_SHORT;
665 }
666 if ($malformations_name
667 =~ /non-continuation/)
668 {
669 # Change the final continuation byte into
670 # a non one.
671 my $pos = ($short) ? -2 : -1;
672 substr($this_bytes, $pos, 1) = '?';
673 $this_expected_len--;
674 push @expected_errors,
675 $::UTF8_GOT_NON_CONTINUATION;
676 }
677 }
678
679 my $eval_warn = $do_warning
680 ? "use warnings '$warning'"
681 : $warning eq "utf8"
682 ? "no warnings 'utf8'"
683 : ( "use warnings 'utf8';"
684 . " no warnings '$warning'");
685
686 # Is effectively disallowed if we've set up a
687 # malformation, even if the flag indicates it is
688 # allowed. Fix up test name to indicate this as
689 # well
690 my $disallowed = $disallow_flag
691 || $malformations_name;
692 my $this_name = "utf8n_to_uvchr_error() $testname: "
693 . (($disallow_flag)
694 ? 'disallowed'
695 : $disallowed
696 ? $disallowed
697 : 'allowed');
698 $this_name .= ", $eval_warn";
699 $this_name .= ", " . (($warn_flag)
700 ? 'with warning flag'
701 : 'no warning flag');
702
703 undef @warnings;
704 my $ret_ref;
705 my $display_bytes = display_bytes($this_bytes);
706 my $call = " Call was: $eval_warn; \$ret_ref"
707 . " = test_utf8n_to_uvchr_error("
708 . "'$display_bytes', $this_length,"
709 . "$warn_flag"
710 . "|$disallow_flag)";
711 my $eval_text = "$eval_warn; \$ret_ref"
712 . " = test_utf8n_to_uvchr_error("
713 . "'$this_bytes',"
714 . " $this_length, $warn_flag"
715 . "|$disallow_flag)";
716 eval "$eval_text";
717 if (! ok ("$@ eq ''",
718 "$this_name: eval succeeded"))
719 {
720 diag "\$!='$!'; eval'd=\"$call\"";
721 next;
722 }
723 if ($disallowed) {
724 is($ret_ref->[0], 0, "$this_name: Returns 0")
725 or diag $call;
726 }
727 else {
728 is($ret_ref->[0], $expected_uv,
729 "$this_name: Returns expected uv: "
730 . sprintf("0x%04X", $expected_uv))
731 or diag $call;
732 }
733 is($ret_ref->[1], $this_expected_len,
734 "$this_name: Returns expected length:"
735 . " $this_expected_len")
736 or diag $call;
737
738 my $errors = $ret_ref->[2];
739
740 for (my $i = @expected_errors - 1; $i >= 0; $i--) {
741 if (ok($expected_errors[$i] & $errors,
742 "Expected and got error bit return"
743 . " for $malformations[$i] malformation"))
744 {
745 $errors &= ~$expected_errors[$i];
746 }
747 splice @expected_errors, $i, 1;
748 }
749 is(scalar @expected_errors, 0,
750 "Got all the expected malformation errors")
751 or diag Dumper \@expected_errors;
752
753 if ( $this_expected_len >= $this_needed_to_discern_len
754 && ($warn_flag || $disallow_flag))
755 {
756 is($errors, $expected_error_flags,
757 "Got the correct error flag")
758 or diag $call;
759 }
760 else {
761 is($errors, 0, "Got no other error flag");
762 }
763
764 if (@malformations) {
765 if (! $do_warning && $warning eq 'utf8') {
766 goto no_warnings_expected;
767 }
768
769 # Check that each malformation generates a
770 # warning, removing that warning if found
771 MALFORMATION:
772 foreach my $malformation (@malformations) {
773 foreach (my $i = 0; $i < @warnings; $i++) {
774 if ($warnings[$i] =~ /$malformation/) {
775 pass("Expected and got"
776 . "'$malformation' warning");
777 splice @warnings, $i, 1;
778 next MALFORMATION;
779 }
780 }
781 fail("Expected '$malformation' warning"
782 . " but didn't get it");
783
784 }
785 }
786
787 # Any overflow will override any super or above-31
788 # warnings.
789 goto no_warnings_expected
790 if $will_overflow || $this_expected_len
791 < $this_needed_to_discern_len;
792
793 if ( ! $do_warning
794 && ( $warning eq 'utf8'
795 || $warning eq $category))
796 {
797 goto no_warnings_expected;
798 }
799 elsif ($warn_flag) {
800 if (is(scalar @warnings, 1,
801 "$this_name: Got a single warning "))
802 {
803 like($warnings[0], $message,
804 "$this_name: Got expected warning")
805 or diag $call;
806 }
807 else {
808 diag $call;
809 if (scalar @warnings) {
810 output_warnings(@warnings);
811 }
812 }
813 }
814 else {
815 no_warnings_expected:
816 unless (is(scalar @warnings, 0,
817 "$this_name: Got no warnings"))
818 {
819 diag $call;
820 output_warnings(@warnings);
821 }
822 }
823
824 # Check CHECK_ONLY results when the input is
825 # disallowed. Do this when actually disallowed,
826 # not just when the $disallow_flag is set
827 if ($disallowed) {
828 undef @warnings;
829 $ret_ref = test_utf8n_to_uvchr_error(
830 $this_bytes, $this_length,
831 $disallow_flag|$::UTF8_CHECK_ONLY);
832 is($ret_ref->[0], 0,
833 "$this_name, CHECK_ONLY: Returns 0")
834 or diag $call;
835 is($ret_ref->[1], -1,
836 "$this_name: CHECK_ONLY: returns -1 for length")
837 or diag $call;
838 if (! is(scalar @warnings, 0,
839 "$this_name, CHECK_ONLY: no warnings"
840 . " generated"))
841 {
842 diag $call;
843 output_warnings(@warnings);
844 }
845 }
846
847 # Now repeat some of the above, but for
848 # uvchr_to_utf8_flags(). Since this comes from an
849 # existing code point, it hasn't overflowed, and
850 # isn't malformed.
851 next if @malformations;
852
853 # The warning and disallow flags passed in are for
854 # utf8n_to_uvchr_error(). Convert them for
855 # uvchr_to_utf8_flags().
856 my $uvchr_warn_flag = 0;
857 my $uvchr_disallow_flag = 0;
858 if ($warn_flag) {
859 if ($warn_flag == $::UTF8_WARN_SURROGATE) {
860 $uvchr_warn_flag = $::UNICODE_WARN_SURROGATE
861 }
862 elsif ($warn_flag == $::UTF8_WARN_NONCHAR) {
863 $uvchr_warn_flag = $::UNICODE_WARN_NONCHAR
864 }
865 elsif ($warn_flag == $::UTF8_WARN_SUPER) {
866 $uvchr_warn_flag = $::UNICODE_WARN_SUPER
867 }
868 elsif ($warn_flag == $::UTF8_WARN_ABOVE_31_BIT) {
869 $uvchr_warn_flag
870 = $::UNICODE_WARN_ABOVE_31_BIT;
871 }
872 else {
873 fail(sprintf "Unexpected warn flag: %x",
874 $warn_flag);
875 next;
876 }
877 }
878 if ($disallow_flag) {
879 if ($disallow_flag == $::UTF8_DISALLOW_SURROGATE)
880 {
881 $uvchr_disallow_flag
882 = $::UNICODE_DISALLOW_SURROGATE;
883 }
884 elsif ($disallow_flag == $::UTF8_DISALLOW_NONCHAR)
885 {
886 $uvchr_disallow_flag
887 = $::UNICODE_DISALLOW_NONCHAR;
888 }
889 elsif ($disallow_flag == $::UTF8_DISALLOW_SUPER) {
890 $uvchr_disallow_flag
891 = $::UNICODE_DISALLOW_SUPER;
892 }
893 elsif ($disallow_flag
894 == $::UTF8_DISALLOW_ABOVE_31_BIT)
895 {
896 $uvchr_disallow_flag =
897 $::UNICODE_DISALLOW_ABOVE_31_BIT;
898 }
899 else {
900 fail(sprintf "Unexpected disallow flag: %x",
901 $disallow_flag);
902 next;
903 }
904 }
905
906 $disallowed = $uvchr_disallow_flag;
907
908 $this_name = "uvchr_to_utf8_flags() $testname: "
909 . (($uvchr_disallow_flag)
910 ? 'disallowed'
911 : ($disallowed)
912 ? 'ABOVE_31_BIT allowed'
913 : 'allowed');
914 $this_name .= ", $eval_warn";
915 $this_name .= ", " . (($uvchr_warn_flag)
916 ? 'with warning flag'
917 : 'no warning flag');
918
919 undef @warnings;
920 my $ret;
921 my $warn_flag = sprintf "0x%x", $uvchr_warn_flag;
922 my $disallow_flag = sprintf "0x%x",
923 $uvchr_disallow_flag;
924 $call = sprintf(" Call was: $eval_warn; \$ret"
925 . " = test_uvchr_to_utf8_flags("
926 . " 0x%x, $warn_flag|$disallow_flag)",
927 $allowed_uv);
928 $eval_text = "$eval_warn; \$ret ="
929 . " test_uvchr_to_utf8_flags("
930 . "$allowed_uv, $warn_flag|"
931 . "$disallow_flag)";
932 eval "$eval_text";
933 if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
934 {
935 diag "\$!='$!'; eval'd=\"$eval_text\"";
936 next;
937 }
938 if ($disallowed) {
939 is($ret, undef, "$this_name: Returns undef")
940 or diag $call;
941 }
942 else {
943 is($ret, $bytes, "$this_name: Returns expected string")
944 or diag $call;
945 }
946 if (! $do_warning
947 && ($warning eq 'utf8' || $warning eq $category))
948 {
949 if (!is(scalar @warnings, 0,
950 "$this_name: No warnings generated"))
951 {
952 diag $call;
953 output_warnings(@warnings);
954 }
955 }
956 elsif ( $uvchr_warn_flag
957 && ( $warning eq 'utf8'
958 || $warning eq $category))
959 {
960 if (is(scalar @warnings, 1,
961 "$this_name: Got a single warning "))
962 {
963 like($warnings[0], $message,
964 "$this_name: Got expected warning")
965 or diag $call;
966 }
967 else {
968 diag $call;
969 output_warnings(@warnings)
970 if scalar @warnings;
971 }
972 }
973 }
974 }
975 }
976 }
977 }
978 }
979 }
980}
981
982done_testing;