3 # This is a base file to be used by various .t's in its directory
10 require 'charset_tools.pl';
11 require './t/utf8_setup.pl';
16 no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
17 # machines, and that is tested elsewhere
25 local $SIG{__WARN__} = sub { push @warnings, @_ };
27 sub nonportable_regex ($) {
29 # Returns a pattern that matches the non-portable message raised either
30 # for the specific input code point, or the one generated when there
31 # is some malformation that precludes the message containing the specific
34 my $code_point = shift;
36 my $string = sprintf '(Code point 0x%X is not Unicode, and'
37 . '|Any UTF-8 sequence that starts with'
38 . ' "(\\\x[[:xdigit:]]{2})+" is for a'
39 . ' non-Unicode code point, and is) not portable',
44 # Now test the cases where a legal code point is generated, but may or may not
45 # be allowed/warned on.
47 # ($testname, $bytes, $warn_flags, $disallow_flags, $expected_error_flags,
48 # $category, $allowed_uv, $needed_to_discern_len )
50 (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
51 $::UTF8_WARN_SURROGATE, $::UTF8_DISALLOW_SURROGATE, $::UTF8_GOT_SURROGATE,
55 [ "a middle surrogate",
56 (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
57 $::UTF8_WARN_SURROGATE, $::UTF8_DISALLOW_SURROGATE, $::UTF8_GOT_SURROGATE,
61 [ "highest surrogate",
62 (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
63 $::UTF8_WARN_SURROGATE, $::UTF8_DISALLOW_SURROGATE, $::UTF8_GOT_SURROGATE,
67 [ "first non_unicode",
68 (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
69 $::UTF8_WARN_SUPER, $::UTF8_DISALLOW_SUPER, $::UTF8_GOT_SUPER,
70 'non_unicode', 0x110000,
73 [ "non_unicode whose first byte tells that",
74 (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
75 $::UTF8_WARN_SUPER, $::UTF8_DISALLOW_SUPER, $::UTF8_GOT_SUPER,
77 (isASCII) ? 0x140000 : 0x200000,
80 [ "first of 32 consecutive non-character code points",
81 (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
82 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
86 [ "a mid non-character code point of the 32 consecutive ones",
87 (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
88 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
92 [ "final of 32 consecutive non-character code points",
93 (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
94 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
98 [ "non-character code point U+FFFE",
99 (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
100 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
104 [ "non-character code point U+FFFF",
105 (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
106 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
110 [ "non-character code point U+1FFFE",
111 (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
112 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
116 [ "non-character code point U+1FFFF",
117 (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
118 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
122 [ "non-character code point U+2FFFE",
123 (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
124 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
128 [ "non-character code point U+2FFFF",
129 (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
130 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
134 [ "non-character code point U+3FFFE",
135 (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
136 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
140 [ "non-character code point U+3FFFF",
141 (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
142 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
146 [ "non-character code point U+4FFFE",
147 (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
148 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
152 [ "non-character code point U+4FFFF",
153 (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
154 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
158 [ "non-character code point U+5FFFE",
159 (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
160 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
164 [ "non-character code point U+5FFFF",
165 (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
166 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
170 [ "non-character code point U+6FFFE",
171 (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
172 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
176 [ "non-character code point U+6FFFF",
177 (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
178 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
182 [ "non-character code point U+7FFFE",
183 (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
184 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
188 [ "non-character code point U+7FFFF",
189 (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
190 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
194 [ "non-character code point U+8FFFE",
195 (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
196 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
200 [ "non-character code point U+8FFFF",
201 (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
202 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
206 [ "non-character code point U+9FFFE",
207 (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
208 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
212 [ "non-character code point U+9FFFF",
213 (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
214 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
218 [ "non-character code point U+AFFFE",
219 (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
220 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
224 [ "non-character code point U+AFFFF",
225 (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
226 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
230 [ "non-character code point U+BFFFE",
231 (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
232 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
236 [ "non-character code point U+BFFFF",
237 (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
238 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
242 [ "non-character code point U+CFFFE",
243 (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
244 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
248 [ "non-character code point U+CFFFF",
249 (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
250 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
254 [ "non-character code point U+DFFFE",
255 (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
256 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
260 [ "non-character code point U+DFFFF",
261 (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
262 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
266 [ "non-character code point U+EFFFE",
267 (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
268 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
272 [ "non-character code point U+EFFFF",
273 (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
274 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
278 [ "non-character code point U+FFFFE",
279 (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
280 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
284 [ "non-character code point U+FFFFF",
285 (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
286 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
290 [ "non-character code point U+10FFFE",
291 (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
292 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
296 [ "non-character code point U+10FFFF",
297 (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
298 $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
302 [ "requires at least 32 bits",
304 ? "\xfe\x82\x80\x80\x80\x80\x80"
306 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
307 # This code point is chosen so that it is representable in a UV on
309 $::UTF8_WARN_ABOVE_31_BIT, $::UTF8_DISALLOW_ABOVE_31_BIT,
310 $::UTF8_GOT_ABOVE_31_BIT,
314 [ "highest 32 bit code point",
316 ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
318 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
319 $::UTF8_WARN_ABOVE_31_BIT, $::UTF8_DISALLOW_ABOVE_31_BIT,
320 $::UTF8_GOT_ABOVE_31_BIT,
324 [ "requires at least 32 bits, and use SUPER-type flags, instead of"
327 ? "\xfe\x82\x80\x80\x80\x80\x80"
329 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
330 $::UTF8_WARN_SUPER, $::UTF8_DISALLOW_SUPER, $::UTF8_GOT_SUPER,
334 [ "overflow with warnings/disallow for more than 31 bits",
335 # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT
336 # with overflow. The overflow malformation is never allowed, so
337 # preventing it takes precedence if the ABOVE_31_BIT options would
338 # otherwise allow in an overflowing value. The ASCII code points (1
339 # for 32-bits; 1 for 64) were chosen because the old overflow
340 # detection algorithm did not catch them; this means this test also
341 # checks for that fix. The EBCDIC are arbitrary overflowing ones
342 # since we have no reports of failures with it.
345 ? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
347 "\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"))
349 ? "\xfe\x86\x80\x80\x80\x80\x80"
351 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))),
352 $::UTF8_WARN_ABOVE_31_BIT,
353 $::UTF8_DISALLOW_ABOVE_31_BIT,
354 $::UTF8_GOT_ABOVE_31_BIT,
356 (isASCII || $::is64bit) ? 2 : 8,
362 no warnings qw{portable overflow};
364 [ "Lowest 33 bit code point: overflow",
365 "\xFE\x84\x80\x80\x80\x80\x80",
366 $::UTF8_WARN_ABOVE_31_BIT, $::UTF8_DISALLOW_ABOVE_31_BIT,
367 $::UTF8_GOT_ABOVE_31_BIT,
374 no warnings qw{portable overflow};
376 [ "More than 32 bits",
378 ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
380 "\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
381 $::UTF8_WARN_ABOVE_31_BIT, $::UTF8_DISALLOW_ABOVE_31_BIT,
382 $::UTF8_GOT_ABOVE_31_BIT,
383 'utf8', 0x1000000000,
387 push @tests, # These could falsely show wrongly in a naive
389 [ "requires at least 32 bits",
391 "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
392 $::UTF8_WARN_ABOVE_31_BIT,$::UTF8_DISALLOW_ABOVE_31_BIT,
393 $::UTF8_GOT_ABOVE_31_BIT,
397 [ "requires at least 32 bits",
399 "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
400 $::UTF8_WARN_ABOVE_31_BIT,$::UTF8_DISALLOW_ABOVE_31_BIT,
401 $::UTF8_GOT_ABOVE_31_BIT,
402 'utf8', 0x10000000000,
405 [ "requires at least 32 bits",
407 "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
408 $::UTF8_WARN_ABOVE_31_BIT,$::UTF8_DISALLOW_ABOVE_31_BIT,
409 $::UTF8_GOT_ABOVE_31_BIT,
410 'utf8', 0x200000000000,
413 [ "requires at least 32 bits",
415 "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
416 $::UTF8_WARN_ABOVE_31_BIT,$::UTF8_DISALLOW_ABOVE_31_BIT,
417 $::UTF8_GOT_ABOVE_31_BIT,
418 'utf8', 0x4000000000000,
421 [ "requires at least 32 bits",
423 "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
424 $::UTF8_WARN_ABOVE_31_BIT,$::UTF8_DISALLOW_ABOVE_31_BIT,
425 $::UTF8_GOT_ABOVE_31_BIT,
426 'utf8', 0x80000000000000,
429 [ "requires at least 32 bits",
431 "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
432 $::UTF8_WARN_ABOVE_31_BIT,$::UTF8_DISALLOW_ABOVE_31_BIT,
433 $::UTF8_GOT_ABOVE_31_BIT,
434 'utf8', 0x1000000000000000,
440 # This test is split into this number of files.
441 my $num_test_files = $ENV{TEST_JOBS} || 1;
442 $num_test_files = 10 if $num_test_files > 10;
445 foreach my $test (@tests) {
447 next if $test_count % $num_test_files != $::TEST_CHUNK;
449 my ($testname, $bytes, $warn_flags, $disallow_flags, $expected_error_flags,
450 $category, $allowed_uv, $needed_to_discern_len
453 my $length = length $bytes;
454 my $will_overflow = $allowed_uv < 0;
457 if ($allowed_uv > 0x7FFFFFFF) {
458 $message = nonportable_regex($allowed_uv);
460 elsif ($allowed_uv > 0x10FFFF) {
461 $message = qr/(not Unicode|for a non-Unicode code point).* may not be portable/;
463 elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
464 $message = qr/surrogate/;
466 elsif ( ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
467 || ($allowed_uv & 0xFFFE) == 0xFFFE)
469 $message = qr/Unicode non-character.*is not recommended for open interchange/;
471 elsif ($will_overflow) {
472 $message = qr/overflows/;
475 die "Can't figure out what type of warning to test for $testname"
481 my $ret = test_isUTF8_CHAR($bytes, $length);
482 my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
483 if ($will_overflow) {
484 is($ret, 0, "isUTF8_CHAR() $testname: returns 0");
485 is($ret_flags, 0, "isUTF8_CHAR_flags() $testname: returns 0");
489 "isUTF8_CHAR() $testname: returns expected length: $length");
490 is($ret_flags, $length, "isUTF8_CHAR_flags(...,0) $testname:"
491 . " returns expected length: $length");
493 is(scalar @warnings, 0,
494 "isUTF8_CHAR() and isUTF8_CHAR()_flags $testname: generated"
496 or output_warnings(@warnings);
499 $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
500 if ($will_overflow) {
501 is($ret, 0, "isSTRICT_UTF8_CHAR() $testname: returns 0");
504 my $expected_ret = ( $testname =~ /surrogate|non-character/
505 || $allowed_uv > 0x10FFFF)
508 is($ret, $expected_ret, "isSTRICT_UTF8_CHAR() $testname: returns"
509 . " expected length: $expected_ret");
510 $ret = test_isUTF8_CHAR_flags($bytes, $length,
511 $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
512 is($ret, $expected_ret,
513 "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')"
514 . " acts like isSTRICT_UTF8_CHAR");
516 is(scalar @warnings, 0,
517 "isSTRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:"
518 . " generated no warnings")
519 or output_warnings(@warnings);
522 $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
523 if ($will_overflow) {
524 is($ret, 0, "isC9_STRICT_UTF8_CHAR() $testname: returns 0");
527 my $expected_ret = ( $testname =~ /surrogate/
528 || $allowed_uv > 0x10FFFF)
531 is($ret, $expected_ret, "isC9_STRICT_UTF8_CHAR() $testname:"
532 ." returns expected length: $expected_ret");
533 $ret = test_isUTF8_CHAR_flags($bytes, $length,
534 $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
535 is($ret, $expected_ret,
536 "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')"
537 . " acts like isC9_STRICT_UTF8_CHAR");
539 is(scalar @warnings, 0,
540 "isC9_STRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:"
541 . " generated no warnings")
542 or output_warnings(@warnings);
544 # Test partial character handling, for each byte not a full character
545 for my $j (1.. $length - 1) {
547 # Skip the test for the interaction between overflow and above-31
548 # bit. It is really testing other things than the partial
549 # character tests, for which other tests in this file are
551 last if $will_overflow;
553 foreach my $disallow_flag (0, $disallow_flags) {
554 my $partial = substr($bytes, 0, $j);
557 if ($disallow_flag) {
559 $comment = "disallowed";
560 if ($j < $needed_to_discern_len) {
562 $comment .= ", but need $needed_to_discern_len bytes"
568 $comment = "allowed";
573 $ret = test_is_utf8_valid_partial_char_flags($partial, $j,
575 is($ret, $ret_should_be,
576 "$testname: is_utf8_valid_partial_char_flags("
577 . display_bytes($partial)
578 . "), $comment: returns $ret_should_be");
579 is(scalar @warnings, 0,
580 "$testname: is_utf8_valid_partial_char_flags()"
581 . " generated no warnings")
582 or output_warnings(@warnings);
587 # This is more complicated than the malformations tested earlier, as there
588 # are several orthogonal variables involved. We test all the subclasses
589 # of utf8 warnings to verify they work with and without the utf8 class,
590 # and don't have effects on other sublass warnings
591 foreach my $warning ('utf8', 'surrogate', 'nonchar', 'non_unicode') {
592 foreach my $warn_flag (0, $warn_flags) {
593 foreach my $disallow_flag (0, $disallow_flags) {
594 foreach my $do_warning (0, 1) {
596 # We try each of the above with various combinations of
597 # malformations that can occur on the same input sequence.
598 foreach my $short ("", "short") {
599 foreach my $unexpected_noncont ("",
600 "unexpected non-continuation")
602 foreach my $overlong ("", "overlong") {
604 # If we're creating an overlong, it can't be longer than
605 # the maximum length, so skip if we're already at that
607 next if $overlong && $length >= $::max_bytes;
611 push @malformations, $short if $short;
612 push @malformations, $unexpected_noncont
613 if $unexpected_noncont;
614 push @malformations, $overlong if $overlong;
616 # The overflow malformation test in the input
617 # array is coerced into being treated like one of
619 if ($will_overflow) {
620 push @malformations, 'overflow';
621 push @expected_errors, $::UTF8_GOT_OVERFLOW;
624 my $malformations_name = join "/", @malformations;
625 $malformations_name .= " malformation"
626 if $malformations_name;
627 $malformations_name .= "s" if @malformations > 1;
628 my $this_bytes = $bytes;
629 my $this_length = $length;
630 my $expected_uv = $allowed_uv;
631 my $this_expected_len = $length;
632 my $this_needed_to_discern_len = $needed_to_discern_len;
633 if ($malformations_name) {
636 # Coerce the input into the desired
638 if ($malformations_name =~ /overlong/) {
640 # For an overlong, we convert the original
641 # start byte into a continuation byte with
642 # the same data bits as originally. ...
643 substr($this_bytes, 0, 1)
644 = start_byte_to_cont(substr($this_bytes,
647 # ... Then we prepend it with a known
648 # overlong sequence. This should evaluate
649 # to the exact same code point as the
652 = I8_to_native("\xff")
653 . (I8_to_native(chr $::lowest_continuation)
654 x ( $::max_bytes - 1 - length($this_bytes)))
656 $this_length = length($this_bytes);
657 $this_needed_to_discern_len
658 = $::max_bytes - ($this_expected_len
659 - $this_needed_to_discern_len);
660 $this_expected_len = $::max_bytes;
661 push @expected_errors, $::UTF8_GOT_LONG;
663 if ($malformations_name =~ /short/) {
665 # Just tell the test to not look far
666 # enough into the input.
668 $this_expected_len--;
669 push @expected_errors, $::UTF8_GOT_SHORT;
671 if ($malformations_name
672 =~ /non-continuation/)
674 # Change the final continuation byte into
676 my $pos = ($short) ? -2 : -1;
677 substr($this_bytes, $pos, 1) = '?';
678 $this_expected_len--;
679 push @expected_errors,
680 $::UTF8_GOT_NON_CONTINUATION;
684 my $eval_warn = $do_warning
685 ? "use warnings '$warning'"
687 ? "no warnings 'utf8'"
688 : ( "use warnings 'utf8';"
689 . " no warnings '$warning'");
691 # Is effectively disallowed if we've set up a
692 # malformation, even if the flag indicates it is
693 # allowed. Fix up test name to indicate this as
695 my $disallowed = $disallow_flag
696 || $malformations_name;
697 my $this_name = "utf8n_to_uvchr_error() $testname: "
703 $this_name .= ", $eval_warn";
704 $this_name .= ", " . (($warn_flag)
705 ? 'with warning flag'
706 : 'no warning flag');
710 my $display_bytes = display_bytes($this_bytes);
711 my $call = " Call was: $eval_warn; \$ret_ref"
712 . " = test_utf8n_to_uvchr_error("
713 . "'$display_bytes', $this_length,"
715 . "|$disallow_flag)";
716 my $eval_text = "$eval_warn; \$ret_ref"
717 . " = test_utf8n_to_uvchr_error("
719 . " $this_length, $warn_flag"
720 . "|$disallow_flag)";
722 if (! ok ("$@ eq ''",
723 "$this_name: eval succeeded"))
725 diag "\$!='$!'; eval'd=\"$call\"";
729 is($ret_ref->[0], 0, "$this_name: Returns 0")
733 is($ret_ref->[0], $expected_uv,
734 "$this_name: Returns expected uv: "
735 . sprintf("0x%04X", $expected_uv))
738 is($ret_ref->[1], $this_expected_len,
739 "$this_name: Returns expected length:"
740 . " $this_expected_len")
743 my $errors = $ret_ref->[2];
745 for (my $i = @expected_errors - 1; $i >= 0; $i--) {
746 if (ok($expected_errors[$i] & $errors,
747 "Expected and got error bit return"
748 . " for $malformations[$i] malformation"))
750 $errors &= ~$expected_errors[$i];
752 splice @expected_errors, $i, 1;
754 is(scalar @expected_errors, 0,
755 "Got all the expected malformation errors")
756 or diag Dumper \@expected_errors;
758 if ( $this_expected_len >= $this_needed_to_discern_len
759 && ($warn_flag || $disallow_flag))
761 is($errors, $expected_error_flags,
762 "Got the correct error flag")
766 is($errors, 0, "Got no other error flag");
769 if (@malformations) {
770 if (! $do_warning && $warning eq 'utf8') {
771 goto no_warnings_expected;
774 # Check that each malformation generates a
775 # warning, removing that warning if found
777 foreach my $malformation (@malformations) {
778 foreach (my $i = 0; $i < @warnings; $i++) {
779 if ($warnings[$i] =~ /$malformation/) {
780 pass("Expected and got"
781 . "'$malformation' warning");
782 splice @warnings, $i, 1;
786 fail("Expected '$malformation' warning"
787 . " but didn't get it");
792 # Any overflow will override any super or above-31
794 goto no_warnings_expected
795 if $will_overflow || $this_expected_len
796 < $this_needed_to_discern_len;
799 && ( $warning eq 'utf8'
800 || $warning eq $category))
802 goto no_warnings_expected;
805 if (is(scalar @warnings, 1,
806 "$this_name: Got a single warning "))
808 like($warnings[0], $message,
809 "$this_name: Got expected warning")
814 if (scalar @warnings) {
815 output_warnings(@warnings);
820 no_warnings_expected:
821 unless (is(scalar @warnings, 0,
822 "$this_name: Got no warnings"))
825 output_warnings(@warnings);
829 # Check CHECK_ONLY results when the input is
830 # disallowed. Do this when actually disallowed,
831 # not just when the $disallow_flag is set
834 $ret_ref = test_utf8n_to_uvchr_error(
835 $this_bytes, $this_length,
836 $disallow_flag|$::UTF8_CHECK_ONLY);
838 "$this_name, CHECK_ONLY: Returns 0")
840 is($ret_ref->[1], -1,
841 "$this_name: CHECK_ONLY: returns -1 for length")
843 if (! is(scalar @warnings, 0,
844 "$this_name, CHECK_ONLY: no warnings"
848 output_warnings(@warnings);
852 # Now repeat some of the above, but for
853 # uvchr_to_utf8_flags(). Since this comes from an
854 # existing code point, it hasn't overflowed, and
856 next if @malformations;
858 # The warning and disallow flags passed in are for
859 # utf8n_to_uvchr_error(). Convert them for
860 # uvchr_to_utf8_flags().
861 my $uvchr_warn_flag = 0;
862 my $uvchr_disallow_flag = 0;
864 if ($warn_flag == $::UTF8_WARN_SURROGATE) {
865 $uvchr_warn_flag = $::UNICODE_WARN_SURROGATE
867 elsif ($warn_flag == $::UTF8_WARN_NONCHAR) {
868 $uvchr_warn_flag = $::UNICODE_WARN_NONCHAR
870 elsif ($warn_flag == $::UTF8_WARN_SUPER) {
871 $uvchr_warn_flag = $::UNICODE_WARN_SUPER
873 elsif ($warn_flag == $::UTF8_WARN_ABOVE_31_BIT) {
875 = $::UNICODE_WARN_ABOVE_31_BIT;
878 fail(sprintf "Unexpected warn flag: %x",
883 if ($disallow_flag) {
884 if ($disallow_flag == $::UTF8_DISALLOW_SURROGATE)
887 = $::UNICODE_DISALLOW_SURROGATE;
889 elsif ($disallow_flag == $::UTF8_DISALLOW_NONCHAR)
892 = $::UNICODE_DISALLOW_NONCHAR;
894 elsif ($disallow_flag == $::UTF8_DISALLOW_SUPER) {
896 = $::UNICODE_DISALLOW_SUPER;
898 elsif ($disallow_flag
899 == $::UTF8_DISALLOW_ABOVE_31_BIT)
901 $uvchr_disallow_flag =
902 $::UNICODE_DISALLOW_ABOVE_31_BIT;
905 fail(sprintf "Unexpected disallow flag: %x",
911 $disallowed = $uvchr_disallow_flag;
913 $this_name = "uvchr_to_utf8_flags() $testname: "
914 . (($uvchr_disallow_flag)
917 ? 'ABOVE_31_BIT allowed'
919 $this_name .= ", $eval_warn";
920 $this_name .= ", " . (($uvchr_warn_flag)
921 ? 'with warning flag'
922 : 'no warning flag');
926 my $warn_flag = sprintf "0x%x", $uvchr_warn_flag;
927 my $disallow_flag = sprintf "0x%x",
928 $uvchr_disallow_flag;
929 $call = sprintf(" Call was: $eval_warn; \$ret"
930 . " = test_uvchr_to_utf8_flags("
931 . " 0x%x, $warn_flag|$disallow_flag)",
933 $eval_text = "$eval_warn; \$ret ="
934 . " test_uvchr_to_utf8_flags("
935 . "$allowed_uv, $warn_flag|"
938 if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
940 diag "\$!='$!'; eval'd=\"$eval_text\"";
944 is($ret, undef, "$this_name: Returns undef")
948 is($ret, $bytes, "$this_name: Returns expected string")
952 && ($warning eq 'utf8' || $warning eq $category))
954 if (!is(scalar @warnings, 0,
955 "$this_name: No warnings generated"))
958 output_warnings(@warnings);
961 elsif ( $uvchr_warn_flag
962 && ( $warning eq 'utf8'
963 || $warning eq $category))
965 if (is(scalar @warnings, 1,
966 "$this_name: Got a single warning "))
968 like($warnings[0], $message,
969 "$this_name: Got expected warning")
974 output_warnings(@warnings)