3 # This is a base file to be used by various .t's in its directory
10 require 'charset_tools.pl';
11 require './t/utf8_setup.pl';
16 no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
17 # machines, and that is tested elsewhere
25 local $SIG{__WARN__} = sub { push @warnings_gotten, @_ };
27 sub nonportable_regex ($) {
29 # Returns a pattern that matches the non-portable message raised either
30 # for the specific input code point, or the one generated when there
31 # is some malformation that precludes the message containing the specific
34 my $code_point = shift;
36 my $string = sprintf '(Code point 0x%X is not Unicode, and'
37 . '|Any UTF-8 sequence that starts with'
38 . ' "(\\\x[[:xdigit:]]{2})+" is for a'
39 . ' non-Unicode code point, and is) not portable',
44 # Now test the cases where a legal code point is generated, but may or may not
45 # be allowed/warned on.
47 # ($testname, $bytes, $disallow_flags, $controlling_warning_category,
48 # $allowed_uv, $needed_to_discern_len )
50 (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
51 $::UTF8_DISALLOW_SURROGATE,
54 [ "a middle surrogate",
55 (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
56 $::UTF8_DISALLOW_SURROGATE,
59 [ "highest surrogate",
60 (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
61 $::UTF8_DISALLOW_SURROGATE,
64 [ "first non_unicode",
65 (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
66 $::UTF8_DISALLOW_SUPER,
67 'non_unicode', 0x110000,
70 [ "non_unicode whose first byte tells that",
71 (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
72 $::UTF8_DISALLOW_SUPER,
74 (isASCII) ? 0x140000 : 0x200000,
77 [ "first of 32 consecutive non-character code points",
78 (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
79 $::UTF8_DISALLOW_NONCHAR,
82 [ "a mid non-character code point of the 32 consecutive ones",
83 (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
84 $::UTF8_DISALLOW_NONCHAR,
87 [ "final of 32 consecutive non-character code points",
88 (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
89 $::UTF8_DISALLOW_NONCHAR,
92 [ "non-character code point U+FFFE",
93 (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
94 $::UTF8_DISALLOW_NONCHAR,
97 [ "non-character code point U+FFFF",
98 (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
99 $::UTF8_DISALLOW_NONCHAR,
102 [ "non-character code point U+1FFFE",
103 (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
104 $::UTF8_DISALLOW_NONCHAR,
107 [ "non-character code point U+1FFFF",
108 (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
109 $::UTF8_DISALLOW_NONCHAR,
112 [ "non-character code point U+2FFFE",
113 (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
114 $::UTF8_DISALLOW_NONCHAR,
117 [ "non-character code point U+2FFFF",
118 (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
119 $::UTF8_DISALLOW_NONCHAR,
122 [ "non-character code point U+3FFFE",
123 (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
124 $::UTF8_DISALLOW_NONCHAR,
127 [ "non-character code point U+3FFFF",
128 (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
129 $::UTF8_DISALLOW_NONCHAR,
132 [ "non-character code point U+4FFFE",
133 (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
134 $::UTF8_DISALLOW_NONCHAR,
137 [ "non-character code point U+4FFFF",
138 (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
139 $::UTF8_DISALLOW_NONCHAR,
142 [ "non-character code point U+5FFFE",
143 (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
144 $::UTF8_DISALLOW_NONCHAR,
147 [ "non-character code point U+5FFFF",
148 (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
149 $::UTF8_DISALLOW_NONCHAR,
152 [ "non-character code point U+6FFFE",
153 (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
154 $::UTF8_DISALLOW_NONCHAR,
157 [ "non-character code point U+6FFFF",
158 (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
159 $::UTF8_DISALLOW_NONCHAR,
162 [ "non-character code point U+7FFFE",
163 (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
164 $::UTF8_DISALLOW_NONCHAR,
167 [ "non-character code point U+7FFFF",
168 (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
169 $::UTF8_DISALLOW_NONCHAR,
172 [ "non-character code point U+8FFFE",
173 (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
174 $::UTF8_DISALLOW_NONCHAR,
177 [ "non-character code point U+8FFFF",
178 (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
179 $::UTF8_DISALLOW_NONCHAR,
182 [ "non-character code point U+9FFFE",
183 (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
184 $::UTF8_DISALLOW_NONCHAR,
187 [ "non-character code point U+9FFFF",
188 (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
189 $::UTF8_DISALLOW_NONCHAR,
192 [ "non-character code point U+AFFFE",
193 (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
194 $::UTF8_DISALLOW_NONCHAR,
197 [ "non-character code point U+AFFFF",
198 (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
199 $::UTF8_DISALLOW_NONCHAR,
202 [ "non-character code point U+BFFFE",
203 (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
204 $::UTF8_DISALLOW_NONCHAR,
207 [ "non-character code point U+BFFFF",
208 (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
209 $::UTF8_DISALLOW_NONCHAR,
212 [ "non-character code point U+CFFFE",
213 (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
214 $::UTF8_DISALLOW_NONCHAR,
217 [ "non-character code point U+CFFFF",
218 (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
219 $::UTF8_DISALLOW_NONCHAR,
222 [ "non-character code point U+DFFFE",
223 (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
224 $::UTF8_DISALLOW_NONCHAR,
227 [ "non-character code point U+DFFFF",
228 (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
229 $::UTF8_DISALLOW_NONCHAR,
232 [ "non-character code point U+EFFFE",
233 (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
234 $::UTF8_DISALLOW_NONCHAR,
237 [ "non-character code point U+EFFFF",
238 (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
239 $::UTF8_DISALLOW_NONCHAR,
242 [ "non-character code point U+FFFFE",
243 (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
244 $::UTF8_DISALLOW_NONCHAR,
247 [ "non-character code point U+FFFFF",
248 (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
249 $::UTF8_DISALLOW_NONCHAR,
252 [ "non-character code point U+10FFFE",
253 (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
254 $::UTF8_DISALLOW_NONCHAR,
257 [ "non-character code point U+10FFFF",
258 (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
259 $::UTF8_DISALLOW_NONCHAR,
262 [ "requires at least 32 bits",
264 ? "\xfe\x82\x80\x80\x80\x80\x80"
266 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
267 # This code point is chosen so that it is representable in a UV on
269 $::UTF8_DISALLOW_ABOVE_31_BIT,
273 [ "highest 32 bit code point",
275 ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
277 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
278 $::UTF8_DISALLOW_ABOVE_31_BIT,
282 [ "requires at least 32 bits, and use SUPER-type flags, instead of"
285 ? "\xfe\x82\x80\x80\x80\x80\x80"
287 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
288 $::UTF8_DISALLOW_SUPER,
292 [ "overflow with warnings/disallow for more than 31 bits",
293 # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT
294 # with overflow. The overflow malformation is never allowed, so
295 # preventing it takes precedence if the ABOVE_31_BIT options would
296 # otherwise allow in an overflowing value. The ASCII code points (1
297 # for 32-bits; 1 for 64) were chosen because the old overflow
298 # detection algorithm did not catch them; this means this test also
299 # checks for that fix. The EBCDIC are arbitrary overflowing ones
300 # since we have no reports of failures with it.
303 ? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
305 "\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"))
307 ? "\xfe\x86\x80\x80\x80\x80\x80"
309 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))),
310 $::UTF8_DISALLOW_ABOVE_31_BIT,
312 (isASCII || $::is64bit) ? 2 : 8,
318 no warnings qw{portable overflow};
320 [ "Lowest 33 bit code point: overflow",
321 "\xFE\x84\x80\x80\x80\x80\x80",
322 $::UTF8_DISALLOW_ABOVE_31_BIT,
329 no warnings qw{portable overflow};
331 [ "More than 32 bits",
333 ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
335 "\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
336 $::UTF8_DISALLOW_ABOVE_31_BIT,
337 'utf8', 0x1000000000,
341 push @tests, # These could falsely show wrongly in a naive
343 [ "requires at least 32 bits",
345 "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
346 $::UTF8_DISALLOW_ABOVE_31_BIT,
350 [ "requires at least 32 bits",
352 "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
353 $::UTF8_DISALLOW_ABOVE_31_BIT,
354 'utf8', 0x10000000000,
357 [ "requires at least 32 bits",
359 "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
360 $::UTF8_DISALLOW_ABOVE_31_BIT,
361 'utf8', 0x200000000000,
364 [ "requires at least 32 bits",
366 "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
367 $::UTF8_DISALLOW_ABOVE_31_BIT,
368 'utf8', 0x4000000000000,
371 [ "requires at least 32 bits",
373 "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
374 $::UTF8_DISALLOW_ABOVE_31_BIT,
375 'utf8', 0x80000000000000,
378 [ "requires at least 32 bits",
380 "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
381 $::UTF8_DISALLOW_ABOVE_31_BIT,
382 'utf8', 0x1000000000000000,
388 sub flags_to_text($$)
390 my ($flags, $flags_to_text_ref) = @_;
392 # Returns a string containing a mnemonic representation of the bits that
393 # are set in the $flags. These are assumed to be flag bits. The return
394 # looks like "FOO|BAR|BAZ". The second parameter is a reference to an
395 # array that gives the textual representation of all the possible flags.
396 # Element 0 is the text for the bit 0 flag; element 1 for bit 1; .... If
397 # no bits at all are set the string "0" is returned;
402 return "0" if $flags == 0;
405 #diag sprintf "%x", $flags;
407 push @flag_text, $flags_to_text_ref->[$shift];
413 return join "|", @flag_text;
416 # Possible flag returns from utf8n_to_uvchr_error(). These should have G_,
417 # instead of A_, D_, but the prefixes will be used in a a later commit, so
418 # minimize churn by having them here.
419 my @utf8n_flags_to_text = ( qw(
437 NO_CONFIDENCE_IN_CURLEN_
440 sub utf8n_display_call($)
442 # Converts an eval string that calls test_utf8n_to_uvchr into a more human
443 # readable form, and returns it. Doesn't work if the byte string contains
444 # an apostrophe. The return will look something like:
445 # test_utf8n_to_uvchr_error('$bytes', $length, $flags)
448 $_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x;
449 my $text1 = $1; # Everything before the byte string
451 my $text2 = $3; # Includes the length
455 . display_bytes($bytes)
457 . flags_to_text($flags, \@utf8n_flags_to_text)
461 sub uvchr_display_call($)
463 # Converts an eval string that calls test_uvchr_to_utf8 into a more human
464 # readable form, and returns it. The return will look something like:
465 # test_uvchr_to_utf8n_flags($uv, $flags)
468 my @flags_to_text = ( qw(
479 $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x;
481 my $cp = sprintf "%X", $2;
484 return "${text}0x$cp, " . flags_to_text($flags, \@flags_to_text) . ')';
487 # This test is split into this number of files.
488 my $num_test_files = $ENV{TEST_JOBS} || 1;
489 $num_test_files = 10 if $num_test_files > 10;
492 foreach my $test (@tests) {
494 next if $test_count % $num_test_files != $::TEST_CHUNK;
496 my ($testname, $bytes, $disallow_flags,
497 $controlling_warning_category, $allowed_uv, $needed_to_discern_len
500 my $length = length $bytes;
501 my $will_overflow = $allowed_uv < 0;
503 # The convention is that the got flag is the same value as the disallow
504 # one, and the warn flag is the next bit over. If this were violated, the
505 # tests here should start failing. We could do an eval under no strict to
507 my $expected_error_flags = $disallow_flags;
508 my $warn_flags = $disallow_flags << 1;
511 if ($allowed_uv > 0x7FFFFFFF) {
512 $message = nonportable_regex($allowed_uv);
514 elsif ($allowed_uv > 0x10FFFF) {
515 $message = qr/(not Unicode|for a non-Unicode code point).* may not be portable/;
517 elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
518 $message = qr/surrogate/;
519 $needed_to_discern_len = 2 unless defined $needed_to_discern_len;
521 elsif ( ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
522 || ($allowed_uv & 0xFFFE) == 0xFFFE)
524 $message = qr/Unicode non-character.*is not recommended for open interchange/;
525 $needed_to_discern_len = $length unless defined $needed_to_discern_len;
527 elsif ($will_overflow) {
528 $message = qr/overflows/;
531 die "Can't figure out what type of warning to test for $testname"
534 die 'Didn\'t set $needed_to_discern_len for ' . $testname
535 unless defined $needed_to_discern_len;
539 undef @warnings_gotten;
540 my $ret = test_isUTF8_CHAR($bytes, $length);
541 my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
542 if ($will_overflow) {
543 is($ret, 0, "isUTF8_CHAR() $testname: returns 0");
544 is($ret_flags, 0, "isUTF8_CHAR_flags() $testname: returns 0");
548 "isUTF8_CHAR() $testname: returns expected length: $length");
549 is($ret_flags, $length, "isUTF8_CHAR_flags(...,0) $testname:"
550 . " returns expected length: $length");
552 is(scalar @warnings_gotten, 0,
553 "isUTF8_CHAR() and isUTF8_CHAR()_flags $testname: generated"
555 or output_warnings(@warnings_gotten);
557 undef @warnings_gotten;
558 $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
559 if ($will_overflow) {
560 is($ret, 0, "isSTRICT_UTF8_CHAR() $testname: returns 0");
563 my $expected_ret = ( $testname =~ /surrogate|non-character/
564 || $allowed_uv > 0x10FFFF)
567 is($ret, $expected_ret, "isSTRICT_UTF8_CHAR() $testname: returns"
568 . " expected length: $expected_ret");
569 $ret = test_isUTF8_CHAR_flags($bytes, $length,
570 $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
571 is($ret, $expected_ret,
572 "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')"
573 . " acts like isSTRICT_UTF8_CHAR");
575 is(scalar @warnings_gotten, 0,
576 "isSTRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:"
577 . " generated no warnings")
578 or output_warnings(@warnings_gotten);
580 undef @warnings_gotten;
581 $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
582 if ($will_overflow) {
583 is($ret, 0, "isC9_STRICT_UTF8_CHAR() $testname: returns 0");
586 my $expected_ret = ( $testname =~ /surrogate/
587 || $allowed_uv > 0x10FFFF)
590 is($ret, $expected_ret, "isC9_STRICT_UTF8_CHAR() $testname:"
591 ." returns expected length: $expected_ret");
592 $ret = test_isUTF8_CHAR_flags($bytes, $length,
593 $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
594 is($ret, $expected_ret,
595 "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')"
596 . " acts like isC9_STRICT_UTF8_CHAR");
598 is(scalar @warnings_gotten, 0,
599 "isC9_STRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:"
600 . " generated no warnings")
601 or output_warnings(@warnings_gotten);
603 # Test partial character handling, for each byte not a full character
604 for my $j (1.. $length - 1) {
606 # Skip the test for the interaction between overflow and above-31
607 # bit. It is really testing other things than the partial
608 # character tests, for which other tests in this file are
610 last if $will_overflow;
612 foreach my $disallow_flag (0, $disallow_flags) {
613 my $partial = substr($bytes, 0, $j);
616 if ($disallow_flag) {
618 $comment = "disallowed";
619 if ($j < $needed_to_discern_len) {
621 $comment .= ", but need $needed_to_discern_len bytes"
627 $comment = "allowed";
630 undef @warnings_gotten;
632 $ret = test_is_utf8_valid_partial_char_flags($partial, $j,
634 is($ret, $ret_should_be,
635 "$testname: is_utf8_valid_partial_char_flags("
636 . display_bytes($partial)
637 . "), $comment: returns $ret_should_be");
638 is(scalar @warnings_gotten, 0,
639 "$testname: is_utf8_valid_partial_char_flags()"
640 . " generated no warnings")
641 or output_warnings(@warnings_gotten);
646 # This is more complicated than the malformations tested earlier, as there
647 # are several orthogonal variables involved. We test all the subclasses
648 # of utf8 warnings to verify they work with and without the utf8 class,
649 # and don't have effects on other sublass warnings
650 foreach my $trial_warning_category ('utf8', 'surrogate', 'nonchar', 'non_unicode') {
651 foreach my $warn_flag (0, $warn_flags) {
652 foreach my $disallow_flag (0, $disallow_flags) {
653 foreach my $do_warning (0, 1) {
655 # We try each of the above with various combinations of
656 # malformations that can occur on the same input sequence.
657 foreach my $short ("", "short") {
658 foreach my $unexpected_noncont ("",
659 "unexpected non-continuation")
661 foreach my $overlong ("", "overlong") {
663 # If we're creating an overlong, it can't be longer than
664 # the maximum length, so skip if we're already at that
666 next if $overlong && $length >= $::max_bytes;
669 my @expected_return_flags;
670 push @malformations, $short if $short;
671 push @malformations, $unexpected_noncont
672 if $unexpected_noncont;
673 push @malformations, $overlong if $overlong;
675 # The overflow malformation test in the input
676 # array is coerced into being treated like one of
678 if ($will_overflow) {
679 push @malformations, 'overflow';
680 push @expected_return_flags, $::UTF8_GOT_OVERFLOW;
683 my $malformations_name = join "/", @malformations;
684 $malformations_name .= " malformation"
685 if $malformations_name;
686 $malformations_name .= "s" if @malformations > 1;
687 my $this_bytes = $bytes;
688 my $this_length = $length;
689 my $expected_uv = $allowed_uv;
690 my $this_expected_len = $length;
691 my $this_needed_to_discern_len = $needed_to_discern_len;
692 if ($malformations_name) {
695 # Coerce the input into the desired
697 if ($malformations_name =~ /overlong/) {
699 # For an overlong, we convert the original
700 # start byte into a continuation byte with
701 # the same data bits as originally. ...
702 substr($this_bytes, 0, 1)
703 = start_byte_to_cont(substr($this_bytes,
706 # ... Then we prepend it with a known
707 # overlong sequence. This should evaluate
708 # to the exact same code point as the
711 = I8_to_native("\xff")
712 . (I8_to_native(chr $::lowest_continuation)
713 x ( $::max_bytes - 1 - length($this_bytes)))
715 $this_length = length($this_bytes);
716 $this_needed_to_discern_len
717 = $::max_bytes - ($this_expected_len
718 - $this_needed_to_discern_len);
719 $this_expected_len = $::max_bytes;
720 push @expected_return_flags, $::UTF8_GOT_LONG;
722 if ($malformations_name =~ /short/) {
724 # Just tell the test to not look far
725 # enough into the input.
727 $this_expected_len--;
728 push @expected_return_flags, $::UTF8_GOT_SHORT;
730 if ($malformations_name
731 =~ /non-continuation/)
733 # Change the final continuation byte into
735 my $pos = ($short) ? -2 : -1;
736 substr($this_bytes, $pos, 1) = '?';
737 $this_expected_len--;
738 push @expected_return_flags,
739 $::UTF8_GOT_NON_CONTINUATION;
743 my $eval_warn = $do_warning
744 ? "use warnings '$trial_warning_category'"
745 : $trial_warning_category eq "utf8"
746 ? "no warnings 'utf8'"
747 : ( "use warnings 'utf8';"
748 . " no warnings '$trial_warning_category'");
750 # Is effectively disallowed if we've set up a
751 # malformation, even if the flag indicates it is
752 # allowed. Fix up test name to indicate this as
754 my $disallowed = $disallow_flag
755 || $malformations_name;
756 my $this_name = "utf8n_to_uvchr_error() $testname: "
762 $this_name .= ", $eval_warn";
763 $this_name .= ", " . (($warn_flag)
764 ? 'with warning flag'
765 : 'no warning flag');
767 undef @warnings_gotten;
769 my $this_flags = $warn_flag | $disallow_flag;
770 my $eval_text = "$eval_warn; \$ret_ref"
771 . " = test_utf8n_to_uvchr_error("
773 . " $this_length, $this_flags)";
775 if (! ok ("$@ eq ''",
776 "$this_name: eval succeeded"))
778 diag "\$@='$@'; call was: "
779 . utf8n_display_call($eval_text);
783 is($ret_ref->[0], 0, "$this_name: Returns 0")
784 or diag "Call was: " . utf8n_display_call($eval_text);
787 is($ret_ref->[0], $expected_uv,
788 "$this_name: Returns expected uv: "
789 . sprintf("0x%04X", $expected_uv))
790 or diag "Call was: " . utf8n_display_call($eval_text);
792 is($ret_ref->[1], $this_expected_len,
793 "$this_name: Returns expected length:"
794 . " $this_expected_len")
795 or diag "Call was: " . utf8n_display_call($eval_text);
797 my $returned_flags = $ret_ref->[2];
799 for (my $i = @expected_return_flags - 1; $i >= 0; $i--) {
800 if (ok($expected_return_flags[$i] & $returned_flags,
801 "Expected and got error bit return"
802 . " for $malformations[$i] malformation"))
804 $returned_flags &= ~$expected_return_flags[$i];
806 splice @expected_return_flags, $i, 1;
808 is(scalar @expected_return_flags, 0,
809 "Got all the expected malformation errors")
810 or diag Dumper \@expected_return_flags;
812 if ( $this_expected_len >= $this_needed_to_discern_len
813 && ($warn_flag || $disallow_flag))
815 is($returned_flags, $expected_error_flags,
816 "Got the correct error flag")
817 or diag "Call was: " . utf8n_display_call($eval_text);
820 is($returned_flags, 0, "Got no other error flag")
823 # We strip off any prefixes from the flag names
824 diag "The unexpected flags were: "
825 . (flags_to_text($returned_flags,
826 \@utf8n_flags_to_text)
827 =~ s/ \b [A-Z] _ //xgr);
830 if (@malformations) {
831 if (! $do_warning && $trial_warning_category eq 'utf8') {
832 goto no_warnings_expected;
835 # Check that each malformation generates a
836 # warning, removing that warning if found
838 foreach my $malformation (@malformations) {
839 foreach (my $i = 0; $i < @warnings_gotten; $i++) {
840 if ($warnings_gotten[$i] =~ /$malformation/) {
841 pass("Expected and got"
842 . "'$malformation' warning");
843 splice @warnings_gotten, $i, 1;
847 fail("Expected '$malformation' warning"
848 . " but didn't get it");
853 # Any overflow will override any super or above-31
855 goto no_warnings_expected
856 if $will_overflow || $this_expected_len
857 < $this_needed_to_discern_len;
860 && ( $trial_warning_category eq 'utf8'
861 || $trial_warning_category eq $controlling_warning_category))
863 goto no_warnings_expected;
866 if (is(scalar @warnings_gotten, 1,
867 "$this_name: Got a single warning "))
869 like($warnings_gotten[0], $message,
870 "$this_name: Got expected warning")
872 . utf8n_display_call($eval_text);
875 diag "Call was: " . utf8n_display_call($eval_text);
876 if (scalar @warnings_gotten) {
877 output_warnings(@warnings_gotten);
883 no_warnings_expected:
884 unless (is(scalar @warnings_gotten, 0,
885 "$this_name: Got no warnings"))
887 diag "Call was: " . utf8n_display_call($eval_text);
888 output_warnings(@warnings_gotten);
892 # Check CHECK_ONLY results when the input is
893 # disallowed. Do this when actually disallowed,
894 # not just when the $disallow_flag is set
896 undef @warnings_gotten;
897 $this_flags = $disallow_flag|$::UTF8_CHECK_ONLY;
898 $eval_text = "\$ret_ref = test_utf8n_to_uvchr_error("
899 . "'$this_bytes', $this_length, $this_flags)";
901 if (! ok ("$@ eq ''",
902 " And eval succeeded with CHECK_ONLY"))
904 diag "\$@='$@'; Call was: "
905 . utf8n_display_call($eval_text);
909 "$this_name, CHECK_ONLY: Returns 0")
910 or diag "Call was: " . utf8n_display_call($eval_text);
911 is($ret_ref->[1], -1,
912 "$this_name: CHECK_ONLY: returns -1 for length")
913 or diag "Call was: " . utf8n_display_call($eval_text);
914 if (! is(scalar @warnings_gotten, 0,
915 "$this_name, CHECK_ONLY: no warnings"
918 diag "Call was: " . utf8n_display_call($eval_text);
919 output_warnings(@warnings_gotten);
923 # Now repeat some of the above, but for
924 # uvchr_to_utf8_flags(). Since this comes from an
925 # existing code point, it hasn't overflowed, and
927 next if @malformations;
929 # The warning and disallow flags passed in are for
930 # utf8n_to_uvchr_error(). Convert them for
931 # uvchr_to_utf8_flags().
932 my $uvchr_warn_flag = 0;
933 my $uvchr_disallow_flag = 0;
935 if ($warn_flag == $::UTF8_WARN_SURROGATE) {
936 $uvchr_warn_flag = $::UNICODE_WARN_SURROGATE
938 elsif ($warn_flag == $::UTF8_WARN_NONCHAR) {
939 $uvchr_warn_flag = $::UNICODE_WARN_NONCHAR
941 elsif ($warn_flag == $::UTF8_WARN_SUPER) {
942 $uvchr_warn_flag = $::UNICODE_WARN_SUPER
944 elsif ($warn_flag == $::UTF8_WARN_ABOVE_31_BIT) {
946 = $::UNICODE_WARN_ABOVE_31_BIT;
949 fail(sprintf "Unexpected warn flag: %x",
954 if ($disallow_flag) {
955 if ($disallow_flag == $::UTF8_DISALLOW_SURROGATE)
958 = $::UNICODE_DISALLOW_SURROGATE;
960 elsif ($disallow_flag == $::UTF8_DISALLOW_NONCHAR)
963 = $::UNICODE_DISALLOW_NONCHAR;
965 elsif ($disallow_flag == $::UTF8_DISALLOW_SUPER) {
967 = $::UNICODE_DISALLOW_SUPER;
969 elsif ($disallow_flag
970 == $::UTF8_DISALLOW_ABOVE_31_BIT)
972 $uvchr_disallow_flag =
973 $::UNICODE_DISALLOW_ABOVE_31_BIT;
976 fail(sprintf "Unexpected disallow flag: %x",
982 $disallowed = $uvchr_disallow_flag;
984 $this_name = "uvchr_to_utf8_flags() $testname: "
985 . (($uvchr_disallow_flag)
988 ? 'ABOVE_31_BIT allowed'
990 $this_name .= ", $eval_warn";
991 $this_name .= ", " . (($uvchr_warn_flag)
992 ? 'with warning flag'
993 : 'no warning flag');
995 undef @warnings_gotten;
997 $this_flags = $uvchr_warn_flag | $uvchr_disallow_flag;
998 $eval_text = "$eval_warn; \$ret ="
999 . " test_uvchr_to_utf8_flags("
1000 . "$allowed_uv, $this_flags)";
1002 if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
1004 diag "\$@='$@'; call was: "
1005 . uvchr_display_call($eval_text);
1009 is($ret, undef, "$this_name: Returns undef")
1010 or diag "Call was: " . uvchr_display_call($eval_text);
1013 is($ret, $this_bytes, "$this_name: Returns expected string")
1014 or diag "Call was: " . uvchr_display_call($eval_text);
1017 && ($trial_warning_category eq 'utf8' || $trial_warning_category eq $controlling_warning_category))
1019 if (!is(scalar @warnings_gotten, 0,
1020 "$this_name: No warnings generated"))
1022 diag "Call was: " . uvchr_display_call($eval_text);
1023 output_warnings(@warnings_gotten);
1026 elsif ( $uvchr_warn_flag
1027 && ( $trial_warning_category eq 'utf8'
1028 || $trial_warning_category eq $controlling_warning_category))
1030 if (is(scalar @warnings_gotten, 1,
1031 "$this_name: Got a single warning "))
1033 like($warnings_gotten[0], $message,
1034 "$this_name: Got expected warning")
1035 or diag "Call was: "
1036 . uvchr_display_call($eval_text);
1039 diag "Call was: " . uvchr_display_call($eval_text);
1040 output_warnings(@warnings_gotten)
1041 if scalar @warnings_gotten;