3 # This is a base file to be used by various .t's in its directory
4 # It tests various code points that are "problematic", and verifies that the
5 # correct warnings/flags etc are generated when using them. It also takes the
6 # UTF-8 for some of them and perturbs it to be malformed in various ways, and
7 # tests that this gets appropriately detected.
13 use_ok('XS::APItest');
14 require 'charset_tools.pl';
15 require './t/utf8_setup.pl';
20 no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
21 # machines, and that is tested elsewhere
27 local $SIG{__WARN__} = sub { my @copy = @_;
28 push @warnings_gotten, map { chomp; $_ } @copy;
31 my $native_lowest_continuation_chr = I8_to_native(chr $::lowest_continuation);
33 sub requires_extended_utf8($) {
35 # Returns a boolean as to whether or not the code point parameter fits
36 # into 31 bits, subject to the convention that a negative code point
37 # stands for one that overflows the word size, so won't fit in 31 bits.
40 return $cp > 0x7FFFFFFF;
45 no warnings qw(portable overflow);
48 # $bytes, UTF-8 string
49 # $allowed_uv, code point $bytes evaluates to; -1 if
51 # $needed_to_discern_len optional, how long an initial substring do
52 # we need to tell that the string must be for
53 # a code point in the category it falls in,
54 # like being a surrogate; 0 indicates we need
55 # the whole string. Some categories have a
56 # default that is used if this is omitted.
58 (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
61 [ "a middle surrogate",
62 (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
65 [ "highest surrogate",
66 (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
69 [ "first of 32 consecutive non-character code points",
70 (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
73 [ "a mid non-character code point of the 32 consecutive ones",
74 (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
77 [ "final of 32 consecutive non-character code points",
78 (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
81 [ "non-character code point U+FFFE",
82 (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
85 [ "non-character code point U+FFFF",
86 (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
89 [ "non-character code point U+1FFFE",
90 (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
93 [ "non-character code point U+1FFFF",
94 (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
97 [ "non-character code point U+2FFFE",
98 (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
101 [ "non-character code point U+2FFFF",
102 (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
105 [ "non-character code point U+3FFFE",
106 (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
109 [ "non-character code point U+3FFFF",
110 (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
113 [ "non-character code point U+4FFFE",
116 : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
119 [ "non-character code point U+4FFFF",
122 : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
125 [ "non-character code point U+5FFFE",
128 : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
131 [ "non-character code point U+5FFFF",
134 : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
137 [ "non-character code point U+6FFFE",
140 : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
143 [ "non-character code point U+6FFFF",
146 : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
149 [ "non-character code point U+7FFFE",
152 : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
155 [ "non-character code point U+7FFFF",
158 : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
161 [ "non-character code point U+8FFFE",
164 : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
167 [ "non-character code point U+8FFFF",
170 : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
173 [ "non-character code point U+9FFFE",
176 : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
179 [ "non-character code point U+9FFFF",
182 : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
185 [ "non-character code point U+AFFFE",
188 : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
191 [ "non-character code point U+AFFFF",
194 : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
197 [ "non-character code point U+BFFFE",
200 : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
203 [ "non-character code point U+BFFFF",
206 : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
209 [ "non-character code point U+CFFFE",
212 : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
215 [ "non-character code point U+CFFFF",
218 : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
221 [ "non-character code point U+DFFFE",
224 : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
227 [ "non-character code point U+DFFFF",
230 : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
233 [ "non-character code point U+EFFFE",
236 : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
239 [ "non-character code point U+EFFFF",
242 : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
245 [ "non-character code point U+FFFFE",
248 : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
251 [ "non-character code point U+FFFFF",
254 : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
257 [ "non-character code point U+10FFFE",
260 : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
263 [ "non-character code point U+10FFFF",
266 : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
269 [ "first non_unicode",
272 : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
276 [ "non_unicode whose first byte tells that",
279 : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
280 (isASCII) ? 0x140000 : 0x200000,
283 [ "lowest 32 bit code point",
285 ? "\xfe\x82\x80\x80\x80\x80\x80"
287 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
291 [ "highest 32 bit code point",
293 ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
295 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
299 [ "Lowest 33 bit code point",
301 ? "\xfe\x84\x80\x80\x80\x80\x80"
303 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
304 ($::is64bit) ? 0x100000000 : -1, # Overflows on 32-bit systems
305 (isASCII && ! $::is64bit) ? 2 : 1,
312 [ "overflow that old algorithm failed to detect",
313 "\xfe\x86\x80\x80\x80\x80\x80",
322 [ "highest 64 bit code point",
324 ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
326 "\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"),
330 [ "first 65 bit code point",
332 ? "\xff\x80\x9f\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
334 "\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
340 [ "Lowest code point requiring 13 bytes to represent",
341 "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
345 [ "overflow that old algorithm failed to detect",
346 "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
352 push @tests, # These could falsely show wrongly in a naive
354 [ "requires at least 32 bits",
356 "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
360 [ "requires at least 32 bits",
362 "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
366 [ "requires at least 32 bits",
368 "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
372 [ "requires at least 32 bits",
374 "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
378 [ "requires at least 32 bits",
380 "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
384 [ "requires at least 32 bits",
386 "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
394 sub flags_to_text($$)
396 my ($flags, $flags_to_text_ref) = @_;
398 # Returns a string containing a mnemonic representation of the bits that
399 # are set in the $flags. These are assumed to be flag bits. The return
400 # looks like "FOO|BAR|BAZ". The second parameter is a reference to an
401 # array that gives the textual representation of all the possible flags.
402 # Element 0 is the text for the bit 0 flag; element 1 for bit 1; .... If
403 # no bits at all are set the string "0" is returned;
408 return "0" if $flags == 0;
411 #diag sprintf "%x", $flags;
413 push @flag_text, $flags_to_text_ref->[$shift];
419 return join "|", @flag_text;
422 # Possible flag returns from utf8n_to_uvchr_error(). These should have G_,
423 # instead of A_, D_, but the prefixes will be used in a a later commit, so
424 # minimize churn by having them here.
425 my @utf8n_flags_to_text = ( qw(
443 NO_CONFIDENCE_IN_CURLEN_
446 sub utf8n_display_call($)
448 # Converts an eval string that calls test_utf8n_to_uvchr into a more human
449 # readable form, and returns it. Doesn't work if the byte string contains
450 # an apostrophe. The return will look something like:
451 # test_utf8n_to_uvchr_error('$bytes', $length, $flags)
454 $_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x;
455 my $text1 = $1; # Everything before the byte string
457 my $text2 = $3; # Includes the length
461 . display_bytes($bytes)
463 . flags_to_text($flags, \@utf8n_flags_to_text)
467 sub uvchr_display_call($)
469 # Converts an eval string that calls test_uvchr_to_utf8 into a more human
470 # readable form, and returns it. The return will look something like:
471 # test_uvchr_to_utf8n_flags($uv, $flags)
474 my @flags_to_text = ( qw(
485 $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x;
487 my $cp = sprintf "%X", $2;
490 return "${text}0x$cp, " . flags_to_text($flags, \@flags_to_text) . ')';
493 sub do_warnings_test(@)
495 my @expected_warnings = @_;
497 # Compares the input expected warnings array with @warnings_gotten,
498 # generating a pass for each found, removing it from @warnings_gotten.
499 # Any discrepancies generate test failures. Returns TRUE if no
500 # discrepcancies; otherwise FALSE.
504 if (@expected_warnings == 0) {
505 if (! is(@warnings_gotten, 0, " Expected and got no warnings")) {
506 output_warnings(@warnings_gotten);
512 # Check that we got all the expected warnings,
513 # removing each one found
515 foreach my $expected (@expected_warnings) {
516 foreach (my $i = 0; $i < @warnings_gotten; $i++) {
517 if ($warnings_gotten[$i] =~ $expected) {
518 pass(" Expected and got warning: "
519 . " $warnings_gotten[$i]");
520 splice @warnings_gotten, $i, 1;
524 fail(" Expected a warning that matches "
525 . $expected . " but didn't get it");
529 if (! is(@warnings_gotten, 0, " Got no unexpected warnings")) {
530 output_warnings(@warnings_gotten);
537 # This test is split into this number of files.
538 my $num_test_files = $ENV{TEST_JOBS} || 1;
539 $num_test_files = 10 if $num_test_files > 10;
542 foreach my $test (@tests) {
544 next if $test_count % $num_test_files != $::TEST_CHUNK;
546 my ($testname, $bytes, $allowed_uv, $needed_to_discern_len) = @$test;
548 my $length = length $bytes;
549 my $will_overflow = $allowed_uv < 0;
551 my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv);
552 my $display_bytes = display_bytes($bytes);
554 my $controlling_warning_category;
555 my $utf8n_flag_to_warn;
556 my $utf8n_flag_to_disallow;
557 my $uvchr_flag_to_warn;
558 my $uvchr_flag_to_disallow;
560 # We want to test that the independent flags are actually independent.
561 # For example, that a surrogate doesn't trigger a non-character warning,
562 # and conversely, turning off an above-Unicode flag doesn't suppress a
563 # surrogate warning. Earlier versions of this file used nested loops to
564 # test all possible combinations. But that creates lots of tests, making
565 # this run too long. What is now done instead is to use the complement of
566 # the category we are testing to greatly reduce the combinatorial
567 # explosion. For example, if we have a surrogate and we aren't expecting
568 # a warning about it, we set all the flags for non-surrogates to raise
569 # warnings. If one shows up, it indicates the flags aren't independent.
570 my $utf8n_flag_to_warn_complement;
571 my $utf8n_flag_to_disallow_complement;
572 my $uvchr_flag_to_warn_complement;
573 my $uvchr_flag_to_disallow_complement;
575 # Many of the code points being tested are middling in that if code point
576 # edge cases work, these are very likely to as well. Because this test
577 # file takes a while to execute, we skip testing the edge effects of code
578 # points deemed middling, while testing their basics and continuing to
579 # fully test the non-middling code points.
580 my $skip_most_tests = 0;
582 my $cp_message_qr; # Pattern that matches the message raised when
583 # that message contains the problematic code
584 # point. The message is the same (currently) both
585 # when going from/to utf8.
586 my $non_cp_trailing_text; # The suffix text when the message doesn't
587 # contain a code point. (This is a result of
588 # some sort of malformation that means we
589 # can't get an exact code poin
591 # Is this test malformed from the beginning? If so, we know to generally
592 # expect that the tests will show it isn't valid.
593 my $initially_malformed = 0;
595 if ($will_overflow || $allowed_uv > 0x10FFFF) {
597 # Set the SUPER flags; later, we test for ABOVE_31_BIT as well.
598 $utf8n_flag_to_warn = $::UTF8_WARN_SUPER;
599 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER;
600 $uvchr_flag_to_warn = $::UNICODE_WARN_SUPER;
601 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;;
603 # Below, we add the flags for non-above-31 bit to the code points that
604 # don't fit that category. Special tests are done for this category
606 $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR
607 |$::UTF8_WARN_SURROGATE;
608 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
609 |$::UTF8_DISALLOW_SURROGATE;
610 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR
611 |$::UNICODE_WARN_SURROGATE;
612 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
613 |$::UNICODE_DISALLOW_SURROGATE;
614 $controlling_warning_category = 'non_unicode';
616 if ($will_overflow) { # This is realy a malformation
617 $non_cp_trailing_text = "if you see this, there is an error";
618 $cp_message_qr = qr/\Q$non_cp_trailing_text\E/;
619 $initially_malformed = 1;
621 elsif (requires_extended_utf8($allowed_uv)) {
622 $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
623 \Q and not portable\E/x;
624 $non_cp_trailing_text = "is for a non-Unicode code point, and is not portable";
627 $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
628 \Q may not be portable\E/x;
629 $non_cp_trailing_text = "is for a non-Unicode code point, may not"
631 $utf8n_flag_to_warn_complement |= $::UTF8_WARN_ABOVE_31_BIT;
632 $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_ABOVE_31_BIT;
633 $uvchr_flag_to_warn_complement |= $::UNICODE_WARN_ABOVE_31_BIT;
634 $uvchr_flag_to_disallow_complement
635 |= $::UNICODE_DISALLOW_ABOVE_31_BIT;
638 elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
639 $cp_message_qr = qr/UTF-16 surrogate U\+$uv_string/;
640 $non_cp_trailing_text = "is for a surrogate";
641 $needed_to_discern_len = 2 unless defined $needed_to_discern_len;
642 $skip_most_tests = 1 if $allowed_uv > 0xD800 && $allowed_uv < 0xDFFF;
644 $utf8n_flag_to_warn = $::UTF8_WARN_SURROGATE;
645 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SURROGATE;
646 $uvchr_flag_to_warn = $::UNICODE_WARN_SURROGATE;
647 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SURROGATE;;
649 $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR
651 |$::UTF8_WARN_ABOVE_31_BIT;
652 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
653 |$::UTF8_DISALLOW_SUPER
654 |$::UTF8_DISALLOW_ABOVE_31_BIT;
655 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR
656 |$::UNICODE_WARN_SUPER
657 |$::UNICODE_WARN_ABOVE_31_BIT;
658 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
659 |$::UNICODE_DISALLOW_SUPER
660 |$::UNICODE_DISALLOW_ABOVE_31_BIT;
661 $controlling_warning_category = 'surrogate';
663 elsif ( ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
664 || ($allowed_uv & 0xFFFE) == 0xFFFE)
666 $cp_message_qr = qr/\QUnicode non-character U+$uv_string\E
667 \Q is not recommended for open interchange\E/x;
668 $non_cp_trailing_text = "if you see this, there is an error";
669 $needed_to_discern_len = $length unless defined $needed_to_discern_len;
670 if ( ($allowed_uv > 0xFDD0 && $allowed_uv < 0xFDEF)
671 || ($allowed_uv > 0xFFFF && $allowed_uv < 0x10FFFE))
673 $skip_most_tests = 1;
676 $utf8n_flag_to_warn = $::UTF8_WARN_NONCHAR;
677 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_NONCHAR;
678 $uvchr_flag_to_warn = $::UNICODE_WARN_NONCHAR;
679 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_NONCHAR;;
681 $utf8n_flag_to_warn_complement = $::UTF8_WARN_SURROGATE
683 |$::UTF8_WARN_ABOVE_31_BIT;
684 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE
685 |$::UTF8_DISALLOW_SUPER
686 |$::UTF8_DISALLOW_ABOVE_31_BIT;
687 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_SURROGATE
688 |$::UNICODE_WARN_SUPER
689 |$::UNICODE_WARN_ABOVE_31_BIT;
690 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_SURROGATE
691 |$::UNICODE_DISALLOW_SUPER
692 |$::UNICODE_DISALLOW_ABOVE_31_BIT;
694 $controlling_warning_category = 'nonchar';
697 die "Can't figure out what type of warning to test for $testname"
700 die 'Didn\'t set $needed_to_discern_len for ' . $testname
701 unless defined $needed_to_discern_len;
703 { # First test the isFOO calls
704 use warnings; no warnings 'deprecated'; # Make sure these don't raise warnings
705 undef @warnings_gotten;
707 my $ret = test_isUTF8_CHAR($bytes, $length);
708 my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
709 if ($initially_malformed) {
710 is($ret, 0, "For $testname: isUTF8_CHAR() returns 0");
711 is($ret_flags, 0, " And isUTF8_CHAR_flags() returns 0");
715 "For $testname: isUTF8_CHAR() returns expected length: $length");
716 is($ret_flags, $length, " And isUTF8_CHAR_flags(...,0)"
717 . " returns expected length: $length");
719 is(scalar @warnings_gotten, 0,
720 " And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags generated"
722 or output_warnings(@warnings_gotten);
724 undef @warnings_gotten;
725 $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
726 if ($initially_malformed) {
727 is($ret, 0, " And isSTRICT_UTF8_CHAR() returns 0");
730 my $expected_ret = ( $testname =~ /surrogate|non-character/
731 || $allowed_uv > 0x10FFFF)
734 is($ret, $expected_ret, " And isSTRICT_UTF8_CHAR() returns"
735 . " expected length: $expected_ret");
736 $ret = test_isUTF8_CHAR_flags($bytes, $length,
737 $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
738 is($ret, $expected_ret,
739 " And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')"
740 . " acts like isSTRICT_UTF8_CHAR");
742 is(scalar @warnings_gotten, 0,
743 " And neither isSTRICT_UTF8_CHAR() nor isUTF8_CHAR_flags"
744 . " generated any warnings")
745 or output_warnings(@warnings_gotten);
747 undef @warnings_gotten;
748 $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
749 if ($initially_malformed) {
750 is($ret, 0, " And isC9_STRICT_UTF8_CHAR() returns 0");
753 my $expected_ret = ( $testname =~ /surrogate/
754 || $allowed_uv > 0x10FFFF)
757 is($ret, $expected_ret, " And isC9_STRICT_UTF8_CHAR()"
758 ." returns expected length: $expected_ret");
759 $ret = test_isUTF8_CHAR_flags($bytes, $length,
760 $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
761 is($ret, $expected_ret,
762 " And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')"
763 . " acts like isC9_STRICT_UTF8_CHAR");
765 is(scalar @warnings_gotten, 0,
766 " And neither isC9_STRICT_UTF8_CHAR() nor isUTF8_CHAR_flags"
767 . " generated any warnings")
768 or output_warnings(@warnings_gotten);
770 foreach my $disallow_type (0..2) {
771 # 0 is don't disallow this type of code point
773 # 2 is do disallow, but only for above 31 bit
778 if ($initially_malformed) {
780 # Malformations are by default disallowed, so testing with
781 # $disallow_type equal to 0 is sufficicient.
782 next if $disallow_type;
787 elsif ($disallow_type == 1) {
788 $disallow_flags = $utf8n_flag_to_disallow;
791 elsif ($disallow_type == 2) {
792 next if ! requires_extended_utf8($allowed_uv);
793 $disallow_flags = $::UTF8_DISALLOW_ABOVE_31_BIT;
797 $disallow_flags = $utf8n_flag_to_disallow_complement;
798 $expected_ret = $length;
801 $ret = test_isUTF8_CHAR_flags($bytes, $length, $disallow_flags);
802 is($ret, $expected_ret, " And isUTF8_CHAR_flags("
803 . "$display_bytes, $disallow_flags) returns "
805 or diag "The flags mean "
806 . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
808 is(scalar @warnings_gotten, 0,
809 " And isUTF8_CHAR_flags(...) generated no warnings")
810 or output_warnings(@warnings_gotten);
812 # Test partial character handling, for each byte not a full character
813 my $did_test_partial = 0;
814 for (my $j = 1; $j < $length - 1; $j++) {
815 $did_test_partial = 1;
816 my $partial = substr($bytes, 0, $j);
819 if ($disallow_type || $initially_malformed) {
821 $comment = "disallowed";
822 if ($j < $needed_to_discern_len) {
824 $comment .= ", but need $needed_to_discern_len"
825 . " bytes to discern:";
830 $comment = "allowed";
833 undef @warnings_gotten;
835 $ret = test_is_utf8_valid_partial_char_flags($partial, $j,
837 is($ret, $ret_should_be,
838 " And is_utf8_valid_partial_char_flags("
839 . display_bytes($partial)
840 . ", $disallow_flags), $comment: returns $ret_should_be")
841 or diag "The flags mean "
842 . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
845 if ($did_test_partial) {
846 is(scalar @warnings_gotten, 0,
847 " And is_utf8_valid_partial_char_flags()"
848 . " generated no warnings for any of the lengths")
849 or output_warnings(@warnings_gotten);
854 # Now test the to/from UTF-8 calls
855 # This is more complicated than the malformations tested in other files in
856 # this directory, as there are several orthogonal variables involved. We
857 # test most possible combinations
859 # We try various combinations of malformations that can occur
860 foreach my $short (0, 1) {
861 next if $skip_most_tests && $short;
862 foreach my $unexpected_noncont (0, 1) {
863 next if $skip_most_tests && $unexpected_noncont;
864 foreach my $overlong (0, 1) {
865 next if $overlong && $skip_most_tests;
867 # If we're creating an overlong, it can't be longer than the
868 # maximum length, so skip if we're already at that length.
869 next if $overlong && $length >= $::max_bytes;
871 foreach my $malformed_allow_type (0..2) {
872 # 0 don't allow this malformation; ignored if no malformation
873 # 1 allow, with REPLACEMENT CHARACTER returned
874 # 2 allow, with intended code point returned. All malformations
875 # other than overlong can't determine the intended code point,
876 # so this isn't valid for them.
877 next if $malformed_allow_type == 2
878 && ($will_overflow || $short || $unexpected_noncont);
879 next if $skip_most_tests && $malformed_allow_type;
881 # Here we are in the innermost loop for malformations. So we
882 # know which ones are in effect. Can now change the input to be
883 # appropriately malformed. We also can set up certain other
884 # things now, like whether we expect a return flag from this
885 # malformation, and which flag.
887 my $this_bytes = $bytes;
888 my $this_length = $length;
889 my $this_expected_len = $length;
890 my $this_needed_to_discern_len = $needed_to_discern_len;
892 my @malformation_names;
893 my @expected_malformation_warnings;
894 my @expected_malformation_return_flags;
896 # Contains the flags for any allowed malformations. Currently no
897 # combinations of on/off are tested for. It's either all are
898 # allowed, or none are.
902 my $new_expected_len;
904 # To force this malformation, we convert the original start
905 # byte into a continuation byte with the same data bits as
907 my $start_byte = substr($this_bytes, 0, 1);
908 my $converted_to_continuation_byte
909 = start_byte_to_cont($start_byte);
911 # ... Then we prepend it with a known overlong sequence. This
912 # should evaluate to the exact same code point as the
913 # original. We try to avoid an overlong using Perl extended
914 # UTF-8. The code points are the highest representable as
915 # overlongs on the respective platform without using extended
917 if (native_to_I8($start_byte) lt "\xFC") {
918 $start_byte = I8_to_native("\xFC");
919 $new_expected_len = 6;
921 elsif (! isASCII && native_to_I8($start_byte) lt "\xFE") {
923 # FE is not extended UTF-8 on EBCDIC
924 $start_byte = I8_to_native("\xFE");
925 $new_expected_len = 7;
927 else { # Must use extended UTF-8. On ASCII platforms, we
928 # could express some overlongs here starting with
929 # \xFE, but there's no real reason to do so.
930 $start_byte = I8_to_native("\xFF");
931 $new_expected_len = $::max_bytes;
934 # Splice in the revise continuation byte, preceded by the
935 # start byte and the proper number of the lowest continuation
937 $this_bytes = $start_byte
938 . ($native_lowest_continuation_chr
939 x ( $new_expected_len - 1 - length($this_bytes)))
940 . $converted_to_continuation_byte
941 . substr($this_bytes, 1);
942 $this_length = length($this_bytes);
943 $this_needed_to_discern_len = $new_expected_len
944 - ( $this_expected_len
945 - $this_needed_to_discern_len);
946 $this_expected_len = $new_expected_len;
947 push @expected_malformation_return_flags, $::UTF8_GOT_LONG;
948 push @malformation_names, 'overlong';
950 if ($malformed_allow_type == 2) {
951 $allow_flags |= $::UTF8_ALLOW_LONG_AND_ITS_VALUE;
953 elsif ($malformed_allow_type) {
954 $allow_flags |= $::UTF8_ALLOW_LONG;
959 push @malformation_names, 'short';
961 # To force this malformation, just tell the test to not look
962 # as far as it should into the input.
964 $this_expected_len--;
965 push @expected_malformation_return_flags, $::UTF8_GOT_SHORT;
967 $allow_flags |= $::UTF8_ALLOW_SHORT if $malformed_allow_type;
970 if ($unexpected_noncont) {
971 push @malformation_names, 'unexpected non-continuation';
973 # To force this malformation, change the final continuation
974 # byte into a non continuation.
975 my $pos = ($short) ? -2 : -1;
976 substr($this_bytes, $pos, 1) = '?';
977 $this_expected_len--;
978 push @expected_malformation_return_flags,
979 $::UTF8_GOT_NON_CONTINUATION;
980 $allow_flags |= $::UTF8_ALLOW_NON_CONTINUATION
981 if $malformed_allow_type;
984 # Here, we've transformed the input with all of the desired
985 # non-overflow malformations. We are now in a position to
986 # construct any potential warnings for those malformations. But
987 # it's a pain to get the detailed messages exactly right, so for
988 # now XXX, only do so for those that return an explicit code
993 # If one of the other malformation types is also in effect, we
994 # don't know what the intended code point was.
995 if ($short || $unexpected_noncont || $will_overflow) {
996 push @expected_malformation_warnings, qr/overlong/;
999 my $wrong_bytes = display_bytes_no_quotes(
1000 substr($this_bytes, 0, $this_length));
1001 my $correct_bytes = display_bytes_no_quotes($bytes);
1002 my $prefix = ($allowed_uv > 0x10FFFF) ? "0x" : "U+";
1003 push @expected_malformation_warnings,
1004 qr/\QMalformed UTF-8 character: $wrong_bytes\E
1005 \Q (overlong; instead use\E
1006 \Q $correct_bytes to\E
1007 \Q represent $prefix$uv_string)/x;
1011 push @expected_malformation_warnings, qr/too short/;
1013 if ($unexpected_noncont) {
1014 push @expected_malformation_warnings,
1015 qr/unexpected non-continuation byte/;
1018 # The overflow malformation is done differently than other
1019 # malformations. It comes from manually typed tests in the test
1020 # array. We now make it be treated like one of the other
1021 # malformations. But some has to be deferred until the inner loop
1022 my $overflow_msg_pattern;
1023 if ($will_overflow) {
1024 push @malformation_names, 'overflow';
1026 $overflow_msg_pattern = display_bytes_no_quotes(
1027 substr($this_bytes, 0, $this_expected_len));
1028 $overflow_msg_pattern = qr/\QMalformed UTF-8 character:\E
1029 \Q $overflow_msg_pattern\E
1031 push @expected_malformation_return_flags, $::UTF8_GOT_OVERFLOW;
1032 $allow_flags |= $::UTF8_ALLOW_OVERFLOW if $malformed_allow_type;
1035 # And we can create the malformation-related text for the the test
1036 # names we eventually will generate.
1037 my $malformations_name = "";
1038 if (@malformation_names) {
1039 $malformations_name .= "dis" unless $malformed_allow_type;
1040 $malformations_name .= "allowed ";
1041 $malformations_name .= "malformation";
1042 $malformations_name .= "s" if @malformation_names > 1;
1043 $malformations_name .= ": ";
1044 $malformations_name .= join "/", @malformation_names;
1045 $malformations_name = " ($malformations_name)";
1048 # Done setting up the malformation related stuff
1050 foreach my $do_disallow (0, 1) {
1051 next if $skip_most_tests && ! $do_disallow;
1053 # We classify the warnings into certain "interesting" types,
1055 foreach my $warning_type (0..4) {
1056 next if $skip_most_tests && $warning_type != 1;
1057 foreach my $use_warn_flag (0, 1) {
1058 next if $skip_most_tests && ! $use_warn_flag;
1060 # Finally, here is the inner loop
1062 my $this_utf8n_flag_to_warn = $utf8n_flag_to_warn;
1063 my $this_utf8n_flag_to_disallow = $utf8n_flag_to_disallow;
1064 my $this_uvchr_flag_to_warn = $uvchr_flag_to_warn;
1065 my $this_uvchr_flag_to_disallow = $uvchr_flag_to_disallow;
1068 my $expect_regular_warnings;
1069 my $expect_warnings_for_malformed;
1070 my $expect_warnings_for_overflow;
1072 if ($warning_type == 0) {
1073 $eval_warn = "use warnings; no warnings 'deprecated'";
1074 $expect_regular_warnings = $use_warn_flag;
1076 # We ordinarily expect overflow warnings here. But it
1077 # is somewhat more complicated, and the final
1078 # determination is deferred to one place in the filw
1079 # where we handle overflow.
1080 $expect_warnings_for_overflow = 1;
1082 # We would ordinarily expect malformed warnings in
1083 # this case, but not if malformations are allowed.
1084 $expect_warnings_for_malformed
1085 = $malformed_allow_type == 0;
1087 elsif ($warning_type == 1) {
1088 $eval_warn = "no warnings";
1089 $expect_regular_warnings = 0;
1090 $expect_warnings_for_overflow = 0;
1091 $expect_warnings_for_malformed = 0;
1093 elsif ($warning_type == 2) {
1094 $eval_warn = "no warnings; use warnings 'utf8'";
1095 $expect_regular_warnings = $use_warn_flag;
1096 $expect_warnings_for_overflow = 1;
1097 $expect_warnings_for_malformed
1098 = $malformed_allow_type == 0;
1100 elsif ($warning_type == 3) {
1101 $eval_warn = "no warnings; use warnings"
1102 . " '$controlling_warning_category'";
1103 $expect_regular_warnings = $use_warn_flag;
1104 $expect_warnings_for_overflow
1105 = $controlling_warning_category eq 'non_unicode';
1106 $expect_warnings_for_malformed = 0;
1108 elsif ($warning_type == 4) { # Like type 3, but uses the
1109 # above-31-bit flags
1110 # The complement flags were set up so that the
1111 # above-31-bit flags have been tested that they don't
1112 # trigger wrongly for too small code points. And the
1113 # flags have been set up so that those small code
1114 # points are tested for being above Unicode. What's
1115 # left to test is that the large code points do
1116 # trigger the above-31-bit flags.
1117 next if ! requires_extended_utf8($allowed_uv);
1118 next if $controlling_warning_category ne 'non_unicode';
1119 $eval_warn = "no warnings; use warnings 'non_unicode'";
1120 $expect_regular_warnings = 1;
1121 $expect_warnings_for_overflow = 1;
1122 $expect_warnings_for_malformed = 0;
1123 $this_utf8n_flag_to_warn = $::UTF8_WARN_ABOVE_31_BIT;
1124 $this_utf8n_flag_to_disallow
1125 = $::UTF8_DISALLOW_ABOVE_31_BIT;
1126 $this_uvchr_flag_to_warn = $::UNICODE_WARN_ABOVE_31_BIT;
1127 $this_uvchr_flag_to_disallow
1128 = $::UNICODE_DISALLOW_ABOVE_31_BIT;
1131 die "Unexpected warning type '$warning_type'";
1134 # We only need to test the case where all warnings are
1135 # enabled (type 0) to see if turning off the warning flag
1136 # causes things to not be output. If those pass, then
1137 # turning on some sub-category of warnings, or turning off
1138 # warnings altogether are extremely likely to not output
1139 # warnings either, given how the warnings subsystem is
1140 # supposed to work, and this file assumes it does work.
1141 next if $warning_type != 0 && ! $use_warn_flag;
1143 # The convention is that the 'got' flag is the same value
1144 # as the disallow one. If this were violated, the tests
1145 # here should start failing.
1146 my $return_flag = $this_utf8n_flag_to_disallow;
1148 # If we aren't expecting warnings/disallow for this, turn
1149 # on all the other flags. That makes sure that they all
1150 # are independent of this flag, and so we don't need to
1151 # test them individually.
1152 my $this_warning_flags = ($use_warn_flag)
1153 ? $this_utf8n_flag_to_warn
1154 : $utf8n_flag_to_warn_complement;
1155 my $this_disallow_flags = ($do_disallow)
1156 ? $this_utf8n_flag_to_disallow
1157 : $utf8n_flag_to_disallow_complement;
1158 my $expected_uv = $allowed_uv;
1159 my $this_uv_string = $uv_string;
1161 my @expected_return_flags
1162 = @expected_malformation_return_flags;
1163 my @expected_warnings;
1164 push @expected_warnings, @expected_malformation_warnings
1165 if $expect_warnings_for_malformed;
1167 # The overflow malformation is done differently than other
1168 # malformations. It comes from manually typed tests in
1169 # the test array, but it also is above Unicode and uses
1170 # Perl extended UTF-8, so affects some of the flags being
1171 # tested. We now make it be treated like one of the other
1172 # generated malformations.
1173 if ($will_overflow) {
1175 # An overflow is (way) above Unicode, and overrides
1177 $expect_regular_warnings = 0;
1179 # Earlier, we tentatively calculated whether this
1180 # should emit a message or not. It's tentative
1181 # because, even if we ordinarily would output it, we
1182 # don't if malformations are allowed -- except an
1183 # overflow is also a SUPER and ABOVE_31_BIT, and if
1184 # warnings for those are enabled, the overflow
1185 # warning does get raised.
1186 if ( $expect_warnings_for_overflow
1187 && ( $malformed_allow_type == 0
1188 || ( $this_warning_flags
1189 & ($::UTF8_WARN_SUPER
1190 |$::UTF8_WARN_ABOVE_31_BIT))))
1192 push @expected_warnings, $overflow_msg_pattern;
1196 # It may be that the malformations have shortened the
1197 # amount of input we look at so much that we can't tell
1198 # what the category the code point was in. Otherwise, set
1199 # up the expected return flags based on the warnings and
1201 if ($this_expected_len < $this_needed_to_discern_len) {
1202 $expect_regular_warnings = 0;
1204 elsif ( ($this_warning_flags & $this_utf8n_flag_to_warn)
1205 || ( $this_disallow_flags
1206 & $this_utf8n_flag_to_disallow))
1208 push @expected_return_flags, $return_flag;
1211 # Finish setting up the expected warning.
1212 if ($expect_regular_warnings) {
1214 # So far the array contains warnings generated by
1215 # malformations. Add the expected regular one.
1216 unshift @expected_warnings, $cp_message_qr;
1218 # But it may need to be modified, because either of
1219 # these malformations means we can't determine the
1220 # expected code point.
1221 if ($short || $unexpected_noncont) {
1222 my $first_byte = substr($this_bytes, 0, 1);
1223 $expected_warnings[0] = display_bytes(
1224 substr($this_bytes, 0, $this_expected_len));
1225 $expected_warnings[0]
1226 = qr/[Aa]\Qny UTF-8 sequence that starts with\E
1227 \Q $expected_warnings[0]\E
1228 \Q $non_cp_trailing_text\E/x;
1232 # Is effectively disallowed if we've set up a malformation
1233 # (unless malformations are allowed), even if the flag
1234 # indicates it is allowed. Fix up test name to indicate
1237 if ( $this_disallow_flags & $this_utf8n_flag_to_disallow
1238 && $this_expected_len >= $this_needed_to_discern_len)
1242 if ($malformations_name) {
1243 if ($malformed_allow_type == 0) {
1246 elsif ($malformed_allow_type == 1) {
1248 # Even if allowed, the malformation returns the
1249 # REPLACEMENT CHARACTER.
1250 $expected_uv = 0xFFFD;
1251 $this_uv_string = "0xFFFD"
1255 my $this_name = "utf8n_to_uvchr_error() $testname: "
1259 $this_name .= ", $eval_warn";
1260 $this_name .= ", " . (( $this_warning_flags
1261 & $this_utf8n_flag_to_warn)
1262 ? 'with flag for raising warnings'
1263 : 'no flag for raising warnings');
1264 $this_name .= $malformations_name;
1266 # Do the actual test using an eval
1267 undef @warnings_gotten;
1270 = $allow_flags|$this_warning_flags|$this_disallow_flags;
1271 my $eval_text = "$eval_warn; \$ret_ref"
1272 . " = test_utf8n_to_uvchr_error("
1273 . "'$this_bytes', $this_length, $this_flags)";
1275 if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
1277 diag "\$@='$@'; call was: "
1278 . utf8n_display_call($eval_text);
1282 is($ret_ref->[0], 0, " And returns 0")
1283 or diag "Call was: " . utf8n_display_call($eval_text);
1286 is($ret_ref->[0], $expected_uv,
1287 " And returns expected uv: "
1289 or diag "Call was: " . utf8n_display_call($eval_text);
1291 is($ret_ref->[1], $this_expected_len,
1292 " And returns expected length:"
1293 . " $this_expected_len")
1294 or diag "Call was: " . utf8n_display_call($eval_text);
1296 my $returned_flags = $ret_ref->[2];
1298 for (my $i = @expected_return_flags - 1; $i >= 0; $i--) {
1299 if ($expected_return_flags[$i] & $returned_flags) {
1300 if ($expected_return_flags[$i]
1301 == $::UTF8_DISALLOW_ABOVE_31_BIT)
1303 pass(" Expected and got return flag for"
1306 # The first entries in this are
1308 elsif ($i > @malformation_names - 1) {
1309 pass(" Expected and got return flag"
1310 . " for " . $controlling_warning_category);
1313 pass(" Expected and got return flag for "
1314 . $malformation_names[$i]
1317 $returned_flags &= ~$expected_return_flags[$i];
1318 splice @expected_return_flags, $i, 1;
1322 is($returned_flags, 0,
1323 " Got no unexpected return flags")
1324 or diag "The unexpected flags gotten were: "
1325 . (flags_to_text($returned_flags,
1326 \@utf8n_flags_to_text)
1327 # We strip off any prefixes from the flag
1329 =~ s/ \b [A-Z] _ //xgr);
1330 is (scalar @expected_return_flags, 0,
1331 " Got all expected return flags")
1332 or diag "The expected flags not gotten were: "
1333 . (flags_to_text(eval join("|",
1334 @expected_return_flags),
1335 \@utf8n_flags_to_text)
1336 # We strip off any prefixes from the flag
1338 =~ s/ \b [A-Z] _ //xgr);
1340 do_warnings_test(@expected_warnings)
1341 or diag "Call was: " . utf8n_display_call($eval_text);
1342 undef @warnings_gotten;
1344 # Check CHECK_ONLY results when the input is
1345 # disallowed. Do this when actually disallowed,
1346 # not just when the $this_disallow_flags is set
1348 my $this_flags = $this_disallow_flags|$::UTF8_CHECK_ONLY;
1349 my $eval_text = "use warnings; no warnings 'deprecated'; \$ret_ref ="
1350 . " test_utf8n_to_uvchr_error('"
1351 . "$this_bytes', $this_length,"
1354 if (! ok ("$@ eq ''",
1355 " And eval succeeded with CHECK_ONLY"))
1357 diag "\$@='$@'; Call was: "
1358 . utf8n_display_call($eval_text);
1361 is($ret_ref->[0], 0, " CHECK_ONLY: Returns 0")
1362 or diag "Call was: " . utf8n_display_call($eval_text);
1363 is($ret_ref->[1], -1,
1364 " CHECK_ONLY: returns -1 for length")
1365 or diag "Call was: " . utf8n_display_call($eval_text);
1366 if (! is(scalar @warnings_gotten, 0,
1367 " CHECK_ONLY: no warnings generated"))
1369 diag "Call was: " . utf8n_display_call($eval_text);
1370 output_warnings(@warnings_gotten);
1374 # Now repeat some of the above, but for
1375 # uvchr_to_utf8_flags(). Since this comes from an
1376 # existing code point, it hasn't overflowed, and isn't
1378 next if @malformation_names;
1380 $this_warning_flags = ($use_warn_flag)
1381 ? $this_uvchr_flag_to_warn
1383 $this_disallow_flags = ($do_disallow)
1384 ? $this_uvchr_flag_to_disallow
1387 $disallowed = $this_disallow_flags
1388 & $this_uvchr_flag_to_disallow;
1389 $this_name .= ", " . (( $this_warning_flags
1390 & $this_utf8n_flag_to_warn)
1391 ? 'with flag for raising warnings'
1392 : 'no flag for raising warnings');
1394 $this_name = "uvchr_to_utf8_flags() $testname: "
1398 $this_name .= ", $eval_warn";
1399 $this_name .= ", " . (( $this_warning_flags
1400 & $this_uvchr_flag_to_warn)
1401 ? 'with warning flag'
1402 : 'no warning flag');
1404 undef @warnings_gotten;
1406 $this_flags = $this_warning_flags|$this_disallow_flags;
1407 $eval_text = "$eval_warn; \$ret ="
1408 . " test_uvchr_to_utf8_flags("
1409 . "$allowed_uv, $this_flags)";
1411 if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
1413 diag "\$@='$@'; call was: "
1414 . uvchr_display_call($eval_text);
1418 is($ret, undef, " And returns undef")
1419 or diag "Call was: " . uvchr_display_call($eval_text);
1422 is($ret, $this_bytes, " And returns expected string")
1423 or diag "Call was: " . uvchr_display_call($eval_text);
1426 do_warnings_test(@expected_warnings)
1427 or diag "Call was: " . uvchr_display_call($eval_text);