3 # This is a base file to be used by various .t's in its directory
4 # It tests various code points that are "problematic", and verifies that the
5 # correct warnings/flags etc are generated when using them. It also takes the
6 # UTF-8 for some of them and perturbs it to be malformed in various ways, and
7 # tests that this gets appropriately detected.
13 use_ok('XS::APItest');
14 require 'charset_tools.pl';
15 require './t/utf8_setup.pl';
20 no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
21 # machines, and that is tested elsewhere
29 local $SIG{__WARN__} = sub { my @copy = @_;
30 push @warnings_gotten, map { chomp; $_ } @copy;
34 no warnings qw(portable overflow);
37 # $bytes, UTF-8 string
38 # $allowed_uv, code point $bytes evaluates to; -1 if
40 # $needed_to_discern_len optional, how long an initial substring do
41 # we need to tell that the string must be for
42 # a code point in the category it falls in,
43 # like being a surrogate; 0 indicates we need
44 # the whole string. Some categories have a
45 # default that is used if this is omitted.
47 (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
50 [ "a middle surrogate",
51 (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
54 [ "highest surrogate",
55 (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
58 [ "first of 32 consecutive non-character code points",
59 (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
62 [ "a mid non-character code point of the 32 consecutive ones",
63 (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
66 [ "final of 32 consecutive non-character code points",
67 (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
70 [ "non-character code point U+FFFE",
71 (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
74 [ "non-character code point U+FFFF",
75 (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
78 [ "non-character code point U+1FFFE",
79 (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
82 [ "non-character code point U+1FFFF",
83 (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
86 [ "non-character code point U+2FFFE",
87 (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
90 [ "non-character code point U+2FFFF",
91 (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
94 [ "non-character code point U+3FFFE",
95 (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
98 [ "non-character code point U+3FFFF",
99 (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
102 [ "non-character code point U+4FFFE",
105 : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
108 [ "non-character code point U+4FFFF",
111 : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
114 [ "non-character code point U+5FFFE",
117 : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
120 [ "non-character code point U+5FFFF",
123 : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
126 [ "non-character code point U+6FFFE",
129 : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
132 [ "non-character code point U+6FFFF",
135 : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
138 [ "non-character code point U+7FFFE",
141 : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
144 [ "non-character code point U+7FFFF",
147 : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
150 [ "non-character code point U+8FFFE",
153 : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
156 [ "non-character code point U+8FFFF",
159 : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
162 [ "non-character code point U+9FFFE",
165 : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
168 [ "non-character code point U+9FFFF",
171 : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
174 [ "non-character code point U+AFFFE",
177 : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
180 [ "non-character code point U+AFFFF",
183 : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
186 [ "non-character code point U+BFFFE",
189 : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
192 [ "non-character code point U+BFFFF",
195 : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
198 [ "non-character code point U+CFFFE",
201 : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
204 [ "non-character code point U+CFFFF",
207 : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
210 [ "non-character code point U+DFFFE",
213 : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
216 [ "non-character code point U+DFFFF",
219 : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
222 [ "non-character code point U+EFFFE",
225 : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
228 [ "non-character code point U+EFFFF",
231 : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
234 [ "non-character code point U+FFFFE",
237 : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
240 [ "non-character code point U+FFFFF",
243 : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
246 [ "non-character code point U+10FFFE",
249 : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
252 [ "non-character code point U+10FFFF",
255 : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
258 [ "first non_unicode",
261 : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
265 [ "non_unicode whose first byte tells that",
268 : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
269 (isASCII) ? 0x140000 : 0x200000,
272 [ "requires at least 32 bits",
274 ? "\xfe\x82\x80\x80\x80\x80\x80"
276 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
277 # This code point is chosen so that it is representable in a UV on
282 [ "highest 32 bit code point",
284 ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
286 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
290 [ "requires at least 32 bits, and use SUPER-type flags, instead of"
293 ? "\xfe\x82\x80\x80\x80\x80\x80"
295 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
304 [ "Lowest 33 bit code point: overflow",
305 "\xFE\x84\x80\x80\x80\x80\x80",
309 [ "overflow that old algorithm failed to detect",
310 "\xfe\x86\x80\x80\x80\x80\x80",
319 [ "More than 32 bits",
321 ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
323 "\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
329 [ "overflow that old algorithm failed to detect",
330 "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
336 push @tests, # These could falsely show wrongly in a naive
338 [ "requires at least 32 bits",
340 "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
344 [ "requires at least 32 bits",
346 "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
350 [ "requires at least 32 bits",
352 "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
356 [ "requires at least 32 bits",
358 "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
362 [ "requires at least 32 bits",
364 "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
368 [ "requires at least 32 bits",
370 "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
378 sub flags_to_text($$)
380 my ($flags, $flags_to_text_ref) = @_;
382 # Returns a string containing a mnemonic representation of the bits that
383 # are set in the $flags. These are assumed to be flag bits. The return
384 # looks like "FOO|BAR|BAZ". The second parameter is a reference to an
385 # array that gives the textual representation of all the possible flags.
386 # Element 0 is the text for the bit 0 flag; element 1 for bit 1; .... If
387 # no bits at all are set the string "0" is returned;
392 return "0" if $flags == 0;
395 #diag sprintf "%x", $flags;
397 push @flag_text, $flags_to_text_ref->[$shift];
403 return join "|", @flag_text;
406 # Possible flag returns from utf8n_to_uvchr_error(). These should have G_,
407 # instead of A_, D_, but the prefixes will be used in a a later commit, so
408 # minimize churn by having them here.
409 my @utf8n_flags_to_text = ( qw(
427 NO_CONFIDENCE_IN_CURLEN_
430 sub utf8n_display_call($)
432 # Converts an eval string that calls test_utf8n_to_uvchr into a more human
433 # readable form, and returns it. Doesn't work if the byte string contains
434 # an apostrophe. The return will look something like:
435 # test_utf8n_to_uvchr_error('$bytes', $length, $flags)
438 $_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x;
439 my $text1 = $1; # Everything before the byte string
441 my $text2 = $3; # Includes the length
445 . display_bytes($bytes)
447 . flags_to_text($flags, \@utf8n_flags_to_text)
451 sub uvchr_display_call($)
453 # Converts an eval string that calls test_uvchr_to_utf8 into a more human
454 # readable form, and returns it. The return will look something like:
455 # test_uvchr_to_utf8n_flags($uv, $flags)
458 my @flags_to_text = ( qw(
469 $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x;
471 my $cp = sprintf "%X", $2;
474 return "${text}0x$cp, " . flags_to_text($flags, \@flags_to_text) . ')';
477 sub do_warnings_test(@)
479 my @expected_warnings = @_;
481 # Compares the input expected warnings array with @warnings_gotten,
482 # generating a pass for each found, removing it from @warnings_gotten.
483 # Any discrepancies generate test failures. Returns TRUE if no
484 # discrepcancies; otherwise FALSE.
488 if (@expected_warnings == 0) {
489 if (! is(@warnings_gotten, 0, " Expected and got no warnings")) {
490 output_warnings(@warnings_gotten);
496 # Check that we got all the expected warnings,
497 # removing each one found
499 foreach my $expected (@expected_warnings) {
500 foreach (my $i = 0; $i < @warnings_gotten; $i++) {
501 if ($warnings_gotten[$i] =~ $expected) {
502 pass(" Expected and got warning: "
503 . " $warnings_gotten[$i]");
504 splice @warnings_gotten, $i, 1;
508 fail(" Expected a warning that matches "
509 . $expected . " but didn't get it");
513 if (! is(@warnings_gotten, 0, " Got no unexpected warnings")) {
514 output_warnings(@warnings_gotten);
521 # This test is split into this number of files.
522 my $num_test_files = $ENV{TEST_JOBS} || 1;
523 $num_test_files = 10 if $num_test_files > 10;
526 foreach my $test (@tests) {
528 next if $test_count % $num_test_files != $::TEST_CHUNK;
530 my ($testname, $bytes, $allowed_uv, $needed_to_discern_len) = @$test;
532 my $length = length $bytes;
533 my $will_overflow = $allowed_uv < 0;
535 my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv);
537 my $controlling_warning_category;
538 my $utf8n_flag_to_warn;
539 my $utf8n_flag_to_disallow;
540 my $uvchr_flag_to_warn;
541 my $uvchr_flag_to_disallow;
543 # We want to test that the independent flags are actually independent.
544 # For example, that a surrogate doesn't trigger a non-character warning,
545 # and conversely, turning off an above-Unicode flag doesn't suppress a
546 # surrogate warning. Earlier versions of this file used nested loops to
547 # test all possible combinations. But that creates lots of tests, making
548 # this run too long. What is now done instead is to use the complement of
549 # the category we are testing to greatly reduce the combinatorial
550 # explosion. For example, if we have a surrogate and we aren't expecting
551 # a warning about it, we set all the flags for non-surrogates to raise
552 # warnings. If one shows up, it indicates the flags aren't independent.
553 my $utf8n_flag_to_warn_complement;
554 my $utf8n_flag_to_disallow_complement;
555 my $uvchr_flag_to_warn_complement;
556 my $uvchr_flag_to_disallow_complement;
558 # Many of the code points being tested are middling in that if code point
559 # edge cases work, these are very likely to as well. Because this test
560 # file takes a while to execute, we skip testing the edge effects of code
561 # points deemed middling, while testing their basics and continuing to
562 # fully test the non-middling code points.
563 my $skip_most_tests = 0;
565 my $cp_message_qr; # Pattern that matches the message raised when
566 # that message contains the problematic code
567 # point. The message is the same (currently) both
568 # when going from/to utf8.
569 my $non_cp_trailing_text; # The suffix text when the message doesn't
570 # contain a code point. (This is a result of
571 # some sort of malformation that means we
572 # can't get an exact code poin
574 if ($will_overflow || $allowed_uv > 0x10FFFF) {
576 # Set the SUPER flags; later, we test for ABOVE_31_BIT as well.
577 $utf8n_flag_to_warn = $::UTF8_WARN_SUPER;
578 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER;
579 $uvchr_flag_to_warn = $::UNICODE_WARN_SUPER;
580 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;;
582 # Below, we add the flags for non-above-31 bit to the code points that
583 # don't fit that category. Special tests are done for this category
585 $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR
586 |$::UTF8_WARN_SURROGATE;
587 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
588 |$::UTF8_DISALLOW_SURROGATE;
589 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR
590 |$::UNICODE_WARN_SURROGATE;
591 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
592 |$::UNICODE_DISALLOW_SURROGATE;
593 $controlling_warning_category = 'non_unicode';
595 if ($will_overflow) { # This is realy a malformation
596 $non_cp_trailing_text = "if you see this, there is an error";
597 $cp_message_qr = qr/\Q$non_cp_trailing_text\E/;
599 elsif ($allowed_uv > 0x7FFFFFFF) {
600 $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
601 \Q and not portable\E/x;
602 $non_cp_trailing_text = "is for a non-Unicode code point, and is not portable";
605 $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
606 \Q may not be portable\E/x;
607 $non_cp_trailing_text = "is for a non-Unicode code point, may not"
609 $utf8n_flag_to_warn_complement |= $::UTF8_WARN_ABOVE_31_BIT;
610 $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_ABOVE_31_BIT;
611 $uvchr_flag_to_warn_complement |= $::UNICODE_WARN_ABOVE_31_BIT;
612 $uvchr_flag_to_disallow_complement
613 |= $::UNICODE_DISALLOW_ABOVE_31_BIT;
616 elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
617 $cp_message_qr = qr/UTF-16 surrogate U\+$uv_string/;
618 $non_cp_trailing_text = "is for a surrogate";
619 $needed_to_discern_len = 2 unless defined $needed_to_discern_len;
620 $skip_most_tests = 1 if $allowed_uv > 0xD800 && $allowed_uv < 0xDFFF;
622 $utf8n_flag_to_warn = $::UTF8_WARN_SURROGATE;
623 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SURROGATE;
624 $uvchr_flag_to_warn = $::UNICODE_WARN_SURROGATE;
625 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SURROGATE;;
627 $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR
629 |$::UTF8_WARN_ABOVE_31_BIT;
630 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
631 |$::UTF8_DISALLOW_SUPER
632 |$::UTF8_DISALLOW_ABOVE_31_BIT;
633 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR
634 |$::UNICODE_WARN_SUPER
635 |$::UNICODE_WARN_ABOVE_31_BIT;
636 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
637 |$::UNICODE_DISALLOW_SUPER
638 |$::UNICODE_DISALLOW_ABOVE_31_BIT;
639 $controlling_warning_category = 'surrogate';
641 elsif ( ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
642 || ($allowed_uv & 0xFFFE) == 0xFFFE)
644 $cp_message_qr = qr/\QUnicode non-character U+$uv_string\E
645 \Q is not recommended for open interchange\E/x;
646 $non_cp_trailing_text = "if you see this, there is an error";
647 $needed_to_discern_len = $length unless defined $needed_to_discern_len;
648 if ( ($allowed_uv > 0xFDD0 && $allowed_uv < 0xFDEF)
649 || ($allowed_uv > 0xFFFF && $allowed_uv < 0x10FFFE))
651 $skip_most_tests = 1;
654 $utf8n_flag_to_warn = $::UTF8_WARN_NONCHAR;
655 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_NONCHAR;
656 $uvchr_flag_to_warn = $::UNICODE_WARN_NONCHAR;
657 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_NONCHAR;;
659 $utf8n_flag_to_warn_complement = $::UTF8_WARN_SURROGATE
661 |$::UTF8_WARN_ABOVE_31_BIT;
662 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE
663 |$::UTF8_DISALLOW_SUPER
664 |$::UTF8_DISALLOW_ABOVE_31_BIT;
665 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_SURROGATE
666 |$::UNICODE_WARN_SUPER
667 |$::UNICODE_WARN_ABOVE_31_BIT;
668 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_SURROGATE
669 |$::UNICODE_DISALLOW_SUPER
670 |$::UNICODE_DISALLOW_ABOVE_31_BIT;
672 $controlling_warning_category = 'nonchar';
675 die "Can't figure out what type of warning to test for $testname"
678 die 'Didn\'t set $needed_to_discern_len for ' . $testname
679 unless defined $needed_to_discern_len;
680 { # First test the isFOO calls
681 use warnings; # Make sure these don't raise warnings
682 undef @warnings_gotten;
684 my $ret = test_isUTF8_CHAR($bytes, $length);
685 my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
686 if ($will_overflow) {
687 is($ret, 0, "For $testname: isUTF8_CHAR() returns 0");
688 is($ret_flags, 0, " And isUTF8_CHAR_flags() returns 0");
692 "For $testname: isUTF8_CHAR() returns expected length: $length");
693 is($ret_flags, $length, " And isUTF8_CHAR_flags(...,0)"
694 . " returns expected length: $length");
696 is(scalar @warnings_gotten, 0,
697 " And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags generated"
699 or output_warnings(@warnings_gotten);
701 undef @warnings_gotten;
702 $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
703 if ($will_overflow) {
704 is($ret, 0, " And isSTRICT_UTF8_CHAR() returns 0");
707 my $expected_ret = ( $testname =~ /surrogate|non-character/
708 || $allowed_uv > 0x10FFFF)
711 is($ret, $expected_ret, " And isSTRICT_UTF8_CHAR() returns"
712 . " expected length: $expected_ret");
713 $ret = test_isUTF8_CHAR_flags($bytes, $length,
714 $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
715 is($ret, $expected_ret,
716 " And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')"
717 . " acts like isSTRICT_UTF8_CHAR");
719 is(scalar @warnings_gotten, 0,
720 " And neither isSTRICT_UTF8_CHAR() nor isUTF8_CHAR_flags"
721 . " generated any warnings")
722 or output_warnings(@warnings_gotten);
724 undef @warnings_gotten;
725 $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
726 if ($will_overflow) {
727 is($ret, 0, " And isC9_STRICT_UTF8_CHAR() returns 0");
730 my $expected_ret = ( $testname =~ /surrogate/
731 || $allowed_uv > 0x10FFFF)
734 is($ret, $expected_ret, " And isC9_STRICT_UTF8_CHAR()"
735 ." returns expected length: $expected_ret");
736 $ret = test_isUTF8_CHAR_flags($bytes, $length,
737 $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
738 is($ret, $expected_ret,
739 " And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')"
740 . " acts like isC9_STRICT_UTF8_CHAR");
742 is(scalar @warnings_gotten, 0,
743 " And neither isC9_STRICT_UTF8_CHAR() nor isUTF8_CHAR_flags"
744 . " generated any warnings")
745 or output_warnings(@warnings_gotten);
747 # Test partial character handling, for each byte not a full character
748 for my $j (1.. $length - 1) {
750 # Skip the test for the interaction between overflow and above-31
751 # bit. It is really testing other things than the partial
752 # character tests, for which other tests in this file are
754 last if $will_overflow;
756 foreach my $disallow_flag (0, $utf8n_flag_to_disallow) {
757 my $partial = substr($bytes, 0, $j);
760 if ($disallow_flag) {
762 $comment = "disallowed";
763 if ($j < $needed_to_discern_len) {
765 $comment .= ", but need $needed_to_discern_len bytes"
771 $comment = "allowed";
774 undef @warnings_gotten;
776 $ret = test_is_utf8_valid_partial_char_flags($partial, $j,
778 is($ret, $ret_should_be,
779 " And is_utf8_valid_partial_char_flags("
780 . display_bytes($partial)
781 . "), $comment: returns $ret_should_be");
782 is(scalar @warnings_gotten, 0,
783 " And is_utf8_valid_partial_char_flags()"
784 . " generated no warnings")
785 or output_warnings(@warnings_gotten);
790 # Now test the to/from UTF-8 calls
791 # This is more complicated than the malformations tested in other files in
792 # this directory, as there are several orthogonal variables involved. We
793 # test most possible combinations
794 foreach my $do_disallow (0, 1) {
795 next if $skip_most_tests && ! $do_disallow;
797 # We try various combinations of malformations that can occur
798 foreach my $short ("", "short") {
799 next if $skip_most_tests && $short;
800 foreach my $unexpected_noncont ("", "unexpected non-continuation") {
801 next if $skip_most_tests && $unexpected_noncont;
802 foreach my $overlong ("", "overlong") {
803 next if $overlong && $skip_most_tests;
805 # If we're creating an overlong, it can't be longer than the
806 # maximum length, so skip if we're already at that length.
807 next if $overlong && $length >= $::max_bytes;
809 # We classify the warnings into certain "interesting" types,
811 foreach my $warning_type (0..4) {
812 next if $skip_most_tests && $warning_type != 1;
813 foreach my $use_warn_flag (0, 1) {
814 next if $skip_most_tests && ! $use_warn_flag;
816 # Finally, here is the inner loop
818 my $this_utf8n_flag_to_warn = $utf8n_flag_to_warn;
819 my $this_utf8n_flag_to_disallow = $utf8n_flag_to_disallow;
820 my $this_uvchr_flag_to_warn = $uvchr_flag_to_warn;
821 my $this_uvchr_flag_to_disallow = $uvchr_flag_to_disallow;
824 my $expect_regular_warnings;
825 my $expect_warnings_for_malformed;
826 my $expect_warnings_for_overflow;
828 if ($warning_type == 0) {
829 $eval_warn = "use warnings; no warnings 'deprecated'";
830 $expect_regular_warnings = $use_warn_flag;
831 $expect_warnings_for_overflow = 1;
832 $expect_warnings_for_malformed = 1;
834 elsif ($warning_type == 1) {
835 $eval_warn = "no warnings";
836 $expect_regular_warnings = 0;
837 $expect_warnings_for_overflow = 0;
838 $expect_warnings_for_malformed = 0;
840 elsif ($warning_type == 2) {
841 $eval_warn = "no warnings; use warnings 'utf8'";
842 $expect_regular_warnings = $use_warn_flag;
843 $expect_warnings_for_overflow = 1;
844 $expect_warnings_for_malformed = 1;
846 elsif ($warning_type == 3) {
847 $eval_warn = "no warnings; use warnings"
848 . " '$controlling_warning_category'";
849 $expect_regular_warnings = $use_warn_flag;
850 $expect_warnings_for_overflow
851 = $controlling_warning_category eq 'non_unicode';
852 $expect_warnings_for_malformed = 0;
854 elsif ($warning_type == 4) { # Like type 3, but uses the
856 # The complement flags were set up so that the
857 # above-31-bit flags have been tested that they don't
858 # trigger wrongly for too small code points. And the
859 # flags have been set up so that those small code
860 # points are tested for being above Unicode. What's
861 # left to test is that the large code points do
862 # trigger the above-31-bit flags.
863 next if ! $will_overflow && $allowed_uv < 0x80000000;
864 next if $controlling_warning_category ne 'non_unicode';
865 $eval_warn = "no warnings; use warnings 'non_unicode'";
866 $expect_regular_warnings = 1;
867 $expect_warnings_for_overflow = 1;
868 $expect_warnings_for_malformed = 0;
869 $this_utf8n_flag_to_warn = $::UTF8_WARN_ABOVE_31_BIT;
870 $this_utf8n_flag_to_disallow
871 = $::UTF8_DISALLOW_ABOVE_31_BIT;
872 $this_uvchr_flag_to_warn = $::UNICODE_WARN_ABOVE_31_BIT;
873 $this_uvchr_flag_to_disallow
874 = $::UNICODE_DISALLOW_ABOVE_31_BIT;
877 die "Unexpected warning type '$warning_type'";
880 # We only need to test the case where all warnings are
881 # enabled (type 0) to see if turning off the warning flag
882 # causes things to not be output. If those pass, then
883 # turning on some sub-category of warnings, or turning off
884 # warnings altogether are extremely likely to not output
885 # warnings either, given how the warnings subsystem is
886 # supposed to work, and this file assumes it does work.
887 next if $warning_type != 0 && ! $use_warn_flag;
889 # The convention is that the 'got' flag is the same value
890 # as the disallow one. If this were violated, the tests
891 # here should start failing.
892 my $return_flag = $this_utf8n_flag_to_disallow;
894 # If we aren't expecting warnings/disallow for this, turn
895 # on all the other flags. That makes sure that they all
896 # are independent of this flag, and so we don't need to
897 # test them individually.
898 my $this_warning_flags = ($use_warn_flag)
899 ? $this_utf8n_flag_to_warn
900 : $utf8n_flag_to_warn_complement;
901 my $this_disallow_flags = ($do_disallow)
902 ? $this_utf8n_flag_to_disallow
903 : $utf8n_flag_to_disallow_complement;
904 my $this_bytes = $bytes;
905 my $this_length = $length;
906 my $expected_uv = $allowed_uv;
907 my $this_expected_len = $length;
908 my $this_needed_to_discern_len = $needed_to_discern_len;
910 my @malformation_names;
911 my @expected_warnings;
912 my @expected_return_flags;
914 # Now go through the possible malformations wanted, and
915 # change the input accordingly. We also can set up
916 # certain other things now, like whether we expect a
917 # return flag from this malformation and which flag.
920 # To force this malformation, we convert the original
921 # start byte into a continuation byte with the same
922 # data bits as originally. ...
923 substr($this_bytes, 0, 1)
924 = start_byte_to_cont(substr($this_bytes,
927 # ... Then we prepend it with a known overlong
928 # sequence. This should evaluate to the exact same
929 # code point as the original.
931 = I8_to_native("\xff")
932 . (I8_to_native(chr $::lowest_continuation)
933 x ( $::max_bytes - 1 - length($this_bytes)))
935 $this_length = length($this_bytes);
936 $this_needed_to_discern_len
937 = $::max_bytes - ($this_expected_len
938 - $this_needed_to_discern_len);
939 $this_expected_len = $::max_bytes;
940 push @expected_return_flags, $::UTF8_GOT_LONG;
941 push @malformation_names, $overlong;
942 if ($expect_warnings_for_malformed) {
944 && ! $unexpected_noncont
948 = display_bytes_no_quotes($this_bytes);
950 = display_bytes_no_quotes($bytes);
951 push @expected_warnings,
952 qr/\QMalformed UTF-8 character:\E
953 \Q $overlong_bytes (overlong;\E
954 \Q instead use $correct_bytes to\E
955 \Q represent U+$uv_string)/x;
958 push @expected_warnings, qr/overlong/;
964 push @malformation_names, $short;
965 push @expected_warnings, qr/short/
966 if $expect_warnings_for_malformed;
968 # To force this malformation, just tell the test to
969 # not look as far as it should into the input.
971 $this_expected_len--;
972 push @expected_return_flags, $::UTF8_GOT_SHORT;
975 if ($unexpected_noncont) {
976 push @malformation_names, $unexpected_noncont;
977 push @expected_warnings, qr/$unexpected_noncont/
978 if $expect_warnings_for_malformed;
980 # To force this malformation, change the final
981 # continuation byte into a non continuation.
982 my $pos = ($short) ? -2 : -1;
983 substr($this_bytes, $pos, 1) = '?';
984 $this_expected_len--;
985 push @expected_return_flags,
986 $::UTF8_GOT_NON_CONTINUATION;
989 # The overflow malformation is done differently than other
990 # malformations. It comes from manually typed tests in
991 # the test array, but it also is above Unicode and uses
992 # Perl extended UTF-8, so affects some of the flags being
993 # tested. We now make it be treated like one of the other
994 # generated malformations.
995 if ($will_overflow) {
997 # An overflow is (way) above Unicode, and overrides
999 $expect_regular_warnings = 0;
1001 push @malformation_names, 'overflow';
1002 if ($expect_warnings_for_overflow) {
1003 my $qr = display_bytes_no_quotes(
1004 substr($this_bytes, 0, $this_expected_len));
1005 $qr = qr/\QMalformed UTF-8 character: \E
1006 \Q$qr (overflows)\E/x;
1007 push @expected_warnings, $qr;
1009 push @expected_return_flags, $::UTF8_GOT_OVERFLOW;
1012 # Here, we've set things up based on the malformations.
1013 # Now generate the text for them for the test name.
1014 my $malformations_name = "";
1015 if (@malformation_names) {
1016 $malformations_name .= "malformation";
1017 $malformations_name .= "s" if @malformation_names > 1;
1018 $malformations_name .= ": ";
1019 $malformations_name .= join "/", @malformation_names;
1020 $malformations_name = " ($malformations_name)";
1023 # It may be that the malformations have shortened the
1024 # amount of input we look at so much that we can't tell
1025 # what the category the code point was in. Otherwise, set
1026 # up the expected return flags based on the warnings and
1028 if ($this_expected_len < $this_needed_to_discern_len) {
1029 $expect_regular_warnings = 0;
1031 elsif ( ($this_warning_flags & $this_utf8n_flag_to_warn)
1032 || ( $this_disallow_flags
1033 & $this_utf8n_flag_to_disallow))
1035 push @expected_return_flags, $return_flag;
1038 # Finish setting up the expected warning.
1039 if ($expect_regular_warnings) {
1041 # So far the array contains warnings generated by
1042 # malformations. Add the expected regular one.
1043 unshift @expected_warnings, $cp_message_qr;
1045 # But it may need to be modified, because either of
1046 # these malformations means we can't determine the
1047 # expected code point.
1048 if ($short || $unexpected_noncont) {
1049 my $first_byte = substr($this_bytes, 0, 1);
1050 $expected_warnings[0] = display_bytes(
1051 substr($this_bytes, 0, $this_expected_len));
1052 $expected_warnings[0]
1053 = qr/[Aa]\Qny UTF-8 sequence that starts with\E
1054 \Q $expected_warnings[0]\E
1055 \Q $non_cp_trailing_text\E/x;
1059 # Is effectively disallowed if we've set up a
1060 # malformation, even if the flag indicates it is
1061 # allowed. Fix up test name to indicate this as
1063 my $disallowed = ( $this_disallow_flags
1064 & $this_utf8n_flag_to_disallow)
1065 || $malformations_name;
1066 my $this_name = "utf8n_to_uvchr_error() $testname: "
1070 $this_name .= ", $eval_warn";
1071 $this_name .= ", " . (( $this_warning_flags
1072 & $this_utf8n_flag_to_warn)
1073 ? 'with flag for raising warnings'
1074 : 'no flag for raising warnings');
1075 $this_name .= $malformations_name;
1077 # Do the actual test using an eval
1078 undef @warnings_gotten;
1080 my $this_flags = $this_warning_flags|$this_disallow_flags;
1081 my $eval_text = "$eval_warn; \$ret_ref"
1082 . " = test_utf8n_to_uvchr_error("
1083 . "'$this_bytes', $this_length, $this_flags)";
1085 if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
1087 diag "\$@='$@'; call was: "
1088 . utf8n_display_call($eval_text);
1092 is($ret_ref->[0], 0, " And returns 0")
1093 or diag "Call was: " . utf8n_display_call($eval_text);
1096 is($ret_ref->[0], $expected_uv,
1097 " And returns expected uv: "
1099 or diag "Call was: " . utf8n_display_call($eval_text);
1101 is($ret_ref->[1], $this_expected_len,
1102 " And returns expected length:"
1103 . " $this_expected_len")
1104 or diag "Call was: " . utf8n_display_call($eval_text);
1106 my $returned_flags = $ret_ref->[2];
1108 for (my $i = @expected_return_flags - 1; $i >= 0; $i--) {
1109 if ($expected_return_flags[$i] & $returned_flags) {
1110 if ($expected_return_flags[$i]
1111 == $::UTF8_DISALLOW_ABOVE_31_BIT)
1113 pass(" Expected and got return flag for"
1116 # The first entries in this are
1118 elsif ($i > @malformation_names - 1) {
1119 pass(" Expected and got return flag"
1120 . " for " . $controlling_warning_category);
1123 pass(" Expected and got return flag for "
1124 . $malformation_names[$i]
1127 $returned_flags &= ~$expected_return_flags[$i];
1128 splice @expected_return_flags, $i, 1;
1132 is($returned_flags, 0,
1133 " Got no unexpected return flags")
1134 or diag "The unexpected flags gotten were: "
1135 . (flags_to_text($returned_flags,
1136 \@utf8n_flags_to_text)
1137 # We strip off any prefixes from the flag
1139 =~ s/ \b [A-Z] _ //xgr);
1140 is (scalar @expected_return_flags, 0,
1141 " Got all expected return flags")
1142 or diag "The expected flags not gotten were: "
1143 . (flags_to_text(eval join("|",
1144 @expected_return_flags),
1145 \@utf8n_flags_to_text)
1146 # We strip off any prefixes from the flag
1148 =~ s/ \b [A-Z] _ //xgr);
1150 do_warnings_test(@expected_warnings)
1151 or diag "Call was: " . utf8n_display_call($eval_text);
1152 undef @warnings_gotten;
1154 # Check CHECK_ONLY results when the input is
1155 # disallowed. Do this when actually disallowed,
1156 # not just when the $this_disallow_flags is set
1158 my $this_flags = $this_disallow_flags|$::UTF8_CHECK_ONLY;
1159 my $eval_text = "use warnings; no warnings 'deprecated'; \$ret_ref ="
1160 . " test_utf8n_to_uvchr_error('"
1161 . "$this_bytes', $this_length,"
1164 if (! ok ("$@ eq ''",
1165 " And eval succeeded with CHECK_ONLY"))
1167 diag "\$@='$@'; Call was: "
1168 . utf8n_display_call($eval_text);
1171 is($ret_ref->[0], 0, " CHECK_ONLY: Returns 0")
1172 or diag "Call was: " . utf8n_display_call($eval_text);
1173 is($ret_ref->[1], -1,
1174 " CHECK_ONLY: returns -1 for length")
1175 or diag "Call was: " . utf8n_display_call($eval_text);
1176 if (! is(scalar @warnings_gotten, 0,
1177 " CHECK_ONLY: no warnings generated"))
1179 diag "Call was: " . utf8n_display_call($eval_text);
1180 output_warnings(@warnings_gotten);
1184 # Now repeat some of the above, but for
1185 # uvchr_to_utf8_flags(). Since this comes from an
1186 # existing code point, it hasn't overflowed, and isn't
1188 next if @malformation_names;
1190 $this_warning_flags = ($use_warn_flag)
1191 ? $this_uvchr_flag_to_warn
1193 $this_disallow_flags = ($do_disallow)
1194 ? $this_uvchr_flag_to_disallow
1197 $disallowed = $this_disallow_flags
1198 & $this_uvchr_flag_to_disallow;
1199 $this_name .= ", " . (( $this_warning_flags
1200 & $this_utf8n_flag_to_warn)
1201 ? 'with flag for raising warnings'
1202 : 'no flag for raising warnings');
1204 $this_name = "uvchr_to_utf8_flags() $testname: "
1208 $this_name .= ", $eval_warn";
1209 $this_name .= ", " . (( $this_warning_flags
1210 & $this_uvchr_flag_to_warn)
1211 ? 'with warning flag'
1212 : 'no warning flag');
1214 undef @warnings_gotten;
1216 $this_flags = $this_warning_flags|$this_disallow_flags;
1217 $eval_text = "$eval_warn; \$ret ="
1218 . " test_uvchr_to_utf8_flags("
1219 . "$allowed_uv, $this_flags)";
1221 if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
1223 diag "\$@='$@'; call was: "
1224 . uvchr_display_call($eval_text);
1228 is($ret, undef, " And returns undef")
1229 or diag "Call was: " . uvchr_display_call($eval_text);
1232 is($ret, $this_bytes, " And returns expected string")
1233 or diag "Call was: " . uvchr_display_call($eval_text);
1236 do_warnings_test(@expected_warnings)
1237 or diag "Call was: " . uvchr_display_call($eval_text);