3 # This is a base file to be used by various .t's in its directory
4 # It tests various malformed UTF-8 sequences and some code points that are
5 # "problematic", and verifies that the correct warnings/flags etc are
6 # generated when using them. For the code points, it also takes the UTF-8 and
7 # perturbs it to be malformed in various ways, and tests that this gets
8 # appropriately detected.
14 use_ok('XS::APItest');
15 require 'charset_tools.pl';
16 require './t/utf8_setup.pl';
26 local $SIG{__WARN__} = sub { my @copy = @_;
27 push @warnings_gotten, map { chomp; $_ } @copy;
30 my $highest_non_extended_utf8_cp = (isASCII) ? 0x7FFFFFFF : 0x3FFFFFFF;
31 my $native_lowest_continuation_chr = I8_to_native(chr $::lowest_continuation);
33 # C5 is chosen as it is valid for both ASCII and EBCDIC platforms
34 my $known_start_byte = I8_to_native("\xC5");
36 sub requires_extended_utf8($) {
38 # Returns a boolean as to whether or not the code point parameter fits
39 # into 31 bits, subject to the convention that a negative code point
40 # stands for one that overflows the word size, so won't fit in 31 bits.
42 return shift > $highest_non_extended_utf8_cp;
45 sub is_extended_utf8($) {
47 # Returns a boolean as to whether or not the input UTF-8 sequence uses
48 # Perl extended UTF-8.
50 my $byte = substr(shift, 0, 1);
51 return ord $byte >= 0xFE if isASCII;
52 return $byte == I8_to_native("\xFF");
55 sub overflow_discern_len($) {
57 # Returns how many bytes are needed to tell if a non-overlong UTF-8
58 # sequence is for a code point that won't fit in the platform's word size.
59 # Only the length of the sequence representing a single code point is
63 return ($::is64bit) ? 3 : 1;
65 # Below is needed for code points above IV_MAX
66 #return ($::is64bit) ? 3 : ((shift == $::max_bytes)
71 return ($::is64bit) ? 2 : 8;
74 sub overlong_discern_len($) {
76 # Returns how many bytes are needed to tell if the input UTF-8 sequence
77 # for a code point is overlong
80 my $length = length $string;
81 my $byte = ord native_to_I8(substr($string, 0, 1));
83 return ($byte >= 0xFE)
86 : ($byte == 0xFF) ? 7 : 2)
87 : (($length == 2) ? 1 : 2);
88 # Below is needed for code points above IV_MAX
89 #return ($length == $::max_bytes)
90 # # This is constrained to 1 on 32-bit machines, as it
92 # ? (($::is64bit) ? 7 : 1)
93 # : (($length == 2) ? 1 : 2);
96 return ($length == $::max_bytes) ? 8 : (($length <= 3) ? 1 : 2);
101 no warnings qw(portable overflow);
104 # $bytes, UTF-8 string
105 # $allowed_uv, code point $bytes evaluates to; -1 if
107 # $needed_to_discern_len optional, how long an initial substring do
108 # we need to tell that the string must be for
109 # a code point in the category it falls in,
110 # like being a surrogate; 0 indicates we need
111 # the whole string. Some categories have a
112 # default that is used if this is omitted.
113 [ "orphan continuation byte malformation",
114 I8_to_native("$::I8c"),
118 [ "overlong malformation, lowest 2-byte",
119 (isASCII) ? "\xc0\x80" : I8_to_native("\xc0\xa0"),
122 [ "overlong malformation, highest 2-byte",
123 (isASCII) ? "\xc1\xbf" : I8_to_native("\xc4\xbf"),
124 (isASCII) ? 0x7F : 0xFF,
126 [ "overlong malformation, lowest 3-byte",
127 (isASCII) ? "\xe0\x80\x80" : I8_to_native("\xe0\xa0\xa0"),
130 [ "overlong malformation, highest 3-byte",
131 (isASCII) ? "\xe0\x9f\xbf" : I8_to_native("\xe0\xbf\xbf"),
132 (isASCII) ? 0x7FF : 0x3FF,
134 [ "lowest surrogate",
135 (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
138 [ "a middle surrogate",
139 (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
142 [ "highest surrogate",
143 (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
146 [ "first of 32 consecutive non-character code points",
147 (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
150 [ "a mid non-character code point of the 32 consecutive ones",
151 (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
154 [ "final of 32 consecutive non-character code points",
155 (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
158 [ "non-character code point U+FFFE",
159 (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
162 [ "non-character code point U+FFFF",
163 (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
166 [ "overlong malformation, lowest 4-byte",
167 (isASCII) ? "\xf0\x80\x80\x80" : I8_to_native("\xf0\xa0\xa0\xa0"),
170 [ "overlong malformation, highest 4-byte",
171 (isASCII) ? "\xf0\x8F\xbf\xbf" : I8_to_native("\xf0\xaf\xbf\xbf"),
172 (isASCII) ? 0xFFFF : 0x3FFF,
174 [ "non-character code point U+1FFFE",
175 (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
178 [ "non-character code point U+1FFFF",
179 (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
182 [ "non-character code point U+2FFFE",
183 (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
186 [ "non-character code point U+2FFFF",
187 (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
190 [ "non-character code point U+3FFFE",
191 (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
194 [ "non-character code point U+3FFFF",
195 (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
198 [ "non-character code point U+4FFFE",
201 : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
204 [ "non-character code point U+4FFFF",
207 : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
210 [ "non-character code point U+5FFFE",
213 : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
216 [ "non-character code point U+5FFFF",
219 : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
222 [ "non-character code point U+6FFFE",
225 : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
228 [ "non-character code point U+6FFFF",
231 : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
234 [ "non-character code point U+7FFFE",
237 : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
240 [ "non-character code point U+7FFFF",
243 : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
246 [ "non-character code point U+8FFFE",
249 : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
252 [ "non-character code point U+8FFFF",
255 : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
258 [ "non-character code point U+9FFFE",
261 : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
264 [ "non-character code point U+9FFFF",
267 : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
270 [ "non-character code point U+AFFFE",
273 : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
276 [ "non-character code point U+AFFFF",
279 : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
282 [ "non-character code point U+BFFFE",
285 : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
288 [ "non-character code point U+BFFFF",
291 : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
294 [ "non-character code point U+CFFFE",
297 : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
300 [ "non-character code point U+CFFFF",
303 : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
306 [ "non-character code point U+DFFFE",
309 : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
312 [ "non-character code point U+DFFFF",
315 : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
318 [ "non-character code point U+EFFFE",
321 : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
324 [ "non-character code point U+EFFFF",
327 : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
330 [ "non-character code point U+FFFFE",
333 : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
336 [ "non-character code point U+FFFFF",
339 : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
342 [ "non-character code point U+10FFFE",
345 : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
348 [ "non-character code point U+10FFFF",
351 : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
354 [ "first non_unicode",
357 : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
361 [ "non_unicode whose first byte tells that",
364 : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
365 (isASCII) ? 0x140000 : 0x200000,
368 [ "overlong malformation, lowest 5-byte",
370 ? "\xf8\x80\x80\x80\x80"
371 : I8_to_native("\xf8\xa0\xa0\xa0\xa0"),
374 [ "overlong malformation, highest 5-byte",
376 ? "\xf8\x87\xbf\xbf\xbf"
377 : I8_to_native("\xf8\xa7\xbf\xbf\xbf"),
378 (isASCII) ? 0x1FFFFF : 0x3FFFF,
380 [ "overlong malformation, lowest 6-byte",
382 ? "\xfc\x80\x80\x80\x80\x80"
383 : I8_to_native("\xfc\xa0\xa0\xa0\xa0\xa0"),
386 [ "overlong malformation, highest 6-byte",
388 ? "\xfc\x83\xbf\xbf\xbf\xbf"
389 : I8_to_native("\xfc\xa3\xbf\xbf\xbf\xbf"),
390 (isASCII) ? 0x3FFFFFF : 0x3FFFFF,
392 [ "overlong malformation, lowest 7-byte",
394 ? "\xfe\x80\x80\x80\x80\x80\x80"
395 : I8_to_native("\xfe\xa0\xa0\xa0\xa0\xa0\xa0"),
398 [ "overlong malformation, highest 7-byte",
400 ? "\xfe\x81\xbf\xbf\xbf\xbf\xbf"
401 : I8_to_native("\xfe\xa1\xbf\xbf\xbf\xbf\xbf"),
402 (isASCII) ? 0x7FFFFFFF : 0x3FFFFFF,
404 [ "highest 31 bit code point",
406 ? "\xfd\xbf\xbf\xbf\xbf\xbf"
408 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf"),
412 [ "lowest 32 bit code point",
414 ? "\xfe\x82\x80\x80\x80\x80\x80"
416 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
417 ($::is64bit) ? 0x80000000 : -1, # Overflows on 32-bit systems
420 # Used when UV_MAX is allowed as a code point
421 #[ "highest 32 bit code point",
423 # ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
425 # "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
428 #[ "Lowest 33 bit code point",
430 # ? "\xfe\x84\x80\x80\x80\x80\x80"
432 # "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
433 # ($::is64bit) ? 0x100000000 : 0x0, # Overflows on 32-bit systems
440 [ "overlong malformation, but naively looks like overflow",
441 "\xff\x80\x80\x80\x80\x80\x80\x81\xbf\xbf\xbf\xbf\xbf",
444 # Used when above IV_MAX are allowed.
445 #[ "overlong malformation, but naively looks like overflow",
446 # "\xff\x80\x80\x80\x80\x80\x80\x83\xbf\xbf\xbf\xbf\xbf",
449 [ "overflow that old algorithm failed to detect",
450 "\xfe\x86\x80\x80\x80\x80\x80",
457 [ "overlong malformation, lowest max-byte",
459 ? "\xff\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
461 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
464 [ "overlong malformation, highest max-byte",
465 (isASCII) # 2**36-1 on ASCII; 2**30-1 on EBCDIC
466 ? "\xff\x80\x80\x80\x80\x80\x80\xbf\xbf\xbf\xbf\xbf\xbf"
468 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xbf\xbf\xbf\xbf\xbf\xbf"),
469 (isASCII) ? (($::is64bit) ? 0xFFFFFFFFF : -1) : 0x3FFFFFFF,
474 [ "Lowest code point requiring 13 bytes to represent", # 2**36
475 "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
476 ($::is64bit) ? 0x1000000000 : -1, # overflows on 32bit
482 [ "highest 63 bit code point",
484 ? "\xff\x80\x87\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
486 "\xff\xa7\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"),
490 [ "first 64 bit code point",
492 ? "\xff\x80\x88\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
494 "\xff\xa8\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
497 # Used when UV_MAX is allowed as a code point
498 #[ "highest 64 bit code point",
500 # ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
502 # "\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"),
503 # 0xFFFFFFFFFFFFFFFF,
506 #[ "first 65 bit code point",
508 # ? "\xff\x80\x9f\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
510 # "\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
515 [ "overflow that old algorithm failed to detect",
516 "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
521 push @tests, # These could falsely show wrongly in a naive
523 [ "requires at least 32 bits",
525 "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
529 [ "requires at least 32 bits",
531 "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
534 [ "requires at least 32 bits",
536 "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
539 [ "requires at least 32 bits",
541 "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
544 [ "requires at least 32 bits",
546 "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
549 [ "requires at least 32 bits",
551 "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
558 sub flags_to_text($$)
560 my ($flags, $flags_to_text_ref) = @_;
562 # Returns a string containing a mnemonic representation of the bits that
563 # are set in the $flags. These are assumed to be flag bits. The return
564 # looks like "FOO|BAR|BAZ". The second parameter is a reference to an
565 # array that gives the textual representation of all the possible flags.
566 # Element 0 is the text for the bit 0 flag; element 1 for bit 1; .... If
567 # no bits at all are set the string "0" is returned;
572 return "0" if $flags == 0;
575 #diag sprintf "%x", $flags;
577 push @flag_text, $flags_to_text_ref->[$shift];
583 return join "|", @flag_text;
586 # Possible flag returns from utf8n_to_uvchr_error(). These should have G_,
587 # instead of A_, D_, but the prefixes will be used in a a later commit, so
588 # minimize churn by having them here.
589 my @utf8n_flags_to_text = ( qw(
607 NO_CONFIDENCE_IN_CURLEN_
610 sub utf8n_display_call($)
612 # Converts an eval string that calls test_utf8n_to_uvchr into a more human
613 # readable form, and returns it. Doesn't work if the byte string contains
614 # an apostrophe. The return will look something like:
615 # test_utf8n_to_uvchr_error('$bytes', $length, $flags)
618 $_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x;
619 my $text1 = $1; # Everything before the byte string
621 my $text2 = $3; # Includes the length
625 . display_bytes($bytes)
627 . flags_to_text($flags, \@utf8n_flags_to_text)
631 my @uvchr_flags_to_text = ( qw(
642 sub uvchr_display_call($)
644 # Converts an eval string that calls test_uvchr_to_utf8 into a more human
645 # readable form, and returns it. The return will look something like:
646 # test_uvchr_to_utf8n_flags($uv, $flags)
650 $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x;
652 my $cp = sprintf "%X", $2;
655 return "${text}0x$cp, " . flags_to_text($flags, \@uvchr_flags_to_text) . ')';
658 sub do_warnings_test(@)
660 my @expected_warnings = @_;
662 # Compares the input expected warnings array with @warnings_gotten,
663 # generating a pass for each found, removing it from @warnings_gotten.
664 # Any discrepancies generate test failures. Returns TRUE if no
665 # discrepcancies; otherwise FALSE.
669 if (@expected_warnings == 0) {
670 if (! is(@warnings_gotten, 0, " Expected and got no warnings")) {
671 output_warnings(@warnings_gotten);
677 # Check that we got all the expected warnings,
678 # removing each one found
680 foreach my $expected (@expected_warnings) {
681 foreach (my $i = 0; $i < @warnings_gotten; $i++) {
682 if ($warnings_gotten[$i] =~ $expected) {
683 pass(" Expected and got warning: "
684 . " $warnings_gotten[$i]");
685 splice @warnings_gotten, $i, 1;
689 fail(" Expected a warning that matches "
690 . $expected . " but didn't get it");
694 if (! is(@warnings_gotten, 0, " Got no unexpected warnings")) {
695 output_warnings(@warnings_gotten);
702 my $min_cont = (isASCII) ? 0x80 : 0xA0;
703 my $continuation_shift = (isASCII) ? 6 : 5;
704 my $continuation_mask = (1 << $continuation_shift) - 1;
706 sub isUTF8_CHAR($$) { # Uses first principals to determine if this is legal
707 # (Doesn't work if overflows)
708 my ($string, $length) = @_;
710 # Uses first principals to calculate if $string is legal
712 return 0 if $length <= 0;
714 my $first = ord substr($string, 0, 1);
717 return 1 if $length == 1 && $first < $min_cont;
719 return 0 if $first < 0xC0; # Starts with continuation
721 # Calculate the number of leading 1 bits
726 $bits = ($bits << 1) & 0xFF;
727 } while ($bits & 0x80);
729 return 0 if $utf8skip != $length;
731 # Acuumulate the $code point. The remaining bits in the start byte count
733 my $cp = $bits >> $utf8skip;
735 for my $i (1 .. $length - 1) {
736 my $ord = ord substr($string, $i, 1);
738 # Wrong if not a continuation
739 return 0 if $ord < $min_cont || $ord >= 0xC0;
741 $cp = ($cp << $continuation_shift)
742 | ($ord & $continuation_mask);
745 # If the calculated value can be expressed in fewer bytes than were passed
746 # in, is an illegal overlong. XXX if 'chr' is not working properly, this
752 return 0 if length $chr < $length;
759 return 0xFF if $len > 7;
760 return (0xFF & (0xFE << (7 - $len)));
765 return 0 if $len > 7;
766 return 0x1F >> ($len - 2);
769 # This test is split into this number of files.
770 my $num_test_files = $ENV{TEST_JOBS} || 1;
771 $num_test_files = 10 if $num_test_files > 10;
773 # We only really need to test utf8n_to_uvchr_msgs() once with this flag.
774 my $tested_CHECK_ONLY = 0;
778 # By setting this environment variable to this particular value, we test
779 # essentially all combinations of potential UTF-8, so that can get a
780 # comprehensive test of the decoding routine. This test assumes the routine
781 # that does the translation from code point to UTF-8 is working. An assert
782 # can be used in the routine to make sure that the dfa is working precisely
783 # correctly, and any flaws in it aren't being masked by the remainder of the
785 if ($::TEST_CHUNK == 0
786 && $ENV{PERL_DEBUG_FULL_TEST}
787 && $ENV{PERL_DEBUG_FULL_TEST} == 97)
789 my $min_cont_mask = $min_cont | 0xF;
790 my @bytes = ( 0, # Placeholder to signify to use an empty string ""
791 ord 'A',# We assume that all the invariant characters are
792 # properly in the same class, so this is an exemplar
794 $min_cont .. 0xFF # But test every non-invariant individually
796 my $shift = (isASCII) ? 6 : 5;
797 my $mark = $min_cont;
798 my $mask = (1 << $shift) - 1;
799 for my $byte1 (@bytes) {
800 for my $byte2 (@bytes) {
801 last if $byte2 && ! $byte1; # Don't test empty preceding byte
803 last if $byte2 && $byte1 < 0xC0; # No need to test more than a
804 # single byte unless start byte
807 for my $byte3 (@bytes) {
808 last if $byte3 && ! $byte2;
809 last if $byte3 && $byte1 < 0xE0; # Only test 3 bytes for
812 # If the preceding byte is a start byte, it should fail, and
813 # there is no need to test illegal bytes that follow.
814 # Instead, limit ourselves to just a few legal bytes that
815 # could follow. This cuts down tremendously on the number of
817 next if $byte2 >= 0xC0
818 && $byte3 >= $min_cont
819 && ($byte3 & $min_cont_mask) != $min_cont;
821 for my $byte4 (@bytes) {
822 last if $byte4 && ! $byte3;
823 last if $byte4 && $byte1 < 0xF0; # Only test 4 bytes for
826 # Like for byte 3, we limit things that come after a
827 # mispositioned start-byte to just a few things that
828 # otherwise would be legal
829 next if ($byte2 >= 0xC0 || $byte3 >= 0xC0)
830 && $byte4 >= $min_cont
831 && ($byte4 & $min_cont_mask) != $min_cont;
833 for my $byte5 (@bytes) {
834 last if $byte5 && ! $byte4;
835 last if $byte5 && $byte1 < 0xF8; # Only test 5 bytes for
838 # Like for byte 4, we limit things that come after a
839 # mispositioned start-byte to just a few things that
840 # otherwise would be legal
841 next if ( $byte2 >= 0xC0
844 && $byte4 >= $min_cont
845 && ($byte4 & $min_cont_mask) != $min_cont;
848 $string .= chr $byte1 if $byte1;
849 $string .= chr $byte2 if $byte2;
850 $string .= chr $byte3 if $byte3;
851 $string .= chr $byte4 if $byte4;
852 $string .= chr $byte5 if $byte5;
854 my $length = length $string;
856 last if $byte1 >= ((isASCII) ? 0xF6 : 0xFA);
858 my $native = I8_to_native($string);
859 my $is_valid = isUTF8_CHAR($native, $length);
860 my $got_valid = test_isUTF8_CHAR($native, $length);
862 = test_isSTRICT_UTF8_CHAR($native, $length);
864 = test_isC9_STRICT_UTF8_CHAR($native, $length);
865 my $ret = test_utf8n_to_uvchr_msgs($native, $length,
866 $::UTF8_WARN_ILLEGAL_INTERCHANGE);
867 my $is_strict = $is_valid;
868 my $is_C9 = $is_valid;
872 # Here, is legal UTF-8. Verify that it returned
873 # the correct code point, and if so, that it
874 # correctly classifies the result.
877 my $should_be_string;
879 $should_be_string = chr $cp;
883 # Starting with the code point, use first
884 # principals to find the equivalen UTF-8
888 for (my $i = $length - 1; $i > 0; $i--) {
889 $bytes[$i] = chr I8_to_native(($uv & $mask)
893 $bytes[0] = chr I8_to_native(( $uv
894 & start_mask($length))
895 | start_mark($length));
896 $should_be_string = join "", @bytes;
899 # If the original string and the inverse are the
901 if (is($native, $should_be_string,
902 "utf8n_to_uvchr_msgs("
903 . display_bytes($native)
904 . ") returns correct uv=0x"
905 . sprintf ("%x", $cp)))
907 my $is_surrogate = $cp >= 0xD800
910 = ($ret->[2] & $::UTF8_GOT_SURROGATE) != 0;
911 $is_strict = 0 if $is_surrogate;
912 $is_C9 = 0 if $is_surrogate;
914 my $is_super = $cp > 0x10FFFF;
916 = ($ret->[2] & $::UTF8_GOT_SUPER) != 0;
917 $is_strict = 0 if $is_super;
918 $is_C9 = 0 if $is_super;
920 my $is_nonchar = ! $is_super
921 && ( ($cp & 0xFFFE) == 0xFFFE
922 || ($cp >= 0xFDD0 && $cp <= 0xFDEF));
924 = ($ret->[2] & $::UTF8_GOT_NONCHAR) != 0;
925 $is_strict = 0 if $is_nonchar;
927 is($got_surrogate, $is_surrogate,
928 " And correctly flagged it as"
929 . ((! $is_surrogate) ? " not" : "")
930 . " being a surrogate");
931 is($got_super, $is_super,
932 " And correctly flagged it as"
933 . ((! $is_super) ? " not" : "")
934 . " being above Unicode");
935 is($got_nonchar, $is_nonchar,
936 " And correctly flagged it as"
937 . ((! $is_nonchar) ? " not" : "")
938 . " being a non-char");
941 # This is how we exit the loop normally if things
942 # are working. The fail-safe code above is used
944 goto done if $cp > 0x140001;
947 is($ret->[0], 0, "utf8n_to_uvchr_msgs("
948 . display_bytes($native)
949 . ") correctly returns error");
950 if (! ($ret->[2] & ($::UTF8_GOT_SHORT
951 |$::UTF8_GOT_NON_CONTINUATION
954 is($ret->[2] & ( $::UTF8_GOT_NONCHAR
955 |$::UTF8_GOT_SURROGATE
956 |$::UTF8_GOT_SUPER), 0,
957 " And isn't a surrogate, non-char, nor"
962 is($got_valid == 0, $is_valid == 0,
963 " And isUTF8_CHAR() correctly returns "
964 . (($got_valid == 0) ? "0" : "non-zero"));
965 is($got_strict == 0, $is_strict == 0,
966 " And isSTRICT_UTF8_CHAR() correctly returns "
967 . (($got_strict == 0) ? "0" : "non-zero"));
968 is($got_C9 == 0, $is_C9 == 0,
969 " And isC9_UTF8_CHAR() correctly returns "
970 . (($got_C9 == 0) ? "0" : "non-zero"));
979 foreach my $test (@tests) {
981 next if $test_count % $num_test_files != $::TEST_CHUNK;
983 my ($testname, $bytes, $allowed_uv, $needed_to_discern_len) = @$test;
985 my $length = length $bytes;
986 my $initially_overlong = $testname =~ /overlong/;
987 my $initially_orphan = $testname =~ /orphan/;
988 my $will_overflow = $allowed_uv < 0;
990 my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv);
991 my $display_bytes = display_bytes($bytes);
993 my $controlling_warning_category;
994 my $utf8n_flag_to_warn;
995 my $utf8n_flag_to_disallow;
996 my $uvchr_flag_to_warn;
997 my $uvchr_flag_to_disallow;
999 # We want to test that the independent flags are actually independent.
1000 # For example, that a surrogate doesn't trigger a non-character warning,
1001 # and conversely, turning off an above-Unicode flag doesn't suppress a
1002 # surrogate warning. Earlier versions of this file used nested loops to
1003 # test all possible combinations. But that creates lots of tests, making
1004 # this run too long. What is now done instead is to use the complement of
1005 # the category we are testing to greatly reduce the combinatorial
1006 # explosion. For example, if we have a surrogate and we aren't expecting
1007 # a warning about it, we set all the flags for non-surrogates to raise
1008 # warnings. If one shows up, it indicates the flags aren't independent.
1009 my $utf8n_flag_to_warn_complement;
1010 my $utf8n_flag_to_disallow_complement;
1011 my $uvchr_flag_to_warn_complement;
1012 my $uvchr_flag_to_disallow_complement;
1014 # Many of the code points being tested are middling in that if code point
1015 # edge cases work, these are very likely to as well. Because this test
1016 # file takes a while to execute, we skip testing the edge effects of code
1017 # points deemed middling, while testing their basics and continuing to
1018 # fully test the non-middling code points.
1019 my $skip_most_tests = 0;
1021 my $cp_message_qr; # Pattern that matches the message raised when
1022 # that message contains the problematic code
1023 # point. The message is the same (currently) both
1024 # when going from/to utf8.
1025 my $non_cp_trailing_text; # The suffix text when the message doesn't
1026 # contain a code point. (This is a result of
1027 # some sort of malformation that means we
1028 # can't get an exact code poin
1029 my $extended_cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
1030 \Q requires a Perl extension, and so is not\E
1032 my $extended_non_cp_trailing_text
1033 = "is a Perl extension, and so is not portable";
1035 # What bytes should have been used to specify a code point that has been
1036 # specified as an overlong.
1037 my $correct_bytes_for_overlong;
1039 # Is this test malformed from the beginning? If so, we know to generally
1040 # expect that the tests will show it isn't valid.
1041 my $initially_malformed = 0;
1043 if ($initially_overlong || $initially_orphan) {
1044 $non_cp_trailing_text = "if you see this, there is an error";
1045 $cp_message_qr = qr/\Q$non_cp_trailing_text\E/;
1046 $initially_malformed = 1;
1047 $utf8n_flag_to_warn = 0;
1048 $utf8n_flag_to_disallow = 0;
1050 $utf8n_flag_to_warn_complement = $::UTF8_WARN_SURROGATE;
1051 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE;
1052 if (! $will_overflow && $allowed_uv <= 0x10FFFF) {
1053 $utf8n_flag_to_warn_complement |= $::UTF8_WARN_SUPER;
1054 $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_SUPER;
1055 if (($allowed_uv & 0xFFFF) != 0xFFFF) {
1056 $utf8n_flag_to_warn_complement |= $::UTF8_WARN_NONCHAR;
1057 $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_NONCHAR;
1060 if (! is_extended_utf8($bytes)) {
1061 $utf8n_flag_to_warn_complement |= $::UTF8_WARN_PERL_EXTENDED;
1062 $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_PERL_EXTENDED;
1065 $controlling_warning_category = 'utf8';
1067 if ($initially_overlong) {
1068 if (! defined $needed_to_discern_len) {
1069 $needed_to_discern_len = overlong_discern_len($bytes);
1071 $correct_bytes_for_overlong = display_bytes_no_quotes(chr $allowed_uv);
1074 elsif($will_overflow || $allowed_uv > 0x10FFFF) {
1076 # Set the SUPER flags; later, we test for PERL_EXTENDED as well.
1077 $utf8n_flag_to_warn = $::UTF8_WARN_SUPER;
1078 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER;
1079 $uvchr_flag_to_warn = $::UNICODE_WARN_SUPER;
1080 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;;
1082 # Below, we add the flags for non-perl_extended to the code points
1083 # that don't fit that category. Special tests are done for this
1084 # category in the inner loop.
1085 $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR
1086 |$::UTF8_WARN_SURROGATE;
1087 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
1088 |$::UTF8_DISALLOW_SURROGATE;
1089 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR
1090 |$::UNICODE_WARN_SURROGATE;
1091 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
1092 |$::UNICODE_DISALLOW_SURROGATE;
1093 $controlling_warning_category = 'non_unicode';
1095 if ($will_overflow) { # This is realy a malformation
1096 $non_cp_trailing_text = "if you see this, there is an error";
1097 $cp_message_qr = qr/\Q$non_cp_trailing_text\E/;
1098 $initially_malformed = 1;
1099 if (! defined $needed_to_discern_len) {
1100 $needed_to_discern_len = overflow_discern_len($length);
1103 elsif (requires_extended_utf8($allowed_uv)) {
1104 $cp_message_qr = $extended_cp_message_qr;
1105 $non_cp_trailing_text = $extended_non_cp_trailing_text;
1106 $needed_to_discern_len = 1 unless defined $needed_to_discern_len;
1109 $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
1110 \Q may not be portable\E/x;
1111 $non_cp_trailing_text = "is for a non-Unicode code point, may not"
1113 $utf8n_flag_to_warn_complement |= $::UTF8_WARN_PERL_EXTENDED;
1114 $utf8n_flag_to_disallow_complement
1115 |= $::UTF8_DISALLOW_PERL_EXTENDED;
1116 $uvchr_flag_to_warn_complement |= $::UNICODE_WARN_PERL_EXTENDED;
1117 $uvchr_flag_to_disallow_complement
1118 |= $::UNICODE_DISALLOW_PERL_EXTENDED;
1121 elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
1122 $cp_message_qr = qr/UTF-16 surrogate U\+$uv_string/;
1123 $non_cp_trailing_text = "is for a surrogate";
1124 $needed_to_discern_len = 2 unless defined $needed_to_discern_len;
1125 $skip_most_tests = 1 if $allowed_uv > 0xD800 && $allowed_uv < 0xDFFF;
1127 $utf8n_flag_to_warn = $::UTF8_WARN_SURROGATE;
1128 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SURROGATE;
1129 $uvchr_flag_to_warn = $::UNICODE_WARN_SURROGATE;
1130 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SURROGATE;;
1132 $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR
1134 |$::UTF8_WARN_PERL_EXTENDED;
1135 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
1136 |$::UTF8_DISALLOW_SUPER
1137 |$::UTF8_DISALLOW_PERL_EXTENDED;
1138 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR
1139 |$::UNICODE_WARN_SUPER
1140 |$::UNICODE_WARN_PERL_EXTENDED;
1141 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
1142 |$::UNICODE_DISALLOW_SUPER
1143 |$::UNICODE_DISALLOW_PERL_EXTENDED;
1144 $controlling_warning_category = 'surrogate';
1146 elsif ( ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
1147 || ($allowed_uv & 0xFFFE) == 0xFFFE)
1149 $cp_message_qr = qr/\QUnicode non-character U+$uv_string\E
1150 \Q is not recommended for open interchange\E/x;
1151 $non_cp_trailing_text = "if you see this, there is an error";
1152 $needed_to_discern_len = $length unless defined $needed_to_discern_len;
1153 if ( ($allowed_uv > 0xFDD0 && $allowed_uv < 0xFDEF)
1154 || ($allowed_uv > 0xFFFF && $allowed_uv < 0x10FFFE))
1156 $skip_most_tests = 1;
1159 $utf8n_flag_to_warn = $::UTF8_WARN_NONCHAR;
1160 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_NONCHAR;
1161 $uvchr_flag_to_warn = $::UNICODE_WARN_NONCHAR;
1162 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_NONCHAR;;
1164 $utf8n_flag_to_warn_complement = $::UTF8_WARN_SURROGATE
1166 |$::UTF8_WARN_PERL_EXTENDED;
1167 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE
1168 |$::UTF8_DISALLOW_SUPER
1169 |$::UTF8_DISALLOW_PERL_EXTENDED;
1170 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_SURROGATE
1171 |$::UNICODE_WARN_SUPER
1172 |$::UNICODE_WARN_PERL_EXTENDED;
1173 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_SURROGATE
1174 |$::UNICODE_DISALLOW_SUPER
1175 |$::UNICODE_DISALLOW_PERL_EXTENDED;
1177 $controlling_warning_category = 'nonchar';
1180 die "Can't figure out what type of warning to test for $testname"
1183 die 'Didn\'t set $needed_to_discern_len for ' . $testname
1184 unless defined $needed_to_discern_len;
1186 # We try various combinations of malformations that can occur
1187 foreach my $short (0, 1) {
1188 next if $skip_most_tests && $short;
1189 foreach my $unexpected_noncont (0, 1) {
1190 next if $skip_most_tests && $unexpected_noncont;
1191 foreach my $overlong (0, 1) {
1192 next if $overlong && $skip_most_tests;
1193 next if $initially_overlong && ! $overlong;
1195 # If we're creating an overlong, it can't be longer than the
1196 # maximum length, so skip if we're already at that length.
1197 next if (! $initially_overlong && $overlong)
1198 && $length >= $::max_bytes;
1200 my $this_cp_message_qr = $cp_message_qr;
1201 my $this_non_cp_trailing_text = $non_cp_trailing_text;
1203 foreach my $malformed_allow_type (0..2) {
1204 # 0 don't allow this malformation; ignored if no malformation
1205 # 1 allow, with REPLACEMENT CHARACTER returned
1206 # 2 allow, with intended code point returned. All malformations
1207 # other than overlong can't determine the intended code point,
1208 # so this isn't valid for them.
1209 next if $malformed_allow_type == 2
1210 && ($will_overflow || $short || $unexpected_noncont);
1211 next if $skip_most_tests && $malformed_allow_type;
1213 # Here we are in the innermost loop for malformations. So we
1214 # know which ones are in effect. Can now change the input to be
1215 # appropriately malformed. We also can set up certain other
1216 # things now, like whether we expect a return flag from this
1217 # malformation, and which flag.
1219 my $this_bytes = $bytes;
1220 my $this_length = $length;
1221 my $this_expected_len = $length;
1222 my $this_needed_to_discern_len = $needed_to_discern_len;
1224 my @malformation_names;
1225 my @expected_malformation_warnings;
1226 my @expected_malformation_return_flags;
1228 # Contains the flags for any allowed malformations. Currently no
1229 # combinations of on/off are tested for. It's either all are
1230 # allowed, or none are.
1231 my $allow_flags = 0;
1232 my $overlong_is_in_perl_extended_utf8 = 0;
1233 my $dont_use_overlong_cp = 0;
1235 if ($initially_orphan) {
1236 next if $overlong || $short || $unexpected_noncont;
1240 if (! $initially_overlong) {
1241 my $new_expected_len;
1243 # To force this malformation, we convert the original start
1244 # byte into a continuation byte with the same data bits as
1246 my $start_byte = substr($this_bytes, 0, 1);
1247 my $converted_to_continuation_byte
1248 = start_byte_to_cont($start_byte);
1250 # ... Then we prepend it with a known overlong sequence.
1251 # This should evaluate to the exact same code point as the
1252 # original. We try to avoid an overlong using Perl
1253 # extended UTF-8. The code points are the highest
1254 # representable as overlongs on the respective platform
1255 # without using extended UTF-8.
1256 if (native_to_I8($start_byte) lt "\xFC") {
1257 $start_byte = I8_to_native("\xFC");
1258 $new_expected_len = 6;
1260 elsif (! isASCII && native_to_I8($start_byte) lt "\xFE") {
1262 # FE is not extended UTF-8 on EBCDIC
1263 $start_byte = I8_to_native("\xFE");
1264 $new_expected_len = 7;
1266 else { # Must use extended UTF-8. On ASCII platforms, we
1267 # could express some overlongs here starting with
1268 # \xFE, but there's no real reason to do so.
1269 $overlong_is_in_perl_extended_utf8 = 1;
1270 $start_byte = I8_to_native("\xFF");
1271 $new_expected_len = $::max_bytes;
1272 $this_cp_message_qr = $extended_cp_message_qr;
1274 # The warning that gets raised doesn't include the
1275 # code point in the message if the code point can be
1276 # expressed without using extended UTF-8, but the
1277 # particular overlong sequence used is in extended
1278 # UTF-8. To do otherwise would be confusing to the
1279 # user, as it would claim the code point requires
1280 # extended, when it doesn't.
1281 $dont_use_overlong_cp = 1
1282 unless requires_extended_utf8($allowed_uv);
1283 $this_non_cp_trailing_text
1284 = $extended_non_cp_trailing_text;
1287 # Splice in the revise continuation byte, preceded by the
1288 # start byte and the proper number of the lowest
1289 # continuation bytes.
1290 $this_bytes = $start_byte
1291 . ($native_lowest_continuation_chr
1292 x ( $new_expected_len
1294 - length($this_bytes)))
1295 . $converted_to_continuation_byte
1296 . substr($this_bytes, 1);
1297 $this_length = length($this_bytes);
1298 $this_needed_to_discern_len = $new_expected_len
1299 - ( $this_expected_len
1300 - $this_needed_to_discern_len);
1301 $this_expected_len = $new_expected_len;
1307 # To force this malformation, just tell the test to not look
1308 # as far as it should into the input.
1310 $this_expected_len--;
1312 $allow_flags |= $::UTF8_ALLOW_SHORT if $malformed_allow_type;
1315 if ($unexpected_noncont) {
1317 # To force this malformation, change the final continuation
1318 # byte into a start byte.
1319 my $pos = ($short) ? -2 : -1;
1320 substr($this_bytes, $pos, 1) = $known_start_byte;
1321 $this_expected_len--;
1324 # The whole point of a test that is malformed from the beginning
1325 # is to test for that malformation. If we've modified things so
1326 # much that we don't have enough information to detect that
1327 # malformation, there's no point in testing.
1328 next if $initially_malformed
1329 && $this_expected_len < $this_needed_to_discern_len;
1331 # Here, we've transformed the input with all of the desired
1332 # non-overflow malformations. We are now in a position to
1333 # construct any potential warnings for those malformations. But
1334 # it's a pain to get the detailed messages exactly right, so for
1335 # now XXX, only do so for those that return an explicit code
1338 if ($initially_orphan) {
1339 push @malformation_names, "orphan continuation";
1340 push @expected_malformation_return_flags,
1341 $::UTF8_GOT_CONTINUATION;
1342 $allow_flags |= $::UTF8_ALLOW_CONTINUATION
1343 if $malformed_allow_type;
1344 push @expected_malformation_warnings, qr/unexpected continuation/;
1348 push @malformation_names, 'overlong';
1349 push @expected_malformation_return_flags, $::UTF8_GOT_LONG;
1351 # If one of the other malformation types is also in effect, we
1352 # don't know what the intended code point was.
1353 if ($short || $unexpected_noncont || $will_overflow) {
1354 push @expected_malformation_warnings, qr/overlong/;
1357 my $wrong_bytes = display_bytes_no_quotes(
1358 substr($this_bytes, 0, $this_length));
1359 if (! defined $correct_bytes_for_overlong) {
1360 $correct_bytes_for_overlong
1361 = display_bytes_no_quotes($bytes);
1363 my $prefix = ( $allowed_uv > 0x10FFFF
1364 || ! isASCII && $allowed_uv < 256)
1367 push @expected_malformation_warnings,
1368 qr/\QMalformed UTF-8 character: $wrong_bytes\E
1369 \Q (overlong; instead use\E
1370 \Q $correct_bytes_for_overlong to\E
1371 \Q represent $prefix$uv_string)/x;
1374 if ($malformed_allow_type == 2) {
1375 $allow_flags |= $::UTF8_ALLOW_LONG_AND_ITS_VALUE;
1377 elsif ($malformed_allow_type) {
1378 $allow_flags |= $::UTF8_ALLOW_LONG;
1382 push @malformation_names, 'short';
1383 push @expected_malformation_return_flags, $::UTF8_GOT_SHORT;
1384 push @expected_malformation_warnings, qr/too short/;
1386 if ($unexpected_noncont) {
1387 push @malformation_names, 'unexpected non-continuation';
1388 push @expected_malformation_return_flags,
1389 $::UTF8_GOT_NON_CONTINUATION;
1390 $allow_flags |= $::UTF8_ALLOW_NON_CONTINUATION
1391 if $malformed_allow_type;
1392 push @expected_malformation_warnings,
1393 qr/unexpected non-continuation byte/;
1396 # The overflow malformation is done differently than other
1397 # malformations. It comes from manually typed tests in the test
1398 # array. We now make it be treated like one of the other
1399 # malformations. But some has to be deferred until the inner loop
1400 my $overflow_msg_pattern;
1401 if ($will_overflow) {
1402 push @malformation_names, 'overflow';
1404 $overflow_msg_pattern = display_bytes_no_quotes(
1405 substr($this_bytes, 0, $this_expected_len));
1406 $overflow_msg_pattern = qr/\QMalformed UTF-8 character:\E
1407 \Q $overflow_msg_pattern\E
1409 push @expected_malformation_return_flags, $::UTF8_GOT_OVERFLOW;
1410 $allow_flags |= $::UTF8_ALLOW_OVERFLOW if $malformed_allow_type;
1413 # And we can create the malformation-related text for the the test
1414 # names we eventually will generate.
1415 my $malformations_name = "";
1416 if (@malformation_names) {
1417 $malformations_name .= "dis" unless $malformed_allow_type;
1418 $malformations_name .= "allowed ";
1419 $malformations_name .= "malformation";
1420 $malformations_name .= "s" if @malformation_names > 1;
1421 $malformations_name .= ": ";
1422 $malformations_name .= join "/", @malformation_names;
1423 $malformations_name = " ($malformations_name)";
1426 # Done setting up the malformation related stuff
1428 { # First test the isFOO calls
1429 use warnings; # XXX no warnings 'deprecated'; # Make sure these don't raise warnings
1430 undef @warnings_gotten;
1432 my $ret = test_isUTF8_CHAR($this_bytes, $this_length);
1434 = test_isUTF8_CHAR_flags($this_bytes, $this_length, 0);
1435 if ($malformations_name) {
1436 is($ret, 0, "For $testname$malformations_name: isUTF8_CHAR() returns 0");
1437 is($ret_flags, 0, " And isUTF8_CHAR_flags() returns 0");
1440 is($ret, $this_length, "For $testname: isUTF8_CHAR() returns"
1441 . " expected length: $this_length");
1442 is($ret_flags, $this_length,
1443 " And isUTF8_CHAR_flags(...,0) returns expected"
1444 . " length: $this_length");
1446 is(scalar @warnings_gotten, 0,
1447 " And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags"
1448 . " generated any warnings")
1449 or output_warnings(@warnings_gotten);
1451 undef @warnings_gotten;
1452 $ret = test_isSTRICT_UTF8_CHAR($this_bytes, $this_length);
1453 if ($malformations_name) {
1454 is($ret, 0, " And isSTRICT_UTF8_CHAR() returns 0");
1458 = ( $testname =~ /surrogate|non-character/
1459 || $allowed_uv > 0x10FFFF)
1462 is($ret, $expected_ret,
1463 " And isSTRICT_UTF8_CHAR() returns expected"
1464 . " length: $expected_ret");
1465 $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length,
1466 $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
1467 is($ret, $expected_ret,
1468 " And isUTF8_CHAR_flags('"
1469 . "DISALLOW_ILLEGAL_INTERCHANGE') acts like"
1470 . " isSTRICT_UTF8_CHAR");
1472 is(scalar @warnings_gotten, 0,
1473 " And neither isSTRICT_UTF8_CHAR() nor"
1474 . " isUTF8_CHAR_flags generated any warnings")
1475 or output_warnings(@warnings_gotten);
1477 undef @warnings_gotten;
1478 $ret = test_isC9_STRICT_UTF8_CHAR($this_bytes, $this_length);
1479 if ($malformations_name) {
1480 is($ret, 0, " And isC9_STRICT_UTF8_CHAR() returns 0");
1483 my $expected_ret = ( $testname =~ /surrogate/
1484 || $allowed_uv > 0x10FFFF)
1486 : $this_expected_len;
1487 is($ret, $expected_ret, " And isC9_STRICT_UTF8_CHAR()"
1488 . " returns expected length:"
1489 . " $expected_ret");
1490 $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length,
1491 $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
1492 is($ret, $expected_ret,
1493 " And isUTF8_CHAR_flags('"
1494 . "DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like"
1495 . " isC9_STRICT_UTF8_CHAR");
1497 is(scalar @warnings_gotten, 0,
1498 " And neither isC9_STRICT_UTF8_CHAR() nor"
1499 . " isUTF8_CHAR_flags generated any warnings")
1500 or output_warnings(@warnings_gotten);
1502 foreach my $disallow_type (0..2) {
1503 # 0 is don't disallow this type of code point
1505 # 2 is do disallow, but only code points requiring
1506 # perl-extended-UTF8
1511 if ($malformations_name) {
1513 # Malformations are by default disallowed, so testing
1514 # with $disallow_type equal to 0 is sufficicient.
1515 next if $disallow_type;
1517 $disallow_flags = 0;
1520 elsif ($disallow_type == 1) {
1521 $disallow_flags = $utf8n_flag_to_disallow;
1524 elsif ($disallow_type == 2) {
1525 next if ! requires_extended_utf8($allowed_uv);
1526 $disallow_flags = $::UTF8_DISALLOW_PERL_EXTENDED;
1530 $disallow_flags = $utf8n_flag_to_disallow_complement;
1531 $expected_ret = $this_length;
1534 $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length,
1536 is($ret, $expected_ret,
1537 " And isUTF8_CHAR_flags($display_bytes,"
1538 . " $disallow_flags) returns $expected_ret")
1539 or diag "The flags mean "
1540 . flags_to_text($disallow_flags,
1541 \@utf8n_flags_to_text);
1542 is(scalar @warnings_gotten, 0,
1543 " And isUTF8_CHAR_flags(...) generated"
1545 or output_warnings(@warnings_gotten);
1547 # Test partial character handling, for each byte not a
1549 my $did_test_partial = 0;
1550 for (my $j = 1; $j < $this_length - 1; $j++) {
1551 $did_test_partial = 1;
1552 my $partial = substr($this_bytes, 0, $j);
1555 if ($disallow_type || $malformations_name) {
1557 $comment = "disallowed";
1559 # The number of bytes required to tell if a
1560 # sequence has something wrong is the smallest of
1561 # all the things wrong with it. We start with the
1562 # number for this type of code point, if that is
1563 # disallowed; or the whole length if not. The
1564 # latter is what a couple of the malformations
1566 my $needed_to_tell = ($disallow_type)
1567 ? $this_needed_to_discern_len
1568 : $this_expected_len;
1570 # Then we see if the malformations that are
1571 # detectable early in the string are present.
1573 my $dl = overlong_discern_len($this_bytes);
1574 $needed_to_tell = $dl if $dl < $needed_to_tell;
1576 if ($will_overflow) {
1577 my $dl = overflow_discern_len($length);
1578 $needed_to_tell = $dl if $dl < $needed_to_tell;
1581 if ($j < $needed_to_tell) {
1583 $comment .= ", but need $needed_to_tell"
1584 . " bytes to discern:";
1589 $comment = "allowed";
1592 undef @warnings_gotten;
1594 $ret = test_is_utf8_valid_partial_char_flags($partial,
1595 $j, $disallow_flags);
1596 is($ret, $ret_should_be,
1597 " And is_utf8_valid_partial_char_flags("
1598 . display_bytes($partial)
1599 . ", $disallow_flags), $comment: returns"
1600 . " $ret_should_be")
1601 or diag "The flags mean "
1602 . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
1605 if ($did_test_partial) {
1606 is(scalar @warnings_gotten, 0,
1607 " And is_utf8_valid_partial_char_flags()"
1608 . " generated no warnings for any of the lengths")
1609 or output_warnings(@warnings_gotten);
1614 # Now test the to/from UTF-8 calls. There are several orthogonal
1615 # variables involved. We test most possible combinations
1617 foreach my $do_disallow (0, 1) {
1619 next if $initially_overlong || $initially_orphan;
1622 next if $skip_most_tests;
1625 # This tests four functions: utf8n_to_uvchr_error,
1626 # utf8n_to_uvchr_msgs, uvchr_to_utf8_flags, and
1627 # uvchr_to_utf8_msgs. The first two are variants of each other,
1628 # and the final two also form a pair. We use a loop 'which_func'
1629 # to determine which of each pair is being tested. The main loop
1630 # tests either the first and third, or the 2nd and fourth.
1631 # which_func is sets whether we are expecting warnings or not in
1632 # certain places. The _msgs() version of the functions expects
1633 # warnings even if lexical ones are turned off, so by making its
1634 # which_func == 1, we can say we want warnings; whereas the other
1635 # one with the value 0, doesn't get them.
1636 for my $which_func (0, 1) {
1637 my $utf8_func = ($which_func)
1638 ? 'utf8n_to_uvchr_msgs'
1639 : 'utf8n_to_uvchr_error';
1641 # We classify the warnings into certain "interesting" types,
1643 foreach my $warning_type (0..4) {
1644 next if $skip_most_tests && $warning_type != 1;
1645 foreach my $use_warn_flag (0, 1) {
1646 if ($use_warn_flag) {
1647 next if $initially_overlong || $initially_orphan;
1649 # Since foo_msgs() expects warnings even when lexical
1650 # ones are turned off, we can skip testing it when
1651 # they are turned on, with little likelihood of
1652 # missing an error case.
1653 next if $which_func;
1656 next if $skip_most_tests;
1659 # Finally, here is the inner loop
1661 my $this_utf8n_flag_to_warn = $utf8n_flag_to_warn;
1662 my $this_utf8n_flag_to_disallow = $utf8n_flag_to_disallow;
1663 my $this_uvchr_flag_to_warn = $uvchr_flag_to_warn;
1664 my $this_uvchr_flag_to_disallow = $uvchr_flag_to_disallow;
1667 my $expect_regular_warnings;
1668 my $expect_warnings_for_malformed;
1669 my $expect_warnings_for_overflow;
1671 if ($warning_type == 0) {
1672 $eval_warn = "use warnings";
1673 $expect_regular_warnings = $use_warn_flag;
1675 # We ordinarily expect overflow warnings here. But it
1676 # is somewhat more complicated, and the final
1677 # determination is deferred to one place in the file
1678 # where we handle overflow.
1679 $expect_warnings_for_overflow = 1;
1681 # We would ordinarily expect malformed warnings in
1682 # this case, but not if malformations are allowed.
1683 $expect_warnings_for_malformed
1684 = $malformed_allow_type == 0;
1686 elsif ($warning_type == 1) {
1687 $eval_warn = "no warnings";
1688 $expect_regular_warnings = $which_func;
1689 $expect_warnings_for_overflow = $which_func;
1690 $expect_warnings_for_malformed = $which_func;
1692 elsif ($warning_type == 2) {
1693 $eval_warn = "no warnings; use warnings 'utf8'";
1694 $expect_regular_warnings = $use_warn_flag;
1695 $expect_warnings_for_overflow = 1;
1696 $expect_warnings_for_malformed
1697 = $malformed_allow_type == 0;
1699 elsif ($warning_type == 3) {
1700 $eval_warn = "no warnings; use warnings"
1701 . " '$controlling_warning_category'";
1702 $expect_regular_warnings = $use_warn_flag;
1703 $expect_warnings_for_overflow
1704 = $controlling_warning_category eq 'non_unicode';
1705 $expect_warnings_for_malformed = $which_func;
1707 elsif ($warning_type == 4) { # Like type 3, but uses the
1708 # PERL_EXTENDED flags
1709 # The complement flags were set up so that the
1710 # PERL_EXTENDED flags have been tested that they don't
1711 # trigger wrongly for too small code points. And the
1712 # flags have been set up so that those small code
1713 # points are tested for being above Unicode. What's
1714 # left to test is that the large code points do
1715 # trigger the PERL_EXTENDED flags.
1716 next if ! requires_extended_utf8($allowed_uv);
1717 next if $controlling_warning_category ne 'non_unicode';
1718 $eval_warn = "no warnings; use warnings 'non_unicode'";
1719 $expect_regular_warnings = 1;
1720 $expect_warnings_for_overflow = 1;
1721 $expect_warnings_for_malformed = 0;
1722 $this_utf8n_flag_to_warn = $::UTF8_WARN_PERL_EXTENDED;
1723 $this_utf8n_flag_to_disallow
1724 = $::UTF8_DISALLOW_PERL_EXTENDED;
1725 $this_uvchr_flag_to_warn
1726 = $::UNICODE_WARN_PERL_EXTENDED;
1727 $this_uvchr_flag_to_disallow
1728 = $::UNICODE_DISALLOW_PERL_EXTENDED;
1731 die "Unexpected warning type '$warning_type'";
1734 # We only need to test the case where all warnings are
1735 # enabled (type 0) to see if turning off the warning flag
1736 # causes things to not be output. If those pass, then
1737 # turning on some sub-category of warnings, or turning off
1738 # warnings altogether are extremely likely to not output
1739 # warnings either, given how the warnings subsystem is
1740 # supposed to work, and this file assumes it does work.
1741 next if $warning_type != 0 && ! $use_warn_flag;
1743 # The convention is that the 'got' flag is the same value
1744 # as the disallow one. If this were violated, the tests
1745 # here should start failing.
1746 my $return_flag = $this_utf8n_flag_to_disallow;
1748 # If we aren't expecting warnings/disallow for this, turn
1749 # on all the other flags. That makes sure that they all
1750 # are independent of this flag, and so we don't need to
1751 # test them individually.
1752 my $this_warning_flags
1754 ? $this_utf8n_flag_to_warn
1755 : ($overlong_is_in_perl_extended_utf8
1756 ? ($utf8n_flag_to_warn_complement
1757 & ~$::UTF8_WARN_PERL_EXTENDED)
1758 : $utf8n_flag_to_warn_complement);
1759 my $this_disallow_flags
1761 ? $this_utf8n_flag_to_disallow
1762 : ($overlong_is_in_perl_extended_utf8
1763 ? ($utf8n_flag_to_disallow_complement
1764 & ~$::UTF8_DISALLOW_PERL_EXTENDED)
1765 : $utf8n_flag_to_disallow_complement);
1766 my $expected_uv = $allowed_uv;
1767 my $this_uv_string = $uv_string;
1769 my @expected_return_flags
1770 = @expected_malformation_return_flags;
1771 my @expected_warnings;
1772 push @expected_warnings, @expected_malformation_warnings
1773 if $expect_warnings_for_malformed;
1775 # The overflow malformation is done differently than other
1776 # malformations. It comes from manually typed tests in
1777 # the test array, but it also is above Unicode and uses
1778 # Perl extended UTF-8, so affects some of the flags being
1779 # tested. We now make it be treated like one of the other
1780 # generated malformations.
1781 if ($will_overflow) {
1783 # An overflow is (way) above Unicode, and overrides
1785 $expect_regular_warnings = 0;
1787 # Earlier, we tentatively calculated whether this
1788 # should emit a message or not. It's tentative
1789 # because, even if we ordinarily would output it, we
1790 # don't if malformations are allowed -- except an
1791 # overflow is also a SUPER and PERL_EXTENDED, and if
1792 # warnings for those are enabled, the overflow
1793 # warning does get raised.
1794 if ( $expect_warnings_for_overflow
1795 && ( $malformed_allow_type == 0
1796 || ( $this_warning_flags
1797 & ($::UTF8_WARN_SUPER
1798 |$::UTF8_WARN_PERL_EXTENDED))))
1800 push @expected_warnings, $overflow_msg_pattern;
1804 # It may be that the malformations have shortened the
1805 # amount of input we look at so much that we can't tell
1806 # what the category the code point was in. Otherwise, set
1807 # up the expected return flags based on the warnings and
1809 if ($this_expected_len < $this_needed_to_discern_len) {
1810 $expect_regular_warnings = 0;
1812 elsif ( ($this_warning_flags & $this_utf8n_flag_to_warn)
1813 || ( $this_disallow_flags
1814 & $this_utf8n_flag_to_disallow))
1816 push @expected_return_flags, $return_flag;
1819 # Finish setting up the expected warning.
1820 if ($expect_regular_warnings) {
1822 # So far the array contains warnings generated by
1823 # malformations. Add the expected regular one.
1824 unshift @expected_warnings, $this_cp_message_qr;
1826 # But it may need to be modified, because either of
1827 # these malformations means we can't determine the
1828 # expected code point.
1829 if ( $short || $unexpected_noncont
1830 || $dont_use_overlong_cp)
1832 my $first_byte = substr($this_bytes, 0, 1);
1833 $expected_warnings[0] = display_bytes(
1834 substr($this_bytes, 0, $this_expected_len));
1835 $expected_warnings[0]
1836 = qr/[Aa]\Qny UTF-8 sequence that starts with\E
1837 \Q $expected_warnings[0]\E
1838 \Q $this_non_cp_trailing_text\E/x;
1842 # Is effectively disallowed if we've set up a malformation
1843 # (unless malformations are allowed), even if the flag
1844 # indicates it is allowed. Fix up test name to indicate
1847 if ( $this_disallow_flags & $this_utf8n_flag_to_disallow
1848 && $this_expected_len >= $this_needed_to_discern_len)
1852 if ($malformations_name) {
1853 if ($malformed_allow_type == 0) {
1856 elsif ($malformed_allow_type == 1) {
1858 # Even if allowed, the malformation returns the
1859 # REPLACEMENT CHARACTER.
1860 $expected_uv = 0xFFFD;
1861 $this_uv_string = "0xFFFD"
1865 my $this_name = "$utf8_func() $testname: ";
1866 my @scratch_expected_return_flags = @expected_return_flags;
1867 if (! $initially_malformed) {
1868 $this_name .= ($disallowed)
1872 $this_name .= "$eval_warn";
1873 $this_name .= ", " . (( $this_warning_flags
1874 & $this_utf8n_flag_to_warn)
1875 ? 'with flag for raising warnings'
1876 : 'no flag for raising warnings');
1877 $this_name .= $malformations_name;
1879 # Do the actual test using an eval
1880 undef @warnings_gotten;
1883 = $allow_flags|$this_warning_flags|$this_disallow_flags;
1884 my $eval_text = "$eval_warn; \$ret_ref"
1885 . " = test_$utf8_func("
1886 . "'$this_bytes', $this_length, $this_flags)";
1888 if (! ok ($@ eq "", "$this_name: eval succeeded"))
1890 diag "\$@='$@'; call was: "
1891 . utf8n_display_call($eval_text);
1896 is($ret_ref->[0], 0, " And returns 0")
1897 or diag "Call was: " . utf8n_display_call($eval_text);
1900 is($ret_ref->[0], $expected_uv,
1901 " And returns expected uv: "
1903 or diag "Call was: " . utf8n_display_call($eval_text);
1905 is($ret_ref->[1], $this_expected_len,
1906 " And returns expected length:"
1907 . " $this_expected_len")
1908 or diag "Call was: " . utf8n_display_call($eval_text);
1910 my $returned_flags = $ret_ref->[2];
1912 for (my $i = @scratch_expected_return_flags - 1;
1916 if ($scratch_expected_return_flags[$i] & $returned_flags)
1918 if ($scratch_expected_return_flags[$i]
1919 == $::UTF8_GOT_PERL_EXTENDED)
1921 pass(" Expected and got return flag for"
1922 . " PERL_EXTENDED");
1924 # The first entries in this are
1926 elsif ($i > @malformation_names - 1) {
1927 pass(" Expected and got return flag"
1928 . " for " . $controlling_warning_category);
1931 pass(" Expected and got return flag for "
1932 . $malformation_names[$i]
1936 &= ~$scratch_expected_return_flags[$i];
1937 splice @scratch_expected_return_flags, $i, 1;
1941 if (! is($returned_flags, 0,
1942 " Got no unexpected return flags"))
1944 diag "The unexpected flags gotten were: "
1945 . (flags_to_text($returned_flags,
1946 \@utf8n_flags_to_text)
1947 # We strip off any prefixes from the flag
1949 =~ s/ \b [A-Z] _ //xgr);
1950 diag "Call was: " . utf8n_display_call($eval_text);
1953 if (! is (scalar @scratch_expected_return_flags, 0,
1954 " Got all expected return flags"))
1956 diag "The expected flags not gotten were: "
1957 . (flags_to_text(eval join("|",
1958 @scratch_expected_return_flags),
1959 \@utf8n_flags_to_text)
1960 # We strip off any prefixes from the flag
1962 =~ s/ \b [A-Z] _ //xgr);
1963 diag "Call was: " . utf8n_display_call($eval_text);
1967 my @returned_warnings;
1968 for my $element_ref (@{$ret_ref->[3]}) {
1969 push @returned_warnings, $element_ref->{'text'};
1970 my $text = $element_ref->{'text'};
1971 my $flag = $element_ref->{'flag_bit'};
1972 my $category = $element_ref->{'warning_category'};
1974 if (! ok(($flag & ($flag-1)) == 0,
1975 "flag for returned msg is a single bit"))
1977 diag sprintf("flags are %x; msg=%s", $flag, $text);
1980 if (grep { $_ == $flag } @expected_return_flags) {
1981 pass("flag for returned msg is expected");
1985 . flags_to_text($flag, \@utf8n_flags_to_text)
1986 . ") for returned msg is expected");
1990 # In perl space, don't know the category numbers
1992 "returned category for msg isn't 0");
1995 ok(@warnings_gotten == 0, "$utf8_func raised no warnings;"
1996 . " the next tests are for ones in the returned"
1998 or diag join "\n", "The unexpected warnings were:",
2000 @warnings_gotten = @returned_warnings;
2003 do_warnings_test(@expected_warnings)
2004 or diag "Call was: " . utf8n_display_call($eval_text);
2005 undef @warnings_gotten;
2007 # Check CHECK_ONLY results when the input is
2008 # disallowed. Do this when actually disallowed,
2009 # not just when the $this_disallow_flags is set. We only
2010 # test once utf8n_to_uvchr_msgs() with this.
2012 && ($which_func == 0 || ! $tested_CHECK_ONLY))
2014 $tested_CHECK_ONLY = 1;
2015 my $this_flags = $this_disallow_flags|$::UTF8_CHECK_ONLY;
2016 my $eval_text = "use warnings; \$ret_ref ="
2017 . " test_$utf8_func('"
2018 . "$this_bytes', $this_length,"
2022 " And eval succeeded with CHECK_ONLY"))
2024 diag "\$@='$@'; Call was: "
2025 . utf8n_display_call($eval_text);
2028 is($ret_ref->[0], 0, " CHECK_ONLY: Returns 0")
2029 or diag "Call was: " . utf8n_display_call($eval_text);
2030 is($ret_ref->[1], -1,
2031 " CHECK_ONLY: returns -1 for length")
2032 or diag "Call was: " . utf8n_display_call($eval_text);
2033 if (! is(scalar @warnings_gotten, 0,
2034 " CHECK_ONLY: no warnings generated"))
2036 diag "Call was: " . utf8n_display_call($eval_text);
2037 output_warnings(@warnings_gotten);
2041 # Now repeat some of the above, but for
2042 # uvchr_to_utf8_flags(). Since this comes from an
2043 # existing code point, it hasn't overflowed, and isn't
2045 next if @malformation_names;
2047 my $uvchr_func = ($which_func)
2048 ? 'uvchr_to_utf8_flags_msgs'
2049 : 'uvchr_to_utf8_flags';
2051 $this_warning_flags = ($use_warn_flag)
2052 ? $this_uvchr_flag_to_warn
2054 $this_disallow_flags = ($do_disallow)
2055 ? $this_uvchr_flag_to_disallow
2058 $disallowed = $this_disallow_flags
2059 & $this_uvchr_flag_to_disallow;
2060 $this_name .= ", " . (( $this_warning_flags
2061 & $this_utf8n_flag_to_warn)
2062 ? 'with flag for raising warnings'
2063 : 'no flag for raising warnings');
2065 $this_name = "$uvchr_func() $testname: "
2069 $this_name .= ", $eval_warn";
2070 $this_name .= ", " . (( $this_warning_flags
2071 & $this_uvchr_flag_to_warn)
2072 ? 'with warning flag'
2073 : 'no warning flag');
2075 undef @warnings_gotten;
2077 $this_flags = $this_warning_flags|$this_disallow_flags;
2078 $eval_text = "$eval_warn; \$ret ="
2079 . " test_$uvchr_func("
2080 . "$allowed_uv, $this_flags)";
2082 if (! ok ($@ eq "", "$this_name: eval succeeded"))
2084 diag "\$@='$@'; call was: "
2085 . uvchr_display_call($eval_text);
2090 if (defined $ret->[1]) {
2091 my @returned_warnings;
2092 push @returned_warnings, $ret->[1]{'text'};
2093 my $text = $ret->[1]{'text'};
2094 my $flag = $ret->[1]{'flag_bit'};
2095 my $category = $ret->[1]{'warning_category'};
2097 if (! ok(($flag & ($flag-1)) == 0,
2098 "flag for returned msg is a single bit"))
2100 diag sprintf("flags are %x; msg=%s", $flag, $text);
2103 if ($flag & $this_uvchr_flag_to_disallow) {
2104 pass("flag for returned msg is expected");
2108 . flags_to_text($flag, \@utf8n_flags_to_text)
2109 . ") for returned msg is expected");
2113 # In perl space, don't know the category numbers
2115 "returned category for msg isn't 0");
2117 ok(@warnings_gotten == 0, "$uvchr_func raised no warnings;"
2118 . " the next tests are for ones in the returned"
2120 or diag join "\n", "The unexpected warnings were:",
2122 @warnings_gotten = @returned_warnings;
2129 is($ret, undef, " And returns undef")
2130 or diag "Call was: " . uvchr_display_call($eval_text);
2133 is($ret, $this_bytes, " And returns expected string")
2134 or diag "Call was: " . uvchr_display_call($eval_text);
2137 do_warnings_test(@expected_warnings)
2138 or diag "Call was: " . uvchr_display_call($eval_text);