APItest: Add comprehensive UTF-8 validity tests
authorKarl Williamson <khw@cpan.org>
Mon, 2 Jul 2018 01:20:59 +0000 (19:20 -0600)
committerKarl Williamson <khw@cpan.org>
Thu, 5 Jul 2018 20:44:23 +0000 (14:44 -0600)
These test all combinations of bytes at all likely to have any issues.
They are run only when an environment variable is set to a particular
obscure value, as they take a long time.

ext/XS-APItest/t/utf8_warn_base.pl

index 0c9e20b..2359ca7 100644 (file)
@@ -699,6 +699,73 @@ sub do_warnings_test(@)
     return $succeeded;
 }
 
+my $min_cont = (isASCII) ? 0x80 : 0xA0;
+my $continuation_shift = (isASCII) ? 6 : 5;
+my $continuation_mask = (1 << $continuation_shift) - 1;
+
+sub isUTF8_CHAR($$) {   # Uses first principals to determine if this is legal
+                        # (Doesn't work if overflows)
+    my ($string, $length) = @_;
+
+    # Uses first principals to calculate if $string is legal
+
+    return 0 if $length <= 0;
+
+    my $first = ord substr($string, 0, 1);
+
+    # Invariant
+    return 1 if $length == 1 && $first < $min_cont;
+
+    return 0 if $first < 0xC0;  # Starts with continuation
+
+    # Calculate the number of leading 1 bits
+    my $utf8skip = 0;
+    my $bits = $first;
+    do {
+        $utf8skip++;
+        $bits = ($bits << 1) & 0xFF;
+    } while ($bits & 0x80);
+
+    return 0 if $utf8skip != $length;
+
+    # Acuumulate the $code point.  The remaining bits in the start byte count
+    # towards it
+    my $cp = $bits >> $utf8skip;
+
+    for my $i (1 .. $length - 1) {
+        my $ord = ord substr($string, $i, 1);
+
+        # Wrong if not a continuation
+        return 0 if $ord < $min_cont || $ord >= 0xC0;
+
+        $cp = ($cp << $continuation_shift)
+            | ($ord & $continuation_mask);
+    }
+
+    # If the calculated value can be expressed in fewer bytes than were passed
+    # in, is an illegal overlong.  XXX if 'chr' is not working properly, this
+    # may not be right
+    my $chr = chr $cp;
+    utf8::upgrade($chr);
+
+    use bytes;
+    return 0 if length $chr < $length;
+
+    return 1;
+}
+
+sub start_mark($) {
+    my $len = shift;
+    return 0xFF if $len >  7;
+    return (0xFF & (0xFE << (7 - $len)));
+}
+
+sub start_mask($) {
+    my $len = shift;
+    return 0 if $len >  7;
+    return 0x1F >> ($len - 2);
+}
+
 # This test is split into this number of files.
 my $num_test_files = $ENV{TEST_JOBS} || 1;
 $num_test_files = 10 if $num_test_files > 10;
@@ -707,6 +774,208 @@ $num_test_files = 10 if $num_test_files > 10;
 my $tested_CHECK_ONLY = 0;
 
 my $test_count = -1;
+
+# By setting this environment variable to this particular value, we test
+# essentially all combinations of potential UTF-8, so that can get a
+# comprehensive test of the decoding routine.  This test assumes the routine
+# that does the translation from code point to UTF-8 is working.  An assert
+# can be used in the routine to make sure that the dfa is working precisely
+# correctly, and any flaws in it aren't being masked by the remainder of the
+# function.
+if ($::TEST_CHUNK == 0
+&& $ENV{PERL_DEBUG_FULL_TEST}
+&& $ENV{PERL_DEBUG_FULL_TEST} == 97)
+{
+    my $min_cont_mask = $min_cont | 0xF;
+    my @bytes = (   0,  # Placeholder to signify to use an empty string ""
+                ord 'A',# We assume that all the invariant characters are
+                        # properly in the same class, so this is an exemplar
+                        # character
+                $min_cont .. 0xFF   # But test every non-invariant individually
+                );
+    my $shift = (isASCII) ? 6 : 5;
+    my $mark = $min_cont;
+    my $mask = (1 << $shift) - 1;
+    for my $byte1 (@bytes) {
+        for my $byte2 (@bytes) {
+            last if $byte2 && ! $byte1;      # Don't test empty preceding byte
+
+            last if $byte2 && $byte1 < 0xC0; # No need to test more than a
+                                             # single byte unless start byte
+                                             # indicates those.
+
+            for my $byte3 (@bytes) {
+                last if $byte3 && ! $byte2;
+                last if $byte3 && $byte1 < 0xE0;    # Only test 3 bytes for
+                                                    # 3-byte start byte
+
+                # If the preceding byte is a start byte, it should fail, and
+                # there is no need to test illegal bytes that follow.
+                # Instead, limit ourselves to just a few legal bytes that
+                # could follow.  This cuts down tremendously on the number of
+                # tests executed.
+                next if $byte2 >= 0xC0
+                     && $byte3 >= $min_cont
+                     && ($byte3 & $min_cont_mask) != $min_cont;
+
+                for my $byte4 (@bytes) {
+                    last if $byte4 && ! $byte3;
+                    last if $byte4 && $byte1 < 0xF0;  # Only test 4 bytes for
+                                                      # 4 byte strings
+
+                    # Like for byte 3, we limit things that come after a
+                    # mispositioned start-byte to just a few things that
+                    # otherwise would be legal
+                    next if ($byte2 >= 0xC0 || $byte3 >= 0xC0)
+                          && $byte4 >= $min_cont
+                          && ($byte4 & $min_cont_mask) != $min_cont;
+
+                    for my $byte5 (@bytes) {
+                        last if $byte5 && ! $byte4;
+                        last if $byte5 && $byte1 < 0xF8;  # Only test 5 bytes for
+                                                          # 5 byte strings
+
+                        # Like for byte 4, we limit things that come after a
+                        # mispositioned start-byte to just a few things that
+                        # otherwise would be legal
+                        next if (   $byte2 >= 0xC0
+                                 || $byte3 >= 0xC0
+                                 || $byte4 >= 0xC0)
+                              && $byte4 >= $min_cont
+                              && ($byte4 & $min_cont_mask) != $min_cont;
+
+                        my $string = "";
+                        $string .= chr $byte1 if $byte1;
+                        $string .= chr $byte2 if $byte2;
+                        $string .= chr $byte3 if $byte3;
+                        $string .= chr $byte4 if $byte4;
+                        $string .= chr $byte5 if $byte5;
+
+                        my $length = length $string;
+                        next unless $length;
+                        last if $byte1 >= ((isASCII) ? 0xF6 : 0xFA);
+
+                        my $native = I8_to_native($string);
+                        my $is_valid = isUTF8_CHAR($native, $length);
+                        my $got_valid = test_isUTF8_CHAR($native, $length);
+                        my $got_strict
+                                    = test_isSTRICT_UTF8_CHAR($native, $length);
+                        my $got_C9
+                                 = test_isC9_STRICT_UTF8_CHAR($native, $length);
+                        my $ret = test_utf8n_to_uvchr_msgs($native, $length,
+                                            $::UTF8_WARN_ILLEGAL_INTERCHANGE);
+                        my $is_strict = $is_valid;
+                        my $is_C9 = $is_valid;
+
+                        if ($is_valid) {
+
+                            # Here, is legal UTF-8.  Verify that it returned
+                            # the correct code point, and if so, that it
+                            # correctly classifies the result.
+                            my $cp = $ret->[0];
+
+                            my $should_be_string;
+                            if ($length == 1) {
+                                $should_be_string = chr $cp;
+                            }
+                            else {
+
+                                # Starting with the code point, use first
+                                # principals to find the equivalen UTF-8
+                                # string
+                                my @bytes;
+                                my $uv = $cp;
+                                for (my $i = $length - 1; $i > 0; $i--) {
+                                    $bytes[$i] = chr I8_to_native(($uv & $mask)
+                                                                  | $mark);
+                                    $uv >>= $shift;
+                                }
+                                $bytes[0] = chr I8_to_native((   $uv
+                                                        & start_mask($length))
+                                            | start_mark($length));
+                                $should_be_string = join "", @bytes;
+                            }
+
+                            # If the original string and the inverse are the
+                            # same, it worked.
+                            if (is($native, $should_be_string,
+                                    "utf8n_to_uvchr_msgs("
+                                 .  display_bytes($native)
+                                 . ") returns correct uv=0x"
+                                 . sprintf ("%x", $cp)))
+                            {
+                                my $is_surrogate = $cp >= 0xD800
+                                                && $cp <= 0xDFFF;
+                                my $got_surrogate
+                                    = ($ret->[2] & $::UTF8_GOT_SURROGATE) != 0;
+                                $is_strict = 0 if $is_surrogate;
+                                $is_C9 = 0 if $is_surrogate;
+
+                                my $is_super = $cp > 0x10FFFF;
+                                my $got_super
+                                        = ($ret->[2] & $::UTF8_GOT_SUPER) != 0;
+                                $is_strict = 0 if $is_super;
+                                $is_C9 = 0 if $is_super;
+
+                                my $is_nonchar = ! $is_super
+                                    && (   ($cp & 0xFFFE) == 0xFFFE
+                                        || ($cp >= 0xFDD0 && $cp <= 0xFDEF));
+                                my $got_nonchar
+                                      = ($ret->[2] & $::UTF8_GOT_NONCHAR) != 0;
+                                $is_strict = 0 if $is_nonchar;
+
+                                is($got_surrogate, $is_surrogate,
+                                    "    And correctly flagged it as"
+                                  . ((! $is_surrogate) ? " not" : "")
+                                  . " being a surrogate");
+                                is($got_super, $is_super,
+                                    "    And correctly flagged it as"
+                                  . ((! $is_super) ? " not" : "")
+                                  . " being above Unicode");
+                                is($got_nonchar, $is_nonchar,
+                                    "    And correctly flagged it as"
+                                  . ((! $is_nonchar) ? " not" : "")
+                                  . " being a non-char");
+                            }
+
+                            # This is how we exit the loop normally if things
+                            # are working.  The fail-safe code above is used
+                            # when they aren't.
+                            goto done if $cp > 0x140001;
+                        }
+                        else {
+                            is($ret->[0], 0, "utf8n_to_uvchr_msgs("
+                                            . display_bytes($native)
+                                            . ") correctly returns error");
+                            if (! ($ret->[2] & ($::UTF8_GOT_SHORT
+                                               |$::UTF8_GOT_NON_CONTINUATION
+                                               |$::UTF8_GOT_LONG)))
+                            {
+                                is($ret->[2] & ( $::UTF8_GOT_NONCHAR
+                                                |$::UTF8_GOT_SURROGATE
+                                                |$::UTF8_GOT_SUPER), 0,
+                                "    And isn't a surrogate, non-char, nor"
+                                . " above Unicode");
+                             }
+                        }
+
+                        is($got_valid == 0, $is_valid == 0,
+                           "    And isUTF8_CHAR() correctly returns "
+                         . (($got_valid == 0) ? "0" : "non-zero"));
+                        is($got_strict == 0, $is_strict == 0,
+                           "    And isSTRICT_UTF8_CHAR() correctly returns "
+                         . (($got_strict == 0) ? "0" : "non-zero"));
+                        is($got_C9 == 0, $is_C9 == 0,
+                           "    And isC9_UTF8_CHAR() correctly returns "
+                         . (($got_C9 == 0) ? "0" : "non-zero"));
+                    }
+                }
+            }
+        }
+    }
+  done:
+}
+
 foreach my $test (@tests) {
   $test_count++;
   next if $test_count % $num_test_files != $::TEST_CHUNK;