utf8n_to_uvchr(): Properly handle extremely high code points

author Karl Williamson <khw@cpan.org>

Sat, 8 Jul 2017 20:54:28 +0000 (14:54 -0600)

committer Karl Williamson <khw@cpan.org>

Thu, 13 Jul 2017 03:14:26 +0000 (21:14 -0600)
author Karl Williamson <khw@cpan.org>
Sat, 8 Jul 2017 20:54:28 +0000 (14:54 -0600)
committer Karl Williamson <khw@cpan.org>
Thu, 13 Jul 2017 03:14:26 +0000 (21:14 -0600)
diff --git a/ext/XS-APItest/t/utf8_warn_base.pl b/ext/XS-APItest/t/utf8_warn_base.pl

index 4e09353..6f40c17 100644 (file)
--- a/ext/XS-APItest/t/utf8_warn_base.pl
+++ b/ext/XS-APItest/t/utf8_warn_base.pl
@@ -32,6 +32,9 @@ local $SIG{__WARN__} = sub { my @copy = @_;
  my $highest_non_extended_utf8_cp = (isASCII) ? 0x7FFFFFFF : 0x3FFFFFFF;
  my $native_lowest_continuation_chr = I8_to_native(chr $::lowest_continuation);
  
+# C5 is chosen as it is a valid start byte on both ASCII and EBCDIC platforms
+my $known_start_byte = I8_to_native("\xC5");
+
  sub requires_extended_utf8($) {
  
      # Returns a boolean as to whether or not the code point parameter fits
@@ -1000,9 +1003,9 @@ foreach my $test (@tests) {
              if ($unexpected_noncont) {
  
                  # To force this malformation, change the final continuation
-                # byte into a non continuation.
+                # byte into a start byte.
                  my $pos = ($short) ? -2 : -1;
-                substr($this_bytes, $pos, 1) = '?';
+                substr($this_bytes, $pos, 1) = $known_start_byte;
                  $this_expected_len--;
              }
  
diff --git a/t/op/utf8decode.t b/t/op/utf8decode.t

index b56c38b..1e9ea88 100644 (file)
--- a/t/op/utf8decode.t
+++ b/t/op/utf8decode.t
@@ -84,10 +84,17 @@ foreach (<DATA>) {
  
              my $message;
              my $after = "";
-            if ($expect64 != $expect32 && ! $is64bit) {
-                like($warnings[0], qr/overflow/, "overflow warning for $id seen");
-                shift @warnings;
-                $after .= "overflow";
+            if ($expect64 < $expect32 && ! $is64bit) {
+                if (       substr($octets, 0, 1) gt "\xfe"
+                    || (   substr($octets, 0, 1) eq "\xfe"
+                        && length $octets > 1
+                        && substr($octets, 1, 1) le "\xbf"
+                        && substr($octets, 1, 1) ge "\x80"))
+                {
+                    like($warnings[0], qr/overflow/, "overflow warning for $id seen");
+                    shift @warnings;
+                    $after .= "overflow";
+                }
              }
  
              # The data below assumes that if there is both a 'short' and
@@ -183,7 +190,7 @@ __DATA__
  3.5    Impossible bytes (but not with Perl's extended UTF-8)
  3.5.1 n -      1       fe      -       1 byte available, need 7
  3.5.2 N2,1 -   1       ff      -       1 byte available, need 13
-3.5.3 N11,8 -  4       fe:fe:ff:ff     -       byte 0xfe
+3.5.3 N9,7 -   4       fe:fe:ff:ff     -       byte 0xfe
  4      Overlong sequences
  4.1    Examples of an overlong ASCII character
  4.1.1 n -      2       c0:af   -       overlong
diff --git a/utf8.c b/utf8.c

index 4be3bb7..2472032 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1272,8 +1272,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
          uv = UNICODE_REPLACEMENT;
      }
  
-    /* Check for overflow. */
-    if (UNLIKELY(does_utf8_overflow(s0, send))) {
+    /* Check for overflow.  The algorithm requires us to not look past the end
+     * of the current character, even if partial, so the upper limit is 's' */
+    if (UNLIKELY(does_utf8_overflow(s0, s))) {
          possible_problems |= UTF8_GOT_OVERFLOW;
          uv = UNICODE_REPLACEMENT;
      }
@@ -1288,7 +1289,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
              && (   UNLIKELY(! UTF8_IS_START(*s0))
                  || (   curlen > 1
                      && UNLIKELY(is_utf8_overlong_given_start_byte_ok(s0,
-                                                                send - s0))))))
+                                                                s - s0))))))
      {
          possible_problems |= UTF8_GOT_LONG;
author	Karl Williamson <khw@cpan.org>
	Sat, 8 Jul 2017 20:54:28 +0000 (14:54 -0600)
committer	Karl Williamson <khw@cpan.org>
	Thu, 13 Jul 2017 03:14:26 +0000 (21:14 -0600)
ext/XS-APItest/t/utf8_warn_base.pl		patch | blob | blame | history
t/op/utf8decode.t		patch | blob | blame | history
utf8.c		patch | blob | blame | history