This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
utf8n_to_uvchr(): Properly handle extremely high code points
author Karl Williamson <khw@cpan.org>
Sat, 8 Jul 2017 20:54:28 +0000 (14:54 -0600)
committer Karl Williamson <khw@cpan.org>
Thu, 13 Jul 2017 03:14:26 +0000 (21:14 -0600)
It turns out that utf8n_to_uvchr() could incorrectly deem a sequence to
be overflowing or overlong.  This commit fixes that and changes the test
to catch this possibility.  Because the bug is fixed, the function now
correctly detects, on 32-bit systems, that when the start byte is FE a
continuation byte is needed to determine whether the result overflows.

ext/XS-APItest/t/utf8_warn_base.pl
t/op/utf8decode.t
utf8.c

index 4e09353..6f40c17 100644 (file)
@@ -32,6 +32,9 @@ local $SIG{__WARN__} = sub { my @copy = @_;
 my $highest_non_extended_utf8_cp = (isASCII) ? 0x7FFFFFFF : 0x3FFFFFFF;
 my $native_lowest_continuation_chr = I8_to_native(chr $::lowest_continuation);
 
+# C5 is chosen as it is valid for both ASCII and EBCDIC platforms
+my $known_start_byte = I8_to_native("\xC5");
+
 sub requires_extended_utf8($) {
 
     # Returns a boolean as to whether or not the code point parameter fits
@@ -1000,9 +1003,9 @@ foreach my $test (@tests) {
             if ($unexpected_noncont) {
 
                 # To force this malformation, change the final continuation
-                # byte into a non continuation.
+                # byte into a start byte.
                 my $pos = ($short) ? -2 : -1;
-                substr($this_bytes, $pos, 1) = '?';
+                substr($this_bytes, $pos, 1) = $known_start_byte;
                 $this_expected_len--;
             }
 
index b56c38b..1e9ea88 100644 (file)
@@ -84,10 +84,17 @@ foreach (<DATA>) {
 
             my $message;
             my $after = "";
-            if ($expect64 != $expect32 && ! $is64bit) {
-                like($warnings[0], qr/overflow/, "overflow warning for $id seen");
-                shift @warnings;
-                $after .= "overflow";
+            if ($expect64 < $expect32 && ! $is64bit) {
+                if (       substr($octets, 0, 1) gt "\xfe"
+                    || (   substr($octets, 0, 1) eq "\xfe"
+                        && length $octets > 1
+                        && substr($octets, 1, 1) le "\xbf"
+                        && substr($octets, 1, 1) ge "\x80"))
+                {
+                    like($warnings[0], qr/overflow/, "overflow warning for $id seen");
+                    shift @warnings;
+                    $after .= "overflow";
+                }
             }
 
             # The data below assumes that if there is both a 'short' and
@@ -183,7 +190,7 @@ __DATA__
 3.5    Impossible bytes (but not with Perl's extended UTF-8)
 3.5.1 n -      1       fe      -       1 byte available, need 7
 3.5.2 N2,1 -   1       ff      -       1 byte available, need 13
-3.5.3 N11,8 -  4       fe:fe:ff:ff     -       byte 0xfe
+3.5.3 N9,7 -   4       fe:fe:ff:ff     -       byte 0xfe
 4      Overlong sequences
 4.1    Examples of an overlong ASCII character
 4.1.1 n -      2       c0:af   -       overlong
diff --git a/utf8.c b/utf8.c
index 4be3bb7..2472032 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1272,8 +1272,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
         uv = UNICODE_REPLACEMENT;
     }
 
-    /* Check for overflow. */
-    if (UNLIKELY(does_utf8_overflow(s0, send))) {
+    /* Check for overflow.  The algorithm requires us to not look past the end
+     * of the current character, even if partial, so the upper limit is 's' */
+    if (UNLIKELY(does_utf8_overflow(s0, s))) {
         possible_problems |= UTF8_GOT_OVERFLOW;
         uv = UNICODE_REPLACEMENT;
     }
@@ -1288,7 +1289,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
             && (   UNLIKELY(! UTF8_IS_START(*s0))
                 || (   curlen > 1
                     && UNLIKELY(is_utf8_overlong_given_start_byte_ok(s0,
-                                                                send - s0))))))
+                                                                s - s0))))))
     {
         possible_problems |= UTF8_GOT_LONG;