This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Allow allowing UTF-8 overflow malformation
author Karl Williamson <khw@cpan.org>
Sun, 11 Dec 2016 01:01:39 +0000 (18:01 -0700)
committer Karl Williamson <khw@cpan.org>
Fri, 23 Dec 2016 23:48:35 +0000 (16:48 -0700)
Perl has never allowed the UTF-8 overflow malformation, for some reason.
But as long as overflows are turned into the REPLACEMENT CHARACTER,
there is no real reason not to allow it.  And making it allowable allows code
that wants to carry on in the face of malformed input to do so, without
risk of contaminating things, as the REPLACEMENT CHARACTER is the Unicode
prescribed way of handling malformations.

ext/XS-APItest/t/utf8.t
pod/perldelta.pod
utf8.c
utf8.h

index 5fe56df..c7f2c1d 100644 (file)
@@ -100,7 +100,8 @@ my $UTF8_GOT_SHORT              = $UTF8_ALLOW_SHORT;
 my $UTF8_ALLOW_LONG             = 0x0010;
 my $UTF8_ALLOW_LONG_AND_ITS_VALUE = $UTF8_ALLOW_LONG|0x0020;
 my $UTF8_GOT_LONG               = $UTF8_ALLOW_LONG;
-my $UTF8_GOT_OVERFLOW           = 0x0080;
+my $UTF8_ALLOW_OVERFLOW         = 0x0080;
+my $UTF8_GOT_OVERFLOW           = $UTF8_ALLOW_OVERFLOW;
 my $UTF8_DISALLOW_SURROGATE     = 0x0100;
 my $UTF8_GOT_SURROGATE          = $UTF8_DISALLOW_SURROGATE;
 my $UTF8_WARN_SURROGATE         = 0x0200;
@@ -1347,8 +1348,7 @@ if (isASCII && ! $is64bit) {    # 32-bit ASCII platform
         [ "overflow malformation",
             "\xfe\x84\x80\x80\x80\x80\x80",  # Represents 2**32
             7,
-            0,  # There is no way to allow this malformation
-            $UTF8_GOT_OVERFLOW,
+            $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW,
             $REPLACEMENT,
             7, 2,
             qr/overflows/
@@ -1356,8 +1356,7 @@ if (isASCII && ! $is64bit) {    # 32-bit ASCII platform
         [ "overflow malformation",
             "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
             $max_bytes,
-            0,  # There is no way to allow this malformation
-            $UTF8_GOT_OVERFLOW,
+            $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW,
             $REPLACEMENT,
             $max_bytes, 1,
             qr/overflows/
@@ -1399,8 +1398,7 @@ else { # 64-bit ASCII, or EBCDIC of any size.
             I8_to_native(
                     "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
             $max_bytes,
-            0,  # There is no way to allow this malformation
-            $UTF8_GOT_OVERFLOW,
+            $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW,
             $REPLACEMENT,
             $max_bytes, 8,
             qr/overflows/
@@ -1414,8 +1412,7 @@ else { # 64-bit ASCII, or EBCDIC of any size.
                 : I8_to_native(
                     "\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                 $max_bytes,
-                0,  # There is no way to allow this malformation
-                $UTF8_GOT_OVERFLOW,
+                $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW,
                 $REPLACEMENT,
                 $max_bytes, (isASCII) ? 3 : 2,
                 qr/overflows/
index 1c770d2..372b5fe 100644 (file)
@@ -355,6 +355,17 @@ This malformation is where the UTF-8 looks valid syntactically, but
 there is a shorter sequence that yields the same code point.  This has
 been forbidden since Unicode version 3.1.
 
+=item *
+
+The function C<utf8n_to_uvchr> and its derivatives now accept an input
+flag to allow the overflow malformation.  This malformation is when the
+UTF-8 may be syntactically valid, but the code point it represents is
+not capable of being represented in the word length on the platform.
+What "allowed" means in this case is that the function doesn't return an
+error, and advances the parse pointer to beyond the UTF-8 in question,
+but it returns the Unicode REPLACEMENT CHARACTER as the value of the
+code point (since the real value is not representable).
+
 =back
 
 =head1 Selected Bug Fixes
diff --git a/utf8.c b/utf8.c
index d5e675b..85432dc 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1259,6 +1259,12 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                                               /* isn't problematic if < this */
     if (   (   (   LIKELY(! possible_problems) && uv >= UNICODE_SURROGATE_FIRST)
             || (   UNLIKELY(possible_problems)
+
+                          /* if overflow, we know without looking further
+                           * precisely which of the problematic types it is,
+                           * and we deal with those in the overflow handling
+                           * code */
+                && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
                 && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
        && ((flags & ( UTF8_DISALLOW_NONCHAR
                       |UTF8_DISALLOW_SURROGATE
@@ -1371,7 +1377,21 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                     *errors |= UTF8_GOT_ABOVE_31_BIT;
                 }
 
-                disallowed = TRUE;
+                /* Disallow if any of the three categories say to */
+                if ( ! (flags & UTF8_ALLOW_OVERFLOW)
+                    || (flags & ( UTF8_DISALLOW_SUPER
+                                 |UTF8_DISALLOW_ABOVE_31_BIT)))
+                {
+                    disallowed = TRUE;
+                }
+
+
+                /* Likewise, warn if any say to, plus if deprecation warnings
+                 * are on, because this code point is above IV_MAX */
+                if (  ckWARN_d(WARN_DEPRECATED)
+                    || ! (flags & UTF8_ALLOW_OVERFLOW)
+                    ||   (flags & (UTF8_WARN_SUPER|UTF8_WARN_ABOVE_31_BIT)))
+                {
 
                 /* The warnings code explicitly says it doesn't handle the case
                  * of packWARN2 and two categories which have parent-child
@@ -1391,6 +1411,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                                         _byte_dump_string(s0, send - s0));
                     }
                 }
+                }
             }
             else if (possible_problems & UTF8_GOT_EMPTY) {
                 possible_problems &= ~UTF8_GOT_EMPTY;
diff --git a/utf8.h b/utf8.h
index 3dde45a..d7c4e1a 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -745,8 +745,8 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
 #define UTF8_ALLOW_LONG_AND_ITS_VALUE   (UTF8_ALLOW_LONG|0x0020)
 #define UTF8_GOT_LONG                   UTF8_ALLOW_LONG
 
-/* Currently no way to allow overflow */
-#define UTF8_GOT_OVERFLOW               0x0080
+#define UTF8_ALLOW_OVERFLOW             0x0080
+#define UTF8_GOT_OVERFLOW               UTF8_ALLOW_OVERFLOW
 
 #define UTF8_DISALLOW_SURROGATE                0x0100  /* Unicode surrogates */
 #define UTF8_GOT_SURROGATE             UTF8_DISALLOW_SURROGATE
@@ -790,10 +790,15 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
 #define UTF8_WARN_ILLEGAL_INTERCHANGE \
                           (UTF8_WARN_ILLEGAL_C9_INTERCHANGE|UTF8_WARN_NONCHAR)
 
+/* This is used typically for code that is willing to accept inputs of
+ * ill-formed UTF-8 sequences, for whatever reason.  However, all such sequences
+ * evaluate to the REPLACEMENT CHARACTER unless other flags overriding this are
+ * also present. */
 #define UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION                                \
                         |UTF8_ALLOW_NON_CONTINUATION                            \
                         |UTF8_ALLOW_SHORT                                       \
-                        |UTF8_ALLOW_LONG)
+                        |UTF8_ALLOW_LONG                                        \
+                        |UTF8_ALLOW_OVERFLOW)
 
 /* Accept any Perl-extended UTF-8 that evaluates to any UV on the platform, but
  * not any malformed.  This is the default.  (Note that UVs above IV_MAX are