Perl has never allowed the UTF-8 overflow malformation, for some reason.
But as long as overflows are turned into the REPLACEMENT CHARACTER,
there is no real reason not to allow it. And making it allowable lets code
that wants to carry on in the face of malformed input do so, without
risk of contaminating things, as the REPLACEMENT CHARACTER is the
Unicode-prescribed way of handling malformations.
my $UTF8_ALLOW_LONG = 0x0010;
my $UTF8_ALLOW_LONG_AND_ITS_VALUE = $UTF8_ALLOW_LONG|0x0020;
my $UTF8_GOT_LONG = $UTF8_ALLOW_LONG;
-my $UTF8_GOT_OVERFLOW = 0x0080;
+my $UTF8_ALLOW_OVERFLOW = 0x0080;
+my $UTF8_GOT_OVERFLOW = $UTF8_ALLOW_OVERFLOW;
my $UTF8_DISALLOW_SURROGATE = 0x0100;
my $UTF8_GOT_SURROGATE = $UTF8_DISALLOW_SURROGATE;
my $UTF8_WARN_SURROGATE = 0x0200;
[ "overflow malformation",
"\xfe\x84\x80\x80\x80\x80\x80", # Represents 2**32
7,
- 0, # There is no way to allow this malformation
- $UTF8_GOT_OVERFLOW,
+ $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW,
$REPLACEMENT,
7, 2,
qr/overflows/
[ "overflow malformation",
"\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
$max_bytes,
- 0, # There is no way to allow this malformation
- $UTF8_GOT_OVERFLOW,
+ $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW,
$REPLACEMENT,
$max_bytes, 1,
qr/overflows/
I8_to_native(
"\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
$max_bytes,
- 0, # There is no way to allow this malformation
- $UTF8_GOT_OVERFLOW,
+ $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW,
$REPLACEMENT,
$max_bytes, 8,
qr/overflows/
: I8_to_native(
"\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
$max_bytes,
- 0, # There is no way to allow this malformation
- $UTF8_GOT_OVERFLOW,
+ $UTF8_ALLOW_OVERFLOW, $UTF8_GOT_OVERFLOW,
$REPLACEMENT,
$max_bytes, (isASCII) ? 3 : 2,
qr/overflows/
there is a shorter sequence that yields the same code point. This has
been forbidden since Unicode version 3.1.
+=item *
+
+The function C<utf8n_to_uvchr> and its derivatives now accept an input
+flag, C<UTF8_ALLOW_OVERFLOW>, to allow the overflow malformation. This
+malformation occurs when the UTF-8 may be syntactically valid, but the code
+point it represents cannot be represented in the platform's word length.
+What "allowed" means in this case is that the function doesn't return an
+error, and advances the parse pointer to beyond the UTF-8 in question,
+but it returns the Unicode REPLACEMENT CHARACTER as the value of the
+code point (since the real value is not representable).
+
=back
=head1 Selected Bug Fixes
/* isn't problematic if < this */
if ( ( ( LIKELY(! possible_problems) && uv >= UNICODE_SURROGATE_FIRST)
|| ( UNLIKELY(possible_problems)
+
+ /* if overflow, we know without looking further
+ * precisely which of the problematic types it is,
+ * and we deal with those in the overflow handling
+ * code */
+ && LIKELY(! (possible_problems & UTF8_GOT_OVERFLOW))
&& isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
&& ((flags & ( UTF8_DISALLOW_NONCHAR
|UTF8_DISALLOW_SURROGATE
*errors |= UTF8_GOT_ABOVE_31_BIT;
}
- disallowed = TRUE;
+ /* Disallow if any of the three categories say to */
+ if ( ! (flags & UTF8_ALLOW_OVERFLOW)
+ || (flags & ( UTF8_DISALLOW_SUPER
+ |UTF8_DISALLOW_ABOVE_31_BIT)))
+ {
+ disallowed = TRUE;
+ }
+
+
+ /* Likewise, warn if any say to, plus if deprecation warnings
+ * are on, because this code point is above IV_MAX */
+ if ( ckWARN_d(WARN_DEPRECATED)
+ || ! (flags & UTF8_ALLOW_OVERFLOW)
+ || (flags & (UTF8_WARN_SUPER|UTF8_WARN_ABOVE_31_BIT)))
+ {
/* The warnings code explicitly says it doesn't handle the case
* of packWARN2 and two categories which have parent-child
_byte_dump_string(s0, send - s0));
}
}
+ }
}
else if (possible_problems & UTF8_GOT_EMPTY) {
possible_problems &= ~UTF8_GOT_EMPTY;
#define UTF8_ALLOW_LONG_AND_ITS_VALUE (UTF8_ALLOW_LONG|0x0020)
#define UTF8_GOT_LONG UTF8_ALLOW_LONG
-/* Currently no way to allow overflow */
-#define UTF8_GOT_OVERFLOW 0x0080
+#define UTF8_ALLOW_OVERFLOW 0x0080
+#define UTF8_GOT_OVERFLOW UTF8_ALLOW_OVERFLOW
#define UTF8_DISALLOW_SURROGATE 0x0100 /* Unicode surrogates */
#define UTF8_GOT_SURROGATE UTF8_DISALLOW_SURROGATE
#define UTF8_WARN_ILLEGAL_INTERCHANGE \
(UTF8_WARN_ILLEGAL_C9_INTERCHANGE|UTF8_WARN_NONCHAR)
+/* This is typically used by code that is willing to accept ill-formed UTF-8
+ * input sequences, for whatever reason. However, all such sequences
+ * evaluate to the REPLACEMENT CHARACTER unless other flags overriding this are
+ * also present. */
#define UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION \
|UTF8_ALLOW_NON_CONTINUATION \
|UTF8_ALLOW_SHORT \
- |UTF8_ALLOW_LONG)
+ |UTF8_ALLOW_LONG \
+ |UTF8_ALLOW_OVERFLOW)
/* Accept any Perl-extended UTF-8 that evaluates to any UV on the platform, but
* not any malformed. This is the default. (Note that UVs above IV_MAX are