my $highest_non_extended_utf8_cp = (isASCII) ? 0x7FFFFFFF : 0x3FFFFFFF;
my $native_lowest_continuation_chr = I8_to_native(chr $::lowest_continuation);
+# C5 is chosen as it is a valid start byte on both ASCII and EBCDIC platforms
+my $known_start_byte = I8_to_native("\xC5");
+
sub requires_extended_utf8($) {
# Returns a boolean as to whether or not the code point parameter fits
if ($unexpected_noncont) {
# To force this malformation, change the final continuation
- # byte into a non continuation.
+ # byte into a start byte.
my $pos = ($short) ? -2 : -1;
- substr($this_bytes, $pos, 1) = '?';
+ substr($this_bytes, $pos, 1) = $known_start_byte;
$this_expected_len--;
}
my $message;
my $after = "";
- if ($expect64 != $expect32 && ! $is64bit) {
- like($warnings[0], qr/overflow/, "overflow warning for $id seen");
- shift @warnings;
- $after .= "overflow";
+ if ($expect64 < $expect32 && ! $is64bit) {
+ if ( substr($octets, 0, 1) gt "\xfe"
+ || ( substr($octets, 0, 1) eq "\xfe"
+ && length $octets > 1
+ && substr($octets, 1, 1) le "\xbf"
+ && substr($octets, 1, 1) ge "\x80"))
+ {
+ like($warnings[0], qr/overflow/, "overflow warning for $id seen");
+ shift @warnings;
+ $after .= "overflow";
+ }
}
# The data below assumes that if there is both a 'short' and
3.5 Impossible bytes (but not with Perl's extended UTF-8)
3.5.1 n - 1 fe - 1 byte available, need 7
3.5.2 N2,1 - 1 ff - 1 byte available, need 13
-3.5.3 N11,8 - 4 fe:fe:ff:ff - byte 0xfe
+3.5.3 N9,7 - 4 fe:fe:ff:ff - byte 0xfe
4 Overlong sequences
4.1 Examples of an overlong ASCII character
4.1.1 n - 2 c0:af - overlong
uv = UNICODE_REPLACEMENT;
}
- /* Check for overflow. */
- if (UNLIKELY(does_utf8_overflow(s0, send))) {
+ /* Check for overflow. The algorithm requires us to not look past the end
+ * of the current character, even if partial, so the upper limit is 's' */
+ if (UNLIKELY(does_utf8_overflow(s0, s))) {
possible_problems |= UTF8_GOT_OVERFLOW;
uv = UNICODE_REPLACEMENT;
}
&& ( UNLIKELY(! UTF8_IS_START(*s0))
|| ( curlen > 1
&& UNLIKELY(is_utf8_overlong_given_start_byte_ok(s0,
- send - s0))))))
+ s - s0))))))
{
possible_problems |= UTF8_GOT_LONG;