my $UTF8_ALLOW_SHORT = 0x0008;
my $UTF8_GOT_SHORT = $UTF8_ALLOW_SHORT; # GOT_ flag reuses the ALLOW_ bit
my $UTF8_ALLOW_LONG = 0x0010;
+my $UTF8_ALLOW_LONG_AND_ITS_VALUE = $UTF8_ALLOW_LONG|0x0020; # ALLOW_LONG plus the extra 0x0020 bit
my $UTF8_GOT_LONG = $UTF8_ALLOW_LONG; # GOT_ flag reuses the ALLOW_ bit
my $UTF8_GOT_OVERFLOW = 0x0080;
my $UTF8_DISALLOW_SURROGATE = 0x0100;
}
}
+# Split every overlong malformation test in the list into two tests.  The
+# existing entry is changed in place to expect the REPLACEMENT CHARACTER under
+# its original flags; a copy is queued that adds the flag which causes the
+# actual code point the malformation represents to be returned instead.
+my @added_overlongs;
+foreach my $test (@malformations) {
+ my ($testname, $bytes, $length, $allow_flags, $expected_error_flags,
+ $allowed_uv, $expected_len, $needed_to_discern_len, $message ) = @$test;
+ next unless $testname =~ /overlong/;
+
+ $test->[0] .= "; use REPLACEMENT CHAR"; # [0] is $testname
+ $test->[5] = $REPLACEMENT; # [5] is $allowed_uv
+
+ push @added_overlongs, # $testname, $allowed_uv still hold the original values
+ [ $testname . "; use actual value",
+ $bytes, $length,
+ $allow_flags | $UTF8_ALLOW_LONG_AND_ITS_VALUE,
+ $expected_error_flags, $allowed_uv, $expected_len,
+ $needed_to_discern_len, $message
+ ];
+}
+push @malformations, @added_overlongs; # appended only after the loop is done
+
foreach my $test (@malformations) {
my ($testname, $bytes, $length, $allow_flags, $expected_error_flags,
$allowed_uv, $expected_len, $needed_to_discern_len, $message ) = @$test;
builds, and otherwise returns the Unicode REPLACEMENT CHARACTER. If
you have nothing to decode, you shouldn't call the decode function.
+=item *
+
+The function C<utf8n_to_uvchr> and its derivatives now return the
+Unicode REPLACEMENT CHARACTER if called with UTF-8 that has the overlong
+malformation, and that malformation is allowed by the input parameters.
+This malformation is where the UTF-8 looks valid syntactically, but
+there is a shorter sequence that yields the same code point. This has
+been forbidden since Unicode version 3.1.
+
=back
=head1 Selected Bug Fixes
overlong sequences are expressly forbidden in the UTF-8 standard due to
potential security issues). Another malformation example is the first byte of
a character not being a legal first byte. See F<utf8.h> for the list of such
-flags. For allowed overlong sequences, the computed code point is returned;
-for all other allowed malformations, the Unicode REPLACEMENT CHARACTER is
-returned.
+flags. Even if allowed, this function generally returns the Unicode
+REPLACEMENT CHARACTER when it encounters a malformation. There are flags in
+F<utf8.h> to override this behavior for the overlong malformations, but don't
+do that except for very specialized purposes.
The C<UTF8_CHECK_ONLY> flag overrides the behavior when a non-allowed (by other
flags) malformation is found. If this flag is set, the routine assumes that
possible_problems &= ~UTF8_GOT_LONG;
*errors |= UTF8_GOT_LONG;
- if (! (flags & UTF8_ALLOW_LONG)) {
+ if (flags & UTF8_ALLOW_LONG) {
+
+ /* We don't allow the actual overlong value, unless the
+ * special extra bit is also set */
+ if (! (flags & ( UTF8_ALLOW_LONG_AND_ITS_VALUE
+ & ~UTF8_ALLOW_LONG)))
+ {
+ uv = UNICODE_REPLACEMENT;
+ }
+ }
+ else {
disallowed = TRUE;
if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
#define UTF8_ALLOW_SHORT 0x0008
#define UTF8_GOT_SHORT UTF8_ALLOW_SHORT
-/* Overlong sequence; i.e., the code point can be specified in fewer bytes. */
+/* Overlong sequence; i.e., the code point can be specified in fewer bytes.
+ * UTF8_ALLOW_LONG accepts the overlong but converts it to the REPLACEMENT
+ * CHARACTER; UTF8_ALLOW_LONG_AND_ITS_VALUE, which includes the
+ * UTF8_ALLOW_LONG bit, returns what the overlong evaluates to */
#define UTF8_ALLOW_LONG 0x0010
+#define UTF8_ALLOW_LONG_AND_ITS_VALUE (UTF8_ALLOW_LONG|0x0020)
#define UTF8_GOT_LONG UTF8_ALLOW_LONG
/* Currently no way to allow overflow */