? sub { return shift }
: sub { return join "", map { chr $native_to_i8[ord $_] }
split "", shift };
+sub start_byte_to_cont($) {
+
+ # Extract the code point information from the input UTF-8 start byte, and
+ # return a continuation byte containing the same information. This is
+ # used in constructing an overlong malformation from valid input.
+
+ my $byte = shift;
+ my $len = test_UTF8_SKIP($byte);
+ if ($len < 2) {
+ die "";
+ }
+
+ $byte = ord native_to_I8($byte);
+
+ # Copied from utf8.h. This gets rid of the leading 1 bits.
+ $byte &= ((($len) >= 7) ? 0x00 : (0x1F >> (($len)-2)));
+
+ $byte |= (isASCII) ? 0x80 : ord I8_to_native("\xA0");
+ return chr $byte;
+}
my $is64bit = length sprintf("%x", ~0) > 8;
}
}
+sub nonportable_regex ($) {
+
+ # Returns a pattern that matches the non-portable message raised either
+ # for the specific input code point, or the one generated when there
+ # is some malformation that precludes the message containing the specific
+ # code point
+
+ my $code_point = shift;
+
+ my $string = sprintf '(Code point 0x%x is not Unicode, and'
+ . '|Any UTF-8 sequence that starts with'
+ . ' "(\\\x[[:xdigit:]]{2})+" is for a'
+ . ' non-Unicode code point, and is) not portable',
+ $code_point;
+ return qr/$string/;
+}
+
# Now test the cases where a legal code point is generated, but may or may not
# be allowed/warned on.
my @tests = (
$UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
'non_unicode', 0x110000,
(isASCII) ? 4 : 5,
- qr/not Unicode.* may not be portable/
+ qr/(not Unicode|for a non-Unicode code point).* may not be portable/
],
[ "non_unicode whose first byte tells that",
(isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
'non_unicode',
(isASCII) ? 0x140000 : 0x200000,
(isASCII) ? 4 : 5,
- qr/not Unicode.* may not be portable/
+ qr/(not Unicode|for a non-Unicode code point).* may not be portable/
],
[ "first of 32 consecutive non-character code points",
(isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
# 32-bit machines
$UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
'utf8', 0x80000000, (isASCII) ? 7 :14,
- qr/Code point 0x80000000 is not Unicode, and not portable/
+ nonportable_regex(0x80000000)
],
[ "requires at least 32 bits, and use SUPER-type flags, instead of ABOVE_31_BIT",
(isASCII)
: I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
$UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
'utf8', 0x80000000, (isASCII) ? 7 :14,
- qr/Code point 0x80000000 is not Unicode, and not portable/
+ nonportable_regex(0x80000000)
],
[ "overflow with warnings/disallow for more than 31 bits",
# This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT
: I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
$UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
'utf8', 0x1000000000, (isASCII) ? 13 : 14,
- qr/Code point 0x.* is not Unicode, and not portable/
+ qr/and( is)? not portable/
];
if (! isASCII) {
push @tests, # These could falsely show wrongly in a naive implementation
I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
$UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
'utf8', 0x800000000, 14,
- qr/Code point 0x800000000 is not Unicode, and not portable/
+ nonportable_regex(0x80000000)
],
[ "requires at least 32 bits",
I8_to_native("\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
$UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
'utf8', 0x10000000000, 14,
- qr/Code point 0x10000000000 is not Unicode, and not portable/
+ nonportable_regex(0x10000000000)
],
[ "requires at least 32 bits",
I8_to_native("\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
$UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
'utf8', 0x200000000000, 14,
- qr/Code point 0x200000000000 is not Unicode, and not portable/
+ nonportable_regex(0x20000000000)
],
[ "requires at least 32 bits",
I8_to_native("\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
$UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
'utf8', 0x4000000000000, 14,
- qr/Code point 0x4000000000000 is not Unicode, and not portable/
+ nonportable_regex(0x4000000000000)
],
[ "requires at least 32 bits",
I8_to_native("\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
$UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
'utf8', 0x80000000000000, 14,
- qr/Code point 0x80000000000000 is not Unicode, and not portable/
+ nonportable_regex(0x80000000000000)
],
[ "requires at least 32 bits",
I8_to_native("\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
+ #IBM-1047 \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
$UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
'utf8', 0x1000000000000000, 14,
- qr/Code point 0x1000000000000000 is not Unicode, and not portable/
+ nonportable_regex(0x1000000000000000)
];
}
}
my ($testname, $bytes, $warn_flags, $disallow_flags, $category, $allowed_uv, $expected_len, $message ) = @$test;
my $length = length $bytes;
- my $will_overflow = $testname =~ /overflow/;
+ my $will_overflow = $testname =~ /overflow/ ? 'overflow' : "";
{
use warnings;
foreach my $warn_flag (0, $warn_flags) {
foreach my $disallow_flag (0, $disallow_flags) {
foreach my $do_warning (0, 1) {
+
+ # We try each of the above with various combinations of
+ # malformations that can occur on the same input sequence.
+ foreach my $short ("",
+ "short",
+ "unexpected non-continuation")
+ {
+ # The non-characters can't be discerned with a short
+ # malformation
+ next if $short && $testname =~ /non-character/;
+
+ foreach my $overlong ("", "overlong") {
+
+ # Our hard-coded overlong starts with \xFE, so
+ # can't handle anything larger.
+ next if $overlong
+ && ord native_to_I8(substr($bytes, 0, 1)) >= 0xFE;
+
+ my @malformations;
+ push @malformations, $short if $short;
+ push @malformations, $overlong if $overlong;
+
+ # The overflow malformation test in the input
+ # array is coerced into being treated like one of
+ # the others.
+ push @malformations, 'overflow' if $will_overflow;
+
+ my $malformations_name = join "/", @malformations;
+ $malformations_name .= " malformation"
+ if $malformations_name;
+ $malformations_name .= "s" if @malformations > 1;
+ my $this_bytes = $bytes;
+ my $this_length = $length;
+ my $expected_uv = $allowed_uv;
+ my $this_expected_len = $expected_len;
+ if ($malformations_name) {
+ $expected_uv = 0;
+
+ # Coerce the input into the desired
+ # malformation
+ if ($malformations_name =~ /overlong/) {
+
+ # For an overlong, we convert the original
+ # start byte into a continuation byte with
+ # the same data bits as originally. ...
+ substr($this_bytes, 0, 1)
+ = start_byte_to_cont(substr($this_bytes,
+ 0, 1));
+
+ # ... Then we prepend it with a known
+ # overlong sequence. This should evaluate
+ # to the exact same code point as the
+ # original.
+ $this_bytes = "\xfe"
+ . ("\x80"
+ x ( 6 - length($this_bytes)))
+ . $this_bytes;
+ $this_length = length($this_bytes);
+ $this_expected_len = 7;
+ }
+ if ($malformations_name =~ /short/) {
+
+ # Just tell the test to not look far
+ # enough into the input.
+ $this_length--;
+ $this_expected_len--;
+ }
+ elsif ($malformations_name
+ =~ /non-continuation/)
+ {
+ # Change the final continuation byte into
+ # a non one.
+ substr($this_bytes, -1, 1) = '?';
+ $this_expected_len--;
+ }
+ }
+
my $eval_warn = $do_warning
? "use warnings '$warning'"
: $warning eq "utf8"
: ( "use warnings 'utf8';"
. " no warnings '$warning'");
- # is effectively disallowed if will overflow, even
- # if the flag indicates it is allowed, fix up test
- # name to indicate this as well
- my $disallowed = $disallow_flag || $will_overflow;
-
+ # Is effectively disallowed if we've set up a
+ # malformation, even if the flag indicates it is
+ # allowed. Fix up test name to indicate this as
+ # well
+ my $disallowed = $disallow_flag
+ || $malformations_name;
my $this_name = "utf8n_to_uvchr() $testname: "
- . (($disallow_flag)
- ? 'disallowed'
- : ($disallowed)
- ? 'ABOVE_31_BIT allowed'
- : 'allowed');
+ . (($disallow_flag)
+ ? 'disallowed'
+ : $disallowed
+ ? $disallowed
+ : 'allowed');
$this_name .= ", $eval_warn";
$this_name .= ", " . (($warn_flag)
? 'with warning flag'
undef @warnings;
my $ret_ref;
- my $display_bytes = display_bytes($bytes);
+ my $display_bytes = display_bytes($this_bytes);
my $call = "Call was: $eval_warn; \$ret_ref"
. " = test_utf8n_to_uvchr('$display_bytes'"
- . ", $length, $warn_flag|$disallow_flag)";
+ . ", $this_length, $warn_flag"
+ . "|$disallow_flag)";
my $eval_text = "$eval_warn; \$ret_ref"
- . " = test_utf8n_to_uvchr('$bytes',"
- . " $length, $warn_flag|$disallow_flag)";
+ . " = test_utf8n_to_uvchr('$this_bytes',"
+ . " $this_length, $warn_flag"
+ . "|$disallow_flag)";
eval "$eval_text";
if (! ok ("$@ eq ''",
"$this_name: eval succeeded"))
}
}
else {
- unless (is($ret_ref->[0], $allowed_uv,
+ unless (is($ret_ref->[0], $expected_uv,
"$this_name: Returns expected uv: "
- . sprintf("0x%04X", $allowed_uv)))
+ . sprintf("0x%04X", $expected_uv)))
{
diag $call;
}
}
- unless (is($ret_ref->[1], $expected_len,
+ unless (is($ret_ref->[1], $this_expected_len,
"$this_name: Returns expected length:"
- . " $expected_len"))
+ . " $this_expected_len"))
{
diag $call;
}
- if ($will_overflow) {
+ if (@malformations) {
if (! $do_warning && $warning eq 'utf8') {
goto no_warnings_expected;
}
- # Will get the overflow message instead of the
- # expected message under these circumstances,
- # as they would otherwise accept an overflowed
- # value, which the code should not allow, so
- # falls back to overflow.
- if (is(scalar @warnings, 1,
- "$this_name: Got a single warning "))
- {
- unless (like($warnings[0], qr/overflow/,
- "$this_name: Got overflow"
- . " warning"))
- {
- diag $call;
+ # Check that each malformation generates a
+ # warning, removing that warning if found
+ MALFORMATION:
+ foreach my $malformation (@malformations) {
+ foreach (my $i = 0; $i < @warnings; $i++) {
+ if ($warnings[$i] =~ /$malformation/) {
+ pass("Expected and got"
+ . "'$malformation' warning");
+ splice @warnings, $i, 1;
+ next MALFORMATION;
+ }
}
- }
- else {
- diag $call;
- output_warnings(@warnings)
- if scalar @warnings;
+ fail("Expected '$malformation' warning"
+ . "but didn't get it");
+
}
}
- elsif ( ! $do_warning
- && ( $warning eq 'utf8'
- || $warning eq $category))
+
+ # Any overflow will override any super or above-31
+ # warnings.
+ goto no_warnings_expected if $will_overflow;
+
+ if ( ! $do_warning
+ && ( $warning eq 'utf8'
+ || $warning eq $category))
{
goto no_warnings_expected;
}
# not just when the $disallow_flag is set
if ($disallowed) {
undef @warnings;
- $ret_ref = test_utf8n_to_uvchr($bytes, $length,
- $disallow_flag|$UTF8_CHECK_ONLY);
+ $ret_ref = test_utf8n_to_uvchr(
+ $this_bytes, $this_length,
+ $disallow_flag|$UTF8_CHECK_ONLY);
unless (is($ret_ref->[0], 0,
"$this_name, CHECK_ONLY: Returns 0"))
{
# Now repeat some of the above, but for
# uvchr_to_utf8_flags(). Since this comes from an
- # existing code point, it hasn't overflowed.
- next if $will_overflow;
+ # existing code point, it hasn't overflowed, and
+ # isn't malformed.
+ next if @malformations;
# The warning and disallow flags passed in are for
# utf8n_to_uvchr(). Convert them for
if scalar @warnings;
}
}
+ }
+ }
}
}
}
the length, in bytes, of that character.
The value of C<flags> determines the behavior when C<s> does not point to a
-well-formed UTF-8 character. If C<flags> is 0, when a malformation is found,
-zero is returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>) is the
-next possible position in C<s> that could begin a non-malformed character.
-Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised.
+well-formed UTF-8 character. If C<flags> is 0, encountering a malformation
+causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
+is the next possible position in C<s> that could begin a non-malformed
+character. Also, if UTF-8 warnings haven't been lexically disabled, a warning
+is raised. Some UTF-8 input sequences may contain multiple malformations.
+This function tries to find every possible one in each call, so multiple
+warnings can be raised for each sequence.
Various ALLOW flags can be set in C<flags> to allow (and not warn on)
individual types of malformations, such as the sequence being overlong (that
Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
{
const U8 * const s0 = s;
- U8 * send;
+ U8 * send = NULL; /* (initialized to silence compilers' wrong
+ warning) */
+ U32 possible_problems = 0; /* A bit is set here for each potential problem
+ found as we go along */
UV uv = *s;
- STRLEN expectlen;
- SV* sv = NULL;
- UV outlier_ret = 0; /* return value when input is in error or problematic
- */
- UV pack_warn = 0; /* Save result of packWARN() for later */
- bool unexpected_non_continuation = FALSE;
- bool overflowed = FALSE;
- bool do_overlong_test = TRUE; /* May have to skip this test */
+ STRLEN expectlen = 0; /* How long should this sequence be?
+ (initialized to silence compilers' wrong
+ warning) */
+ /* The below are used only if there is both an overlong malformation and a
+ * too short one. Otherwise the first two are set to 's0' and 'send', and
+ * the third not used at all */
+ U8 * adjusted_s0 = (U8 *) s0;
+ U8 * adjusted_send;
+ UV uv_so_far = 0; /* (Initialized to silence compilers' wrong warning) */
PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
* returning to the caller C<*retlen> pointing to the very next byte (one
* which is actually part of of the overflowing sequence), that could look
* legitimate to the caller, which could discard the initial partial
- * sequence and process the rest, inappropriately */
+ * sequence and process the rest, inappropriately.
+ *
+ * Some possible input sequences are malformed in more than one way. This
+ * function goes to lengths to try to find all of them. This is necessary
+ * for correctness, as the inputs may allow one malformation but not
+ * another, and if we abandon searching for others after finding the
+ * allowed one, we could allow in something that shouldn't have been.
+ */
- /* Zero length strings, if allowed, of necessity are zero */
if (UNLIKELY(curlen == 0)) {
- if (retlen) {
- *retlen = 0;
- }
-
- if (flags & UTF8_ALLOW_EMPTY) {
- return 0;
- }
- if (! (flags & UTF8_CHECK_ONLY)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)",
- malformed_text));
- }
- goto malformed;
+ possible_problems |= UTF8_GOT_EMPTY;
+ curlen = 0;
+ uv = 0; /* XXX It could be argued that this should be
+ UNICODE_REPLACEMENT? */
+ goto ready_to_handle_errors;
}
expectlen = UTF8SKIP(s);
/* A continuation character can't start a valid sequence */
if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
- if (flags & UTF8_ALLOW_CONTINUATION) {
- if (retlen) {
- *retlen = 1;
- }
- return UNICODE_REPLACEMENT;
- }
-
- if (! (flags & UTF8_CHECK_ONLY)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_
- "%s: %s (unexpected continuation byte 0x%02x,"
- " with no preceding start byte)",
- malformed_text,
- _byte_dump_string(s0, 1), *s0));
- }
- curlen = 1;
- goto malformed;
+ possible_problems |= UTF8_GOT_CONTINUATION;
+ curlen = 1;
+ uv = UNICODE_REPLACEMENT;
+ goto ready_to_handle_errors;
}
/* Here is not a continuation byte, nor an invariant. The only thing left
/* Now, loop through the remaining bytes in the character's sequence,
* accumulating each into the working value as we go. Be sure to not look
* past the end of the input string */
- send = (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
-
+ send = adjusted_send
+ = (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
for (s = s0 + 1; s < send; s++) {
if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
- if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) {
-
- /* The original implementors viewed this malformation as more
- * serious than the others (though I, khw, don't understand
- * why, since other malformations also give very very wrong
- * results), so there is no way to turn off checking for it.
- * Set a flag, but keep going in the loop, so that we absorb
- * the rest of the bytes that comprise the character. */
- overflowed = TRUE;
- }
uv = UTF8_ACCUMULATE(uv, *s);
- }
- else {
- /* Here, found a non-continuation before processing all expected
- * bytes. This byte begins a new character, so quit, even if
- * allowing this malformation. */
- unexpected_non_continuation = TRUE;
- break;
- }
+ continue;
+ }
+
+ /* Here, found a non-continuation before processing all expected bytes.
+ * This byte indicates the beginning of a new character, so quit, even
+ * if allowing this malformation. */
+ curlen = s - s0; /* Save how many bytes we actually got */
+ possible_problems |= UTF8_GOT_NON_CONTINUATION;
+ goto finish_short;
} /* End of loop through the character's bytes */
/* Save how many bytes were actually in the character */
curlen = s - s0;
- /* The loop above finds two types of malformations: non-continuation and/or
- * overflow. The non-continuation malformation is really a too-short
- * malformation, as it means that the current character ended before it was
- * expected to (being terminated prematurely by the beginning of the next
- * character, whereas in the too-short malformation there just are too few
- * bytes available to hold the character. In both cases, the check below
- * that we have found the expected number of bytes would fail if executed.)
- * Thus the non-continuation malformation is really unnecessary, being a
- * subset of the too-short malformation. But there may be existing
- * applications that are expecting the non-continuation type, so we retain
- * it, and return it in preference to the too-short malformation. (If this
- * code were being written from scratch, the two types might be collapsed
- * into one.) I, khw, am also giving priority to returning the
- * non-continuation and too-short malformations over overflow when multiple
- * ones are present. I don't know of any real reason to prefer one over
- * the other, except that it seems to me that multiple-byte errors trumps
- * errors from a single byte */
- if (UNLIKELY(unexpected_non_continuation)) {
- if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
- if (! (flags & UTF8_CHECK_ONLY)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s",
- unexpected_non_continuation_text(s0,
- send - s0,
- s - s0,
- (int) expectlen)));
- }
- goto malformed;
- }
- uv = UNICODE_REPLACEMENT;
-
- /* Skip testing for overlongs, as the REPLACEMENT may not be the same
- * as what the original expectations were. */
- do_overlong_test = FALSE;
- if (retlen) {
- *retlen = curlen;
- }
- }
- else if (UNLIKELY(curlen < expectlen)) {
- if (! (flags & UTF8_ALLOW_SHORT)) {
- if (! (flags & UTF8_CHECK_ONLY)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_
- "%s: %s (too short; got %d byte%s, need %d)",
- malformed_text,
- _byte_dump_string(s0, send - s0),
- (int)curlen,
- curlen == 1 ? "" : "s",
- (int)expectlen));
- }
- goto malformed;
- }
- uv = UNICODE_REPLACEMENT;
- do_overlong_test = FALSE;
- if (retlen) {
- *retlen = curlen;
- }
- }
-
- if (UNLIKELY(overflowed)) {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s: %s (overflows)",
- malformed_text,
- _byte_dump_string(s0, send - s0)));
- goto malformed;
- }
-
- if (do_overlong_test
- && expectlen > (STRLEN) OFFUNISKIP(uv)
- && ! (flags & UTF8_ALLOW_LONG))
+ /* Did we get all the continuation bytes that were expected? Note that we
+ * know this result even without executing the loop above. But we had to
+ * do the loop to see if there are unexpected non-continuations. */
+ if (UNLIKELY(curlen < expectlen)) {
+ possible_problems |= UTF8_GOT_SHORT;
+
+ finish_short:
+ uv_so_far = uv;
+ uv = UNICODE_REPLACEMENT;
+ }
+
+ /* Note that there are two types of too-short malformation. One is when
+ * there is actual wrong data before the normal termination of the
+ * sequence. The other is that the sequence wasn't complete before the end
+ * of the data we are allowed to look at, based on the input 'curlen'.
+ * This means that we were passed data for a partial character, but it is
+ * valid as far as we saw. The other is definitely invalid. This
+ * distinction could be important to a caller, so the two types are kept
+ * separate. */
+
+ /* Check for overflow */
+ if (UNLIKELY(does_utf8_overflow(s0, send))) {
+ possible_problems |= UTF8_GOT_OVERFLOW;
+ uv = UNICODE_REPLACEMENT;
+ }
+
+ /* Check for overlong. If no problems so far, 'uv' is the correct code
+ * point value. Simply see if it is expressible in fewer bytes. Otherwise
+ * we must look at the UTF-8 byte sequence itself to see if it is for an
+ * overlong */
+ if ( ( LIKELY(! possible_problems)
+ && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
+ || ( UNLIKELY( possible_problems)
+ && ( UNLIKELY(! UTF8_IS_START(*s0))
+ || ( curlen > 1
+ && UNLIKELY(is_utf8_overlong_given_start_byte_ok(s0,
+ send - s0))))))
{
- /* The overlong malformation has lower precedence than the others.
- * Note that if this malformation is allowed, we return the actual
- * value, instead of the replacement character. This is because this
- * value is actually well-defined. */
- if (! (flags & UTF8_CHECK_ONLY)) {
- U8 tmpbuf[UTF8_MAXBYTES+1];
- const U8 * const e = uvchr_to_utf8(tmpbuf, uv);
- sv = sv_2mortal(Perl_newSVpvf(aTHX_
- "%s: %s (overlong; instead use %s to represent U+%0*"UVXf")",
- malformed_text,
- _byte_dump_string(s0, send - s0),
- _byte_dump_string(tmpbuf, e - tmpbuf),
- ((uv < 256) ? 2 : 4), /* Field width of 2 for small code
- points */
- uv));
- }
- goto malformed;
+ possible_problems |= UTF8_GOT_LONG;
+
+ /* A convenience macro that matches either of the too-short conditions.
+ * */
+# define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
+
+ if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
+ UV min_uv = uv_so_far;
+ STRLEN i;
+
+ /* Here, the input is both overlong and is missing some trailing
+ * bytes. There is no single code point it could be for, but there
+ * may be enough information present to determine if what we have
+ * so far is for an unallowed code point, such as for a surrogate.
+ * The code below has the intelligence to determine this, but just
+ * for non-overlong UTF-8 sequences. What we do here is calculate
+ * the smallest code point the input could represent if there were
+ * no too short malformation. Then we compute and save the UTF-8
+ * for that, which is what the code below looks at instead of the
+ * raw input. It turns out that the smallest such code point is
+ * all we need. */
+ for (i = curlen; i < expectlen; i++) {
+ min_uv = UTF8_ACCUMULATE(min_uv,
+ I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
+ }
+
+ Newx(adjusted_s0, OFFUNISKIP(min_uv) + 1, U8);
+ SAVEFREEPV((U8 *) adjusted_s0); /* Needed because we may not get
+ to free it ourselves if
+ warnings are made fatal */
+ adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
+ }
}
- /* Here, the input is considered to be well-formed, but it still could be a
- * problematic code point that is not allowed by the input parameters. */
- if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */
+ /* Now check that the input isn't for a problematic code point not allowed
+ * by the input parameters. */
+ /* isn't problematic if < this */
+ if ( ( ( LIKELY(! possible_problems) && uv >= UNICODE_SURROGATE_FIRST)
+ || ( UNLIKELY(possible_problems)
+ && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
&& ((flags & ( UTF8_DISALLOW_NONCHAR
|UTF8_DISALLOW_SURROGATE
|UTF8_DISALLOW_SUPER
|UTF8_WARN_SURROGATE
|UTF8_WARN_SUPER
|UTF8_WARN_ABOVE_31_BIT))
+ /* In case of a malformation, 'uv' is not valid, and has
+ * been changed to something in the Unicode range.
+ * Currently we don't output a deprecation message if there
+ * is already a malformation, so we don't have to special
+ * case the test immediately below */
|| ( UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
&& ckWARN_d(WARN_DEPRECATED))))
{
- if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
-
- /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
- * generation of the sv, since no warnings are raised under CHECK */
- if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
- && ckWARN_d(WARN_SURROGATE))
- {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
- pack_warn = packWARN(WARN_SURROGATE);
- }
- if (flags & UTF8_DISALLOW_SURROGATE) {
- goto disallowed;
- }
- }
- else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
- if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
- && ckWARN_d(WARN_NON_UNICODE))
- {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_
- "Code point 0x%04"UVXf" is not Unicode, may not be portable",
- uv));
- pack_warn = packWARN(WARN_NON_UNICODE);
- }
+ /* If there were no malformations, or the only malformation is an
+ * overlong, 'uv' is valid */
+ if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
+ if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
+ possible_problems |= UTF8_GOT_SURROGATE;
+ }
+ else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
+ possible_problems |= UTF8_GOT_SUPER;
+ }
+ else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
+ possible_problems |= UTF8_GOT_NONCHAR;
+ }
+ }
+ else { /* Otherwise, need to look at the source UTF-8, possibly
+ adjusted to be non-overlong */
- /* The maximum code point ever specified by a standard was
- * 2**31 - 1. Anything larger than that is a Perl extension that
- * very well may not be understood by other applications (including
- * earlier perl versions on EBCDIC platforms). We test for these
- * after the regular SUPER ones, and before possibly bailing out,
- * so that the slightly more dire warning will override the regular
- * one. */
- if ( (flags & (UTF8_WARN_ABOVE_31_BIT
- |UTF8_WARN_SUPER
- |UTF8_DISALLOW_ABOVE_31_BIT))
- && UNLIKELY(is_utf8_cp_above_31_bits(s0, send)))
+ if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
+ >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
{
- if ( ! (flags & UTF8_CHECK_ONLY)
- && (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER))
- && ckWARN_d(WARN_UTF8))
+ possible_problems |= UTF8_GOT_SUPER;
+ }
+ else if (curlen > 1) {
+ if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
+ NATIVE_UTF8_TO_I8(*adjusted_s0),
+ NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
{
- sv = sv_2mortal(Perl_newSVpvf(aTHX_
- "Code point 0x%"UVXf" is not Unicode, and not portable",
- uv));
- pack_warn = packWARN(WARN_UTF8);
+ possible_problems |= UTF8_GOT_SUPER;
}
- if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
- goto disallowed;
+ else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
+ NATIVE_UTF8_TO_I8(*adjusted_s0),
+ NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
+ {
+ possible_problems |= UTF8_GOT_SURROGATE;
}
}
- if (flags & UTF8_DISALLOW_SUPER) {
- goto disallowed;
- }
+ /* We need a complete well-formed UTF-8 character to discern
+ * non-characters, so can't look for them here */
+ }
+ }
- /* The deprecated warning overrides any non-deprecated one */
- if (UNLIKELY(uv > MAX_NON_DEPRECATED_CP) && ckWARN_d(WARN_DEPRECATED))
- {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ cp_above_legal_max,
- uv, MAX_NON_DEPRECATED_CP));
- pack_warn = packWARN(WARN_DEPRECATED);
- }
- }
- else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
- if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
- && ckWARN_d(WARN_NONCHAR))
- {
- sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is not recommended for open interchange", uv));
- pack_warn = packWARN(WARN_NONCHAR);
- }
- if (flags & UTF8_DISALLOW_NONCHAR) {
- goto disallowed;
- }
- }
+ ready_to_handle_errors:
+
+ /* At this point:
+ * curlen contains the number of bytes in the sequence that
+ * this call should advance the input by.
+ * possible_problems' is 0 if there weren't any problems; otherwise a bit
+ * is set in it for each potential problem found.
+ * uv contains the code point the input sequence
+ * represents; or if there is a problem that prevents
+ * a well-defined value from being computed, it is
+ * some subsitute value, typically the REPLACEMENT
+ * CHARACTER.
+ * s0 points to the first byte of the character
+ * send points to just after where that (potentially
+ * partial) character ends
+ * adjusted_s0 normally is the same as s0, but in case of an
+ * overlong for which the UTF-8 matters below, it is
+ * the first byte of the shortest form representation
+ * of the input.
+ * adjusted_send normally is the same as 'send', but if adjusted_s0
+ * is set to something other than s0, this points one
+ * beyond its end
+ */
- if (sv) {
- outlier_ret = UNI_TO_NATIVE(uv);
- goto do_warn;
- }
+ if (UNLIKELY(possible_problems)) {
+ bool disallowed = FALSE;
+ const U32 orig_problems = possible_problems;
+
+ while (possible_problems) { /* Handle each possible problem */
+ UV pack_warn = 0;
+ char * message = NULL;
+
+ /* Each 'if' clause handles one problem. They are ordered so that
+ * the first ones' messages will be displayed before the later
+ * ones; this is kinda in decreasing severity order */
+ if (possible_problems & UTF8_GOT_OVERFLOW) {
+
+ /* Overflow means also got a super and above 31 bits, but we
+ * handle all three cases here */
+ possible_problems
+ &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_ABOVE_31_BIT);
+
+ disallowed = TRUE;
+
+ /* The warnings code explicitly says it doesn't handle the case
+ * of packWARN2 and two categories which have parent-child
+ * relationship. Even if it works now to raise the warning if
+ * either is enabled, it wouldn't necessarily do so in the
+ * future. We output (only) the most dire warning*/
+ if (! (flags & UTF8_CHECK_ONLY)) {
+ if (ckWARN_d(WARN_UTF8)) {
+ pack_warn = packWARN(WARN_UTF8);
+ }
+ else if (ckWARN_d(WARN_NON_UNICODE)) {
+ pack_warn = packWARN(WARN_NON_UNICODE);
+ }
+ if (pack_warn) {
+ message = Perl_form(aTHX_ "%s: %s (overflows)",
+ malformed_text,
+ _byte_dump_string(s0, send - s0));
+ }
+ }
+ }
+ else if (possible_problems & UTF8_GOT_EMPTY) {
+ possible_problems &= ~UTF8_GOT_EMPTY;
+
+ if (! (flags & UTF8_ALLOW_EMPTY)) {
+ disallowed = TRUE;
+ if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+ pack_warn = packWARN(WARN_UTF8);
+ message = Perl_form(aTHX_ "%s (empty string)",
+ malformed_text);
+ }
+ }
+ }
+ else if (possible_problems & UTF8_GOT_CONTINUATION) {
+ possible_problems &= ~UTF8_GOT_CONTINUATION;
+
+ if (! (flags & UTF8_ALLOW_CONTINUATION)) {
+ disallowed = TRUE;
+ if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+ pack_warn = packWARN(WARN_UTF8);
+ message = Perl_form(aTHX_
+ "%s: %s (unexpected continuation byte 0x%02x,"
+ " with no preceding start byte)",
+ malformed_text,
+ _byte_dump_string(s0, 1), *s0);
+ }
+ }
+ }
+ else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
+ possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
+
+ if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
+ disallowed = TRUE;
+ if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+ pack_warn = packWARN(WARN_UTF8);
+ message = Perl_form(aTHX_ "%s",
+ unexpected_non_continuation_text(s0,
+ send - s0,
+ s - s0,
+ (int) expectlen));
+ }
+ }
+ }
+ else if (possible_problems & UTF8_GOT_SHORT) {
+ possible_problems &= ~UTF8_GOT_SHORT;
+
+ if (! (flags & UTF8_ALLOW_SHORT)) {
+ disallowed = TRUE;
+ if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+ pack_warn = packWARN(WARN_UTF8);
+ message = Perl_form(aTHX_
+ "%s: %s (too short; got %d byte%s, need %d)",
+ malformed_text,
+ _byte_dump_string(s0, send - s0),
+ (int)curlen,
+ curlen == 1 ? "" : "s",
+ (int)expectlen);
+ }
+ }
- /* Here, this is not considered a malformed character, so drop through
- * to return it */
- }
+ }
+ else if (possible_problems & UTF8_GOT_LONG) {
+ possible_problems &= ~UTF8_GOT_LONG;
+
+ if (! (flags & UTF8_ALLOW_LONG)) {
+ disallowed = TRUE;
+
+ if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+ pack_warn = packWARN(WARN_UTF8);
+
+ /* These error types cause 'uv' to be something that
+ * isn't what was intended, so can't use it in the
+ * message. The other error types either can't
+ * generate an overlong, or else the 'uv' is valid */
+ if (orig_problems &
+ (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
+ {
+ message = Perl_form(aTHX_
+ "%s: %s (any UTF-8 sequence that starts"
+ " with \"%s\" is overlong which can and"
+ " should be represented with a"
+ " different, shorter sequence)",
+ malformed_text,
+ _byte_dump_string(s0, send - s0),
+ _byte_dump_string(s0, curlen));
+ }
+ else {
+ U8 tmpbuf[UTF8_MAXBYTES+1];
+ const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
+ uv, 0);
+ message = Perl_form(aTHX_
+ "%s: %s (overlong; instead use %s to represent"
+ " U+%0*"UVXf")",
+ malformed_text,
+ _byte_dump_string(s0, send - s0),
+ _byte_dump_string(tmpbuf, e - tmpbuf),
+ ((uv < 256) ? 2 : 4), /* Field width of 2 for
+ small code points */
+ uv);
+ }
+ }
+ }
+ }
+ else if (possible_problems & UTF8_GOT_SURROGATE) {
+ possible_problems &= ~UTF8_GOT_SURROGATE;
+
+ /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
+ * generation of the format, since no warnings are raised under
+ * CHECK */
+ if ( (flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY))
+ == UTF8_WARN_SURROGATE
+ && ckWARN_d(WARN_SURROGATE))
+ {
+ pack_warn = packWARN(WARN_SURROGATE);
+
+ /* These are the only errors that can occur with a
+ * surrogate when the 'uv' isn't valid */
+ if (orig_problems & UTF8_GOT_TOO_SHORT) {
+ message = Perl_form(aTHX_
+ "UTF-16 surrogate (any UTF-8 sequence that"
+ " starts with \"%s\" is for a surrogate)",
+ _byte_dump_string(s0, curlen));
+ }
+ else {
+ message = Perl_form(aTHX_
+ "UTF-16 surrogate U+%04"UVXf"", uv);
+ }
+ }
- return UNI_TO_NATIVE(uv);
+ if (flags & UTF8_DISALLOW_SURROGATE) {
+ disallowed = TRUE;
+ }
+ }
+ else if (possible_problems & UTF8_GOT_SUPER) {
+ possible_problems &= ~UTF8_GOT_SUPER;
- /* There are three cases which get to beyond this point. In all 3 cases:
- * <sv> if not null points to a string to print as a warning.
- * <curlen> is what <*retlen> should be set to if UTF8_CHECK_ONLY isn't
- * set.
- * <outlier_ret> is what return value to use if UTF8_CHECK_ONLY isn't set.
- * This is done by initializing it to 0, and changing it only
- * for case 1).
- * The 3 cases are:
- * 1) The input is valid but problematic, and to be warned about. The
- * return value is the resultant code point; <*retlen> is set to
- * <curlen>, the number of bytes that comprise the code point.
- * <pack_warn> contains the result of packWARN() for the warning
- * types. The entry point for this case is the label <do_warn>;
- * 2) The input is a valid code point but disallowed by the parameters to
- * this function. The return value is 0. If UTF8_CHECK_ONLY is set,
- * <*relen> is -1; otherwise it is <curlen>, the number of bytes that
- * comprise the code point. <pack_warn> contains the result of
- * packWARN() for the warning types. The entry point for this case is
- * the label <disallowed>.
- * 3) The input is malformed. The return value is 0. If UTF8_CHECK_ONLY
- * is set, <*relen> is -1; otherwise it is <curlen>, the number of
- * bytes that comprise the malformation. All such malformations are
- * assumed to be warning type <utf8>. The entry point for this case
- * is the label <malformed>.
- */
+ if ( (flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY))
+ == UTF8_WARN_SUPER
+ && ckWARN_d(WARN_NON_UNICODE))
+ {
+ pack_warn = packWARN(WARN_NON_UNICODE);
+
+ if (orig_problems & UTF8_GOT_TOO_SHORT) {
+ message = Perl_form(aTHX_
+ "Any UTF-8 sequence that starts with"
+ " \"%s\" is for a non-Unicode code point,"
+ " may not be portable",
+ _byte_dump_string(s0, curlen));
+ }
+ else {
+ message = Perl_form(aTHX_
+ "Code point 0x%04"UVXf" is not"
+ " Unicode, may not be portable",
+ uv);
+ }
+ }
- malformed:
+ /* The maximum code point ever specified by a standard was
+ * 2**31 - 1. Anything larger than that is a Perl extension
+ * that very well may not be understood by other applications
+ * (including earlier perl versions on EBCDIC platforms). We
+ * test for these after the regular SUPER ones, and before
+ * possibly bailing out, so that the slightly more dire warning
+ * will override the regular one. */
+ if ( (flags & (UTF8_WARN_ABOVE_31_BIT
+ |UTF8_WARN_SUPER
+ |UTF8_DISALLOW_ABOVE_31_BIT))
+ && ( ( UNLIKELY(orig_problems & UTF8_GOT_TOO_SHORT)
+ && UNLIKELY(is_utf8_cp_above_31_bits(
+ adjusted_s0,
+ adjusted_send)))
+ || ( LIKELY(! (orig_problems & UTF8_GOT_TOO_SHORT))
+ && UNLIKELY(UNICODE_IS_ABOVE_31_BIT(uv)))))
+ {
+ if ( ! (flags & UTF8_CHECK_ONLY)
+ && (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER))
+ && ckWARN_d(WARN_UTF8))
+ {
+ pack_warn = packWARN(WARN_UTF8);
+
+ if (orig_problems & UTF8_GOT_TOO_SHORT) {
+ message = Perl_form(aTHX_
+ "Any UTF-8 sequence that starts with"
+ " \"%s\" is for a non-Unicode code"
+ " point, and is not portable",
+ _byte_dump_string(s0, curlen));
+ }
+ else {
+ message = Perl_form(aTHX_
+ "Code point 0x%"UVXf" is not Unicode,"
+ " and not portable",
+ uv);
+ }
+ }
- if (sv && ckWARN_d(WARN_UTF8)) {
- pack_warn = packWARN(WARN_UTF8);
- }
+ if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
+ disallowed = TRUE;
+ }
+ }
- disallowed:
+ if (flags & UTF8_DISALLOW_SUPER) {
+ disallowed = TRUE;
+ }
- if (flags & UTF8_CHECK_ONLY) {
- if (retlen)
- *retlen = ((STRLEN) -1);
- return 0;
- }
+ /* The deprecated warning overrides any non-deprecated one. If
+ * there are other problems, a deprecation message is not
+ * really helpful, so don't bother to raise it in that case.
+ * This also keeps the code from having to handle the case
+ * where 'uv' is not valid. */
+ if ( ! (orig_problems
+ & (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
+ && UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
+ && ckWARN_d(WARN_DEPRECATED))
+ {
+ message = Perl_form(aTHX_ cp_above_legal_max,
+ uv, MAX_NON_DEPRECATED_CP);
+ pack_warn = packWARN(WARN_DEPRECATED);
+ }
+ }
+ else if (possible_problems & UTF8_GOT_NONCHAR) {
+ possible_problems &= ~UTF8_GOT_NONCHAR;
- do_warn:
+ if ( (flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY))
+ == UTF8_WARN_NONCHAR
+ && ckWARN_d(WARN_NONCHAR))
+ {
+ /* The code above should have guaranteed that we don't
+ * get here with errors other than overlong */
+ assert (! (orig_problems
+ & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
+
+ pack_warn = packWARN(WARN_NONCHAR);
+ message = Perl_form(aTHX_ "Unicode non-character"
+ " U+%04"UVXf" is not recommended"
+ " for open interchange", uv);
+ }
- if (pack_warn) { /* <pack_warn> was initialized to 0, and changed only
- if warnings are to be raised. */
- const char * const string = SvPVX_const(sv);
+ if (flags & UTF8_DISALLOW_NONCHAR) {
+ disallowed = TRUE;
+ }
+ } /* End of looking through the possible flags */
+
+ /* Display the message (if any) for the problem being handled in
+ * this iteration of the loop */
+ if (message) {
+ if (PL_op)
+ Perl_warner(aTHX_ pack_warn, "%s in %s", message,
+ OP_DESC(PL_op));
+ else
+ Perl_warner(aTHX_ pack_warn, "%s", message);
+ }
+ } /* End of 'while (possible_problems) {' */
- if (PL_op)
- Perl_warner(aTHX_ pack_warn, "%s in %s", string, OP_DESC(PL_op));
- else
- Perl_warner(aTHX_ pack_warn, "%s", string);
- }
+ /* Since there was a possible problem, the returned length may need to
+ * be changed from the one stored at the beginning of this function.
+ * Instead of trying to figure out if that's needed, just do it. */
+ if (retlen) {
+ *retlen = curlen;
+ }
- if (retlen) {
- *retlen = curlen;
+ if (disallowed) {
+ if (flags & UTF8_CHECK_ONLY && retlen) {
+ *retlen = ((STRLEN) -1);
+ }
+ return 0;
+ }
}
- return outlier_ret;
+ return UNI_TO_NATIVE(uv);
}
/*
if (UTF8_IS_CONTINUATION(c1)) {
c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
} else {
- /* diag_listed_as: Malformed UTF-8 character (%s) */
+ /* diag_listed_as: Malformed UTF-8 character%s */
Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
"%s %s%s",
unexpected_non_continuation_text(u - 1, 2, 1, 2),