This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
utf8n_to_uvchr(): Note multiple malformations
authorKarl Williamson <khw@cpan.org>
Thu, 6 Oct 2016 01:09:02 +0000 (19:09 -0600)
committerKarl Williamson <khw@cpan.org>
Thu, 13 Oct 2016 17:18:12 +0000 (11:18 -0600)
Some UTF-8 sequences can have multiple malformations.  For example, a
sequence can be the start of an overlong representation of a code point,
and still be incomplete.  Until this commit what was generally done was
to stop looking when the first malformation was found.  This was not
correct behavior, as that malformation may be allowed, while another
unallowed one went unnoticed.  (But this did not actually create
security holes, as those allowed malformations replaced the input with a
REPLACEMENT CHARACTER.)  This commit refactors the error handling of
this function to set a flag and keep going if a malformation is found
that doesn't preclude others.  Then each is handled in a loop at the
end, warning if warranted.  The result is that there is a warning for
each malformation for which warnings should be generated, and an error
return is made if any one is disallowed.

Overflow doesn't happen except for very high code points, well above the
Unicode range, and above fitting in 31 bits.  Hence the latter 2
potential malformations are subsets of overflow, so only one warning is
output--the most dire.

This will speed up the normal case slightly, as the test for overflow is
pulled out of the loop, allowing the UV to overflow.  Then a single test
after the loop is done to see if there was overflow or not.

ext/XS-APItest/t/utf8.t
pod/perldelta.pod
pod/perldiag.pod
t/op/utf8decode.t
utf8.c
utf8.h

index 6581ee9..9e75c25 100644 (file)
@@ -63,6 +63,26 @@ for (my $i = 0; $i < 256; $i++) {
                     ? sub { return shift }
                     : sub { return join "", map { chr $native_to_i8[ord $_] }
                                             split "", shift };
+sub start_byte_to_cont($) {
+
+    # Extract the code point information from the input UTF-8 start byte, and
+    # return a continuation byte containing the same information.  This is
+    # used in constructing an overlong malformation from valid input.
+
+    my $byte = shift;
+    my $len = test_UTF8_SKIP($byte);
+    if ($len < 2) {
+        die "";
+    }
+
+    $byte = ord native_to_I8($byte);
+
+    # Copied from utf8.h.  This gets rid of the leading 1 bits.
+    $byte &= ((($len) >= 7) ? 0x00 : (0x1F >> (($len)-2)));
+
+    $byte |= (isASCII) ? 0x80 : ord I8_to_native("\xA0");
+    return chr $byte;
+}
 
 my $is64bit = length sprintf("%x", ~0) > 8;
 
@@ -1339,6 +1359,23 @@ foreach my $test (@malformations) {
     }
 }
 
+sub nonportable_regex ($) {
+
+    # Returns a pattern that matches the non-portable message raised either
+    # for the specific input code point, or the one generated when there
+    # is some malformation that precludes the message containing the specific
+    # code point
+
+    my $code_point = shift;
+
+    my $string = sprintf '(Code point 0x%x is not Unicode, and'
+                       . '|Any UTF-8 sequence that starts with'
+                       . ' "(\\\x[[:xdigit:]]{2})+" is for a'
+                       . ' non-Unicode code point, and is) not portable',
+                    $code_point;
+    return qr/$string/;
+}
+
 # Now test the cases where a legal code point is generated, but may or may not
 # be allowed/warned on.
 my @tests = (
@@ -1368,7 +1405,7 @@ my @tests = (
         $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
         'non_unicode', 0x110000,
         (isASCII) ? 4 : 5,
-        qr/not Unicode.* may not be portable/
+        qr/(not Unicode|for a non-Unicode code point).* may not be portable/
     ],
     [ "non_unicode whose first byte tells that",
         (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
@@ -1376,7 +1413,7 @@ my @tests = (
         'non_unicode',
         (isASCII) ? 0x140000 : 0x200000,
         (isASCII) ? 4 : 5,
-        qr/not Unicode.* may not be portable/
+        qr/(not Unicode|for a non-Unicode code point).* may not be portable/
     ],
     [ "first of 32 consecutive non-character code points",
         (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
@@ -1639,7 +1676,7 @@ my @tests = (
         # 32-bit machines
         $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
         'utf8', 0x80000000, (isASCII) ? 7 :14,
-        qr/Code point 0x80000000 is not Unicode, and not portable/
+        nonportable_regex(0x80000000)
     ],
     [ "requires at least 32 bits, and use SUPER-type flags, instead of ABOVE_31_BIT",
         (isASCII)
@@ -1647,7 +1684,7 @@ my @tests = (
          : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
         $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
         'utf8', 0x80000000, (isASCII) ? 7 :14,
-        qr/Code point 0x80000000 is not Unicode, and not portable/
+        nonportable_regex(0x80000000)
     ],
     [ "overflow with warnings/disallow for more than 31 bits",
         # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT
@@ -1685,7 +1722,7 @@ if ($is64bit) {
             : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
             $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
             'utf8', 0x1000000000, (isASCII) ? 13 : 14,
-            qr/Code point 0x.* is not Unicode, and not portable/
+            qr/and( is)? not portable/
         ];
     if (! isASCII) {
         push @tests,   # These could falsely show wrongly in a naive implementation
@@ -1693,37 +1730,38 @@ if ($is64bit) {
                 I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
                 'utf8', 0x800000000, 14,
-                qr/Code point 0x800000000 is not Unicode, and not portable/
+                nonportable_regex(0x80000000)
             ],
             [ "requires at least 32 bits",
                 I8_to_native("\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
                 'utf8', 0x10000000000, 14,
-                qr/Code point 0x10000000000 is not Unicode, and not portable/
+                nonportable_regex(0x10000000000)
             ],
             [ "requires at least 32 bits",
                 I8_to_native("\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
                 'utf8', 0x200000000000, 14,
-                qr/Code point 0x200000000000 is not Unicode, and not portable/
+                nonportable_regex(0x20000000000)
             ],
             [ "requires at least 32 bits",
                 I8_to_native("\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
                 'utf8', 0x4000000000000, 14,
-                qr/Code point 0x4000000000000 is not Unicode, and not portable/
+                nonportable_regex(0x4000000000000)
             ],
             [ "requires at least 32 bits",
                 I8_to_native("\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
                 'utf8', 0x80000000000000, 14,
-                qr/Code point 0x80000000000000 is not Unicode, and not portable/
+                nonportable_regex(0x80000000000000)
             ],
             [ "requires at least 32 bits",
                 I8_to_native("\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
+                   #IBM-1047  \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
                 'utf8', 0x1000000000000000, 14,
-                qr/Code point 0x1000000000000000 is not Unicode, and not portable/
+                nonportable_regex(0x1000000000000000)
             ];
     }
 }
@@ -1732,7 +1770,7 @@ foreach my $test (@tests) {
     my ($testname, $bytes, $warn_flags, $disallow_flags, $category, $allowed_uv, $expected_len, $message ) = @$test;
 
     my $length = length $bytes;
-    my $will_overflow = $testname =~ /overflow/;
+    my $will_overflow = $testname =~ /overflow/ ? 'overflow' : "";
 
     {
         use warnings;
@@ -1875,6 +1913,83 @@ foreach my $test (@tests) {
         foreach my $warn_flag (0, $warn_flags) {
             foreach my $disallow_flag (0, $disallow_flags) {
                 foreach my $do_warning (0, 1) {
+
+                    # We try each of the above with various combinations of
+                    # malformations that can occur on the same input sequence.
+                    foreach my $short ("",
+                                       "short",
+                                       "unexpected non-continuation")
+                    {
+                        # The non-characters can't be discerned with a short
+                        # malformation
+                        next if $short && $testname =~ /non-character/;
+
+                        foreach my $overlong ("", "overlong") {
+
+                            # Our hard-coded overlong starts with \xFE, so
+                            # can't handle anything larger.
+                            next if $overlong
+                            && ord native_to_I8(substr($bytes, 0, 1)) >= 0xFE;
+
+                            my @malformations;
+                            push @malformations, $short if $short;
+                            push @malformations, $overlong if $overlong;
+
+                            # The overflow malformation test in the input
+                            # array is coerced into being treated like one of
+                            # the others.
+                            push @malformations, 'overflow' if $will_overflow;
+
+                            my $malformations_name = join "/", @malformations;
+                            $malformations_name .= " malformation"
+                                                        if $malformations_name;
+                            $malformations_name .= "s" if @malformations > 1;
+                            my $this_bytes = $bytes;
+                            my $this_length = $length;
+                            my $expected_uv = $allowed_uv;
+                            my $this_expected_len = $expected_len;
+                            if ($malformations_name) {
+                                $expected_uv = 0;
+
+                                # Coerce the input into the desired
+                                # malformation
+                                if ($malformations_name =~ /overlong/) {
+
+                                    # For an overlong, we convert the original
+                                    # start byte into a continuation byte with
+                                    # the same data bits as originally. ...
+                                    substr($this_bytes, 0, 1)
+                                        = start_byte_to_cont(substr($this_bytes,
+                                                                    0, 1));
+
+                                    # ... Then we prepend it with a known
+                                    # overlong sequence.  This should evaluate
+                                    # to the exact same code point as the
+                                    # original.
+                                    $this_bytes = "\xfe"
+                                               . ("\x80"
+                                                   x ( 6 - length($this_bytes)))
+                                               . $this_bytes;
+                                    $this_length = length($this_bytes);
+                                    $this_expected_len = 7;
+                                }
+                                if ($malformations_name =~ /short/) {
+
+                                    # Just tell the test to not look far
+                                    # enough into the input.
+                                    $this_length--;
+                                    $this_expected_len--;
+                                }
+                                elsif ($malformations_name
+                                                        =~ /non-continuation/)
+                                {
+                                    # Change the final continuation byte into
+                                    # a non one.
+                                    substr($this_bytes, -1, 1) = '?';
+                                    $this_expected_len--;
+                                }
+                            }
+
                             my $eval_warn = $do_warning
                                         ? "use warnings '$warning'"
                                         : $warning eq "utf8"
@@ -1882,17 +1997,18 @@ foreach my $test (@tests) {
                                             : ( "use warnings 'utf8';"
                                               . " no warnings '$warning'");
 
-                            # is effectively disallowed if will overflow, even
-                            # if the flag indicates it is allowed, fix up test
-                            # name to indicate this as well
-                            my $disallowed = $disallow_flag || $will_overflow;
-
+                            # Is effectively disallowed if we've set up a
+                            # malformation, even if the flag indicates it is
+                            # allowed.  Fix up test name to indicate this as
+                            # well
+                            my $disallowed = $disallow_flag
+                                          || $malformations_name;
                             my $this_name = "utf8n_to_uvchr() $testname: "
-                                          . (($disallow_flag)
-                                             ? 'disallowed'
-                                             : ($disallowed)
-                                               ? 'ABOVE_31_BIT allowed'
-                                               : 'allowed');
+                                                        . (($disallow_flag)
+                                                            ? 'disallowed'
+                                                            : $disallowed
+                                                            ? $disallowed
+                                                            : 'allowed');
                             $this_name .= ", $eval_warn";
                             $this_name .= ", " . (($warn_flag)
                                                 ? 'with warning flag'
@@ -1900,13 +2016,15 @@ foreach my $test (@tests) {
 
                             undef @warnings;
                             my $ret_ref;
-                            my $display_bytes = display_bytes($bytes);
+                            my $display_bytes = display_bytes($this_bytes);
                             my $call = "Call was: $eval_warn; \$ret_ref"
                                      . " = test_utf8n_to_uvchr('$display_bytes'"
-                                     . ", $length, $warn_flag|$disallow_flag)";
+                                     . ", $this_length, $warn_flag"
+                                     . "|$disallow_flag)";
                             my $eval_text =      "$eval_warn; \$ret_ref"
-                                     . " = test_utf8n_to_uvchr('$bytes',"
-                                     . " $length, $warn_flag|$disallow_flag)";
+                                     . " = test_utf8n_to_uvchr('$this_bytes',"
+                                     . " $this_length, $warn_flag"
+                                     . "|$disallow_flag)";
                             eval "$eval_text";
                             if (! ok ("$@ eq ''",
                                 "$this_name: eval succeeded"))
@@ -1922,49 +2040,50 @@ foreach my $test (@tests) {
                                 }
                             }
                             else {
-                                unless (is($ret_ref->[0], $allowed_uv,
+                                unless (is($ret_ref->[0], $expected_uv,
                                         "$this_name: Returns expected uv: "
-                                        . sprintf("0x%04X", $allowed_uv)))
+                                        . sprintf("0x%04X", $expected_uv)))
                                 {
                                     diag $call;
                                 }
                             }
-                            unless (is($ret_ref->[1], $expected_len,
+                            unless (is($ret_ref->[1], $this_expected_len,
                                 "$this_name: Returns expected length:"
-                              . " $expected_len"))
+                              . " $this_expected_len"))
                             {
                                 diag $call;
                             }
 
-                            if ($will_overflow) {
+                            if (@malformations) {
                                 if (! $do_warning && $warning eq 'utf8') {
                                     goto no_warnings_expected;
                                 }
 
-                                # Will get the overflow message instead of the
-                                # expected message under these circumstances,
-                                # as they would otherwise accept an overflowed
-                                # value, which the code should not allow, so
-                                # falls back to overflow.
-                                if (is(scalar @warnings, 1,
-                                    "$this_name: Got a single warning "))
-                                {
-                                    unless (like($warnings[0], qr/overflow/,
-                                                "$this_name: Got overflow"
-                                              . " warning"))
-                                    {
-                                        diag $call;
+                                # Check that each malformation generates a
+                                # warning, removing that warning if found
+                              MALFORMATION:
+                                foreach my $malformation (@malformations) {
+                                    foreach (my $i = 0; $i < @warnings; $i++) {
+                                        if ($warnings[$i] =~ /$malformation/) {
+                                            pass("Expected and got"
+                                               . "'$malformation' warning");
+                                            splice @warnings, $i, 1;
+                                            next MALFORMATION;
+                                        }
                                     }
-                                }
-                                else {
-                                    diag $call;
-                                    output_warnings(@warnings)
-                                                            if scalar @warnings;
+                                    fail("Expected '$malformation' warning"
+                                       . "but didn't get it");
+
                                 }
                             }
-                            elsif (       ! $do_warning
-                                   && (   $warning eq 'utf8'
-                                       || $warning eq $category))
+
+                            # Any overflow will override any super or above-31
+                            # warnings.
+                            goto no_warnings_expected if $will_overflow;
+
+                            if (    ! $do_warning
+                                && (   $warning eq 'utf8'
+                                    || $warning eq $category))
                             {
                                 goto no_warnings_expected;
                             }
@@ -2000,8 +2119,9 @@ foreach my $test (@tests) {
                             # not just when the $disallow_flag is set
                             if ($disallowed) {
                                 undef @warnings;
-                                $ret_ref = test_utf8n_to_uvchr($bytes, $length,
-                                                $disallow_flag|$UTF8_CHECK_ONLY);
+                                $ret_ref = test_utf8n_to_uvchr(
+                                               $this_bytes, $this_length,
+                                               $disallow_flag|$UTF8_CHECK_ONLY);
                                 unless (is($ret_ref->[0], 0,
                                         "$this_name, CHECK_ONLY: Returns 0"))
                                 {
@@ -2024,8 +2144,9 @@ foreach my $test (@tests) {
 
                             # Now repeat some of the above, but for
                             # uvchr_to_utf8_flags().  Since this comes from an
-                            # existing code point, it hasn't overflowed.
-                            next if $will_overflow;
+                            # existing code point, it hasn't overflowed, and
+                            # isn't malformed.
+                            next if @malformations;
 
                             # The warning and disallow flags passed in are for
                             # utf8n_to_uvchr().  Convert them for
@@ -2155,6 +2276,8 @@ foreach my $test (@tests) {
                                                         if scalar @warnings;
                                 }
                             }
+                        }
+                    }
                 }
             }
         }
index dbba9ef..62eaa0b 100644 (file)
@@ -328,6 +328,13 @@ The C<PADOFFSET> type has changed from being unsigned to signed, and
 several pad-related variables such as C<PL_padix> have changed from being
 of type C<I32> to type C<PADOFFSET>.
 
+=item *
+
+The function C<L<perlapi/utf8n_to_uvchr>> has been changed to not
+abandon searching for other malformations when the first one is
+encountered.  A call to it thus can generate multiple diagnostics,
+instead of just one.
+
 =back
 
 =head1 Selected Bug Fixes
index d9f807c..6b42a00 100644 (file)
@@ -3344,7 +3344,7 @@ Perhaps the function's author was trying to write a subroutine signature
 but didn't enable that feature first (C<use feature 'signatures'>),
 so the signature was instead interpreted as a bad prototype.
 
-=item Malformed UTF-8 character (%s)
+=item Malformed UTF-8 character%s
 
 (S utf8)(F) Perl detected a string that should be UTF-8, but didn't
 comply with UTF-8 encoding rules, or represents a code point whose
index 40ec540..8de9154 100644 (file)
@@ -123,28 +123,28 @@ __DATA__
 3.1.8 N7 -             7       80:bf:80:bf:80:bf:80    -       unexpected continuation byte 0x80
 3.1.9 N64 -    64      80:81:82:83:84:85:86:87:88:89:8a:8b:8c:8d:8e:8f:90:91:92:93:94:95:96:97:98:99:9a:9b:9c:9d:9e:9f:a0:a1:a2:a3:a4:a5:a6:a7:a8:a9:aa:ab:ac:ad:ae:af:b0:b1:b2:b3:b4:b5:b6:b7:b8:b9:ba:bb:bc:bd:be:bf -       unexpected continuation byte 0x80
 3.2    Lonely start characters
-3.2.1 N32 -    64      c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:20:df:20 -       unexpected non-continuation byte 0x20, immediately after start byte 0xc0
+3.2.1 N34 -    64      c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:20:df:20 -       unexpected non-continuation byte 0x20, immediately after start byte 0xc0
 3.2.2 N16 -    32      e0:20:e1:20:e2:20:e3:20:e4:20:e5:20:e6:20:e7:20:e8:20:e9:20:ea:20:eb:20:ec:20:ed:20:ee:20:ef:20 -       unexpected non-continuation byte 0x20, immediately after start byte 0xe0
 3.2.3 N8 -     16      f0:20:f1:20:f2:20:f3:20:f4:20:f5:20:f6:20:f7:20 -       unexpected non-continuation byte 0x20, immediately after start byte 0xf0
 3.2.4 N4 -     8       f8:20:f9:20:fa:20:fb:20 -       unexpected non-continuation byte 0x20, immediately after start byte 0xf8
 3.2.5 N2 -     4       fc:20:fd:20     -       unexpected non-continuation byte 0x20, immediately after start byte 0xfc
 3.3    Sequences with last continuation byte missing
-3.3.1 n -      1       c0      -       1 byte, need 2
-3.3.2 n -      2       e0:80   -       2 bytes, need 3
-3.3.3 n -      3       f0:80:80        -       3 bytes, need 4
-3.3.4 n -      4       f8:80:80:80     -       4 bytes, need 5
-3.3.5 n -      5       fc:80:80:80:80  -       5 bytes, need 6
+3.3.1 N2 -     1       c0      -       1 byte, need 2
+3.3.2 N2 -     2       e0:80   -       2 bytes, need 3
+3.3.3 N2 -     3       f0:80:80        -       3 bytes, need 4
+3.3.4 N2 -     4       f8:80:80:80     -       4 bytes, need 5
+3.3.5 N2 -     5       fc:80:80:80:80  -       5 bytes, need 6
 3.3.6 n -      1       df      -       1 byte, need 2
 3.3.7 n -      2       ef:bf   -       2 bytes, need 3
 3.3.8 n -      3       f7:bf:bf        -       3 bytes, need 4
 3.3.9 n -      4       fb:bf:bf:bf     -       4 bytes, need 5
 3.3.10 n -     5       fd:bf:bf:bf:bf  -       5 bytes, need 6
 3.4    Concatenation of incomplete sequences
-3.4.1 N10 -    30      c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf       -       unexpected non-continuation byte 0xe0, immediately after start byte 0xc0
+3.4.1 N15 -    30      c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf       -       unexpected non-continuation byte 0xe0, immediately after start byte 0xc0
 3.5    Impossible bytes (but not with Perl's extended UTF-8)
 3.5.1 n -      1       fe      -       1 byte, need 7
 3.5.2 n -      1       ff      -       1 byte, need 13
-3.5.3 N4 -     4       fe:fe:ff:ff     -       byte 0xfe
+3.5.3 N5 -     4       fe:fe:ff:ff     -       byte 0xfe
 4      Overlong sequences
 4.1    Examples of an overlong ASCII character
 4.1.1 n -      2       c0:af   -       overlong
diff --git a/utf8.c b/utf8.c
index 49a9204..a307fdd 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -782,10 +782,13 @@ C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
 the length, in bytes, of that character.
 
 The value of C<flags> determines the behavior when C<s> does not point to a
-well-formed UTF-8 character.  If C<flags> is 0, when a malformation is found,
-zero is returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>) is the
-next possible position in C<s> that could begin a non-malformed character.
-Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised.
+well-formed UTF-8 character.  If C<flags> is 0, encountering a malformation
+causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
+is the next possible position in C<s> that could begin a non-malformed
+character.  Also, if UTF-8 warnings haven't been lexically disabled, a warning
+is raised.  Some UTF-8 input sequences may contain multiple malformations.
+This function tries to find every possible one in each call, so multiple
+warnings can be raised for each sequence.
 
 Various ALLOW flags can be set in C<flags> to allow (and not warn on)
 individual types of malformations, such as the sequence being overlong (that
@@ -877,17 +880,21 @@ UV
 Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 {
     const U8 * const s0 = s;
-    U8 * send;
+    U8 * send = NULL;           /* (initialized to silence compilers' wrong
+                                   warning) */
+    U32 possible_problems = 0;  /* A bit is set here for each potential problem
+                                   found as we go along */
     UV uv = *s;
-    STRLEN expectlen;
-    SV* sv = NULL;
-    UV outlier_ret = 0;        /* return value when input is in error or problematic
-                        */
-    UV pack_warn = 0;  /* Save result of packWARN() for later */
-    bool unexpected_non_continuation = FALSE;
-    bool overflowed = FALSE;
-    bool do_overlong_test = TRUE;   /* May have to skip this test */
+    STRLEN expectlen   = 0;     /* How long should this sequence be?
+                                   (initialized to silence compilers' wrong
+                                   warning) */
 
+    /* The below are used only if there is both an overlong malformation and a
+     * too short one.  Otherwise the first two are set to 's0' and 'send', and
+     * the third not used at all */
+    U8 * adjusted_s0 = (U8 *) s0;
+    U8 * adjusted_send;
+    UV uv_so_far = 0;   /* (Initialized to silence compilers' wrong warning) */
 
     PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
 
@@ -909,22 +916,21 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
      * returning to the caller C<*retlen> pointing to the very next byte (one
      * which is actually part of of the overflowing sequence), that could look
      * legitimate to the caller, which could discard the initial partial
-     * sequence and process the rest, inappropriately */
+     * sequence and process the rest, inappropriately.
+     *
+     * Some possible input sequences are malformed in more than one way.  This
+     * function goes to lengths to try to find all of them.  This is necessary
+     * for correctness, as the inputs may allow one malformation but not
+     * another, and if we abandon searching for others after finding the
+     * allowed one, we could allow in something that shouldn't have been.
+     */
 
-    /* Zero length strings, if allowed, of necessity are zero */
     if (UNLIKELY(curlen == 0)) {
-       if (retlen) {
-           *retlen = 0;
-       }
-
-       if (flags & UTF8_ALLOW_EMPTY) {
-           return 0;
-       }
-       if (! (flags & UTF8_CHECK_ONLY)) {
-           sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)",
-                                                malformed_text));
-       }
-       goto malformed;
+        possible_problems |= UTF8_GOT_EMPTY;
+        curlen = 0;
+        uv = 0; /* XXX It could be argued that this should be
+                   UNICODE_REPLACEMENT? */
+       goto ready_to_handle_errors;
     }
 
     expectlen = UTF8SKIP(s);
@@ -944,22 +950,10 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
 
     /* A continuation character can't start a valid sequence */
     if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
-       if (flags & UTF8_ALLOW_CONTINUATION) {
-           if (retlen) {
-               *retlen = 1;
-           }
-           return UNICODE_REPLACEMENT;
-       }
-
-       if (! (flags & UTF8_CHECK_ONLY)) {
-           sv = sv_2mortal(Perl_newSVpvf(aTHX_
-                            "%s: %s (unexpected continuation byte 0x%02x,"
-                            " with no preceding start byte)",
-                            malformed_text,
-                            _byte_dump_string(s0, 1), *s0));
-       }
-       curlen = 1;
-       goto malformed;
+       possible_problems |= UTF8_GOT_CONTINUATION;
+        curlen = 1;
+        uv = UNICODE_REPLACEMENT;
+       goto ready_to_handle_errors;
     }
 
     /* Here is not a continuation byte, nor an invariant.  The only thing left
@@ -973,124 +967,103 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
     /* Now, loop through the remaining bytes in the character's sequence,
      * accumulating each into the working value as we go.  Be sure to not look
      * past the end of the input string */
-    send =  (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
-
+    send = adjusted_send
+                    = (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
     for (s = s0 + 1; s < send; s++) {
        if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
-           if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) {
-
-               /* The original implementors viewed this malformation as more
-                * serious than the others (though I, khw, don't understand
-                * why, since other malformations also give very very wrong
-                * results), so there is no way to turn off checking for it.
-                * Set a flag, but keep going in the loop, so that we absorb
-                * the rest of the bytes that comprise the character. */
-               overflowed = TRUE;
-           }
            uv = UTF8_ACCUMULATE(uv, *s);
-       }
-       else {
-           /* Here, found a non-continuation before processing all expected
-            * bytes.  This byte begins a new character, so quit, even if
-            * allowing this malformation. */
-           unexpected_non_continuation = TRUE;
-           break;
-       }
+            continue;
+        }
+
+        /* Here, found a non-continuation before processing all expected bytes.
+         * This byte indicates the beginning of a new character, so quit, even
+         * if allowing this malformation. */
+        curlen = s - s0;    /* Save how many bytes we actually got */
+        possible_problems |= UTF8_GOT_NON_CONTINUATION;
+        goto finish_short;
     } /* End of loop through the character's bytes */
 
     /* Save how many bytes were actually in the character */
     curlen = s - s0;
 
-    /* The loop above finds two types of malformations: non-continuation and/or
-     * overflow.  The non-continuation malformation is really a too-short
-     * malformation, as it means that the current character ended before it was
-     * expected to (being terminated prematurely by the beginning of the next
-     * character, whereas in the too-short malformation there just are too few
-     * bytes available to hold the character.  In both cases, the check below
-     * that we have found the expected number of bytes would fail if executed.)
-     * Thus the non-continuation malformation is really unnecessary, being a
-     * subset of the too-short malformation.  But there may be existing
-     * applications that are expecting the non-continuation type, so we retain
-     * it, and return it in preference to the too-short malformation.  (If this
-     * code were being written from scratch, the two types might be collapsed
-     * into one.)  I, khw, am also giving priority to returning the
-     * non-continuation and too-short malformations over overflow when multiple
-     * ones are present.  I don't know of any real reason to prefer one over
-     * the other, except that it seems to me that multiple-byte errors trumps
-     * errors from a single byte */
-    if (UNLIKELY(unexpected_non_continuation)) {
-       if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
-           if (! (flags & UTF8_CHECK_ONLY)) {
-                sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s",
-                                unexpected_non_continuation_text(s0,
-                                                             send - s0,
-                                                             s - s0,
-                                                             (int) expectlen)));
-           }
-           goto malformed;
-       }
-       uv = UNICODE_REPLACEMENT;
-
-       /* Skip testing for overlongs, as the REPLACEMENT may not be the same
-        * as what the original expectations were. */
-       do_overlong_test = FALSE;
-       if (retlen) {
-           *retlen = curlen;
-       }
-    }
-    else if (UNLIKELY(curlen < expectlen)) {
-       if (! (flags & UTF8_ALLOW_SHORT)) {
-           if (! (flags & UTF8_CHECK_ONLY)) {
-               sv = sv_2mortal(Perl_newSVpvf(aTHX_
-                                "%s: %s (too short; got %d byte%s, need %d)",
-                                malformed_text,
-                                _byte_dump_string(s0, send - s0),
-                                (int)curlen,
-                                curlen == 1 ? "" : "s",
-                                (int)expectlen));
-           }
-           goto malformed;
-       }
-       uv = UNICODE_REPLACEMENT;
-       do_overlong_test = FALSE;
-       if (retlen) {
-           *retlen = curlen;
-       }
-    }
-
-    if (UNLIKELY(overflowed)) {
-        sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s: %s (overflows)",
-                                            malformed_text,
-                                            _byte_dump_string(s0, send - s0)));
-       goto malformed;
-    }
-
-    if (do_overlong_test
-       && expectlen > (STRLEN) OFFUNISKIP(uv)
-       && ! (flags & UTF8_ALLOW_LONG))
+    /* Did we get all the continuation bytes that were expected?  Note that we
+     * know this result even without executing the loop above.  But we had to
+     * do the loop to see if there are unexpected non-continuations. */
+    if (UNLIKELY(curlen < expectlen)) {
+       possible_problems |= UTF8_GOT_SHORT;
+
+      finish_short:
+        uv_so_far = uv;
+        uv = UNICODE_REPLACEMENT;
+    }
+
+    /* Note that there are two types of too-short malformation.  One is when
+     * there is actual wrong data before the normal termination of the
+     * sequence.  The other is that the sequence wasn't complete before the end
+     * of the data we are allowed to look at, based on the input 'curlen'.
+     * This means that we were passed data for a partial character, but it is
+     * valid as far as we saw.  The other is definitely invalid.  This
+     * distinction could be important to a caller, so the two types are kept
+     * separate. */
+
+    /* Check for overflow */
+    if (UNLIKELY(does_utf8_overflow(s0, send))) {
+        possible_problems |= UTF8_GOT_OVERFLOW;
+        uv = UNICODE_REPLACEMENT;
+    }
+
+    /* Check for overlong.  If no problems so far, 'uv' is the correct code
+     * point value.  Simply see if it is expressible in fewer bytes.  Otherwise
+     * we must look at the UTF-8 byte sequence itself to see if it is for an
+     * overlong */
+    if (     (   LIKELY(! possible_problems)
+              && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
+        || (   UNLIKELY(  possible_problems)
+            && (   UNLIKELY(! UTF8_IS_START(*s0))
+                || (   curlen > 1
+                    && UNLIKELY(is_utf8_overlong_given_start_byte_ok(s0,
+                                                                send - s0))))))
     {
-       /* The overlong malformation has lower precedence than the others.
-        * Note that if this malformation is allowed, we return the actual
-        * value, instead of the replacement character.  This is because this
-        * value is actually well-defined. */
-       if (! (flags & UTF8_CHECK_ONLY)) {
-            U8 tmpbuf[UTF8_MAXBYTES+1];
-            const U8 * const e = uvchr_to_utf8(tmpbuf, uv);
-            sv = sv_2mortal(Perl_newSVpvf(aTHX_
-                  "%s: %s (overlong; instead use %s to represent U+%0*"UVXf")",
-                  malformed_text,
-                  _byte_dump_string(s0, send - s0),
-                  _byte_dump_string(tmpbuf, e - tmpbuf),
-                  ((uv < 256) ? 2 : 4), /* Field width of 2 for small code
-                                           points */
-                  uv));
-       }
-       goto malformed;
+        possible_problems |= UTF8_GOT_LONG;
+
+        /* A convenience macro that matches either of the too-short conditions.
+         * */
+#       define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
+
+        if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
+            UV min_uv = uv_so_far;
+            STRLEN i;
+
+            /* Here, the input is both overlong and is missing some trailing
+             * bytes.  There is no single code point it could be for, but there
+             * may be enough information present to determine if what we have
+             * so far is for an unallowed code point, such as for a surrogate.
+             * The code below has the intelligence to determine this, but just
+             * for non-overlong UTF-8 sequences.  What we do here is calculate
+             * the smallest code point the input could represent if there were
+             * no too short malformation.  Then we compute and save the UTF-8
+             * for that, which is what the code below looks at instead of the
+             * raw input.  It turns out that the smallest such code point is
+             * all we need. */
+            for (i = curlen; i < expectlen; i++) {
+                min_uv = UTF8_ACCUMULATE(min_uv,
+                                     I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
+            }
+
+            Newx(adjusted_s0, OFFUNISKIP(min_uv) + 1, U8);
+            SAVEFREEPV((U8 *) adjusted_s0);    /* Needed because we may not get
+                                                  to free it ourselves if
+                                                  warnings are made fatal */
+            adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
+        }
     }
 
-    /* Here, the input is considered to be well-formed, but it still could be a
-     * problematic code point that is not allowed by the input parameters. */
-    if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */
+    /* Now check that the input isn't for a problematic code point not allowed
+     * by the input parameters. */
+                                              /* isn't problematic if < this */
+    if (   (   (   LIKELY(! possible_problems) && uv >= UNICODE_SURROGATE_FIRST)
+            || (   UNLIKELY(possible_problems)
+                && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
        && ((flags & ( UTF8_DISALLOW_NONCHAR
                       |UTF8_DISALLOW_SURROGATE
                       |UTF8_DISALLOW_SUPER
@@ -1099,151 +1072,386 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
                       |UTF8_WARN_SURROGATE
                       |UTF8_WARN_SUPER
                       |UTF8_WARN_ABOVE_31_BIT))
+                   /* In case of a malformation, 'uv' is not valid, and has
+                    * been changed to something in the Unicode range.
+                    * Currently we don't output a deprecation message if there
+                    * is already a malformation, so we don't have to special
+                    * case the test immediately below */
             || (   UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
                 && ckWARN_d(WARN_DEPRECATED))))
     {
-       if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
-
-            /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
-             * generation of the sv, since no warnings are raised under CHECK */
-           if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
-               && ckWARN_d(WARN_SURROGATE))
-           {
-               sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
-               pack_warn = packWARN(WARN_SURROGATE);
-           }
-           if (flags & UTF8_DISALLOW_SURROGATE) {
-               goto disallowed;
-           }
-       }
-       else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
-           if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
-                && ckWARN_d(WARN_NON_UNICODE))
-           {
-               sv = sv_2mortal(Perl_newSVpvf(aTHX_
-                   "Code point 0x%04"UVXf" is not Unicode, may not be portable",
-                   uv));
-               pack_warn = packWARN(WARN_NON_UNICODE);
-           }
+        /* If there were no malformations, or the only malformation is an
+         * overlong, 'uv' is valid */
+        if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
+            if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
+                possible_problems |= UTF8_GOT_SURROGATE;
+            }
+            else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
+                possible_problems |= UTF8_GOT_SUPER;
+            }
+            else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
+                possible_problems |= UTF8_GOT_NONCHAR;
+            }
+        }
+        else {  /* Otherwise, need to look at the source UTF-8, possibly
+                   adjusted to be non-overlong */
 
-            /* The maximum code point ever specified by a standard was
-             * 2**31 - 1.  Anything larger than that is a Perl extension that
-             * very well may not be understood by other applications (including
-             * earlier perl versions on EBCDIC platforms).  We test for these
-             * after the regular SUPER ones, and before possibly bailing out,
-             * so that the slightly more dire warning will override the regular
-             * one. */
-            if (   (flags & (UTF8_WARN_ABOVE_31_BIT
-                            |UTF8_WARN_SUPER
-                            |UTF8_DISALLOW_ABOVE_31_BIT))
-                && UNLIKELY(is_utf8_cp_above_31_bits(s0, send)))
+            if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
+                                >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
             {
-                if (  ! (flags & UTF8_CHECK_ONLY)
-                    &&  (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER))
-                    &&  ckWARN_d(WARN_UTF8))
+                possible_problems |= UTF8_GOT_SUPER;
+            }
+            else if (curlen > 1) {
+                if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
+                                      NATIVE_UTF8_TO_I8(*adjusted_s0),
+                                      NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
                 {
-                    sv = sv_2mortal(Perl_newSVpvf(aTHX_
-                        "Code point 0x%"UVXf" is not Unicode, and not portable",
-                        uv));
-                    pack_warn = packWARN(WARN_UTF8);
+                    possible_problems |= UTF8_GOT_SUPER;
                 }
-                if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
-                    goto disallowed;
+                else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
+                                      NATIVE_UTF8_TO_I8(*adjusted_s0),
+                                      NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
+                {
+                    possible_problems |= UTF8_GOT_SURROGATE;
                 }
             }
 
-           if (flags & UTF8_DISALLOW_SUPER) {
-               goto disallowed;
-           }
+            /* We need a complete well-formed UTF-8 character to discern
+             * non-characters, so can't look for them here */
+        }
+    }
 
-            /* The deprecated warning overrides any non-deprecated one */
-            if (UNLIKELY(uv > MAX_NON_DEPRECATED_CP) && ckWARN_d(WARN_DEPRECATED))
-            {
-                sv = sv_2mortal(Perl_newSVpvf(aTHX_ cp_above_legal_max,
-                                              uv, MAX_NON_DEPRECATED_CP));
-                pack_warn = packWARN(WARN_DEPRECATED);
-            }
-       }
-       else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
-           if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
-               && ckWARN_d(WARN_NONCHAR))
-           {
-               sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is not recommended for open interchange", uv));
-               pack_warn = packWARN(WARN_NONCHAR);
-           }
-           if (flags & UTF8_DISALLOW_NONCHAR) {
-               goto disallowed;
-           }
-       }
+  ready_to_handle_errors:
+
+    /* At this point:
+     * curlen               contains the number of bytes in the sequence that
+     *                      this call should advance the input by.
+     * possible_problems'   is 0 if there weren't any problems; otherwise a bit
+     *                      is set in it for each potential problem found.
+     * uv                   contains the code point the input sequence
+     *                      represents; or if there is a problem that prevents
+     *                      a well-defined value from being computed, it is
+     *                      some subsitute value, typically the REPLACEMENT
+     *                      CHARACTER.
+     * s0                   points to the first byte of the character
+     * send                 points to just after where that (potentially
+     *                      partial) character ends
+     * adjusted_s0          normally is the same as s0, but in case of an
+     *                      overlong for which the UTF-8 matters below, it is
+     *                      the first byte of the shortest form representation
+     *                      of the input.
+     * adjusted_send        normally is the same as 'send', but if adjusted_s0
+     *                      is set to something other than s0, this points one
+     *                      beyond its end
+     */
 
-       if (sv) {
-            outlier_ret = UNI_TO_NATIVE(uv);
-           goto do_warn;
-       }
+    if (UNLIKELY(possible_problems)) {
+        bool disallowed = FALSE;
+        const U32 orig_problems = possible_problems;
+
+        while (possible_problems) { /* Handle each possible problem */
+            UV pack_warn = 0;
+            char * message = NULL;
+
+            /* Each 'if' clause handles one problem.  They are ordered so that
+             * the first ones' messages will be displayed before the later
+             * ones; this is kinda in decreasing severity order */
+            if (possible_problems & UTF8_GOT_OVERFLOW) {
+
+                /* Overflow means also got a super and above 31 bits, but we
+                 * handle all three cases here */
+                possible_problems
+                  &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_ABOVE_31_BIT);
+
+                disallowed = TRUE;
+
+                /* The warnings code explicitly says it doesn't handle the case
+                 * of packWARN2 and two categories which have parent-child
+                 * relationship.  Even if it works now to raise the warning if
+                 * either is enabled, it wouldn't necessarily do so in the
+                 * future.  We output (only) the most dire warning*/
+                if (! (flags & UTF8_CHECK_ONLY)) {
+                    if (ckWARN_d(WARN_UTF8)) {
+                        pack_warn = packWARN(WARN_UTF8);
+                    }
+                    else if (ckWARN_d(WARN_NON_UNICODE)) {
+                        pack_warn = packWARN(WARN_NON_UNICODE);
+                    }
+                    if (pack_warn) {
+                        message = Perl_form(aTHX_ "%s: %s (overflows)",
+                                        malformed_text,
+                                        _byte_dump_string(s0, send - s0));
+                    }
+                }
+            }
+            else if (possible_problems & UTF8_GOT_EMPTY) {
+                possible_problems &= ~UTF8_GOT_EMPTY;
+
+                if (! (flags & UTF8_ALLOW_EMPTY)) {
+                    disallowed = TRUE;
+                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                        pack_warn = packWARN(WARN_UTF8);
+                        message = Perl_form(aTHX_ "%s (empty string)",
+                                                   malformed_text);
+                    }
+                }
+            }
+            else if (possible_problems & UTF8_GOT_CONTINUATION) {
+                possible_problems &= ~UTF8_GOT_CONTINUATION;
+
+                if (! (flags & UTF8_ALLOW_CONTINUATION)) {
+                    disallowed = TRUE;
+                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                        pack_warn = packWARN(WARN_UTF8);
+                        message = Perl_form(aTHX_
+                                "%s: %s (unexpected continuation byte 0x%02x,"
+                                " with no preceding start byte)",
+                                malformed_text,
+                                _byte_dump_string(s0, 1), *s0);
+                    }
+                }
+            }
+            else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
+                possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
+
+                if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
+                    disallowed = TRUE;
+                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                        pack_warn = packWARN(WARN_UTF8);
+                        message = Perl_form(aTHX_ "%s",
+                            unexpected_non_continuation_text(s0,
+                                                            send - s0,
+                                                            s - s0,
+                                                            (int) expectlen));
+                    }
+                }
+            }
+            else if (possible_problems & UTF8_GOT_SHORT) {
+                possible_problems &= ~UTF8_GOT_SHORT;
+
+                if (! (flags & UTF8_ALLOW_SHORT)) {
+                    disallowed = TRUE;
+                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                        pack_warn = packWARN(WARN_UTF8);
+                        message = Perl_form(aTHX_
+                                "%s: %s (too short; got %d byte%s, need %d)",
+                                malformed_text,
+                                _byte_dump_string(s0, send - s0),
+                                (int)curlen,
+                                curlen == 1 ? "" : "s",
+                                (int)expectlen);
+                    }
+                }
 
-       /* Here, this is not considered a malformed character, so drop through
-        * to return it */
-    }
+            }
+            else if (possible_problems & UTF8_GOT_LONG) {
+                possible_problems &= ~UTF8_GOT_LONG;
+
+                if (! (flags & UTF8_ALLOW_LONG)) {
+                    disallowed = TRUE;
+
+                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                        pack_warn = packWARN(WARN_UTF8);
+
+                        /* These error types cause 'uv' to be something that
+                         * isn't what was intended, so can't use it in the
+                         * message.  The other error types either can't
+                         * generate an overlong, or else the 'uv' is valid */
+                        if (orig_problems &
+                                        (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
+                        {
+                            message = Perl_form(aTHX_
+                                    "%s: %s (any UTF-8 sequence that starts"
+                                    " with \"%s\" is overlong which can and"
+                                    " should be represented with a"
+                                    " different, shorter sequence)",
+                                    malformed_text,
+                                    _byte_dump_string(s0, send - s0),
+                                    _byte_dump_string(s0, curlen));
+                        }
+                        else {
+                            U8 tmpbuf[UTF8_MAXBYTES+1];
+                            const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
+                                                                        uv, 0);
+                            message = Perl_form(aTHX_
+                                "%s: %s (overlong; instead use %s to represent"
+                                " U+%0*"UVXf")",
+                                malformed_text,
+                                _byte_dump_string(s0, send - s0),
+                                _byte_dump_string(tmpbuf, e - tmpbuf),
+                                ((uv < 256) ? 2 : 4), /* Field width of 2 for
+                                                         small code points */
+                                uv);
+                        }
+                    }
+                }
+            }
+            else if (possible_problems & UTF8_GOT_SURROGATE) {
+                possible_problems &= ~UTF8_GOT_SURROGATE;
+
+                /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
+                 * generation of the format, since no warnings are raised under
+                 * CHECK */
+                if (   (flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY))
+                                                        == UTF8_WARN_SURROGATE
+                    && ckWARN_d(WARN_SURROGATE))
+                {
+                        pack_warn = packWARN(WARN_SURROGATE);
+
+                        /* These are the only errors that can occur with a
+                        * surrogate when the 'uv' isn't valid */
+                        if (orig_problems & UTF8_GOT_TOO_SHORT) {
+                            message = Perl_form(aTHX_
+                                    "UTF-16 surrogate (any UTF-8 sequence that"
+                                    " starts with \"%s\" is for a surrogate)",
+                                    _byte_dump_string(s0, curlen));
+                        }
+                        else {
+                            message = Perl_form(aTHX_
+                                            "UTF-16 surrogate U+%04"UVXf"", uv);
+                        }
+                }
 
-    return UNI_TO_NATIVE(uv);
+                if (flags & UTF8_DISALLOW_SURROGATE) {
+                    disallowed = TRUE;
+                }
+            }
+            else if (possible_problems & UTF8_GOT_SUPER) {
+                possible_problems &= ~UTF8_GOT_SUPER;
 
-    /* There are three cases which get to beyond this point.  In all 3 cases:
-     * <sv>        if not null points to a string to print as a warning.
-     * <curlen>            is what <*retlen> should be set to if UTF8_CHECK_ONLY isn't
-     *             set.
-     * <outlier_ret> is what return value to use if UTF8_CHECK_ONLY isn't set.
-     *             This is done by initializing it to 0, and changing it only
-     *             for case 1).
-     * The 3 cases are:
-     * 1)   The input is valid but problematic, and to be warned about.  The
-     *     return value is the resultant code point; <*retlen> is set to
-     *     <curlen>, the number of bytes that comprise the code point.
-     *     <pack_warn> contains the result of packWARN() for the warning
-     *     types.  The entry point for this case is the label <do_warn>;
-     * 2)   The input is a valid code point but disallowed by the parameters to
-     *     this function.  The return value is 0.  If UTF8_CHECK_ONLY is set,
-     *     <*relen> is -1; otherwise it is <curlen>, the number of bytes that
-     *     comprise the code point.  <pack_warn> contains the result of
-     *     packWARN() for the warning types.  The entry point for this case is
-     *     the label <disallowed>.
-     * 3)   The input is malformed.  The return value is 0.  If UTF8_CHECK_ONLY
-     *     is set, <*relen> is -1; otherwise it is <curlen>, the number of
-     *     bytes that comprise the malformation.  All such malformations are
-     *     assumed to be warning type <utf8>.  The entry point for this case
-     *     is the label <malformed>.
-     */
+                if (   (flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY))
+                                                            == UTF8_WARN_SUPER
+                    && ckWARN_d(WARN_NON_UNICODE))
+                {
+                        pack_warn = packWARN(WARN_NON_UNICODE);
+
+                        if (orig_problems & UTF8_GOT_TOO_SHORT) {
+                            message = Perl_form(aTHX_
+                                    "Any UTF-8 sequence that starts with"
+                                    " \"%s\" is for a non-Unicode code point,"
+                                    " may not be portable",
+                                    _byte_dump_string(s0, curlen));
+                        }
+                        else {
+                            message = Perl_form(aTHX_
+                                                "Code point 0x%04"UVXf" is not"
+                                                " Unicode, may not be portable",
+                                                uv);
+                        }
+                }
 
-  malformed:
+                /* The maximum code point ever specified by a standard was
+                 * 2**31 - 1.  Anything larger than that is a Perl extension
+                 * that very well may not be understood by other applications
+                 * (including earlier perl versions on EBCDIC platforms).  We
+                 * test for these after the regular SUPER ones, and before
+                 * possibly bailing out, so that the slightly more dire warning
+                 * will override the regular one. */
+                if (   (flags & (UTF8_WARN_ABOVE_31_BIT
+                                |UTF8_WARN_SUPER
+                                |UTF8_DISALLOW_ABOVE_31_BIT))
+                    && (   (   UNLIKELY(orig_problems & UTF8_GOT_TOO_SHORT)
+                            && UNLIKELY(is_utf8_cp_above_31_bits(
+                                                                adjusted_s0,
+                                                                adjusted_send)))
+                        || (   LIKELY(! (orig_problems & UTF8_GOT_TOO_SHORT))
+                            && UNLIKELY(UNICODE_IS_ABOVE_31_BIT(uv)))))
+                {
+                    if (  ! (flags & UTF8_CHECK_ONLY)
+                        &&  (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER))
+                        &&  ckWARN_d(WARN_UTF8))
+                    {
+                        pack_warn = packWARN(WARN_UTF8);
+
+                        if (orig_problems & UTF8_GOT_TOO_SHORT) {
+                            message = Perl_form(aTHX_
+                                        "Any UTF-8 sequence that starts with"
+                                        " \"%s\" is for a non-Unicode code"
+                                        " point, and is not portable",
+                                        _byte_dump_string(s0, curlen));
+                        }
+                        else {
+                            message = Perl_form(aTHX_
+                                        "Code point 0x%"UVXf" is not Unicode,"
+                                        " and not portable",
+                                         uv);
+                        }
+                    }
 
-    if (sv && ckWARN_d(WARN_UTF8)) {
-       pack_warn = packWARN(WARN_UTF8);
-    }
+                    if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
+                        disallowed = TRUE;
+                    }
+                }
 
-  disallowed:
+                if (flags & UTF8_DISALLOW_SUPER) {
+                    disallowed = TRUE;
+                }
 
-    if (flags & UTF8_CHECK_ONLY) {
-       if (retlen)
-           *retlen = ((STRLEN) -1);
-       return 0;
-    }
+                /* The deprecated warning overrides any non-deprecated one.  If
+                 * there are other problems, a deprecation message is not
+                 * really helpful, so don't bother to raise it in that case.
+                 * This also keeps the code from having to handle the case
+                 * where 'uv' is not valid. */
+                if (   ! (orig_problems
+                                    & (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
+                    && UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
+                    && ckWARN_d(WARN_DEPRECATED))
+                {
+                    message = Perl_form(aTHX_ cp_above_legal_max,
+                                              uv, MAX_NON_DEPRECATED_CP);
+                    pack_warn = packWARN(WARN_DEPRECATED);
+                }
+            }
+            else if (possible_problems & UTF8_GOT_NONCHAR) {
+                possible_problems &= ~UTF8_GOT_NONCHAR;
 
-  do_warn:
+                if (  (flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY))
+                                                        == UTF8_WARN_NONCHAR
+                    && ckWARN_d(WARN_NONCHAR))
+                {
+                        /* The code above should have guaranteed that we don't
+                         * get here with errors other than overlong */
+                        assert (! (orig_problems
+                                        & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
+
+                        pack_warn = packWARN(WARN_NONCHAR);
+                        message = Perl_form(aTHX_ "Unicode non-character"
+                                                " U+%04"UVXf" is not recommended"
+                                                " for open interchange", uv);
+                }
 
-    if (pack_warn) {   /* <pack_warn> was initialized to 0, and changed only
-                          if warnings are to be raised. */
-       const char * const string = SvPVX_const(sv);
+                if (flags & UTF8_DISALLOW_NONCHAR) {
+                    disallowed = TRUE;
+                }
+            } /* End of looking through the possible flags */
+
+            /* Display the message (if any) for the problem being handled in
+             * this iteration of the loop */
+            if (message) {
+                if (PL_op)
+                    Perl_warner(aTHX_ pack_warn, "%s in %s", message,
+                                                 OP_DESC(PL_op));
+                else
+                    Perl_warner(aTHX_ pack_warn, "%s", message);
+            }
+        }   /* End of 'while (possible_problems) {' */
 
-       if (PL_op)
-           Perl_warner(aTHX_ pack_warn, "%s in %s", string,  OP_DESC(PL_op));
-       else
-           Perl_warner(aTHX_ pack_warn, "%s", string);
-    }
+        /* Since there was a possible problem, the returned length may need to
+         * be changed from the one stored at the beginning of this function.
+         * Instead of trying to figure out if that's needed, just do it. */
+        if (retlen) {
+            *retlen = curlen;
+        }
 
-    if (retlen) {
-       *retlen = curlen;
+        if (disallowed) {
+            if (flags & UTF8_CHECK_ONLY && retlen) {
+                *retlen = ((STRLEN) -1);
+            }
+            return 0;
+        }
     }
 
-    return outlier_ret;
+    return UNI_TO_NATIVE(uv);
 }
 
 /*
@@ -1394,7 +1602,7 @@ Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
                    if (UTF8_IS_CONTINUATION(c1)) {
                        c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
                    } else {
-                        /* diag_listed_as: Malformed UTF-8 character (%s) */
+                        /* diag_listed_as: Malformed UTF-8 character%s */
                        Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
                                     "%s %s%s",
                                     unexpected_non_continuation_text(u - 1, 2, 1, 2),
diff --git a/utf8.h b/utf8.h
index 7cd163a..c55ce26 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -526,13 +526,6 @@ encoded as UTF-8.  C<cp> is a native (ASCII or EBCDIC) code point if less than
                                    | ((NATIVE_UTF8_TO_I8((U8)new))             \
                                        & UTF_CONTINUATION_MASK))
 
-/* If a value is anded with this, and the result is non-zero, then using the
- * original value in UTF8_ACCUMULATE will overflow, shifting bits off the left
- * */
-#define UTF_ACCUMULATION_OVERFLOW_MASK                                 \
-    (((UV) UTF_CONTINUATION_MASK) << ((sizeof(UV) * CHARBITS)           \
-           - UTF_ACCUMULATION_SHIFT))
-
 /* This works in the face of malformed UTF-8. */
 #define UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, e) (UTF8_IS_DOWNGRADEABLE_START(*s) \
                                                && ( (e) - (s) > 1)             \
@@ -718,26 +711,37 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
 
 
 #define UTF8_ALLOW_EMPTY               0x0001  /* Allow a zero length string */
+#define UTF8_GOT_EMPTY                  UTF8_ALLOW_EMPTY
 
 /* Allow first byte to be a continuation byte */
 #define UTF8_ALLOW_CONTINUATION                0x0002
+#define UTF8_GOT_CONTINUATION          UTF8_ALLOW_CONTINUATION
 
-/* Allow second... bytes to be non-continuation bytes */
+/* Unexpected continuation byte */
 #define UTF8_ALLOW_NON_CONTINUATION    0x0004
+#define UTF8_GOT_NON_CONTINUATION      UTF8_ALLOW_NON_CONTINUATION
 
 /* expecting more bytes than were available in the string */
 #define UTF8_ALLOW_SHORT               0x0008
+#define UTF8_GOT_SHORT                 UTF8_ALLOW_SHORT
 
 /* Overlong sequence; i.e., the code point can be specified in fewer bytes. */
 #define UTF8_ALLOW_LONG                 0x0010
+#define UTF8_GOT_LONG                   UTF8_ALLOW_LONG
+
+/* Currently no way to allow overflow */
+#define UTF8_GOT_OVERFLOW               0x0020
 
 #define UTF8_DISALLOW_SURROGATE                0x0040  /* Unicode surrogates */
+#define UTF8_GOT_SURROGATE             UTF8_DISALLOW_SURROGATE
 #define UTF8_WARN_SURROGATE            0x0080
 
 #define UTF8_DISALLOW_NONCHAR           0x0100 /* Unicode non-character */
+#define UTF8_GOT_NONCHAR                UTF8_DISALLOW_NONCHAR
 #define UTF8_WARN_NONCHAR               0x0200 /*  code points */
 
 #define UTF8_DISALLOW_SUPER            0x0400  /* Super-set of Unicode: code */
+#define UTF8_GOT_SUPER                 UTF8_DISALLOW_SUPER
 #define UTF8_WARN_SUPER                        0x0800  /* points above the legal max */
 
 /* Code points which never were part of the original UTF-8 standard, which only
@@ -745,6 +749,7 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
  * The first byte of these code points is FE or FF on ASCII platforms.  If the
  * first byte is FF, it will overflow a 32-bit word. */
 #define UTF8_DISALLOW_ABOVE_31_BIT      0x1000
+#define UTF8_GOT_ABOVE_31_BIT           UTF8_DISALLOW_ABOVE_31_BIT
 #define UTF8_WARN_ABOVE_31_BIT          0x2000
 
 /* For back compat, these old names are misleading for UTF_EBCDIC */