utf8n_to_uvchr(): Note multiple malformations

author Karl Williamson <khw@cpan.org>

Thu, 6 Oct 2016 01:09:02 +0000 (19:09 -0600)

committer Karl Williamson <khw@cpan.org>

Thu, 13 Oct 2016 17:18:12 +0000 (11:18 -0600)
author Karl Williamson <khw@cpan.org>
Thu, 6 Oct 2016 01:09:02 +0000 (19:09 -0600)
committer Karl Williamson <khw@cpan.org>
Thu, 13 Oct 2016 17:18:12 +0000 (11:18 -0600)
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t

index 6581ee9..9e75c25 100644 (file)
--- a/ext/XS-APItest/t/utf8.t
+++ b/ext/XS-APItest/t/utf8.t
@@ -63,6 +63,26 @@ for (my $i = 0; $i < 256; $i++) {
                      ? sub { return shift }
                      : sub { return join "", map { chr $native_to_i8[ord $_] }
                                              split "", shift };
+sub start_byte_to_cont($) {
+
+    # Extract the code point information from the input UTF-8 start byte, and
+    # return a continuation byte containing the same information.  This is
+    # used in constructing an overlong malformation from valid input.
+
+    my $byte = shift;
+    my $len = test_UTF8_SKIP($byte);
+    if ($len < 2) {
+        die "";
+    }
+
+    $byte = ord native_to_I8($byte);
+
+    # Copied from utf8.h.  This gets rid of the leading 1 bits.
+    $byte &= ((($len) >= 7) ? 0x00 : (0x1F >> (($len)-2)));
+
+    $byte |= (isASCII) ? 0x80 : ord I8_to_native("\xA0");
+    return chr $byte;
+}
  
  my $is64bit = length sprintf("%x", ~0) > 8;
  
@@ -1339,6 +1359,23 @@ foreach my $test (@malformations) {
      }
  }
  
+sub nonportable_regex ($) {
+
+    # Returns a pattern that matches the non-portable message raised either
+    # for the specific input code point, or the one generated when there
+    # is some malformation that precludes the message containing the specific
+    # code point
+
+    my $code_point = shift;
+
+    my $string = sprintf '(Code point 0x%x is not Unicode, and'
+                       . '|Any UTF-8 sequence that starts with'
+                       . ' "(\\\x[[:xdigit:]]{2})+" is for a'
+                       . ' non-Unicode code point, and is) not portable',
+                    $code_point;
+    return qr/$string/;
+}
+
  # Now test the cases where a legal code point is generated, but may or may not
  # be allowed/warned on.
  my @tests = (
@@ -1368,7 +1405,7 @@ my @tests = (
          $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
          'non_unicode', 0x110000,
          (isASCII) ? 4 : 5,
-        qr/not Unicode.* may not be portable/
+        qr/(not Unicode|for a non-Unicode code point).* may not be portable/
      ],
      [ "non_unicode whose first byte tells that",
          (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
@@ -1376,7 +1413,7 @@ my @tests = (
          'non_unicode',
          (isASCII) ? 0x140000 : 0x200000,
          (isASCII) ? 4 : 5,
-        qr/not Unicode.* may not be portable/
+        qr/(not Unicode|for a non-Unicode code point).* may not be portable/
      ],
      [ "first of 32 consecutive non-character code points",
          (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
@@ -1639,7 +1676,7 @@ my @tests = (
          # 32-bit machines
          $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
          'utf8', 0x80000000, (isASCII) ? 7 :14,
-        qr/Code point 0x80000000 is not Unicode, and not portable/
+        nonportable_regex(0x80000000)
      ],
      [ "requires at least 32 bits, and use SUPER-type flags, instead of ABOVE_31_BIT",
          (isASCII)
@@ -1647,7 +1684,7 @@ my @tests = (
           : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
          $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
          'utf8', 0x80000000, (isASCII) ? 7 :14,
-        qr/Code point 0x80000000 is not Unicode, and not portable/
+        nonportable_regex(0x80000000)
      ],
      [ "overflow with warnings/disallow for more than 31 bits",
          # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT
@@ -1685,7 +1722,7 @@ if ($is64bit) {
              : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
              $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
              'utf8', 0x1000000000, (isASCII) ? 13 : 14,
-            qr/Code point 0x.* is not Unicode, and not portable/
+            qr/and( is)? not portable/
          ];
      if (! isASCII) {
          push @tests,   # These could falsely show wrongly in a naive implementation
@@ -1693,37 +1730,38 @@ if ($is64bit) {
                  I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                  $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
                  'utf8', 0x800000000, 14,
-                qr/Code point 0x800000000 is not Unicode, and not portable/
+                nonportable_regex(0x80000000)
              ],
              [ "requires at least 32 bits",
                  I8_to_native("\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                  $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
                  'utf8', 0x10000000000, 14,
-                qr/Code point 0x10000000000 is not Unicode, and not portable/
+                nonportable_regex(0x10000000000)
              ],
              [ "requires at least 32 bits",
                  I8_to_native("\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                  $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
                  'utf8', 0x200000000000, 14,
-                qr/Code point 0x200000000000 is not Unicode, and not portable/
+                nonportable_regex(0x20000000000)
              ],
              [ "requires at least 32 bits",
                  I8_to_native("\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                  $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
                  'utf8', 0x4000000000000, 14,
-                qr/Code point 0x4000000000000 is not Unicode, and not portable/
+                nonportable_regex(0x4000000000000)
              ],
              [ "requires at least 32 bits",
                  I8_to_native("\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                  $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
                  'utf8', 0x80000000000000, 14,
-                qr/Code point 0x80000000000000 is not Unicode, and not portable/
+                nonportable_regex(0x80000000000000)
              ],
              [ "requires at least 32 bits",
                  I8_to_native("\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
+                   #IBM-1047  \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
                  $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
                  'utf8', 0x1000000000000000, 14,
-                qr/Code point 0x1000000000000000 is not Unicode, and not portable/
+                nonportable_regex(0x1000000000000000)
              ];
      }
  }
@@ -1732,7 +1770,7 @@ foreach my $test (@tests) {
      my ($testname, $bytes, $warn_flags, $disallow_flags, $category, $allowed_uv, $expected_len, $message ) = @$test;
  
      my $length = length $bytes;
-    my $will_overflow = $testname =~ /overflow/;
+    my $will_overflow = $testname =~ /overflow/ ? 'overflow' : "";
  
      {
          use warnings;
@@ -1875,6 +1913,83 @@ foreach my $test (@tests) {
          foreach my $warn_flag (0, $warn_flags) {
              foreach my $disallow_flag (0, $disallow_flags) {
                  foreach my $do_warning (0, 1) {
+
+                    # We try each of the above with various combinations of
+                    # malformations that can occur on the same input sequence.
+                    foreach my $short ("",
+                                       "short",
+                                       "unexpected non-continuation")
+                    {
+                        # The non-characters can't be discerned with a short
+                        # malformation
+                        next if $short && $testname =~ /non-character/;
+
+                        foreach my $overlong ("", "overlong") {
+
+                            # Our hard-coded overlong starts with \xFE, so
+                            # can't handle anything larger.
+                            next if $overlong
+                            && ord native_to_I8(substr($bytes, 0, 1)) >= 0xFE;
+
+                            my @malformations;
+                            push @malformations, $short if $short;
+                            push @malformations, $overlong if $overlong;
+
+                            # The overflow malformation test in the input
+                            # array is coerced into being treated like one of
+                            # the others.
+                            push @malformations, 'overflow' if $will_overflow;
+
+                            my $malformations_name = join "/", @malformations;
+                            $malformations_name .= " malformation"
+                                                        if $malformations_name;
+                            $malformations_name .= "s" if @malformations > 1;
+                            my $this_bytes = $bytes;
+                            my $this_length = $length;
+                            my $expected_uv = $allowed_uv;
+                            my $this_expected_len = $expected_len;
+                            if ($malformations_name) {
+                                $expected_uv = 0;
+
+                                # Coerce the input into the desired
+                                # malformation
+                                if ($malformations_name =~ /overlong/) {
+
+                                    # For an overlong, we convert the original
+                                    # start byte into a continuation byte with
+                                    # the same data bits as originally. ...
+                                    substr($this_bytes, 0, 1)
+                                        = start_byte_to_cont(substr($this_bytes,
+                                                                    0, 1));
+
+                                    # ... Then we prepend it with a known
+                                    # overlong sequence.  This should evaluate
+                                    # to the exact same code point as the
+                                    # original.
+                                    $this_bytes = "\xfe"
+                                               . ("\x80"
+                                                   x ( 6 - length($this_bytes)))
+                                               . $this_bytes;
+                                    $this_length = length($this_bytes);
+                                    $this_expected_len = 7;
+                                }
+                                if ($malformations_name =~ /short/) {
+
+                                    # Just tell the test to not look far
+                                    # enough into the input.
+                                    $this_length--;
+                                    $this_expected_len--;
+                                }
+                                elsif ($malformations_name
+                                                        =~ /non-continuation/)
+                                {
+                                    # Change the final continuation byte into
+                                    # a non one.
+                                    substr($this_bytes, -1, 1) = '?';
+                                    $this_expected_len--;
+                                }
+                            }
+
                              my $eval_warn = $do_warning
                                          ? "use warnings '$warning'"
                                          : $warning eq "utf8"
@@ -1882,17 +1997,18 @@ foreach my $test (@tests) {
                                              : ( "use warnings 'utf8';"
                                                . " no warnings '$warning'");
  
-                            # is effectively disallowed if will overflow, even
-                            # if the flag indicates it is allowed, fix up test
-                            # name to indicate this as well
-                            my $disallowed = $disallow_flag || $will_overflow;
-
+                            # Is effectively disallowed if we've set up a
+                            # malformation, even if the flag indicates it is
+                            # allowed.  Fix up test name to indicate this as
+                            # well
+                            my $disallowed = $disallow_flag
+                                          || $malformations_name;
                              my $this_name = "utf8n_to_uvchr() $testname: "
-                                          . (($disallow_flag)
-                                             ? 'disallowed'
-                                             : ($disallowed)
-                                               ? 'ABOVE_31_BIT allowed'
-                                               : 'allowed');
+                                                        . (($disallow_flag)
+                                                            ? 'disallowed'
+                                                            : $disallowed
+                                                            ? $disallowed
+                                                            : 'allowed');
                              $this_name .= ", $eval_warn";
                              $this_name .= ", " . (($warn_flag)
                                                  ? 'with warning flag'
@@ -1900,13 +2016,15 @@ foreach my $test (@tests) {
  
                              undef @warnings;
                              my $ret_ref;
-                            my $display_bytes = display_bytes($bytes);
+                            my $display_bytes = display_bytes($this_bytes);
                              my $call = "Call was: $eval_warn; \$ret_ref"
                                       . " = test_utf8n_to_uvchr('$display_bytes'"
-                                     . ", $length, $warn_flag|$disallow_flag)";
+                                     . ", $this_length, $warn_flag"
+                                     . "|$disallow_flag)";
                              my $eval_text =      "$eval_warn; \$ret_ref"
-                                     . " = test_utf8n_to_uvchr('$bytes',"
-                                     . " $length, $warn_flag|$disallow_flag)";
+                                     . " = test_utf8n_to_uvchr('$this_bytes',"
+                                     . " $this_length, $warn_flag"
+                                     . "|$disallow_flag)";
                              eval "$eval_text";
                              if (! ok ("$@ eq ''",
                                  "$this_name: eval succeeded"))
@@ -1922,49 +2040,50 @@ foreach my $test (@tests) {
                                  }
                              }
                              else {
-                                unless (is($ret_ref->[0], $allowed_uv,
+                                unless (is($ret_ref->[0], $expected_uv,
                                          "$this_name: Returns expected uv: "
-                                        . sprintf("0x%04X", $allowed_uv)))
+                                        . sprintf("0x%04X", $expected_uv)))
                                  {
                                      diag $call;
                                  }
                              }
-                            unless (is($ret_ref->[1], $expected_len,
+                            unless (is($ret_ref->[1], $this_expected_len,
                                  "$this_name: Returns expected length:"
-                              . " $expected_len"))
+                              . " $this_expected_len"))
                              {
                                  diag $call;
                              }
  
-                            if ($will_overflow) {
+                            if (@malformations) {
                                  if (! $do_warning && $warning eq 'utf8') {
                                      goto no_warnings_expected;
                                  }
  
-                                # Will get the overflow message instead of the
-                                # expected message under these circumstances,
-                                # as they would otherwise accept an overflowed
-                                # value, which the code should not allow, so
-                                # falls back to overflow.
-                                if (is(scalar @warnings, 1,
-                                    "$this_name: Got a single warning "))
-                                {
-                                    unless (like($warnings[0], qr/overflow/,
-                                                "$this_name: Got overflow"
-                                              . " warning"))
-                                    {
-                                        diag $call;
+                                # Check that each malformation generates a
+                                # warning, removing that warning if found
+                              MALFORMATION:
+                                foreach my $malformation (@malformations) {
+                                    foreach (my $i = 0; $i < @warnings; $i++) {
+                                        if ($warnings[$i] =~ /$malformation/) {
+                                            pass("Expected and got"
+                                               . "'$malformation' warning");
+                                            splice @warnings, $i, 1;
+                                            next MALFORMATION;
+                                        }
                                      }
-                                }
-                                else {
-                                    diag $call;
-                                    output_warnings(@warnings)
-                                                            if scalar @warnings;
+                                    fail("Expected '$malformation' warning"
+                                       . "but didn't get it");
+
                                  }
                              }
-                            elsif (       ! $do_warning
-                                   && (   $warning eq 'utf8'
-                                       || $warning eq $category))
+
+                            # Any overflow will override any super or above-31
+                            # warnings.
+                            goto no_warnings_expected if $will_overflow;
+
+                            if (    ! $do_warning
+                                && (   $warning eq 'utf8'
+                                    || $warning eq $category))
                              {
                                  goto no_warnings_expected;
                              }
@@ -2000,8 +2119,9 @@ foreach my $test (@tests) {
                              # not just when the $disallow_flag is set
                              if ($disallowed) {
                                  undef @warnings;
-                                $ret_ref = test_utf8n_to_uvchr($bytes, $length,
-                                                $disallow_flag|$UTF8_CHECK_ONLY);
+                                $ret_ref = test_utf8n_to_uvchr(
+                                               $this_bytes, $this_length,
+                                               $disallow_flag|$UTF8_CHECK_ONLY);
                                  unless (is($ret_ref->[0], 0,
                                          "$this_name, CHECK_ONLY: Returns 0"))
                                  {
@@ -2024,8 +2144,9 @@ foreach my $test (@tests) {
  
                              # Now repeat some of the above, but for
                              # uvchr_to_utf8_flags().  Since this comes from an
-                            # existing code point, it hasn't overflowed.
-                            next if $will_overflow;
+                            # existing code point, it hasn't overflowed, and
+                            # isn't malformed.
+                            next if @malformations;
  
                              # The warning and disallow flags passed in are for
                              # utf8n_to_uvchr().  Convert them for
@@ -2155,6 +2276,8 @@ foreach my $test (@tests) {
                                                          if scalar @warnings;
                                  }
                              }
+                        }
+                    }
                  }
              }
          }
diff --git a/pod/perldelta.pod b/pod/perldelta.pod

index dbba9ef..62eaa0b 100644 (file)
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -328,6 +328,13 @@ The C<PADOFFSET> type has changed from being unsigned to signed, and
  several pad-related variables such as C<PL_padix> have changed from being
  of type C<I32> to type C<PADOFFSET>.
  
+=item *
+
+The function C<L<perlapi/utf8n_to_uvchr>> has been changed to not
+abandon searching for other malformations when the first one is
+encountered.  A call to it thus can generate multiple diagnostics,
+instead of just one.
+
  =back
  
  =head1 Selected Bug Fixes
diff --git a/pod/perldiag.pod b/pod/perldiag.pod

index d9f807c..6b42a00 100644 (file)
--- a/pod/perldiag.pod
+++ b/pod/perldiag.pod
@@ -3344,7 +3344,7 @@ Perhaps the function's author was trying to write a subroutine signature
  but didn't enable that feature first (C<use feature 'signatures'>),
  so the signature was instead interpreted as a bad prototype.
  
-=item Malformed UTF-8 character (%s)
+=item Malformed UTF-8 character%s
  
  (S utf8)(F) Perl detected a string that should be UTF-8, but didn't
  comply with UTF-8 encoding rules, or represents a code point whose
diff --git a/t/op/utf8decode.t b/t/op/utf8decode.t

index 40ec540..8de9154 100644 (file)
--- a/t/op/utf8decode.t
+++ b/t/op/utf8decode.t
@@ -123,28 +123,28 @@ __DATA__
  3.1.8 N7 -             7       80:bf:80:bf:80:bf:80    -       unexpected continuation byte 0x80
  3.1.9 N64 -    64      80:81:82:83:84:85:86:87:88:89:8a:8b:8c:8d:8e:8f:90:91:92:93:94:95:96:97:98:99:9a:9b:9c:9d:9e:9f:a0:a1:a2:a3:a4:a5:a6:a7:a8:a9:aa:ab:ac:ad:ae:af:b0:b1:b2:b3:b4:b5:b6:b7:b8:b9:ba:bb:bc:bd:be:bf -       unexpected continuation byte 0x80
  3.2    Lonely start characters
-3.2.1 N32 -    64      c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:20:df:20 -       unexpected non-continuation byte 0x20, immediately after start byte 0xc0
+3.2.1 N34 -    64      c0:20:c1:20:c2:20:c3:20:c4:20:c5:20:c6:20:c7:20:c8:20:c9:20:ca:20:cb:20:cc:20:cd:20:ce:20:cf:20:d0:20:d1:20:d2:20:d3:20:d4:20:d5:20:d6:20:d7:20:d8:20:d9:20:da:20:db:20:dc:20:dd:20:de:20:df:20 -       unexpected non-continuation byte 0x20, immediately after start byte 0xc0
  3.2.2 N16 -    32      e0:20:e1:20:e2:20:e3:20:e4:20:e5:20:e6:20:e7:20:e8:20:e9:20:ea:20:eb:20:ec:20:ed:20:ee:20:ef:20 -       unexpected non-continuation byte 0x20, immediately after start byte 0xe0
  3.2.3 N8 -     16      f0:20:f1:20:f2:20:f3:20:f4:20:f5:20:f6:20:f7:20 -       unexpected non-continuation byte 0x20, immediately after start byte 0xf0
  3.2.4 N4 -     8       f8:20:f9:20:fa:20:fb:20 -       unexpected non-continuation byte 0x20, immediately after start byte 0xf8
  3.2.5 N2 -     4       fc:20:fd:20     -       unexpected non-continuation byte 0x20, immediately after start byte 0xfc
  3.3    Sequences with last continuation byte missing
-3.3.1 n -      1       c0      -       1 byte, need 2
-3.3.2 n -      2       e0:80   -       2 bytes, need 3
-3.3.3 n -      3       f0:80:80        -       3 bytes, need 4
-3.3.4 n -      4       f8:80:80:80     -       4 bytes, need 5
-3.3.5 n -      5       fc:80:80:80:80  -       5 bytes, need 6
+3.3.1 N2 -     1       c0      -       1 byte, need 2
+3.3.2 N2 -     2       e0:80   -       2 bytes, need 3
+3.3.3 N2 -     3       f0:80:80        -       3 bytes, need 4
+3.3.4 N2 -     4       f8:80:80:80     -       4 bytes, need 5
+3.3.5 N2 -     5       fc:80:80:80:80  -       5 bytes, need 6
  3.3.6 n -      1       df      -       1 byte, need 2
  3.3.7 n -      2       ef:bf   -       2 bytes, need 3
  3.3.8 n -      3       f7:bf:bf        -       3 bytes, need 4
  3.3.9 n -      4       fb:bf:bf:bf     -       4 bytes, need 5
  3.3.10 n -     5       fd:bf:bf:bf:bf  -       5 bytes, need 6
  3.4    Concatenation of incomplete sequences
-3.4.1 N10 -    30      c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf       -       unexpected non-continuation byte 0xe0, immediately after start byte 0xc0
+3.4.1 N15 -    30      c0:e0:80:f0:80:80:f8:80:80:80:fc:80:80:80:80:df:ef:bf:f7:bf:bf:fb:bf:bf:bf:fd:bf:bf:bf:bf       -       unexpected non-continuation byte 0xe0, immediately after start byte 0xc0
  3.5    Impossible bytes (but not with Perl's extended UTF-8)
  3.5.1 n -      1       fe      -       1 byte, need 7
  3.5.2 n -      1       ff      -       1 byte, need 13
-3.5.3 N4 -     4       fe:fe:ff:ff     -       byte 0xfe
+3.5.3 N5 -     4       fe:fe:ff:ff     -       byte 0xfe
  4      Overlong sequences
  4.1    Examples of an overlong ASCII character
  4.1.1 n -      2       c0:af   -       overlong
diff --git a/utf8.c b/utf8.c

index 49a9204..a307fdd 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -782,10 +782,13 @@ C<curlen> bytes; C<*retlen> (if C<retlen> isn't NULL) will be set to
  the length, in bytes, of that character.
  
  The value of C<flags> determines the behavior when C<s> does not point to a
-well-formed UTF-8 character.  If C<flags> is 0, when a malformation is found,
-zero is returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>) is the
-next possible position in C<s> that could begin a non-malformed character.
-Also, if UTF-8 warnings haven't been lexically disabled, a warning is raised.
+well-formed UTF-8 character.  If C<flags> is 0, encountering a malformation
+causes zero to be returned and C<*retlen> is set so that (S<C<s> + C<*retlen>>)
+is the next possible position in C<s> that could begin a non-malformed
+character.  Also, if UTF-8 warnings haven't been lexically disabled, a warning
+is raised.  Some UTF-8 input sequences may contain multiple malformations.
+This function tries to find every possible one in each call, so multiple
+warnings can be raised for each sequence.
  
  Various ALLOW flags can be set in C<flags> to allow (and not warn on)
  individual types of malformations, such as the sequence being overlong (that
@@ -877,17 +880,21 @@ UV
  Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
  {
      const U8 * const s0 = s;
-    U8 * send;
+    U8 * send = NULL;           /* (initialized to silence compilers' wrong
+                                   warning) */
+    U32 possible_problems = 0;  /* A bit is set here for each potential problem
+                                   found as we go along */
      UV uv = *s;
-    STRLEN expectlen;
-    SV* sv = NULL;
-    UV outlier_ret = 0;        /* return value when input is in error or problematic
-                        */
-    UV pack_warn = 0;  /* Save result of packWARN() for later */
-    bool unexpected_non_continuation = FALSE;
-    bool overflowed = FALSE;
-    bool do_overlong_test = TRUE;   /* May have to skip this test */
+    STRLEN expectlen   = 0;     /* How long should this sequence be?
+                                   (initialized to silence compilers' wrong
+                                   warning) */
  
+    /* The below are used only if there is both an overlong malformation and a
+     * too short one.  Otherwise the first two are set to 's0' and 'send', and
+     * the third not used at all */
+    U8 * adjusted_s0 = (U8 *) s0;
+    U8 * adjusted_send;
+    UV uv_so_far = 0;   /* (Initialized to silence compilers' wrong warning) */
  
      PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
  
@@ -909,22 +916,21 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
       * returning to the caller C<*retlen> pointing to the very next byte (one
       * which is actually part of of the overflowing sequence), that could look
       * legitimate to the caller, which could discard the initial partial
-     * sequence and process the rest, inappropriately */
+     * sequence and process the rest, inappropriately.
+     *
+     * Some possible input sequences are malformed in more than one way.  This
+     * function goes to lengths to try to find all of them.  This is necessary
+     * for correctness, as the inputs may allow one malformation but not
+     * another, and if we abandon searching for others after finding the
+     * allowed one, we could allow in something that shouldn't have been.
+     */
  
-    /* Zero length strings, if allowed, of necessity are zero */
      if (UNLIKELY(curlen == 0)) {
-       if (retlen) {
-           *retlen = 0;
-       }
-
-       if (flags & UTF8_ALLOW_EMPTY) {
-           return 0;
-       }
-       if (! (flags & UTF8_CHECK_ONLY)) {
-           sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s (empty string)",
-                                                malformed_text));
-       }
-       goto malformed;
+        possible_problems |= UTF8_GOT_EMPTY;
+        curlen = 0;
+        uv = 0; /* XXX It could be argued that this should be
+                   UNICODE_REPLACEMENT? */
+       goto ready_to_handle_errors;
      }
  
      expectlen = UTF8SKIP(s);
@@ -944,22 +950,10 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
  
      /* A continuation character can't start a valid sequence */
      if (UNLIKELY(UTF8_IS_CONTINUATION(uv))) {
-       if (flags & UTF8_ALLOW_CONTINUATION) {
-           if (retlen) {
-               *retlen = 1;
-           }
-           return UNICODE_REPLACEMENT;
-       }
-
-       if (! (flags & UTF8_CHECK_ONLY)) {
-           sv = sv_2mortal(Perl_newSVpvf(aTHX_
-                            "%s: %s (unexpected continuation byte 0x%02x,"
-                            " with no preceding start byte)",
-                            malformed_text,
-                            _byte_dump_string(s0, 1), *s0));
-       }
-       curlen = 1;
-       goto malformed;
+       possible_problems |= UTF8_GOT_CONTINUATION;
+        curlen = 1;
+        uv = UNICODE_REPLACEMENT;
+       goto ready_to_handle_errors;
      }
  
      /* Here is not a continuation byte, nor an invariant.  The only thing left
@@ -973,124 +967,103 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
      /* Now, loop through the remaining bytes in the character's sequence,
       * accumulating each into the working value as we go.  Be sure to not look
       * past the end of the input string */
-    send =  (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
-
+    send = adjusted_send
+                    = (U8*) s0 + ((expectlen <= curlen) ? expectlen : curlen);
      for (s = s0 + 1; s < send; s++) {
         if (LIKELY(UTF8_IS_CONTINUATION(*s))) {
-           if (uv & UTF_ACCUMULATION_OVERFLOW_MASK) {
-
-               /* The original implementors viewed this malformation as more
-                * serious than the others (though I, khw, don't understand
-                * why, since other malformations also give very very wrong
-                * results), so there is no way to turn off checking for it.
-                * Set a flag, but keep going in the loop, so that we absorb
-                * the rest of the bytes that comprise the character. */
-               overflowed = TRUE;
-           }
             uv = UTF8_ACCUMULATE(uv, *s);
-       }
-       else {
-           /* Here, found a non-continuation before processing all expected
-            * bytes.  This byte begins a new character, so quit, even if
-            * allowing this malformation. */
-           unexpected_non_continuation = TRUE;
-           break;
-       }
+            continue;
+        }
+
+        /* Here, found a non-continuation before processing all expected bytes.
+         * This byte indicates the beginning of a new character, so quit, even
+         * if allowing this malformation. */
+        curlen = s - s0;    /* Save how many bytes we actually got */
+        possible_problems |= UTF8_GOT_NON_CONTINUATION;
+        goto finish_short;
      } /* End of loop through the character's bytes */
  
      /* Save how many bytes were actually in the character */
      curlen = s - s0;
  
-    /* The loop above finds two types of malformations: non-continuation and/or
-     * overflow.  The non-continuation malformation is really a too-short
-     * malformation, as it means that the current character ended before it was
-     * expected to (being terminated prematurely by the beginning of the next
-     * character, whereas in the too-short malformation there just are too few
-     * bytes available to hold the character.  In both cases, the check below
-     * that we have found the expected number of bytes would fail if executed.)
-     * Thus the non-continuation malformation is really unnecessary, being a
-     * subset of the too-short malformation.  But there may be existing
-     * applications that are expecting the non-continuation type, so we retain
-     * it, and return it in preference to the too-short malformation.  (If this
-     * code were being written from scratch, the two types might be collapsed
-     * into one.)  I, khw, am also giving priority to returning the
-     * non-continuation and too-short malformations over overflow when multiple
-     * ones are present.  I don't know of any real reason to prefer one over
-     * the other, except that it seems to me that multiple-byte errors trumps
-     * errors from a single byte */
-    if (UNLIKELY(unexpected_non_continuation)) {
-       if (!(flags & UTF8_ALLOW_NON_CONTINUATION)) {
-           if (! (flags & UTF8_CHECK_ONLY)) {
-                sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s",
-                                unexpected_non_continuation_text(s0,
-                                                             send - s0,
-                                                             s - s0,
-                                                             (int) expectlen)));
-           }
-           goto malformed;
-       }
-       uv = UNICODE_REPLACEMENT;
-
-       /* Skip testing for overlongs, as the REPLACEMENT may not be the same
-        * as what the original expectations were. */
-       do_overlong_test = FALSE;
-       if (retlen) {
-           *retlen = curlen;
-       }
-    }
-    else if (UNLIKELY(curlen < expectlen)) {
-       if (! (flags & UTF8_ALLOW_SHORT)) {
-           if (! (flags & UTF8_CHECK_ONLY)) {
-               sv = sv_2mortal(Perl_newSVpvf(aTHX_
-                                "%s: %s (too short; got %d byte%s, need %d)",
-                                malformed_text,
-                                _byte_dump_string(s0, send - s0),
-                                (int)curlen,
-                                curlen == 1 ? "" : "s",
-                                (int)expectlen));
-           }
-           goto malformed;
-       }
-       uv = UNICODE_REPLACEMENT;
-       do_overlong_test = FALSE;
-       if (retlen) {
-           *retlen = curlen;
-       }
-    }
-
-    if (UNLIKELY(overflowed)) {
-        sv = sv_2mortal(Perl_newSVpvf(aTHX_ "%s: %s (overflows)",
-                                            malformed_text,
-                                            _byte_dump_string(s0, send - s0)));
-       goto malformed;
-    }
-
-    if (do_overlong_test
-       && expectlen > (STRLEN) OFFUNISKIP(uv)
-       && ! (flags & UTF8_ALLOW_LONG))
+    /* Did we get all the continuation bytes that were expected?  Note that we
+     * know this result even without executing the loop above.  But we had to
+     * do the loop to see if there are unexpected non-continuations. */
+    if (UNLIKELY(curlen < expectlen)) {
+       possible_problems |= UTF8_GOT_SHORT;
+
+      finish_short:
+        uv_so_far = uv;
+        uv = UNICODE_REPLACEMENT;
+    }
+
+    /* Note that there are two types of too-short malformation.  One is when
+     * there is actual wrong data before the normal termination of the
+     * sequence.  The other is that the sequence wasn't complete before the end
+     * of the data we are allowed to look at, based on the input 'curlen'.
+     * This means that we were passed data for a partial character, but it is
+     * valid as far as we saw.  The other is definitely invalid.  This
+     * distinction could be important to a caller, so the two types are kept
+     * separate. */
+
+    /* Check for overflow */
+    if (UNLIKELY(does_utf8_overflow(s0, send))) {
+        possible_problems |= UTF8_GOT_OVERFLOW;
+        uv = UNICODE_REPLACEMENT;
+    }
+
+    /* Check for overlong.  If no problems so far, 'uv' is the correct code
+     * point value.  Simply see if it is expressible in fewer bytes.  Otherwise
+     * we must look at the UTF-8 byte sequence itself to see if it is for an
+     * overlong */
+    if (     (   LIKELY(! possible_problems)
+              && UNLIKELY(expectlen > (STRLEN) OFFUNISKIP(uv)))
+        || (   UNLIKELY(  possible_problems)
+            && (   UNLIKELY(! UTF8_IS_START(*s0))
+                || (   curlen > 1
+                    && UNLIKELY(is_utf8_overlong_given_start_byte_ok(s0,
+                                                                send - s0))))))
      {
-       /* The overlong malformation has lower precedence than the others.
-        * Note that if this malformation is allowed, we return the actual
-        * value, instead of the replacement character.  This is because this
-        * value is actually well-defined. */
-       if (! (flags & UTF8_CHECK_ONLY)) {
-            U8 tmpbuf[UTF8_MAXBYTES+1];
-            const U8 * const e = uvchr_to_utf8(tmpbuf, uv);
-            sv = sv_2mortal(Perl_newSVpvf(aTHX_
-                  "%s: %s (overlong; instead use %s to represent U+%0*"UVXf")",
-                  malformed_text,
-                  _byte_dump_string(s0, send - s0),
-                  _byte_dump_string(tmpbuf, e - tmpbuf),
-                  ((uv < 256) ? 2 : 4), /* Field width of 2 for small code
-                                           points */
-                  uv));
-       }
-       goto malformed;
+        possible_problems |= UTF8_GOT_LONG;
+
+        /* A convenience macro that matches either of the too-short conditions.
+         * */
+#       define UTF8_GOT_TOO_SHORT (UTF8_GOT_SHORT|UTF8_GOT_NON_CONTINUATION)
+
+        if (UNLIKELY(possible_problems & UTF8_GOT_TOO_SHORT)) {
+            UV min_uv = uv_so_far;
+            STRLEN i;
+
+            /* Here, the input is both overlong and is missing some trailing
+             * bytes.  There is no single code point it could be for, but there
+             * may be enough information present to determine if what we have
+             * so far is for an unallowed code point, such as for a surrogate.
+             * The code below has the intelligence to determine this, but just
+             * for non-overlong UTF-8 sequences.  What we do here is calculate
+             * the smallest code point the input could represent if there were
+             * no too short malformation.  Then we compute and save the UTF-8
+             * for that, which is what the code below looks at instead of the
+             * raw input.  It turns out that the smallest such code point is
+             * all we need. */
+            for (i = curlen; i < expectlen; i++) {
+                min_uv = UTF8_ACCUMULATE(min_uv,
+                                     I8_TO_NATIVE_UTF8(UTF_CONTINUATION_MARK));
+            }
+
+            Newx(adjusted_s0, OFFUNISKIP(min_uv) + 1, U8);
+            SAVEFREEPV((U8 *) adjusted_s0);    /* Needed because we may not get
+                                                  to free it ourselves if
+                                                  warnings are made fatal */
+            adjusted_send = uvoffuni_to_utf8_flags(adjusted_s0, min_uv, 0);
+        }
      }
  
-    /* Here, the input is considered to be well-formed, but it still could be a
-     * problematic code point that is not allowed by the input parameters. */
-    if (uv >= UNICODE_SURROGATE_FIRST /* isn't problematic if < this */
+    /* Now check that the input isn't for a problematic code point not allowed
+     * by the input parameters. */
+                                              /* isn't problematic if < this */
+    if (   (   (   LIKELY(! possible_problems) && uv >= UNICODE_SURROGATE_FIRST)
+            || (   UNLIKELY(possible_problems)
+                && isUTF8_POSSIBLY_PROBLEMATIC(*adjusted_s0)))
         && ((flags & ( UTF8_DISALLOW_NONCHAR
                        |UTF8_DISALLOW_SURROGATE
                        |UTF8_DISALLOW_SUPER
@@ -1099,151 +1072,386 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
                        |UTF8_WARN_SURROGATE
                        |UTF8_WARN_SUPER
                        |UTF8_WARN_ABOVE_31_BIT))
+                   /* In case of a malformation, 'uv' is not valid, and has
+                    * been changed to something in the Unicode range.
+                    * Currently we don't output a deprecation message if there
+                    * is already a malformation, so we don't have to special
+                    * case the test immediately below */
              || (   UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
                  && ckWARN_d(WARN_DEPRECATED))))
      {
-       if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
-
-            /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
-             * generation of the sv, since no warnings are raised under CHECK */
-           if ((flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY)) == UTF8_WARN_SURROGATE
-               && ckWARN_d(WARN_SURROGATE))
-           {
-               sv = sv_2mortal(Perl_newSVpvf(aTHX_ "UTF-16 surrogate U+%04"UVXf"", uv));
-               pack_warn = packWARN(WARN_SURROGATE);
-           }
-           if (flags & UTF8_DISALLOW_SURROGATE) {
-               goto disallowed;
-           }
-       }
-       else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
-           if ((flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY)) == UTF8_WARN_SUPER
-                && ckWARN_d(WARN_NON_UNICODE))
-           {
-               sv = sv_2mortal(Perl_newSVpvf(aTHX_
-                   "Code point 0x%04"UVXf" is not Unicode, may not be portable",
-                   uv));
-               pack_warn = packWARN(WARN_NON_UNICODE);
-           }
+        /* If there were no malformations, or the only malformation is an
+         * overlong, 'uv' is valid */
+        if (LIKELY(! (possible_problems & ~UTF8_GOT_LONG))) {
+            if (UNLIKELY(UNICODE_IS_SURROGATE(uv))) {
+                possible_problems |= UTF8_GOT_SURROGATE;
+            }
+            else if (UNLIKELY(uv > PERL_UNICODE_MAX)) {
+                possible_problems |= UTF8_GOT_SUPER;
+            }
+            else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
+                possible_problems |= UTF8_GOT_NONCHAR;
+            }
+        }
+        else {  /* Otherwise, need to look at the source UTF-8, possibly
+                   adjusted to be non-overlong */
  
-            /* The maximum code point ever specified by a standard was
-             * 2**31 - 1.  Anything larger than that is a Perl extension that
-             * very well may not be understood by other applications (including
-             * earlier perl versions on EBCDIC platforms).  We test for these
-             * after the regular SUPER ones, and before possibly bailing out,
-             * so that the slightly more dire warning will override the regular
-             * one. */
-            if (   (flags & (UTF8_WARN_ABOVE_31_BIT
-                            |UTF8_WARN_SUPER
-                            |UTF8_DISALLOW_ABOVE_31_BIT))
-                && UNLIKELY(is_utf8_cp_above_31_bits(s0, send)))
+            if (UNLIKELY(NATIVE_UTF8_TO_I8(*adjusted_s0)
+                                >= FIRST_START_BYTE_THAT_IS_DEFINITELY_SUPER))
              {
-                if (  ! (flags & UTF8_CHECK_ONLY)
-                    &&  (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER))
-                    &&  ckWARN_d(WARN_UTF8))
+                possible_problems |= UTF8_GOT_SUPER;
+            }
+            else if (curlen > 1) {
+                if (UNLIKELY(IS_UTF8_2_BYTE_SUPER(
+                                      NATIVE_UTF8_TO_I8(*adjusted_s0),
+                                      NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
                  {
-                    sv = sv_2mortal(Perl_newSVpvf(aTHX_
-                        "Code point 0x%"UVXf" is not Unicode, and not portable",
-                        uv));
-                    pack_warn = packWARN(WARN_UTF8);
+                    possible_problems |= UTF8_GOT_SUPER;
                  }
-                if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
-                    goto disallowed;
+                else if (UNLIKELY(IS_UTF8_2_BYTE_SURROGATE(
+                                      NATIVE_UTF8_TO_I8(*adjusted_s0),
+                                      NATIVE_UTF8_TO_I8(*(adjusted_s0 + 1)))))
+                {
+                    possible_problems |= UTF8_GOT_SURROGATE;
                  }
              }
  
-           if (flags & UTF8_DISALLOW_SUPER) {
-               goto disallowed;
-           }
+            /* We need a complete well-formed UTF-8 character to discern
+             * non-characters, so can't look for them here */
+        }
+    }
  
-            /* The deprecated warning overrides any non-deprecated one */
-            if (UNLIKELY(uv > MAX_NON_DEPRECATED_CP) && ckWARN_d(WARN_DEPRECATED))
-            {
-                sv = sv_2mortal(Perl_newSVpvf(aTHX_ cp_above_legal_max,
-                                              uv, MAX_NON_DEPRECATED_CP));
-                pack_warn = packWARN(WARN_DEPRECATED);
-            }
-       }
-       else if (UNLIKELY(UNICODE_IS_NONCHAR(uv))) {
-           if ((flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY)) == UTF8_WARN_NONCHAR
-               && ckWARN_d(WARN_NONCHAR))
-           {
-               sv = sv_2mortal(Perl_newSVpvf(aTHX_ "Unicode non-character U+%04"UVXf" is not recommended for open interchange", uv));
-               pack_warn = packWARN(WARN_NONCHAR);
-           }
-           if (flags & UTF8_DISALLOW_NONCHAR) {
-               goto disallowed;
-           }
-       }
+  ready_to_handle_errors:
+
+    /* At this point:
+     * curlen               contains the number of bytes in the sequence that
+     *                      this call should advance the input by.
+     * possible_problems'   is 0 if there weren't any problems; otherwise a bit
+     *                      is set in it for each potential problem found.
+     * uv                   contains the code point the input sequence
+     *                      represents; or if there is a problem that prevents
+     *                      a well-defined value from being computed, it is
+     *                      some subsitute value, typically the REPLACEMENT
+     *                      CHARACTER.
+     * s0                   points to the first byte of the character
+     * send                 points to just after where that (potentially
+     *                      partial) character ends
+     * adjusted_s0          normally is the same as s0, but in case of an
+     *                      overlong for which the UTF-8 matters below, it is
+     *                      the first byte of the shortest form representation
+     *                      of the input.
+     * adjusted_send        normally is the same as 'send', but if adjusted_s0
+     *                      is set to something other than s0, this points one
+     *                      beyond its end
+     */
  
-       if (sv) {
-            outlier_ret = UNI_TO_NATIVE(uv);
-           goto do_warn;
-       }
+    if (UNLIKELY(possible_problems)) {
+        bool disallowed = FALSE;
+        const U32 orig_problems = possible_problems;
+
+        while (possible_problems) { /* Handle each possible problem */
+            UV pack_warn = 0;
+            char * message = NULL;
+
+            /* Each 'if' clause handles one problem.  They are ordered so that
+             * the first ones' messages will be displayed before the later
+             * ones; this is kinda in decreasing severity order */
+            if (possible_problems & UTF8_GOT_OVERFLOW) {
+
+                /* Overflow means also got a super and above 31 bits, but we
+                 * handle all three cases here */
+                possible_problems
+                  &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_ABOVE_31_BIT);
+
+                disallowed = TRUE;
+
+                /* The warnings code explicitly says it doesn't handle the case
+                 * of packWARN2 and two categories which have parent-child
+                 * relationship.  Even if it works now to raise the warning if
+                 * either is enabled, it wouldn't necessarily do so in the
+                 * future.  We output (only) the most dire warning*/
+                if (! (flags & UTF8_CHECK_ONLY)) {
+                    if (ckWARN_d(WARN_UTF8)) {
+                        pack_warn = packWARN(WARN_UTF8);
+                    }
+                    else if (ckWARN_d(WARN_NON_UNICODE)) {
+                        pack_warn = packWARN(WARN_NON_UNICODE);
+                    }
+                    if (pack_warn) {
+                        message = Perl_form(aTHX_ "%s: %s (overflows)",
+                                        malformed_text,
+                                        _byte_dump_string(s0, send - s0));
+                    }
+                }
+            }
+            else if (possible_problems & UTF8_GOT_EMPTY) {
+                possible_problems &= ~UTF8_GOT_EMPTY;
+
+                if (! (flags & UTF8_ALLOW_EMPTY)) {
+                    disallowed = TRUE;
+                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                        pack_warn = packWARN(WARN_UTF8);
+                        message = Perl_form(aTHX_ "%s (empty string)",
+                                                   malformed_text);
+                    }
+                }
+            }
+            else if (possible_problems & UTF8_GOT_CONTINUATION) {
+                possible_problems &= ~UTF8_GOT_CONTINUATION;
+
+                if (! (flags & UTF8_ALLOW_CONTINUATION)) {
+                    disallowed = TRUE;
+                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                        pack_warn = packWARN(WARN_UTF8);
+                        message = Perl_form(aTHX_
+                                "%s: %s (unexpected continuation byte 0x%02x,"
+                                " with no preceding start byte)",
+                                malformed_text,
+                                _byte_dump_string(s0, 1), *s0);
+                    }
+                }
+            }
+            else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
+                possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
+
+                if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
+                    disallowed = TRUE;
+                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                        pack_warn = packWARN(WARN_UTF8);
+                        message = Perl_form(aTHX_ "%s",
+                            unexpected_non_continuation_text(s0,
+                                                            send - s0,
+                                                            s - s0,
+                                                            (int) expectlen));
+                    }
+                }
+            }
+            else if (possible_problems & UTF8_GOT_SHORT) {
+                possible_problems &= ~UTF8_GOT_SHORT;
+
+                if (! (flags & UTF8_ALLOW_SHORT)) {
+                    disallowed = TRUE;
+                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                        pack_warn = packWARN(WARN_UTF8);
+                        message = Perl_form(aTHX_
+                                "%s: %s (too short; got %d byte%s, need %d)",
+                                malformed_text,
+                                _byte_dump_string(s0, send - s0),
+                                (int)curlen,
+                                curlen == 1 ? "" : "s",
+                                (int)expectlen);
+                    }
+                }
  
-       /* Here, this is not considered a malformed character, so drop through
-        * to return it */
-    }
+            }
+            else if (possible_problems & UTF8_GOT_LONG) {
+                possible_problems &= ~UTF8_GOT_LONG;
+
+                if (! (flags & UTF8_ALLOW_LONG)) {
+                    disallowed = TRUE;
+
+                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                        pack_warn = packWARN(WARN_UTF8);
+
+                        /* These error types cause 'uv' to be something that
+                         * isn't what was intended, so can't use it in the
+                         * message.  The other error types either can't
+                         * generate an overlong, or else the 'uv' is valid */
+                        if (orig_problems &
+                                        (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
+                        {
+                            message = Perl_form(aTHX_
+                                    "%s: %s (any UTF-8 sequence that starts"
+                                    " with \"%s\" is overlong which can and"
+                                    " should be represented with a"
+                                    " different, shorter sequence)",
+                                    malformed_text,
+                                    _byte_dump_string(s0, send - s0),
+                                    _byte_dump_string(s0, curlen));
+                        }
+                        else {
+                            U8 tmpbuf[UTF8_MAXBYTES+1];
+                            const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
+                                                                        uv, 0);
+                            message = Perl_form(aTHX_
+                                "%s: %s (overlong; instead use %s to represent"
+                                " U+%0*"UVXf")",
+                                malformed_text,
+                                _byte_dump_string(s0, send - s0),
+                                _byte_dump_string(tmpbuf, e - tmpbuf),
+                                ((uv < 256) ? 2 : 4), /* Field width of 2 for
+                                                         small code points */
+                                uv);
+                        }
+                    }
+                }
+            }
+            else if (possible_problems & UTF8_GOT_SURROGATE) {
+                possible_problems &= ~UTF8_GOT_SURROGATE;
+
+                /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
+                 * generation of the format, since no warnings are raised under
+                 * CHECK */
+                if (   (flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY))
+                                                        == UTF8_WARN_SURROGATE
+                    && ckWARN_d(WARN_SURROGATE))
+                {
+                        pack_warn = packWARN(WARN_SURROGATE);
+
+                        /* These are the only errors that can occur with a
+                        * surrogate when the 'uv' isn't valid */
+                        if (orig_problems & UTF8_GOT_TOO_SHORT) {
+                            message = Perl_form(aTHX_
+                                    "UTF-16 surrogate (any UTF-8 sequence that"
+                                    " starts with \"%s\" is for a surrogate)",
+                                    _byte_dump_string(s0, curlen));
+                        }
+                        else {
+                            message = Perl_form(aTHX_
+                                            "UTF-16 surrogate U+%04"UVXf"", uv);
+                        }
+                }
  
-    return UNI_TO_NATIVE(uv);
+                if (flags & UTF8_DISALLOW_SURROGATE) {
+                    disallowed = TRUE;
+                }
+            }
+            else if (possible_problems & UTF8_GOT_SUPER) {
+                possible_problems &= ~UTF8_GOT_SUPER;
  
-    /* There are three cases which get to beyond this point.  In all 3 cases:
-     * <sv>        if not null points to a string to print as a warning.
-     * <curlen>            is what <*retlen> should be set to if UTF8_CHECK_ONLY isn't
-     *             set.
-     * <outlier_ret> is what return value to use if UTF8_CHECK_ONLY isn't set.
-     *             This is done by initializing it to 0, and changing it only
-     *             for case 1).
-     * The 3 cases are:
-     * 1)   The input is valid but problematic, and to be warned about.  The
-     *     return value is the resultant code point; <*retlen> is set to
-     *     <curlen>, the number of bytes that comprise the code point.
-     *     <pack_warn> contains the result of packWARN() for the warning
-     *     types.  The entry point for this case is the label <do_warn>;
-     * 2)   The input is a valid code point but disallowed by the parameters to
-     *     this function.  The return value is 0.  If UTF8_CHECK_ONLY is set,
-     *     <*relen> is -1; otherwise it is <curlen>, the number of bytes that
-     *     comprise the code point.  <pack_warn> contains the result of
-     *     packWARN() for the warning types.  The entry point for this case is
-     *     the label <disallowed>.
-     * 3)   The input is malformed.  The return value is 0.  If UTF8_CHECK_ONLY
-     *     is set, <*relen> is -1; otherwise it is <curlen>, the number of
-     *     bytes that comprise the malformation.  All such malformations are
-     *     assumed to be warning type <utf8>.  The entry point for this case
-     *     is the label <malformed>.
-     */
+                if (   (flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY))
+                                                            == UTF8_WARN_SUPER
+                    && ckWARN_d(WARN_NON_UNICODE))
+                {
+                        pack_warn = packWARN(WARN_NON_UNICODE);
+
+                        if (orig_problems & UTF8_GOT_TOO_SHORT) {
+                            message = Perl_form(aTHX_
+                                    "Any UTF-8 sequence that starts with"
+                                    " \"%s\" is for a non-Unicode code point,"
+                                    " may not be portable",
+                                    _byte_dump_string(s0, curlen));
+                        }
+                        else {
+                            message = Perl_form(aTHX_
+                                                "Code point 0x%04"UVXf" is not"
+                                                " Unicode, may not be portable",
+                                                uv);
+                        }
+                }
  
-  malformed:
+                /* The maximum code point ever specified by a standard was
+                 * 2**31 - 1.  Anything larger than that is a Perl extension
+                 * that very well may not be understood by other applications
+                 * (including earlier perl versions on EBCDIC platforms).  We
+                 * test for these after the regular SUPER ones, and before
+                 * possibly bailing out, so that the slightly more dire warning
+                 * will override the regular one. */
+                if (   (flags & (UTF8_WARN_ABOVE_31_BIT
+                                |UTF8_WARN_SUPER
+                                |UTF8_DISALLOW_ABOVE_31_BIT))
+                    && (   (   UNLIKELY(orig_problems & UTF8_GOT_TOO_SHORT)
+                            && UNLIKELY(is_utf8_cp_above_31_bits(
+                                                                adjusted_s0,
+                                                                adjusted_send)))
+                        || (   LIKELY(! (orig_problems & UTF8_GOT_TOO_SHORT))
+                            && UNLIKELY(UNICODE_IS_ABOVE_31_BIT(uv)))))
+                {
+                    if (  ! (flags & UTF8_CHECK_ONLY)
+                        &&  (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_WARN_SUPER))
+                        &&  ckWARN_d(WARN_UTF8))
+                    {
+                        pack_warn = packWARN(WARN_UTF8);
+
+                        if (orig_problems & UTF8_GOT_TOO_SHORT) {
+                            message = Perl_form(aTHX_
+                                        "Any UTF-8 sequence that starts with"
+                                        " \"%s\" is for a non-Unicode code"
+                                        " point, and is not portable",
+                                        _byte_dump_string(s0, curlen));
+                        }
+                        else {
+                            message = Perl_form(aTHX_
+                                        "Code point 0x%"UVXf" is not Unicode,"
+                                        " and not portable",
+                                         uv);
+                        }
+                    }
  
-    if (sv && ckWARN_d(WARN_UTF8)) {
-       pack_warn = packWARN(WARN_UTF8);
-    }
+                    if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
+                        disallowed = TRUE;
+                    }
+                }
  
-  disallowed:
+                if (flags & UTF8_DISALLOW_SUPER) {
+                    disallowed = TRUE;
+                }
  
-    if (flags & UTF8_CHECK_ONLY) {
-       if (retlen)
-           *retlen = ((STRLEN) -1);
-       return 0;
-    }
+                /* The deprecated warning overrides any non-deprecated one.  If
+                 * there are other problems, a deprecation message is not
+                 * really helpful, so don't bother to raise it in that case.
+                 * This also keeps the code from having to handle the case
+                 * where 'uv' is not valid. */
+                if (   ! (orig_problems
+                                    & (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
+                    && UNLIKELY(uv > MAX_NON_DEPRECATED_CP)
+                    && ckWARN_d(WARN_DEPRECATED))
+                {
+                    message = Perl_form(aTHX_ cp_above_legal_max,
+                                              uv, MAX_NON_DEPRECATED_CP);
+                    pack_warn = packWARN(WARN_DEPRECATED);
+                }
+            }
+            else if (possible_problems & UTF8_GOT_NONCHAR) {
+                possible_problems &= ~UTF8_GOT_NONCHAR;
  
-  do_warn:
+                if (  (flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY))
+                                                        == UTF8_WARN_NONCHAR
+                    && ckWARN_d(WARN_NONCHAR))
+                {
+                        /* The code above should have guaranteed that we don't
+                         * get here with errors other than overlong */
+                        assert (! (orig_problems
+                                        & ~(UTF8_GOT_LONG|UTF8_GOT_NONCHAR)));
+
+                        pack_warn = packWARN(WARN_NONCHAR);
+                        message = Perl_form(aTHX_ "Unicode non-character"
+                                                " U+%04"UVXf" is not recommended"
+                                                " for open interchange", uv);
+                }
  
-    if (pack_warn) {   /* <pack_warn> was initialized to 0, and changed only
-                          if warnings are to be raised. */
-       const char * const string = SvPVX_const(sv);
+                if (flags & UTF8_DISALLOW_NONCHAR) {
+                    disallowed = TRUE;
+                }
+            } /* End of looking through the possible flags */
+
+            /* Display the message (if any) for the problem being handled in
+             * this iteration of the loop */
+            if (message) {
+                if (PL_op)
+                    Perl_warner(aTHX_ pack_warn, "%s in %s", message,
+                                                 OP_DESC(PL_op));
+                else
+                    Perl_warner(aTHX_ pack_warn, "%s", message);
+            }
+        }   /* End of 'while (possible_problems) {' */
  
-       if (PL_op)
-           Perl_warner(aTHX_ pack_warn, "%s in %s", string,  OP_DESC(PL_op));
-       else
-           Perl_warner(aTHX_ pack_warn, "%s", string);
-    }
+        /* Since there was a possible problem, the returned length may need to
+         * be changed from the one stored at the beginning of this function.
+         * Instead of trying to figure out if that's needed, just do it. */
+        if (retlen) {
+            *retlen = curlen;
+        }
  
-    if (retlen) {
-       *retlen = curlen;
+        if (disallowed) {
+            if (flags & UTF8_CHECK_ONLY && retlen) {
+                *retlen = ((STRLEN) -1);
+            }
+            return 0;
+        }
      }
  
-    return outlier_ret;
+    return UNI_TO_NATIVE(uv);
  }
  
  /*
@@ -1394,7 +1602,7 @@ Perl_bytes_cmp_utf8(pTHX_ const U8 *b, STRLEN blen, const U8 *u, STRLEN ulen)
                     if (UTF8_IS_CONTINUATION(c1)) {
                         c = EIGHT_BIT_UTF8_TO_NATIVE(c, c1);
                     } else {
-                        /* diag_listed_as: Malformed UTF-8 character (%s) */
+                        /* diag_listed_as: Malformed UTF-8 character%s */
                         Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
                                      "%s %s%s",
                                      unexpected_non_continuation_text(u - 1, 2, 1, 2),
diff --git a/utf8.h b/utf8.h

index 7cd163a..c55ce26 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -526,13 +526,6 @@ encoded as UTF-8.  C<cp> is a native (ASCII or EBCDIC) code point if less than
                                     | ((NATIVE_UTF8_TO_I8((U8)new))             \
                                         & UTF_CONTINUATION_MASK))
  
-/* If a value is anded with this, and the result is non-zero, then using the
- * original value in UTF8_ACCUMULATE will overflow, shifting bits off the left
- * */
-#define UTF_ACCUMULATION_OVERFLOW_MASK                                 \
-    (((UV) UTF_CONTINUATION_MASK) << ((sizeof(UV) * CHARBITS)           \
-           - UTF_ACCUMULATION_SHIFT))
-
  /* This works in the face of malformed UTF-8. */
  #define UTF8_IS_NEXT_CHAR_DOWNGRADEABLE(s, e) (UTF8_IS_DOWNGRADEABLE_START(*s) \
                                                 && ( (e) - (s) > 1)             \
@@ -718,26 +711,37 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
  
  
  #define UTF8_ALLOW_EMPTY               0x0001  /* Allow a zero length string */
+#define UTF8_GOT_EMPTY                  UTF8_ALLOW_EMPTY
  
  /* Allow first byte to be a continuation byte */
  #define UTF8_ALLOW_CONTINUATION                0x0002
+#define UTF8_GOT_CONTINUATION          UTF8_ALLOW_CONTINUATION
  
-/* Allow second... bytes to be non-continuation bytes */
+/* Unexpected continuation byte */
  #define UTF8_ALLOW_NON_CONTINUATION    0x0004
+#define UTF8_GOT_NON_CONTINUATION      UTF8_ALLOW_NON_CONTINUATION
  
  /* expecting more bytes than were available in the string */
  #define UTF8_ALLOW_SHORT               0x0008
+#define UTF8_GOT_SHORT                 UTF8_ALLOW_SHORT
  
  /* Overlong sequence; i.e., the code point can be specified in fewer bytes. */
  #define UTF8_ALLOW_LONG                 0x0010
+#define UTF8_GOT_LONG                   UTF8_ALLOW_LONG
+
+/* Currently no way to allow overflow */
+#define UTF8_GOT_OVERFLOW               0x0020
  
  #define UTF8_DISALLOW_SURROGATE                0x0040  /* Unicode surrogates */
+#define UTF8_GOT_SURROGATE             UTF8_DISALLOW_SURROGATE
  #define UTF8_WARN_SURROGATE            0x0080
  
  #define UTF8_DISALLOW_NONCHAR           0x0100 /* Unicode non-character */
+#define UTF8_GOT_NONCHAR                UTF8_DISALLOW_NONCHAR
  #define UTF8_WARN_NONCHAR               0x0200 /*  code points */
  
  #define UTF8_DISALLOW_SUPER            0x0400  /* Super-set of Unicode: code */
+#define UTF8_GOT_SUPER                 UTF8_DISALLOW_SUPER
  #define UTF8_WARN_SUPER                        0x0800  /* points above the legal max */
  
  /* Code points which never were part of the original UTF-8 standard, which only
@@ -745,6 +749,7 @@ case any call to string overloading updates the internal UTF-8 encoding flag.
   * The first byte of these code points is FE or FF on ASCII platforms.  If the
   * first byte is FF, it will overflow a 32-bit word. */
  #define UTF8_DISALLOW_ABOVE_31_BIT      0x1000
+#define UTF8_GOT_ABOVE_31_BIT           UTF8_DISALLOW_ABOVE_31_BIT
  #define UTF8_WARN_ABOVE_31_BIT          0x2000
  
  /* For back compat, these old names are misleading for UTF_EBCDIC */
author	Karl Williamson <khw@cpan.org>
	Thu, 6 Oct 2016 01:09:02 +0000 (19:09 -0600)
committer	Karl Williamson <khw@cpan.org>
	Thu, 13 Oct 2016 17:18:12 +0000 (11:18 -0600)
ext/XS-APItest/t/utf8.t		patch \| blob \| blame \| history
pod/perldelta.pod		patch \| blob \| blame \| history
pod/perldiag.pod		patch \| blob \| blame \| history
t/op/utf8decode.t		patch \| blob \| blame \| history
utf8.c		patch \| blob \| blame \| history
utf8.h		patch \| blob \| blame \| history