Add utf8n_to_uvchr_error

author Karl Williamson <khw@cpan.org>

Tue, 11 Oct 2016 03:18:37 +0000 (21:18 -0600)

committer Karl Williamson <khw@cpan.org>

Thu, 13 Oct 2016 17:18:12 +0000 (11:18 -0600)
author Karl Williamson <khw@cpan.org>
Tue, 11 Oct 2016 03:18:37 +0000 (21:18 -0600)
committer Karl Williamson <khw@cpan.org>
Thu, 13 Oct 2016 17:18:12 +0000 (11:18 -0600)
diff --git a/embed.fnc b/embed.fnc

index 64a812b..46426b6 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1737,7 +1737,15 @@ Amd      |UV     |utf8_to_uvchr_buf      |NN const U8 *s|NN const U8 *send|NULLOK STRLEN *retl
  ApdD   |UV     |utf8_to_uvuni_buf      |NN const U8 *s|NN const U8 *send|NULLOK STRLEN *retlen
  pM     |bool   |check_utf8_print       |NN const U8 *s|const STRLEN len
  
-Adp    |UV     |utf8n_to_uvchr |NN const U8 *s|STRLEN curlen|NULLOK STRLEN *retlen|const U32 flags
+Adop   |UV     |utf8n_to_uvchr |NN const U8 *s                             \
+                               |STRLEN curlen                              \
+                               |NULLOK STRLEN *retlen                      \
+                               |const U32 flags
+Adp    |UV     |utf8n_to_uvchr_error|NN const U8 *s                        \
+                               |STRLEN curlen                              \
+                               |NULLOK STRLEN *retlen                      \
+                               |const U32 flags                            \
+                               |NULLOK U32 * errors
  AipnR  |UV     |valid_utf8_to_uvchr    |NN const U8 *s|NULLOK STRLEN *retlen
  Ap     |UV     |utf8n_to_uvuni|NN const U8 *s|STRLEN curlen|NULLOK STRLEN *retlen|U32 flags
  
diff --git a/embed.h b/embed.h

index c2ed3f1..5df381c 100644 (file)
--- a/embed.h
+++ b/embed.h
@@ -737,7 +737,7 @@
  #define utf8_to_uvchr(a,b)     Perl_utf8_to_uvchr(aTHX_ a,b)
  #define utf8_to_uvuni(a,b)     Perl_utf8_to_uvuni(aTHX_ a,b)
  #define utf8_to_uvuni_buf(a,b,c)       Perl_utf8_to_uvuni_buf(aTHX_ a,b,c)
-#define utf8n_to_uvchr(a,b,c,d)        Perl_utf8n_to_uvchr(aTHX_ a,b,c,d)
+#define utf8n_to_uvchr_error(a,b,c,d,e)        Perl_utf8n_to_uvchr_error(aTHX_ a,b,c,d,e)
  #define utf8n_to_uvuni(a,b,c,d)        Perl_utf8n_to_uvuni(aTHX_ a,b,c,d)
  #define uvoffuni_to_utf8_flags(a,b,c)  Perl_uvoffuni_to_utf8_flags(aTHX_ a,b,c)
  #define uvuni_to_utf8(a,b)     Perl_uvuni_to_utf8(aTHX_ a,b)
diff --git a/ext/XS-APItest/APItest.xs b/ext/XS-APItest/APItest.xs

index ce94968..6b6f45f 100644 (file)
--- a/ext/XS-APItest/APItest.xs
+++ b/ext/XS-APItest/APItest.xs
@@ -1362,7 +1362,7 @@ bytes_cmp_utf8(bytes, utf8)
         RETVAL
  
  AV *
-test_utf8n_to_uvchr(s, len, flags)
+test_utf8n_to_uvchr_error(s, len, flags)
  
          SV *s
          SV *len
@@ -1371,20 +1371,25 @@ test_utf8n_to_uvchr(s, len, flags)
          STRLEN retlen;
          UV ret;
          STRLEN slen;
+        U32 errors;
  
      CODE:
-        /* Call utf8n_to_uvchr() with the inputs.  It always asks for the
-         * actual length to be returned
+        /* Now that utf8n_to_uvchr() is a trivial wrapper for
+         * utf8n_to_uvchr_error(), call the latter with the inputs.  It always
+         * asks for the actual length to be returned and errors to be returned
           *
           * Length to assume <s> is; not checked, so could have buffer overflow
           */
          RETVAL = newAV();
          sv_2mortal((SV*)RETVAL);
  
-        ret
-         = utf8n_to_uvchr((U8*) SvPV(s, slen), SvUV(len), &retlen, SvUV(flags));
+        ret = utf8n_to_uvchr_error((U8*) SvPV(s, slen),
+                                         SvUV(len),
+                                         &retlen,
+                                         SvUV(flags),
+                                         &errors);
  
-        /* Returns the return value in [0]; <retlen> in [1] */
+        /* Returns the return value in [0]; <retlen> in [1], <errors> in [2] */
          av_push(RETVAL, newSVuv(ret));
          if (retlen == (STRLEN) -1) {
              av_push(RETVAL, newSViv(-1));
@@ -1392,6 +1397,7 @@ test_utf8n_to_uvchr(s, len, flags)
          else {
              av_push(RETVAL, newSVuv(retlen));
          }
+        av_push(RETVAL, newSVuv(errors));
  
      OUTPUT:
          RETVAL
diff --git a/ext/XS-APItest/t/utf8.t b/ext/XS-APItest/t/utf8.t

index 9e75c25..0f2d9ee 100644 (file)
--- a/ext/XS-APItest/t/utf8.t
+++ b/ext/XS-APItest/t/utf8.t
@@ -8,7 +8,7 @@ no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
                            # machines, and that is tested elsewhere
  
  use XS::APItest;
-
+use Data::Dumper;
  my $pound_sign = chr utf8::unicode_to_native(163);
  
  sub isASCII { ord "A" == 65 }
@@ -87,20 +87,30 @@ sub start_byte_to_cont($) {
  my $is64bit = length sprintf("%x", ~0) > 8;
  
  
-# Test utf8n_to_uvchr().  These provide essentially complete code coverage.
-# Copied from utf8.h
+# Test utf8n_to_uvchr_error().  These provide essentially complete code
+# coverage.  Copied from utf8.h
  my $UTF8_ALLOW_EMPTY            = 0x0001;
+my $UTF8_GOT_EMPTY              = $UTF8_ALLOW_EMPTY;
  my $UTF8_ALLOW_CONTINUATION     = 0x0002;
+my $UTF8_GOT_CONTINUATION       = $UTF8_ALLOW_CONTINUATION;
  my $UTF8_ALLOW_NON_CONTINUATION = 0x0004;
+my $UTF8_GOT_NON_CONTINUATION   = $UTF8_ALLOW_NON_CONTINUATION;
  my $UTF8_ALLOW_SHORT            = 0x0008;
+my $UTF8_GOT_SHORT              = $UTF8_ALLOW_SHORT;
  my $UTF8_ALLOW_LONG             = 0x0010;
+my $UTF8_GOT_LONG               = $UTF8_ALLOW_LONG;
+my $UTF8_GOT_OVERFLOW           = 0x0020;
  my $UTF8_DISALLOW_SURROGATE     = 0x0040;
+my $UTF8_GOT_SURROGATE          = $UTF8_DISALLOW_SURROGATE;
  my $UTF8_WARN_SURROGATE         = 0x0080;
  my $UTF8_DISALLOW_NONCHAR       = 0x0100;
+my $UTF8_GOT_NONCHAR            = $UTF8_DISALLOW_NONCHAR;
  my $UTF8_WARN_NONCHAR           = 0x0200;
  my $UTF8_DISALLOW_SUPER         = 0x0400;
+my $UTF8_GOT_SUPER              = $UTF8_DISALLOW_SUPER;
  my $UTF8_WARN_SUPER             = 0x0800;
  my $UTF8_DISALLOW_ABOVE_31_BIT  = 0x1000;
+my $UTF8_GOT_ABOVE_31_BIT       = $UTF8_DISALLOW_ABOVE_31_BIT;
  my $UTF8_WARN_ABOVE_31_BIT      = 0x2000;
  my $UTF8_CHECK_ONLY             = 0x4000;
  my $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
@@ -559,19 +569,23 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
  
      my $display_flags = sprintf "0x%x", $this_utf8_flags;
      my $display_bytes = display_bytes($bytes);
-    my $ret_ref = test_utf8n_to_uvchr($bytes, $len, $this_utf8_flags);
+    my $ret_ref = test_utf8n_to_uvchr_error($bytes, $len, $this_utf8_flags);
  
      # Rest of tests likely meaningless if it gets the wrong code point.
      next unless is($ret_ref->[0], $n,
-       "Verify utf8n_to_uvchr($display_bytes, $display_flags) returns $hex_n");
+                   "Verify utf8n_to_uvchr_error($display_bytes, $display_flags)"
+                 . "returns $hex_n");
      is($ret_ref->[1], $len,
-       "Verify utf8n_to_uvchr() for $hex_n returns expected length: $len");
+       "Verify utf8n_to_uvchr_error() for $hex_n returns expected length:"
+     . " $len");
  
      unless (is(scalar @warnings, 0,
-               "Verify utf8n_to_uvchr() for $hex_n generated no warnings"))
+             "Verify utf8n_to_uvchr_error() for $hex_n generated no warnings"))
      {
          output_warnings(@warnings);
      }
+    is($ret_ref->[2], 0,
+       "Verify utf8n_to_uvchr_error() returned no error bits");
  
      undef @warnings;
  
@@ -999,36 +1013,36 @@ my $REPLACEMENT = 0xFFFD;
  # Now test the malformations.  All these raise category utf8 warnings.
  my @malformations = (
      [ "zero length string malformation", "", 0,
-        $UTF8_ALLOW_EMPTY, 0, 0,
+        $UTF8_ALLOW_EMPTY, $UTF8_GOT_EMPTY, 0, 0,
          qr/empty string/
      ],
      [ "orphan continuation byte malformation", I8_to_native("${I8c}a"),
          2,
-        $UTF8_ALLOW_CONTINUATION, $REPLACEMENT, 1,
+        $UTF8_ALLOW_CONTINUATION, $UTF8_GOT_CONTINUATION, $REPLACEMENT, 1,
          qr/unexpected continuation byte/
      ],
      [ "premature next character malformation (immediate)",
          (isASCII) ? "\xc2\xc2\x80" : I8_to_native("\xc5\xc5\xa0"),
          3,
-        $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 1,
+        $UTF8_ALLOW_NON_CONTINUATION, $UTF8_GOT_NON_CONTINUATION, $REPLACEMENT, 1,
          qr/unexpected non-continuation byte.*immediately after start byte/
      ],
      [ "premature next character malformation (non-immediate)",
          I8_to_native("\xf1${I8c}a"),
          3,
-        $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 2,
+        $UTF8_ALLOW_NON_CONTINUATION, $UTF8_GOT_NON_CONTINUATION, $REPLACEMENT, 2,
          qr/unexpected non-continuation byte .* 2 bytes after start byte/
      ],
      [ "too short malformation", I8_to_native("\xf1${I8c}a"), 2,
          # Having the 'a' after this, but saying there are only 2 bytes also
          # tests that we pay attention to the passed in length
-        $UTF8_ALLOW_SHORT, $REPLACEMENT, 2,
+        $UTF8_ALLOW_SHORT, $UTF8_GOT_SHORT, $REPLACEMENT, 2,
          qr/2 bytes, need 4/
      ],
      [ "overlong malformation, lowest 2-byte",
          (isASCII) ? "\xc0\x80" : I8_to_native("\xc0\xa0"),
          2,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
          0,   # NUL
          2,
          qr/overlong/
@@ -1036,7 +1050,7 @@ my @malformations = (
      [ "overlong malformation, highest 2-byte",
          (isASCII) ? "\xc1\xbf" : I8_to_native("\xc4\xbf"),
          2,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
          (isASCII) ? 0x7F : utf8::unicode_to_native(0xBF),
          2,
          qr/overlong/
@@ -1044,7 +1058,7 @@ my @malformations = (
      [ "overlong malformation, lowest 3-byte",
          (isASCII) ? "\xe0\x80\x80" : I8_to_native("\xe0\xa0\xa0"),
          3,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
          0,   # NUL
          3,
          qr/overlong/
@@ -1052,7 +1066,7 @@ my @malformations = (
      [ "overlong malformation, highest 3-byte",
          (isASCII) ? "\xe0\x9f\xbf" : I8_to_native("\xe0\xbf\xbf"),
          3,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
          (isASCII) ? 0x7FF : 0x3FF,
          3,
          qr/overlong/
@@ -1060,7 +1074,7 @@ my @malformations = (
      [ "overlong malformation, lowest 4-byte",
          (isASCII) ? "\xf0\x80\x80\x80" : I8_to_native("\xf0\xa0\xa0\xa0"),
          4,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
          0,   # NUL
          4,
          qr/overlong/
@@ -1068,7 +1082,7 @@ my @malformations = (
      [ "overlong malformation, highest 4-byte",
          (isASCII) ? "\xf0\x8F\xbf\xbf" : I8_to_native("\xf0\xaf\xbf\xbf"),
          4,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
          (isASCII) ? 0xFFFF : 0x3FFF,
          4,
          qr/overlong/
@@ -1078,7 +1092,7 @@ my @malformations = (
           ?              "\xf8\x80\x80\x80\x80"
           : I8_to_native("\xf8\xa0\xa0\xa0\xa0"),
          5,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
          0,   # NUL
          5,
          qr/overlong/
@@ -1088,7 +1102,7 @@ my @malformations = (
           ?              "\xf8\x87\xbf\xbf\xbf"
           : I8_to_native("\xf8\xa7\xbf\xbf\xbf"),
          5,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
          (isASCII) ? 0x1FFFFF : 0x3FFFF,
          5,
          qr/overlong/
@@ -1098,7 +1112,7 @@ my @malformations = (
           ?              "\xfc\x80\x80\x80\x80\x80"
           : I8_to_native("\xfc\xa0\xa0\xa0\xa0\xa0"),
          6,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
          0,   # NUL
          6,
          qr/overlong/
@@ -1108,7 +1122,7 @@ my @malformations = (
           ?              "\xfc\x83\xbf\xbf\xbf\xbf"
           : I8_to_native("\xfc\xa3\xbf\xbf\xbf\xbf"),
          6,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
          (isASCII) ? 0x3FFFFFF : 0x3FFFFF,
          6,
          qr/overlong/
@@ -1118,7 +1132,7 @@ my @malformations = (
           ?              "\xfe\x80\x80\x80\x80\x80\x80"
           : I8_to_native("\xfe\xa0\xa0\xa0\xa0\xa0\xa0"),
          7,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
          0,   # NUL
          7,
          qr/overlong/
@@ -1128,7 +1142,7 @@ my @malformations = (
           ?              "\xfe\x81\xbf\xbf\xbf\xbf\xbf"
           : I8_to_native("\xfe\xa1\xbf\xbf\xbf\xbf\xbf"),
          7,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
          (isASCII) ? 0x7FFFFFFF : 0x3FFFFFF,
          7,
          qr/overlong/
@@ -1142,6 +1156,7 @@ if (isASCII && ! $is64bit) {    # 32-bit ASCII platform
              "\xfe\x84\x80\x80\x80\x80\x80",  # Represents 2**32
              7,
              0,  # There is no way to allow this malformation
+            $UTF8_GOT_OVERFLOW,
              $REPLACEMENT,
              7,
              qr/overflows/
@@ -1150,6 +1165,7 @@ if (isASCII && ! $is64bit) {    # 32-bit ASCII platform
              "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
              13,
              0,  # There is no way to allow this malformation
+            $UTF8_GOT_OVERFLOW,
              $REPLACEMENT,
              13,
              qr/overflows/
@@ -1167,7 +1183,7 @@ else {
               ?              "\xff\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
               : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
              (isASCII) ? 13 : 14,
-            $UTF8_ALLOW_LONG,
+            $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
              0,   # NUL
              (isASCII) ? 13 : 14,
              qr/overlong/,
@@ -1177,7 +1193,7 @@ else {
               ?              "\xff\x80\x80\x80\x80\x80\x80\xbf\xbf\xbf\xbf\xbf\xbf"
               : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xbf\xbf\xbf\xbf\xbf\xbf"),
              (isASCII) ? 13 : 14,
-            $UTF8_ALLOW_LONG,
+            $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
              (isASCII) ? 0xFFFFFFFFF : 0x3FFFFFFF,
              (isASCII) ? 13 : 14,
              qr/overlong/,
@@ -1189,6 +1205,7 @@ else {
              I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
              14,
              0,  # There is no way to allow this malformation
+            $UTF8_GOT_OVERFLOW,
              $REPLACEMENT,
              14,
              qr/overflows/
@@ -1202,6 +1219,7 @@ else {
                  : I8_to_native("\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                  (isASCII) ? 13 : 14,
                  0,  # There is no way to allow this malformation
+                $UTF8_GOT_OVERFLOW,
                  $REPLACEMENT,
                  (isASCII) ? 13 : 14,
                  qr/overflows/
@@ -1210,7 +1228,8 @@ else {
  }
  
  foreach my $test (@malformations) {
-    my ($testname, $bytes, $length, $allow_flags, $allowed_uv, $expected_len, $message ) = @$test;
+    my ($testname, $bytes, $length, $allow_flags, $expected_error_flags,
+        $allowed_uv, $expected_len, $message ) = @$test;
  
      if (length($bytes) < $length) {
          fail("Internal test error: actual buffer length (" . length($bytes)
@@ -1314,49 +1333,79 @@ foreach my $test (@malformations) {
  
      # Test what happens when this malformation is not allowed
      undef @warnings;
-    my $ret_ref = test_utf8n_to_uvchr($bytes, $length, 0);
+    my $ret_ref = test_utf8n_to_uvchr_error($bytes, $length, 0);
      is($ret_ref->[0], 0, "$testname: disallowed: Returns 0");
-    is($ret_ref->[1], $expected_len, "$testname: utf8n_to_uvchr(), disallowed: Returns expected length: $expected_len");
-    if (is(scalar @warnings, 1, "$testname: disallowed: Got a single warning ")) {
-        like($warnings[0], $message, "$testname: disallowed: Got expected warning");
+    is($ret_ref->[1], $expected_len,
+       "$testname: utf8n_to_uvchr_error(), disallowed: Returns expected"
+     . " length: $expected_len");
+    if (is(scalar @warnings, 1,
+           "$testname: disallowed: Got a single warning "))
+    {
+        like($warnings[0], $message,
+             "$testname: disallowed: Got expected warning");
      }
      else {
          if (scalar @warnings) {
              output_warnings(@warnings);
          }
      }
+    is($ret_ref->[2], $expected_error_flags,
+       "$testname: utf8n_to_uvchr_error(), disallowed:"
+     . " Returns expected error");
  
      {   # Next test when disallowed, and warnings are off.
          undef @warnings;
          no warnings 'utf8';
-        my $ret_ref = test_utf8n_to_uvchr($bytes, $length, 0);
-        is($ret_ref->[0], 0, "$testname: utf8n_to_uvchr(), disallowed: no warnings 'utf8': Returns 0");
-        is($ret_ref->[1], $expected_len, "$testname: utf8n_to_uvchr(), disallowed: no warnings 'utf8': Returns expected length: $expected_len");
-        if (!is(scalar @warnings, 0, "$testname: utf8n_to_uvchr(), disallowed: no warnings 'utf8': no warnings generated")) {
+        my $ret_ref = test_utf8n_to_uvchr_error($bytes, $length, 0);
+        is($ret_ref->[0], 0,
+           "$testname: utf8n_to_uvchr_error(), disallowed: no warnings 'utf8':"
+         . " Returns 0");
+        is($ret_ref->[1], $expected_len,
+           "$testname: utf8n_to_uvchr_error(), disallowed: no warnings 'utf8':"
+         . " Returns expected length: $expected_len");
+        if (!is(scalar @warnings, 0,
+            "$testname: utf8n_to_uvchr_error(), disallowed: no warnings 'utf8':"
+          . " no warnings generated"))
+        {
              output_warnings(@warnings);
          }
+        is($ret_ref->[2], $expected_error_flags,
+           "$testname: utf8n_to_uvchr_error(), disallowed: Returns"
+         . " expected error");
      }
  
      # Test with CHECK_ONLY
      undef @warnings;
-    $ret_ref = test_utf8n_to_uvchr($bytes, $length, $UTF8_CHECK_ONLY);
+    $ret_ref = test_utf8n_to_uvchr_error($bytes, $length, $UTF8_CHECK_ONLY);
      is($ret_ref->[0], 0, "$testname: CHECK_ONLY: Returns 0");
      is($ret_ref->[1], -1, "$testname: CHECK_ONLY: returns -1 for length");
      if (! is(scalar @warnings, 0, "$testname: CHECK_ONLY: no warnings generated")) {
          output_warnings(@warnings);
      }
+    is($ret_ref->[2], $expected_error_flags,
+       "$testname: utf8n_to_uvchr_error(), disallowed: Returns expected"
+     . " error");
  
      next if $allow_flags == 0;    # Skip if can't allow this malformation
  
      # Test when the malformation is allowed
      undef @warnings;
-    $ret_ref = test_utf8n_to_uvchr($bytes, $length, $allow_flags);
-    is($ret_ref->[0], $allowed_uv, "$testname: utf8n_to_uvchr(), allowed: Returns expected uv: " . sprintf("0x%04X", $allowed_uv));
-    is($ret_ref->[1], $expected_len, "$testname: utf8n_to_uvchr(), allowed: Returns expected length: $expected_len");
-    if (!is(scalar @warnings, 0, "$testname: utf8n_to_uvchr(), allowed: no warnings generated"))
+    $ret_ref = test_utf8n_to_uvchr_error($bytes, $length, $allow_flags);
+    is($ret_ref->[0], $allowed_uv,
+       "$testname: utf8n_to_uvchr_error(), allowed: Returns expected uv: "
+     . sprintf("0x%04X", $allowed_uv));
+    is($ret_ref->[1], $expected_len,
+       "$testname: utf8n_to_uvchr_error(), allowed: Returns expected length:"
+     . " $expected_len");
+    if (!is(scalar @warnings, 0,
+            "$testname: utf8n_to_uvchr_error(), allowed: no warnings"
+          . " generated"))
      {
          output_warnings(@warnings);
      }
+    is($ret_ref->[2], $expected_error_flags,
+       "$testname: utf8n_to_uvchr_error(), disallowed: Returns"
+     . " expected error");
  }
  
  sub nonportable_regex ($) {
@@ -1381,35 +1430,35 @@ sub nonportable_regex ($) {
  my @tests = (
      [ "lowest surrogate",
          (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
-        $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
+        $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE, $UTF8_GOT_SURROGATE,
          'surrogate', 0xD800,
          (isASCII) ? 3 : 4,
          qr/surrogate/
      ],
      [ "a middle surrogate",
          (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
-        $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
+        $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE, $UTF8_GOT_SURROGATE,
          'surrogate', 0xD90D,
          (isASCII) ? 3 : 4,
          qr/surrogate/
      ],
      [ "highest surrogate",
          (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
-        $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
+        $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE, $UTF8_GOT_SURROGATE,
          'surrogate', 0xDFFF,
          (isASCII) ? 3 : 4,
          qr/surrogate/
      ],
      [ "first non_unicode",
          (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
-        $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
+        $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER, $UTF8_GOT_SUPER,
          'non_unicode', 0x110000,
          (isASCII) ? 4 : 5,
          qr/(not Unicode|for a non-Unicode code point).* may not be portable/
      ],
      [ "non_unicode whose first byte tells that",
          (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
-        $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
+        $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER, $UTF8_GOT_SUPER,
          'non_unicode',
          (isASCII) ? 0x140000 : 0x200000,
          (isASCII) ? 4 : 5,
@@ -1417,253 +1466,253 @@ my @tests = (
      ],
      [ "first of 32 consecutive non-character code points",
          (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0xFDD0,
          (isASCII) ? 3 : 4,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "a mid non-character code point of the 32 consecutive ones",
          (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0xFDE0,
          (isASCII) ? 3 : 4,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "final of 32 consecutive non-character code points",
          (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0xFDEF,
          (isASCII) ? 3 : 4,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+FFFE",
          (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0xFFFE,
          (isASCII) ? 3 : 4,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+FFFF",
          (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0xFFFF,
          (isASCII) ? 3 : 4,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+1FFFE",
          (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0x1FFFE, 4,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+1FFFF",
          (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0x1FFFF, 4,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+2FFFE",
          (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0x2FFFE, 4,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+2FFFF",
          (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0x2FFFF, 4,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+3FFFE",
          (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0x3FFFE, 4,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+3FFFF",
          (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0x3FFFF, 4,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+4FFFE",
          (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0x4FFFE,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+4FFFF",
          (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0x4FFFF,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+5FFFE",
          (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0x5FFFE,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+5FFFF",
          (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0x5FFFF,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+6FFFE",
          (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0x6FFFE,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+6FFFF",
          (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0x6FFFF,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+7FFFE",
          (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0x7FFFE,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+7FFFF",
          (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0x7FFFF,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+8FFFE",
          (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0x8FFFE,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+8FFFF",
          (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0x8FFFF,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+9FFFE",
          (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0x9FFFE,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+9FFFF",
          (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0x9FFFF,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+AFFFE",
          (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0xAFFFE,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+AFFFF",
          (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0xAFFFF,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+BFFFE",
          (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0xBFFFE,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+BFFFF",
          (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0xBFFFF,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+CFFFE",
          (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0xCFFFE,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+CFFFF",
          (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0xCFFFF,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+DFFFE",
          (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0xDFFFE,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+DFFFF",
          (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0xDFFFF,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+EFFFE",
          (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0xEFFFE,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+EFFFF",
          (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0xEFFFF,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+FFFFE",
          (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0xFFFFE,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+FFFFF",
          (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0xFFFFF,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+10FFFE",
          (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0x10FFFE,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
      ],
      [ "non-character code point U+10FFFF",
          (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
          'nonchar', 0x10FFFF,
          (isASCII) ? 4 : 5,
          qr/Unicode non-character.*is not recommended for open interchange/
@@ -1675,6 +1724,7 @@ my @tests = (
          # This code point is chosen so that it is representable in a UV on
          # 32-bit machines
          $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
+        $UTF8_GOT_ABOVE_31_BIT,
          'utf8', 0x80000000, (isASCII) ? 7 :14,
          nonportable_regex(0x80000000)
      ],
@@ -1682,7 +1732,7 @@ my @tests = (
          (isASCII)
           ? "\xfe\x82\x80\x80\x80\x80\x80"
           : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
-        $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
+        $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER, $UTF8_GOT_SUPER,
          'utf8', 0x80000000, (isASCII) ? 7 :14,
          nonportable_regex(0x80000000)
      ],
@@ -1702,11 +1752,9 @@ my @tests = (
          : ((isASCII)
             ?              "\xfe\x86\x80\x80\x80\x80\x80"
             : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))),
-
-        # We include both warning categories to make sure the ABOVE_31_BIT one
-        # has precedence
-        "$UTF8_WARN_ABOVE_31_BIT|$UTF8_WARN_SUPER",
-        "$UTF8_DISALLOW_ABOVE_31_BIT",
+        $UTF8_WARN_ABOVE_31_BIT,
+        $UTF8_DISALLOW_ABOVE_31_BIT,
+        $UTF8_GOT_ABOVE_31_BIT,
          'utf8', 0,
          (! isASCII) ? 14 : ($is64bit) ? 13 : 7,
          qr/overflows/
@@ -1721,6 +1769,7 @@ if ($is64bit) {
              ?              "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
              : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
              $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
+            $UTF8_GOT_ABOVE_31_BIT,
              'utf8', 0x1000000000, (isASCII) ? 13 : 14,
              qr/and( is)? not portable/
          ];
@@ -1729,30 +1778,35 @@ if ($is64bit) {
              [ "requires at least 32 bits",
                  I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                  $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
+                $UTF8_GOT_ABOVE_31_BIT,
                  'utf8', 0x800000000, 14,
                  nonportable_regex(0x80000000)
              ],
              [ "requires at least 32 bits",
                  I8_to_native("\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                  $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
+                $UTF8_GOT_ABOVE_31_BIT,
                  'utf8', 0x10000000000, 14,
                  nonportable_regex(0x10000000000)
              ],
              [ "requires at least 32 bits",
                  I8_to_native("\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                  $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
+                $UTF8_GOT_ABOVE_31_BIT,
                  'utf8', 0x200000000000, 14,
                  nonportable_regex(0x20000000000)
              ],
              [ "requires at least 32 bits",
                  I8_to_native("\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                  $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
+                $UTF8_GOT_ABOVE_31_BIT,
                  'utf8', 0x4000000000000, 14,
                  nonportable_regex(0x4000000000000)
              ],
              [ "requires at least 32 bits",
                  I8_to_native("\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                  $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
+                $UTF8_GOT_ABOVE_31_BIT,
                  'utf8', 0x80000000000000, 14,
                  nonportable_regex(0x80000000000000)
              ],
@@ -1760,6 +1814,7 @@ if ($is64bit) {
                  I8_to_native("\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                     #IBM-1047  \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
                  $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
+                $UTF8_GOT_ABOVE_31_BIT,
                  'utf8', 0x1000000000000000, 14,
                  nonportable_regex(0x1000000000000000)
              ];
@@ -1767,7 +1822,8 @@ if ($is64bit) {
  }
  
  foreach my $test (@tests) {
-    my ($testname, $bytes, $warn_flags, $disallow_flags, $category, $allowed_uv, $expected_len, $message ) = @$test;
+    my ($testname, $bytes, $warn_flags, $disallow_flags, $expected_error_flags,
+        $category, $allowed_uv, $expected_len, $message ) = @$test;
  
      my $length = length $bytes;
      my $will_overflow = $testname =~ /overflow/ ? 'overflow' : "";
@@ -1932,13 +1988,17 @@ foreach my $test (@tests) {
                              && ord native_to_I8(substr($bytes, 0, 1)) >= 0xFE;
  
                              my @malformations;
+                            my @expected_errors;
                              push @malformations, $short if $short;
                              push @malformations, $overlong if $overlong;
  
                              # The overflow malformation test in the input
                              # array is coerced into being treated like one of
                              # the others.
-                            push @malformations, 'overflow' if $will_overflow;
+                            if ($will_overflow) {
+                                push @malformations, 'overflow';
+                                push @expected_errors, $UTF8_GOT_OVERFLOW;
+                            }
  
                              my $malformations_name = join "/", @malformations;
                              $malformations_name .= " malformation"
@@ -1972,6 +2032,7 @@ foreach my $test (@tests) {
                                                 . $this_bytes;
                                      $this_length = length($this_bytes);
                                      $this_expected_len = 7;
+                                    push @expected_errors, $UTF8_GOT_LONG;
                                  }
                                  if ($malformations_name =~ /short/) {
  
@@ -1979,6 +2040,7 @@ foreach my $test (@tests) {
                                      # enough into the input.
                                      $this_length--;
                                      $this_expected_len--;
+                                    push @expected_errors, $UTF8_GOT_SHORT;
                                  }
                                  elsif ($malformations_name
                                                          =~ /non-continuation/)
@@ -1987,6 +2049,8 @@ foreach my $test (@tests) {
                                      # a non one.
                                      substr($this_bytes, -1, 1) = '?';
                                      $this_expected_len--;
+                                    push @expected_errors,
+                                                    $UTF8_GOT_NON_CONTINUATION;
                                  }
                              }
  
@@ -2003,7 +2067,7 @@ foreach my $test (@tests) {
                              # well
                              my $disallowed = $disallow_flag
                                            || $malformations_name;
-                            my $this_name = "utf8n_to_uvchr() $testname: "
+                            my $this_name = "utf8n_to_uvchr_error() $testname: "
                                                          . (($disallow_flag)
                                                              ? 'disallowed'
                                                              : $disallowed
@@ -2018,11 +2082,13 @@ foreach my $test (@tests) {
                              my $ret_ref;
                              my $display_bytes = display_bytes($this_bytes);
                              my $call = "Call was: $eval_warn; \$ret_ref"
-                                     . " = test_utf8n_to_uvchr('$display_bytes'"
-                                     . ", $this_length, $warn_flag"
+                                     . " = test_utf8n_to_uvchr_error("
+                                     . "'$display_bytes', $this_length,"
+                                     . "$warn_flag"
                                       . "|$disallow_flag)";
                              my $eval_text =      "$eval_warn; \$ret_ref"
-                                     . " = test_utf8n_to_uvchr('$this_bytes',"
+                                     . " = test_utf8n_to_uvchr_error("
+                                     . "'$this_bytes',"
                                       . " $this_length, $warn_flag"
                                       . "|$disallow_flag)";
                              eval "$eval_text";
@@ -2054,6 +2120,31 @@ foreach my $test (@tests) {
                                  diag $call;
                              }
  
+                            my $errors = $ret_ref->[2];
+
+                            for (my $i = @expected_errors - 1; $i >= 0; $i--) {
+                                if (ok($expected_errors[$i] & $errors,
+                                       "Expected and got error bit return"
+                                     . " for $malformations[$i] malformation"))
+                                {
+                                    $errors &= ~$expected_errors[$i];
+                                }
+                                splice @expected_errors, $i, 1;
+                            }
+                            unless (is(scalar @expected_errors, 0,
+                                    "Got all the expected malformation errors"))
+                            {
+                                diag Dumper \@expected_errors;
+                            }
+
+                            if ($warn_flag || $disallow_flag) {
+                                is($errors, $expected_error_flags,
+                                   "Got the correct error flag");
+                            }
+                            else {
+                                is($errors, 0, "Got no other error flag");
+                            }
+
                              if (@malformations) {
                                  if (! $do_warning && $warning eq 'utf8') {
                                      goto no_warnings_expected;
@@ -2119,7 +2210,7 @@ foreach my $test (@tests) {
                              # not just when the $disallow_flag is set
                              if ($disallowed) {
                                  undef @warnings;
-                                $ret_ref = test_utf8n_to_uvchr(
+                                $ret_ref = test_utf8n_to_uvchr_error(
                                                 $this_bytes, $this_length,
                                                 $disallow_flag|$UTF8_CHECK_ONLY);
                                  unless (is($ret_ref->[0], 0,
@@ -2149,7 +2240,7 @@ foreach my $test (@tests) {
                              next if @malformations;
  
                              # The warning and disallow flags passed in are for
-                            # utf8n_to_uvchr().  Convert them for
+                            # utf8n_to_uvchr_error().  Convert them for
                              # uvchr_to_utf8_flags().
                              my $uvchr_warn_flag = 0;
                              my $uvchr_disallow_flag = 0;
diff --git a/pod/perldelta.pod b/pod/perldelta.pod

index 62eaa0b..7479528 100644 (file)
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -335,6 +335,14 @@ abandon searching for other malformations when the first one is
  encountered.  A call to it thus can generate multiple diagnostics,
  instead of just one.
  
+=item *
+
+A new function, C<L<perlapi/utf8n_to_uvchr_error>>, has been added for
+use by modules that need to know the details of UTF-8 malformations
+beyond pass/fail.  Previously, the only ways to know why a sequence was
+ill-formed was to capture and parse the generated diagnostics, or to do
+your own analysis.
+
  =back
  
  =head1 Selected Bug Fixes
diff --git a/proto.h b/proto.h

index 5b54bda..701dc9e 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -3537,6 +3537,9 @@ PERL_CALLCONV UV  Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLE
  PERL_CALLCONV UV       Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags);
  #define PERL_ARGS_ASSERT_UTF8N_TO_UVCHR        \
         assert(s)
+PERL_CALLCONV UV       Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags, U32 * errors);
+#define PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR  \
+       assert(s)
  PERL_CALLCONV UV       Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
  #define PERL_ARGS_ASSERT_UTF8N_TO_UVUNI        \
         assert(s)
diff --git a/utf8.c b/utf8.c

index a538296..0432bb0 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -811,7 +811,7 @@ character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
  in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
  be set to 1.  To disambiguate, upon a zero return, see if the first byte of
  C<s> is 0 as well.  If so, the input was a C<NUL>; if not, the input had an
-error.
+error.  Or you can use C<L</utf8n_to_uvchr_error>>.
  
  Certain code points are considered problematic.  These are Unicode surrogates,
  Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
@@ -874,10 +874,112 @@ use and those yet to be assigned, are never considered malformed and never
  warn.
  
  =cut
+
+Also implemented as a macro in utf8.h
+*/
+
+UV
+Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
+                          STRLEN curlen,
+                          STRLEN *retlen,
+                          const U32 flags)
+{
+    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
+
+    return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
+}
+
+/*
+
+=for apidoc utf8n_to_uvchr_error
+
+THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
+Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
+
+This function is for code that needs to know what the precise malformation(s)
+are when an error is found.
+
+It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
+all the others, C<errors>.  If this parameter is 0, this function behaves
+identically to C<L</utf8n_to_uvchr>>.  Otherwise, C<errors> should be a pointer
+to a C<U32> variable, which this function sets to indicate any errors found.
+Upon return, if C<*errors> is 0, there were no errors found.  Otherwise,
+C<*errors> is the bit-wise C<OR> of the bits described in the list below.  Some
+of these bits will be set if a malformation is found, even if the input
+C<flags> parameter indicates that the given malformation is allowed; the
+exceptions are noted:
+
+=over 4
+
+=item C<UTF8_GOT_ABOVE_31_BIT>
+
+The code point represented by the input UTF-8 sequence occupies more than 31
+bits.
+This bit is set only if the input C<flags> parameter contains either the
+C<UTF8_DISALLOW_ABOVE_31_BIT> or the C<UTF8_WARN_ABOVE_31_BIT> flags.
+
+=item C<UTF8_GOT_CONTINUATION>
+
+The input sequence was malformed in that the first byte was a a UTF-8
+continuation byte.
+
+=item C<UTF8_GOT_EMPTY>
+
+The input C<curlen> parameter was 0.
+
+=item C<UTF8_GOT_LONG>
+
+The input sequence was malformed in that there is some other sequence that
+evaluates to the same code point, but that sequence is shorter than this one.
+
+=item C<UTF8_GOT_NONCHAR>
+
+The code point represented by the input UTF-8 sequence is for a Unicode
+non-character code point.
+This bit is set only if the input C<flags> parameter contains either the
+C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
+
+=item C<UTF8_GOT_NON_CONTINUATION>
+
+The input sequence was malformed in that a non-continuation type byte was found
+in a position where only a continuation type one should be.
+
+=item C<UTF8_GOT_OVERFLOW>
+
+The input sequence was malformed in that it is for a code point that is not
+representable in the number of bits available in a UV on the current platform.
+
+=item C<UTF8_GOT_SHORT>
+
+The input sequence was malformed in that C<curlen> is smaller than required for
+a complete sequence.  In other words, the input is for a partial character
+sequence.
+
+=item C<UTF8_GOT_SUPER>
+
+The input sequence was malformed in that it is for a non-Unicode code point;
+that is, one above the legal Unicode maximum.
+This bit is set only if the input C<flags> parameter contains either the
+C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
+
+=item C<UTF8_GOT_SURROGATE>
+
+The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate
+code point.
+This bit is set only if the input C<flags> parameter contains either the
+C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
+
+=back
+
+=cut
  */
  
  UV
-Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags)
+Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
+                                STRLEN curlen,
+                                STRLEN *retlen,
+                                const U32 flags,
+                                U32 * errors)
  {
      const U8 * const s0 = s;
      U8 * send = NULL;           /* (initialized to silence compilers' wrong
@@ -888,6 +990,8 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
      STRLEN expectlen   = 0;     /* How long should this sequence be?
                                     (initialized to silence compilers' wrong
                                     warning) */
+    U32 discard_errors = 0;     /* Used to save branches when 'errors' is NULL;
+                                   this gets set and discarded */
  
      /* The below are used only if there is both an overlong malformation and a
       * too short one.  Otherwise the first two are set to 's0' and 'send', and
@@ -896,7 +1000,14 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
      U8 * adjusted_send;
      UV uv_so_far = 0;   /* (Initialized to silence compilers' wrong warning) */
  
-    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
+    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
+
+    if (errors) {
+        *errors = 0;
+    }
+    else {
+        errors = &discard_errors;
+    }
  
      /* The order of malformation tests here is important.  We should consume as
       * few bytes as possible in order to not skip any valid character.  This is
@@ -1162,6 +1273,15 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
                   * handle all three cases here */
                  possible_problems
                    &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_ABOVE_31_BIT);
+                *errors |= UTF8_GOT_OVERFLOW;
+
+                /* But the API says we flag all errors found */
+                if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
+                    *errors |= UTF8_GOT_SUPER;
+                }
+                if (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT)) {
+                    *errors |= UTF8_GOT_ABOVE_31_BIT;
+                }
  
                  disallowed = TRUE;
  
@@ -1186,6 +1306,7 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
              }
              else if (possible_problems & UTF8_GOT_EMPTY) {
                  possible_problems &= ~UTF8_GOT_EMPTY;
+                *errors |= UTF8_GOT_EMPTY;
  
                  if (! (flags & UTF8_ALLOW_EMPTY)) {
                      disallowed = TRUE;
@@ -1198,6 +1319,7 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
              }
              else if (possible_problems & UTF8_GOT_CONTINUATION) {
                  possible_problems &= ~UTF8_GOT_CONTINUATION;
+                *errors |= UTF8_GOT_CONTINUATION;
  
                  if (! (flags & UTF8_ALLOW_CONTINUATION)) {
                      disallowed = TRUE;
@@ -1213,6 +1335,7 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
              }
              else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
                  possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
+                *errors |= UTF8_GOT_NON_CONTINUATION;
  
                  if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
                      disallowed = TRUE;
@@ -1228,6 +1351,7 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
              }
              else if (possible_problems & UTF8_GOT_SHORT) {
                  possible_problems &= ~UTF8_GOT_SHORT;
+                *errors |= UTF8_GOT_SHORT;
  
                  if (! (flags & UTF8_ALLOW_SHORT)) {
                      disallowed = TRUE;
@@ -1246,6 +1370,7 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
              }
              else if (possible_problems & UTF8_GOT_LONG) {
                  possible_problems &= ~UTF8_GOT_LONG;
+                *errors |= UTF8_GOT_LONG;
  
                  if (! (flags & UTF8_ALLOW_LONG)) {
                      disallowed = TRUE;
@@ -1289,13 +1414,12 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
              else if (possible_problems & UTF8_GOT_SURROGATE) {
                  possible_problems &= ~UTF8_GOT_SURROGATE;
  
-                /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
-                 * generation of the format, since no warnings are raised under
-                 * CHECK */
-                if (   (flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY))
-                                                        == UTF8_WARN_SURROGATE
-                    && ckWARN_d(WARN_SURROGATE))
-                {
+                if (flags & UTF8_WARN_SURROGATE) {
+                    *errors |= UTF8_GOT_SURROGATE;
+
+                    if (   ! (flags & UTF8_CHECK_ONLY)
+                        && ckWARN_d(WARN_SURROGATE))
+                    {
                          pack_warn = packWARN(WARN_SURROGATE);
  
                          /* These are the only errors that can occur with a
@@ -1310,19 +1434,23 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
                              message = Perl_form(aTHX_
                                              "UTF-16 surrogate U+%04"UVXf"", uv);
                          }
+                    }
                  }
  
                  if (flags & UTF8_DISALLOW_SURROGATE) {
                      disallowed = TRUE;
+                    *errors |= UTF8_GOT_SURROGATE;
                  }
              }
              else if (possible_problems & UTF8_GOT_SUPER) {
                  possible_problems &= ~UTF8_GOT_SUPER;
  
-                if (   (flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY))
-                                                            == UTF8_WARN_SUPER
-                    && ckWARN_d(WARN_NON_UNICODE))
-                {
+                if (flags & UTF8_WARN_SUPER) {
+                    *errors |= UTF8_GOT_SUPER;
+
+                    if (   ! (flags & UTF8_CHECK_ONLY)
+                        && ckWARN_d(WARN_NON_UNICODE))
+                    {
                          pack_warn = packWARN(WARN_NON_UNICODE);
  
                          if (orig_problems & UTF8_GOT_TOO_SHORT) {
@@ -1338,6 +1466,7 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
                                                  " Unicode, may not be portable",
                                                  uv);
                          }
+                    }
                  }
  
                  /* The maximum code point ever specified by a standard was
@@ -1378,12 +1507,17 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
                          }
                      }
  
-                    if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
-                        disallowed = TRUE;
+                    if (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT)) {
+                        *errors |= UTF8_GOT_ABOVE_31_BIT;
+
+                        if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
+                            disallowed = TRUE;
+                        }
                      }
                  }
  
                  if (flags & UTF8_DISALLOW_SUPER) {
+                    *errors |= UTF8_GOT_SUPER;
                      disallowed = TRUE;
                  }
  
@@ -1405,10 +1539,12 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
              else if (possible_problems & UTF8_GOT_NONCHAR) {
                  possible_problems &= ~UTF8_GOT_NONCHAR;
  
-                if (  (flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY))
-                                                        == UTF8_WARN_NONCHAR
-                    && ckWARN_d(WARN_NONCHAR))
-                {
+                if (flags & UTF8_WARN_NONCHAR) {
+                    *errors |= UTF8_GOT_NONCHAR;
+
+                    if (  ! (flags & UTF8_CHECK_ONLY)
+                        && ckWARN_d(WARN_NONCHAR))
+                    {
                          /* The code above should have guaranteed that we don't
                           * get here with errors other than overlong */
                          assert (! (orig_problems
@@ -1418,10 +1554,12 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
                          message = Perl_form(aTHX_ "Unicode non-character"
                                                  " U+%04"UVXf" is not recommended"
                                                  " for open interchange", uv);
+                    }
                  }
  
                  if (flags & UTF8_DISALLOW_NONCHAR) {
                      disallowed = TRUE;
+                    *errors |= UTF8_GOT_NONCHAR;
                  }
              } /* End of looking through the possible flags */
  
diff --git a/utf8.h b/utf8.h

index c55ce26..aa1c69d 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -72,6 +72,8 @@ the string is invariant.
  #define utf8_to_uvchr_buf(s, e, lenp)                                          \
                       utf8n_to_uvchr(s, (U8*)(e) - (U8*)(s), lenp,              \
                                      ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY)
+#define utf8n_to_uvchr(s, len, lenp, flags)                                    \
+                                utf8n_to_uvchr_error(s, len, lenp, flags, 0)
  
  #define to_uni_fold(c, p, lenp) _to_uni_fold_flags(c, p, lenp, FOLD_FLAGS_FULL)
  #define to_utf8_fold(c, p, lenp) _to_utf8_fold_flags(c, p, lenp, FOLD_FLAGS_FULL)
author	Karl Williamson <khw@cpan.org>
	Tue, 11 Oct 2016 03:18:37 +0000 (21:18 -0600)
committer	Karl Williamson <khw@cpan.org>
	Thu, 13 Oct 2016 17:18:12 +0000 (11:18 -0600)
embed.fnc		patch \| blob \| blame \| history
embed.h		patch \| blob \| blame \| history
ext/XS-APItest/APItest.xs		patch \| blob \| blame \| history
ext/XS-APItest/t/utf8.t		patch \| blob \| blame \| history
pod/perldelta.pod		patch \| blob \| blame \| history
proto.h		patch \| blob \| blame \| history
utf8.c		patch \| blob \| blame \| history
utf8.h		patch \| blob \| blame \| history