This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Add utf8n_to_uvchr_error
authorKarl Williamson <khw@cpan.org>
Tue, 11 Oct 2016 03:18:37 +0000 (21:18 -0600)
committerKarl Williamson <khw@cpan.org>
Thu, 13 Oct 2016 17:18:12 +0000 (11:18 -0600)
This new function behaves like utf8n_to_uvchr(), but takes an extra
parameter that points to a U32 which will be set to 0 if no errors are
found; otherwise each error found will set a bit in it.  This can be
used by the caller to figure out precisely what the error(s) is/are.
Previously, one would have to capture and parse the warning/error
messages raised.   This can be used, for example, to customize the
messages to the expected end-user's knowledge level.

embed.fnc
embed.h
ext/XS-APItest/APItest.xs
ext/XS-APItest/t/utf8.t
pod/perldelta.pod
proto.h
utf8.c
utf8.h

index 64a812b..46426b6 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1737,7 +1737,15 @@ Amd      |UV     |utf8_to_uvchr_buf      |NN const U8 *s|NN const U8 *send|NULLOK STRLEN *retl
 ApdD   |UV     |utf8_to_uvuni_buf      |NN const U8 *s|NN const U8 *send|NULLOK STRLEN *retlen
 pM     |bool   |check_utf8_print       |NN const U8 *s|const STRLEN len
 
-Adp    |UV     |utf8n_to_uvchr |NN const U8 *s|STRLEN curlen|NULLOK STRLEN *retlen|const U32 flags
+Adop   |UV     |utf8n_to_uvchr |NN const U8 *s                             \
+                               |STRLEN curlen                              \
+                               |NULLOK STRLEN *retlen                      \
+                               |const U32 flags
+Adp    |UV     |utf8n_to_uvchr_error|NN const U8 *s                        \
+                               |STRLEN curlen                              \
+                               |NULLOK STRLEN *retlen                      \
+                               |const U32 flags                            \
+                               |NULLOK U32 * errors
 AipnR  |UV     |valid_utf8_to_uvchr    |NN const U8 *s|NULLOK STRLEN *retlen
 Ap     |UV     |utf8n_to_uvuni|NN const U8 *s|STRLEN curlen|NULLOK STRLEN *retlen|U32 flags
 
diff --git a/embed.h b/embed.h
index c2ed3f1..5df381c 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define utf8_to_uvchr(a,b)     Perl_utf8_to_uvchr(aTHX_ a,b)
 #define utf8_to_uvuni(a,b)     Perl_utf8_to_uvuni(aTHX_ a,b)
 #define utf8_to_uvuni_buf(a,b,c)       Perl_utf8_to_uvuni_buf(aTHX_ a,b,c)
-#define utf8n_to_uvchr(a,b,c,d)        Perl_utf8n_to_uvchr(aTHX_ a,b,c,d)
+#define utf8n_to_uvchr_error(a,b,c,d,e)        Perl_utf8n_to_uvchr_error(aTHX_ a,b,c,d,e)
 #define utf8n_to_uvuni(a,b,c,d)        Perl_utf8n_to_uvuni(aTHX_ a,b,c,d)
 #define uvoffuni_to_utf8_flags(a,b,c)  Perl_uvoffuni_to_utf8_flags(aTHX_ a,b,c)
 #define uvuni_to_utf8(a,b)     Perl_uvuni_to_utf8(aTHX_ a,b)
index ce94968..6b6f45f 100644 (file)
@@ -1362,7 +1362,7 @@ bytes_cmp_utf8(bytes, utf8)
        RETVAL
 
 AV *
-test_utf8n_to_uvchr(s, len, flags)
+test_utf8n_to_uvchr_error(s, len, flags)
 
         SV *s
         SV *len
@@ -1371,20 +1371,25 @@ test_utf8n_to_uvchr(s, len, flags)
         STRLEN retlen;
         UV ret;
         STRLEN slen;
+        U32 errors;
 
     CODE:
-        /* Call utf8n_to_uvchr() with the inputs.  It always asks for the
-         * actual length to be returned
+        /* Now that utf8n_to_uvchr() is a trivial wrapper for
+         * utf8n_to_uvchr_error(), call the latter with the inputs.  It always
+         * asks for the actual length to be returned and errors to be returned
          *
          * Length to assume <s> is; not checked, so could have buffer overflow
          */
         RETVAL = newAV();
         sv_2mortal((SV*)RETVAL);
 
-        ret
-         = utf8n_to_uvchr((U8*) SvPV(s, slen), SvUV(len), &retlen, SvUV(flags));
+        ret = utf8n_to_uvchr_error((U8*) SvPV(s, slen),
+                                         SvUV(len),
+                                         &retlen,
+                                         SvUV(flags),
+                                         &errors);
 
-        /* Returns the return value in [0]; <retlen> in [1] */
+        /* Returns the return value in [0]; <retlen> in [1], <errors> in [2] */
         av_push(RETVAL, newSVuv(ret));
         if (retlen == (STRLEN) -1) {
             av_push(RETVAL, newSViv(-1));
@@ -1392,6 +1397,7 @@ test_utf8n_to_uvchr(s, len, flags)
         else {
             av_push(RETVAL, newSVuv(retlen));
         }
+        av_push(RETVAL, newSVuv(errors));
 
     OUTPUT:
         RETVAL
index 9e75c25..0f2d9ee 100644 (file)
@@ -8,7 +8,7 @@ no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
                           # machines, and that is tested elsewhere
 
 use XS::APItest;
-
+use Data::Dumper;
 my $pound_sign = chr utf8::unicode_to_native(163);
 
 sub isASCII { ord "A" == 65 }
@@ -87,20 +87,30 @@ sub start_byte_to_cont($) {
 my $is64bit = length sprintf("%x", ~0) > 8;
 
 
-# Test utf8n_to_uvchr().  These provide essentially complete code coverage.
-# Copied from utf8.h
+# Test utf8n_to_uvchr_error().  These provide essentially complete code
+# coverage.  Copied from utf8.h
 my $UTF8_ALLOW_EMPTY            = 0x0001;
+my $UTF8_GOT_EMPTY              = $UTF8_ALLOW_EMPTY;
 my $UTF8_ALLOW_CONTINUATION     = 0x0002;
+my $UTF8_GOT_CONTINUATION       = $UTF8_ALLOW_CONTINUATION;
 my $UTF8_ALLOW_NON_CONTINUATION = 0x0004;
+my $UTF8_GOT_NON_CONTINUATION   = $UTF8_ALLOW_NON_CONTINUATION;
 my $UTF8_ALLOW_SHORT            = 0x0008;
+my $UTF8_GOT_SHORT              = $UTF8_ALLOW_SHORT;
 my $UTF8_ALLOW_LONG             = 0x0010;
+my $UTF8_GOT_LONG               = $UTF8_ALLOW_LONG;
+my $UTF8_GOT_OVERFLOW           = 0x0020;
 my $UTF8_DISALLOW_SURROGATE     = 0x0040;
+my $UTF8_GOT_SURROGATE          = $UTF8_DISALLOW_SURROGATE;
 my $UTF8_WARN_SURROGATE         = 0x0080;
 my $UTF8_DISALLOW_NONCHAR       = 0x0100;
+my $UTF8_GOT_NONCHAR            = $UTF8_DISALLOW_NONCHAR;
 my $UTF8_WARN_NONCHAR           = 0x0200;
 my $UTF8_DISALLOW_SUPER         = 0x0400;
+my $UTF8_GOT_SUPER              = $UTF8_DISALLOW_SUPER;
 my $UTF8_WARN_SUPER             = 0x0800;
 my $UTF8_DISALLOW_ABOVE_31_BIT  = 0x1000;
+my $UTF8_GOT_ABOVE_31_BIT       = $UTF8_DISALLOW_ABOVE_31_BIT;
 my $UTF8_WARN_ABOVE_31_BIT      = 0x2000;
 my $UTF8_CHECK_ONLY             = 0x4000;
 my $UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE
@@ -559,19 +569,23 @@ for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
 
     my $display_flags = sprintf "0x%x", $this_utf8_flags;
     my $display_bytes = display_bytes($bytes);
-    my $ret_ref = test_utf8n_to_uvchr($bytes, $len, $this_utf8_flags);
+    my $ret_ref = test_utf8n_to_uvchr_error($bytes, $len, $this_utf8_flags);
 
     # Rest of tests likely meaningless if it gets the wrong code point.
     next unless is($ret_ref->[0], $n,
-       "Verify utf8n_to_uvchr($display_bytes, $display_flags) returns $hex_n");
+                   "Verify utf8n_to_uvchr_error($display_bytes, $display_flags)"
+                 . "returns $hex_n");
     is($ret_ref->[1], $len,
-       "Verify utf8n_to_uvchr() for $hex_n returns expected length: $len");
+       "Verify utf8n_to_uvchr_error() for $hex_n returns expected length:"
+     . " $len");
 
     unless (is(scalar @warnings, 0,
-               "Verify utf8n_to_uvchr() for $hex_n generated no warnings"))
+             "Verify utf8n_to_uvchr_error() for $hex_n generated no warnings"))
     {
         output_warnings(@warnings);
     }
+    is($ret_ref->[2], 0,
+       "Verify utf8n_to_uvchr_error() returned no error bits");
 
     undef @warnings;
 
@@ -999,36 +1013,36 @@ my $REPLACEMENT = 0xFFFD;
 # Now test the malformations.  All these raise category utf8 warnings.
 my @malformations = (
     [ "zero length string malformation", "", 0,
-        $UTF8_ALLOW_EMPTY, 0, 0,
+        $UTF8_ALLOW_EMPTY, $UTF8_GOT_EMPTY, 0, 0,
         qr/empty string/
     ],
     [ "orphan continuation byte malformation", I8_to_native("${I8c}a"),
         2,
-        $UTF8_ALLOW_CONTINUATION, $REPLACEMENT, 1,
+        $UTF8_ALLOW_CONTINUATION, $UTF8_GOT_CONTINUATION, $REPLACEMENT, 1,
         qr/unexpected continuation byte/
     ],
     [ "premature next character malformation (immediate)",
         (isASCII) ? "\xc2\xc2\x80" : I8_to_native("\xc5\xc5\xa0"),
         3,
-        $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 1,
+        $UTF8_ALLOW_NON_CONTINUATION, $UTF8_GOT_NON_CONTINUATION, $REPLACEMENT, 1,
         qr/unexpected non-continuation byte.*immediately after start byte/
     ],
     [ "premature next character malformation (non-immediate)",
         I8_to_native("\xf1${I8c}a"),
         3,
-        $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 2,
+        $UTF8_ALLOW_NON_CONTINUATION, $UTF8_GOT_NON_CONTINUATION, $REPLACEMENT, 2,
         qr/unexpected non-continuation byte .* 2 bytes after start byte/
     ],
     [ "too short malformation", I8_to_native("\xf1${I8c}a"), 2,
         # Having the 'a' after this, but saying there are only 2 bytes also
         # tests that we pay attention to the passed in length
-        $UTF8_ALLOW_SHORT, $REPLACEMENT, 2,
+        $UTF8_ALLOW_SHORT, $UTF8_GOT_SHORT, $REPLACEMENT, 2,
         qr/2 bytes, need 4/
     ],
     [ "overlong malformation, lowest 2-byte",
         (isASCII) ? "\xc0\x80" : I8_to_native("\xc0\xa0"),
         2,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         0,   # NUL
         2,
         qr/overlong/
@@ -1036,7 +1050,7 @@ my @malformations = (
     [ "overlong malformation, highest 2-byte",
         (isASCII) ? "\xc1\xbf" : I8_to_native("\xc4\xbf"),
         2,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         (isASCII) ? 0x7F : utf8::unicode_to_native(0xBF),
         2,
         qr/overlong/
@@ -1044,7 +1058,7 @@ my @malformations = (
     [ "overlong malformation, lowest 3-byte",
         (isASCII) ? "\xe0\x80\x80" : I8_to_native("\xe0\xa0\xa0"),
         3,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         0,   # NUL
         3,
         qr/overlong/
@@ -1052,7 +1066,7 @@ my @malformations = (
     [ "overlong malformation, highest 3-byte",
         (isASCII) ? "\xe0\x9f\xbf" : I8_to_native("\xe0\xbf\xbf"),
         3,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         (isASCII) ? 0x7FF : 0x3FF,
         3,
         qr/overlong/
@@ -1060,7 +1074,7 @@ my @malformations = (
     [ "overlong malformation, lowest 4-byte",
         (isASCII) ? "\xf0\x80\x80\x80" : I8_to_native("\xf0\xa0\xa0\xa0"),
         4,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         0,   # NUL
         4,
         qr/overlong/
@@ -1068,7 +1082,7 @@ my @malformations = (
     [ "overlong malformation, highest 4-byte",
         (isASCII) ? "\xf0\x8F\xbf\xbf" : I8_to_native("\xf0\xaf\xbf\xbf"),
         4,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         (isASCII) ? 0xFFFF : 0x3FFF,
         4,
         qr/overlong/
@@ -1078,7 +1092,7 @@ my @malformations = (
          ?              "\xf8\x80\x80\x80\x80"
          : I8_to_native("\xf8\xa0\xa0\xa0\xa0"),
         5,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         0,   # NUL
         5,
         qr/overlong/
@@ -1088,7 +1102,7 @@ my @malformations = (
          ?              "\xf8\x87\xbf\xbf\xbf"
          : I8_to_native("\xf8\xa7\xbf\xbf\xbf"),
         5,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         (isASCII) ? 0x1FFFFF : 0x3FFFF,
         5,
         qr/overlong/
@@ -1098,7 +1112,7 @@ my @malformations = (
          ?              "\xfc\x80\x80\x80\x80\x80"
          : I8_to_native("\xfc\xa0\xa0\xa0\xa0\xa0"),
         6,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         0,   # NUL
         6,
         qr/overlong/
@@ -1108,7 +1122,7 @@ my @malformations = (
          ?              "\xfc\x83\xbf\xbf\xbf\xbf"
          : I8_to_native("\xfc\xa3\xbf\xbf\xbf\xbf"),
         6,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         (isASCII) ? 0x3FFFFFF : 0x3FFFFF,
         6,
         qr/overlong/
@@ -1118,7 +1132,7 @@ my @malformations = (
          ?              "\xfe\x80\x80\x80\x80\x80\x80"
          : I8_to_native("\xfe\xa0\xa0\xa0\xa0\xa0\xa0"),
         7,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         0,   # NUL
         7,
         qr/overlong/
@@ -1128,7 +1142,7 @@ my @malformations = (
          ?              "\xfe\x81\xbf\xbf\xbf\xbf\xbf"
          : I8_to_native("\xfe\xa1\xbf\xbf\xbf\xbf\xbf"),
         7,
-        $UTF8_ALLOW_LONG,
+        $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
         (isASCII) ? 0x7FFFFFFF : 0x3FFFFFF,
         7,
         qr/overlong/
@@ -1142,6 +1156,7 @@ if (isASCII && ! $is64bit) {    # 32-bit ASCII platform
             "\xfe\x84\x80\x80\x80\x80\x80",  # Represents 2**32
             7,
             0,  # There is no way to allow this malformation
+            $UTF8_GOT_OVERFLOW,
             $REPLACEMENT,
             7,
             qr/overflows/
@@ -1150,6 +1165,7 @@ if (isASCII && ! $is64bit) {    # 32-bit ASCII platform
             "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
             13,
             0,  # There is no way to allow this malformation
+            $UTF8_GOT_OVERFLOW,
             $REPLACEMENT,
             13,
             qr/overflows/
@@ -1167,7 +1183,7 @@ else {
              ?              "\xff\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
              : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
             (isASCII) ? 13 : 14,
-            $UTF8_ALLOW_LONG,
+            $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
             0,   # NUL
             (isASCII) ? 13 : 14,
             qr/overlong/,
@@ -1177,7 +1193,7 @@ else {
              ?              "\xff\x80\x80\x80\x80\x80\x80\xbf\xbf\xbf\xbf\xbf\xbf"
              : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xbf\xbf\xbf\xbf\xbf\xbf"),
             (isASCII) ? 13 : 14,
-            $UTF8_ALLOW_LONG,
+            $UTF8_ALLOW_LONG, $UTF8_GOT_LONG,
             (isASCII) ? 0xFFFFFFFFF : 0x3FFFFFFF,
             (isASCII) ? 13 : 14,
             qr/overlong/,
@@ -1189,6 +1205,7 @@ else {
             I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
             14,
             0,  # There is no way to allow this malformation
+            $UTF8_GOT_OVERFLOW,
             $REPLACEMENT,
             14,
             qr/overflows/
@@ -1202,6 +1219,7 @@ else {
                 : I8_to_native("\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                 (isASCII) ? 13 : 14,
                 0,  # There is no way to allow this malformation
+                $UTF8_GOT_OVERFLOW,
                 $REPLACEMENT,
                 (isASCII) ? 13 : 14,
                 qr/overflows/
@@ -1210,7 +1228,8 @@ else {
 }
 
 foreach my $test (@malformations) {
-    my ($testname, $bytes, $length, $allow_flags, $allowed_uv, $expected_len, $message ) = @$test;
+    my ($testname, $bytes, $length, $allow_flags, $expected_error_flags,
+        $allowed_uv, $expected_len, $message ) = @$test;
 
     if (length($bytes) < $length) {
         fail("Internal test error: actual buffer length (" . length($bytes)
@@ -1314,49 +1333,79 @@ foreach my $test (@malformations) {
 
     # Test what happens when this malformation is not allowed
     undef @warnings;
-    my $ret_ref = test_utf8n_to_uvchr($bytes, $length, 0);
+    my $ret_ref = test_utf8n_to_uvchr_error($bytes, $length, 0);
     is($ret_ref->[0], 0, "$testname: disallowed: Returns 0");
-    is($ret_ref->[1], $expected_len, "$testname: utf8n_to_uvchr(), disallowed: Returns expected length: $expected_len");
-    if (is(scalar @warnings, 1, "$testname: disallowed: Got a single warning ")) {
-        like($warnings[0], $message, "$testname: disallowed: Got expected warning");
+    is($ret_ref->[1], $expected_len,
+       "$testname: utf8n_to_uvchr_error(), disallowed: Returns expected"
+     . " length: $expected_len");
+    if (is(scalar @warnings, 1,
+           "$testname: disallowed: Got a single warning "))
+    {
+        like($warnings[0], $message,
+             "$testname: disallowed: Got expected warning");
     }
     else {
         if (scalar @warnings) {
             output_warnings(@warnings);
         }
     }
+    is($ret_ref->[2], $expected_error_flags,
+       "$testname: utf8n_to_uvchr_error(), disallowed:"
+     . " Returns expected error");
 
     {   # Next test when disallowed, and warnings are off.
         undef @warnings;
         no warnings 'utf8';
-        my $ret_ref = test_utf8n_to_uvchr($bytes, $length, 0);
-        is($ret_ref->[0], 0, "$testname: utf8n_to_uvchr(), disallowed: no warnings 'utf8': Returns 0");
-        is($ret_ref->[1], $expected_len, "$testname: utf8n_to_uvchr(), disallowed: no warnings 'utf8': Returns expected length: $expected_len");
-        if (!is(scalar @warnings, 0, "$testname: utf8n_to_uvchr(), disallowed: no warnings 'utf8': no warnings generated")) {
+        my $ret_ref = test_utf8n_to_uvchr_error($bytes, $length, 0);
+        is($ret_ref->[0], 0,
+           "$testname: utf8n_to_uvchr_error(), disallowed: no warnings 'utf8':"
+         . " Returns 0");
+        is($ret_ref->[1], $expected_len,
+           "$testname: utf8n_to_uvchr_error(), disallowed: no warnings 'utf8':"
+         . " Returns expected length: $expected_len");
+        if (!is(scalar @warnings, 0,
+            "$testname: utf8n_to_uvchr_error(), disallowed: no warnings 'utf8':"
+          . " no warnings generated"))
+        {
             output_warnings(@warnings);
         }
+        is($ret_ref->[2], $expected_error_flags,
+           "$testname: utf8n_to_uvchr_error(), disallowed: Returns"
+         . " expected error");
     }
 
     # Test with CHECK_ONLY
     undef @warnings;
-    $ret_ref = test_utf8n_to_uvchr($bytes, $length, $UTF8_CHECK_ONLY);
+    $ret_ref = test_utf8n_to_uvchr_error($bytes, $length, $UTF8_CHECK_ONLY);
     is($ret_ref->[0], 0, "$testname: CHECK_ONLY: Returns 0");
     is($ret_ref->[1], -1, "$testname: CHECK_ONLY: returns -1 for length");
     if (! is(scalar @warnings, 0, "$testname: CHECK_ONLY: no warnings generated")) {
         output_warnings(@warnings);
     }
+    is($ret_ref->[2], $expected_error_flags,
+       "$testname: utf8n_to_uvchr_error(), disallowed: Returns expected"
+     . " error");
 
     next if $allow_flags == 0;    # Skip if can't allow this malformation
 
     # Test when the malformation is allowed
     undef @warnings;
-    $ret_ref = test_utf8n_to_uvchr($bytes, $length, $allow_flags);
-    is($ret_ref->[0], $allowed_uv, "$testname: utf8n_to_uvchr(), allowed: Returns expected uv: " . sprintf("0x%04X", $allowed_uv));
-    is($ret_ref->[1], $expected_len, "$testname: utf8n_to_uvchr(), allowed: Returns expected length: $expected_len");
-    if (!is(scalar @warnings, 0, "$testname: utf8n_to_uvchr(), allowed: no warnings generated"))
+    $ret_ref = test_utf8n_to_uvchr_error($bytes, $length, $allow_flags);
+    is($ret_ref->[0], $allowed_uv,
+       "$testname: utf8n_to_uvchr_error(), allowed: Returns expected uv: "
+     . sprintf("0x%04X", $allowed_uv));
+    is($ret_ref->[1], $expected_len,
+       "$testname: utf8n_to_uvchr_error(), allowed: Returns expected length:"
+     . " $expected_len");
+    if (!is(scalar @warnings, 0,
+            "$testname: utf8n_to_uvchr_error(), allowed: no warnings"
+          . " generated"))
     {
         output_warnings(@warnings);
     }
+    is($ret_ref->[2], $expected_error_flags,
+       "$testname: utf8n_to_uvchr_error(), disallowed: Returns"
+     . " expected error");
 }
 
 sub nonportable_regex ($) {
@@ -1381,35 +1430,35 @@ sub nonportable_regex ($) {
 my @tests = (
     [ "lowest surrogate",
         (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
-        $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
+        $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE, $UTF8_GOT_SURROGATE,
         'surrogate', 0xD800,
         (isASCII) ? 3 : 4,
         qr/surrogate/
     ],
     [ "a middle surrogate",
         (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
-        $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
+        $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE, $UTF8_GOT_SURROGATE,
         'surrogate', 0xD90D,
         (isASCII) ? 3 : 4,
         qr/surrogate/
     ],
     [ "highest surrogate",
         (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
-        $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
+        $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE, $UTF8_GOT_SURROGATE,
         'surrogate', 0xDFFF,
         (isASCII) ? 3 : 4,
         qr/surrogate/
     ],
     [ "first non_unicode",
         (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
-        $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
+        $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER, $UTF8_GOT_SUPER,
         'non_unicode', 0x110000,
         (isASCII) ? 4 : 5,
         qr/(not Unicode|for a non-Unicode code point).* may not be portable/
     ],
     [ "non_unicode whose first byte tells that",
         (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
-        $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
+        $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER, $UTF8_GOT_SUPER,
         'non_unicode',
         (isASCII) ? 0x140000 : 0x200000,
         (isASCII) ? 4 : 5,
@@ -1417,253 +1466,253 @@ my @tests = (
     ],
     [ "first of 32 consecutive non-character code points",
         (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xFDD0,
         (isASCII) ? 3 : 4,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "a mid non-character code point of the 32 consecutive ones",
         (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xFDE0,
         (isASCII) ? 3 : 4,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "final of 32 consecutive non-character code points",
         (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xFDEF,
         (isASCII) ? 3 : 4,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+FFFE",
         (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xFFFE,
         (isASCII) ? 3 : 4,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+FFFF",
         (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xFFFF,
         (isASCII) ? 3 : 4,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+1FFFE",
         (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x1FFFE, 4,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+1FFFF",
         (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x1FFFF, 4,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+2FFFE",
         (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x2FFFE, 4,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+2FFFF",
         (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x2FFFF, 4,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+3FFFE",
         (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x3FFFE, 4,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+3FFFF",
         (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x3FFFF, 4,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+4FFFE",
         (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x4FFFE,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+4FFFF",
         (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x4FFFF,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+5FFFE",
         (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x5FFFE,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+5FFFF",
         (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x5FFFF,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+6FFFE",
         (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x6FFFE,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+6FFFF",
         (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x6FFFF,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+7FFFE",
         (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x7FFFE,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+7FFFF",
         (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x7FFFF,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+8FFFE",
         (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x8FFFE,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+8FFFF",
         (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x8FFFF,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+9FFFE",
         (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x9FFFE,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+9FFFF",
         (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x9FFFF,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+AFFFE",
         (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xAFFFE,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+AFFFF",
         (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xAFFFF,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+BFFFE",
         (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xBFFFE,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+BFFFF",
         (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xBFFFF,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+CFFFE",
         (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xCFFFE,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+CFFFF",
         (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xCFFFF,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+DFFFE",
         (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xDFFFE,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+DFFFF",
         (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xDFFFF,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+EFFFE",
         (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xEFFFE,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+EFFFF",
         (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xEFFFF,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+FFFFE",
         (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xFFFFE,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+FFFFF",
         (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0xFFFFF,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+10FFFE",
         (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x10FFFE,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
     ],
     [ "non-character code point U+10FFFF",
         (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
-        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
+        $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, $UTF8_GOT_NONCHAR,
         'nonchar', 0x10FFFF,
         (isASCII) ? 4 : 5,
         qr/Unicode non-character.*is not recommended for open interchange/
@@ -1675,6 +1724,7 @@ my @tests = (
         # This code point is chosen so that it is representable in a UV on
         # 32-bit machines
         $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
+        $UTF8_GOT_ABOVE_31_BIT,
         'utf8', 0x80000000, (isASCII) ? 7 :14,
         nonportable_regex(0x80000000)
     ],
@@ -1682,7 +1732,7 @@ my @tests = (
         (isASCII)
          ? "\xfe\x82\x80\x80\x80\x80\x80"
          : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
-        $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
+        $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER, $UTF8_GOT_SUPER,
         'utf8', 0x80000000, (isASCII) ? 7 :14,
         nonportable_regex(0x80000000)
     ],
@@ -1702,11 +1752,9 @@ my @tests = (
         : ((isASCII)
            ?              "\xfe\x86\x80\x80\x80\x80\x80"
            : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))),
-
-        # We include both warning categories to make sure the ABOVE_31_BIT one
-        # has precedence
-        "$UTF8_WARN_ABOVE_31_BIT|$UTF8_WARN_SUPER",
-        "$UTF8_DISALLOW_ABOVE_31_BIT",
+        $UTF8_WARN_ABOVE_31_BIT,
+        $UTF8_DISALLOW_ABOVE_31_BIT,
+        $UTF8_GOT_ABOVE_31_BIT,
         'utf8', 0,
         (! isASCII) ? 14 : ($is64bit) ? 13 : 7,
         qr/overflows/
@@ -1721,6 +1769,7 @@ if ($is64bit) {
             ?              "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
             : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
             $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
+            $UTF8_GOT_ABOVE_31_BIT,
             'utf8', 0x1000000000, (isASCII) ? 13 : 14,
             qr/and( is)? not portable/
         ];
@@ -1729,30 +1778,35 @@ if ($is64bit) {
             [ "requires at least 32 bits",
                 I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
+                $UTF8_GOT_ABOVE_31_BIT,
                 'utf8', 0x800000000, 14,
                 nonportable_regex(0x80000000)
             ],
             [ "requires at least 32 bits",
                 I8_to_native("\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
+                $UTF8_GOT_ABOVE_31_BIT,
                 'utf8', 0x10000000000, 14,
                 nonportable_regex(0x10000000000)
             ],
             [ "requires at least 32 bits",
                 I8_to_native("\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
+                $UTF8_GOT_ABOVE_31_BIT,
                 'utf8', 0x200000000000, 14,
                 nonportable_regex(0x20000000000)
             ],
             [ "requires at least 32 bits",
                 I8_to_native("\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
+                $UTF8_GOT_ABOVE_31_BIT,
                 'utf8', 0x4000000000000, 14,
                 nonportable_regex(0x4000000000000)
             ],
             [ "requires at least 32 bits",
                 I8_to_native("\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
+                $UTF8_GOT_ABOVE_31_BIT,
                 'utf8', 0x80000000000000, 14,
                 nonportable_regex(0x80000000000000)
             ],
@@ -1760,6 +1814,7 @@ if ($is64bit) {
                 I8_to_native("\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
                    #IBM-1047  \xFE\x41\x41\x41\x41\x41\x41\x43\x41\x41\x41\x41\x41\x41
                 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
+                $UTF8_GOT_ABOVE_31_BIT,
                 'utf8', 0x1000000000000000, 14,
                 nonportable_regex(0x1000000000000000)
             ];
@@ -1767,7 +1822,8 @@ if ($is64bit) {
 }
 
 foreach my $test (@tests) {
-    my ($testname, $bytes, $warn_flags, $disallow_flags, $category, $allowed_uv, $expected_len, $message ) = @$test;
+    my ($testname, $bytes, $warn_flags, $disallow_flags, $expected_error_flags,
+        $category, $allowed_uv, $expected_len, $message ) = @$test;
 
     my $length = length $bytes;
     my $will_overflow = $testname =~ /overflow/ ? 'overflow' : "";
@@ -1932,13 +1988,17 @@ foreach my $test (@tests) {
                             && ord native_to_I8(substr($bytes, 0, 1)) >= 0xFE;
 
                             my @malformations;
+                            my @expected_errors;
                             push @malformations, $short if $short;
                             push @malformations, $overlong if $overlong;
 
                             # The overflow malformation test in the input
                             # array is coerced into being treated like one of
                             # the others.
-                            push @malformations, 'overflow' if $will_overflow;
+                            if ($will_overflow) {
+                                push @malformations, 'overflow';
+                                push @expected_errors, $UTF8_GOT_OVERFLOW;
+                            }
 
                             my $malformations_name = join "/", @malformations;
                             $malformations_name .= " malformation"
@@ -1972,6 +2032,7 @@ foreach my $test (@tests) {
                                                . $this_bytes;
                                     $this_length = length($this_bytes);
                                     $this_expected_len = 7;
+                                    push @expected_errors, $UTF8_GOT_LONG;
                                 }
                                 if ($malformations_name =~ /short/) {
 
@@ -1979,6 +2040,7 @@ foreach my $test (@tests) {
                                     # enough into the input.
                                     $this_length--;
                                     $this_expected_len--;
+                                    push @expected_errors, $UTF8_GOT_SHORT;
                                 }
                                 elsif ($malformations_name
                                                         =~ /non-continuation/)
@@ -1987,6 +2049,8 @@ foreach my $test (@tests) {
                                     # a non one.
                                     substr($this_bytes, -1, 1) = '?';
                                     $this_expected_len--;
+                                    push @expected_errors,
+                                                    $UTF8_GOT_NON_CONTINUATION;
                                 }
                             }
 
@@ -2003,7 +2067,7 @@ foreach my $test (@tests) {
                             # well
                             my $disallowed = $disallow_flag
                                           || $malformations_name;
-                            my $this_name = "utf8n_to_uvchr() $testname: "
+                            my $this_name = "utf8n_to_uvchr_error() $testname: "
                                                         . (($disallow_flag)
                                                             ? 'disallowed'
                                                             : $disallowed
@@ -2018,11 +2082,13 @@ foreach my $test (@tests) {
                             my $ret_ref;
                             my $display_bytes = display_bytes($this_bytes);
                             my $call = "Call was: $eval_warn; \$ret_ref"
-                                     . " = test_utf8n_to_uvchr('$display_bytes'"
-                                     . ", $this_length, $warn_flag"
+                                     . " = test_utf8n_to_uvchr_error("
+                                     . "'$display_bytes', $this_length,"
+                                     . "$warn_flag"
                                      . "|$disallow_flag)";
                             my $eval_text =      "$eval_warn; \$ret_ref"
-                                     . " = test_utf8n_to_uvchr('$this_bytes',"
+                                     . " = test_utf8n_to_uvchr_error("
+                                     . "'$this_bytes',"
                                      . " $this_length, $warn_flag"
                                      . "|$disallow_flag)";
                             eval "$eval_text";
@@ -2054,6 +2120,31 @@ foreach my $test (@tests) {
                                 diag $call;
                             }
 
+                            my $errors = $ret_ref->[2];
+
+                            for (my $i = @expected_errors - 1; $i >= 0; $i--) {
+                                if (ok($expected_errors[$i] & $errors,
+                                       "Expected and got error bit return"
+                                     . " for $malformations[$i] malformation"))
+                                {
+                                    $errors &= ~$expected_errors[$i];
+                                }
+                                splice @expected_errors, $i, 1;
+                            }
+                            unless (is(scalar @expected_errors, 0,
+                                    "Got all the expected malformation errors"))
+                            {
+                                diag Dumper \@expected_errors;
+                            }
+
+                            if ($warn_flag || $disallow_flag) {
+                                is($errors, $expected_error_flags,
+                                   "Got the correct error flag");
+                            }
+                            else {
+                                is($errors, 0, "Got no other error flag");
+                            }
+
                             if (@malformations) {
                                 if (! $do_warning && $warning eq 'utf8') {
                                     goto no_warnings_expected;
@@ -2119,7 +2210,7 @@ foreach my $test (@tests) {
                             # not just when the $disallow_flag is set
                             if ($disallowed) {
                                 undef @warnings;
-                                $ret_ref = test_utf8n_to_uvchr(
+                                $ret_ref = test_utf8n_to_uvchr_error(
                                                $this_bytes, $this_length,
                                                $disallow_flag|$UTF8_CHECK_ONLY);
                                 unless (is($ret_ref->[0], 0,
@@ -2149,7 +2240,7 @@ foreach my $test (@tests) {
                             next if @malformations;
 
                             # The warning and disallow flags passed in are for
-                            # utf8n_to_uvchr().  Convert them for
+                            # utf8n_to_uvchr_error().  Convert them for
                             # uvchr_to_utf8_flags().
                             my $uvchr_warn_flag = 0;
                             my $uvchr_disallow_flag = 0;
index 62eaa0b..7479528 100644 (file)
@@ -335,6 +335,14 @@ abandon searching for other malformations when the first one is
 encountered.  A call to it thus can generate multiple diagnostics,
 instead of just one.
 
+=item *
+
+A new function, C<L<perlapi/utf8n_to_uvchr_error>>, has been added for
+use by modules that need to know the details of UTF-8 malformations
+beyond pass/fail.  Previously, the only ways to know why a sequence was
+ill-formed was to capture and parse the generated diagnostics, or to do
+your own analysis.
+
 =back
 
 =head1 Selected Bug Fixes
diff --git a/proto.h b/proto.h
index 5b54bda..701dc9e 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -3537,6 +3537,9 @@ PERL_CALLCONV UV  Perl_utf8_to_uvuni_buf(pTHX_ const U8 *s, const U8 *send, STRLE
 PERL_CALLCONV UV       Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags);
 #define PERL_ARGS_ASSERT_UTF8N_TO_UVCHR        \
        assert(s)
+PERL_CALLCONV UV       Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags, U32 * errors);
+#define PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR  \
+       assert(s)
 PERL_CALLCONV UV       Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
 #define PERL_ARGS_ASSERT_UTF8N_TO_UVUNI        \
        assert(s)
diff --git a/utf8.c b/utf8.c
index a538296..0432bb0 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -811,7 +811,7 @@ character, and an error return (unless the C<UTF8_CHECK_ONLY> flag is set), as
 in both cases, 0 is returned, and, depending on the malformation, C<retlen> may
 be set to 1.  To disambiguate, upon a zero return, see if the first byte of
 C<s> is 0 as well.  If so, the input was a C<NUL>; if not, the input had an
-error.
+error.  Or you can use C<L</utf8n_to_uvchr_error>>.
 
 Certain code points are considered problematic.  These are Unicode surrogates,
 Unicode non-characters, and code points above the Unicode maximum of 0x10FFFF.
@@ -874,10 +874,112 @@ use and those yet to be assigned, are never considered malformed and never
 warn.
 
 =cut
+
+Also implemented as a macro in utf8.h
+*/
+
+UV
+Perl_utf8n_to_uvchr(pTHX_ const U8 *s,
+                          STRLEN curlen,
+                          STRLEN *retlen,
+                          const U32 flags)
+{
+    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
+
+    return utf8n_to_uvchr_error(s, curlen, retlen, flags, NULL);
+}
+
+/*
+
+=for apidoc utf8n_to_uvchr_error
+
+THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
+Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
+
+This function is for code that needs to know what the precise malformation(s)
+are when an error is found.
+
+It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
+all the others, C<errors>.  If this parameter is 0, this function behaves
+identically to C<L</utf8n_to_uvchr>>.  Otherwise, C<errors> should be a pointer
+to a C<U32> variable, which this function sets to indicate any errors found.
+Upon return, if C<*errors> is 0, there were no errors found.  Otherwise,
+C<*errors> is the bit-wise C<OR> of the bits described in the list below.  Some
+of these bits will be set if a malformation is found, even if the input
+C<flags> parameter indicates that the given malformation is allowed; the
+exceptions are noted:
+
+=over 4
+
+=item C<UTF8_GOT_ABOVE_31_BIT>
+
+The code point represented by the input UTF-8 sequence occupies more than 31
+bits.
+This bit is set only if the input C<flags> parameter contains either the
+C<UTF8_DISALLOW_ABOVE_31_BIT> or the C<UTF8_WARN_ABOVE_31_BIT> flags.
+
+=item C<UTF8_GOT_CONTINUATION>
+
+The input sequence was malformed in that the first byte was a a UTF-8
+continuation byte.
+
+=item C<UTF8_GOT_EMPTY>
+
+The input C<curlen> parameter was 0.
+
+=item C<UTF8_GOT_LONG>
+
+The input sequence was malformed in that there is some other sequence that
+evaluates to the same code point, but that sequence is shorter than this one.
+
+=item C<UTF8_GOT_NONCHAR>
+
+The code point represented by the input UTF-8 sequence is for a Unicode
+non-character code point.
+This bit is set only if the input C<flags> parameter contains either the
+C<UTF8_DISALLOW_NONCHAR> or the C<UTF8_WARN_NONCHAR> flags.
+
+=item C<UTF8_GOT_NON_CONTINUATION>
+
+The input sequence was malformed in that a non-continuation type byte was found
+in a position where only a continuation type one should be.
+
+=item C<UTF8_GOT_OVERFLOW>
+
+The input sequence was malformed in that it is for a code point that is not
+representable in the number of bits available in a UV on the current platform.
+
+=item C<UTF8_GOT_SHORT>
+
+The input sequence was malformed in that C<curlen> is smaller than required for
+a complete sequence.  In other words, the input is for a partial character
+sequence.
+
+=item C<UTF8_GOT_SUPER>
+
+The input sequence was malformed in that it is for a non-Unicode code point;
+that is, one above the legal Unicode maximum.
+This bit is set only if the input C<flags> parameter contains either the
+C<UTF8_DISALLOW_SUPER> or the C<UTF8_WARN_SUPER> flags.
+
+=item C<UTF8_GOT_SURROGATE>
+
+The input sequence was malformed in that it is for a -Unicode UTF-16 surrogate
+code point.
+This bit is set only if the input C<flags> parameter contains either the
+C<UTF8_DISALLOW_SURROGATE> or the C<UTF8_WARN_SURROGATE> flags.
+
+=back
+
+=cut
 */
 
 UV
-Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags)
+Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
+                                STRLEN curlen,
+                                STRLEN *retlen,
+                                const U32 flags,
+                                U32 * errors)
 {
     const U8 * const s0 = s;
     U8 * send = NULL;           /* (initialized to silence compilers' wrong
@@ -888,6 +990,8 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
     STRLEN expectlen   = 0;     /* How long should this sequence be?
                                    (initialized to silence compilers' wrong
                                    warning) */
+    U32 discard_errors = 0;     /* Used to save branches when 'errors' is NULL;
+                                   this gets set and discarded */
 
     /* The below are used only if there is both an overlong malformation and a
      * too short one.  Otherwise the first two are set to 's0' and 'send', and
@@ -896,7 +1000,14 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
     U8 * adjusted_send;
     UV uv_so_far = 0;   /* (Initialized to silence compilers' wrong warning) */
 
-    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
+    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
+
+    if (errors) {
+        *errors = 0;
+    }
+    else {
+        errors = &discard_errors;
+    }
 
     /* The order of malformation tests here is important.  We should consume as
      * few bytes as possible in order to not skip any valid character.  This is
@@ -1162,6 +1273,15 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
                  * handle all three cases here */
                 possible_problems
                   &= ~(UTF8_GOT_OVERFLOW|UTF8_GOT_SUPER|UTF8_GOT_ABOVE_31_BIT);
+                *errors |= UTF8_GOT_OVERFLOW;
+
+                /* But the API says we flag all errors found */
+                if (flags & (UTF8_WARN_SUPER|UTF8_DISALLOW_SUPER)) {
+                    *errors |= UTF8_GOT_SUPER;
+                }
+                if (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT)) {
+                    *errors |= UTF8_GOT_ABOVE_31_BIT;
+                }
 
                 disallowed = TRUE;
 
@@ -1186,6 +1306,7 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
             }
             else if (possible_problems & UTF8_GOT_EMPTY) {
                 possible_problems &= ~UTF8_GOT_EMPTY;
+                *errors |= UTF8_GOT_EMPTY;
 
                 if (! (flags & UTF8_ALLOW_EMPTY)) {
                     disallowed = TRUE;
@@ -1198,6 +1319,7 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
             }
             else if (possible_problems & UTF8_GOT_CONTINUATION) {
                 possible_problems &= ~UTF8_GOT_CONTINUATION;
+                *errors |= UTF8_GOT_CONTINUATION;
 
                 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
                     disallowed = TRUE;
@@ -1213,6 +1335,7 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
             }
             else if (possible_problems & UTF8_GOT_NON_CONTINUATION) {
                 possible_problems &= ~UTF8_GOT_NON_CONTINUATION;
+                *errors |= UTF8_GOT_NON_CONTINUATION;
 
                 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
                     disallowed = TRUE;
@@ -1228,6 +1351,7 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
             }
             else if (possible_problems & UTF8_GOT_SHORT) {
                 possible_problems &= ~UTF8_GOT_SHORT;
+                *errors |= UTF8_GOT_SHORT;
 
                 if (! (flags & UTF8_ALLOW_SHORT)) {
                     disallowed = TRUE;
@@ -1246,6 +1370,7 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
             }
             else if (possible_problems & UTF8_GOT_LONG) {
                 possible_problems &= ~UTF8_GOT_LONG;
+                *errors |= UTF8_GOT_LONG;
 
                 if (! (flags & UTF8_ALLOW_LONG)) {
                     disallowed = TRUE;
@@ -1289,13 +1414,12 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
             else if (possible_problems & UTF8_GOT_SURROGATE) {
                 possible_problems &= ~UTF8_GOT_SURROGATE;
 
-                /* By adding UTF8_CHECK_ONLY to the test, we avoid unnecessary
-                 * generation of the format, since no warnings are raised under
-                 * CHECK */
-                if (   (flags & (UTF8_WARN_SURROGATE|UTF8_CHECK_ONLY))
-                                                        == UTF8_WARN_SURROGATE
-                    && ckWARN_d(WARN_SURROGATE))
-                {
+                if (flags & UTF8_WARN_SURROGATE) {
+                    *errors |= UTF8_GOT_SURROGATE;
+
+                    if (   ! (flags & UTF8_CHECK_ONLY)
+                        && ckWARN_d(WARN_SURROGATE))
+                    {
                         pack_warn = packWARN(WARN_SURROGATE);
 
                         /* These are the only errors that can occur with a
@@ -1310,19 +1434,23 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
                             message = Perl_form(aTHX_
                                             "UTF-16 surrogate U+%04"UVXf"", uv);
                         }
+                    }
                 }
 
                 if (flags & UTF8_DISALLOW_SURROGATE) {
                     disallowed = TRUE;
+                    *errors |= UTF8_GOT_SURROGATE;
                 }
             }
             else if (possible_problems & UTF8_GOT_SUPER) {
                 possible_problems &= ~UTF8_GOT_SUPER;
 
-                if (   (flags & (UTF8_WARN_SUPER|UTF8_CHECK_ONLY))
-                                                            == UTF8_WARN_SUPER
-                    && ckWARN_d(WARN_NON_UNICODE))
-                {
+                if (flags & UTF8_WARN_SUPER) {
+                    *errors |= UTF8_GOT_SUPER;
+
+                    if (   ! (flags & UTF8_CHECK_ONLY)
+                        && ckWARN_d(WARN_NON_UNICODE))
+                    {
                         pack_warn = packWARN(WARN_NON_UNICODE);
 
                         if (orig_problems & UTF8_GOT_TOO_SHORT) {
@@ -1338,6 +1466,7 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
                                                 " Unicode, may not be portable",
                                                 uv);
                         }
+                    }
                 }
 
                 /* The maximum code point ever specified by a standard was
@@ -1378,12 +1507,17 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
                         }
                     }
 
-                    if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
-                        disallowed = TRUE;
+                    if (flags & (UTF8_WARN_ABOVE_31_BIT|UTF8_DISALLOW_ABOVE_31_BIT)) {
+                        *errors |= UTF8_GOT_ABOVE_31_BIT;
+
+                        if (flags & UTF8_DISALLOW_ABOVE_31_BIT) {
+                            disallowed = TRUE;
+                        }
                     }
                 }
 
                 if (flags & UTF8_DISALLOW_SUPER) {
+                    *errors |= UTF8_GOT_SUPER;
                     disallowed = TRUE;
                 }
 
@@ -1405,10 +1539,12 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
             else if (possible_problems & UTF8_GOT_NONCHAR) {
                 possible_problems &= ~UTF8_GOT_NONCHAR;
 
-                if (  (flags & (UTF8_WARN_NONCHAR|UTF8_CHECK_ONLY))
-                                                        == UTF8_WARN_NONCHAR
-                    && ckWARN_d(WARN_NONCHAR))
-                {
+                if (flags & UTF8_WARN_NONCHAR) {
+                    *errors |= UTF8_GOT_NONCHAR;
+
+                    if (  ! (flags & UTF8_CHECK_ONLY)
+                        && ckWARN_d(WARN_NONCHAR))
+                    {
                         /* The code above should have guaranteed that we don't
                          * get here with errors other than overlong */
                         assert (! (orig_problems
@@ -1418,10 +1554,12 @@ Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32
                         message = Perl_form(aTHX_ "Unicode non-character"
                                                 " U+%04"UVXf" is not recommended"
                                                 " for open interchange", uv);
+                    }
                 }
 
                 if (flags & UTF8_DISALLOW_NONCHAR) {
                     disallowed = TRUE;
+                    *errors |= UTF8_GOT_NONCHAR;
                 }
             } /* End of looking through the possible flags */
 
diff --git a/utf8.h b/utf8.h
index c55ce26..aa1c69d 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -72,6 +72,8 @@ the string is invariant.
 #define utf8_to_uvchr_buf(s, e, lenp)                                          \
                      utf8n_to_uvchr(s, (U8*)(e) - (U8*)(s), lenp,              \
                                     ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY)
+#define utf8n_to_uvchr(s, len, lenp, flags)                                    \
+                                utf8n_to_uvchr_error(s, len, lenp, flags, 0)
 
 #define to_uni_fold(c, p, lenp) _to_uni_fold_flags(c, p, lenp, FOLD_FLAGS_FULL)
 #define to_utf8_fold(c, p, lenp) _to_utf8_fold_flags(c, p, lenp, FOLD_FLAGS_FULL)