Add utf8n_to_uvchr_msgs()
authorKarl Williamson <khw@cpan.org>
Sun, 28 Jan 2018 00:43:00 +0000 (17:43 -0700)
committerKarl Williamson <khw@cpan.org>
Tue, 30 Jan 2018 15:54:40 +0000 (08:54 -0700)
This UTF-8 to code point translator variant is to meet the needs of
Encode, and provides XS authors with more general capability than
the other decoders.

embed.fnc
embed.h
ext/XS-APItest/APItest.xs
ext/XS-APItest/t/utf8_warn_base.pl
proto.h
utf8.c
utf8.h

index e16c8a6..35202e8 100644 (file)
--- a/embed.fnc
+++ b/embed.fnc
@@ -1860,11 +1860,17 @@ Adop    |UV     |utf8n_to_uvchr |NN const U8 *s                             \
                                |STRLEN curlen                              \
                                |NULLOK STRLEN *retlen                      \
                                |const U32 flags
-Ad   |UV     |utf8n_to_uvchr_error|NN const U8 *s                        \
+Adop   |UV     |utf8n_to_uvchr_error|NN const U8 *s                        \
                                |STRLEN curlen                              \
                                |NULLOK STRLEN *retlen                      \
                                |const U32 flags                            \
                                |NULLOK U32 * errors
+Adp    |UV     |utf8n_to_uvchr_msgs|NN const U8 *s                         \
+                               |STRLEN curlen                              \
+                               |NULLOK STRLEN *retlen                      \
+                               |const U32 flags                            \
+                               |NULLOK U32 * errors                        \
+                               |NULLOK AV ** msgs
 AipnR  |UV     |valid_utf8_to_uvchr    |NN const U8 *s|NULLOK STRLEN *retlen
 Ap     |UV     |utf8n_to_uvuni|NN const U8 *s|STRLEN curlen|NULLOK STRLEN *retlen|U32 flags
 
diff --git a/embed.h b/embed.h
index 008b806..334c606 100644 (file)
--- a/embed.h
+++ b/embed.h
 #define utf8_to_uvchr(a,b)     Perl_utf8_to_uvchr(aTHX_ a,b)
 #define utf8_to_uvuni(a,b)     Perl_utf8_to_uvuni(aTHX_ a,b)
 #define utf8_to_uvuni_buf(a,b,c)       Perl_utf8_to_uvuni_buf(aTHX_ a,b,c)
-#define utf8n_to_uvchr_error(a,b,c,d,e)        Perl_utf8n_to_uvchr_error(aTHX_ a,b,c,d,e)
+#define utf8n_to_uvchr_msgs(a,b,c,d,e,f)       Perl_utf8n_to_uvchr_msgs(aTHX_ a,b,c,d,e,f)
 #define utf8n_to_uvuni(a,b,c,d)        Perl_utf8n_to_uvuni(aTHX_ a,b,c,d)
 #define uvoffuni_to_utf8_flags(a,b,c)  Perl_uvoffuni_to_utf8_flags(aTHX_ a,b,c)
 #define uvuni_to_utf8(a,b)     Perl_uvuni_to_utf8(aTHX_ a,b)
index 0ad0823..a693318 100644 (file)
@@ -1379,6 +1379,46 @@ bytes_cmp_utf8(bytes, utf8)
     OUTPUT:
        RETVAL
 
+AV *
+test_utf8n_to_uvchr_msgs(s, len, flags)
+        char *s
+        STRLEN len
+        U32 flags
+    PREINIT:
+        STRLEN retlen;
+        UV ret;
+        U32 errors;
+        AV *msgs = NULL;
+
+    CODE:
+        RETVAL = newAV();
+        sv_2mortal((SV*)RETVAL);
+
+        ret = utf8n_to_uvchr_msgs((U8*)  s,
+                                         len,
+                                         &retlen,
+                                         flags,
+                                         &errors,
+                                         &msgs);
+
+        /* Returns the return value in [0]; <retlen> in [1], <errors> in [2] */
+        av_push(RETVAL, newSVuv(ret));
+        if (retlen == (STRLEN) -1) {
+            av_push(RETVAL, newSViv(-1));
+        }
+        else {
+            av_push(RETVAL, newSVuv(retlen));
+        }
+        av_push(RETVAL, newSVuv(errors));
+
+        /* And any messages in [3] */
+        if (msgs) {
+            av_push(RETVAL, newRV_noinc((SV*)msgs));
+        }
+
+    OUTPUT:
+        RETVAL
+
 AV *
 test_utf8n_to_uvchr_error(s, len, flags)
 
index 91de8a8..037b3e2 100644 (file)
@@ -702,6 +702,9 @@ sub do_warnings_test(@)
 my $num_test_files = $ENV{TEST_JOBS} || 1;
 $num_test_files = 10 if $num_test_files > 10;
 
+# We only really need to test utf8n_to_uvchr_msgs() once with this flag.
+my $tested_CHECK_ONLY = 0;
+
 my $test_count = -1;
 foreach my $test (@tests) {
     $test_count++;
@@ -1347,7 +1350,24 @@ foreach my $test (@tests) {
               }
               else {
                 next if $skip_most_tests;
-            }
+              }
+
+              # This tests three functions.  utf8n_to_uvchr_error,
+              # utf8n_to_uvchr_msgs, and uvchr_to_utf8_flags.  But only the
+              # first two are variants of each other.  We use a loop
+              # 'which_func' to determine which of these.  uvchr_to_utf8_flags
+              # is done separately at the end of each iteration, only when
+              # which_func is 0.  which_func is numeric in part so we don't
+              # have to type in the function name and risk misspelling it
+              # somewhere, and also it sets whether we are expecting warnings
+              # or not in certain places.  The _msgs() version of the function
+              # expects warnings even if lexical ones are turned off, so by
+              # making its which_func == 1, we can say we want warnings;
+              # whereas the other one with the value 0, doesn't get them.
+              for my $which_func (0, 1) {
+                my $func = ($which_func)
+                            ? 'utf8n_to_uvchr_msgs'
+                            : 'utf8n_to_uvchr_error';
 
               # We classify the warnings into certain "interesting" types,
               # described later
@@ -1356,6 +1376,12 @@ foreach my $test (@tests) {
                 foreach my $use_warn_flag (0, 1) {
                     if ($use_warn_flag) {
                         next if $initially_overlong || $initially_orphan;
+
+                        # Since utf8n_to_uvchr_msgs() expects warnings even
+                        # when lexical ones are turned off, we can skip
+                        # testing it when they are turned on, with little
+                        # likelihood of missing an error case.
+                        next if $which_func;
                     }
                     else {
                         next if $skip_most_tests;
@@ -1390,9 +1416,9 @@ foreach my $test (@tests) {
                     }
                     elsif ($warning_type == 1) {
                         $eval_warn = "no warnings";
-                        $expect_regular_warnings = 0;
-                        $expect_warnings_for_overflow = 0;
-                        $expect_warnings_for_malformed = 0;
+                        $expect_regular_warnings = $which_func;
+                        $expect_warnings_for_overflow = $which_func;
+                        $expect_warnings_for_malformed = $which_func;
                     }
                     elsif ($warning_type == 2) {
                         $eval_warn = "no warnings; use warnings 'utf8'";
@@ -1407,7 +1433,7 @@ foreach my $test (@tests) {
                         $expect_regular_warnings = $use_warn_flag;
                         $expect_warnings_for_overflow
                             = $controlling_warning_category eq 'non_unicode';
-                        $expect_warnings_for_malformed = 0;
+                        $expect_warnings_for_malformed = $which_func;
                     }
                     elsif ($warning_type == 4) {  # Like type 3, but uses the
                                                   # PERL_EXTENDED flags
@@ -1567,7 +1593,8 @@ foreach my $test (@tests) {
                         }
                     }
 
-                    my $this_name = "utf8n_to_uvchr_error() $testname: ";
+                    my $this_name = "$func() $testname: ";
+                    my @scratch_expected_return_flags = @expected_return_flags;
                     if (! $initially_malformed) {
                         $this_name .= ($disallowed)
                                        ? 'disallowed, '
@@ -1586,7 +1613,7 @@ foreach my $test (@tests) {
                     my $this_flags
                         = $allow_flags|$this_warning_flags|$this_disallow_flags;
                     my $eval_text =      "$eval_warn; \$ret_ref"
-                            . " = test_utf8n_to_uvchr_error("
+                            . " = test_$func("
                             . "'$this_bytes', $this_length, $this_flags)";
                     eval "$eval_text";
                     if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
@@ -1595,6 +1622,7 @@ foreach my $test (@tests) {
                            . utf8n_display_call($eval_text);
                         next;
                     }
+
                     if ($disallowed) {
                         is($ret_ref->[0], 0, "    And returns 0")
                           or diag "Call was: " . utf8n_display_call($eval_text);
@@ -1612,28 +1640,33 @@ foreach my $test (@tests) {
 
                     my $returned_flags = $ret_ref->[2];
 
-                    for (my $i = @expected_return_flags - 1; $i >= 0; $i--) {
-                        if ($expected_return_flags[$i] & $returned_flags) {
-                            if ($expected_return_flags[$i]
-                                                == $::UTF8_GOT_PERL_EXTENDED)
-                            {
-                                pass("    Expected and got return flag for"
-                                   . " PERL_EXTENDED");
-                            }
-                                   # The first entries in this are
-                                   # malformations
-                            elsif ($i > @malformation_names - 1)  {
-                                pass("    Expected and got return flag"
-                                   . " for " . $controlling_warning_category);
-                            }
-                            else {
-                                pass("    Expected and got return flag for "
-                                   . $malformation_names[$i]
-                                   . " malformation");
-                            }
-                            $returned_flags &= ~$expected_return_flags[$i];
-                            splice @expected_return_flags, $i, 1;
-                        }
+                    for (my $i = @scratch_expected_return_flags - 1;
+                         $i >= 0;
+                         $i--)
+                    {
+                      if ($scratch_expected_return_flags[$i] & $returned_flags)
+                      {
+                          if ($scratch_expected_return_flags[$i]
+                                              == $::UTF8_GOT_PERL_EXTENDED)
+                          {
+                              pass("    Expected and got return flag for"
+                                  . " PERL_EXTENDED");
+                          }
+                                  # The first entries in this are
+                                  # malformations
+                          elsif ($i > @malformation_names - 1)  {
+                              pass("    Expected and got return flag"
+                                  . " for " . $controlling_warning_category);
+                          }
+                          else {
+                              pass("    Expected and got return flag for "
+                                  . $malformation_names[$i]
+                                  . " malformation");
+                          }
+                          $returned_flags
+                                      &= ~$scratch_expected_return_flags[$i];
+                          splice @scratch_expected_return_flags, $i, 1;
+                      }
                     }
 
                     is($returned_flags, 0,
@@ -1644,27 +1677,67 @@ foreach my $test (@tests) {
                                 # We strip off any prefixes from the flag
                                 # names
                              =~ s/ \b [A-Z] _ //xgr);
-                    is (scalar @expected_return_flags, 0,
+                    is (scalar @scratch_expected_return_flags, 0,
                         "    Got all expected return flags")
                         or diag "The expected flags not gotten were: "
                            . (flags_to_text(eval join("|",
-                                                        @expected_return_flags),
+                                                @scratch_expected_return_flags),
                                             \@utf8n_flags_to_text)
                                 # We strip off any prefixes from the flag
                                 # names
                              =~ s/ \b [A-Z] _ //xgr);
 
+                    if ($which_func) {
+                        my @returned_warnings;
+                        for my $element_ref (@{$ret_ref->[3]}) {
+                            push @returned_warnings, $element_ref->{'text'};
+                            my $text = $element_ref->{'text'};
+                            my $flag = $element_ref->{'flag_bit'};
+                            my $category = $element_ref->{'warning_category'};
+
+                            if (! ok(($flag & ($flag-1)) == 0,
+                                      "flag for returned msg is a single bit"))
+                            {
+                              diag sprintf("flags are %x; msg=%s", $flag, $text);
+                            }
+                            else {
+                              if (grep { $_ == $flag } @expected_return_flags) {
+                                  pass("flag for returned msg is expected");
+                              }
+                              else {
+                                  fail("flag for returned msg is expected: "
+                                 . flags_to_text($flag, \@utf8n_flags_to_text));
+                              }
+                            }
+
+                            # In perl space, don't know the category numbers
+                            isnt($category, 0,
+                                          "returned category for msg isn't 0");
+                        }
+
+                        ok(@warnings_gotten == 0, "$func raised no warnings;"
+                              . " the next tests are for ones in the returned"
+                              . " variable")
+                            or diag join "\n", "The unexpected warnings were:",
+                                                              @warnings_gotten;
+                        @warnings_gotten = @returned_warnings;
+                    }
+
                     do_warnings_test(@expected_warnings)
                       or diag "Call was: " . utf8n_display_call($eval_text);
                     undef @warnings_gotten;
 
                     # Check CHECK_ONLY results when the input is
                     # disallowed.  Do this when actually disallowed,
-                    # not just when the $this_disallow_flags is set
-                    if ($disallowed) {
+                    # not just when the $this_disallow_flags is set.  We only
+                    # test once utf8n_to_uvchr_msgs() with this.
+                    if (   $disallowed
+                        && ($which_func == 0 || ! $tested_CHECK_ONLY))
+                    {
+                        $tested_CHECK_ONLY = 1;
                         my $this_flags = $this_disallow_flags|$::UTF8_CHECK_ONLY;
                         my $eval_text = "use warnings; \$ret_ref ="
-                                      . " test_utf8n_to_uvchr_error('"
+                                      . " test_$func('"
                                       . "$this_bytes', $this_length,"
                                       . " $this_flags)";
                         eval $eval_text;
@@ -1693,6 +1766,7 @@ foreach my $test (@tests) {
                     # existing code point, it hasn't overflowed, and isn't
                     # malformed.
                     next if @malformation_names;
+                    next if $which_func;
 
                     $this_warning_flags = ($use_warn_flag)
                                           ? $this_uvchr_flag_to_warn
@@ -1744,6 +1818,7 @@ foreach my $test (@tests) {
                       or diag "Call was: " . uvchr_display_call($eval_text);
                 }
               }
+              }
             }
           }
         }
diff --git a/proto.h b/proto.h
index 911b961..eadfc97 100644 (file)
--- a/proto.h
+++ b/proto.h
@@ -3666,6 +3666,9 @@ PERL_CALLCONV UV  Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *r
 PERL_CALLCONV UV       Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags, U32 * errors);
 #define PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR  \
        assert(s)
+PERL_CALLCONV UV       Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, const U32 flags, U32 * errors, AV ** msgs);
+#define PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS   \
+       assert(s)
 PERL_CALLCONV UV       Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
 #define PERL_ARGS_ASSERT_UTF8N_TO_UVUNI        \
        assert(s)
diff --git a/utf8.c b/utf8.c
index 3123bd0..34e47f3 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1167,7 +1167,8 @@ THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
 Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
 
 This function is for code that needs to know what the precise malformation(s)
-are when an error is found.
+are when an error is found.  If you also need to know the generated warning
+messages, use L</utf8n_to_uvchr_msgs>() instead.
 
 It is like C<L</utf8n_to_uvchr>> but it takes an extra parameter placed after
 all the others, C<errors>.  If this parameter is 0, this function behaves
@@ -1272,14 +1273,81 @@ To do your own error handling, call this function with the C<UTF8_CHECK_ONLY>
 flag to suppress any warnings, and then examine the C<*errors> return.
 
 =cut
+
+Also implemented as a macro in utf8.h
 */
 
 UV
 Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
-                                STRLEN curlen,
-                                STRLEN *retlen,
-                                const U32 flags,
-                                U32 * errors)
+                          STRLEN curlen,
+                          STRLEN *retlen,
+                          const U32 flags,
+                          U32 * errors)
+{
+    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
+
+    return utf8n_to_uvchr_msgs(s, curlen, retlen, flags, errors, NULL);
+}
+
+/*
+
+=for apidoc utf8n_to_uvchr_msgs
+
+THIS FUNCTION SHOULD BE USED IN ONLY VERY SPECIALIZED CIRCUMSTANCES.
+Most code should use L</utf8_to_uvchr_buf>() rather than call this directly.
+
+This function is for code that needs to know what the precise malformation(s)
+are when an error is found, and wants the corresponding warning and/or error
+messages to be returned to the caller rather than be displayed.  All messages
+that would have been displayed if all lexcial warnings are enabled will be
+returned.
+
+It is just like C<L</utf8n_to_uvchr_error>> but it takes an extra parameter
+placed after all the others, C<msgs>.  If this parameter is 0, this function
+behaves identically to C<L</utf8n_to_uvchr_error>>.  Otherwise, C<msgs> should
+be a pointer to an C<AV *> variable, in which this function creates a new AV to
+contain any appropriate messages.  The elements of the array are ordered so
+that the first message that would have been displayed is in the 0th element,
+and so on.  Each element is a hash with three key-value pairs, as follows:
+
+=over 4
+
+=item C<text>
+
+The text of the message as a C<SVpv>.
+
+=item C<warn_categories>
+
+The warning category (or categories) packed into a C<SVuv>.
+
+=item C<flag>
+
+A single flag bit associated with this message, in a C<SVuv>.
+The bit corresponds to some bit in the C<*errors> return value,
+such as C<UTF8_GOT_LONG>.
+
+=back
+
+It's important to note that specifying this parameter as non-null will cause
+any warnings this function would otherwise generate to be suppressed, and
+instead be placed in C<*msgs>.  The caller can check the lexical warnings state
+(or not) when choosing what to do with the returned messages.
+
+If the flag C<UTF8_CHECK_ONLY> is passed, no warnings are generated, and hence
+no AV is created.
+
+The caller, of course, is responsible for freeing any returned AV.
+
+=cut
+*/
+
+UV
+Perl_utf8n_to_uvchr_msgs(pTHX_ const U8 *s,
+                               STRLEN curlen,
+                               STRLEN *retlen,
+                               const U32 flags,
+                               U32 * errors,
+                               AV ** msgs)
 {
     const U8 * const s0 = s;
     U8 * send = NULL;           /* (initialized to silence compilers' wrong
@@ -1302,7 +1370,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                                             routine; see [perl #130921] */
     UV uv_so_far = 0;   /* (Initialized to silence compilers' wrong warning) */
 
-    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_ERROR;
+    PERL_ARGS_ASSERT_UTF8N_TO_UVCHR_MSGS;
 
     if (errors) {
         *errors = 0;
@@ -1576,9 +1644,14 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
         bool disallowed = FALSE;
         const U32 orig_problems = possible_problems;
 
+        if (msgs) {
+            *msgs = NULL;
+        }
+
         while (possible_problems) { /* Handle each possible problem */
             UV pack_warn = 0;
             char * message = NULL;
+            U32 this_flag_bit = 0;
 
             /* Each 'if' clause handles one problem.  They are ordered so that
              * the first ones' messages will be displayed before the later
@@ -1623,16 +1696,17 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                      * necessarily do so in the future.  We output (only) the
                      * most dire warning */
                     if (! (flags & UTF8_CHECK_ONLY)) {
-                        if (ckWARN_d(WARN_UTF8)) {
+                        if (msgs || ckWARN_d(WARN_UTF8)) {
                             pack_warn = packWARN(WARN_UTF8);
                         }
-                        else if (ckWARN_d(WARN_NON_UNICODE)) {
+                        else if (msgs || ckWARN_d(WARN_NON_UNICODE)) {
                             pack_warn = packWARN(WARN_NON_UNICODE);
                         }
                         if (pack_warn) {
                             message = Perl_form(aTHX_ "%s: %s (overflows)",
                                             malformed_text,
                                             _byte_dump_string(s0, curlen, 0));
+                            this_flag_bit = UTF8_GOT_OVERFLOW;
                         }
                     }
                 }
@@ -1649,10 +1723,13 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                     assert(0);
 
                     disallowed = TRUE;
-                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                    if (  (msgs
+                        || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+                    {
                         pack_warn = packWARN(WARN_UTF8);
                         message = Perl_form(aTHX_ "%s (empty string)",
                                                    malformed_text);
+                        this_flag_bit = UTF8_GOT_EMPTY;
                     }
                 }
             }
@@ -1662,13 +1739,16 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
 
                 if (! (flags & UTF8_ALLOW_CONTINUATION)) {
                     disallowed = TRUE;
-                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                    if ((   msgs
+                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+                    {
                         pack_warn = packWARN(WARN_UTF8);
                         message = Perl_form(aTHX_
                                 "%s: %s (unexpected continuation byte 0x%02x,"
                                 " with no preceding start byte)",
                                 malformed_text,
                                 _byte_dump_string(s0, 1, 0), *s0);
+                        this_flag_bit = UTF8_GOT_CONTINUATION;
                     }
                 }
             }
@@ -1678,7 +1758,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
 
                 if (! (flags & UTF8_ALLOW_SHORT)) {
                     disallowed = TRUE;
-                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                    if ((   msgs
+                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+                    {
                         pack_warn = packWARN(WARN_UTF8);
                         message = Perl_form(aTHX_
                              "%s: %s (too short; %d byte%s available, need %d)",
@@ -1687,6 +1769,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                              (int)avail_len,
                              avail_len == 1 ? "" : "s",
                              (int)expectlen);
+                        this_flag_bit = UTF8_GOT_SHORT;
                     }
                 }
 
@@ -1697,7 +1780,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
 
                 if (! (flags & UTF8_ALLOW_NON_CONTINUATION)) {
                     disallowed = TRUE;
-                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                    if ((   msgs
+                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+                    {
 
                         /* If we don't know for sure that the input length is
                          * valid, avoid as much as possible reading past the
@@ -1711,6 +1796,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                                                             printlen,
                                                             s - s0,
                                                             (int) expectlen));
+                        this_flag_bit = UTF8_GOT_NON_CONTINUATION;
                     }
                 }
             }
@@ -1721,7 +1807,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                     *errors |= UTF8_GOT_SURROGATE;
 
                     if (   ! (flags & UTF8_CHECK_ONLY)
-                        && ckWARN_d(WARN_SURROGATE))
+                        && (msgs || ckWARN_d(WARN_SURROGATE)))
                     {
                         pack_warn = packWARN(WARN_SURROGATE);
 
@@ -1736,6 +1822,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                         else {
                             message = Perl_form(aTHX_ surrogate_cp_format, uv);
                         }
+                        this_flag_bit = UTF8_GOT_SURROGATE;
                     }
                 }
 
@@ -1751,7 +1838,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                     *errors |= UTF8_GOT_SUPER;
 
                     if (   ! (flags & UTF8_CHECK_ONLY)
-                        && ckWARN_d(WARN_NON_UNICODE))
+                        && (msgs || ckWARN_d(WARN_NON_UNICODE)))
                     {
                         pack_warn = packWARN(WARN_NON_UNICODE);
 
@@ -1765,6 +1852,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                         else {
                             message = Perl_form(aTHX_ super_cp_format, uv);
                         }
+                        this_flag_bit = UTF8_GOT_SUPER;
                     }
                 }
 
@@ -1774,7 +1862,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                 if (UNLIKELY(isUTF8_PERL_EXTENDED(s0))) {
                     if (  ! (flags & UTF8_CHECK_ONLY)
                         &&  (flags & (UTF8_WARN_PERL_EXTENDED|UTF8_WARN_SUPER))
-                        &&  ckWARN_d(WARN_NON_UNICODE))
+                        &&  (msgs || ckWARN_d(WARN_NON_UNICODE)))
                     {
                         pack_warn = packWARN(WARN_NON_UNICODE);
 
@@ -1798,6 +1886,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                                         " so is not portable",
                                         _byte_dump_string(s0, curlen, 0));
                         }
+                        this_flag_bit = UTF8_GOT_PERL_EXTENDED;
                     }
 
                     if (flags & ( UTF8_WARN_PERL_EXTENDED
@@ -1823,7 +1912,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                     *errors |= UTF8_GOT_NONCHAR;
 
                     if (  ! (flags & UTF8_CHECK_ONLY)
-                        && ckWARN_d(WARN_NONCHAR))
+                        && (msgs || ckWARN_d(WARN_NONCHAR)))
                     {
                         /* The code above should have guaranteed that we don't
                          * get here with errors other than overlong */
@@ -1832,6 +1921,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
 
                         pack_warn = packWARN(WARN_NONCHAR);
                         message = Perl_form(aTHX_ nonchar_cp_format, uv);
+                        this_flag_bit = UTF8_GOT_NONCHAR;
                     }
                 }
 
@@ -1857,7 +1947,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                 else {
                     disallowed = TRUE;
 
-                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                    if ((   msgs
+                         || ckWARN_d(WARN_UTF8)) && ! (flags & UTF8_CHECK_ONLY))
+                    {
                         pack_warn = packWARN(WARN_UTF8);
 
                         /* These error types cause 'uv' to be something that
@@ -1900,6 +1992,7 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                                                          small code points */
                                 UNI_TO_NATIVE(uv));
                         }
+                        this_flag_bit = UTF8_GOT_LONG;
                     }
                 }
             } /* End of looking through the possible flags */
@@ -1907,7 +2000,25 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
             /* Display the message (if any) for the problem being handled in
              * this iteration of the loop */
             if (message) {
-                if (PL_op)
+                if (msgs) {
+                    SV* msg_sv = newSVpv(message, 0);
+                    SV* category_sv = newSVuv(pack_warn);
+                    SV* flag_bit_sv = newSVuv(this_flag_bit);
+                    HV* msg_hv = newHV();
+
+                    assert(this_flag_bit);
+
+                    if (*msgs == NULL) {
+                        *msgs = newAV();
+                    }
+
+                    hv_stores(msg_hv, "text", msg_sv);
+                    hv_stores(msg_hv, "warn_categories",  category_sv);
+                    hv_stores(msg_hv, "flag_bit", flag_bit_sv);
+
+                    av_push(*msgs, newRV_noinc((SV*)msg_hv));
+                }
+                else if (PL_op)
                     Perl_warner(aTHX_ pack_warn, "%s in %s", message,
                                                  OP_DESC(PL_op));
                 else
diff --git a/utf8.h b/utf8.h
index 4d2d01b..cfcdf84 100644 (file)
--- a/utf8.h
+++ b/utf8.h
@@ -74,6 +74,8 @@ the string is invariant.
                                     ckWARN_d(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY)
 #define utf8n_to_uvchr(s, len, lenp, flags)                                    \
                                 utf8n_to_uvchr_error(s, len, lenp, flags, 0)
+#define utf8n_to_uvchr_error(s, len, lenp, flags, errors)                      \
+                        utf8n_to_uvchr_msgs(s, len, lenp, flags, errors, 0)
 
 #define to_uni_fold(c, p, lenp) _to_uni_fold_flags(c, p, lenp, FOLD_FLAGS_FULL)