utf8.c: Fix bugs with overlongs combined with other malformations.

author Karl Williamson <khw@cpan.org>

Mon, 19 Jun 2017 18:58:19 +0000 (12:58 -0600)

committer Karl Williamson <khw@cpan.org>

Thu, 13 Jul 2017 03:14:25 +0000 (21:14 -0600)
author Karl Williamson <khw@cpan.org>
Mon, 19 Jun 2017 18:58:19 +0000 (12:58 -0600)
committer Karl Williamson <khw@cpan.org>
Thu, 13 Jul 2017 03:14:25 +0000 (21:14 -0600)
diff --git a/ext/XS-APItest/t/utf8_warn_base.pl b/ext/XS-APItest/t/utf8_warn_base.pl

index 2406307..c1ecf0e 100644 (file)
--- a/ext/XS-APItest/t/utf8_warn_base.pl
+++ b/ext/XS-APItest/t/utf8_warn_base.pl
@@ -861,8 +861,6 @@ foreach my $test (@tests) {
              next if     $malformed_allow_type == 2
                      && ($will_overflow || $short || $unexpected_noncont);
              next if $skip_most_tests && $malformed_allow_type;
-            local $TODO = "Warning messages don't return correct code point"
-                        . " for allowed malformations" if $malformed_allow_type;
  
              # Here we are in the innermost loop for malformations.  So we
              # know which ones are in effect.  Can now change the input to be
diff --git a/utf8.c b/utf8.c

index 76c3487..67580ff 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1415,7 +1415,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
  
              /* Each 'if' clause handles one problem.  They are ordered so that
               * the first ones' messages will be displayed before the later
-             * ones; this is kinda in decreasing severity order */
+             * ones; this is kinda in decreasing severity order.  But the
+             * overlong must come last, as it changes 'uv' looked at by the
+             * others */
              if (possible_problems & UTF8_GOT_OVERFLOW) {
  
                  /* Overflow means also got a super and are using Perl's
@@ -1547,63 +1549,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                      }
                  }
              }
-            else if (possible_problems & UTF8_GOT_LONG) {
-                possible_problems &= ~UTF8_GOT_LONG;
-                *errors |= UTF8_GOT_LONG;
-
-                if (flags & UTF8_ALLOW_LONG) {
-
-                    /* We don't allow the actual overlong value, unless the
-                     * special extra bit is also set */
-                    if (! (flags & (   UTF8_ALLOW_LONG_AND_ITS_VALUE
-                                    & ~UTF8_ALLOW_LONG)))
-                    {
-                        uv = UNICODE_REPLACEMENT;
-                    }
-                }
-                else {
-                    disallowed = TRUE;
-
-                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
-                        pack_warn = packWARN(WARN_UTF8);
-
-                        /* These error types cause 'uv' to be something that
-                         * isn't what was intended, so can't use it in the
-                         * message.  The other error types either can't
-                         * generate an overlong, or else the 'uv' is valid */
-                        if (orig_problems &
-                                        (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
-                        {
-                            message = Perl_form(aTHX_
-                                    "%s: %s (any UTF-8 sequence that starts"
-                                    " with \"%s\" is overlong which can and"
-                                    " should be represented with a"
-                                    " different, shorter sequence)",
-                                    malformed_text,
-                                    _byte_dump_string(s0, send - s0, 0),
-                                    _byte_dump_string(s0, curlen, 0));
-                        }
-                        else {
-                            U8 tmpbuf[UTF8_MAXBYTES+1];
-                            const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
-                                                                        uv, 0);
-                            const char * preface = (uv <= PERL_UNICODE_MAX)
-                                                   ? "U+"
-                                                   : "0x";
-                            message = Perl_form(aTHX_
-                                "%s: %s (overlong; instead use %s to represent"
-                                " %s%0*" UVXf ")",
-                                malformed_text,
-                                _byte_dump_string(s0, curlen, 0),
-                                _byte_dump_string(tmpbuf, e - tmpbuf, 0),
-                                preface,
-                                ((uv < 256) ? 2 : 4), /* Field width of 2 for
-                                                         small code points */
-                                uv);
-                        }
-                    }
-                }
-            }
              else if (possible_problems & UTF8_GOT_SURROGATE) {
                  possible_problems &= ~UTF8_GOT_SURROGATE;
  
@@ -1748,6 +1693,63 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                      disallowed = TRUE;
                      *errors |= UTF8_GOT_NONCHAR;
                  }
+            }
+            else if (possible_problems & UTF8_GOT_LONG) {
+                possible_problems &= ~UTF8_GOT_LONG;
+                *errors |= UTF8_GOT_LONG;
+
+                if (flags & UTF8_ALLOW_LONG) {
+
+                    /* We don't allow the actual overlong value, unless the
+                     * special extra bit is also set */
+                    if (! (flags & (   UTF8_ALLOW_LONG_AND_ITS_VALUE
+                                    & ~UTF8_ALLOW_LONG)))
+                    {
+                        uv = UNICODE_REPLACEMENT;
+                    }
+                }
+                else {
+                    disallowed = TRUE;
+
+                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                        pack_warn = packWARN(WARN_UTF8);
+
+                        /* These error types cause 'uv' to be something that
+                         * isn't what was intended, so can't use it in the
+                         * message.  The other error types either can't
+                         * generate an overlong, or else the 'uv' is valid */
+                        if (orig_problems &
+                                        (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
+                        {
+                            message = Perl_form(aTHX_
+                                    "%s: %s (any UTF-8 sequence that starts"
+                                    " with \"%s\" is overlong which can and"
+                                    " should be represented with a"
+                                    " different, shorter sequence)",
+                                    malformed_text,
+                                    _byte_dump_string(s0, send - s0, 0),
+                                    _byte_dump_string(s0, curlen, 0));
+                        }
+                        else {
+                            U8 tmpbuf[UTF8_MAXBYTES+1];
+                            const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
+                                                                        uv, 0);
+                            const char * preface = (uv <= PERL_UNICODE_MAX)
+                                                   ? "U+"
+                                                   : "0x";
+                            message = Perl_form(aTHX_
+                                "%s: %s (overlong; instead use %s to represent"
+                                " %s%0*" UVXf ")",
+                                malformed_text,
+                                _byte_dump_string(s0, send - s0, 0),
+                                _byte_dump_string(tmpbuf, e - tmpbuf, 0),
+                                preface,
+                                ((uv < 256) ? 2 : 4), /* Field width of 2 for
+                                                         small code points */
+                                uv);
+                        }
+                    }
+                }
              } /* End of looking through the possible flags */
  
              /* Display the message (if any) for the problem being handled in
author	Karl Williamson <khw@cpan.org>
	Mon, 19 Jun 2017 18:58:19 +0000 (12:58 -0600)
committer	Karl Williamson <khw@cpan.org>
	Thu, 13 Jul 2017 03:14:25 +0000 (21:14 -0600)
ext/XS-APItest/t/utf8_warn_base.pl		patch | blob | blame | history
utf8.c		patch | blob | blame | history