This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
utf8.c: Fix bugs with overlongs combined with other malformations.
authorKarl Williamson <khw@cpan.org>
Mon, 19 Jun 2017 18:58:19 +0000 (12:58 -0600)
committerKarl Williamson <khw@cpan.org>
Thu, 13 Jul 2017 03:14:25 +0000 (21:14 -0600)
The code handling the UTF-8 overlong malformation must come after
handling all the other malformations.  This is because it may change the
code point represented to the REPLACEMENT CHARACTER.  The other
malformation code is expecting the code point to be the original one.
This may cause failure to catch and report other malformations, or
report the wrong value of the erroneous code point.

What was needed was simply to move the 'if else' branch for overlongs to
after the branches for the other formations.

ext/XS-APItest/t/utf8_warn_base.pl
utf8.c

index 2406307..c1ecf0e 100644 (file)
@@ -861,8 +861,6 @@ foreach my $test (@tests) {
             next if     $malformed_allow_type == 2
                     && ($will_overflow || $short || $unexpected_noncont);
             next if $skip_most_tests && $malformed_allow_type;
-            local $TODO = "Warning messages don't return correct code point"
-                        . " for allowed malformations" if $malformed_allow_type;
 
             # Here we are in the innermost loop for malformations.  So we
             # know which ones are in effect.  Can now change the input to be
diff --git a/utf8.c b/utf8.c
index 76c3487..67580ff 100644 (file)
--- a/utf8.c
+++ b/utf8.c
@@ -1415,7 +1415,9 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
 
             /* Each 'if' clause handles one problem.  They are ordered so that
              * the first ones' messages will be displayed before the later
-             * ones; this is kinda in decreasing severity order */
+             * ones; this is kinda in decreasing severity order.  But the
+             * overlong must come last, as it changes 'uv' looked at by the
+             * others */
             if (possible_problems & UTF8_GOT_OVERFLOW) {
 
                 /* Overflow means also got a super and are using Perl's
@@ -1547,63 +1549,6 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                     }
                 }
             }
-            else if (possible_problems & UTF8_GOT_LONG) {
-                possible_problems &= ~UTF8_GOT_LONG;
-                *errors |= UTF8_GOT_LONG;
-
-                if (flags & UTF8_ALLOW_LONG) {
-
-                    /* We don't allow the actual overlong value, unless the
-                     * special extra bit is also set */
-                    if (! (flags & (   UTF8_ALLOW_LONG_AND_ITS_VALUE
-                                    & ~UTF8_ALLOW_LONG)))
-                    {
-                        uv = UNICODE_REPLACEMENT;
-                    }
-                }
-                else {
-                    disallowed = TRUE;
-
-                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
-                        pack_warn = packWARN(WARN_UTF8);
-
-                        /* These error types cause 'uv' to be something that
-                         * isn't what was intended, so can't use it in the
-                         * message.  The other error types either can't
-                         * generate an overlong, or else the 'uv' is valid */
-                        if (orig_problems &
-                                        (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
-                        {
-                            message = Perl_form(aTHX_
-                                    "%s: %s (any UTF-8 sequence that starts"
-                                    " with \"%s\" is overlong which can and"
-                                    " should be represented with a"
-                                    " different, shorter sequence)",
-                                    malformed_text,
-                                    _byte_dump_string(s0, send - s0, 0),
-                                    _byte_dump_string(s0, curlen, 0));
-                        }
-                        else {
-                            U8 tmpbuf[UTF8_MAXBYTES+1];
-                            const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
-                                                                        uv, 0);
-                            const char * preface = (uv <= PERL_UNICODE_MAX)
-                                                   ? "U+"
-                                                   : "0x";
-                            message = Perl_form(aTHX_
-                                "%s: %s (overlong; instead use %s to represent"
-                                " %s%0*" UVXf ")",
-                                malformed_text,
-                                _byte_dump_string(s0, curlen, 0),
-                                _byte_dump_string(tmpbuf, e - tmpbuf, 0),
-                                preface,
-                                ((uv < 256) ? 2 : 4), /* Field width of 2 for
-                                                         small code points */
-                                uv);
-                        }
-                    }
-                }
-            }
             else if (possible_problems & UTF8_GOT_SURROGATE) {
                 possible_problems &= ~UTF8_GOT_SURROGATE;
 
@@ -1748,6 +1693,63 @@ Perl_utf8n_to_uvchr_error(pTHX_ const U8 *s,
                     disallowed = TRUE;
                     *errors |= UTF8_GOT_NONCHAR;
                 }
+            }
+            else if (possible_problems & UTF8_GOT_LONG) {
+                possible_problems &= ~UTF8_GOT_LONG;
+                *errors |= UTF8_GOT_LONG;
+
+                if (flags & UTF8_ALLOW_LONG) {
+
+                    /* We don't allow the actual overlong value, unless the
+                     * special extra bit is also set */
+                    if (! (flags & (   UTF8_ALLOW_LONG_AND_ITS_VALUE
+                                    & ~UTF8_ALLOW_LONG)))
+                    {
+                        uv = UNICODE_REPLACEMENT;
+                    }
+                }
+                else {
+                    disallowed = TRUE;
+
+                    if (ckWARN_d(WARN_UTF8) && ! (flags & UTF8_CHECK_ONLY)) {
+                        pack_warn = packWARN(WARN_UTF8);
+
+                        /* These error types cause 'uv' to be something that
+                         * isn't what was intended, so can't use it in the
+                         * message.  The other error types either can't
+                         * generate an overlong, or else the 'uv' is valid */
+                        if (orig_problems &
+                                        (UTF8_GOT_TOO_SHORT|UTF8_GOT_OVERFLOW))
+                        {
+                            message = Perl_form(aTHX_
+                                    "%s: %s (any UTF-8 sequence that starts"
+                                    " with \"%s\" is overlong which can and"
+                                    " should be represented with a"
+                                    " different, shorter sequence)",
+                                    malformed_text,
+                                    _byte_dump_string(s0, send - s0, 0),
+                                    _byte_dump_string(s0, curlen, 0));
+                        }
+                        else {
+                            U8 tmpbuf[UTF8_MAXBYTES+1];
+                            const U8 * const e = uvoffuni_to_utf8_flags(tmpbuf,
+                                                                        uv, 0);
+                            const char * preface = (uv <= PERL_UNICODE_MAX)
+                                                   ? "U+"
+                                                   : "0x";
+                            message = Perl_form(aTHX_
+                                "%s: %s (overlong; instead use %s to represent"
+                                " %s%0*" UVXf ")",
+                                malformed_text,
+                                _byte_dump_string(s0, send - s0, 0),
+                                _byte_dump_string(tmpbuf, e - tmpbuf, 0),
+                                preface,
+                                ((uv < 256) ? 2 : 4), /* Field width of 2 for
+                                                         small code points */
+                                uv);
+                        }
+                    }
+                }
             } /* End of looking through the possible flags */
 
             /* Display the message (if any) for the problem being handled in