This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
PATCH: [perl #130655] Unrecognized UTF-8 char
author Karl Williamson <khw@cpan.org>
Tue, 31 Jan 2017 21:17:14 +0000 (14:17 -0700)
committer Karl Williamson <khw@cpan.org>
Tue, 31 Jan 2017 21:29:44 +0000 (14:29 -0700)
The root cause of this was code like this

    if (a)
        b

which got changed into

    if (a)
        c
        b

thus causing 'b' to be executed unconditionally.  The
solution is just to add braces

    if (a) {
        c
        b
    }

This is why I always use braces even if not required at the moment.  It
was the coding standard at $work.

It turns out that #130567 doesn't even come up with this fix in place.

t/lib/warnings/toke
toke.c

index 2774f08..8ed6177 100644 (file)
@@ -1634,9 +1634,6 @@ s//\3000/;
 s//"\x{180};;s\221(*$@$`\241\275";/gee;
 s//"s\221\302\302\302\302\302\302\302$@\241\275";/gee;
 EXPECT
-OPTION fatal
-Malformed UTF-8 character: \xc3\x20 (unexpected non-continuation byte 0x20, immediately after start byte 0xc3; need 2 bytes, got 1) in eval "string" at - line 11.
-Malformed UTF-8 character (fatal) at - line 11.
 ########
 # NAME  [perl $130666] Assertion failure
 no warnings "uninitialized";
@@ -1649,3 +1646,8 @@ EXPECT
 OPTION fatal
 syntax error at - line 1, at EOF
 Execution of - aborted due to compilation errors.
+########
+# NAME  [perl #130655]
+use utf8;
+qw∘foo ∞ ♥ bar∘
+EXPECT
diff --git a/toke.c b/toke.c
index 9972b97..b9096b0 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -10549,6 +10549,7 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re
                    if (termlen == 1)
                        break;
                    if (s+termlen <= PL_bufend && memEQ(s, (char*)termstr, termlen))
+                    {
                         if (   check_grapheme
                             && UNLIKELY(! _is_grapheme((U8 *) start,
                                                               (U8 *) s,
@@ -10559,6 +10560,7 @@ S_scan_str(pTHX_ char *start, int keep_bracketed_quoted, int keep_delims, int re
                                         "%s", non_grapheme_msg);
                         }
                        break;
+                    }
                }
                else if (!has_utf8 && !UTF8_IS_INVARIANT((U8)*s) && UTF)
                    has_utf8 = TRUE;