This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
toke.c: Consistently upgrade under encoding
authorKarl Williamson <khw@cpan.org>
Wed, 19 Nov 2014 05:02:21 +0000 (22:02 -0700)
committerKarl Williamson <khw@cpan.org>
Fri, 21 Nov 2014 04:45:18 +0000 (21:45 -0700)
The documentation says that intermixing above-Latin1 code points with
ones that would be otherwise encoded to something else, like Greek,
causes the encoding to be foregone.  Until this commit, this only
happened when the above-latin1 code point came first in the string
constant being scanned; meaning string-order was important.  This
changes things to match the documentation

cpan/Encode/t/encoding.t
toke.c

index 8c7f253..847c26f 100644 (file)
@@ -14,7 +14,7 @@ BEGIN {
     }
 }
 
-print "1..31\n";
+print "1..33\n";
 
 no warnings "deprecated";
 use encoding "latin1"; # ignored (overwritten by the next line)
@@ -202,3 +202,10 @@ print "ok 28\n";
     print $h1{"\x{3af}"} == 41 ? "ok 30\n" : "not ok 30\n";
     print $h2{"\xdf"}    == 42 ? "ok 31\n" : "not ok 31\n";
 }
+
+# Order of finding the above-Latin1 code point should not matter: both should
+# assume Latin1/Unicode encoding
+print "not " if "\xDF\x{100}" =~ /\x{3af}\x{100}/;
+print "ok 32\n";
+print "not " if "\x{100}\xDF" =~ /\x{100}\x{3af}/;
+print "ok 33\n";
diff --git a/toke.c b/toke.c
index 5fbc11b..e20c93f 100644 (file)
--- a/toke.c
+++ b/toke.c
@@ -3189,9 +3189,13 @@ S_scan_const(pTHX_ char *start)
                        SvPOK_on(sv);
                        *d = '\0';
                        /* See Note on sizing above.  */
-                       sv_utf8_upgrade_flags_grow(sv,
-                                       SV_GMAGIC|SV_FORCE_UTF8_UPGRADE,
-                                       UNISKIP(uv) + (STRLEN)(send - s) + 1);
+                       sv_utf8_upgrade_flags_grow(
+                                         sv,
+                                         SV_GMAGIC|SV_FORCE_UTF8_UPGRADE
+                                                  /* Above-latin1 in string
+                                                   * implies no encoding */
+                                                  |SV_UTF8_NO_ENCODING,
+                                         UNISKIP(uv) + (STRLEN)(send - s) + 1);
                        d = SvPVX(sv) + SvCUR(sv);
                        has_utf8 = TRUE;
                     }