RT #130907: Fix the Unicode Bug in split " "

author Aaron Crane <arc@cpan.org>

Sat, 4 Mar 2017 12:50:58 +0000 (12:50 +0000)

committer Aaron Crane <arc@cpan.org>

Sat, 15 Jul 2017 18:35:43 +0000 (19:35 +0100)
author Aaron Crane <arc@cpan.org>
Sat, 4 Mar 2017 12:50:58 +0000 (12:50 +0000)
committer Aaron Crane <arc@cpan.org>
Sat, 15 Jul 2017 18:35:43 +0000 (19:35 +0100)
diff --git a/lib/feature.pm b/lib/feature.pm

index c23e96a..37218bc 100644 (file)
--- a/lib/feature.pm
+++ b/lib/feature.pm
@@ -5,7 +5,7 @@
  
  package feature;
  
-our $VERSION = '1.48';
+our $VERSION = '1.49';
  
  our %feature = (
      fc              => 'feature_fc',
@@ -177,8 +177,9 @@ C<use feature 'unicode_strings'> subpragma is B<strongly> recommended.
  
  This feature is available starting with Perl 5.12; was almost fully
  implemented in Perl 5.14; and extended in Perl 5.16 to cover C<quotemeta>;
-and extended further in Perl 5.26 to cover L<the range
-operator|perlop/Range Operators>.
+was extended further in Perl 5.26 to cover L<the range
+operator|perlop/Range Operators>; and was extended again in Perl 5.28 to
+cover L<special-cased whitespace splitting|perlfunc/split>.
  
  =head2 The 'unicode_eval' and 'evalbytes' features
  
diff --git a/pod/perldelta.pod b/pod/perldelta.pod

index 0fee558..aa41ad3 100644 (file)
--- a/pod/perldelta.pod
+++ b/pod/perldelta.pod
@@ -375,6 +375,15 @@ function of that name already exists could result in a NULL pointer
  being supplied where an SV was expected, crashing perl.  [perl
  #131597]
  
+=item *
+
+C<split ' '> now correctly handles the argument being split when in the
+scope of the L<< C<unicode_strings>|feature/"The 'unicode_strings' feature"
+>> feature. Previously, when a string using the single-byte internal
+representation contained characters that are whitespace by Unicode rules but
+not by ASCII rules, it treated those characters as part of fields rather
+than as field separators.  [perl #130907]
+
  =back
  
  =head1 Known Problems
diff --git a/pod/perlfunc.pod b/pod/perlfunc.pod

index b1e4ac2..e787704 100644 (file)
--- a/pod/perlfunc.pod
+++ b/pod/perlfunc.pod
@@ -7623,6 +7623,14 @@ special case was restricted to the use of a plain S<C<" ">> as the
  pattern argument to split; in Perl 5.18.0 and later this special case is
  triggered by any expression which evaluates to the simple string S<C<" ">>.
  
+As of Perl 5.28, this special-cased whitespace splitting works as expected in
+the scope of L<< S<C<"use feature 'unicode_strings'">>|feature/The
+'unicode_strings' feature >>. In previous versions, and outside the scope of
+that feature, it exhibits L<perlunicode/The "Unicode Bug">: characters that are
+whitespace according to Unicode rules but not according to ASCII rules can be
+treated as part of fields rather than as field separators, depending on the
+string's internal encoding.
+
  If omitted, PATTERN defaults to a single space, S<C<" ">>, triggering
  the previously described I<awk> emulation.
  
diff --git a/pod/perlunicode.pod b/pod/perlunicode.pod

index 43f1459..ef02b0a 100644 (file)
--- a/pod/perlunicode.pod
+++ b/pod/perlunicode.pod
@@ -1836,6 +1836,17 @@ outside its scope, it could produce strings whose length in characters
  exceeded that of the right-hand side, where the right-hand side took up more
  bytes than the correct range endpoint.
  
+=item *
+
+In L<< C<split>'s special-case whitespace splitting|perlfunc/split >>.
+
+Starting in Perl 5.28.0, the C<split> function with a pattern specified as
+a string containing a single space handles whitespace characters consistently
+within the scope of C<unicode_strings>. Prior to that, or outside its scope,
+characters that are whitespace according to Unicode rules but not according to
+ASCII rules were treated as field contents rather than field separators when
+they appear in byte-encoded strings.
+
  =back
  
  You can see from the above that the effect of C<unicode_strings>
diff --git a/pod/perluniintro.pod b/pod/perluniintro.pod

index 73aadff..0ad9dda 100644 (file)
--- a/pod/perluniintro.pod
+++ b/pod/perluniintro.pod
@@ -151,11 +151,12 @@ serious Unicode work.  The maintenance release 5.6.1 fixed many of the
  problems of the initial Unicode implementation, but for example
  regular expressions still do not work with Unicode in 5.6.1.
  Perl v5.14.0 is the first release where Unicode support is
-(almost) seamlessly integrable without some gotchas. (There are two
+(almost) seamlessly integrable without some gotchas. (There are a few
  exceptions. Firstly, some differences in L<quotemeta|perlfunc/quotemeta>
  were fixed starting in Perl 5.16.0. Secondly, some differences in
  L<the range operator|perlop/Range Operators> were fixed starting in
-Perl 5.26.0.)
+Perl 5.26.0. Thirdly, some differences in L<split|perlfunc/split> were fixed
+starting in Perl 5.28.0.)
  
  To enable this
  seamless support, you should C<use feature 'unicode_strings'> (which is
diff --git a/pp.c b/pp.c

index 768eb7d..9ac4b07 100644 (file)
--- a/pp.c
+++ b/pp.c
@@ -5742,6 +5742,7 @@ PP(pp_split)
      STRLEN len;
      const char *s = SvPV_const(sv, len);
      const bool do_utf8 = DO_UTF8(sv);
+    const bool in_uni_8_bit = IN_UNI_8_BIT;
      const char *strend = s + len;
      PMOP *pm = cPMOPx(PL_op);
      REGEXP *rx;
@@ -5828,6 +5829,10 @@ PP(pp_split)
             while (s < strend && isSPACE_LC(*s))
                 s++;
         }
+        else if (in_uni_8_bit) {
+            while (s < strend && isSPACE_L1(*s))
+                s++;
+        }
         else {
             while (s < strend && isSPACE(*s))
                 s++;
@@ -5859,6 +5864,10 @@ PP(pp_split)
              {
                 while (m < strend && !isSPACE_LC(*m))
                     ++m;
+            }
+            else if (in_uni_8_bit) {
+                while (m < strend && !isSPACE_L1(*m))
+                    ++m;
              } else {
                  while (m < strend && !isSPACE(*m))
                      ++m;
@@ -5893,6 +5902,10 @@ PP(pp_split)
              {
                 while (s < strend && isSPACE_LC(*s))
                     ++s;
+            }
+            else if (in_uni_8_bit) {
+                while (s < strend && isSPACE_L1(*s))
+                    ++s;
              } else {
                  while (s < strend && isSPACE(*s))
                      ++s;
diff --git a/regen/feature.pl b/regen/feature.pl

index 41cccac..6b95e90 100755 (executable)
--- a/regen/feature.pl
+++ b/regen/feature.pl
@@ -369,7 +369,7 @@ read_only_bottom_close_and_rename($h);
  __END__
  package feature;
  
-our $VERSION = '1.48';
+our $VERSION = '1.49';
  
  FEATURES
  
@@ -487,8 +487,9 @@ C<use feature 'unicode_strings'> subpragma is B<strongly> recommended.
  
  This feature is available starting with Perl 5.12; was almost fully
  implemented in Perl 5.14; and extended in Perl 5.16 to cover C<quotemeta>;
-and extended further in Perl 5.26 to cover L<the range
-operator|perlop/Range Operators>.
+was extended further in Perl 5.26 to cover L<the range
+operator|perlop/Range Operators>; and was extended again in Perl 5.28 to
+cover L<special-cased whitespace splitting|perlfunc/split>.
  
  =head2 The 'unicode_eval' and 'evalbytes' features
  
diff --git a/t/op/split.t b/t/op/split.t

index d60bcaf..038c5d7 100644 (file)
--- a/t/op/split.t
+++ b/t/op/split.t
@@ -7,7 +7,7 @@ BEGIN {
      set_up_inc('../lib');
  }
  
-plan tests => 163;
+plan tests => 172;
  
  $FS = ':';
  
@@ -480,6 +480,24 @@ is($cnt, scalar(@ary));
          qq{split(\$cond ? qr/ / : " ", "$exp") behaves as expected over repeated similar patterns};
  }
  
+SKIP: {
+    # RT #130907: unicode_strings feature doesn't work with split ' '
+
+    my ($sp) = grep /\s/u, map chr, reverse 128 .. 255 # prefer \xA0 over \x85
+        or skip 'no unicode whitespace found in high-8-bit range', 9;
+
+    for (["$sp$sp. /", "leading unicode whitespace"],
+         [".$sp$sp/",  "unicode whitespace separator"],
+         [". /$sp$sp", "trailing unicode whitespace"]) {
+        my ($str, $desc) = @$_;
+        use feature "unicode_strings";
+        my @got = split " ", $str;
+        is @got, 2, "whitespace split: $desc: field count";
+        is $got[0], '.', "whitespace split: $desc: field 0";
+        is $got[1], '/', "whitespace split: $desc: field 1";
+    }
+}
+
  {
      # 'RT #116086: split "\x20" does not work as documented';
      my @results;
author	Aaron Crane <arc@cpan.org>
	Sat, 4 Mar 2017 12:50:58 +0000 (12:50 +0000)
committer	Aaron Crane <arc@cpan.org>
	Sat, 15 Jul 2017 18:35:43 +0000 (19:35 +0100)
lib/feature.pm		patch \| blob \| blame \| history
pod/perldelta.pod		patch \| blob \| blame \| history
pod/perlfunc.pod		patch \| blob \| blame \| history
pod/perlunicode.pod		patch \| blob \| blame \| history
pod/perluniintro.pod		patch \| blob \| blame \| history
pp.c		patch \| blob \| blame \| history
regen/feature.pl		patch \| blob \| blame \| history
t/op/split.t		patch \| blob \| blame \| history