t/op/split_unicode.t

   1 #!./perl
   2
   3 BEGIN {
   4     chdir 't' if -d 't';
   5     require './test.pl';
   6     skip_all_if_miniperl("no dynamic loading on miniperl, no File::Spec (used by charnames)");
   7     plan(tests => 145);
   8 }
   9
  10 {
  11     # check the special casing of split /\s/ and unicode
  12     use charnames qw(:full);
  13     # below test data is extracted from
  14     # PropList-5.0.0.txt
  15     # Date: 2006-06-07, 23:22:52 GMT [MD]
  16     #
  17     # Unicode Character Database
  18     # Copyright (c) 1991-2006 Unicode, Inc.
  19     # For terms of use, see http://www.unicode.org/terms_of_use.html
  20     # For documentation, see UCD.html
  21     my @spaces=(
  22         ord("\t"),      # Cc       <control-0009>
  23         ord("\n"),      # Cc       <control-000A>
  24         # not PerlSpace # Cc       <control-000B>
  25         ord("\f"),      # Cc       <control-000C>
  26         ord("\r"),      # Cc       <control-000D>
  27         ord(" "),       # Zs       SPACE
  28         ord("\N{NEL}"), # Cc       <control-0085>
  29         ord("\N{NO-BREAK SPACE}"),
  30                         # Zs       NO-BREAK SPACE
  31         0x1680,         # Zs       OGHAM SPACE MARK
  32         0x2000..0x200A, # Zs  [11] EN QUAD..HAIR SPACE
  33         0x2028,         # Zl       LINE SEPARATOR
  34         0x2029,         # Zp       PARAGRAPH SEPARATOR
  35         0x202F,         # Zs       NARROW NO-BREAK SPACE
  36         0x205F,         # Zs       MEDIUM MATHEMATICAL SPACE
  37         0x3000          # Zs       IDEOGRAPHIC SPACE
  38     );
  39     #diag "Have @{[0+@spaces]} to test\n";
  40     foreach my $cp (@spaces) {
  41         my $msg = sprintf "Space: U+%04x", $cp;
  42         my $space = chr($cp);
  43         my $str="A:$space:B\x{FFFD}";
  44         chop $str;
  45
  46         my @res=split(/\s+/,$str);
  47         my $cnt=split(/\s+/,$str);
  48         ok(@res == 2 && join('-',@res) eq "A:-:B", "$msg - /\\s+/");
  49         is($cnt, scalar(@res), "$msg - /\\s+/ (count)");
  50
  51         my $s2 = "$space$space:A:$space$space:B\x{FFFD}";
  52         chop $s2;
  53
  54         my @r2 = split(' ',$s2);
  55         my $c2 = split(' ',$s2);
  56         ok(@r2 == 2 && join('-', @r2) eq ":A:-:B",  "$msg - ' '");
  57         is($c2, scalar(@r2), "$msg - ' ' (count)");
  58
  59         my @r3 = split(/\s+/, $s2);
  60         my $c3 = split(/\s+/, $s2);
  61         ok(@r3 == 3 && join('-', @r3) eq "-:A:-:B", "$msg - /\\s+/ No.2");
  62         is($c3, scalar(@r3), "$msg - /\\s+/ No.2 (count)");
  63     }
  64
  65     { # RT #114808
  66         warning_is(
  67             sub {
  68                 $p=chr(0x100);
  69                 for (".","ab\x{101}def") {
  70                     @q = split /$p/
  71                 }
  72             },
  73             undef,
  74             'no warnings when part of split cant match non-utf8'
  75         );
  76     }
  77
  78 }