lib/utf8.t

   1 #!./perl
   2
   3 my $has_perlio;     # true when the 'perlio' layer could be found
   4 # Compile-time setup: locate the test library and probe for perlio.
   5 BEGIN {
   6     chdir 't' if -d 't';            # move into t/ when it exists here
   7     @INC = '../lib';                # load modules only from ../lib
   8     require './test.pl';
   9     unless ($has_perlio = PerlIO::Layer->find('perlio')) {
  10         print <<EOF;
  11 # Since you don't have perlio you might get failures with UTF-8 locales.
  12 EOF
  13     }
  14 }
  15
  16 # NOTE!
  17 #
  18 # Think carefully before adding tests here.  In general this should be
  19 # used only for about three categories of tests:
  20 #
  21 # (1) tests that absolutely require 'use utf8', and since that in general
  22 #     shouldn't be needed as the utf8 is being obsoleted, this should
  23 #     have rather few tests.  If you want to test Unicode and regexes,
  24 #     you probably want to go to op/regexp or op/pat; if you want to test
  25 #     split, go to op/split; pack, op/pack; appending or joining,
  26 #     op/append or op/join, and so forth
  27 #
  28 # (2) tests that have to do with Unicode tokenizing (though it's likely
  29 #     that all the other Unicode tests sprinkled around the t/**/*.t are
  30 #     going to catch that)
  31 #
  32 # (3) complicated tests that simultaneously stress so many Unicode features
  33 #     that deciding into which other test script the tests should go to
  34 #     is hard -- maybe consider breaking up the complicated test
  35 #
  36 #
  37
  38 plan tests => 94;   # 2 + 12 + 1 + 16 (files) + 15 (evals) + 48 (hash keys)
  39
  40 {
  41     # bug id 20001009.001
  42     # "\xc3\xa4" is the UTF-8 encoding of "\xe4", held as two raw bytes.
  43     my ($a, $b);
  44
  45     { use bytes; $a = "\xc3\xa4" }
  46     { use utf8;  $b = "\xe4"     }
  47
  48     # The byte pair and the character must compare unequal, whether or
  49     # not 'use utf8' is in effect at the comparison.
  50     ok($a ne $b);
  51
  52     { use utf8; ok($a ne $b) }
  53 }
  54
  55
  56 {
  57     # bug id 20000730.004
  58     # U+263A is three bytes long in UTF-8.  However the string is built,
  59     # length, m/(.)/g and split must count characters, and length under
  60     # 'use bytes' must count bytes.
  61
  62     my $smiley = "\x{263a}";
  63
  64     # Summarise a string as "length/regex matches/split pieces/byte length".
  65     # Reads $_[0] directly so that the scalar produced by the expression
  66     # under test is the one measured, not a copy of it.
  67     my $measure = sub {
  68         my $length_chars = length($_[0]);
  69         my $length_bytes;
  70         { use bytes; $length_bytes = length($_[0]) }
  71         my @regex_chars = $_[0] =~ m/(.)/g;
  72         my @split_chars = split //, $_[0];
  73         return join '/', $length_chars, scalar(@regex_chars),
  74                          scalar(@split_chars), $length_bytes;
  75     };
  76
  77     for my $s ("\x{263a}",
  78                $smiley,
  79
  80                "" . $smiley,
  81                "" . "\x{263a}",
  82
  83                $smiley    . "",
  84                "\x{263a}" . "",
  85                ) {
  86         is($measure->($s), "1/1/1/3");
  87     }
  88
  89     for my $s ("\x{263a}" . "\x{263a}",
  90                $smiley    . $smiley,
  91
  92                "\x{263a}\x{263a}",
  93                "$smiley$smiley",
  94
  95                "\x{263a}" x 2,
  96                $smiley    x 2,
  97                ) {
  98         is($measure->($s), "2/2/2/6");
  99     }
 100 }
 101
 102
 103 {
 104     my $warnings = 0;
 105     local $SIG{__WARN__} = sub { print "#($_[0])\n"; ++$warnings };
 106     my $got = eval('"\\' . "\x{100}" . '"');
 107     # A backslash before a wide character must quietly yield that character.
 108     ok(!$warnings && $got eq "\x{100}");
 109 }
 110
 111 {
 112     use warnings;
 113     use strict;
 114     # Source of show(): renders each argument as '>' . ordinals . '<'.
 115     my $show = q(
 116                  sub show {
 117                    my $result;
 118                    $result .= '>' . join (',', map {ord} split //, $_) . '<'
 119                      foreach @_;
 120                    $result;
 121                  }
 122                  1;
 123                 );  # also written into every generated program file
 124     eval $show or die $@; # We don't expect this sub definition to fail.
 125     my $progfile = 'utf' . $$;  # scratch file name, made unique by the pid
 126     END {unlink_all $progfile}  # presumably deletes the scratch file at exit
 127
 128     # Code points to exercise.  60 is '<' in ASCII (and, if the original
 129     # note is right, ' ' in EBCDIC); 173 is punctuation in neither.
 130     my @char;
 131     for my $code_point (60, 173, 257, 65532) {
 132       my $bytes = chr $code_point;
 133       utf8::encode($bytes);
 134       # Comma separated ordinals of the encoded bytes.  Built with s///
 135       # rather than map {ord}, and without hardcoding the UTF version.
 136       (my $ord_list = $bytes) =~ s/(.)/ord ($1) . ','/ge;
 137       chop $ord_list;
 138       # The same bytes as source text: chr (N) . chr (N) ...
 139       # (this one is not tested against map {ord}).
 140       my @chr_calls = map {sprintf 'chr (%d)', ord $_} split //, $bytes;
 141       push @char,
 142            [$code_point, $bytes, $ord_list, join(" . ", @chr_calls)];
 143     }
 144     # UTF8 munching done.  Entries: [description, program, regex for output]
 145     my @tests = (
 146              ['check our detection program works',
 147               'my @a = ("'.chr(60).'\x2A", ""); $b = show @a', qr/^>60,42<><$/],
 148              ['check literal 8 bit input',
 149               '$a = "' . chr (173) . '"; $b = show $a', qr/^>173<$/],
 150              ['check no utf8; makes no change',
 151               'no utf8; $a = "' . chr (173) . '"; $b = show $a', qr/^>173<$/],
 152              # Now the real byte sequences that are valid UTF8: 3 per @char entry
 153              (map {
 154                ["the utf8 sequence for chr $_->[0]",
 155                 qq{\$a = "$_->[1]"; \$b = show \$a}, qr/^>$_->[2]<$/],
 156                ["no utf8; for the utf8 sequence for chr $_->[0]",
 157                 qq(no utf8; \$a = "$_->[1]"; \$b = show \$a), qr/^>$_->[2]<$/],
 158                ["use utf8; for the utf8 sequence for chr $_->[0]",
 159                 qq(use utf8; \$a = "$_->[1]"; \$b = show \$a), qr/^>$_->[0]<$/],
 160               } @char),
 161              # Interpolation of hex characters needs to take place now, as we're
 162              # testing feeding malformed utf8 into perl (the bug, now fixed, was an
 163              # "out of memory" error); "" rather than qq() or q() gives the best
 164              # explosion.  The leading '!' keeps this one out of the later eval pass.
 165              ["!Feed malformed utf8 into perl.", <<"BANG",
 166     use utf8; %a = ("\xE1\xA0"=>"sterling");
 167     print 'start'; printf '%x,', ord \$_ foreach keys %a; print "end\n";
 168 BANG
 169               qr/^Malformed UTF-8 character \(2 bytes, need 3.+\).*start\d+,end$/s
 170              ],
 171             );
 172     foreach (@tests) {  # run each program from the scratch file via runperl
 173         my ($why, $prog, $expect) = @$_;
 174         open my $fh, '>', $progfile or die "Can't open '$progfile': $!";
 175         binmode($fh, ":bytes") if $has_perlio;
 176         print $fh $show, $prog, '; print $b'
 177             or die "Print to '$progfile' failed: $!";
 178         close $fh or die "Can't close '$progfile': $!";
 179         if ($why =~ s/^!//) {       # a leading '!' marks the slow test
 180             print "# Possible delay...\n";
 181         } else {
 182             print "# $prog\n";
 183         }
 184         my $result = runperl ( stderr => 1, progfile => $progfile );
 185         like ($result, $expect, $why);
 186     }
 187     print
 188         "# Again! Again! [but this time as eval, and not the explosive one]\n";
 189     # Now that every program has safely run from its own file, check that
 190     # eval gives the same results.  Doing this afterwards hopefully
 191     # decouples the earlier tests from any mess the evals leave behind.
 192     for my $entry (@tests) {
 193         my ($why, $prog, $expect) = @$entry;
 194         next if $why =~ m/^!/; # Goes bang.
 195         my $got = eval $prog;
 196         if ($@) {
 197             print "# prog is $prog\n",
 198                   "# \$\@=", _qq($@), "\n";
 199         }
 200         like ($got, $expect, $why);
 201     }
 202
 203     # See what the tokeniser does with hash keys (12 tests per @char entry).
 204     print "# What does the tokeniser do with utf8 hash keys?\n";
 205     @tests = (map {
 206         # Control, not expected to fail.  'F' is appended if a lookup misses.
 207         ["assign utf8 for chr $_->[0] to a hash",
 208          qq(my \$a = "$_->[1]"; my %h; \$h{\$a} = 1;
 209             my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
 210          qr/^>$_->[2]<$/],
 211         ["no utf8; assign utf8 for chr $_->[0] to a hash",
 212          qq(no utf8; my \$a = "$_->[1]"; my %h; \$h{\$a} = 1;
 213             my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
 214          qr/^>$_->[2]<$/],
 215         ["use utf8; assign utf8 for chr $_->[0] to a hash",
 216          qq(use utf8; my \$a = "$_->[1]"; my %h; \$h{\$a} = 1;
 217             my \$b = show keys %h; \$b .= 'F' unless \$h{chr $_->[0]}; \$b),
 218          qr/^>$_->[0]<$/],
 219         # Now check literal $h{"x"} constructions.
 220         ["\$h{\"x\"} construction, where x is utf8 for chr $_->[0]",
 221          qq(my \$a = "$_->[1]"; my %h; \$h{"$_->[1]"} = 1;
 222             my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
 223          qr/^>$_->[2]<$/],
 224         ["no utf8; \$h{\"x\"} construction, where x is utf8 for chr $_->[0]",
 225          qq(no utf8; my \$a = "$_->[1]"; my %h; \$h{"$_->[1]"} = 1;
 226             my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
 227          qr/^>$_->[2]<$/],
 228         ["use utf8; \$h{\"x\"} construction, where x is utf8 for chr $_->[0]",
 229          qq(use utf8; my \$a = "$_->[1]"; my %h; \$h{"$_->[1]"} = 1;
 230             my \$b = show keys %h; \$b .= 'F' unless \$h{chr $_->[0]}; \$b),
 231          qr/^>$_->[0]<$/],
 232         # Now check "x" => constructions.
 233         ["assign \"x\"=>1 to a hash, where x is utf8 for chr $_->[0]",
 234          qq(my \$a = "$_->[1]"; my %h; %h = ("$_->[1]" => 1);
 235             my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
 236          qr/^>$_->[2]<$/],
 237         ["no utf8; assign \"x\"=>1 to a hash, where x is utf8 for chr $_->[0]",
 238          qq(no utf8; my \$a = "$_->[1]"; my %h; %h = ("$_->[1]" => 1);
 239             my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
 240          qr/^>$_->[2]<$/],
 241         ["use utf8; assign \"x\"=>1 to a hash, where x is utf8 for chr $_->[0]",
 242          qq(use utf8; my \$a = "$_->[1]"; my %h; %h = ("$_->[1]" => 1);
 243             my \$b = show keys %h; \$b .= 'F' unless \$h{chr $_->[0]}; \$b),
 244          qr/^>$_->[0]<$/],
 245         # Check copies of hashes made from literal utf8 keys
 246         ["assign utf8 for chr $_->[0] to a hash, then copy it",
 247          qq(my \$a = "$_->[1]"; my %i; \$i{\$a} = 1; my %h = %i;
 248             my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
 249          qr/^>$_->[2]<$/],
 250         ["no utf8; assign utf8 for chr $_->[0] to a hash, then copy it",
 251          qq(no utf8; my \$a = "$_->[1]"; my %i; \$i{\$a} = 1;; my %h = %i;
 252             my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
 253          qr/^>$_->[2]<$/],
 254         ["use utf8; assign utf8 for chr $_->[0] to a hash, then copy it",
 255          qq(use utf8; my \$a = "$_->[1]"; my %i; \$i{\$a} = 1; my %h = %i;
 256             my \$b = show keys %h; \$b .= 'F' unless \$h{chr $_->[0]}; \$b),
 257          qr/^>$_->[0]<$/],
 258      } @char);
 259     for my $entry (@tests) {
 260         my ($why, $prog, $expect) = @$entry;
 261         # print "# $prog\n";    # uncomment to see each program
 262         my $got = eval $prog;
 263         like ($got, $expect, $why);
 264     }
 265 }