lib/utf8.t

   1 #!./perl
   2
   3 my $has_perlio;
   4
   5 BEGIN {
         # Compile-time setup: move into t/ when that directory exists, point
         # @INC at ../lib, and load the shared test helpers.  $has_perlio
         # records the result of looking up the 'perlio' layer; it is consulted
         # later before binmode() is applied to the generated program file.
   6     chdir 't' if -d 't';
   7     @INC = '../lib';
   8     require './test.pl'; require './charset_tools.pl';
   9     unless ($has_perlio = find PerlIO::Layer 'perlio') {
             # Diagnostic only: the tests still run without perlio.
  10         print <<EOF;
  11 # Since you don't have perlio you might get failures with UTF-8 locales.
  12 EOF
  13     }
  14 }
  15
  16 use strict;
  17 use warnings;
  18 no utf8; # Ironic, no?
  19
  20 # NOTE!
  21 #
  22 # Think carefully before adding tests here.  In general this should be
  23 # used only for about three categories of tests:
  24 #
  25 # (1) tests that absolutely require 'use utf8', and since that in general
  26 #     shouldn't be needed as the utf8 is being obsoleted, this should
  27 #     have rather few tests.  If you want to test Unicode and regexes,
  28 #     you probably want to go to op/regexp or op/pat; if you want to test
  29 #     split, go to op/split; pack, op/pack; appending or joining,
  30 #     op/append or op/join, and so forth
  31 #
  32 # (2) tests that have to do with Unicode tokenizing (though it's likely
  33 #     that all the other Unicode tests sprinkled around the t/**/*.t are
  34 #     going to catch that)
  35 #
  36 # (3) complicated tests that simultaneously stress so many Unicode features
  37 #     that deciding into which other test script the tests should go to
  38 #     is hard -- maybe consider breaking up the complicated test
  39 #
  40 #
  41
  42 {
  43     # bug id 20001009.001 (#4409)
         # $a is assigned under 'use bytes' and $b under 'use utf8'.  The two
         # strings must compare unequal both outside and inside the scope of
         # 'use utf8'.
  44
  45     my ($a, $b);
  46
  47     { use bytes; $a = byte_utf8a_to_utf8n("\xc3\xa4") }
  48     { use utf8;  $b = uni_to_native("\xe4")     }
  49
  52     ok($a ne $b, "bytes-built string ne utf8-built string [bug 20001009.001]");
  53
  54     { use utf8; ok($a ne $b, "... and likewise under 'use utf8'") }
  55 }
  56
  57
  58 {
  59     # bug id 20000730.004 (#3599)
         # Each way of producing the same wide-character string (literal,
         # variable, concatenation with "", interpolation, repetition) must
         # agree on its length in characters -- as reported by length(), by a
         # global regex match on (.) and by split // -- and on its length in
         # bytes under 'use bytes'.
  60
  61     my $smiley = "\x{263a}";
  62
         # A single U+263A: the test expects 1 by each character measure and 3
         # bytes.
  63     for my $s ("\x{263a}",
  64                $smiley,
  65
  66                "" . $smiley,
  67                "" . "\x{263a}",
  68
  69                $smiley    . "",
  70                "\x{263a}" . "",
  71                ) {
  72         my $length_chars = length($s);
  73         my $length_bytes;
  74         { use bytes; $length_bytes = length($s) }
  75         my @regex_chars = $s =~ m/(.)/g;
  76         my $regex_chars = @regex_chars;
  77         my @split_chars = split //, $s;
  78         my $split_chars = @split_chars;
  79         ok("$length_chars/$regex_chars/$split_chars/$length_bytes" eq
  80            "1/1/1/3");
  81     }
  82
         # Two U+263A: the test expects 2 by each character measure and 6
         # bytes.
  83     for my $s ("\x{263a}" . "\x{263a}",
  84                $smiley    . $smiley,
  85
  86                "\x{263a}\x{263a}",
  87                "$smiley$smiley",
  88
  89                "\x{263a}" x 2,
  90                $smiley    x 2,
  91                ) {
  92         my $length_chars = length($s);
  93         my $length_bytes;
  94         { use bytes; $length_bytes = length($s) }
  95         my @regex_chars = $s =~ m/(.)/g;
  96         my $regex_chars = @regex_chars;
  97         my @split_chars = split //, $s;
  98         my $split_chars = @split_chars;
  99         ok("$length_chars/$regex_chars/$split_chars/$length_bytes" eq
 100            "2/2/2/6");
 101     }
 102 }
 103
 104
 105 {
         # Evaluate a double-quoted string whose only content is a backslash
         # followed by chr(0x100).  The handler counts (and echoes as a TAP
         # comment) any warning raised; the test requires that none is raised
         # and that the result is the bare character.
 106     my $w = 0;
 107     local $SIG{__WARN__} = sub { print "#($_[0])\n"; $w++ };
 108     my $x = eval q/"\\/ . "\x{100}" . q/"/;
 109
 110     ok($w == 0 && $x eq "\x{100}",
             "backslash before a wide character: no warning, value preserved");
 111 }
 112
 113 {
         # Tokenizer tests.  Small programs are built as strings containing raw
         # UTF-8 byte sequences, then run twice: first from a separate program
         # file through runperl(), then through string eval.  Each program
         # reports the ordinals of the characters it sees via show().
         #
         # $show holds the source of show(); it is eval'd here and also
         # prepended to every generated program file.
 114     my $show = q(
 115                  sub show {
 116                    my $result;
 117                    $result .= '>' . join (',', map {ord} split //, $_) . '<'
 118                      foreach @_;
 119                    $result;
 120                  }
 121                  1;
 122                 );
 123     eval $show or die $@; # We don't expect this sub definition to fail.
 124     my $progfile = 'utf' . $$;
 125     END {unlink_all $progfile}
 126
 127     # 64 is '@' in ASCII, ' ' in EBCDIC
 128     # 193 is not punctuation in either ASCII or EBCDIC
         # Each @char entry is
         #   [ code point, its utf8::encode()d bytes,
         #     the comma-joined ordinals of those bytes,
         #     a Perl expression "chr (N) . chr (M) ..." rebuilding those bytes ].
 129     my (@char);
 130     foreach (64, 193, 257, 65532) {
 131       my $char = chr $_;
 132       utf8::encode($char);
 133       # I don't want to use map {ord} and I've no need to hardcode the UTF
 134       # version
 135       my $charsubst = $char;
 136       $charsubst =~ s/(.)/ord ($1) . ','/ge;
 137       chop $charsubst;
 138       # Not testing this one against map {ord}
 139       my $char_as_ord
 140           = join " . ", map {sprintf 'chr (%d)', ord $_} split //, $char;
 141       push @char, [$_, $char, $charsubst, $char_as_ord];
 142     }
 143     my $malformed = $::IS_ASCII
 144                     ? "\xE1\xA0"
 145                     : I8_to_native("\xE6\xA0");
 146     # Now we've done all the UTF8 munching hopefully we're safe
         # Each @tests entry is [ description, program text, expected regex ].
         # A leading '!' on the description marks a program that must only be
         # run from a file, never through eval.
 147     my @tests = (
 148              ['check our detection program works',
 149               'my @a = ("'.chr(64).'\x2A", ""); $b = show @a', qr/^>64,42<><$/],
 150              ['check literal 8 bit input',
 151               '$a = "' . chr (193) . '"; $b = show $a', qr/^>193<$/],
 152              ['check no utf8; makes no change',
 153               'no utf8; $a = "' . chr (193) . '"; $b = show $a', qr/^>193<$/],
 154              # Now we do the real byte sequences that are valid UTF8
 155              (map {
 156                ["the utf8 sequence for chr $_->[0]",
 157                 qq{\$a = "$_->[1]"; \$b = show \$a}, qr/^>$_->[2]<$/],
 158                ["no utf8; for the utf8 sequence for chr $_->[0]",
 159                 qq(no utf8; \$a = "$_->[1]"; \$b = show \$a), qr/^>$_->[2]<$/],
 160                ["use utf8; for the utf8 sequence for chr $_->[0]",
 161                 qq(use utf8; \$a = "$_->[1]"; \$b = show \$a), qr/^>$_->[0]<$/],
 162               } @char),
 163              # Interpolation of hex characters needs to take place now, as we're
 164              # testing feeding malformed utf8 into perl. Bug now fixed was an
 165              # "out of memory" error. We really need the "" [rather than qq()
 166              # or q()] to get the best explosion.
 167              ["!Feed malformed utf8 into perl.", <<"BANG",
 168     use utf8; %a = ("$malformed" =>"sterling");
 169     print 'start'; printf '%x,', ord \$_ foreach keys %a; print "end\n";
 170 BANG
 171               qr/^Malformed UTF-8 character: .*? \(unexpected non-continuation byte/
 172              ],
 173             );
 174     foreach (@tests) {
 175         my ($why, $prog, $expect) = @$_;
             # Write show() plus the program to the file as raw bytes, using a
             # lexical handle, and name the actual file in every failure
             # message.
 176         open my $prog_fh, ">", $progfile or die "Can't open '$progfile': $!";
 177         binmode($prog_fh, ":bytes") if $has_perlio;
 178         print {$prog_fh} $show, $prog, '; print $b'
 179             or die "Print to '$progfile' failed: $!";
 180         close $prog_fh or die "Can't close '$progfile': $!";
 181         if ($why =~ s/^!//) {
 182             print "# Possible delay...\n";
 183         } else {
 184             print "# $prog\n";
 185         }
 186         my $result = runperl ( stderr => 1, progfile => $progfile );
 187         like ($result, $expect, $why);
 188     }
 189     print
 190         "# Again! Again! [but this time as eval, and not the explosive one]\n";
 191     # and now we've safely done them all as separate files, check that the
 192     # evals do the same thing. Hopefully doing it later successfully decouples
 193     # the previous tests from anything messy that may go wrong with the evals.
 194     foreach (@tests) {
 195         my ($why, $prog, $expect) = @$_;
 196         next if $why =~ m/^!/; # Goes bang.
 197         my $result = eval $prog;
 198         if ($@) {
 199             print "# prog is $prog\n";
 200             print "# \$\@=", _qq($@), "\n";
 201         }
 202         like ($result, $expect, $why);
 203     }
 204
 205     # See what the tokeniser does with hash keys.
 206     print "# What does the tokeniser do with utf8 hash keys?\n";
 207     @tests = (map {
 208         # This is the control - I don't expect it to fail
 209         ["assign utf8 for chr $_->[0] to a hash",
 210          qq(my \$a = "$_->[1]"; my %h; \$h{\$a} = 1;
 211             my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
 212          qr/^>$_->[2]<$/],
 213         ["no utf8; assign utf8 for chr $_->[0] to a hash",
 214          qq(no utf8; my \$a = "$_->[1]"; my %h; \$h{\$a} = 1;
 215             my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
 216          qr/^>$_->[2]<$/],
 217         ["use utf8; assign utf8 for chr $_->[0] to a hash",
 218          qq(use utf8; my \$a = "$_->[1]"; my %h; \$h{\$a} = 1;
 219             my \$b = show keys %h; \$b .= 'F' unless \$h{chr $_->[0]}; \$b),
 220          qr/^>$_->[0]<$/],
 221         # Now check literal $h{"x"} constructions.
 222         ["\$h{\"x\"} construction, where x is utf8 for chr $_->[0]",
 223          qq(my \$a = "$_->[1]"; my %h; \$h{"$_->[1]"} = 1;
 224             my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
 225          qr/^>$_->[2]<$/],
 226         ["no utf8; \$h{\"x\"} construction, where x is utf8 for chr $_->[0]",
 227          qq(no utf8; my \$a = "$_->[1]"; my %h; \$h{"$_->[1]"} = 1;
 228             my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
 229          qr/^>$_->[2]<$/],
 230         ["use utf8; \$h{\"x\"} construction, where x is utf8 for chr $_->[0]",
 231          qq(use utf8; my \$a = "$_->[1]"; my %h; \$h{"$_->[1]"} = 1;
 232             my \$b = show keys %h; \$b .= 'F' unless \$h{chr $_->[0]}; \$b),
 233          qr/^>$_->[0]<$/],
 234         # Now check "x" => constructions.
 235         ["assign \"x\"=>1 to a hash, where x is utf8 for chr $_->[0]",
 236          qq(my \$a = "$_->[1]"; my %h; %h = ("$_->[1]" => 1);
 237             my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
 238          qr/^>$_->[2]<$/],
 239         ["no utf8; assign \"x\"=>1 to a hash, where x is utf8 for chr $_->[0]",
 240          qq(no utf8; my \$a = "$_->[1]"; my %h; %h = ("$_->[1]" => 1);
 241             my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
 242          qr/^>$_->[2]<$/],
 243         ["use utf8; assign \"x\"=>1 to a hash, where x is utf8 for chr $_->[0]",
 244          qq(use utf8; my \$a = "$_->[1]"; my %h; %h = ("$_->[1]" => 1);
 245             my \$b = show keys %h; \$b .= 'F' unless \$h{chr $_->[0]}; \$b),
 246          qr/^>$_->[0]<$/],
 247         # Check copies of hashes made from literal utf8 keys
 248         ["assign utf8 for chr $_->[0] to a hash, then copy it",
 249          qq(my \$a = "$_->[1]"; my %i; \$i{\$a} = 1; my %h = %i;
 250             my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
 251          qr/^>$_->[2]<$/],
 252         ["no utf8; assign utf8 for chr $_->[0] to a hash, then copy it",
 253          qq(no utf8; my \$a = "$_->[1]"; my %i; \$i{\$a} = 1;; my %h = %i;
 254             my \$b = show keys %h; \$b .= 'F' unless \$h{$_->[3]}; \$b),
 255          qr/^>$_->[2]<$/],
 256         ["use utf8; assign utf8 for chr $_->[0] to a hash, then copy it",
 257          qq(use utf8; my \$a = "$_->[1]"; my %i; \$i{\$a} = 1; my %h = %i;
 258             my \$b = show keys %h; \$b .= 'F' unless \$h{chr $_->[0]}; \$b),
 259          qr/^>$_->[0]<$/],
 260      } @char);
         # The hash-key programs are run through eval only.
 261     foreach (@tests) {
 262         my ($why, $prog, $expect) = @$_;
 263         # print "# $prog\n";
 264         my $result = eval $prog;
 265         like ($result, $expect, $why);
 266     }
 267 }
 268
 269 #
 270 # bug fixed by change #17928
 271 # separate perl used because we rely on 'strict' not yet loaded;
 272 # before the patch, the eval died with an error like:
 273 #   "my" variable $strict::VERSION can't be in a package
 274 #
 275 SKIP: {
         # Runs a child perl whose program string-evals, under 'use utf8', a
         # 'my' declaration of a variable whose name is written as escaped
         # bytes.  The child prints $@ only on failure, so the expected output
         # is the empty string.  The ASCII and EBCDIC branches differ only in
         # the byte values used for the name.
 276     skip("Haven't bothered to port this to EBCDIC non-1047", 1) if $::IS_EBCDIC
 277                                                                 && ord '^' != 95;
 278     if ($::IS_ASCII) {
 279         ok('' eq runperl(prog => <<'CODE'), "change #17928");
 280             my $code = qq{ my \$\xe3\x83\x95\xe3\x83\xbc = 5; };
 281         {
 282             use utf8;
 283             eval $code;
 284             print $@ if $@;
 285         }
 286 CODE
 287     }
 288     else {
 289         ok('' eq runperl(prog => <<'CODE'), "change #17928");
 290             my $code = qq{ my \$\xCE\x47\x64\xCE\x48\x70 = 5; };
 291         {
 292             use utf8;
 293             eval $code;
 294             print $@ if $@;
 295         }
 296 CODE
 297     }
 298 }
 299
 300 {
         # index() and rindex() on a heredoc string read under 'use utf8'.
         # The string is made of ten-character groups "N ....... ", so digit N
         # sits at offset 10*N and the dots of its group at 10*N+2 .. 10*N+8.
 301     use utf8;
 302     $a = <<'END';
 303 0 ....... 1 ....... 2 ....... 3 ....... 4 ....... 5 ....... 6 ....... 7 .......
 304 END
 305     my (@i, $s);
 306
 307     @i = ();
 308     push @i, $s = index($a, '6');     # 60
 309     push @i, $s = index($a, '.', $s); # next . after 60 is 62
 310     push @i, $s = index($a, '5');     # 50
 311     push @i, $s = index($a, '.', $s); # next . after 50 is 52
 312     push @i, $s = index($a, '7');     # 70
 313     push @i, $s = index($a, '.', $s); # next . after 70 is 72
 314     push @i, $s = index($a, '4');     # 40
 315     push @i, $s = index($a, '.', $s); # next . after 40 is 42
 316     is("@i", "60 62 50 52 70 72 40 42", "utf8 heredoc index");
 317
 318     @i = ();
 319     push @i, $s = rindex($a, '6');     # 60
 320     push @i, $s = rindex($a, '.', $s); # previous . before 60 is 58
 321     push @i, $s = rindex($a, '5');     # 50
 322     push @i, $s = rindex($a, '.', $s); # previous . before 50 is 48
 323     push @i, $s = rindex($a, '7');     # 70
 324     push @i, $s = rindex($a, '.', $s); # previous . before 70 is 68
 325     push @i, $s = rindex($a, '4');     # 40
 326     push @i, $s = rindex($a, '.', $s); # previous . before 40 is 38
 327     is("@i", "60 58 50 48 70 68 40 38", "utf8 heredoc rindex");
 328
         # Mixed: search in both directions from the same starting offset.
 329     @i = ();
 330     push @i, $s =  index($a, '6');     # 60
 331     push @i,  index($a, '.', $s);      # next     . after  60 is 62
 332     push @i, rindex($a, '.', $s);      # previous . before 60 is 58
 333     push @i, $s = rindex($a, '5');     # 50
 334     push @i,  index($a, '.', $s);      # next     . after  50 is 52
 335     push @i, rindex($a, '.', $s);      # previous . before 50 is 48
 336     push @i, $s =  index($a, '7', $s); # 70
 337     push @i,  index($a, '.', $s);      # next     . after  70 is 72
 338     push @i, rindex($a, '.', $s);      # previous . before 70 is 68
 339     is("@i", "60 62 58 50 52 48 70 72 68", "utf8 heredoc index and rindex");
 340 }
 341
 342 SKIP: {
         # The eval'd source uses byte pairs (written here as \x escapes) as the
         # delimiters of q and qq.  Both quoted strings are ' test ', so their
         # concatenation must be ' test  test '.  The ASCII and EBCDIC branches
         # differ only in the byte values used for the delimiters.
 343     skip("Haven't bothered to port this to EBCDIC non-1047", 1) if $::IS_EBCDIC
 344                                                                 && ord '^' != 95;
 345     use utf8;
 346     if ($::IS_ASCII) {
 347         is eval qq{q \xc3\xbc test \xc3\xbc . qq\xc2\xb7 test \xc2\xb7},
 348         ' test  test ',
 349         "utf8 quote delimiters [perl #16823]";
 350     }
 351     else {
 352         is eval qq{q \x8B\x70 test \x8B\x70 . qq\x80\x66 test \x80\x66},
 353         ' test  test ',
 354         "utf8 quote delimiters [perl #16823]";
 355     }
 356 }
 357
 358 # Test the "internals".
 359
 360 {
         # Walks four strings through upgrade, downgrade, encode and decode,
         # checking utf8::valid, utf8::is_utf8, the string value and (after
         # encode) the length at every stage.  $d is never downgraded.
 361     my $a = "A";
 362     my $b = chr(0x0FF);
 363     my $c = chr(0x0DF);  # FF is invariant in many EBCDIC pages, so is not a
 364                          # fair test of 'beyond'; but DF is variant (in all
 365                          # supported EBCDIC pages so far), so make 2 'beyond'
 366                          # tests
 367     my $d = chr(0x100);
 368
         # Initial state: all valid; only $d carries the UTF-8 flag.
 369     ok( utf8::valid($a), "utf8::valid basic");
 370     ok( utf8::valid($b), "utf8::valid beyond");
 371     ok( utf8::valid($c), "utf8::valid beyond");
 372     ok( utf8::valid($d), "utf8::valid unicode");
 373
 374     ok(!utf8::is_utf8($a), "!utf8::is_utf8 basic");
 375     ok(!utf8::is_utf8($b), "!utf8::is_utf8 beyond");
 376     ok(!utf8::is_utf8($c), "!utf8::is_utf8 beyond");
 377     ok( utf8::is_utf8($d), "utf8::is_utf8 unicode");
 378
         # upgrade: check the return value, then that the string values are
         # unchanged and every string now carries the UTF-8 flag.
 379     is(utf8::upgrade($a), 1, "utf8::upgrade basic");
 380     if ($::IS_EBCDIC) { # EBCDIC.
 381         is(utf8::upgrade($b), 1, "utf8::upgrade beyond");
 382     } else {
 383         is(utf8::upgrade($b), 2, "utf8::upgrade beyond");
 384     }
 385     is(utf8::upgrade($c), 2, "utf8::upgrade beyond");
 386     is(utf8::upgrade($d), 2, "utf8::upgrade unicode");
 387
 388     is($a, "A",       "basic");
 389     is($b, "\xFF",    "beyond");
 390     is($c, "\xDF",    "beyond");
 391     is($d, "\x{100}", "unicode");
 392
 393     ok( utf8::valid($a), "utf8::valid basic");
 394     ok( utf8::valid($b), "utf8::valid beyond");
 395     ok( utf8::valid($c), "utf8::valid beyond");
 396     ok( utf8::valid($d), "utf8::valid unicode");
 397
 398     ok( utf8::is_utf8($a), "utf8::is_utf8 basic");
 399     ok( utf8::is_utf8($b), "utf8::is_utf8 beyond");
 400     ok( utf8::is_utf8($c), "utf8::is_utf8 beyond");
 401     ok( utf8::is_utf8($d), "utf8::is_utf8 unicode");
 402
         # downgrade ($a, $b, $c only): values unchanged, UTF-8 flag cleared.
 403     is(utf8::downgrade($a), 1, "utf8::downgrade basic");
 404     is(utf8::downgrade($b), 1, "utf8::downgrade beyond");
 405     is(utf8::downgrade($c), 1, "utf8::downgrade beyond");
 406
 407     is($a, "A",       "basic");
 408     is($b, "\xFF",    "beyond");
 409     is($c, "\xDF",    "beyond");
 410
 411     ok( utf8::valid($a), "utf8::valid basic");
 412     ok( utf8::valid($b), "utf8::valid beyond");
 413     ok( utf8::valid($c), "utf8::valid beyond");
 414
 415     ok(!utf8::is_utf8($a), "!utf8::is_utf8 basic");
 416     ok(!utf8::is_utf8($b), "!utf8::is_utf8 beyond");
 417     ok(!utf8::is_utf8($c), "!utf8::is_utf8 beyond");
 418
         # encode: check the resulting lengths and that no string is flagged.
 419     utf8::encode($a);
 420     utf8::encode($b);
 421     utf8::encode($c);
 422     utf8::encode($d);
 423
 424     is($a, "A",       "basic");
 425     if ($::IS_EBCDIC) { # EBCDIC.
 426         is(length($b), 1, "beyond length");
 427     } else {
 428         is(length($b), 2, "beyond length");
 429     }
 430     is(length($c), 2, "beyond length");
 431     is(length($d), 2, "unicode length");
 432
 433     ok(utf8::valid($a), "utf8::valid basic");
 434     ok(utf8::valid($b), "utf8::valid beyond");
 435     ok(utf8::valid($c), "utf8::valid beyond");
 436     ok(utf8::valid($d), "utf8::valid unicode");
 437
 438     # encode() clears the UTF-8 flag (unlike upgrade()).
 439     ok(!utf8::is_utf8($a), "!utf8::is_utf8 basic");
 440     ok(!utf8::is_utf8($b), "!utf8::is_utf8 beyond");
 441     ok(!utf8::is_utf8($c), "!utf8::is_utf8 beyond");
 442     ok(!utf8::is_utf8($d), "!utf8::is_utf8 unicode");
 443
         # decode: the original character values come back.
 444     utf8::decode($a);
 445     utf8::decode($b);
 446     utf8::decode($c);
 447     utf8::decode($d);
 448
 449     is($a, "A",       "basic");
 450     is($b, "\xFF",    "beyond");
 451     is($c, "\xDF",    "beyond");
 452     is($d, "\x{100}", "unicode");
 453
         # These assert that the strings ARE valid, so the names carry no '!'
         # (the leading space only keeps the output columns aligned).
 454     ok(utf8::valid($a), " utf8::valid basic");
 455     ok(utf8::valid($b), " utf8::valid beyond");
 456     ok(utf8::valid($c), " utf8::valid beyond");
 457     ok(utf8::valid($d), " utf8::valid unicode");
 458
 459     ok(!utf8::is_utf8($a), "!utf8::is_utf8 basic");
 460     if ($::IS_EBCDIC) { # EBCDIC.
 461         ok( utf8::is_utf8(pack('U',0x0ff)), " utf8::is_utf8 beyond");
 462     } else {
 463         ok( utf8::is_utf8($b), " utf8::is_utf8 beyond"); # $b stays in UTF-8.
 464     }
 465     ok( utf8::is_utf8($c), " utf8::is_utf8 beyond"); # $c stays in UTF-8.
 466     ok( utf8::is_utf8($d), " utf8::is_utf8 unicode");
 467 }
 468
 469 {
         # utf8::encode() works in place, so passing it a string literal must
         # die with the read-only modification error rather than succeed.
 470     eval {utf8::encode("£")};
 471     like($@, qr/^Modification of a read-only value attempted/,
 472          "utf8::encode should refuse to touch read-only values");
 473 }
 474
 475 {
 476     # Make sure utf8::decode respects copy-on-write [perl #91834].
 477     # Hash keys are the easiest way to test this.
         # $k1 comes out of keys() on an anonymous hash and $k2 is a plain copy
         # of $name.  After both are decoded they must be the same hash key, so
         # the final hash is expected to hold exactly one key, equal to $k2.
 478     my $name = byte_utf8a_to_utf8n("\x{c3}\x{b3}");
 479     my ($k1) = keys %{ { $name=>undef } };
 480     my $k2 = $name;
 481     utf8::decode($k1);
 482     utf8::decode($k2);
 483     my $h = { $k1 => 1, $k2 => 2 };
 484     is join('', keys %$h), $k2, 'utf8::decode respects copy-on-write';
 485 }
 486
 487 {
 488     # Make sure utf8::decode does not modify read-only scalars
 489     # [perl #91850].
         # The scalar is marked read-only through Internals::SvREADONLY; the
         # decode is wrapped in eval so the expected die can be matched in $@.
 490
 491     my $name = byte_utf8a_to_utf8n("\x{c3}\x{b3}");
 492     Internals::SvREADONLY($name, 1);
 493     eval { utf8::decode($name) };
 494     like $@, qr/^Modification of a read-only/,
 495         'utf8::decode respects readonliness';
 496 }
 497
 498 {
 499     # utf8::decode should stringify refs [perl #91852].
         # eieifg overloads stringification to return the result of
         # byte_utf8a_to_utf8n("\x{c3}\x{b3}").  After utf8::decode, $name is
         # expected to equal the plain string uni_to_native("\xf3").
 500
 501     package eieifg { use overload '""'      => sub { main::byte_utf8a_to_utf8n("\x{c3}\x{b3}") },
 502                                    fallback => 1 }
 503
 504     my $name = bless[], eieifg::;
 505     utf8::decode($name);
 506     is $name, uni_to_native("\xf3"), 'utf8::decode flattens references';
 507 }
 508
 509 {
 510     # What do the utf8::* functions do when given a reference? A test
 511     # for a behavior change that made this start dying as of
 512     # v5.15.6-407-gc710240 due to a fix for [perl #91852]:
 513     #
 514     #    ./miniperl -Ilib -wle 'use strict; print $]; my $s = shift; my $s_ref = \$s; utf8::decode($s_ref); print $$s_ref' hlagh
         #
         # %expected maps each function name to either
         #   { returns => VALUE }  - the generated code must evaluate to VALUE
         #   { error   => REGEX }  - the generated code must die matching REGEX
 515     my %expected = (
 516         'utf8::is_utf8'           => { returns => "hlagh" },
 517         'utf8::valid'             => { returns => "hlagh" },
 518         'utf8::encode'            => { error => qr/Can't use string .*? as a SCALAR ref/},
 519         'utf8::decode'            => { error => qr/Can't use string .*? as a SCALAR ref/},
 520         'utf8::upgrade'           => { error => qr/Can't use string .*? as a SCALAR ref/ },
 521         'utf8::downgrade'         => { returns => "hlagh" },
 522         'utf8::native_to_unicode' => { returns => "hlagh" },
 523         'utf8::unicode_to_native' => { returns => "hlagh" },
 524     );
 525     for my $func (sort keys %expected) { # sort just so it's deterministic wrt diffing *.t output
 526         my $code = sprintf q[
 527             use strict;
 528             my $s = "hlagh";
 529             my $r = \$s;
 530             my $dummy = %s($r);
 531             $$r;
 532         ], $func;
             # Declare $error unconditionally rather than inside the right-hand
             # side of 'or', so the variable is always a properly initialised
             # lexical.  It is still assigned only when the eval returns a
             # false value.
             my $error;
 533         my $ret = eval $code or $error = $@;
 534         if (my $error_rx = $expected{$func}->{error}) {
 535             if (defined $error) {
 536                 like $error, $error_rx, "The $func function should die with an error matching $error_rx";
 537             } else {
 538                 fail("We were expecting an error when calling the $func function but got a value of '$ret' instead");
 539             }
 540         } elsif (my $returns = $expected{$func}->{returns}) {
 541             is($ret, $returns, "The $func function lives and returns '$returns' as expected");
 542         } else {
 543             die "PANIC: Internal Error"
 544         }
 545     }
 546 }
 547
 548 {
         # $b reaches the same content as $a by having its first three
         # characters removed with s///.  After utf8::upgrade the two strings
         # must still compare equal.  (The test name refers to "OffsetOK";
         # presumably the substitution leaves $b with an internal offset --
         # not visible from this file.)
 549     my $a = "456" . uni_to_native("\xb6");
 550     utf8::upgrade($a);
 551
 552     my $b = "123456" . uni_to_native("\xb6");
 553     $b =~ s/^...//;
 554     utf8::upgrade($b);
 555     is($b, $a, "utf8::upgrade OffsetOK");
 556 }
 557
 558 {
         # Calling a nonexistent utf8:: function in a fresh perl must report
         # "Undefined subroutine utf8::moo"; stderr is captured so the message
         # can be matched.
 559     fresh_perl_like ('use utf8; utf8::moo()',
 560                      qr/Undefined subroutine utf8::moo/, {stderr=>1},
 561                     "Check Carp is loaded for AUTOLOADing errors")
 562 }
 563
 564 {
 565     # failure of is_utf8_char() without NATIVE_TO_UTF on EBCDIC (0260..027F)
         # Probes one code point below, two inside and one above that range.
 566     ok(utf8::valid(chr(0x250)), "0x250");
 567     ok(utf8::valid(chr(0x260)), "0x260");
 568     ok(utf8::valid(chr(0x270)), "0x270");
 569     ok(utf8::valid(chr(0x280)), "0x280");
 570 }
 571
 572 {
        # Under 'use utf8', the word "asd" written in each of these quoting or
        # bareword forms must not carry the UTF-8 flag.  Warnings and strict
        # 'subs' are relaxed step by step, just before the forms that need it.
 573    use utf8;
 574    ok( !utf8::is_utf8( "asd"         ), "Wasteful format - qq{}" );
 575    ok( !utf8::is_utf8( 'asd'         ), "Wasteful format - q{}" );
 576    ok( !utf8::is_utf8( qw(asd)       ), "Wasteful format - qw{}" );
 577    ok( !utf8::is_utf8( (asd => 1)[0] ), "Wasteful format - =>" );
 578    ok( !utf8::is_utf8( -asd          ), "Wasteful format - -word" );
 579    no warnings 'bareword';
 580    ok( !utf8::is_utf8( asd::         ), "Wasteful format - word::" );
 581    no warnings 'reserved';
 582    no strict 'subs';
 583    ok( !utf8::is_utf8( asd           ), "Wasteful format - bareword" );
 584 }
 585
 586 {
         # Both tables are indexed by $length (2..6).  For each length, $high
         # starts at $highest[$length] and walks down by $step[$length] while
         # it stays above $highest[$length - 1].  At each step, chr($low) and
         # chr($high) must be utf8::valid, where $low is the bottom of the
         # current step, clamped to just above $highest[$length - 1].
 587     my @highest =
 588         (undef, 0x7F, 0x7FF, 0xFFFF, 0x1FFFFF, 0x3FFFFFF, 0x7FFFFFFF);
 589     my @step =
 590         (undef, undef, 0x40, 0x1000, 0x40000, 0x1000000, 0x40000000);
 591
 592     foreach my $length (6, 5, 4, 3, 2) {
 593         my $high = $highest[$length];
 594         while ($high > $highest[$length - 1]) {
 595             my $low = $high - $step[$length] + 1;
 596             $low = $highest[$length - 1] + 1 if $low <= $highest[$length - 1];
                 # 'utf8' warnings are disabled only around the chr() calls.
 597             ok(utf8::valid(do {no warnings 'utf8'; chr $low}),
 598                sprintf "chr %x, length $length is valid", $low);
 599             ok(utf8::valid(do {no warnings 'utf8'; chr $high}),
 600                sprintf "chr %x, length $length is valid", $high);
 601             $high -= $step[$length];
 602         }
 603     }
 604 }
 605
 606 # #80190 update pos, and cached length/position-mapping after
 607 # utf8 upgrade/downgrade, encode/decode
 608
 609 for my $pos (0..5) {
         # For every starting pos(), check that length, pos and the string
         # value stay consistent across downgrade/upgrade (pos is kept) and
         # across decode/encode (pos is expected to become undef).
 610
 612     my $utf8_bytes = byte_utf8a_to_utf8n("\xc8\x81\xe3\xbf\xbf");
         # Appending \x{100} and chopping it off again leaves "A$utf8_bytes";
         # presumably this is what makes the following utf8::downgrade do real
         # work -- the flag itself is not checked here.
 613     my $s = "A$utf8_bytes\x{100}";
 614     chop($s);
 615
 616     pos($s) = $pos;
 617     # also sets cache
 618     is(length($s), 6,              "(pos $pos) len before    utf8::downgrade");
 619     is(pos($s),    $pos,           "(pos $pos) pos before    utf8::downgrade");
 620     utf8::downgrade($s);
 621     is(length($s), 6,              "(pos $pos) len after     utf8::downgrade");
 622     is(pos($s),    $pos,           "(pos $pos) pos after     utf8::downgrade");
 623     is($s, "A$utf8_bytes","(pos $pos) str after     utf8::downgrade");
 624     utf8::decode($s);
 625     is(length($s), 3,              "(pos $pos) len after  D; utf8::decode");
 626     is(pos($s),    undef,          "(pos $pos) pos after  D; utf8::decode");
 627     is($s, "A\x{201}\x{3fff}",     "(pos $pos) str after  D; utf8::decode");
 628     utf8::encode($s);
 629     is(length($s), 6,              "(pos $pos) len after  D; utf8::encode");
 630     is(pos($s),    undef,          "(pos $pos) pos after  D; utf8::encode");
 631     is($s, "A$utf8_bytes","(pos $pos) str after  D; utf8::encode");
 632
         # Same sequence again, this time going through utf8::upgrade first.
 633     $s = "A$utf8_bytes";
 634
 635     pos($s) = $pos;
 636     is(length($s), 6,              "(pos $pos) len before    utf8::upgrade");
 637     is(pos($s),    $pos,           "(pos $pos) pos before    utf8::upgrade");
 638     utf8::upgrade($s);
 639     is(length($s), 6,              "(pos $pos) len after     utf8::upgrade");
 640     is(pos($s),    $pos,           "(pos $pos) pos after     utf8::upgrade");
 641     is($s, "A$utf8_bytes","(pos $pos) str after     utf8::upgrade");
 642     utf8::decode($s);
 643     is(length($s), 3,              "(pos $pos) len after  U; utf8::decode");
 644     is(pos($s),    undef,          "(pos $pos) pos after  U; utf8::decode");
 645     is($s, "A\x{201}\x{3fff}",     "(pos $pos) str after  U; utf8::decode");
 646     utf8::encode($s);
 647     is(length($s), 6,              "(pos $pos) len after  U; utf8::encode");
 648     is(pos($s),    undef,          "(pos $pos) pos after  U; utf8::encode");
 649     is($s, "A$utf8_bytes","(pos $pos) str after  U; utf8::encode");
 650 }
 651
 652 SKIP: {
         # Runs a child perl with -MO=Concise on a program that calls
         # utf8::unicode_to_native and utf8::native_to_unicode with constant
         # arguments.  The op dump must contain no 'entersub', i.e. no
         # subroutine call op is left in the compiled program.
 653     skip("Test only valid on ASCII platform", 1) unless $::IS_ASCII;
 654     require Config;
 655     skip("Test needs a B module, which is lacking in this Perl", 1)
 656         if $Config::Config{'extensions'} !~ /\bB\b/;
 657
 658     my $out = runperl ( switches => ["-XMO=Concise"],
 659                     prog => 'utf8::unicode_to_native(0x41);
 660                              utf8::native_to_unicode(0x42)',
 661                     stderr => 1 );
 662     unlike($out, qr/entersub/,
 663             "utf8::unicode_to_native() and native_to_unicode() optimized out");
 664 }
 665
 666
 667 # [perl #119043] utf8::upgrade should not croak on read-only COWs
 668 for(__PACKAGE__) {
             # $_ is aliased to the __PACKAGE__ constant; upgrading it must not
             # die, so $@ is expected to be empty.
 669         eval { utf8::upgrade($_) };
 670         is $@, "", 'no error with utf8::upgrade on read-only COW';
 671 }
 672 # This one croaks, but not because the scalar is read-only
     # The package name is chr(0x100), so __PACKAGE__ inside the eval'd code
     # holds a wide character and utf8::downgrade is expected to die with
     # "Wide character".  The trailing 1 makes the string eval return true on
     # success, so 'or die $@' fires only if the eval'd code itself fails.
 673 eval "package \x{100};\n" . <<'END'
 674     for(__PACKAGE__) {
 675         eval { utf8::downgrade($_) };
 676         ::like $@, qr/^Wide character/,
 677             'right error with utf8::downgrade on read-only COW';
 678     }
 679     1
 680 END
 681 or die $@;
 682
 683 done_testing();