lib/locale.t

   1 #!./perl -wT
   2
   3 binmode STDOUT, ':utf8';
   4 binmode STDERR, ':utf8';
   5
   6 BEGIN {
   7     chdir 't' if -d 't';
   8     @INC = '../lib';
   9     unshift @INC, '.';
  10     require Config; import Config;
  11     if (!$Config{d_setlocale} || $Config{ccflags} =~ /\bD?NO_LOCALE\b/) {
  12         print "1..0\n";
  13         exit;
  14     }
  15     $| = 1;
  16 }
  17
  18 use strict;
  19
  20 my $debug = 1;
  21
  22 use Dumpvalue;
  23
  24 my $dumper = Dumpvalue->new(
  25                             tick => qq{"},
  26                             quoteHighBit => 0,
  27                             unctrl => "quote"
  28                            );
  29 sub debug {    # print the joined arguments, escaped by Dumpvalue, when $debug is set
  30   $debug or return;
  31   my $text = join "", @_;
  32   chop $text;    # drop the final character (normally the caller's trailing "\n")
  33   print $dumper->stringify($text, 1), "\n";
  34 }
  35
  36 sub debugf {    # printf-style diagnostics, emitted only when $debug is true
  37     $debug and printf @_;
  38 }
  39
  40 my $have_setlocale = 0;
  41 eval {
  42     require POSIX;
  43     import POSIX ':locale_h';
  44     $have_setlocale++;
  45 };
  46
  47 # Visual C's CRT goes silly on strings of the form "en_US.ISO8859-1"
  48 # and mingw32 uses said silly CRT
  49 # This doesn't seem to be an issue any more, at least on Windows XP,
  50 # so re-enable the tests for Windows XP onwards.
  51 my $winxp = ($^O eq 'MSWin32' && defined &Win32::GetOSVersion &&
  52                 join('.', (Win32::GetOSVersion())[1..2]) >= 5.1);
  53 $have_setlocale = 0 if ((($^O eq 'MSWin32' && !$winxp) || $^O eq 'NetWare') &&
  54                 $Config{cc} =~ /^(cl|gcc)/i);
  55
  56 # UWIN seems to loop after test 98, just skip for now
  57 $have_setlocale = 0 if ($^O =~ /^uwin/);
  58
  59 my $last_locales = $have_setlocale ? &last_locales : &last_without_setlocale;
  60 my $last = $have_setlocale ? &last : &last_without_setlocale;
  61
  62 print "1..$last\n";
  63
  64 sub LC_ALL ();
  65
  66 $a = 'abc %';
  67
  68 sub ok {
  69     # Print one TAP result line: "[not ]ok N MESSAGE".
  70     my ($n, $result, $message) = @_;
  71     defined $message or $message = "";
  72     my $prefix = $result ? '' : 'not ';
  73
  74     # A space always follows the number, even when MESSAGE is empty.
  75     print $prefix, "ok $n", " $message", "\n";
  76 }
  77
  78 # First we'll do a lot of taint checking for locales.
  79 # This is the easiest to test, actually, as any locale,
  80 # even the default locale will taint under 'use locale'.
  81
  82 sub is_tainted { # hello, camel two.  Returns true if any argument is tainted.
  83     no warnings 'uninitialized' ;   # arguments may be undef (e.g. an unset $2)
  84     my $dummy;
  85     local $@;                       # do not clobber the caller's $@
  86     not eval { $dummy = join("", @_), kill 0; 1 }   # eval fails (=> true) when taint mode rejects the kill
  87 }
  88
  89 sub check_taint ($$) {    # (test number, value): the test passes when the value IS tainted
  90     ok $_[0], is_tainted($_[1]), "verify that is tainted";
  91 }
  92
  93 sub check_taint_not ($$) {    # (test number, value): the test passes when the value is NOT tainted
  94     ok $_[0], (not is_tainted($_[1])), "verify that isn't tainted";
  95 }
  96
  97 use locale;     # engage locale and therefore locale taint.
  98
  99 check_taint_not   1, $a;
 100
 101 check_taint       2, uc($a);
 102 check_taint       3, "\U$a";
 103 check_taint       4, ucfirst($a);
 104 check_taint       5, "\u$a";
 105 check_taint       6, lc($a);
 106 check_taint       7, "\L$a";
 107 check_taint       8, lcfirst($a);
 108 check_taint       9, "\l$a";
 109
 110 check_taint_not  10, sprintf('%e', 123.456);
 111 check_taint_not  11, sprintf('%f', 123.456);
 112 check_taint_not  12, sprintf('%g', 123.456);
 113 check_taint_not  13, sprintf('%d', 123.456);
 114 check_taint_not  14, sprintf('%x', 123.456);
 115
 116 $_ = $a;        # untaint $_
 117
 118 $_ = uc($a);    # taint $_
 119
 120 check_taint      15, $_;
 121
 122 /(\w)/; # taint $&, $`, $', $+, $1.
 123 check_taint      16, $&;
 124 check_taint      17, $`;
 125 check_taint      18, $';
 126 check_taint      19, $+;
 127 check_taint      20, $1;
 128 check_taint_not  21, $2;
 129
 130 /(.)/;  # untaint $&, $`, $', $+, $1.
 131 check_taint_not  22, $&;
 132 check_taint_not  23, $`;
 133 check_taint_not  24, $';
 134 check_taint_not  25, $+;
 135 check_taint_not  26, $1;
 136 check_taint_not  27, $2;
 137
 138 /(\W)/; # taint $&, $`, $', $+, $1.
 139 check_taint      28, $&;
 140 check_taint      29, $`;
 141 check_taint      30, $';
 142 check_taint      31, $+;
 143 check_taint      32, $1;
 144 check_taint_not  33, $2;
 145
 146 /(\s)/; # taint $&, $`, $', $+, $1.
 147 check_taint      34, $&;
 148 check_taint      35, $`;
 149 check_taint      36, $';
 150 check_taint      37, $+;
 151 check_taint      38, $1;
 152 check_taint_not  39, $2;
 153
 154 /(\S)/; # taint $&, $`, $', $+, $1.
 155 check_taint      40, $&;
 156 check_taint      41, $`;
 157 check_taint      42, $';
 158 check_taint      43, $+;
 159 check_taint      44, $1;
 160 check_taint_not  45, $2;
 161
 162 $_ = $a;        # untaint $_
 163
 164 check_taint_not  46, $_;
 165
 166 /(b)/;          # this must not taint
 167 check_taint_not  47, $&;
 168 check_taint_not  48, $`;
 169 check_taint_not  49, $';
 170 check_taint_not  50, $+;
 171 check_taint_not  51, $1;
 172 check_taint_not  52, $2;
 173
 174 $_ = $a;        # untaint $_
 175
 176 check_taint_not  53, $_;
 177
 178 $b = uc($a);    # taint $b
 179 s/(.+)/$b/;     # this must taint only the $_
 180
 181 check_taint      54, $_;
 182 check_taint_not  55, $&;
 183 check_taint_not  56, $`;
 184 check_taint_not  57, $';
 185 check_taint_not  58, $+;
 186 check_taint_not  59, $1;
 187 check_taint_not  60, $2;
 188
 189 $_ = $a;        # untaint $_
 190
 191 s/(.+)/b/;      # this must not taint
 192 check_taint_not  61, $_;
 193 check_taint_not  62, $&;
 194 check_taint_not  63, $`;
 195 check_taint_not  64, $';
 196 check_taint_not  65, $+;
 197 check_taint_not  66, $1;
 198 check_taint_not  67, $2;
 199
 200 $b = $a;        # untaint $b
 201
 202 ($b = $a) =~ s/\w/$&/;
 203 check_taint      68, $b;        # $b should be tainted.
 204 check_taint_not  69, $a;        # $a should be not.
 205
 206 $_ = $a;        # untaint $_
 207
 208 s/(\w)/\l$1/;   # this must taint
 209 check_taint      70, $_;
 210 check_taint      71, $&;
 211 check_taint      72, $`;
 212 check_taint      73, $';
 213 check_taint      74, $+;
 214 check_taint      75, $1;
 215 check_taint_not  76, $2;
 216
 217 $_ = $a;        # untaint $_
 218
 219 s/(\w)/\L$1/;   # this must taint
 220 check_taint      77, $_;
 221 check_taint      78, $&;
 222 check_taint      79, $`;
 223 check_taint      80, $';
 224 check_taint      81, $+;
 225 check_taint      82, $1;
 226 check_taint_not  83, $2;
 227
 228 $_ = $a;        # untaint $_
 229
 230 s/(\w)/\u$1/;   # this must taint
 231 check_taint      84, $_;
 232 check_taint      85, $&;
 233 check_taint      86, $`;
 234 check_taint      87, $';
 235 check_taint      88, $+;
 236 check_taint      89, $1;
 237 check_taint_not  90, $2;
 238
 239 $_ = $a;        # untaint $_
 240
 241 s/(\w)/\U$1/;   # this must taint
 242 check_taint      91, $_;
 243 check_taint      92, $&;
 244 check_taint      93, $`;
 245 check_taint      94, $';
 246 check_taint      95, $+;
 247 check_taint      96, $1;
 248 check_taint_not  97, $2;
 249
 250 # After all this tainting $a should be cool.
 251
 252 check_taint_not  98, $a;
 253
 254 sub last_without_setlocale { 98 }   # number of the final taint test; used as the plan when setlocale() is unusable
 255
 256 # I think we've seen quite enough of taint.
 257 # Let us do some *real* locale work now,
 258 # unless setlocale() is missing (i.e. minitest).
 259
 260 exit unless $have_setlocale;
 261
 262 # Find locales.
 263
 264 debug "# Scanning for locales...\n";
 265
 266 # Note that it's okay that some languages have their native names
 267 # capitalized here even though that's not "right".  They are lowercased
 268 # anyway later during the scanning process (and besides, some clueless
 269 # vendor might have them capitalized erroneously anyway).
 270
 271 my $locales = <<EOF;
 272 Afrikaans:af:za:1 15
 273 Arabic:ar:dz eg sa:6 arabic8
 274 Brezhoneg Breton:br:fr:1 15
 275 Bulgarski Bulgarian:bg:bg:5
 276 Chinese:zh:cn tw:cn.EUC eucCN eucTW euc.CN euc.TW Big5 GB2312 tw.EUC
 277 Hrvatski Croatian:hr:hr:2
 278 Cymraeg Welsh:cy:cy:1 14 15
 279 Czech:cs:cz:2
 280 Dansk Danish:dk:da:1 15
 281 Nederlands Dutch:nl:be nl:1 15
 282 English American British:en:au ca gb ie nz us uk zw:1 15 cp850
 283 Esperanto:eo:eo:3
 284 Eesti Estonian:et:ee:4 6 13
 285 Suomi Finnish:fi:fi:1 15
 286 Flamish::fl:1 15
 287 Deutsch German:de:at be ch de lu:1 15
 288 Euskaraz Basque:eu:es fr:1 15
 289 Galego Galician:gl:es:1 15
 290 Ellada Greek:el:gr:7 g8
 291 Frysk:fy:nl:1 15
 292 Greenlandic:kl:gl:4 6
 293 Hebrew:iw:il:8 hebrew8
 294 Hungarian:hu:hu:2
 295 Indonesian:in:id:1 15
 296 Gaeilge Irish:ga:IE:1 14 15
 297 Italiano Italian:it:ch it:1 15
 298 Nihongo Japanese:ja:jp:euc eucJP jp.EUC sjis
 299 Korean:ko:kr:
 300 Latine Latin:la:va:1 15
 301 Latvian:lv:lv:4 6 13
 302 Lithuanian:lt:lt:4 6 13
 303 Macedonian:mk:mk:1 15
 304 Maltese:mt:mt:3
 305 Moldovan:mo:mo:2
 306 Norsk Norwegian:no no\@nynorsk:no:1 15
 307 Occitan:oc:es:1 15
 308 Polski Polish:pl:pl:2
 309 Rumanian:ro:ro:2
 310 Russki Russian:ru:ru su ua:5 koi8 koi8r KOI8-R koi8u cp1251 cp866
 311 Serbski Serbian:sr:yu:5
 312 Slovak:sk:sk:2
 313 Slovene Slovenian:sl:si:2
 314 Sqhip Albanian:sq:sq:1 15
 315 Svenska Swedish:sv:fi se:1 15
 316 Thai:th:th:11 tis620
 317 Turkish:tr:tr:9 turkish8
 318 Yiddish:yi::1 15
 319 EOF
 320
 321 if ($^O eq 'os390') {
 322     # These cause heartburn.  Broken locales?
 323     $locales =~ s/Svenska Swedish:sv:fi se:1 15\n//;
 324     $locales =~ s/Thai:th:th:11 tis620\n//;
 325 }
 326
 327 sub in_utf8 () { $^H & 0x08 || (${^OPEN} || "") =~ /:utf8/ }   # true if hint bit 0x08 is set (presumably "use utf8" -- confirm) or ${^OPEN} mentions :utf8
 328
 329 if (in_utf8) {
 330     require "lib/locale/utf8";
 331 } else {
 332     require "lib/locale/latin1";
 333 }
 334
 335 my @Locale;
 336 my $Locale;
 337 my @Alnum_;
 338
 339 my @utf8locale;
 340 my %utf8skip;
 341
 342 sub getalnum_ {    # the characters chr(0)..chr(255) that match \w, in sorted order
 343     sort grep { /\w/ } map { chr($_) } 0 .. 255;
 344 }
 345
 346 sub trylocale {    # remember $candidate in @Locale if setlocale() accepts it
 347     my ($candidate) = @_;
 348     return unless setlocale(LC_ALL, $candidate);
 349
 350     push @Locale, $candidate;
 351 }
 352
 353 sub decode_encodings {
 354     # Expand a space-separated encoding spec (e.g. "1 15 koi8") into the list
 355     # of codeset names to try.  A bare number N stands for ISO 8859-N.
 356     my $spec = shift;
 357     $spec = '' unless defined $spec;    # a row such as "Korean:ko:kr:" has no spec
 358     my @enc;
 359     foreach (split(/ /, $spec)) {
 360         if (/^(\d+)$/) {
 361             push @enc, "ISO8859-$1";
 362             push @enc, "iso8859$1";     # HP
 363             push @enc, "roman8" if $1 eq '1';   # HP
 364         } else {
 365             push @enc, $_;
 366             push @enc, "$_.UTF-8";
 367         }
 368     }
 369     if ($^O eq 'os390') {
 370         push @enc, qw(IBM-037 IBM-819 IBM-1047);
 371     }
 372     return @enc;
 373 }
 374
 375 trylocale("C");
 376 trylocale("POSIX");
 377 foreach (0..15) {
 378     trylocale("ISO8859-$_");
 379     trylocale("iso8859$_");
 380     trylocale("iso8859-$_");
 381     trylocale("iso_8859_$_");
 382     trylocale("isolatin$_");
 383     trylocale("isolatin-$_");
 384     trylocale("iso_latin_$_");
 385 }
 386
 387 # Sanitize the environment so that we can run the external 'locale'
 388 # program without the taint mode getting grumpy.
 389
 390 # $ENV{PATH} is special in VMS.
 391 delete $ENV{PATH} if $^O ne 'VMS' or $Config{d_setenv};
 392
 393 # Other subversive stuff.
 394 delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};
 395
 396 if (-x "/usr/bin/locale" && open(LOCALES, "/usr/bin/locale -a 2>/dev/null|")) {
 397     while (<LOCALES>) {
 398         # It seems that /usr/bin/locale steadfastly outputs 8 bit data, which
 399         # ain't great when we're running this test with PERL_UNICODE= set so
 400         # that utf8 locales cause all IO handles to default to (assume) utf8
 401         next unless utf8::valid($_);
 402         chomp;
 403         trylocale($_);
 404     }
 405     close(LOCALES);
 406 } elsif ($^O eq 'VMS' && defined($ENV{'SYS$I18N_LOCALE'}) && -d 'SYS$I18N_LOCALE') {
 407 # The SYS$I18N_LOCALE logical name search list was not present on
 408 # VAX VMS V5.5-12, but was on AXP && VAX VMS V6.2 as well as later versions.
 409     opendir(LOCALES, "SYS\$I18N_LOCALE:");
 410     while ($_ = readdir(LOCALES)) {
 411         chomp;
 412         trylocale($_);
 413     }
 414     closedir(LOCALES);    # LOCALES is a directory handle here, so close() would not release it
 415 } elsif ($^O eq 'openbsd' && -e '/usr/share/locale') {
 416
 417    # OpenBSD doesn't have a locale executable, so reading /usr/share/locale
 418    # is much easier and faster than the last resort method.
 419
 420     opendir(LOCALES, '/usr/share/locale');
 421     while ($_ = readdir(LOCALES)) {
 422         chomp;
 423         trylocale($_);
 424     }
 425     closedir(LOCALES);    # directory handle: must be closedir(), not close()
 426 } else {
 427
 428     # This is going to be slow.
 429
 430     foreach my $locale (split(/\n/, $locales)) {
 431         my ($locale_name, $language_codes, $country_codes, $encodings) =
 432             split(/:/, $locale);
 433         my @enc = decode_encodings($encodings);
 434         foreach my $loc (split(/ /, $locale_name)) {
 435             trylocale($loc);
 436             foreach my $enc (@enc) {
 437                 trylocale("$loc.$enc");
 438             }
 439             $loc = lc $loc;
 440             foreach my $enc (@enc) {
 441                 trylocale("$loc.$enc");
 442             }
 443         }
 444         foreach my $lang (split(/ /, $language_codes)) {
 445             trylocale($lang);
 446             foreach my $country (split(/ /, $country_codes)) {
 447                 my $lc = "${lang}_${country}";
 448                 trylocale($lc);
 449                 foreach my $enc (@enc) {
 450                     trylocale("$lc.$enc");
 451                 }
 452                 my $lC = "${lang}_\U${country}";
 453                 trylocale($lC);
 454                 foreach my $enc (@enc) {
 455                     trylocale("$lC.$enc");
 456                 }
 457             }
 458         }
 459     }
 460 }
 461
 462 setlocale(LC_ALL, "C");
 463
 464 if ($^O eq 'darwin') {
 465     # Darwin 8/Mac OS X 10.4 and 10.5 have bad Basque locales: perl bug #35895,
 466     # Apple bug ID# 4139653. It also has a problem in Byelorussian.
 467     (my $v) = $Config{osvers} =~ /^(\d+)/;
 468     if ($v >= 8 and $v < 10) {
 469         debug "# Skipping eu_ES, be_BY locales -- buggy in Darwin\n";
 470         @Locale = grep ! m/^(eu_ES(?:\..*)?|be_BY\.CP1131)$/, @Locale;
 471     } elsif ($v < 12) {
 472         debug "# Skipping be_BY locales -- buggy in Darwin\n";
 473         @Locale = grep ! m/^be_BY\.CP1131$/, @Locale;
 474     }
 475 }
 476
 477 @Locale = sort @Locale;
 478
 479 debug "# Locales =\n";
 480 for ( @Locale ) {
 481     debug "# $_\n";
 482 }
 483
 484 my %Problem;
 485 my %Okay;
 486 my %Testing;
 487 my @Neoalpha;
 488 my %Neoalpha;
 489
 490 sub tryneoalpha {    # record a pass or a failure of test $num for locale $name
 491     my ($name, $num, $passed) = @_;
 492     if ($passed) {
 493         push @{ $Okay{$num} }, $name;
 494         return;
 495     }
 496     $Problem{$num}{$name} = 1;
 497     debug "# failed $num with locale '$name'\n";
 498 }
 499
 500 foreach $Locale (@Locale) {
 501     debug "# Locale = $Locale\n";
 502     @Alnum_ = getalnum_();
 503     debug "# w = ", join("",@Alnum_), "\n";
 504
 505     unless (setlocale(LC_ALL, $Locale)) {
 506         foreach (99..103) {
 507             $Problem{$_}{$Locale} = -1;
 508         }
 509         next;
 510     }
 511
 512     # Sieve the uppercase and the lowercase.
 513
 514     my %UPPER = ();
 515     my %lower = ();
 516     my %BoThCaSe = ();
 517     for (@Alnum_) {
 518         if (/[^\d_]/) { # skip digits and the _
 519             if (uc($_) eq $_) {
 520                 $UPPER{$_} = $_;
 521             }
 522             if (lc($_) eq $_) {
 523                 $lower{$_} = $_;
 524             }
 525         }
 526     }
 527     foreach (keys %UPPER) {
 528         $BoThCaSe{$_}++ if exists $lower{$_};
 529     }
 530     foreach (keys %lower) {
 531         $BoThCaSe{$_}++ if exists $UPPER{$_};
 532     }
 533     foreach (keys %BoThCaSe) {
 534         delete $UPPER{$_};
 535         delete $lower{$_};
 536     }
 537
 538     debug "# UPPER    = ", join("", sort keys %UPPER   ), "\n";
 539     debug "# lower    = ", join("", sort keys %lower   ), "\n";
 540     debug "# BoThCaSe = ", join("", sort keys %BoThCaSe), "\n";
 541
 542     # Find the alphabets that are not alphabets in the default locale.
 543
 544     {
 545         no locale;
 546
 547         @Neoalpha = ();
 548         for (keys %UPPER, keys %lower) {
 549             push(@Neoalpha, $_) if (/\W/);
 550             $Neoalpha{$_} = $_;
 551         }
 552     }
 553
 554     @Neoalpha = sort @Neoalpha;
 555
 556     debug "# Neoalpha = ", join("",@Neoalpha), "\n";
 557
 558     if (@Neoalpha == 0) {
 559         # If we have no Neoalphas the remaining tests are no-ops.
 560         debug "# no Neoalpha, skipping tests 99..102 for locale '$Locale'\n";
 561         foreach (99..102) {
 562             push @{$Okay{$_}}, $Locale;
 563         }
 564     } else {
 565
 566         # Test \w.
 567
 568         my $word = join('', @Neoalpha);
 569
 570         my $badutf8;
 571         {
 572             local $SIG{__WARN__} = sub {
 573                 $badutf8 = $_[0] =~ /Malformed UTF-8/;
 574             };
 575             $Locale =~ /utf-?8/i;
 576         }
 577
 578         if ($badutf8) {
 579             debug "# Locale name contains bad UTF-8, skipping test 99 for locale '$Locale'\n";
 580         } elsif ($Locale =~ /utf-?8/i) {
 581             debug "# unknown whether locale and Unicode have the same \\w, skipping test 99 for locale '$Locale'\n";
 582             push @{$Okay{99}}, $Locale;
 583         } else {
 584             if ($word =~ /^(\w+)$/) {
 585                 tryneoalpha($Locale, 99, 1);
 586             } else {
 587                 tryneoalpha($Locale, 99, 0);
 588             }
 589         }
 590
 591         # Cross-check the whole 8-bit character set.
 592
 593         for (map { chr } 0..255) {
 594             tryneoalpha($Locale, 100,
 595                         (/\w/ xor /\W/) ||
 596                         (/\d/ xor /\D/) ||
 597                         (/\s/ xor /\S/));
 598         }
 599
 600         # Test for read-only scalars' locale vs non-locale comparisons.
 601
 602         {
 603             no locale;
 604             $a = "qwerty";
 605             {
 606                 use locale;
 607                 tryneoalpha($Locale, 101, ($a cmp "qwerty") == 0);
 608             }
 609         }
 610
 611         {
 612             my ($from, $to, $lesser, $greater,
 613                 @test, %test, $test, $yes, $no, $sign);
 614             # Test 102: compare ten overlapping slices of @Alnum_ with the string operators.
 615             for (0..9) {
 616                 # Select a slice.
 617                 $from = int(($_*@Alnum_)/10);
 618                 $to = $from + int(@Alnum_/10);
 619                 $to = $#Alnum_ if ($to > $#Alnum_);
 620                 $lesser  = join('', @Alnum_[$from..$to]);
 621                 # Select a slice one character on.
 622                 $from++; $to++;
 623                 $to = $#Alnum_ if ($to > $#Alnum_);
 624                 $greater = join('', @Alnum_[$from..$to]);
 625                 ($yes, $no, $sign) = ($lesser lt $greater
 626                                       ? ("    ", "not ", 1)
 627                                       : ("not ", "    ", -1));
 628                 # all these tests should FAIL (return 0).
 629                 # Exact lt or gt cannot be tested because
 630                 # in some locales, say, eacute and E may test equal.
 631                 @test =
 632                     (
 633                      $no.'    ($lesser  le $greater)',  # 1
 634                      'not      ($lesser  ne $greater)', # 2
 635                      '         ($lesser  eq $greater)', # 3
 636                      $yes.'    ($lesser  ge $greater)', # 4
 637                      $yes.'    ($lesser  ge $greater)', # 5 (same check as 4)
 638                      $yes.'    ($greater le $lesser )', # 6
 639                      'not      ($greater ne $lesser )', # 7
 640                      '         ($greater eq $lesser )', # 8
 641                      $no.'     ($greater ge $lesser )', # 9
 642                      'not (($lesser cmp $greater) == -($sign))' # 10
 643                      );
 644                 @test{@test} = (0) x @test;    # list repetition: one 0 per test string
 645                 $test = 0;
 646                 for my $ti (@test) {
 647                     $test{$ti} = eval $ti;
 648                     $test ||= $test{$ti}
 649                 }
 650                 tryneoalpha($Locale, 102, $test == 0);
 651                 if ($test) {
 652                     debug "# lesser  = '$lesser'\n";
 653                     debug "# greater = '$greater'\n";
 654                     debug "# lesser cmp greater = ",
 655                           $lesser cmp $greater, "\n";
 656                     debug "# greater cmp lesser = ",
 657                           $greater cmp $lesser, "\n";
 658                     debug "# (greater) from = $from, to = $to\n";
 659                     for my $ti (@test) {
 660                         debugf("# %-40s %-4s", $ti,
 661                                $test{$ti} ? 'FAIL' : 'ok');
 662                         if ($ti =~ /\(\.*(\$.+ +cmp +\$[^\)]+)\.*\)/) {
 663                             debugf("(%s == %4d)", $1, eval $1);
 664                         }
 665                         debug "\n#";
 666                     }
 667
 668                     last;
 669                 }
 670             }
 671         }
 672     }
 673
 674     use locale;
 675
 676     my ($x, $y) = (1.23, 1.23);
 677
 678     $a = "$x";
 679     printf ''; # printf used to reset locale to "C"
 680     $b = "$y";
 681
 682     debug "# 103..107: a = $a, b = $b, Locale = $Locale\n";
 683
 684     tryneoalpha($Locale, 103, $a eq $b);
 685
 686     my $c = "$x";
 687     my $z = sprintf ''; # sprintf used to reset locale to "C"
 688     my $d = "$y";
 689
 690     debug "# 104..107: c = $c, d = $d, Locale = $Locale\n";
 691
 692     tryneoalpha($Locale, 104, $c eq $d);
 693
 694     {
 695         use warnings;
 696         my $w = 0;
 697         local $SIG{__WARN__} =
 698             sub {
 699                 print "# @_\n";
 700                 $w++;
 701             };
 702
 703         # The == (among other ops) used to warn for locales
 704         # that had something else than "." as the radix character.
 705
 706         tryneoalpha($Locale, 105, $c == 1.23);
 707
 708         tryneoalpha($Locale, 106, $c == $x);
 709
 710         tryneoalpha($Locale, 107, $c == $d);
 711
 712         {
 713 #           no locale; # XXX did this ever work correctly?
 714
 715             my $e = "$x";
 716
 717             debug "# 108..110: e = $e, Locale = $Locale\n";
 718
 719             tryneoalpha($Locale, 108, $e == 1.23);
 720
 721             tryneoalpha($Locale, 109, $e == $x);
 722
 723             tryneoalpha($Locale, 110, $e == $c);
 724         }
 725
 726         my $f = "1.23";
 727         my $g = 2.34;
 728
 729         debug "# 111..115: f = $f, g = $g, locale = $Locale\n";
 730
 731         tryneoalpha($Locale, 111, $f == 1.23);
 732
 733         tryneoalpha($Locale, 112, $f == $x);
 734
 735         tryneoalpha($Locale, 113, $f == $c);
 736
 737         tryneoalpha($Locale, 114, abs(($f + $g) - 3.57) < 0.01);
 738
 739         tryneoalpha($Locale, 115, $w == 0);
 740     }
 741
 742     # Does taking lc separately differ from taking
 743     # the lc "in-line"?  (This was the bug 19990704.002, change #3568.)
 744     # The bug was in the caching of the 'o'-magic.
 745     {
 746         use locale;
 747         # lcA: lowercase each argument into its own variable, then compare.
 748         sub lcA {
 749             my $lc0 = lc $_[0];
 750             my $lc1 = lc $_[1];
 751             return $lc0 cmp $lc1;
 752         }
 753         # lcB: the same comparison with both lc() calls written inline.
 754         sub lcB {
 755             return lc($_[0]) cmp lc($_[1]);
 756         }
 757         # Fixed inputs: "ab" vs "aa" differ in the second letter; "ab" vs "AB" differ only in case.
 758         my $x = "ab";
 759         my $y = "aa";
 760         my $z = "AB";
 761         # Test 116 passes if both forms return 1 for ($x,$y), or both return 0 for ($x,$z).
 762         tryneoalpha($Locale, 116,
 763                     lcA($x, $y) == 1 && lcB($x, $y) == 1 ||
 764                     lcA($x, $z) == 0 && lcB($x, $z) == 0);
 765     }
 766
 767     # Does lc of an UPPER (if different from the UPPER) match
 768     # case-insensitively the UPPER, and does the UPPER match
 769     # case-insensitively the lc of the UPPER.  And vice versa.
 770     {
 771         use locale;
 772         no utf8;
 773         my $re = qr/[\[\(\{\*\+\?\|\^\$\\]/;
 774
 775         my @f = ();
 776         foreach my $x (keys %UPPER) {
 777             my $y = lc $x;
 778             next unless uc $y eq $x;
 779             print "# UPPER $x lc $y ",
 780             $x =~ /$y/i ? 1 : 0, " ",
 781             $y =~ /$x/i ? 1 : 0, "\n" if 0;
 782             #
 783             # If $x and $y contain regular expression characters
 784             # AND THEY lowercase (/i) to regular expression characters,
 785             # regcomp() will be mightily confused.  No, the \Q doesn't
 786             # help here (maybe regex engine internal lowercasing
 787             # is done after the \Q?)  An example of this happening is
 788             # the bg_BG (Bulgarian) locale under EBCDIC (OS/390 USS):
 789             # the chr(173) (the "[") is the lowercase of the chr(235).
 790             #
 791             # Similarly losing EBCDIC locales include cs_cz, cs_CZ,
 792             # el_gr, el_GR, en_us.IBM-037 (!), en_US.IBM-037 (!),
 793             # et_ee, et_EE, hr_hr, hr_HR, hu_hu, hu_HU, lt_LT,
 794             # mk_mk, mk_MK, nl_nl.IBM-037, nl_NL.IBM-037,
 795             # pl_pl, pl_PL, ro_ro, ro_RO, ru_ru, ru_RU,
 796             # sk_sk, sk_SK, sl_si, sl_SI, tr_tr, tr_TR.
 797             #
 798             # Similar things can happen even under (bastardised)
 799             # non-EBCDIC locales: in many European countries before the
 800             # advent of ISO 8859-x nationally customised versions of
 801             # ISO 646 were devised, reusing certain punctuation
 802             # characters for modified characters needed by the
 803             # country/language.  For example, the "|" might have
 804             # stood for U+00F6 or LATIN SMALL LETTER O WITH DIAERESIS.
 805             #
 806             if ($x =~ $re || $y =~ $re) {
 807                 print "# Regex characters in '$x' or '$y', skipping test 117 for locale '$Locale'\n";
 808                 next;
 809             }
 810             # With utf8 both will fail since the locale concept
 811             # of upper/lower does not work well in Unicode.
 812             push @f, $x unless $x =~ /$y/i == $y =~ /$x/i;
 813         }
 814
 815         foreach my $x (keys %lower) {
 816             my $y = uc $x;
 817             next unless lc $y eq $x;
 818             print "# lower $x uc $y ",
 819             $x =~ /$y/i ? 1 : 0, " ",
 820             $y =~ /$x/i ? 1 : 0, "\n" if 0;
 821             if ($x =~ $re || $y =~ $re) { # See above.
 822                 print "# Regex characters in '$x' or '$y', skipping test 117 for locale '$Locale'\n";
 823                 next;
 824             }
 825             # With utf8 both will fail since the locale concept
 826             # of upper/lower does not work well in Unicode.
 827             push @f, $x unless $x =~ /$y/i == $y =~ /$x/i;
 828         }
 829         tryneoalpha($Locale, 117, @f == 0);
 830         if (@f) {
 831             print "# failed 117 locale '$Locale' characters @f\n"
 832         }
 833     }
 834 }
 835
 836 # Recount the errors.
 837
 838 foreach (&last_without_setlocale()+1..$last_locales) {
 839     if ($Problem{$_} || !defined $Okay{$_} || !@{$Okay{$_}}) {
 840         if ($_ == 102) {
 841             print "# The failure of test 102 is not necessarily fatal.\n";
 842             print "# It usually indicates a problem in the environment,\n";
 843             print "# not in Perl itself.\n";
 844         }
 845         print "not ";
 846     }
 847     print "ok $_\n";
 848 }
 849
 850 # Give final advice.
 851
 852 my $didwarn = 0;
 853
 854 foreach (99..$last_locales) {
 855     if ($Problem{$_}) {
 856         my @f = sort keys %{ $Problem{$_} };
 857         my $f = join(" ", @f);
 858         $f =~ s/(.{50,60}) /$1\n#\t/g;
 859         print
 860             "#\n",
 861             "# The locale ", (@f == 1 ? "definition" : "definitions"), "\n#\n",
 862             "#\t", $f, "\n#\n",
 863             "# on your system may have errors because the locale test $_\n",
 864             "# failed in ", (@f == 1 ? "that locale" : "those locales"),
 865             ".\n";
 866         print <<EOW;
 867 #
 868 # If your users are not using these locales you are safe for the moment,
 869 # but please report this failure first to perlbug\@perl.com using the
 870 # perlbug script (as described in the INSTALL file) so that the exact
 871 # details of the failures can be sorted out first and then your operating
 872 # system supplier can be alerted about these anomalies.
 873 #
 874 EOW
 875         $didwarn = 1;
 876     }
 877 }
 878
 879 # Tell which locales were okay and which were not.
 880
 881 if ($didwarn) {
 882     my (@s, @F);
 883
 884     foreach my $l (@Locale) {
 885         my $p = 0;
 886         foreach my $t (102..$last_locales) {
 887             $p++ if $Problem{$t}{$l};
 888         }
 889         push @s, $l if $p == 0;
 890       push @F, $l unless $p == 0;
 891     }
 892
 893     if (@s) {
 894         my $s = join(" ", @s);
 895         $s =~ s/(.{50,60}) /$1\n#\t/g;
 896
 897         warn
 898             "# The following locales\n#\n",
 899             "#\t", $s, "\n#\n",
 900             "# tested okay.\n#\n",
 901     } else {
 902         warn "# None of your locales were fully okay.\n";
 903     }
 904
 905     if (@F) {
 906         my $F = join(" ", @F);
 907         $F =~ s/(.{50,60}) /$1\n#\t/g;
 908
 909         warn
 910           "# The following locales\n#\n",
 911           "#\t", $F, "\n#\n",
 912           "# had problems.\n#\n",
 913     } else {
 914         warn "# None of your locales were broken.\n";
 915     }
 916
 917     if (@utf8locale) {
 918         my $S = join(" ", @utf8locale);
 919         $S =~ s/(.{50,60}) /$1\n#\t/g;
 920
 921         warn "#\n# The following locales\n#\n",
 922              "#\t", $S, "\n#\n",
 923              "# were skipped for the tests ",
 924              join(" ", sort {$a<=>$b} keys %utf8skip), "\n",
 925             "# because UTF-8 and locales do not work together in Perl.\n#\n";
 926     }
 927 }
 928
 929 sub last_locales { 117 }   # number of the final per-locale test (the case-insensitive match test, 117)
 930
 931 # Test that tainting and case changing works on utf8 strings.  These tests are
 932 # placed last to avoid disturbing the hard-coded test numbers above this in
 933 # this file.
 934 setlocale(LC_ALL, "C");
 935 {
 936     use locale;
 937
 938     my $i = &last_locales + 1;
 939
 940     foreach my $function ("uc", "ucfirst", "lc", "lcfirst") {
 941         my @list;   # List of code points to test for $function
 942
 943         # Used to calculate the changed case for ASCII characters by using the
 944         # ord, instead of using one of the functions under test.
 945         my $ascii_case_change_delta;
 946         my $above_latin1_case_change_delta; # Same for the specific ords > 255
 947                                             # that we use
 948
 949         # We test an ASCII character, which should change case and be tainted;
 950         # a Latin1 character, which shouldn't change case under this C locale,
 951         #   and is tainted.
 952         # an above-Latin1 character that when the case is changed would cross
 953         #   the 255/256 boundary, so doesn't change case and isn't tainted
 954         # (the \x{149} is one of these, but changes into 2 characters, the
 955         #   first one of which doesn't cross the boundary).
 956         # the final one in each list is an above-Latin1 character whose case
 957         #   does change, and shouldn't be tainted.  The code below uses its
 958         #   position in its list as a marker to indicate that it, unlike the
 959         #   other code points above ASCII, has a successful case change
 960         if ($function =~ /^u/) {
 961             #@list = ("\xff", "\x{fb00}", "\x{149}", "\x{101}");
 962             @list = ("", "a", "\xe0", "\xff", "\x{fb00}", "\x{149}", "\x{101}");
 963             $ascii_case_change_delta = -32;
 964             $above_latin1_case_change_delta = -1;
 965         }
 966         else {
 967             @list = ("", "A", "\xC0", "\x{1E9E}", "\x{100}");
 968             $ascii_case_change_delta = +32;
 969             $above_latin1_case_change_delta = +1;
 970         }
 971         $|=1;
 972         foreach my $j (0 .. $#list) {
 973             my $char = $list[$j];
 974             #print STDERR __LINE__, ": $char\n";
 975             #check_taint_not($i++, $char);
 976             utf8::upgrade($char);
 977             #check_taint_not($i++, $char);
 978             my $should_be = ($j == $#list)
 979                             ? chr(ord($char) + $above_latin1_case_change_delta)
 980                             : (length $char == 0 || ord($char) > 127)
 981                               ? $char
 982                               : chr(ord($char) + $ascii_case_change_delta);
 983
 984             # This monstrosity is in order to avoid using an eval, which might
 985             # perturb the results
 986             my $changed = ($function eq "uc")
 987                           ? uc($char)
 988                           : ($function eq "ucfirst")
 989                             ? ucfirst($char)
 990                             : ($function eq "lc")
 991                               ? lc($char)
 992                               : ($function eq "lcfirst")
 993                                 ? lcfirst($char)
 994                                 : die("Unexpected function \"$function\"");   # die: Carp's croak is never loaded here
 995             ok($i++, $changed eq $should_be, "$function(\"$char\") should be \"$should_be\", got \"$changed\"");
 996
 997             # Tainting shouldn't happen for empty strings, or those characters
 998             # above 255.
 999             #print STDERR __LINE__, ": $char\n";
1000             (length($char) > 0 && ord($char) < 256)
1001             ? check_taint($i++, $changed)
1002             : check_taint_not($i++, $changed);
1003         }
1004     }
1005 }
1006
1007
1008 sub last { 165 }   # final test number overall: last_locales (117) plus the 48 checks made by the UTF-8 case/taint block
1009
1010 # eof