X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/b695f709e8a342e35e482b0437eb6cdacdc58b6b..78325d7a7e6183fba46cbfb1dbd40def2996b940:/lib/locale.t diff --git a/lib/locale.t b/lib/locale.t index 19fba59..a66810b 100644 --- a/lib/locale.t +++ b/lib/locale.t @@ -43,13 +43,21 @@ eval { # Visual C's CRT goes silly on strings of the form "en_US.ISO8859-1" # and mingw32 uses said silly CRT -$have_setlocale = 0 if (($^O eq 'MSWin32' || $^O eq 'NetWare') && $Config{cc} =~ /^(cl|gcc)/i); +# This doesn't seem to be an issue any more, at least on Windows XP, +# so re-enable the tests for Windows XP onwards. +my $winxp = ($^O eq 'MSWin32' && defined &Win32::GetOSVersion && + join('.', (Win32::GetOSVersion())[1..2]) >= 5.1); +$have_setlocale = 0 if ((($^O eq 'MSWin32' && !$winxp) || $^O eq 'NetWare') && + $Config{cc} =~ /^(cl|gcc)/i); + +# UWIN seems to loop after test 98, just skip for now +$have_setlocale = 0 if ($^O =~ /^uwin/); my $last = $have_setlocale ? &last : &last_without_setlocale; print "1..$last\n"; -use vars qw(&LC_ALL); +sub LC_ALL (); $a = 'abc %'; @@ -250,7 +258,7 @@ debug "# Scanning for locales...\n"; # Note that it's okay that some languages have their native names # capitalized here even though that's not "right". They are lowercased # anyway later during the scanning process (and besides, some clueless -# vendor might have them capitalized errorneously anyway). +# vendor might have them capitalized erroneously anyway). my $locales = </dev/null|")) { while () { + # It seems that /usr/bin/locale steadfastly outputs 8 bit data, which + # ain't great when we're running this testPERL_UNICODE= so that utf8 + # locales will cause all IO hadles to default to (assume) utf8 + next unless utf8::valid($_); chomp; trylocale($_); } @@ -392,6 +404,17 @@ if (-x "/usr/bin/locale" && open(LOCALES, "/usr/bin/locale -a 2>/dev/null|")) { trylocale($_); } close(LOCALES); +} elsif ($^O eq 'openbsd' && -e '/usr/share/locale') { + + # OpenBSD doesn't have a locale executable, so reading /usr/share/locale + # is much easier and faster than the last resort method. + + opendir(LOCALES, '/usr/share/locale'); + while ($_ = readdir(LOCALES)) { + chomp; + trylocale($_); + } + close(LOCALES); } else { # This is going to be slow. @@ -430,11 +453,25 @@ if (-x "/usr/bin/locale" && open(LOCALES, "/usr/bin/locale -a 2>/dev/null|")) { setlocale(LC_ALL, "C"); -sub utf8locale { $_[0] =~ /utf-?8/i } +if ($^O eq 'darwin') { + # Darwin 8/Mac OS X 10.4 and 10.5 have bad Basque locales: perl bug #35895, + # Apple bug ID# 4139653. It also has a problem in Byelorussian. + (my $v) = $Config{osvers} =~ /^(\d+)/; + if ($v >= 8 and $v < 10) { + debug "# Skipping eu_ES, be_BY locales -- buggy in Darwin\n"; + @Locale = grep ! m/^(eu_ES(?:\..*)?|be_BY\.CP1131)$/, @Locale; + } elsif ($v < 12) { + debug "# Skipping be_BY locales -- buggy in Darwin\n"; + @Locale = grep ! m/^be_BY\.CP1131$/, @Locale; + } +} @Locale = sort @Locale; -debug "# Locales = @Locale\n"; +debug "# Locales =\n"; +for ( @Locale ) { + debug "# $_\n"; +} my %Problem; my %Okay; @@ -520,18 +557,29 @@ foreach $Locale (@Locale) { # Test \w. - if (utf8locale($Locale)) { - # utf8 and locales do not mix. - debug "# skipping UTF-8 locale '$Locale'\n"; - push @utf8locale, $Locale; - @utf8skip{99..102} = (); - } else { - my $word = join('', @Neoalpha); + my $word = join('', @Neoalpha); - $word =~ /^(\w+)$/; - - tryneoalpha($Locale, 99, $1 eq $word); + my $badutf8; + { + local $SIG{__WARN__} = sub { + $badutf8 = $_[0] =~ /Malformed UTF-8/; + }; + $Locale =~ /utf-?8/i; + } + + if ($badutf8) { + debug "# Locale name contains bad UTF-8, skipping test 99 for locale '$Locale'\n"; + } elsif ($Locale =~ /utf-?8/i) { + debug "# unknown whether locale and Unicode have the same \\w, skipping test 99 for locale '$Locale'\n"; + push @{$Okay{99}}, $Locale; + } else { + if ($word =~ /^(\w+)$/) { + tryneoalpha($Locale, 99, 1); + } else { + tryneoalpha($Locale, 99, 0); + } } + # Cross-check the whole 8-bit character set. for (map { chr } 0..255) { @@ -712,32 +760,68 @@ foreach $Locale (@Locale) { # case-insensitively the UPPER, and does the UPPER match # case-insensitively the lc of the UPPER. And vice versa. { - if (utf8locale($Locale)) { - # utf8 and locales do not mix. - debug "# skipping UTF-8 locale '$Locale'\n"; - push @utf8locale, $Locale; - $utf8skip{117}++; - } else { - use locale; - use locale; - no utf8; # so that the native 8-bit characters work - - my @f = (); - foreach my $x (keys %UPPER) { - my $y = lc $x; - next unless uc $y eq $x; - push @f, $x unless $x =~ /$y/i && $y =~ /$x/i; - } - foreach my $x (keys %lower) { - my $y = uc $x; - next unless lc $y eq $x; - push @f, $x unless $x =~ /$y/i && $y =~ /$x/i; + use locale; + no utf8; + my $re = qr/[\[\(\{\*\+\?\|\^\$\\]/; + + my @f = (); + foreach my $x (keys %UPPER) { + my $y = lc $x; + next unless uc $y eq $x; + print "# UPPER $x lc $y ", + $x =~ /$y/i ? 1 : 0, " ", + $y =~ /$x/i ? 1 : 0, "\n" if 0; + # + # If $x and $y contain regular expression characters + # AND THEY lowercase (/i) to regular expression characters, + # regcomp() will be mightily confused. No, the \Q doesn't + # help here (maybe regex engine internal lowercasing + # is done after the \Q?) An example of this happening is + # the bg_BG (Bulgarian) locale under EBCDIC (OS/390 USS): + # the chr(173) (the "[") is the lowercase of the chr(235). + # + # Similarly losing EBCDIC locales include cs_cz, cs_CZ, + # el_gr, el_GR, en_us.IBM-037 (!), en_US.IBM-037 (!), + # et_ee, et_EE, hr_hr, hr_HR, hu_hu, hu_HU, lt_LT, + # mk_mk, mk_MK, nl_nl.IBM-037, nl_NL.IBM-037, + # pl_pl, pl_PL, ro_ro, ro_RO, ru_ru, ru_RU, + # sk_sk, sk_SK, sl_si, sl_SI, tr_tr, tr_TR. + # + # Similar things can happen even under (bastardised) + # non-EBCDIC locales: in many European countries before the + # advent of ISO 8859-x nationally customised versions of + # ISO 646 were devised, reusing certain punctuation + # characters for modified characters needed by the + # country/language. For example, the "|" might have + # stood for U+00F6 or LATIN SMALL LETTER O WITH DIAERESIS. + # + if ($x =~ $re || $y =~ $re) { + print "# Regex characters in '$x' or '$y', skipping test 117 for locale '$Locale'\n"; + next; } - tryneoalpha($Locale, 117, @f == 0); - if (@f) { - print "# failed 117 locale '$Locale' characters @f\n" - } + # With utf8 both will fail since the locale concept + # of upper/lower does not work well in Unicode. + push @f, $x unless $x =~ /$y/i == $y =~ /$x/i; } + + foreach my $x (keys %lower) { + my $y = uc $x; + next unless lc $y eq $x; + print "# lower $x uc $y ", + $x =~ /$y/i ? 1 : 0, " ", + $y =~ /$x/i ? 1 : 0, "\n" if 0; + if ($x =~ $re || $y =~ $re) { # See above. + print "# Regex characters in '$x' or '$y', skipping test 117 for locale '$Locale'\n"; + next; + } + # With utf8 both will fail since the locale concept + # of upper/lower does not work well in Unicode. + push @f, $x unless $x =~ /$y/i == $y =~ /$x/i; + } + tryneoalpha($Locale, 117, @f == 0); + if (@f) { + print "# failed 117 locale '$Locale' characters @f\n" + } } } @@ -747,7 +831,7 @@ foreach (&last_without_setlocale()+1..$last) { if ($Problem{$_} || !defined $Okay{$_} || !@{$Okay{$_}}) { if ($_ == 102) { print "# The failure of test 102 is not necessarily fatal.\n"; - print "# It usually indicates a problem in the enviroment,\n"; + print "# It usually indicates a problem in the environment,\n"; print "# not in Perl itself.\n"; } print "not ";