X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/b695f709e8a342e35e482b0437eb6cdacdc58b6b..78325d7a7e6183fba46cbfb1dbd40def2996b940:/lib/locale.t

diff --git a/lib/locale.t b/lib/locale.t
index 19fba59..a66810b 100644
--- a/lib/locale.t
+++ b/lib/locale.t
@@ -43,13 +43,21 @@ eval {
 
 # Visual C's CRT goes silly on strings of the form "en_US.ISO8859-1"
 # and mingw32 uses said silly CRT
-$have_setlocale = 0 if (($^O eq 'MSWin32' || $^O eq 'NetWare') && $Config{cc} =~ /^(cl|gcc)/i);
+# This doesn't seem to be an issue any more, at least on Windows XP,
+# so re-enable the tests for Windows XP onwards.
+my $winxp = ($^O eq 'MSWin32' && defined &Win32::GetOSVersion &&
+		join('.', (Win32::GetOSVersion())[1..2]) >= 5.1);
+$have_setlocale = 0 if ((($^O eq 'MSWin32' && !$winxp) || $^O eq 'NetWare') &&
+		$Config{cc} =~ /^(cl|gcc)/i);
+
+# UWIN seems to loop after test 98, just skip for now
+$have_setlocale = 0 if ($^O =~ /^uwin/);
 
 my $last = $have_setlocale ? &last : &last_without_setlocale;
 
 print "1..$last\n";
 
-use vars qw(&LC_ALL);
+sub LC_ALL ();
 
 $a = 'abc %';
 
@@ -250,7 +258,7 @@ debug "# Scanning for locales...\n";
 # Note that it's okay that some languages have their native names
 # capitalized here even though that's not "right".  They are lowercased
 # anyway later during the scanning process (and besides, some clueless
-# vendor might have them capitalized errorneously anyway).
+# vendor might have them capitalized erroneously anyway).
 
 my $locales = <<EOF;
 Afrikaans:af:za:1 15
@@ -308,12 +316,12 @@ if ($^O eq 'os390') {
     $locales =~ s/Thai:th:th:11 tis620\n//;
 }
 
-sub in_utf8 () { $^H & 0x08 }
+sub in_utf8 () { $^H & 0x08 || (${^OPEN} || "") =~ /:utf8/ }
 
 if (in_utf8) {
-    require "locale/utf8";
+    require "lib/locale/utf8";
 } else {
-    require "locale/latin1";
+    require "lib/locale/latin1";
 }
 
 my @Locale;
@@ -379,6 +387,10 @@ delete @ENV{qw(IFS CDPATH ENV BASH_ENV)};
 
 if (-x "/usr/bin/locale" && open(LOCALES, "/usr/bin/locale -a 2>/dev/null|")) {
     while (<LOCALES>) {
+	# It seems that /usr/bin/locale steadfastly outputs 8 bit data, which
+	# ain't great when we're running this testPERL_UNICODE= so that utf8
+	# locales will cause all IO hadles to default to (assume) utf8
+	next unless utf8::valid($_);
         chomp;
 	trylocale($_);
     }
@@ -392,6 +404,17 @@ if (-x "/usr/bin/locale" && open(LOCALES, "/usr/bin/locale -a 2>/dev/null|")) {
         trylocale($_);
     }
     close(LOCALES);
+} elsif ($^O eq 'openbsd' && -e '/usr/share/locale') {
+
+   # OpenBSD doesn't have a locale executable, so reading /usr/share/locale
+   # is much easier and faster than the last resort method.
+
+    opendir(LOCALES, '/usr/share/locale');
+    while ($_ = readdir(LOCALES)) {
+        chomp;
+        trylocale($_);
+    }
+    close(LOCALES);
 } else {
 
     # This is going to be slow.
@@ -430,11 +453,25 @@ if (-x "/usr/bin/locale" && open(LOCALES, "/usr/bin/locale -a 2>/dev/null|")) {
 
 setlocale(LC_ALL, "C");
 
-sub utf8locale { $_[0] =~ /utf-?8/i }
+if ($^O eq 'darwin') {
+    # Darwin 8/Mac OS X 10.4 and 10.5 have bad Basque locales: perl bug #35895,
+    # Apple bug ID# 4139653. It also has a problem in Byelorussian.
+    (my $v) = $Config{osvers} =~ /^(\d+)/;
+    if ($v >= 8 and $v < 10) {
+	debug "# Skipping eu_ES, be_BY locales -- buggy in Darwin\n";
+	@Locale = grep ! m/^(eu_ES(?:\..*)?|be_BY\.CP1131)$/, @Locale;
+    } elsif ($v < 12) {
+	debug "# Skipping be_BY locales -- buggy in Darwin\n";
+	@Locale = grep ! m/^be_BY\.CP1131$/, @Locale;
+    }
+}
 
 @Locale = sort @Locale;
 
-debug "# Locales = @Locale\n";
+debug "# Locales =\n";
+for ( @Locale ) {
+    debug "# $_\n";
+}
 
 my %Problem;
 my %Okay;
@@ -520,18 +557,29 @@ foreach $Locale (@Locale) {
 
 	# Test \w.
     
-	if (utf8locale($Locale)) {
-	    # utf8 and locales do not mix.
-	    debug "# skipping UTF-8 locale '$Locale'\n";
-	    push @utf8locale, $Locale;
-            @utf8skip{99..102} = ();
-	} else {
-	    my $word = join('', @Neoalpha);
+	my $word = join('', @Neoalpha);
 
-	    $word =~ /^(\w+)$/;
- 
-	    tryneoalpha($Locale, 99, $1 eq $word);
+	my $badutf8;
+	{
+	    local $SIG{__WARN__} = sub {
+		$badutf8 = $_[0] =~ /Malformed UTF-8/;
+	    };
+	    $Locale =~ /utf-?8/i;
+	}
+
+	if ($badutf8) {
+	    debug "# Locale name contains bad UTF-8, skipping test 99 for locale '$Locale'\n";
+	} elsif ($Locale =~ /utf-?8/i) {
+	    debug "# unknown whether locale and Unicode have the same \\w, skipping test 99 for locale '$Locale'\n";
+	    push @{$Okay{99}}, $Locale;
+	} else {
+	    if ($word =~ /^(\w+)$/) {
+		tryneoalpha($Locale, 99, 1);
+	    } else {
+		tryneoalpha($Locale, 99, 0);
+	    }
 	}
+
 	# Cross-check the whole 8-bit character set.
 
 	for (map { chr } 0..255) {
@@ -712,32 +760,68 @@ foreach $Locale (@Locale) {
     # case-insensitively the UPPER, and does the UPPER match
     # case-insensitively the lc of the UPPER.  And vice versa.
     {
-        if (utf8locale($Locale)) {
-	    # utf8 and locales do not mix.
-	    debug "# skipping UTF-8 locale '$Locale'\n";
-	    push @utf8locale, $Locale;
-            $utf8skip{117}++;
-	} else {
-	    use locale;
-	    use locale;
-	    no utf8; # so that the native 8-bit characters work
-
-	    my @f = ();
-	    foreach my $x (keys %UPPER) {
-		my $y = lc $x;
-		next unless uc $y eq $x;
-		push @f, $x unless $x =~ /$y/i && $y =~ /$x/i;
-	    }
-	    foreach my $x (keys %lower) {
-		my $y = uc $x;
-		next unless lc $y eq $x;
-		push @f, $x unless $x =~ /$y/i && $y =~ /$x/i;
+        use locale;
+        no utf8;
+        my $re = qr/[\[\(\{\*\+\?\|\^\$\\]/;
+
+        my @f = ();
+        foreach my $x (keys %UPPER) {
+	    my $y = lc $x;
+	    next unless uc $y eq $x;
+	    print "# UPPER $x lc $y ",
+	    $x =~ /$y/i ? 1 : 0, " ",
+	    $y =~ /$x/i ? 1 : 0, "\n" if 0;
+	    #
+	    # If $x and $y contain regular expression characters
+	    # AND THEY lowercase (/i) to regular expression characters,
+	    # regcomp() will be mightily confused.  No, the \Q doesn't
+	    # help here (maybe regex engine internal lowercasing
+	    # is done after the \Q?)  An example of this happening is
+	    # the bg_BG (Bulgarian) locale under EBCDIC (OS/390 USS):
+	    # the chr(173) (the "[") is the lowercase of the chr(235).
+	    #
+	    # Similarly losing EBCDIC locales include cs_cz, cs_CZ,
+	    # el_gr, el_GR, en_us.IBM-037 (!), en_US.IBM-037 (!),
+	    # et_ee, et_EE, hr_hr, hr_HR, hu_hu, hu_HU, lt_LT,
+	    # mk_mk, mk_MK, nl_nl.IBM-037, nl_NL.IBM-037,
+	    # pl_pl, pl_PL, ro_ro, ro_RO, ru_ru, ru_RU,
+	    # sk_sk, sk_SK, sl_si, sl_SI, tr_tr, tr_TR.
+	    #
+	    # Similar things can happen even under (bastardised)
+	    # non-EBCDIC locales: in many European countries before the
+	    # advent of ISO 8859-x nationally customised versions of
+	    # ISO 646 were devised, reusing certain punctuation
+	    # characters for modified characters needed by the
+	    # country/language.  For example, the "|" might have
+	    # stood for U+00F6 or LATIN SMALL LETTER O WITH DIAERESIS.
+	    #
+	    if ($x =~ $re || $y =~ $re) {
+		print "# Regex characters in '$x' or '$y', skipping test 117 for locale '$Locale'\n";
+		next;
 	    }
-	    tryneoalpha($Locale, 117, @f == 0);
-	    if (@f) {
-		print "# failed 117 locale '$Locale' characters @f\n"
-  	    }
+	    # With utf8 both will fail since the locale concept
+	    # of upper/lower does not work well in Unicode.
+	    push @f, $x unless $x =~ /$y/i == $y =~ /$x/i;
         }
+
+	foreach my $x (keys %lower) {
+	    my $y = uc $x;
+	    next unless lc $y eq $x;
+	    print "# lower $x uc $y ",
+	    $x =~ /$y/i ? 1 : 0, " ",
+	    $y =~ /$x/i ? 1 : 0, "\n" if 0;
+	    if ($x =~ $re || $y =~ $re) { # See above.
+		print "# Regex characters in '$x' or '$y', skipping test 117 for locale '$Locale'\n";
+		next;
+	    }
+	    # With utf8 both will fail since the locale concept
+	    # of upper/lower does not work well in Unicode.
+	    push @f, $x unless $x =~ /$y/i == $y =~ /$x/i;
+	}
+	tryneoalpha($Locale, 117, @f == 0);
+	if (@f) {
+	    print "# failed 117 locale '$Locale' characters @f\n"
+	}
     }
 }
 
@@ -747,7 +831,7 @@ foreach (&last_without_setlocale()+1..$last) {
     if ($Problem{$_} || !defined $Okay{$_} || !@{$Okay{$_}}) {
 	if ($_ == 102) {
 	    print "# The failure of test 102 is not necessarily fatal.\n";
-	    print "# It usually indicates a problem in the enviroment,\n";
+	    print "# It usually indicates a problem in the environment,\n";
 	    print "# not in Perl itself.\n";
 	}
 	print "not ";