X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/4eac893cc971b0cfd7f95c7b1ad0632537436100..30962f6865ad51522f9f4f83ca691414fcb6eac6:/lib/locale.t diff --git a/lib/locale.t b/lib/locale.t index e782f07..c8741c3 100644 --- a/lib/locale.t +++ b/lib/locale.t @@ -19,6 +19,7 @@ BEGIN { print "1..0\n"; exit; } + require './loc_tools.pl'; $| = 1; } @@ -30,11 +31,15 @@ my $debug = $ENV{PERL_DEBUG_FULL_TEST} // 0; # Certain tests have been shown to be problematical for a few locales. Don't # fail them unless at least this percentage of the tested locales fail. -# Some Windows machines are defective in every in every locale but the C, -# calling \t printable; superscripts to be digits, etc. See -# http://markmail.org/message/5jwam4xsx4amsdnv +# On AIX machines, many locales call a no-break space a graphic. # (There aren't 1000 locales currently in existence, so 99.9 works) -my $acceptable_fold_failure_percentage = $^O eq 'MSWin32' ? 99.9 : 5; +my $acceptable_failure_percentage = ($^O =~ / ^ ( AIX ) $ /ix) + ? 99.9 + : 5; + +# The list of test numbers of the problematic tests. +my %problematical_tests; + use Dumpvalue; @@ -45,7 +50,7 @@ my $dumper = Dumpvalue->new( ); sub debug { return unless $debug; - my($mess) = join "", @_; + my($mess) = join "", '# ', @_; chop $mess; print $dumper->stringify($mess,1), "\n"; } @@ -59,26 +64,7 @@ sub debugf { printf @_ if $debug; } -my $have_setlocale = 0; -eval { - require POSIX; - import POSIX ':locale_h'; - $have_setlocale++; -}; - -# Visual C's CRT goes silly on strings of the form "en_US.ISO8859-1" -# and mingw32 uses said silly CRT -# This doesn't seem to be an issue any more, at least on Windows XP, -# so re-enable the tests for Windows XP onwards. -my $winxp = ($^O eq 'MSWin32' && defined &Win32::GetOSVersion && - join('.', (Win32::GetOSVersion())[1..2]) >= 5.1); -$have_setlocale = 0 if ((($^O eq 'MSWin32' && !$winxp) || $^O eq 'NetWare') && - $Config{cc} =~ /^(cl|gcc)/i); - -# UWIN seems to loop after taint tests, just skip for now -$have_setlocale = 0 if ($^O =~ /^uwin/); - -$a = 'abc %'; +$a = 'abc %9'; my $test_num = 0; @@ -105,13 +91,15 @@ sub is_tainted { # hello, camel two. sub check_taint ($;$) { my $message_tail = $_[1] // ""; - $message_tail = ": $message_tail" if $message_tail; + + # Extra blanks are so aligns with taint_not output + $message_tail = ": $message_tail" if $message_tail; ok is_tainted($_[0]), "verify that is tainted$message_tail"; } sub check_taint_not ($;$) { my $message_tail = $_[1] // ""; - $message_tail = ": $message_tail" if $message_tail; + $message_tail = ": $message_tail" if $message_tail; ok((not is_tainted($_[0])), "verify that isn't tainted$message_tail"); } @@ -121,325 +109,583 @@ check_taint_not $&, "not tainted outside 'use locale'"; use locale; # engage locale and therefore locale taint. -check_taint_not $a; - -check_taint uc($a); -check_taint "\U$a"; -check_taint ucfirst($a); -check_taint "\u$a"; -check_taint lc($a); -check_taint fc($a); -check_taint "\L$a"; -check_taint "\F$a"; -check_taint lcfirst($a); -check_taint "\l$a"; - -check_taint_not sprintf('%e', 123.456); -check_taint_not sprintf('%f', 123.456); -check_taint_not sprintf('%g', 123.456); -check_taint_not sprintf('%d', 123.456); -check_taint_not sprintf('%x', 123.456); +# BE SURE TO COPY ANYTHING YOU ADD to these tests to the block below for +# ":notcharacters" + +check_taint_not $a, '$a'; + +check_taint uc($a), 'uc($a)'; +check_taint "\U$a", '"\U$a"'; +check_taint ucfirst($a), 'ucfirst($a)'; +check_taint "\u$a", '"\u$a"'; +check_taint lc($a), 'lc($a)'; +check_taint fc($a), 'fc($a)'; +check_taint "\L$a", '"\L$a"'; +check_taint "\F$a", '"\F$a"'; +check_taint lcfirst($a), 'lcfirst($a)'; +check_taint "\l$a", '"\l$a"'; + +check_taint_not sprintf('%e', 123.456), "sprintf('%e', 123.456)"; +check_taint_not sprintf('%f', 123.456), "sprintf('%f', 123.456)"; +check_taint_not sprintf('%g', 123.456), "sprintf('%g', 123.456)"; +check_taint_not sprintf('%d', 123.456), "sprintf('%d', 123.456)"; +check_taint_not sprintf('%x', 123.456), "sprintf('%x', 123.456)"; $_ = $a; # untaint $_ $_ = uc($a); # taint $_ -check_taint $_; +check_taint $_, '$_ = uc($a)'; /(\w)/; # taint $&, $`, $', $+, $1. -check_taint $&; -check_taint $`; -check_taint $'; -check_taint $+; -check_taint $1; -check_taint_not $2; +check_taint $&, "\$& from /(\\w)/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; /(.)/; # untaint $&, $`, $', $+, $1. -check_taint_not $&; -check_taint_not $`; -check_taint_not $'; -check_taint_not $+; -check_taint_not $1; -check_taint_not $2; +check_taint_not $&, "\$& from /(.)/"; +check_taint_not $`, "\t\$`"; +check_taint_not $', "\t\$'"; +check_taint_not $+, "\t\$+"; +check_taint_not $1, "\t\$1"; +check_taint_not $2, "\t\$2"; /(\W)/; # taint $&, $`, $', $+, $1. -check_taint $&; -check_taint $`; -check_taint $'; -check_taint $+; -check_taint $1; -check_taint_not $2; +check_taint $&, "\$& from /(\\W)/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /(.)/"; +check_taint_not $`, "\t\$`"; +check_taint_not $', "\t\$'"; +check_taint_not $+, "\t\$+"; +check_taint_not $1, "\t\$1"; +check_taint_not $2, "\t\$2"; /(\s)/; # taint $&, $`, $', $+, $1. -check_taint $&; -check_taint $`; -check_taint $'; -check_taint $+; -check_taint $1; -check_taint_not $2; +check_taint $&, "\$& from /(\\s)/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /(.)/"; /(\S)/; # taint $&, $`, $', $+, $1. -check_taint $&; -check_taint $`; -check_taint $'; -check_taint $+; -check_taint $1; -check_taint_not $2; +check_taint $&, "\$& from /(\\S)/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /(.)/"; + +"0" =~ /(\d)/; # taint $&, $`, $', $+, $1. +check_taint $&, "\$& from /(\\d)/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /(.)/"; + +/(\D)/; # taint $&, $`, $', $+, $1. +check_taint $&, "\$& from /(\\D)/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /(.)/"; + +/([[:alnum:]])/; # taint $&, $`, $', $+, $1. +check_taint $&, "\$& from /([[:alnum:]])/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /(.)/"; + +/([[:^alnum:]])/; # taint $&, $`, $', $+, $1. +check_taint $&, "\$& from /([[:^alnum:]])/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; + +"a" =~ /(a)|(\w)/; # taint $&, $`, $', $+, $1. +check_taint $&, "\$& from /(a)|(\\w)/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +ok($1 eq 'a', ("\t" x 5) . "\$1 is 'a'"); +ok(! defined $2, ("\t" x 5) . "\$2 is undefined"); +check_taint_not $2, "\t\$2"; +check_taint_not $3, "\t\$3"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /(.)/"; + +"\N{CYRILLIC SMALL LETTER A}" =~ /(\N{CYRILLIC CAPITAL LETTER A})/i; # no tainting because no locale dependence +check_taint_not $&, "\$& from /(\\N{CYRILLIC CAPITAL LETTER A})/i"; +check_taint_not $`, "\t\$`"; +check_taint_not $', "\t\$'"; +check_taint_not $+, "\t\$+"; +check_taint_not $1, "\t\$1"; +ok($1 eq "\N{CYRILLIC SMALL LETTER A}", ("\t" x 4) . "\t\$1 is 'small cyrillic a'"); +check_taint_not $2, "\t\$2"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /./"; + +"(\N{KELVIN SIGN})" =~ /(\N{KELVIN SIGN})/i; # taints because depends on locale +check_taint $&, "\$& from /(\\N{KELVIN SIGN})/i"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /(.)/"; + +"a:" =~ /(.)\b(.)/; # taint $&, $`, $', $+, $1. +check_taint $&, "\$& from /(.)\\b(.)/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint $2, "\t\$2"; +check_taint_not $3, "\t\$3"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /./"; + +"aa" =~ /(.)\B(.)/; # taint $&, $`, $', $+, $1. +check_taint $&, "\$& from /(.)\\B(.)/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint $2, "\t\$2"; +check_taint_not $3, "\t\$3"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /./"; + +"aaa" =~ /(.).(\1)/i; # notaint because not locale dependent +check_taint_not $&, "\$ & from /(.).(\\1)/"; +check_taint_not $`, "\t\$`"; +check_taint_not $', "\t\$'"; +check_taint_not $+, "\t\$+"; +check_taint_not $1, "\t\$1"; +check_taint_not $2, "\t\$2"; +check_taint_not $3, "\t\$3"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$ & from /./"; $_ = $a; # untaint $_ -check_taint_not $_; +check_taint_not $_, 'untainting $_ works'; /(b)/; # this must not taint -check_taint_not $&; -check_taint_not $`; -check_taint_not $'; -check_taint_not $+; -check_taint_not $1; -check_taint_not $2; +check_taint_not $&, "\$ & from /(b)/"; +check_taint_not $`, "\t\$`"; +check_taint_not $', "\t\$'"; +check_taint_not $+, "\t\$+"; +check_taint_not $1, "\t\$1"; +check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ -check_taint_not $_; +check_taint_not $_, 'untainting $_ works'; $b = uc($a); # taint $b s/(.+)/$b/; # this must taint only the $_ -check_taint $_; -check_taint_not $&; -check_taint_not $`; -check_taint_not $'; -check_taint_not $+; -check_taint_not $1; -check_taint_not $2; +check_taint $_, '$_ (wasn\'t tainted) from s/(.+)/$b/ where $b is tainted'; +check_taint_not $&, "\t\$&"; +check_taint_not $`, "\t\$`"; +check_taint_not $', "\t\$'"; +check_taint_not $+, "\t\$+"; +check_taint_not $1, "\t\$1"; +check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ s/(.+)/b/; # this must not taint -check_taint_not $_; -check_taint_not $&; -check_taint_not $`; -check_taint_not $'; -check_taint_not $+; -check_taint_not $1; -check_taint_not $2; +check_taint_not $_, '$_ (wasn\'t tainted) from s/(.+)/b/'; +check_taint_not $&, "\t\$&"; +check_taint_not $`, "\t\$`"; +check_taint_not $', "\t\$'"; +check_taint_not $+, "\t\$+"; +check_taint_not $1, "\t\$1"; +check_taint_not $2, "\t\$2"; $b = $a; # untaint $b ($b = $a) =~ s/\w/$&/; -check_taint $b; # $b should be tainted. -check_taint_not $a; # $a should be not. +check_taint $b, '$b from ($b = $a) =~ s/\w/$&/'; # $b should be tainted. +check_taint_not $a, '$a from ($b = $a) =~ s/\w/$&/'; # $a should be not. $_ = $a; # untaint $_ s/(\w)/\l$1/; # this must taint -check_taint $_; -check_taint $&; -check_taint $`; -check_taint $'; -check_taint $+; -check_taint $1; -check_taint_not $2; +check_taint $_, '$_ (wasn\'t tainted) from s/(\w)/\l$1/,'; # this must taint +check_taint $&, "\t\$&"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ s/(\w)/\L$1/; # this must taint -check_taint $_; -check_taint $&; -check_taint $`; -check_taint $'; -check_taint $+; -check_taint $1; -check_taint_not $2; +check_taint $_, '$_ (wasn\'t tainted) from s/(\w)/\L$1/,'; +check_taint $&, "\t\$&"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ s/(\w)/\u$1/; # this must taint -check_taint $_; -check_taint $&; -check_taint $`; -check_taint $'; -check_taint $+; -check_taint $1; -check_taint_not $2; +check_taint $_, '$_ (wasn\'t tainted) from s/(\w)/\u$1/'; +check_taint $&, "\t\$&"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ s/(\w)/\U$1/; # this must taint -check_taint $_; -check_taint $&; -check_taint $`; -check_taint $'; -check_taint $+; -check_taint $1; -check_taint_not $2; +check_taint $_, '$_ (wasn\'t tainted) from s/(\w)/\U$1/'; +check_taint $&, "\t\$&"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; # After all this tainting $a should be cool. -check_taint_not $a; +check_taint_not $a, '$a still not tainted'; + +"a" =~ /([a-z])/; +check_taint_not $1, '"a" =~ /([a-z])/'; +"foo.bar_baz" =~ /^(.*)[._](.*?)$/; # Bug 120675 +check_taint_not $1, '"foo.bar_baz" =~ /^(.*)[._](.*?)$/'; + +# BE SURE TO COPY ANYTHING YOU ADD to the block below { # This is just the previous tests copied here with a different # compile-time pragma. use locale ':not_characters'; # engage restricted locale with different # tainting rules - - check_taint_not $a; - - check_taint_not uc($a); - check_taint_not "\U$a"; - check_taint_not ucfirst($a); - check_taint_not "\u$a"; - check_taint_not lc($a); - check_taint_not fc($a); - check_taint_not "\L$a"; - check_taint_not "\F$a"; - check_taint_not lcfirst($a); - check_taint_not "\l$a"; - - check_taint_not sprintf('%e', 123.456); - check_taint_not sprintf('%f', 123.456); - check_taint_not sprintf('%g', 123.456); - check_taint_not sprintf('%d', 123.456); - check_taint_not sprintf('%x', 123.456); + check_taint_not $a, '$a'; + + check_taint_not uc($a), 'uc($a)'; + check_taint_not "\U$a", '"\U$a"'; + check_taint_not ucfirst($a), 'ucfirst($a)'; + check_taint_not "\u$a", '"\u$a"'; + check_taint_not lc($a), 'lc($a)'; + check_taint_not fc($a), 'fc($a)'; + check_taint_not "\L$a", '"\L$a"'; + check_taint_not "\F$a", '"\F$a"'; + check_taint_not lcfirst($a), 'lcfirst($a)'; + check_taint_not "\l$a", '"\l$a"'; + + check_taint_not sprintf('%e', 123.456), "sprintf('%e', 123.456)"; + check_taint_not sprintf('%f', 123.456), "sprintf('%f', 123.456)"; + check_taint_not sprintf('%g', 123.456), "sprintf('%g', 123.456)"; + check_taint_not sprintf('%d', 123.456), "sprintf('%d', 123.456)"; + check_taint_not sprintf('%x', 123.456), "sprintf('%x', 123.456)"; $_ = $a; # untaint $_ - $_ = uc($a); # taint $_ + $_ = uc($a); - check_taint_not $_; + check_taint_not $_, '$_ = uc($a)'; - /(\w)/; # taint $&, $`, $', $+, $1. - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + /(\w)/; + check_taint_not $&, "\$& from /(\\w)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; /(.)/; # untaint $&, $`, $', $+, $1. - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; - - /(\W)/; # taint $&, $`, $', $+, $1. - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; - - /(\s)/; # taint $&, $`, $', $+, $1. - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; - - /(\S)/; # taint $&, $`, $', $+, $1. - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + check_taint_not $&, "\$& from /(.)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + /(\W)/; + check_taint_not $&, "\$& from /(\\W)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; - $_ = $a; # untaint $_ + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + /(\s)/; + check_taint_not $&, "\$& from /(\\s)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; - check_taint_not $_; + /(\S)/; + check_taint_not $&, "\$& from /(\\S)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; - /(b)/; # this must not taint - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; - $_ = $a; # untaint $_ + "0" =~ /(\d)/; + check_taint_not $&, "\$& from /(\\d)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; + + /(\D)/; + check_taint_not $&, "\$& from /(\\D)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; + + /([[:alnum:]])/; + check_taint_not $&, "\$& from /([[:alnum:]])/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; + + /([[:^alnum:]])/; + check_taint_not $&, "\$& from /([[:^alnum:]])/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + "a" =~ /(a)|(\w)/; + check_taint_not $&, "\$& from /(a)|(\\w)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + ok($1 eq 'a', ("\t" x 5) . "\$1 is 'a'"); + ok(! defined $2, ("\t" x 5) . "\$2 is undefined"); + check_taint_not $2, "\t\$2"; + check_taint_not $3, "\t\$3"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; + + "\N{CYRILLIC SMALL LETTER A}" =~ /(\N{CYRILLIC CAPITAL LETTER A})/i; + check_taint_not $&, "\$& from /(\\N{CYRILLIC CAPITAL LETTER A})/i"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + ok($1 eq "\N{CYRILLIC SMALL LETTER A}", ("\t" x 4) . "\t\$1 is 'small cyrillic a'"); + check_taint_not $2, "\t\$2"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /./"; + + "(\N{KELVIN SIGN})" =~ /(\N{KELVIN SIGN})/i; + check_taint_not $&, "\$& from /(\\N{KELVIN SIGN})/i"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; - check_taint_not $_; + "a:" =~ /(.)\b(.)/; + check_taint_not $&, "\$& from /(.)\\b(.)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + check_taint_not $3, "\t\$3"; - $b = uc($a); # taint $b - s/(.+)/$b/; # this must taint only the $_ + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /./"; + + "aa" =~ /(.)\B(.)/; + check_taint_not $&, "\$& from /(.)\\B(.)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + check_taint_not $3, "\t\$3"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /./"; + + "aaa" =~ /(.).(\1)/i; # notaint because not locale dependent + check_taint_not $&, "\$ & from /(.).(\\1)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + check_taint_not $3, "\t\$3"; - check_taint_not $_; - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$ & from /./"; $_ = $a; # untaint $_ - s/(.+)/b/; # this must not taint - check_taint_not $_; - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + check_taint_not $_, 'untainting $_ works'; + + /(b)/; + check_taint_not $&, "\$ & from /(b)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + $_ = $a; # untaint $_ + + check_taint_not $_, 'untainting $_ works'; + + s/(.+)/b/; + check_taint_not $_, '$_ (wasn\'t tainted) from s/(.+)/b/'; + check_taint_not $&, "\t\$&"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; $b = $a; # untaint $b ($b = $a) =~ s/\w/$&/; - check_taint_not $b; # $b should be tainted. - check_taint_not $a; # $a should be not. + check_taint_not $b, '$b from ($b = $a) =~ s/\w/$&/'; + check_taint_not $a, '$a from ($b = $a) =~ s/\w/$&/'; $_ = $a; # untaint $_ - s/(\w)/\l$1/; # this must taint - check_taint_not $_; - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + s/(\w)/\l$1/; + check_taint_not $_, '$_ (wasn\'t tainted) from s/(\w)/\l$1/,'; # this must taint + check_taint_not $&, "\t\$&"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ - s/(\w)/\L$1/; # this must taint - check_taint_not $_; - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + s/(\w)/\L$1/; + check_taint_not $_, '$_ (wasn\'t tainted) from s/(\w)/\L$1/,'; + check_taint_not $&, "\t\$&"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ - s/(\w)/\u$1/; # this must taint - check_taint_not $_; - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + s/(\w)/\u$1/; + check_taint_not $_, '$_ (wasn\'t tainted) from s/(\w)/\u$1/'; + check_taint_not $&, "\t\$&"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ - s/(\w)/\U$1/; # this must taint - check_taint_not $_; - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + s/(\w)/\U$1/; + check_taint_not $_, '$_ (wasn\'t tainted) from s/(\w)/\U$1/'; + check_taint_not $&, "\t\$&"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; # After all this tainting $a should be cool. - check_taint_not $a; + check_taint_not $a, '$a still not tainted'; + + "a" =~ /([a-z])/; + check_taint_not $1, '"a" =~ /([a-z])/'; + "foo.bar_baz" =~ /^(.*)[._](.*?)$/; # Bug 120675 + check_taint_not $1, '"foo.bar_baz" =~ /^(.*)[._](.*?)$/'; + } # Here are in scope of 'use locale' @@ -448,240 +694,31 @@ check_taint_not $a; # Let us do some *real* locale work now, # unless setlocale() is missing (i.e. minitest). -unless ($have_setlocale) { - print "1..$test_num\n"; - exit; -} - # The test number before our first setlocale() my $final_without_setlocale = $test_num; # Find locales. -debug "# Scanning for locales...\n"; - -# Note that it's okay that some languages have their native names -# capitalized here even though that's not "right". They are lowercased -# anyway later during the scanning process (and besides, some clueless -# vendor might have them capitalized erroneously anyway). - -my $locales = </dev/null|")) { - while () { - # It seems that /usr/bin/locale steadfastly outputs 8 bit data, which - # ain't great when we're running this testPERL_UNICODE= so that utf8 - # locales will cause all IO hadles to default to (assume) utf8 - next unless utf8::valid($_); - chomp; - trylocale($_); - } - close(LOCALES); -} elsif ($^O eq 'VMS' && defined($ENV{'SYS$I18N_LOCALE'}) && -d 'SYS$I18N_LOCALE') { -# The SYS$I18N_LOCALE logical name search list was not present on -# VAX VMS V5.5-12, but was on AXP && VAX VMS V6.2 as well as later versions. - opendir(LOCALES, "SYS\$I18N_LOCALE:"); - while ($_ = readdir(LOCALES)) { - chomp; - trylocale($_); - } - close(LOCALES); -} elsif (($^O eq 'openbsd' || $^O eq 'bitrig' ) && -e '/usr/share/locale') { - - # OpenBSD doesn't have a locale executable, so reading /usr/share/locale - # is much easier and faster than the last resort method. - - opendir(LOCALES, '/usr/share/locale'); - while ($_ = readdir(LOCALES)) { - chomp; - trylocale($_); - } - close(LOCALES); -} else { - - # This is going to be slow. - - foreach my $locale (split(/\n/, $locales)) { - my ($locale_name, $language_codes, $country_codes, $encodings) = - split(/:/, $locale); - my @enc = decode_encodings($encodings); - foreach my $loc (split(/ /, $locale_name)) { - trylocale($loc); - foreach my $enc (@enc) { - trylocale("$loc.$enc"); - } - $loc = lc $loc; - foreach my $enc (@enc) { - trylocale("$loc.$enc"); - } - } - foreach my $lang (split(/ /, $language_codes)) { - trylocale($lang); - foreach my $country (split(/ /, $country_codes)) { - my $lc = "${lang}_${country}"; - trylocale($lc); - foreach my $enc (@enc) { - trylocale("$lc.$enc"); - } - my $lC = "${lang}_\U${country}"; - trylocale($lC); - foreach my $enc (@enc) { - trylocale("$lC.$enc"); - } - } - } - } -} setlocale(&POSIX::LC_ALL, "C"); -if ($^O eq 'darwin') { - # Darwin 8/Mac OS X 10.4 and 10.5 have bad Basque locales: perl bug #35895, - # Apple bug ID# 4139653. It also has a problem in Byelorussian. - (my $v) = $Config{osvers} =~ /^(\d+)/; - if ($v >= 8 and $v < 10) { - debug "# Skipping eu_ES, be_BY locales -- buggy in Darwin\n"; - @Locale = grep ! m/^(eu_ES(?:\..*)?|be_BY\.CP1131)$/, @Locale; - } elsif ($v < 12) { - debug "# Skipping be_BY locales -- buggy in Darwin\n"; - @Locale = grep ! m/^be_BY\.CP1131$/, @Locale; - } -} - -@Locale = sort @Locale; - -debug "# Locales =\n"; -for ( @Locale ) { - debug "# $_\n"; -} +my %posixes; my %Problem; my %Okay; @@ -768,13 +805,71 @@ sub disp_chars { return $output; } +sub disp_str ($) { + my $string = shift; + + # Displays the string unambiguously. ASCII printables are always output + # as-is, though perhaps separated by blanks from other characters. If + # entirely printable ASCII, just returns the string. Otherwise if valid + # UTF-8 it uses the character names for non-printable-ASCII. Otherwise it + # outputs hex for each non-ASCII-printable byte. + + return $string if $string =~ / ^ [[:print:]]* $/xa; + + my $result = ""; + my $prev_was_punct = 1; # Beginning is considered punct + if (utf8::valid($string) && utf8::is_utf8($string)) { + use charnames (); + foreach my $char (split "", $string) { + + # Keep punctuation adjacent to other characters; otherwise + # separate them with a blank + if ($char =~ /[[:punct:]]/a) { + $result .= $char; + $prev_was_punct = 1; + } + elsif ($char =~ /[[:print:]]/a) { + $result .= " " unless $prev_was_punct; + $result .= $char; + $prev_was_punct = 0; + } + else { + $result .= " " unless $prev_was_punct; + $result .= charnames::viacode(ord $char); + $prev_was_punct = 0; + } + } + } + else { + use bytes; + foreach my $char (split "", $string) { + if ($char =~ /[[:punct:]]/a) { + $result .= $char; + $prev_was_punct = 1; + } + elsif ($char =~ /[[:print:]]/a) { + $result .= " " unless $prev_was_punct; + $result .= $char; + $prev_was_punct = 0; + } + else { + $result .= " " unless $prev_was_punct; + $result .= sprintf("%02X", ord $char); + $prev_was_punct = 0; + } + } + } + + return $result; +} + sub report_result { my ($Locale, $i, $pass_fail, $message) = @_; $message //= ""; $message = " ($message)" if $message; unless ($pass_fail) { $Problem{$i}{$Locale} = 1; - debug "# failed $i ($test_names{$i}) with locale '$Locale'$message\n"; + debug "failed $i ($test_names{$i}) with locale '$Locale'$message\n"; } else { push @{$Okay{$i}}, $Locale; } @@ -797,31 +892,38 @@ my $first_locales_test_number = $final_without_setlocale + 1; my $locales_test_number; my $not_necessarily_a_problem_test_number; my $first_casing_test_number; -my $final_casing_test_number; my %setlocale_failed; # List of locales that setlocale() didn't work on -foreach $Locale (@Locale) { +foreach my $Locale (@Locale) { $locales_test_number = $first_locales_test_number - 1; - debug "#\n"; - debug "# Locale = $Locale\n"; + debug "\n"; + debug "Locale = $Locale\n"; unless (setlocale(&POSIX::LC_ALL, $Locale)) { $setlocale_failed{$Locale} = $Locale; next; } - # We test UTF-8 locales only under ':not_characters'; otherwise they have - # documented deficiencies. Non- UTF-8 locales are tested only under plain - # 'use locale', as otherwise we would have to convert everything in them - # to Unicode. - # The locale name doesn't necessarily have to have "utf8" in it to be a - # UTF-8 locale, but it works mostly. - my $is_utf8_locale = $Locale =~ /UTF-?8/i; + # We test UTF-8 locales only under ':not_characters'; It is easier to + # test them in other test files than here. Non- UTF-8 locales are tested + # only under plain 'use locale', as otherwise we would have to convert + # everything in them to Unicode. my %UPPER = (); # All alpha X for which uc(X) == X and lc(X) != X my %lower = (); # All alpha X for which lc(X) == X and uc(X) != X my %BoThCaSe = (); # All alpha X for which uc(X) == lc(X) == X + my $is_utf8_locale = is_locale_utf8($Locale); + + debug "is utf8 locale? = $is_utf8_locale\n"; + + my $radix = localeconv()->{decimal_point}; + if ($radix !~ / ^ [[:ascii:]] + $/x) { + use bytes; + $radix = disp_chars(split "", $radix); + } + debug "radix = $radix\n"; + if (! $is_utf8_locale) { use locale; @{$posixes{'word'}} = grep /\w/, map { chr } 0..255; @@ -884,21 +986,21 @@ foreach $Locale (@Locale) { # Ordered, where possible, in groups of "this is a subset of the next # one" - debug "# :upper: = ", disp_chars(@{$posixes{'upper'}}), "\n"; - debug "# :lower: = ", disp_chars(@{$posixes{'lower'}}), "\n"; - debug "# :cased: = ", disp_chars(@{$posixes{'cased'}}), "\n"; - debug "# :alpha: = ", disp_chars(@{$posixes{'alpha'}}), "\n"; - debug "# :alnum: = ", disp_chars(@{$posixes{'alnum'}}), "\n"; - debug "# w = ", disp_chars(@{$posixes{'word'}}), "\n"; - debug "# :graph: = ", disp_chars(@{$posixes{'graph'}}), "\n"; - debug "# :print: = ", disp_chars(@{$posixes{'print'}}), "\n"; - debug "# d = ", disp_chars(@{$posixes{'digit'}}), "\n"; - debug "# :xdigit: = ", disp_chars(@{$posixes{'xdigit'}}), "\n"; - debug "# :blank: = ", disp_chars(@{$posixes{'blank'}}), "\n"; - debug "# s = ", disp_chars(@{$posixes{'space'}}), "\n"; - debug "# :punct: = ", disp_chars(@{$posixes{'punct'}}), "\n"; - debug "# :cntrl: = ", disp_chars(@{$posixes{'cntrl'}}), "\n"; - debug "# :ascii: = ", disp_chars(@{$posixes{'ascii'}}), "\n"; + debug ":upper: = ", disp_chars(@{$posixes{'upper'}}), "\n"; + debug ":lower: = ", disp_chars(@{$posixes{'lower'}}), "\n"; + debug ":cased: = ", disp_chars(@{$posixes{'cased'}}), "\n"; + debug ":alpha: = ", disp_chars(@{$posixes{'alpha'}}), "\n"; + debug ":alnum: = ", disp_chars(@{$posixes{'alnum'}}), "\n"; + debug " w = ", disp_chars(@{$posixes{'word'}}), "\n"; + debug ":graph: = ", disp_chars(@{$posixes{'graph'}}), "\n"; + debug ":print: = ", disp_chars(@{$posixes{'print'}}), "\n"; + debug " d = ", disp_chars(@{$posixes{'digit'}}), "\n"; + debug ":xdigit: = ", disp_chars(@{$posixes{'xdigit'}}), "\n"; + debug ":blank: = ", disp_chars(@{$posixes{'blank'}}), "\n"; + debug " s = ", disp_chars(@{$posixes{'space'}}), "\n"; + debug ":punct: = ", disp_chars(@{$posixes{'punct'}}), "\n"; + debug ":cntrl: = ", disp_chars(@{$posixes{'cntrl'}}), "\n"; + debug ":ascii: = ", disp_chars(@{$posixes{'ascii'}}), "\n"; foreach (keys %UPPER) { @@ -922,14 +1024,14 @@ foreach $Locale (@Locale) { } } - debug "# UPPER = ", disp_chars(keys %UPPER), "\n"; - debug "# lower = ", disp_chars(keys %lower), "\n"; - debug "# BoThCaSe = ", disp_chars(keys %BoThCaSe), "\n"; - debug "# Unassigned = ", disp_chars(sort { ord $a <=> ord $b } keys %Unassigned), "\n"; + debug "UPPER = ", disp_chars(sort { ord $a <=> ord $b } keys %UPPER), "\n"; + debug "lower = ", disp_chars(sort { ord $a <=> ord $b } keys %lower), "\n"; + debug "BoThCaSe = ", disp_chars(sort { ord $a <=> ord $b } keys %BoThCaSe), "\n"; + debug "Unassigned = ", disp_chars(sort { ord $a <=> ord $b } keys %Unassigned), "\n"; my @failures; my @fold_failures; - foreach my $x (sort keys %UPPER) { + foreach my $x (sort { ord $a <=> ord $b } keys %UPPER) { my $ok; my $fold_ok; if ($is_utf8_locale) { @@ -958,7 +1060,7 @@ foreach $Locale (@Locale) { undef @failures; undef @fold_failures; - foreach my $x (sort keys %lower) { + foreach my $x (sort { ord $a <=> ord $b } keys %lower) { my $ok; my $fold_ok; if ($is_utf8_locale) { @@ -994,9 +1096,9 @@ foreach $Locale (@Locale) { } } - @Added_alpha = sort @Added_alpha; + @Added_alpha = sort { ord $a <=> ord $b } @Added_alpha; - debug "# Added_alpha = ", disp_chars(@Added_alpha), "\n"; + debug "Added_alpha = ", disp_chars(@Added_alpha), "\n"; # Cross-check the whole 8-bit character set. @@ -1229,16 +1331,46 @@ foreach $Locale (@Locale) { ++$locales_test_number; undef @f; - $test_names{$locales_test_number} = 'Verify that [:digit:] is a subset of [:xdigit:]'; + my @xdigit_digits; # :digit: & :xdigit: + $test_names{$locales_test_number} = 'Verify that [:xdigit:] contains one or two blocks of 10 consecutive [:digit:] chars'; for (map { chr } 0..255) { if ($is_utf8_locale) { use locale ':not_characters'; - push @f, $_ if /[[:digit:]]/ and ! /[[:xdigit:]]/; + # For utf8 locales, we actually use a stricter test: that :digit: + # is a subset of :xdigit:, as we know that only 0-9 should match + push @f, $_ if /[[:digit:]]/ and ! /[[:xdigit:]]/; } else { - push @f, $_ if /[[:digit:]]/ and ! /[[:xdigit:]]/; + push @xdigit_digits, $_ if /[[:digit:]]/ and /[[:xdigit:]]/; + } + } + if (! $is_utf8_locale) { + + # For non-utf8 locales, @xdigit_digits is a list of the characters + # that are both :xdigit: and :digit:. Because :digit: is stored in + # increasing code point order (unless the tests above failed), + # @xdigit_digits is as well. There should be exactly 10 or + # 20 of these. + if (@xdigit_digits != 10 && @xdigit_digits != 20) { + @f = @xdigit_digits; + } + else { + + # Look for contiguity in the series, adding any wrong ones to @f + my @temp = @xdigit_digits; + while (@temp > 1) { + push @f, $temp[1] if ($temp[0] != $temp[1] - 1) + + # Skip this test for the 0th character of + # the second block of 10, as it won't be + # contiguous with the previous block + && (! defined $xdigit_digits[10] + || $temp[1] != $xdigit_digits[10]); + shift @temp; + } } } + report_multi_result($Locale, $locales_test_number, \@f); ++$locales_test_number; @@ -1260,17 +1392,22 @@ foreach $Locale (@Locale) { $test_names{$locales_test_number} = 'Verify that any additional members of [:xdigit:], are in groups of 6 consecutive code points'; my $previous_ord; my $count = 0; - for (map { chr } 0..255) { - next unless /[[:xdigit:]]/; - next if /[[:digit:]]/; - next if /[A-Fa-f]/; + for my $chr (map { chr } 0..255) { + next unless $chr =~ /[[:xdigit:]]/; + if ($is_utf8_locale) { + next if $chr =~ /[[:digit:]]/; + } + else { + next if grep { $chr eq $_ } @xdigit_digits; + } + next if $chr =~ /[A-Fa-f]/; if (defined $previous_ord) { if ($is_utf8_locale) { use locale ':not_characters'; - push @f, $_ if ord $_ != $previous_ord + 1; + push @f, $chr if ord $chr != $previous_ord + 1; } else { - push @f, $_ if ord $_ != $previous_ord + 1; + push @f, $chr if ord $chr != $previous_ord + 1; } } $count++; @@ -1278,7 +1415,7 @@ foreach $Locale (@Locale) { undef $previous_ord; } else { - $previous_ord = ord $_; + $previous_ord = ord $chr; } } report_multi_result($Locale, $locales_test_number, \@f); @@ -1463,7 +1600,10 @@ foreach $Locale (@Locale) { } report_multi_result($Locale, $locales_test_number, \@f); - $final_casing_test_number = $locales_test_number; + foreach ($first_casing_test_number..$locales_test_number) { + $problematical_tests{$_} = 1; + } + # Test for read-only scalars' locale vs non-locale comparisons. @@ -1543,20 +1683,20 @@ foreach $Locale (@Locale) { } report_result($Locale, $locales_test_number, $test == 0); if ($test) { - debug "# lesser = '$lesser'\n"; - debug "# greater = '$greater'\n"; - debug "# lesser cmp greater = ", + debug "lesser = '$lesser'\n"; + debug "greater = '$greater'\n"; + debug "lesser cmp greater = ", $lesser cmp $greater, "\n"; - debug "# greater cmp lesser = ", + debug "greater cmp lesser = ", $greater cmp $lesser, "\n"; - debug "# (greater) from = $from, to = $to\n"; + debug "(greater) from = $from, to = $to\n"; for my $ti (@test) { debugf("# %-40s %-4s", $ti, $test{$ti} ? 'FAIL' : 'ok'); if ($ti =~ /\(\.*(\$.+ +cmp +\$[^\)]+)\.*\)/) { debugf("(%s == %4d)", $1, eval $1); } - debug "\n#"; + debugf("\n#"); } last; @@ -1578,10 +1718,14 @@ foreach $Locale (@Locale) { my $ok12; my $ok13; my $ok14; + my $ok14_5; my $ok15; my $ok16; my $ok17; my $ok18; + my $ok19; + my $ok20; + my $ok21; my $c; my $d; @@ -1643,7 +1787,7 @@ foreach $Locale (@Locale) { $ok11 = $f == $c; $ok12 = abs(($f + $g) - 3.57) < 0.01; $ok13 = $w == 0; - $ok14 = $ok15 = $ok16 = 1; # Skip for non-utf8 locales + $ok14 = $ok14_5 = $ok15 = $ok16 = 1; # Skip for non-utf8 locales } { no locale; @@ -1698,15 +1842,20 @@ foreach $Locale (@Locale) { # Look for non-ASCII error messages, and verify that the first # such is in UTF-8 (the others almost certainly will be like the - # first). + # first). This is only done if the current locale has LC_MESSAGES $ok14 = 1; - foreach my $err (keys %!) { - use Errno; - $! = eval "&Errno::$err"; # Convert to strerror() output - my $strerror = "$!"; - if ("$strerror" =~ /\P{ASCII}/) { - $ok14 = utf8::is_utf8($strerror); - last; + $ok14_5 = 1; + if (setlocale(&POSIX::LC_MESSAGES, $Locale)) { + foreach my $err (keys %!) { + use Errno; + $! = eval "&Errno::$err"; # Convert to strerror() output + my $strerror = "$!"; + if ("$strerror" =~ /\P{ASCII}/) { + $ok14 = utf8::is_utf8($strerror); + no locale; + $ok14_5 = "$!" !~ /\P{ASCII}/; + last; + } } } @@ -1727,11 +1876,39 @@ foreach $Locale (@Locale) { $ok18 = $j eq sprintf("%g:%g", $h, $i); } + $ok19 = $ok20 = 1; + if (setlocale(&POSIX::LC_TIME, $Locale)) { # These tests aren't affected by + # :not_characters + my @times = CORE::localtime(); + + use locale; + $ok19 = POSIX::strftime("%p", @times) ne "%p"; # [perl #119425] + my $date = POSIX::strftime("'%A' '%B' '%Z' '%p'", @times); + debug("'Day' 'Month' 'TZ' 'am/pm' = ", disp_str($date)); + + # If there is any non-ascii, it better be UTF-8 in a UTF-8 locale, and + # not UTF-8 if the locale isn't UTF-8. + $ok20 = $date =~ / ^ \p{ASCII}+ $ /x + || $is_utf8_locale == utf8::is_utf8($date); + } + + $ok21 = 1; + foreach my $err (keys %!) { + no locale; + use Errno; + $! = eval "&Errno::$err"; # Convert to strerror() output + my $strerror = "$!"; + if ("$strerror" =~ /\P{ASCII}/) { + $ok21 = 0; + last; + } + } + report_result($Locale, ++$locales_test_number, $ok1); $test_names{$locales_test_number} = 'Verify that an intervening printf doesn\'t change assignment results'; my $first_a_test = $locales_test_number; - debug "# $first_a_test..$locales_test_number: \$a = $a, \$b = $b, Locale = $Locale\n"; + debug "$first_a_test..$locales_test_number: \$a = $a, \$b = $b, Locale = $Locale\n"; report_result($Locale, ++$locales_test_number, $ok2); $test_names{$locales_test_number} = 'Verify that an intervening sprintf doesn\'t change assignment results'; @@ -1740,14 +1917,17 @@ foreach $Locale (@Locale) { report_result($Locale, ++$locales_test_number, $ok3); $test_names{$locales_test_number} = 'Verify that a different locale radix works when doing "==" with a constant'; + $problematical_tests{$locales_test_number} = 1; report_result($Locale, ++$locales_test_number, $ok4); $test_names{$locales_test_number} = 'Verify that a different locale radix works when doing "==" with a scalar'; + $problematical_tests{$locales_test_number} = 1; report_result($Locale, ++$locales_test_number, $ok5); $test_names{$locales_test_number} = 'Verify that a different locale radix works when doing "==" with a scalar and an intervening sprintf'; + $problematical_tests{$locales_test_number} = 1; - debug "# $first_c_test..$locales_test_number: \$c = $c, \$d = $d, Locale = $Locale\n"; + debug "$first_c_test..$locales_test_number: \$c = $c, \$d = $d, Locale = $Locale\n"; report_result($Locale, ++$locales_test_number, $ok6); $test_names{$locales_test_number} = 'Verify that can assign stringified under inner no-locale block'; @@ -1758,28 +1938,37 @@ foreach $Locale (@Locale) { report_result($Locale, ++$locales_test_number, $ok8); $test_names{$locales_test_number} = 'Verify that "==" with a scalar and an intervening sprintf still works in inner no locale'; + $problematical_tests{$locales_test_number} = 1; - debug "# $first_e_test..$locales_test_number: \$e = $e, no locale\n"; + debug "$first_e_test..$locales_test_number: \$e = $e, no locale\n"; report_result($Locale, ++$locales_test_number, $ok9); $test_names{$locales_test_number} = 'Verify that after a no-locale block, a different locale radix still works when doing "==" with a constant'; + $problematical_tests{$locales_test_number} = 1; my $first_f_test = $locales_test_number; report_result($Locale, ++$locales_test_number, $ok10); $test_names{$locales_test_number} = 'Verify that after a no-locale block, a different locale radix still works when doing "==" with a scalar'; + $problematical_tests{$locales_test_number} = 1; report_result($Locale, ++$locales_test_number, $ok11); $test_names{$locales_test_number} = 'Verify that after a no-locale block, a different locale radix still works when doing "==" with a scalar and an intervening sprintf'; + $problematical_tests{$locales_test_number} = 1; report_result($Locale, ++$locales_test_number, $ok12); $test_names{$locales_test_number} = 'Verify that after a no-locale block, a different locale radix can participate in an addition and function call as numeric'; + $problematical_tests{$locales_test_number} = 1; report_result($Locale, ++$locales_test_number, $ok13); $test_names{$locales_test_number} = 'Verify that don\'t get warning under "==" even if radix is not a dot'; + $problematical_tests{$locales_test_number} = 1; report_result($Locale, ++$locales_test_number, $ok14); $test_names{$locales_test_number} = 'Verify that non-ASCII UTF-8 error messages are in UTF-8'; + report_result($Locale, ++$locales_test_number, $ok14_5); + $test_names{$locales_test_number} = '... and are ASCII outside "use locale"'; + report_result($Locale, ++$locales_test_number, $ok15); $test_names{$locales_test_number} = 'Verify that a number with a UTF-8 radix has a UTF-8 stringification'; @@ -1792,7 +1981,18 @@ foreach $Locale (@Locale) { report_result($Locale, ++$locales_test_number, $ok18); $test_names{$locales_test_number} = 'Verify that a sprintf of a number back within locale scope uses locale radix'; - debug "# $first_f_test..$locales_test_number: \$f = $f, \$g = $g, back to locale = $Locale\n"; + report_result($Locale, ++$locales_test_number, $ok19); + $test_names{$locales_test_number} = 'Verify that strftime doesn\'t return "%p" in locales where %p is empty'; + + report_result($Locale, ++$locales_test_number, $ok20); + $test_names{$locales_test_number} = 'Verify that strftime returns date with UTF-8 flag appropriately set'; + $problematical_tests{$locales_test_number} = 1; # This is broken in + # OS X 10.9.3 + + report_result($Locale, ++$locales_test_number, $ok21); + $test_names{$locales_test_number} = '"$!" is ASCII only outside of locale scope'; + + debug "$first_f_test..$locales_test_number: \$f = $f, \$g = $g, back to locale = $Locale\n"; # Does taking lc separately differ from taking # the lc "in-line"? (This was the bug 19990704.002, change #3568.) @@ -1852,11 +2052,11 @@ foreach $Locale (@Locale) { my @f = (); ++$locales_test_number; $test_names{$locales_test_number} = 'Verify case insensitive matching works'; - foreach my $x (sort keys %UPPER) { + foreach my $x (sort { ord $a <=> ord $b } keys %UPPER) { if (! $is_utf8_locale) { my $y = lc $x; next unless uc $y eq $x; - debug_more( "# UPPER=", disp_chars(($x)), + debug_more( "UPPER=", disp_chars(($x)), "; lc=", disp_chars(($y)), "; ", "; fc=", disp_chars((fc $x)), "; ", disp_chars(($x)), "=~/", disp_chars(($y)), "/i=", @@ -1893,9 +2093,7 @@ foreach $Locale (@Locale) { print "# Regex characters in '$x' or '$y', skipping test $locales_test_number for locale '$Locale'\n"; next; } - # With utf8 both will fail since the locale concept - # of upper/lower does not work well in Unicode. - push @f, $x unless $x =~ /$y/i == $y =~ /$x/i; + push @f, $x unless $x =~ /$y/i && $y =~ /$x/i; # fc is not a locale concept, so Perl uses lc for it. push @f, $x unless lc $x eq fc $x; @@ -1904,7 +2102,7 @@ foreach $Locale (@Locale) { use locale ':not_characters'; my $y = lc $x; next unless uc $y eq $x; - debug_more( "# UPPER=", disp_chars(($x)), + debug_more( "UPPER=", disp_chars(($x)), "; lc=", disp_chars(($y)), "; ", "; fc=", disp_chars((fc $x)), "; ", disp_chars(($x)), "=~/", disp_chars(($y)), "/i=", @@ -1914,8 +2112,6 @@ foreach $Locale (@Locale) { $y =~ /$x/i ? 1 : 0, "\n"); - # Here, we can fully test things, unlike plain 'use locale', - # because this form does work well with Unicode push @f, $x unless $x =~ /$y/i && $y =~ /$x/i; # The places where Unicode's lc is different from fc are @@ -1924,11 +2120,11 @@ foreach $Locale (@Locale) { } } - foreach my $x (sort keys %lower) { + foreach my $x (sort { ord $a <=> ord $b } keys %lower) { if (! $is_utf8_locale) { my $y = uc $x; next unless lc $y eq $x; - debug_more( "# lower=", disp_chars(($x)), + debug_more( "lower=", disp_chars(($x)), "; uc=", disp_chars(($y)), "; ", "; fc=", disp_chars((fc $x)), "; ", disp_chars(($x)), "=~/", disp_chars(($y)), "/i=", @@ -1941,9 +2137,7 @@ foreach $Locale (@Locale) { print "# Regex characters in '$x' or '$y', skipping test $locales_test_number for locale '$Locale'\n"; next; } - # With utf8 both will fail since the locale concept - # of upper/lower does not work well in Unicode. - push @f, $x unless $x =~ /$y/i == $y =~ /$x/i; + push @f, $x unless $x =~ /$y/i && $y =~ /$x/i; push @f, $x unless lc $x eq fc $x; } @@ -1951,7 +2145,7 @@ foreach $Locale (@Locale) { use locale ':not_characters'; my $y = uc $x; next unless lc $y eq $x; - debug_more( "# lower=", disp_chars(($x)), + debug_more( "lower=", disp_chars(($x)), "; uc=", disp_chars(($y)), "; ", "; fc=", disp_chars((fc $x)), "; ", disp_chars(($x)), "=~/", disp_chars(($y)), "/i=", @@ -1966,6 +2160,7 @@ foreach $Locale (@Locale) { } } report_multi_result($Locale, $locales_test_number, \@f); + $problematical_tests{$locales_test_number} = 1; } # [perl #109318] @@ -1973,6 +2168,7 @@ foreach $Locale (@Locale) { my @f = (); ++$locales_test_number; $test_names{$locales_test_number} = 'Verify atof with locale radix and negative exponent'; + $problematical_tests{$locales_test_number} = 1; my $radix = POSIX::localeconv()->{decimal_point}; my @nums = ( @@ -2006,49 +2202,58 @@ my $final_locales_test_number = $locales_test_number; # Recount the errors. -foreach ($first_locales_test_number..$final_locales_test_number) { +foreach $test_num ($first_locales_test_number..$final_locales_test_number) { if (%setlocale_failed) { print "not "; } - elsif ($Problem{$_} || !defined $Okay{$_} || !@{$Okay{$_}}) { + elsif ($Problem{$test_num} || !defined $Okay{$test_num} || !@{$Okay{$test_num}}) { if (defined $not_necessarily_a_problem_test_number - && $_ == $not_necessarily_a_problem_test_number) + && $test_num == $not_necessarily_a_problem_test_number) { print "# The failure of test $not_necessarily_a_problem_test_number is not necessarily fatal.\n"; print "# It usually indicates a problem in the environment,\n"; print "# not in Perl itself.\n"; } - if ($Okay{$_} && ($_ >= $first_casing_test_number - && $_ <= $final_casing_test_number)) - { + if ($Okay{$test_num} && grep { $_ == $test_num } keys %problematical_tests) { + no warnings 'experimental::autoderef'; # Round to nearest .1% - my $percent_fail = (int(.5 + (1000 * scalar(keys $Problem{$_}) + my $percent_fail = (int(.5 + (1000 * scalar(keys $Problem{$test_num}) / scalar(@Locale)))) / 10; - if (! $debug && $percent_fail < $acceptable_fold_failure_percentage) - { - $test_names{$_} .= 'TODO'; - print "# ", 100 - $percent_fail, "% of locales pass the following test, so it is likely that the failures\n"; - print "# are errors in the locale definitions. The test is marked TODO, as the\n"; - print "# problem is not likely to be Perl's\n"; + if ($percent_fail < $acceptable_failure_percentage) { + if (! $debug) { + $test_names{$test_num} .= 'TODO'; + print "# ", 100 - $percent_fail, "% of locales pass the following test, so it is likely that the failures\n"; + print "# are errors in the locale definitions. The test is marked TODO, as the\n"; + print "# problem is not likely to be Perl's\n"; + } + } + if ($debug) { + print "# $percent_fail% of locales (", + scalar(keys $Problem{$test_num}), + " of ", + scalar(@Locale), + ") fail the above test (TODO cut-off is ", + $acceptable_failure_percentage, + "%)\n"; } } print "#\n"; if ($debug) { print "# The code points that had this failure are given above. Look for lines\n"; - print "# that match 'failed $_'\n"; + print "# that match 'failed $test_num'\n"; } else { print "# For more details, rerun, with environment variable PERL_DEBUG_FULL_TEST=1.\n"; - print "# Then look at that output for lines that match 'failed $_'\n"; + print "# Then look at that output for lines that match 'failed $test_num'\n"; } print "not "; } - print "ok $_"; - if (defined $test_names{$_}) { + print "ok $test_num"; + if (defined $test_names{$test_num}) { # If TODO is in the test name, make it thus - my $todo = $test_names{$_} =~ s/TODO\s*//; - print " $test_names{$_}"; + my $todo = $test_names{$test_num} =~ s/TODO\s*//; + print " $test_names{$test_num}"; print " # TODO" if $todo; } print "\n"; @@ -2056,7 +2261,7 @@ foreach ($first_locales_test_number..$final_locales_test_number) { $test_num = $final_locales_test_number; -unless ( $^O eq 'dragonfly' ) { +unless ( $^O =~ m!^(dragonfly|openbsd|bitrig|mirbsd)$! ) { # perl #115808 use warnings; my $warned = 0; @@ -2086,17 +2291,19 @@ setlocale(&POSIX::LC_ALL, "C"); my $above_latin1_case_change_delta; # Same for the specific ords > 255 # that we use - # We test an ASCII character, which should change case and be tainted; + # We test an ASCII character, which should change case; # a Latin1 character, which shouldn't change case under this C locale, - # and is tainted. # an above-Latin1 character that when the case is changed would cross - # the 255/256 boundary, so doesn't change case and isn't tainted - # (the \x{149} is one of these, but changes into 2 characters, the + # the 255/256 boundary, so doesn't change case + # (the \x{149} is one of these, but changes into 2 characters, the # first one of which doesn't cross the boundary. # the final one in each list is an above-Latin1 character whose case - # does change, and shouldn't be tainted. The code below uses its - # position in its list as a marker to indicate that it, unlike the - # other code points above ASCII, has a successful case change + # does change. The code below uses its position in its list as a + # marker to indicate that it, unlike the other code points above + # ASCII, has a successful case change + # + # All casing operations under locale (but not :not_characters) should + # taint if ($function =~ /^u/) { @list = ("", "a", "\xe0", "\xff", "\x{fb00}", "\x{149}", "\x{101}"); $ascii_case_change_delta = -32; @@ -2170,9 +2377,9 @@ setlocale(&POSIX::LC_ALL, "C"); : "; not encoded in utf8)") . " should be \"$should_be\", got \"$changed\""); - # Tainting shouldn't happen for utf8 locales, empty - # strings, or those characters above 255. - (! $is_utf8_locale && length($char) > 0 && ord($char) < 256) + # Tainting shouldn't happen for use locale :not_character + # (a utf8 locale) + (! $is_utf8_locale) ? check_taint($changed) : check_taint_not($changed); @@ -2239,25 +2446,33 @@ if ($didwarn) { my $s = join(" ", @s); $s =~ s/(.{50,60}) /$1\n#\t/g; - warn + print "# The following locales\n#\n", "#\t", $s, "\n#\n", "# tested okay.\n#\n", } else { - warn "# None of your locales were fully okay.\n"; + print "# None of your locales were fully okay.\n"; } if (@F) { my $F = join(" ", @F); $F =~ s/(.{50,60}) /$1\n#\t/g; - warn + my $details = ""; + unless ($debug) { + $details = "# For more details, rerun, with environment variable PERL_DEBUG_FULL_TEST=1.\n"; + } + elsif ($debug == 1) { + $details = "# For even more details, rerun, with environment variable PERL_DEBUG_FULL_TEST=2.\n"; + } + + print "# The following locales\n#\n", "#\t", $F, "\n#\n", "# had problems.\n#\n", - "# For more details, rerun, with environment variable PERL_DEBUG_FULL_TEST=1.\n"; + $details; } else { - warn "# None of your locales were broken.\n"; + print "# None of your locales were broken.\n"; } }