X-Git-Url: https://perl5.git.perl.org/perl5.git/blobdiff_plain/1bfe8fea345bb2df5023ad56d399be427737a3d2..b05fd80e8a75b8bba137db6cd6688f7aa8a0ec39:/lib/locale.t diff --git a/lib/locale.t b/lib/locale.t index f133351..31b40f9 100644 --- a/lib/locale.t +++ b/lib/locale.t @@ -19,17 +19,27 @@ BEGIN { print "1..0\n"; exit; } + require './loc_tools.pl'; $| = 1; } use strict; use feature 'fc'; +# =1 adds debugging output; =2 increases the verbosity somewhat my $debug = $ENV{PERL_DEBUG_FULL_TEST} // 0; # Certain tests have been shown to be problematical for a few locales. Don't # fail them unless at least this percentage of the tested locales fail. -my $acceptable_fold_failure_percentage = 5; +# On AIX machines, many locales call a no-break space a graphic. +# (There aren't 1000 locales currently in existence, so 99.9 works) +my $acceptable_failure_percentage = ($^O =~ / ^ ( AIX ) $ /ix) + ? 99.9 + : 5; + +# The list of test numbers of the problematic tests. +my %problematical_tests; + use Dumpvalue; @@ -40,37 +50,21 @@ my $dumper = Dumpvalue->new( ); sub debug { return unless $debug; - my($mess) = join "", @_; + my($mess) = join "", '# ', @_; chop $mess; print $dumper->stringify($mess,1), "\n"; } +sub debug_more { + return unless $debug > 1; + return debug(@_); +} + sub debugf { printf @_ if $debug; } -my $have_setlocale = 0; -eval { - require POSIX; - import POSIX ':locale_h'; - $have_setlocale++; -}; - -# Visual C's CRT goes silly on strings of the form "en_US.ISO8859-1" -# and mingw32 uses said silly CRT -# This doesn't seem to be an issue any more, at least on Windows XP, -# so re-enable the tests for Windows XP onwards. -my $winxp = ($^O eq 'MSWin32' && defined &Win32::GetOSVersion && - join('.', (Win32::GetOSVersion())[1..2]) >= 5.1); -$have_setlocale = 0 if ((($^O eq 'MSWin32' && !$winxp) || $^O eq 'NetWare') && - $Config{cc} =~ /^(cl|gcc)/i); - -# UWIN seems to loop after taint tests, just skip for now -$have_setlocale = 0 if ($^O =~ /^uwin/); - -sub LC_ALL (); - -$a = 'abc %'; +$a = 'abc %9'; my $test_num = 0; @@ -97,13 +91,15 @@ sub is_tainted { # hello, camel two. sub check_taint ($;$) { my $message_tail = $_[1] // ""; - $message_tail = ": $message_tail" if $message_tail; + + # Extra blanks are so aligns with taint_not output + $message_tail = ": $message_tail" if $message_tail; ok is_tainted($_[0]), "verify that is tainted$message_tail"; } sub check_taint_not ($;$) { my $message_tail = $_[1] // ""; - $message_tail = ": $message_tail" if $message_tail; + $message_tail = ": $message_tail" if $message_tail; ok((not is_tainted($_[0])), "verify that isn't tainted$message_tail"); } @@ -113,325 +109,583 @@ check_taint_not $&, "not tainted outside 'use locale'"; use locale; # engage locale and therefore locale taint. -check_taint_not $a; - -check_taint uc($a); -check_taint "\U$a"; -check_taint ucfirst($a); -check_taint "\u$a"; -check_taint lc($a); -check_taint fc($a); -check_taint "\L$a"; -check_taint "\F$a"; -check_taint lcfirst($a); -check_taint "\l$a"; - -check_taint_not sprintf('%e', 123.456); -check_taint_not sprintf('%f', 123.456); -check_taint_not sprintf('%g', 123.456); -check_taint_not sprintf('%d', 123.456); -check_taint_not sprintf('%x', 123.456); +# BE SURE TO COPY ANYTHING YOU ADD to these tests to the block below for +# ":notcharacters" + +check_taint_not $a, '$a'; + +check_taint uc($a), 'uc($a)'; +check_taint "\U$a", '"\U$a"'; +check_taint ucfirst($a), 'ucfirst($a)'; +check_taint "\u$a", '"\u$a"'; +check_taint lc($a), 'lc($a)'; +check_taint fc($a), 'fc($a)'; +check_taint "\L$a", '"\L$a"'; +check_taint "\F$a", '"\F$a"'; +check_taint lcfirst($a), 'lcfirst($a)'; +check_taint "\l$a", '"\l$a"'; + +check_taint_not sprintf('%e', 123.456), "sprintf('%e', 123.456)"; +check_taint_not sprintf('%f', 123.456), "sprintf('%f', 123.456)"; +check_taint_not sprintf('%g', 123.456), "sprintf('%g', 123.456)"; +check_taint_not sprintf('%d', 123.456), "sprintf('%d', 123.456)"; +check_taint_not sprintf('%x', 123.456), "sprintf('%x', 123.456)"; $_ = $a; # untaint $_ $_ = uc($a); # taint $_ -check_taint $_; +check_taint $_, '$_ = uc($a)'; /(\w)/; # taint $&, $`, $', $+, $1. -check_taint $&; -check_taint $`; -check_taint $'; -check_taint $+; -check_taint $1; -check_taint_not $2; +check_taint $&, "\$& from /(\\w)/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; /(.)/; # untaint $&, $`, $', $+, $1. -check_taint_not $&; -check_taint_not $`; -check_taint_not $'; -check_taint_not $+; -check_taint_not $1; -check_taint_not $2; +check_taint_not $&, "\$& from /(.)/"; +check_taint_not $`, "\t\$`"; +check_taint_not $', "\t\$'"; +check_taint_not $+, "\t\$+"; +check_taint_not $1, "\t\$1"; +check_taint_not $2, "\t\$2"; /(\W)/; # taint $&, $`, $', $+, $1. -check_taint $&; -check_taint $`; -check_taint $'; -check_taint $+; -check_taint $1; -check_taint_not $2; +check_taint $&, "\$& from /(\\W)/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /(.)/"; +check_taint_not $`, "\t\$`"; +check_taint_not $', "\t\$'"; +check_taint_not $+, "\t\$+"; +check_taint_not $1, "\t\$1"; +check_taint_not $2, "\t\$2"; /(\s)/; # taint $&, $`, $', $+, $1. -check_taint $&; -check_taint $`; -check_taint $'; -check_taint $+; -check_taint $1; -check_taint_not $2; +check_taint $&, "\$& from /(\\s)/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /(.)/"; /(\S)/; # taint $&, $`, $', $+, $1. -check_taint $&; -check_taint $`; -check_taint $'; -check_taint $+; -check_taint $1; -check_taint_not $2; +check_taint $&, "\$& from /(\\S)/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /(.)/"; + +"0" =~ /(\d)/; # taint $&, $`, $', $+, $1. +check_taint $&, "\$& from /(\\d)/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /(.)/"; + +/(\D)/; # taint $&, $`, $', $+, $1. +check_taint $&, "\$& from /(\\D)/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /(.)/"; + +/([[:alnum:]])/; # taint $&, $`, $', $+, $1. +check_taint $&, "\$& from /([[:alnum:]])/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /(.)/"; + +/([[:^alnum:]])/; # taint $&, $`, $', $+, $1. +check_taint $&, "\$& from /([[:^alnum:]])/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; + +"a" =~ /(a)|(\w)/; # taint $&, $`, $', $+, $1. +check_taint $&, "\$& from /(a)|(\\w)/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +ok($1 eq 'a', ("\t" x 5) . "\$1 is 'a'"); +ok(! defined $2, ("\t" x 5) . "\$2 is undefined"); +check_taint_not $2, "\t\$2"; +check_taint_not $3, "\t\$3"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /(.)/"; + +"\N{CYRILLIC SMALL LETTER A}" =~ /(\N{CYRILLIC CAPITAL LETTER A})/i; # no tainting because no locale dependence +check_taint_not $&, "\$& from /(\\N{CYRILLIC CAPITAL LETTER A})/i"; +check_taint_not $`, "\t\$`"; +check_taint_not $', "\t\$'"; +check_taint_not $+, "\t\$+"; +check_taint_not $1, "\t\$1"; +ok($1 eq "\N{CYRILLIC SMALL LETTER A}", ("\t" x 4) . "\t\$1 is 'small cyrillic a'"); +check_taint_not $2, "\t\$2"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /./"; + +"(\N{KELVIN SIGN})" =~ /(\N{KELVIN SIGN})/i; # taints because depends on locale +check_taint $&, "\$& from /(\\N{KELVIN SIGN})/i"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /(.)/"; + +"a:" =~ /(.)\b(.)/; # taint $&, $`, $', $+, $1. +check_taint $&, "\$& from /(.)\\b(.)/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint $2, "\t\$2"; +check_taint_not $3, "\t\$3"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /./"; + +"aa" =~ /(.)\B(.)/; # taint $&, $`, $', $+, $1. +check_taint $&, "\$& from /(.)\\B(.)/"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint $2, "\t\$2"; +check_taint_not $3, "\t\$3"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$& from /./"; + +"aaa" =~ /(.).(\1)/i; # notaint because not locale dependent +check_taint_not $&, "\$ & from /(.).(\\1)/"; +check_taint_not $`, "\t\$`"; +check_taint_not $', "\t\$'"; +check_taint_not $+, "\t\$+"; +check_taint_not $1, "\t\$1"; +check_taint_not $2, "\t\$2"; +check_taint_not $3, "\t\$3"; + +/(.)/; # untaint $&, $`, $', $+, $1. +check_taint_not $&, "\$ & from /./"; $_ = $a; # untaint $_ -check_taint_not $_; +check_taint_not $_, 'untainting $_ works'; /(b)/; # this must not taint -check_taint_not $&; -check_taint_not $`; -check_taint_not $'; -check_taint_not $+; -check_taint_not $1; -check_taint_not $2; +check_taint_not $&, "\$ & from /(b)/"; +check_taint_not $`, "\t\$`"; +check_taint_not $', "\t\$'"; +check_taint_not $+, "\t\$+"; +check_taint_not $1, "\t\$1"; +check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ -check_taint_not $_; +check_taint_not $_, 'untainting $_ works'; $b = uc($a); # taint $b s/(.+)/$b/; # this must taint only the $_ -check_taint $_; -check_taint_not $&; -check_taint_not $`; -check_taint_not $'; -check_taint_not $+; -check_taint_not $1; -check_taint_not $2; +check_taint $_, '$_ (wasn\'t tainted) from s/(.+)/$b/ where $b is tainted'; +check_taint_not $&, "\t\$&"; +check_taint_not $`, "\t\$`"; +check_taint_not $', "\t\$'"; +check_taint_not $+, "\t\$+"; +check_taint_not $1, "\t\$1"; +check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ s/(.+)/b/; # this must not taint -check_taint_not $_; -check_taint_not $&; -check_taint_not $`; -check_taint_not $'; -check_taint_not $+; -check_taint_not $1; -check_taint_not $2; +check_taint_not $_, '$_ (wasn\'t tainted) from s/(.+)/b/'; +check_taint_not $&, "\t\$&"; +check_taint_not $`, "\t\$`"; +check_taint_not $', "\t\$'"; +check_taint_not $+, "\t\$+"; +check_taint_not $1, "\t\$1"; +check_taint_not $2, "\t\$2"; $b = $a; # untaint $b ($b = $a) =~ s/\w/$&/; -check_taint $b; # $b should be tainted. -check_taint_not $a; # $a should be not. +check_taint $b, '$b from ($b = $a) =~ s/\w/$&/'; # $b should be tainted. +check_taint_not $a, '$a from ($b = $a) =~ s/\w/$&/'; # $a should be not. $_ = $a; # untaint $_ s/(\w)/\l$1/; # this must taint -check_taint $_; -check_taint $&; -check_taint $`; -check_taint $'; -check_taint $+; -check_taint $1; -check_taint_not $2; +check_taint $_, '$_ (wasn\'t tainted) from s/(\w)/\l$1/,'; # this must taint +check_taint $&, "\t\$&"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ s/(\w)/\L$1/; # this must taint -check_taint $_; -check_taint $&; -check_taint $`; -check_taint $'; -check_taint $+; -check_taint $1; -check_taint_not $2; +check_taint $_, '$_ (wasn\'t tainted) from s/(\w)/\L$1/,'; +check_taint $&, "\t\$&"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ s/(\w)/\u$1/; # this must taint -check_taint $_; -check_taint $&; -check_taint $`; -check_taint $'; -check_taint $+; -check_taint $1; -check_taint_not $2; +check_taint $_, '$_ (wasn\'t tainted) from s/(\w)/\u$1/'; +check_taint $&, "\t\$&"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ s/(\w)/\U$1/; # this must taint -check_taint $_; -check_taint $&; -check_taint $`; -check_taint $'; -check_taint $+; -check_taint $1; -check_taint_not $2; +check_taint $_, '$_ (wasn\'t tainted) from s/(\w)/\U$1/'; +check_taint $&, "\t\$&"; +check_taint $`, "\t\$`"; +check_taint $', "\t\$'"; +check_taint $+, "\t\$+"; +check_taint $1, "\t\$1"; +check_taint_not $2, "\t\$2"; # After all this tainting $a should be cool. -check_taint_not $a; +check_taint_not $a, '$a still not tainted'; + +"a" =~ /([a-z])/; +check_taint_not $1, '"a" =~ /([a-z])/'; +"foo.bar_baz" =~ /^(.*)[._](.*?)$/; # Bug 120675 +check_taint_not $1, '"foo.bar_baz" =~ /^(.*)[._](.*?)$/'; + +# BE SURE TO COPY ANYTHING YOU ADD to the block below { # This is just the previous tests copied here with a different # compile-time pragma. use locale ':not_characters'; # engage restricted locale with different # tainting rules - - check_taint_not $a; - - check_taint_not uc($a); - check_taint_not "\U$a"; - check_taint_not ucfirst($a); - check_taint_not "\u$a"; - check_taint_not lc($a); - check_taint_not fc($a); - check_taint_not "\L$a"; - check_taint_not "\F$a"; - check_taint_not lcfirst($a); - check_taint_not "\l$a"; - - check_taint_not sprintf('%e', 123.456); - check_taint_not sprintf('%f', 123.456); - check_taint_not sprintf('%g', 123.456); - check_taint_not sprintf('%d', 123.456); - check_taint_not sprintf('%x', 123.456); + check_taint_not $a, '$a'; + + check_taint_not uc($a), 'uc($a)'; + check_taint_not "\U$a", '"\U$a"'; + check_taint_not ucfirst($a), 'ucfirst($a)'; + check_taint_not "\u$a", '"\u$a"'; + check_taint_not lc($a), 'lc($a)'; + check_taint_not fc($a), 'fc($a)'; + check_taint_not "\L$a", '"\L$a"'; + check_taint_not "\F$a", '"\F$a"'; + check_taint_not lcfirst($a), 'lcfirst($a)'; + check_taint_not "\l$a", '"\l$a"'; + + check_taint_not sprintf('%e', 123.456), "sprintf('%e', 123.456)"; + check_taint_not sprintf('%f', 123.456), "sprintf('%f', 123.456)"; + check_taint_not sprintf('%g', 123.456), "sprintf('%g', 123.456)"; + check_taint_not sprintf('%d', 123.456), "sprintf('%d', 123.456)"; + check_taint_not sprintf('%x', 123.456), "sprintf('%x', 123.456)"; $_ = $a; # untaint $_ - $_ = uc($a); # taint $_ + $_ = uc($a); - check_taint_not $_; + check_taint_not $_, '$_ = uc($a)'; - /(\w)/; # taint $&, $`, $', $+, $1. - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + /(\w)/; + check_taint_not $&, "\$& from /(\\w)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; /(.)/; # untaint $&, $`, $', $+, $1. - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; - - /(\W)/; # taint $&, $`, $', $+, $1. - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; - - /(\s)/; # taint $&, $`, $', $+, $1. - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; - - /(\S)/; # taint $&, $`, $', $+, $1. - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + check_taint_not $&, "\$& from /(.)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + /(\W)/; + check_taint_not $&, "\$& from /(\\W)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; - $_ = $a; # untaint $_ + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + /(\s)/; + check_taint_not $&, "\$& from /(\\s)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; - check_taint_not $_; + /(\S)/; + check_taint_not $&, "\$& from /(\\S)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; - /(b)/; # this must not taint - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; - $_ = $a; # untaint $_ + "0" =~ /(\d)/; + check_taint_not $&, "\$& from /(\\d)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; - check_taint_not $_; + /(\D)/; + check_taint_not $&, "\$& from /(\\D)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; + + /([[:alnum:]])/; + check_taint_not $&, "\$& from /([[:alnum:]])/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; + + /([[:^alnum:]])/; + check_taint_not $&, "\$& from /([[:^alnum:]])/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + "a" =~ /(a)|(\w)/; + check_taint_not $&, "\$& from /(a)|(\\w)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + ok($1 eq 'a', ("\t" x 5) . "\$1 is 'a'"); + ok(! defined $2, ("\t" x 5) . "\$2 is undefined"); + check_taint_not $2, "\t\$2"; + check_taint_not $3, "\t\$3"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; + + "\N{CYRILLIC SMALL LETTER A}" =~ /(\N{CYRILLIC CAPITAL LETTER A})/i; + check_taint_not $&, "\$& from /(\\N{CYRILLIC CAPITAL LETTER A})/i"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + ok($1 eq "\N{CYRILLIC SMALL LETTER A}", ("\t" x 4) . "\t\$1 is 'small cyrillic a'"); + check_taint_not $2, "\t\$2"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /./"; + + "(\N{KELVIN SIGN})" =~ /(\N{KELVIN SIGN})/i; + check_taint_not $&, "\$& from /(\\N{KELVIN SIGN})/i"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /(.)/"; + + "a:" =~ /(.)\b(.)/; + check_taint_not $&, "\$& from /(.)\\b(.)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + check_taint_not $3, "\t\$3"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /./"; + + "aa" =~ /(.)\B(.)/; + check_taint_not $&, "\$& from /(.)\\B(.)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + check_taint_not $3, "\t\$3"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$& from /./"; + + "aaa" =~ /(.).(\1)/i; # notaint because not locale dependent + check_taint_not $&, "\$ & from /(.).(\\1)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; + check_taint_not $3, "\t\$3"; + + /(.)/; # untaint $&, $`, $', $+, $1. + check_taint_not $&, "\$ & from /./"; + + $_ = $a; # untaint $_ - $b = uc($a); # taint $b - s/(.+)/$b/; # this must taint only the $_ + check_taint_not $_, 'untainting $_ works'; - check_taint_not $_; - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + /(b)/; + check_taint_not $&, "\$ & from /(b)/"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ - s/(.+)/b/; # this must not taint - check_taint_not $_; - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + check_taint_not $_, 'untainting $_ works'; + + s/(.+)/b/; + check_taint_not $_, '$_ (wasn\'t tainted) from s/(.+)/b/'; + check_taint_not $&, "\t\$&"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; $b = $a; # untaint $b ($b = $a) =~ s/\w/$&/; - check_taint_not $b; # $b should be tainted. - check_taint_not $a; # $a should be not. + check_taint_not $b, '$b from ($b = $a) =~ s/\w/$&/'; + check_taint_not $a, '$a from ($b = $a) =~ s/\w/$&/'; $_ = $a; # untaint $_ - s/(\w)/\l$1/; # this must taint - check_taint_not $_; - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + s/(\w)/\l$1/; + check_taint_not $_, '$_ (wasn\'t tainted) from s/(\w)/\l$1/,'; # this must taint + check_taint_not $&, "\t\$&"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ - s/(\w)/\L$1/; # this must taint - check_taint_not $_; - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + s/(\w)/\L$1/; + check_taint_not $_, '$_ (wasn\'t tainted) from s/(\w)/\L$1/,'; + check_taint_not $&, "\t\$&"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ - s/(\w)/\u$1/; # this must taint - check_taint_not $_; - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + s/(\w)/\u$1/; + check_taint_not $_, '$_ (wasn\'t tainted) from s/(\w)/\u$1/'; + check_taint_not $&, "\t\$&"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; $_ = $a; # untaint $_ - s/(\w)/\U$1/; # this must taint - check_taint_not $_; - check_taint_not $&; - check_taint_not $`; - check_taint_not $'; - check_taint_not $+; - check_taint_not $1; - check_taint_not $2; + s/(\w)/\U$1/; + check_taint_not $_, '$_ (wasn\'t tainted) from s/(\w)/\U$1/'; + check_taint_not $&, "\t\$&"; + check_taint_not $`, "\t\$`"; + check_taint_not $', "\t\$'"; + check_taint_not $+, "\t\$+"; + check_taint_not $1, "\t\$1"; + check_taint_not $2, "\t\$2"; # After all this tainting $a should be cool. - check_taint_not $a; + check_taint_not $a, '$a still not tainted'; + + "a" =~ /([a-z])/; + check_taint_not $1, '"a" =~ /([a-z])/'; + "foo.bar_baz" =~ /^(.*)[._](.*?)$/; # Bug 120675 + check_taint_not $1, '"foo.bar_baz" =~ /^(.*)[._](.*?)$/'; + } # Here are in scope of 'use locale' @@ -440,246 +694,174 @@ check_taint_not $a; # Let us do some *real* locale work now, # unless setlocale() is missing (i.e. minitest). -unless ($have_setlocale) { - print "1..$test_num\n"; - exit; -} - # The test number before our first setlocale() my $final_without_setlocale = $test_num; # Find locales. -debug "# Scanning for locales...\n"; - -# Note that it's okay that some languages have their native names -# capitalized here even though that's not "right". They are lowercased -# anyway later during the scanning process (and besides, some clueless -# vendor might have them capitalized erroneously anyway). - -my $locales = < ord $b } @_; + my $output = ""; + my $range_start; + my $start_class; + push @chars, chr(258); # This sentinel simplifies the loop termination + # logic + foreach my $i (0 .. @chars - 1) { + my $char = $chars[$i]; + my $range_end; + my $class; + + # We avoid using [:posix:] classes, as these are being tested in this + # file. Each equivalence class below is for things that can appear in + # a range; those that can't be in a range have class -1. 0 for those + # which should be output in hex; and >0 for the other ranges + if ($char =~ /[A-Z]/) { + $class = 2; + } + elsif ($char =~ /[a-z]/) { + $class = 3; + } + elsif ($char =~ /[0-9]/) { + $class = 4; + } + # Uncomment to get literal punctuation displayed instead of hex + #elsif ($char =~ /[[\]!"#\$\%&\'()*+,.\/:\\;<=>?\@\^_`{|}~-]/) { + # $class = -1; # Punct never appears in a range + #} + else { + $class = 0; # Output in hex + } -# Sanitize the environment so that we can run the external 'locale' -# program without the taint mode getting grumpy. - -# $ENV{PATH} is special in VMS. -delete $ENV{PATH} if $^O ne 'VMS' or $Config{d_setenv}; - -# Other subversive stuff. -delete @ENV{qw(IFS CDPATH ENV BASH_ENV)}; - -if (-x "/usr/bin/locale" && open(LOCALES, "/usr/bin/locale -a 2>/dev/null|")) { - while () { - # It seems that /usr/bin/locale steadfastly outputs 8 bit data, which - # ain't great when we're running this testPERL_UNICODE= so that utf8 - # locales will cause all IO hadles to default to (assume) utf8 - next unless utf8::valid($_); - chomp; - trylocale($_); - } - close(LOCALES); -} elsif ($^O eq 'VMS' && defined($ENV{'SYS$I18N_LOCALE'}) && -d 'SYS$I18N_LOCALE') { -# The SYS$I18N_LOCALE logical name search list was not present on -# VAX VMS V5.5-12, but was on AXP && VAX VMS V6.2 as well as later versions. - opendir(LOCALES, "SYS\$I18N_LOCALE:"); - while ($_ = readdir(LOCALES)) { - chomp; - trylocale($_); - } - close(LOCALES); -} elsif ($^O eq 'openbsd' && -e '/usr/share/locale') { - - # OpenBSD doesn't have a locale executable, so reading /usr/share/locale - # is much easier and faster than the last resort method. - - opendir(LOCALES, '/usr/share/locale'); - while ($_ = readdir(LOCALES)) { - chomp; - trylocale($_); - } - close(LOCALES); -} else { - - # This is going to be slow. - - foreach my $locale (split(/\n/, $locales)) { - my ($locale_name, $language_codes, $country_codes, $encodings) = - split(/:/, $locale); - my @enc = decode_encodings($encodings); - foreach my $loc (split(/ /, $locale_name)) { - trylocale($loc); - foreach my $enc (@enc) { - trylocale("$loc.$enc"); - } - $loc = lc $loc; - foreach my $enc (@enc) { - trylocale("$loc.$enc"); - } - } - foreach my $lang (split(/ /, $language_codes)) { - trylocale($lang); - foreach my $country (split(/ /, $country_codes)) { - my $lc = "${lang}_${country}"; - trylocale($lc); - foreach my $enc (@enc) { - trylocale("$lc.$enc"); - } - my $lC = "${lang}_\U${country}"; - trylocale($lC); - foreach my $enc (@enc) { - trylocale("$lC.$enc"); - } - } - } - } -} + if (! defined $range_start) { + if ($class < 0) { + $output .= " " . $char; + } + else { + $range_start = ord $char; + $start_class = $class; + } + } # A range ends if not consecutive, or the class-type changes + elsif (ord $char != ($range_end = ord($chars[$i-1])) + 1 + || $class != $start_class) + { -setlocale(LC_ALL, "C"); + # Here, the current character is not in the range. This means the + # previous character must have been. Output the range up through + # that one. + my $range_length = $range_end - $range_start + 1; + if ($start_class > 0) { + $output .= " " . chr($range_start); + $output .= "-" . chr($range_end) if $range_length > 1; + } + else { + $output .= sprintf(" %02X", $range_start); + $output .= sprintf("-%02X", $range_end) if $range_length > 1; + } -if ($^O eq 'darwin') { - # Darwin 8/Mac OS X 10.4 and 10.5 have bad Basque locales: perl bug #35895, - # Apple bug ID# 4139653. It also has a problem in Byelorussian. - (my $v) = $Config{osvers} =~ /^(\d+)/; - if ($v >= 8 and $v < 10) { - debug "# Skipping eu_ES, be_BY locales -- buggy in Darwin\n"; - @Locale = grep ! m/^(eu_ES(?:\..*)?|be_BY\.CP1131)$/, @Locale; - } elsif ($v < 12) { - debug "# Skipping be_BY locales -- buggy in Darwin\n"; - @Locale = grep ! m/^be_BY\.CP1131$/, @Locale; + # Handle the new current character, as potentially beginning a new + # range + undef $range_start; + redo; + } } + + $output =~ s/^ //; + return $output; } -@Locale = sort @Locale; +sub disp_str ($) { + my $string = shift; -debug "# Locales =\n"; -for ( @Locale ) { - debug "# $_\n"; -} + # Displays the string unambiguously. ASCII printables are always output + # as-is, though perhaps separated by blanks from other characters. If + # entirely printable ASCII, just returns the string. Otherwise if valid + # UTF-8 it uses the character names for non-printable-ASCII. Otherwise it + # outputs hex for each non-ASCII-printable byte. -my %Problem; -my %Okay; -my %Testing; -my @Added_alpha; # Alphas that aren't in the C locale. -my %test_names; + return $string if $string =~ / ^ [[:print:]]* $/xa; + + my $result = ""; + my $prev_was_punct = 1; # Beginning is considered punct + if (utf8::valid($string) && utf8::is_utf8($string)) { + use charnames (); + foreach my $char (split "", $string) { + + # Keep punctuation adjacent to other characters; otherwise + # separate them with a blank + if ($char =~ /[[:punct:]]/a) { + $result .= $char; + $prev_was_punct = 1; + } + elsif ($char =~ /[[:print:]]/a) { + $result .= " " unless $prev_was_punct; + $result .= $char; + $prev_was_punct = 0; + } + else { + $result .= " " unless $prev_was_punct; + $result .= charnames::viacode(ord $char); + $prev_was_punct = 0; + } + } + } + else { + use bytes; + foreach my $char (split "", $string) { + if ($char =~ /[[:punct:]]/a) { + $result .= $char; + $prev_was_punct = 1; + } + elsif ($char =~ /[[:print:]]/a) { + $result .= " " unless $prev_was_punct; + $result .= $char; + $prev_was_punct = 0; + } + else { + $result .= " " unless $prev_was_punct; + $result .= sprintf("%02X", ord $char); + $prev_was_punct = 0; + } + } + } + + return $result; +} sub report_result { my ($Locale, $i, $pass_fail, $message) = @_; @@ -687,7 +869,7 @@ sub report_result { $message = " ($message)" if $message; unless ($pass_fail) { $Problem{$i}{$Locale} = 1; - debug "# failed $i ($test_names{$i}) with locale '$Locale'$message\n"; + debug "failed $i ($test_names{$i}) with locale '$Locale'$message\n"; } else { push @{$Okay{$i}}, $Locale; } @@ -701,7 +883,7 @@ sub report_multi_result { my $message = ""; if (@$results_ref) { - $message = join " ", "for", map { sprintf '\\x%02X', ord $_ } @$results_ref; + $message = join " ", "for", disp_chars(@$results_ref); } report_result($Locale, $i, @$results_ref == 0, $message); } @@ -710,37 +892,54 @@ my $first_locales_test_number = $final_without_setlocale + 1; my $locales_test_number; my $not_necessarily_a_problem_test_number; my $first_casing_test_number; -my $final_casing_test_number; my %setlocale_failed; # List of locales that setlocale() didn't work on -foreach $Locale (@Locale) { +foreach my $Locale (@Locale) { $locales_test_number = $first_locales_test_number - 1; - debug "# Locale = $Locale\n"; + debug "\n"; + debug "Locale = $Locale\n"; - unless (setlocale(LC_ALL, $Locale)) { + unless (setlocale(&POSIX::LC_ALL, $Locale)) { $setlocale_failed{$Locale} = $Locale; next; } - # We test UTF-8 locales only under ':not_characters'; otherwise they have - # documented deficiencies. Non- UTF-8 locales are tested only under plain - # 'use locale', as otherwise we would have to convert everything in them - # to Unicode. - my $is_utf8_locale = $Locale =~ /UTF-?8/i; + # We test UTF-8 locales only under ':not_characters'; It is easier to + # test them in other test files than here. Non- UTF-8 locales are tested + # only under plain 'use locale', as otherwise we would have to convert + # everything in them to Unicode. my %UPPER = (); # All alpha X for which uc(X) == X and lc(X) != X my %lower = (); # All alpha X for which lc(X) == X and uc(X) != X my %BoThCaSe = (); # All alpha X for which uc(X) == lc(X) == X + my $is_utf8_locale = is_locale_utf8($Locale); + + debug "is utf8 locale? = $is_utf8_locale\n"; + + debug "radix = " . disp_str(localeconv()->{decimal_point}) . "\n"; + if (! $is_utf8_locale) { use locale; - @Alnum_ = sort grep /\w/, map { chr } 0..255; - - debug "# w = ", join("",@Alnum_), "\n"; + @{$posixes{'word'}} = grep /\w/, map { chr } 0..255; + @{$posixes{'digit'}} = grep /\d/, map { chr } 0..255; + @{$posixes{'space'}} = grep /\s/, map { chr } 0..255; + @{$posixes{'alpha'}} = grep /[[:alpha:]]/, map {chr } 0..255; + @{$posixes{'alnum'}} = grep /[[:alnum:]]/, map {chr } 0..255; + @{$posixes{'ascii'}} = grep /[[:ascii:]]/, map {chr } 0..255; + @{$posixes{'blank'}} = grep /[[:blank:]]/, map {chr } 0..255; + @{$posixes{'cntrl'}} = grep /[[:cntrl:]]/, map {chr } 0..255; + @{$posixes{'graph'}} = grep /[[:graph:]]/, map {chr } 0..255; + @{$posixes{'lower'}} = grep /[[:lower:]]/, map {chr } 0..255; + @{$posixes{'print'}} = grep /[[:print:]]/, map {chr } 0..255; + @{$posixes{'punct'}} = grep /[[:punct:]]/, map {chr } 0..255; + @{$posixes{'upper'}} = grep /[[:upper:]]/, map {chr } 0..255; + @{$posixes{'xdigit'}} = grep /[[:xdigit:]]/, map {chr } 0..255; + @{$posixes{'cased'}} = grep /[[:upper:]]/i, map {chr } 0..255; # Sieve the uppercase and the lowercase. - for (@Alnum_) { + for (@{$posixes{'word'}}) { if (/[^\d_]/) { # skip digits and the _ if (uc($_) eq $_) { $UPPER{$_} = $_; @@ -753,9 +952,22 @@ foreach $Locale (@Locale) { } else { use locale ':not_characters'; - @Alnum_ = sort grep /\w/, map { chr } 0..255; - debug "# w = ", join("",@Alnum_), "\n"; - for (@Alnum_) { + @{$posixes{'word'}} = grep /\w/, map { chr } 0..255; + @{$posixes{'digit'}} = grep /\d/, map { chr } 0..255; + @{$posixes{'space'}} = grep /\s/, map { chr } 0..255; + @{$posixes{'alpha'}} = grep /[[:alpha:]]/, map {chr } 0..255; + @{$posixes{'alnum'}} = grep /[[:alnum:]]/, map {chr } 0..255; + @{$posixes{'ascii'}} = grep /[[:ascii:]]/, map {chr } 0..255; + @{$posixes{'blank'}} = grep /[[:blank:]]/, map {chr } 0..255; + @{$posixes{'cntrl'}} = grep /[[:cntrl:]]/, map {chr } 0..255; + @{$posixes{'graph'}} = grep /[[:graph:]]/, map {chr } 0..255; + @{$posixes{'lower'}} = grep /[[:lower:]]/, map {chr } 0..255; + @{$posixes{'print'}} = grep /[[:print:]]/, map {chr } 0..255; + @{$posixes{'punct'}} = grep /[[:punct:]]/, map {chr } 0..255; + @{$posixes{'upper'}} = grep /[[:upper:]]/, map {chr } 0..255; + @{$posixes{'xdigit'}} = grep /[[:xdigit:]]/, map {chr } 0..255; + @{$posixes{'cased'}} = grep /[[:upper:]]/i, map {chr } 0..255; + for (@{$posixes{'word'}}) { if (/[^\d_]/) { # skip digits and the _ if (uc($_) eq $_) { $UPPER{$_} = $_; @@ -766,7 +978,27 @@ foreach $Locale (@Locale) { } } } + + # Ordered, where possible, in groups of "this is a subset of the next + # one" + debug ":upper: = ", disp_chars(@{$posixes{'upper'}}), "\n"; + debug ":lower: = ", disp_chars(@{$posixes{'lower'}}), "\n"; + debug ":cased: = ", disp_chars(@{$posixes{'cased'}}), "\n"; + debug ":alpha: = ", disp_chars(@{$posixes{'alpha'}}), "\n"; + debug ":alnum: = ", disp_chars(@{$posixes{'alnum'}}), "\n"; + debug " w = ", disp_chars(@{$posixes{'word'}}), "\n"; + debug ":graph: = ", disp_chars(@{$posixes{'graph'}}), "\n"; + debug ":print: = ", disp_chars(@{$posixes{'print'}}), "\n"; + debug " d = ", disp_chars(@{$posixes{'digit'}}), "\n"; + debug ":xdigit: = ", disp_chars(@{$posixes{'xdigit'}}), "\n"; + debug ":blank: = ", disp_chars(@{$posixes{'blank'}}), "\n"; + debug " s = ", disp_chars(@{$posixes{'space'}}), "\n"; + debug ":punct: = ", disp_chars(@{$posixes{'punct'}}), "\n"; + debug ":cntrl: = ", disp_chars(@{$posixes{'cntrl'}}), "\n"; + debug ":ascii: = ", disp_chars(@{$posixes{'ascii'}}), "\n"; + foreach (keys %UPPER) { + $BoThCaSe{$_}++ if exists $lower{$_}; } foreach (keys %lower) { @@ -777,13 +1009,24 @@ foreach $Locale (@Locale) { delete $lower{$_}; } - debug "# UPPER = ", join("", sort keys %UPPER ), "\n"; - debug "# lower = ", join("", sort keys %lower ), "\n"; - debug "# BoThCaSe = ", join("", sort keys %BoThCaSe), "\n"; + my %Unassigned; + foreach my $ord ( 0 .. 255 ) { + $Unassigned{chr $ord} = 1; + } + foreach my $class (keys %posixes) { + foreach my $char (@{$posixes{$class}}) { + delete $Unassigned{$char}; + } + } + + debug "UPPER = ", disp_chars(sort { ord $a <=> ord $b } keys %UPPER), "\n"; + debug "lower = ", disp_chars(sort { ord $a <=> ord $b } keys %lower), "\n"; + debug "BoThCaSe = ", disp_chars(sort { ord $a <=> ord $b } keys %BoThCaSe), "\n"; + debug "Unassigned = ", disp_chars(sort { ord $a <=> ord $b } keys %Unassigned), "\n"; my @failures; my @fold_failures; - foreach my $x (sort keys %UPPER) { + foreach my $x (sort { ord $a <=> ord $b } keys %UPPER) { my $ok; my $fold_ok; if ($is_utf8_locale) { @@ -812,7 +1055,7 @@ foreach $Locale (@Locale) { undef @failures; undef @fold_failures; - foreach my $x (sort keys %lower) { + foreach my $x (sort { ord $a <=> ord $b } keys %lower) { my $ok; my $fold_ok; if ($is_utf8_locale) { @@ -848,9 +1091,9 @@ foreach $Locale (@Locale) { } } - @Added_alpha = sort @Added_alpha; + @Added_alpha = sort { ord $a <=> ord $b } @Added_alpha; - debug "# Added_alpha = ", join("",@Added_alpha), "\n"; + debug "Added_alpha = ", disp_chars(@Added_alpha), "\n"; # Cross-check the whole 8-bit character set. @@ -941,6 +1184,21 @@ foreach $Locale (@Locale) { # The rules for the relationships are given in: # http://www.opengroup.org/onlinepubs/009695399/basedefs/xbd_chap07.html + + ++$locales_test_number; + undef @f; + $test_names{$locales_test_number} = 'Verify that [:lower:] contains at least a-z'; + for ('a' .. 'z') { + if ($is_utf8_locale) { + use locale ':not_characters'; + push @f, $_ unless /[[:lower:]]/; + } + else { + push @f, $_ unless /[[:lower:]]/; + } + } + report_multi_result($Locale, $locales_test_number, \@f); + ++$locales_test_number; undef @f; $test_names{$locales_test_number} = 'Verify that [:lower:] is a subset of [:alpha:]'; @@ -957,6 +1215,20 @@ foreach $Locale (@Locale) { ++$locales_test_number; undef @f; + $test_names{$locales_test_number} = 'Verify that [:upper:] contains at least A-Z'; + for ('A' .. 'Z') { + if ($is_utf8_locale) { + use locale ':not_characters'; + push @f, $_ unless /[[:upper:]]/; + } + else { + push @f, $_ unless /[[:upper:]]/; + } + } + report_multi_result($Locale, $locales_test_number, \@f); + + ++$locales_test_number; + undef @f; $test_names{$locales_test_number} = 'Verify that [:upper:] is a subset of [:alpha:]'; for (map { chr } 0..255) { if ($is_utf8_locale) { @@ -999,6 +1271,20 @@ foreach $Locale (@Locale) { ++$locales_test_number; undef @f; + $test_names{$locales_test_number} = 'Verify that [:digit:] contains at least 0-9'; + for ('0' .. '9') { + if ($is_utf8_locale) { + use locale ':not_characters'; + push @f, $_ unless /[[:digit:]]/; + } + else { + push @f, $_ unless /[[:digit:]]/; + } + } + report_multi_result($Locale, $locales_test_number, \@f); + + ++$locales_test_number; + undef @f; $test_names{$locales_test_number} = 'Verify that [:digit:] is a subset of [:alnum:]'; for (map { chr } 0..255) { if ($is_utf8_locale) { @@ -1013,33 +1299,121 @@ foreach $Locale (@Locale) { ++$locales_test_number; undef @f; - $test_names{$locales_test_number} = 'Verify that [:digit:] is a subset of [:xdigit:]'; + $test_names{$locales_test_number} = 'Verify that [:digit:] matches either 10 or 20 code points'; + report_result($Locale, $locales_test_number, @{$posixes{'digit'}} == 10 || @{$posixes{'digit'}} == 20); + + ++$locales_test_number; + undef @f; + $test_names{$locales_test_number} = 'Verify that if there is a second set of digits in [:digit:], they are consecutive'; + if (@{$posixes{'digit'}} == 20) { + my $previous_ord; + for (map { chr } 0..255) { + next unless /[[:digit:]]/; + next if /[0-9]/; + if (defined $previous_ord) { + if ($is_utf8_locale) { + use locale ':not_characters'; + push @f, $_ if ord $_ != $previous_ord + 1; + } + else { + push @f, $_ if ord $_ != $previous_ord + 1; + } + } + $previous_ord = ord $_; + } + } + report_multi_result($Locale, $locales_test_number, \@f); + + ++$locales_test_number; + undef @f; + my @xdigit_digits; # :digit: & :xdigit: + $test_names{$locales_test_number} = 'Verify that [:xdigit:] contains one or two blocks of 10 consecutive [:digit:] chars'; for (map { chr } 0..255) { if ($is_utf8_locale) { use locale ':not_characters'; - push @f, $_ if /[[:digit:]]/ and ! /[[:xdigit:]]/; + # For utf8 locales, we actually use a stricter test: that :digit: + # is a subset of :xdigit:, as we know that only 0-9 should match + push @f, $_ if /[[:digit:]]/ and ! /[[:xdigit:]]/; + } + else { + push @xdigit_digits, $_ if /[[:digit:]]/ and /[[:xdigit:]]/; + } + } + if (! $is_utf8_locale) { + + # For non-utf8 locales, @xdigit_digits is a list of the characters + # that are both :xdigit: and :digit:. Because :digit: is stored in + # increasing code point order (unless the tests above failed), + # @xdigit_digits is as well. There should be exactly 10 or + # 20 of these. + if (@xdigit_digits != 10 && @xdigit_digits != 20) { + @f = @xdigit_digits; } else { - push @f, $_ if /[[:digit:]]/ and ! /[[:xdigit:]]/; + + # Look for contiguity in the series, adding any wrong ones to @f + my @temp = @xdigit_digits; + while (@temp > 1) { + push @f, $temp[1] if ($temp[0] != $temp[1] - 1) + + # Skip this test for the 0th character of + # the second block of 10, as it won't be + # contiguous with the previous block + && (! defined $xdigit_digits[10] + || $temp[1] != $xdigit_digits[10]); + shift @temp; + } } } + report_multi_result($Locale, $locales_test_number, \@f); ++$locales_test_number; undef @f; - $test_names{$locales_test_number} = 'Verify that [:alnum:] is a subset of [:graph:]'; - for (map { chr } 0..255) { + $test_names{$locales_test_number} = 'Verify that [:xdigit:] contains at least A-F, a-f'; + for ('A' .. 'F', 'a' .. 'f') { if ($is_utf8_locale) { use locale ':not_characters'; - push @f, $_ if /[[:alnum:]]/ and ! /[[:graph:]]/; + push @f, $_ unless /[[:xdigit:]]/; } else { - push @f, $_ if /[[:alnum:]]/ and ! /[[:graph:]]/; + push @f, $_ unless /[[:xdigit:]]/; } } report_multi_result($Locale, $locales_test_number, \@f); - # Note that xdigit doesn't have to be a subset of alnum + ++$locales_test_number; + undef @f; + $test_names{$locales_test_number} = 'Verify that any additional members of [:xdigit:], are in groups of 6 consecutive code points'; + my $previous_ord; + my $count = 0; + for my $chr (map { chr } 0..255) { + next unless $chr =~ /[[:xdigit:]]/; + if ($is_utf8_locale) { + next if $chr =~ /[[:digit:]]/; + } + else { + next if grep { $chr eq $_ } @xdigit_digits; + } + next if $chr =~ /[A-Fa-f]/; + if (defined $previous_ord) { + if ($is_utf8_locale) { + use locale ':not_characters'; + push @f, $chr if ord $chr != $previous_ord + 1; + } + else { + push @f, $chr if ord $chr != $previous_ord + 1; + } + } + $count++; + if ($count == 6) { + undef $previous_ord; + } + else { + $previous_ord = ord $chr; + } + } + report_multi_result($Locale, $locales_test_number, \@f); ++$locales_test_number; undef @f; @@ -1055,6 +1429,8 @@ foreach $Locale (@Locale) { } report_multi_result($Locale, $locales_test_number, \@f); + # Note that xdigit doesn't have to be a subset of alnum + ++$locales_test_number; undef @f; $test_names{$locales_test_number} = 'Verify that [:punct:] is a subset of [:graph:]'; @@ -1071,6 +1447,46 @@ foreach $Locale (@Locale) { ++$locales_test_number; undef @f; + $test_names{$locales_test_number} = 'Verify that the space character is not in [:graph:]'; + if ($is_utf8_locale) { + use locale ':not_characters'; + push @f, " " if " " =~ /[[:graph:]]/; + } + else { + push @f, " " if " " =~ /[[:graph:]]/; + } + report_multi_result($Locale, $locales_test_number, \@f); + + ++$locales_test_number; + undef @f; + $test_names{$locales_test_number} = 'Verify that [:space:] contains at least [\f\n\r\t\cK ]'; + for (' ', "\f", "\n", "\r", "\t", "\cK") { + if ($is_utf8_locale) { + use locale ':not_characters'; + push @f, $_ unless /[[:space:]]/; + } + else { + push @f, $_ unless /[[:space:]]/; + } + } + report_multi_result($Locale, $locales_test_number, \@f); + + ++$locales_test_number; + undef @f; + $test_names{$locales_test_number} = 'Verify that [:blank:] contains at least [\t ]'; + for (' ', "\t") { + if ($is_utf8_locale) { + use locale ':not_characters'; + push @f, $_ unless /[[:blank:]]/; + } + else { + push @f, $_ unless /[[:blank:]]/; + } + } + report_multi_result($Locale, $locales_test_number, \@f); + + ++$locales_test_number; + undef @f; $test_names{$locales_test_number} = 'Verify that [:blank:] is a subset of [:space:]'; for (map { chr } 0..255) { if ($is_utf8_locale) { @@ -1099,6 +1515,18 @@ foreach $Locale (@Locale) { ++$locales_test_number; undef @f; + $test_names{$locales_test_number} = 'Verify that the space character is in [:print:]'; + if ($is_utf8_locale) { + use locale ':not_characters'; + push @f, " " if " " !~ /[[:print:]]/; + } + else { + push @f, " " if " " !~ /[[:print:]]/; + } + report_multi_result($Locale, $locales_test_number, \@f); + + ++$locales_test_number; + undef @f; $test_names{$locales_test_number} = 'Verify that isn\'t both [:cntrl:] and [:print:]'; for (map { chr } 0..255) { if ($is_utf8_locale) { @@ -1113,6 +1541,20 @@ foreach $Locale (@Locale) { ++$locales_test_number; undef @f; + $test_names{$locales_test_number} = 'Verify that isn\'t both [:alpha:] and [:digit:]'; + for (map { chr } 0..255) { + if ($is_utf8_locale) { + use locale ':not_characters'; + push @f, $_ if /[[:alpha:]]/ and /[[:digit:]]/; + } + else { + push @f, $_ if /[[:alpha:]]/ and /[[:digit:]]/; + } + } + report_multi_result($Locale, $locales_test_number, \@f); + + ++$locales_test_number; + undef @f; $test_names{$locales_test_number} = 'Verify that isn\'t both [:alnum:] and [:punct:]'; for (map { chr } 0..255) { if ($is_utf8_locale) { @@ -1153,7 +1595,10 @@ foreach $Locale (@Locale) { } report_multi_result($Locale, $locales_test_number, \@f); - $final_casing_test_number = $locales_test_number; + foreach ($first_casing_test_number..$locales_test_number) { + $problematical_tests{$_} = 1; + } + # Test for read-only scalars' locale vs non-locale comparisons. @@ -1182,14 +1627,14 @@ foreach $Locale (@Locale) { $not_necessarily_a_problem_test_number = $locales_test_number; for (0..9) { # Select a slice. - $from = int(($_*@Alnum_)/10); - $to = $from + int(@Alnum_/10); - $to = $#Alnum_ if ($to > $#Alnum_); - $lesser = join('', @Alnum_[$from..$to]); + $from = int(($_*@{$posixes{'word'}})/10); + $to = $from + int(@{$posixes{'word'}}/10); + $to = $#{$posixes{'word'}} if ($to > $#{$posixes{'word'}}); + $lesser = join('', @{$posixes{'word'}}[$from..$to]); # Select a slice one character on. $from++; $to++; - $to = $#Alnum_ if ($to > $#Alnum_); - $greater = join('', @Alnum_[$from..$to]); + $to = $#{$posixes{'word'}} if ($to > $#{$posixes{'word'}}); + $greater = join('', @{$posixes{'word'}}[$from..$to]); if ($is_utf8_locale) { use locale ':not_characters'; ($yes, $no, $sign) = ($lesser lt $greater @@ -1233,20 +1678,20 @@ foreach $Locale (@Locale) { } report_result($Locale, $locales_test_number, $test == 0); if ($test) { - debug "# lesser = '$lesser'\n"; - debug "# greater = '$greater'\n"; - debug "# lesser cmp greater = ", + debug "lesser = '$lesser'\n"; + debug "greater = '$greater'\n"; + debug "lesser cmp greater = ", $lesser cmp $greater, "\n"; - debug "# greater cmp lesser = ", + debug "greater cmp lesser = ", $greater cmp $lesser, "\n"; - debug "# (greater) from = $from, to = $to\n"; + debug "(greater) from = $from, to = $to\n"; for my $ti (@test) { debugf("# %-40s %-4s", $ti, $test{$ti} ? 'FAIL' : 'ok'); if ($ti =~ /\(\.*(\$.+ +cmp +\$[^\)]+)\.*\)/) { debugf("(%s == %4d)", $1, eval $1); } - debug "\n#"; + debugf("\n#"); } last; @@ -1268,14 +1713,23 @@ foreach $Locale (@Locale) { my $ok12; my $ok13; my $ok14; + my $ok14_5; my $ok15; my $ok16; + my $ok17; + my $ok18; + my $ok19; + my $ok20; + my $ok21; my $c; my $d; my $e; my $f; my $g; + my $h; + my $i; + my $j; if (! $is_utf8_locale) { use locale; @@ -1319,14 +1773,22 @@ foreach $Locale (@Locale) { $f = "1.23"; $g = 2.34; + $h = 1.5; + $i = 1.25; + $j = "$h:$i"; $ok9 = $f == 1.23; $ok10 = $f == $x; $ok11 = $f == $c; $ok12 = abs(($f + $g) - 3.57) < 0.01; $ok13 = $w == 0; - $ok14 = $ok15 = $ok16 = 1; # Skip for non-utf8 locales + $ok14 = $ok14_5 = $ok15 = $ok16 = 1; # Skip for non-utf8 locales + } + { + no locale; + $ok17 = "1.5:1.25" eq sprintf("%g:%g", $h, $i); } + $ok18 = $j eq sprintf("%g:%g", $h, $i); } else { use locale ':not_characters'; @@ -1363,6 +1825,9 @@ foreach $Locale (@Locale) { $f = "1.23"; $g = 2.34; + $h = 1.5; + $i = 1.25; + $j = "$h:$i"; $ok9 = $f == 1.23; $ok10 = $f == $x; @@ -1372,21 +1837,20 @@ foreach $Locale (@Locale) { # Look for non-ASCII error messages, and verify that the first # such is in UTF-8 (the others almost certainly will be like the - # first). + # first). This is only done if the current locale has LC_MESSAGES $ok14 = 1; - foreach my $err (keys %!) { - use Errno; - $! = eval "&Errno::$err"; # Convert to strerror() output - my $strerror = "$!"; - if ("$strerror" =~ /\P{ASCII}/) { - my $utf8_strerror = $strerror; - utf8::upgrade($utf8_strerror); - - # If $! was already in UTF-8, the upgrade was a no-op; - # otherwise they will be different byte strings. - use bytes; - $ok14 = $utf8_strerror eq $strerror; - last; + $ok14_5 = 1; + if (setlocale(&POSIX::LC_MESSAGES, $Locale)) { + foreach my $err (keys %!) { + use Errno; + $! = eval "&Errno::$err"; # Convert to strerror() output + my $strerror = "$!"; + if ("$strerror" =~ /\P{ASCII}/) { + $ok14 = utf8::is_utf8($strerror); + no locale; + $ok14_5 = "$!" !~ /\P{ASCII}/; + last; + } } } @@ -1395,15 +1859,43 @@ foreach $Locale (@Locale) { # stringification. my $string_g = "$g"; + my $sprintf_g = sprintf("%g", $g); + + $ok15 = $string_g =~ / ^ \p{ASCII}+ $ /x || utf8::is_utf8($string_g); + $ok16 = $sprintf_g eq $string_g; + } + { + no locale; + $ok17 = "1.5:1.25" eq sprintf("%g:%g", $h, $i); + } + $ok18 = $j eq sprintf("%g:%g", $h, $i); + } - my $utf8_string_g = "$g"; - utf8::upgrade($utf8_string_g); + $ok19 = $ok20 = 1; + if (setlocale(&POSIX::LC_TIME, $Locale)) { # These tests aren't affected by + # :not_characters + my @times = CORE::localtime(); - my $utf8_sprintf_g = sprintf("%g", $g); - utf8::upgrade($utf8_sprintf_g); - use bytes; - $ok15 = $utf8_string_g eq $string_g; - $ok16 = $utf8_sprintf_g eq $string_g; + use locale; + $ok19 = POSIX::strftime("%p", @times) ne "%p"; # [perl #119425] + my $date = POSIX::strftime("'%A' '%B' '%Z' '%p'", @times); + debug("'Day' 'Month' 'TZ' 'am/pm' = ", disp_str($date)); + + # If there is any non-ascii, it better be UTF-8 in a UTF-8 locale, and + # not UTF-8 if the locale isn't UTF-8. + $ok20 = $date =~ / ^ \p{ASCII}+ $ /x + || $is_utf8_locale == utf8::is_utf8($date); + } + + $ok21 = 1; + foreach my $err (keys %!) { + no locale; + use Errno; + $! = eval "&Errno::$err"; # Convert to strerror() output + my $strerror = "$!"; + if ("$strerror" =~ /\P{ASCII}/) { + $ok21 = 0; + last; } } @@ -1411,7 +1903,7 @@ foreach $Locale (@Locale) { $test_names{$locales_test_number} = 'Verify that an intervening printf doesn\'t change assignment results'; my $first_a_test = $locales_test_number; - debug "# $first_a_test..$locales_test_number: \$a = $a, \$b = $b, Locale = $Locale\n"; + debug "$first_a_test..$locales_test_number: \$a = $a, \$b = $b, Locale = $Locale\n"; report_result($Locale, ++$locales_test_number, $ok2); $test_names{$locales_test_number} = 'Verify that an intervening sprintf doesn\'t change assignment results'; @@ -1420,14 +1912,17 @@ foreach $Locale (@Locale) { report_result($Locale, ++$locales_test_number, $ok3); $test_names{$locales_test_number} = 'Verify that a different locale radix works when doing "==" with a constant'; + $problematical_tests{$locales_test_number} = 1; report_result($Locale, ++$locales_test_number, $ok4); $test_names{$locales_test_number} = 'Verify that a different locale radix works when doing "==" with a scalar'; + $problematical_tests{$locales_test_number} = 1; report_result($Locale, ++$locales_test_number, $ok5); $test_names{$locales_test_number} = 'Verify that a different locale radix works when doing "==" with a scalar and an intervening sprintf'; + $problematical_tests{$locales_test_number} = 1; - debug "# $first_c_test..$locales_test_number: \$c = $c, \$d = $d, Locale = $Locale\n"; + debug "$first_c_test..$locales_test_number: \$c = $c, \$d = $d, Locale = $Locale\n"; report_result($Locale, ++$locales_test_number, $ok6); $test_names{$locales_test_number} = 'Verify that can assign stringified under inner no-locale block'; @@ -1438,35 +1933,61 @@ foreach $Locale (@Locale) { report_result($Locale, ++$locales_test_number, $ok8); $test_names{$locales_test_number} = 'Verify that "==" with a scalar and an intervening sprintf still works in inner no locale'; + $problematical_tests{$locales_test_number} = 1; - debug "# $first_e_test..$locales_test_number: \$e = $e, no locale\n"; + debug "$first_e_test..$locales_test_number: \$e = $e, no locale\n"; report_result($Locale, ++$locales_test_number, $ok9); $test_names{$locales_test_number} = 'Verify that after a no-locale block, a different locale radix still works when doing "==" with a constant'; + $problematical_tests{$locales_test_number} = 1; my $first_f_test = $locales_test_number; report_result($Locale, ++$locales_test_number, $ok10); $test_names{$locales_test_number} = 'Verify that after a no-locale block, a different locale radix still works when doing "==" with a scalar'; + $problematical_tests{$locales_test_number} = 1; report_result($Locale, ++$locales_test_number, $ok11); $test_names{$locales_test_number} = 'Verify that after a no-locale block, a different locale radix still works when doing "==" with a scalar and an intervening sprintf'; + $problematical_tests{$locales_test_number} = 1; report_result($Locale, ++$locales_test_number, $ok12); $test_names{$locales_test_number} = 'Verify that after a no-locale block, a different locale radix can participate in an addition and function call as numeric'; + $problematical_tests{$locales_test_number} = 1; report_result($Locale, ++$locales_test_number, $ok13); $test_names{$locales_test_number} = 'Verify that don\'t get warning under "==" even if radix is not a dot'; + $problematical_tests{$locales_test_number} = 1; report_result($Locale, ++$locales_test_number, $ok14); $test_names{$locales_test_number} = 'Verify that non-ASCII UTF-8 error messages are in UTF-8'; + report_result($Locale, ++$locales_test_number, $ok14_5); + $test_names{$locales_test_number} = '... and are ASCII outside "use locale"'; + report_result($Locale, ++$locales_test_number, $ok15); $test_names{$locales_test_number} = 'Verify that a number with a UTF-8 radix has a UTF-8 stringification'; report_result($Locale, ++$locales_test_number, $ok16); $test_names{$locales_test_number} = 'Verify that a sprintf of a number with a UTF-8 radix yields UTF-8'; - debug "# $first_f_test..$locales_test_number: \$f = $f, \$g = $g, back to locale = $Locale\n"; + report_result($Locale, ++$locales_test_number, $ok17); + $test_names{$locales_test_number} = 'Verify that a sprintf of a number outside locale scope uses a dot radix'; + + report_result($Locale, ++$locales_test_number, $ok18); + $test_names{$locales_test_number} = 'Verify that a sprintf of a number back within locale scope uses locale radix'; + + report_result($Locale, ++$locales_test_number, $ok19); + $test_names{$locales_test_number} = 'Verify that strftime doesn\'t return "%p" in locales where %p is empty'; + + report_result($Locale, ++$locales_test_number, $ok20); + $test_names{$locales_test_number} = 'Verify that strftime returns date with UTF-8 flag appropriately set'; + $problematical_tests{$locales_test_number} = 1; # This is broken in + # OS X 10.9.3 + + report_result($Locale, ++$locales_test_number, $ok21); + $test_names{$locales_test_number} = '"$!" is ASCII only outside of locale scope'; + + debug "$first_f_test..$locales_test_number: \$f = $f, \$g = $g, back to locale = $Locale\n"; # Does taking lc separately differ from taking # the lc "in-line"? (This was the bug 19990704.002, change #3568.) @@ -1526,13 +2047,19 @@ foreach $Locale (@Locale) { my @f = (); ++$locales_test_number; $test_names{$locales_test_number} = 'Verify case insensitive matching works'; - foreach my $x (sort keys %UPPER) { + foreach my $x (sort { ord $a <=> ord $b } keys %UPPER) { if (! $is_utf8_locale) { my $y = lc $x; next unless uc $y eq $x; - print "# UPPER $x lc $y ", - $x =~ /$y/i ? 1 : 0, " ", - $y =~ /$x/i ? 1 : 0, "\n" if 0; + debug_more( "UPPER=", disp_chars(($x)), + "; lc=", disp_chars(($y)), "; ", + "; fc=", disp_chars((fc $x)), "; ", + disp_chars(($x)), "=~/", disp_chars(($y)), "/i=", + $x =~ /$y/i ? 1 : 0, + "; ", + disp_chars(($y)), "=~/", disp_chars(($x)), "/i=", + $y =~ /$x/i ? 1 : 0, + "\n"); # # If $x and $y contain regular expression characters # AND THEY lowercase (/i) to regular expression characters, @@ -1561,9 +2088,7 @@ foreach $Locale (@Locale) { print "# Regex characters in '$x' or '$y', skipping test $locales_test_number for locale '$Locale'\n"; next; } - # With utf8 both will fail since the locale concept - # of upper/lower does not work well in Unicode. - push @f, $x unless $x =~ /$y/i == $y =~ /$x/i; + push @f, $x unless $x =~ /$y/i && $y =~ /$x/i; # fc is not a locale concept, so Perl uses lc for it. push @f, $x unless lc $x eq fc $x; @@ -1572,12 +2097,16 @@ foreach $Locale (@Locale) { use locale ':not_characters'; my $y = lc $x; next unless uc $y eq $x; - print "# UPPER $x lc $y ", - $x =~ /$y/i ? 1 : 0, " ", - $y =~ /$x/i ? 1 : 0, "\n" if 0; + debug_more( "UPPER=", disp_chars(($x)), + "; lc=", disp_chars(($y)), "; ", + "; fc=", disp_chars((fc $x)), "; ", + disp_chars(($x)), "=~/", disp_chars(($y)), "/i=", + $x =~ /$y/i ? 1 : 0, + "; ", + disp_chars(($y)), "=~/", disp_chars(($x)), "/i=", + $y =~ /$x/i ? 1 : 0, + "\n"); - # Here, we can fully test things, unlike plain 'use locale', - # because this form does work well with Unicode push @f, $x unless $x =~ /$y/i && $y =~ /$x/i; # The places where Unicode's lc is different from fc are @@ -1586,20 +2115,24 @@ foreach $Locale (@Locale) { } } - foreach my $x (sort keys %lower) { + foreach my $x (sort { ord $a <=> ord $b } keys %lower) { if (! $is_utf8_locale) { my $y = uc $x; next unless lc $y eq $x; - print "# lower $x uc $y ", - $x =~ /$y/i ? 1 : 0, " ", - $y =~ /$x/i ? 1 : 0, "\n" if 0; + debug_more( "lower=", disp_chars(($x)), + "; uc=", disp_chars(($y)), "; ", + "; fc=", disp_chars((fc $x)), "; ", + disp_chars(($x)), "=~/", disp_chars(($y)), "/i=", + $x =~ /$y/i ? 1 : 0, + "; ", + disp_chars(($y)), "=~/", disp_chars(($x)), "/i=", + $y =~ /$x/i ? 1 : 0, + "\n"); if ($x =~ $re || $y =~ $re) { # See above. print "# Regex characters in '$x' or '$y', skipping test $locales_test_number for locale '$Locale'\n"; next; } - # With utf8 both will fail since the locale concept - # of upper/lower does not work well in Unicode. - push @f, $x unless $x =~ /$y/i == $y =~ /$x/i; + push @f, $x unless $x =~ /$y/i && $y =~ /$x/i; push @f, $x unless lc $x eq fc $x; } @@ -1607,15 +2140,22 @@ foreach $Locale (@Locale) { use locale ':not_characters'; my $y = uc $x; next unless lc $y eq $x; - print "# lower $x uc $y ", - $x =~ /$y/i ? 1 : 0, " ", - $y =~ /$x/i ? 1 : 0, "\n" if 0; + debug_more( "lower=", disp_chars(($x)), + "; uc=", disp_chars(($y)), "; ", + "; fc=", disp_chars((fc $x)), "; ", + disp_chars(($x)), "=~/", disp_chars(($y)), "/i=", + $x =~ /$y/i ? 1 : 0, + "; ", + disp_chars(($y)), "=~/", disp_chars(($x)), "/i=", + $y =~ /$x/i ? 1 : 0, + "\n"); push @f, $x unless $x =~ /$y/i && $y =~ /$x/i; push @f, $x unless lc $x eq fc $x; } } report_multi_result($Locale, $locales_test_number, \@f); + $problematical_tests{$locales_test_number} = 1; } # [perl #109318] @@ -1623,6 +2163,7 @@ foreach $Locale (@Locale) { my @f = (); ++$locales_test_number; $test_names{$locales_test_number} = 'Verify atof with locale radix and negative exponent'; + $problematical_tests{$locales_test_number} = 1; my $radix = POSIX::localeconv()->{decimal_point}; my @nums = ( @@ -1656,42 +2197,58 @@ my $final_locales_test_number = $locales_test_number; # Recount the errors. -foreach ($first_locales_test_number..$final_locales_test_number) { +foreach $test_num ($first_locales_test_number..$final_locales_test_number) { if (%setlocale_failed) { print "not "; } - elsif ($Problem{$_} || !defined $Okay{$_} || !@{$Okay{$_}}) { + elsif ($Problem{$test_num} || !defined $Okay{$test_num} || !@{$Okay{$test_num}}) { if (defined $not_necessarily_a_problem_test_number - && $_ == $not_necessarily_a_problem_test_number) + && $test_num == $not_necessarily_a_problem_test_number) { print "# The failure of test $not_necessarily_a_problem_test_number is not necessarily fatal.\n"; print "# It usually indicates a problem in the environment,\n"; print "# not in Perl itself.\n"; } - if ($Okay{$_} && ($_ >= $first_casing_test_number - && $_ <= $final_casing_test_number)) - { + if ($Okay{$test_num} && grep { $_ == $test_num } keys %problematical_tests) { + no warnings 'experimental::autoderef'; # Round to nearest .1% - my $percent_fail = (int(.5 + (1000 * scalar(keys $Problem{$_}) + my $percent_fail = (int(.5 + (1000 * scalar(keys $Problem{$test_num}) / scalar(@Locale)))) / 10; - if ($percent_fail < $acceptable_fold_failure_percentage) { - $test_names{$_} .= 'TODO'; - print "# ", 100 - $percent_fail, "% of locales pass the following test, so it is likely that the failures\n"; - print "# are errors in the locale definitions. The test is marked TODO, as the\n"; - print "# problem is not likely to be Perl's\n"; + if ($percent_fail < $acceptable_failure_percentage) { + if (! $debug) { + $test_names{$test_num} .= 'TODO'; + print "# ", 100 - $percent_fail, "% of locales pass the following test, so it is likely that the failures\n"; + print "# are errors in the locale definitions. The test is marked TODO, as the\n"; + print "# problem is not likely to be Perl's\n"; + } + } + if ($debug) { + print "# $percent_fail% of locales (", + scalar(keys $Problem{$test_num}), + " of ", + scalar(@Locale), + ") fail the above test (TODO cut-off is ", + $acceptable_failure_percentage, + "%)\n"; } } - unless ($debug) { - print "#\nFor more details, rerun, with environment variable PERL_DEBUG_FULL_TEST=1\n"; + print "#\n"; + if ($debug) { + print "# The code points that had this failure are given above. Look for lines\n"; + print "# that match 'failed $test_num'\n"; + } + else { + print "# For more details, rerun, with environment variable PERL_DEBUG_FULL_TEST=1.\n"; + print "# Then look at that output for lines that match 'failed $test_num'\n"; } print "not "; } - print "ok $_"; - if (defined $test_names{$_}) { + print "ok $test_num"; + if (defined $test_names{$test_num}) { # If TODO is in the test name, make it thus - my $todo = $test_names{$_} =~ s/TODO\s*//; - print " $test_names{$_}"; + my $todo = $test_names{$test_num} =~ s/TODO\s*//; + print " $test_names{$test_num}"; print " # TODO" if $todo; } print "\n"; @@ -1699,7 +2256,8 @@ foreach ($first_locales_test_number..$final_locales_test_number) { $test_num = $final_locales_test_number; -{ # perl #115808 +unless ( $^O =~ m!^(dragonfly|openbsd|bitrig|mirbsd)$! ) { + # perl #115808 use warnings; my $warned = 0; local $SIG{__WARN__} = sub { @@ -1714,7 +2272,7 @@ $test_num = $final_locales_test_number; # the time these were added above this in this file. # This also tests that locale overrides unicode_strings in the same scope for # non-utf8 strings. -setlocale(LC_ALL, "C"); +setlocale(&POSIX::LC_ALL, "C"); { use locale; use feature 'unicode_strings'; @@ -1728,17 +2286,19 @@ setlocale(LC_ALL, "C"); my $above_latin1_case_change_delta; # Same for the specific ords > 255 # that we use - # We test an ASCII character, which should change case and be tainted; + # We test an ASCII character, which should change case; # a Latin1 character, which shouldn't change case under this C locale, - # and is tainted. # an above-Latin1 character that when the case is changed would cross - # the 255/256 boundary, so doesn't change case and isn't tainted - # (the \x{149} is one of these, but changes into 2 characters, the + # the 255/256 boundary, so doesn't change case + # (the \x{149} is one of these, but changes into 2 characters, the # first one of which doesn't cross the boundary. # the final one in each list is an above-Latin1 character whose case - # does change, and shouldn't be tainted. The code below uses its - # position in its list as a marker to indicate that it, unlike the - # other code points above ASCII, has a successful case change + # does change. The code below uses its position in its list as a + # marker to indicate that it, unlike the other code points above + # ASCII, has a successful case change + # + # All casing operations under locale (but not :not_characters) should + # taint if ($function =~ /^u/) { @list = ("", "a", "\xe0", "\xff", "\x{fb00}", "\x{149}", "\x{101}"); $ascii_case_change_delta = -32; @@ -1812,9 +2372,9 @@ setlocale(LC_ALL, "C"); : "; not encoded in utf8)") . " should be \"$should_be\", got \"$changed\""); - # Tainting shouldn't happen for utf8 locales, empty - # strings, or those characters above 255. - (! $is_utf8_locale && length($char) > 0 && ord($char) < 256) + # Tainting shouldn't happen for use locale :not_character + # (a utf8 locale) + (! $is_utf8_locale) ? check_taint($changed) : check_taint_not($changed); @@ -1881,24 +2441,33 @@ if ($didwarn) { my $s = join(" ", @s); $s =~ s/(.{50,60}) /$1\n#\t/g; - warn + print "# The following locales\n#\n", "#\t", $s, "\n#\n", "# tested okay.\n#\n", } else { - warn "# None of your locales were fully okay.\n"; + print "# None of your locales were fully okay.\n"; } if (@F) { my $F = join(" ", @F); $F =~ s/(.{50,60}) /$1\n#\t/g; - warn + my $details = ""; + unless ($debug) { + $details = "# For more details, rerun, with environment variable PERL_DEBUG_FULL_TEST=1.\n"; + } + elsif ($debug == 1) { + $details = "# For even more details, rerun, with environment variable PERL_DEBUG_FULL_TEST=2.\n"; + } + + print "# The following locales\n#\n", "#\t", $F, "\n#\n", "# had problems.\n#\n", + $details; } else { - warn "# None of your locales were broken.\n"; + print "# None of your locales were broken.\n"; } }