t/uni/fold.t

   1 use strict;
   2 use warnings;
   3
   4 # re/fold_grind.t has more complex tests, but doesn't test every fold
   5 # This file also tests the fc() keyword.
   6
   7 BEGIN {
   8     chdir 't' if -d 't';
   9     @INC = '../lib';
  10     require Config; import Config;
  11     require './test.pl';
  12     require './loc_tools.pl';   # Contains find_utf8_ctype_locale()
  13 }
  14
  15 use feature 'unicode_strings';
  16 use Unicode::UCD qw(all_casefolds);
  17
  18 binmode *STDOUT, ":utf8";
  19
  20 our $TODO;
  21
  22
  23 plan("no_plan");
  24 # Read in the official case folding definitions.
  25 my $casefolds = all_casefolds();
  26 my @folds;
  27 my @CF;
  28 my @simple_folds;
  29 my %reverse_fold;
  30 use Unicode::UCD;
  31 use charnames();
  32
  33 foreach my $decimal_code_point (sort { $a <=> $b } keys %$casefolds) {
  34     # We only use simple folds in fc(), since the regex engine uses full case
  35     # folding.
  36
  37     my $name = charnames::viacode($decimal_code_point);
  38     my $type = $casefolds->{$decimal_code_point}{'status'};
  39     my $code = $casefolds->{$decimal_code_point}{'code'};
  40     my $simple = $casefolds->{$decimal_code_point}{'simple'};
  41     my $full = $casefolds->{$decimal_code_point}{'full'};
  42
  43     if ($simple && $simple ne $full) { # If there is a distinction
  44         push @simple_folds, [ $code, $simple, $type, $name ];
  45     }
  46
  47     push @CF, [ $code, $full, $type, $name ];
  48
  49     # Get the inverse fold for single-char mappings.
  50     $reverse_fold{pack "U0U*", hex $simple} = pack "U0U*", $decimal_code_point if $simple;
  51 }
  52
  53 foreach my $test_ref ( @simple_folds ) {
  54     use feature 'fc';
  55     my ($code, $mapping, $type, $name) = @$test_ref;
  56     my $c = pack("U0U*", hex $code);
  57     my $f = pack("U0U*", map { hex } split " ", $mapping);
  58
  59     my $against = join "", "qq{", map("\\x{$_}", split " ", $mapping), "}";
  60     {
  61         isnt(fc($c), $f, "$code - $name - $mapping - $type - Full casefolding, fc(\\x{$code}) ne $against");
  62         isnt("\F$c", $f, "$code - $name - $mapping - $type - Full casefolding, qq{\\F\\x{$code}} ne $against");
  63     }
  64 }
  65
  66 foreach my $test_ref (@CF) {
  67     my ($code, $mapping, $type, $name) = @$test_ref;
  68     my $c = pack("U0U*", hex $code);
  69     my $f = pack("U0U*", map { hex } split " ", $mapping);
  70     my $f_length = length $f;
  71     foreach my $test (
  72             qq[":$c:" =~ /:$c:/],
  73             qq[":$c:" =~ /:$c:/i],
  74             qq[":$c:" =~ /:[_$c]:/], # Place two chars in [] so doesn't get
  75                                      # optimized to a non-charclass
  76             qq[":$c:" =~ /:[_$c]:/i],
  77             qq[":$c:" =~ /:$f:/i],
  78             qq[":$f:" =~ /:$c:/i],
  79     ) {
  80         ok eval $test, "$code - $name - $mapping - $type - $test";
  81     }
  82
  83     {
  84         # fc() tests
  85         my $against = join "", "qq{", map("\\x{$_}", split " ", $mapping), "}";
  86         is(CORE::fc($c), $f,
  87             "$code - $name - $mapping - $type - fc(\\x{$code}) eq $against");
  88         is("\F$c", $f, "$code - $name - $mapping - $type - qq{\\F\\x{$code}} eq $against");
  89
  90         # And here we test bytes. For [A-Za-z0-9], the fold is the same as lc under
  91         # bytes. For everything else, it's the bytes that formed the original string.
  92         if ( $c =~ /[A-Za-z0-9]/ ) {
  93             use bytes;
  94             is(CORE::fc($c), lc($c), "$code - $name - fc and use bytes, ascii");
  95         } else {
  96             my $copy = "" . $c;
  97             utf8::encode($copy);
  98             is($copy, do { use bytes; CORE::fc($c) }, "$code - $name - fc and use bytes");
  99         }
 100     }
 101     # Certain tests weren't convenient to put in the list above since they are
 102     # TODO's in multi-character folds.
 103     if ($f_length == 1) {
 104
 105         # The qq loses the utf8ness of ":$f:".  These tests are not about
 106         # finding bugs in utf8ness, so make sure it's utf8.
 107         my $test = qq[my \$s = ":$f:"; utf8::upgrade(\$s); \$s =~ /:[_$c]:/i];
 108         ok eval $test, "$code - $name - $mapping - $type - $test";
 109         $test = qq[":$c:" =~ /:[_$f]:/i];
 110         ok eval $test, "$code - $name - $mapping - $type - $test";
 111     }
 112     else {
 113
 114         # There are two classes of multi-char folds that need more work.  For
 115         # example,
 116         #   ":ß:" =~ /:[_s]{2}:/i
 117         #   ":ss:" =~ /:[_ß]:/i
 118         #
 119         # Some of the old tests for the second case happened to pass somewhat
 120         # coincidentally.  But none would pass if changed to this.
 121         #   ":SS:" =~ /:[_ß]:/i
 122         #
 123         # As the capital SS doesn't get folded.  When those pass, it means
 124         # that the code has been changed to take into account folding in the
 125         # string, and all should pass, capitalized or not (this wouldn't be
 126         # true for [^complemented character classes], for which the fold case
 127         # is better, but these aren't used in this .t currently.  So, what is
 128         # done is to essentially upper-case the string for this class (but use
 129         # the reverse fold not uc(), as that is more correct)
 130         my $u;
 131         for my $i (0 .. $f_length - 1) {
 132             my $cur_char = substr($f, $i, 1);
 133             $u .= $reverse_fold{$cur_char} || $cur_char;
 134         }
 135         my $test;
 136
 137         # A multi-char fold should not match just one char;
 138         # e.g., ":ß:" !~ /:[_s]:/i
 139         $test = qq[":$c:" !~ /:[_$f]:/i];
 140         ok eval $test, "$code - $name - $mapping - $type - $test";
 141
 142         TODO: { # e.g., ":ß:" =~ /:[_s]{2}:/i
 143             local $TODO = 'Multi-char fold in [character class]';
 144
 145             $test = qq[":$c:" =~ /:[_$f]{$f_length}:/i];
 146             ok eval $test, "$code - $name - $mapping - $type - $test";
 147         }
 148
 149         # e.g., ":SS:" =~ /:[_ß]:/i now pass, so TODO has been removed, but
 150         # since they use '$u', they are left out of the main loop
 151         $test = qq[ my \$s = ":$u:"; utf8::upgrade(\$s); \$s =~ /:[_$c]:/i];
 152         ok eval $test, "$code - $name - $mapping - $type - $test";
 153     }
 154 }
 155
 156 {
 157     use utf8;
 158     use feature qw(fc);
 159     # These three come from the ICU project's test suite, more especifically
 160     # http://icu.sourcearchive.com/documentation/4.4~rc1-1/strcase_8cpp-source.html
 161
 162     my $s = "A\N{U+00df}\N{U+00b5}\N{U+fb03}\N{U+1040C}\N{U+0130}\N{U+0131}";
 163     #\N{LATIN CAPITAL LETTER A}\N{LATIN SMALL LETTER SHARP S}\N{MICRO SIGN}\N{LATIN SMALL LIGATURE FFI}\N{DESERET CAPITAL LETTER AY}\N{LATIN CAPITAL LETTER I WITH DOT ABOVE}\N{LATIN SMALL LETTER DOTLESS I}
 164
 165     my $f = "ass\N{U+03bc}ffi\N{U+10434}i\N{U+0307}\N{U+0131}";
 166     #\N{LATIN SMALL LETTER A}\N{LATIN SMALL LETTER S}\N{LATIN SMALL LETTER S}\N{GREEK SMALL LETTER MU}\N{LATIN SMALL LETTER F}\N{LATIN SMALL LETTER F}\N{LATIN SMALL LETTER I}\N{DESERET SMALL LETTER AY}\N{LATIN SMALL LETTER I}\N{COMBINING DOT ABOVE}\N{LATIN SMALL LETTER DOTLESS I}
 167
 168     is(fc($s), $f, "ICU's casefold test passes");
 169     is("\F$s", $f, "ICU's casefold test passes");
 170
 171     is( fc("aBİIıϐßﬃ񟿿"), "abi̇iıβssffi񟿿" );
 172     is( "\FaBİIıϐßﬃ񟿿", "abi̇iıβssffi񟿿" );
 173 #    TODO: {
 174 #        local $::TODO = "turkic special cases";
 175 #        is( fc "aBİIıϐßﬃ񟿿", "abiııβssffi񟿿" );
 176 #    }
 177
 178     # The next batch come from http://www.devdaily.com/java/jwarehouse/lucene/contrib/icu/src/test/org/apache/lucene/analysis/icu/TestICUFoldingFilter.java.shtml
 179     # Except the article got most casings wrong. Or maybe Lucene does.
 180
 181     is( fc("This is a test"), "this is a test" );
 182     is( fc("Ruß"), "russ"    );
 183     is( fc("ΜΆΪΟΣ"), "μάϊοσ" );
 184     is( fc("Μάϊος"), "μάϊοσ" );
 185     is( fc("𐐖"), "𐐾"       );
 186     is( fc("r\xe9sum\xe9"), "r\xe9sum\xe9" );
 187     is( fc("re\x{0301}sume\x{0301}"), "re\x{301}sume\x{301}" );
 188     is( fc("ELİF"), "eli\x{307}f" );
 189     is( fc("eli\x{307}f"), "eli\x{307}f");
 190
 191     # This batch comes from
 192     # http://www.java2s.com/Open-Source/Java-Document/Internationalization-Localization/icu4j/com/ibm/icu/dev/test/lang/UCharacterCaseTest.java.htm
 193     # Which uses ICU as the backend.
 194
 195     my @folding_mixed = (
 196         "\x{61}\x{42}\x{130}\x{49}\x{131}\x{3d0}\x{df}\x{fb03}",
 197         "A\x{df}\x{b5}\x{fb03}\x{1040C}\x{130}\x{131}",
 198     );
 199
 200     my @folding_default = (
 201         "\x{61}\x{62}\x{69}\x{307}\x{69}\x{131}\x{3b2}\x{73}\x{73}\x{66}\x{66}\x{69}",
 202         "ass\x{3bc}ffi\x{10434}i\x{307}\x{131}",
 203     );
 204
 205     my @folding_exclude_turkic = (
 206         "\x{61}\x{62}\x{69}\x{131}\x{131}\x{3b2}\x{73}\x{73}\x{66}\x{66}\x{69}",
 207         "ass\x{3bc}ffi\x{10434}i\x{131}",
 208     );
 209
 210     is( fc($folding_mixed[1]), $folding_default[1] );
 211
 212     is( fc($folding_mixed[0]), $folding_default[0] );
 213
 214 }
 215
 216 {
 217     use utf8;
 218     # Table stolen from tchrist's mail in
 219     # http://bugs.python.org/file23051/casing-tests.py
 220     # and http://98.245.80.27/tcpc/OSCON2011/case-test.python3
 221     # For reference, it's a longer version of what he posted here:
 222     # http://stackoverflow.com/questions/6991038/case-insensitive-storage-and-unicode-compatibility
 223
 224     #Couple of repeats because I'm lazy, not tchrist's fault.
 225
 226     #This should probably go in t/op/lc.t
 227
 228     my @test_table = (
 229 # ORIG LC_SIMPLE TC_SIMPLE UC_SIMPLE LC_FULL TC_FULL UC_FULL FC_SIMPLE FC_TURKIC FC_FULL
 230 [ 'þǽr rihtes', 'þǽr rihtes', 'Þǽr Rihtes', 'ÞǼR RIHTES', 'þǽr rihtes', 'Þǽr Rihtes', 'ÞǼR RIHTES', 'þǽr rihtes', 'þǽr rihtes', 'þǽr rihtes',  ],
 231 [ 'duȝeðlice', 'duȝeðlice', 'Duȝeðlice', 'DUȜEÐLICE', 'duȝeðlice', 'Duȝeðlice', 'DUȜEÐLICE', 'duȝeðlice', 'duȝeðlice', 'duȝeðlice',  ],
 232 [ 'Ævar Arnfjörð Bjarmason', 'ævar arnfjörð bjarmason', 'Ævar Arnfjörð Bjarmason', 'ÆVAR ARNFJÖRÐ BJARMASON', 'ævar arnfjörð bjarmason', 'Ævar Arnfjörð Bjarmason', 'ÆVAR ARNFJÖRÐ BJARMASON', 'ævar arnfjörð bjarmason', 'ævar arnfjörð bjarmason', 'ævar arnfjörð bjarmason',  ],
 233 [ 'Кириллица', 'кириллица', 'Кириллица', 'КИРИЛЛИЦА', 'кириллица', 'Кириллица', 'КИРИЛЛИЦА', 'кириллица', 'кириллица', 'кириллица',  ],
 234 [ 'ĳ', 'ĳ', 'Ĳ', 'Ĳ', 'ĳ', 'Ĳ', 'Ĳ', 'ĳ', 'ĳ', 'ĳ',  ],
 235 [ 'Van Dĳke', 'van dĳke', 'Van Dĳke', 'VAN DĲKE', 'van dĳke', 'Van Dĳke', 'VAN DĲKE', 'van dĳke', 'van dĳke', 'van dĳke',  ],
 236 [ 'VAN DĲKE', 'van dĳke', 'Van Dĳke', 'VAN DĲKE', 'van dĳke', 'Van Dĳke', 'VAN DĲKE', 'van dĳke', 'van dĳke', 'van dĳke',  ],
 237 [ 'eﬃcient', 'eﬃcient', 'Eﬃcient', 'EﬃCIENT', 'eﬃcient', 'Eﬃcient', 'EFFICIENT', 'eﬃcient', 'efficient', 'efficient',  ],
 238 [ 'ﬂour', 'ﬂour', 'ﬂour', 'ﬂOUR', 'ﬂour', 'Flour', 'FLOUR', 'ﬂour', 'flour', 'flour',  ],
 239 [ 'ﬂour and water', 'ﬂour and water', 'ﬂour And Water', 'ﬂOUR AND WATER', 'ﬂour and water', 'Flour And Water', 'FLOUR AND WATER', 'ﬂour and water', 'flour and water', 'flour and water',  ],
 240 [ 'ǳur', 'ǳur', 'ǲur', 'ǱUR', 'ǳur', 'ǲur', 'ǱUR', 'ǳur', 'ǳur', 'ǳur',  ],
 241 [ 'ǲur', 'ǳur', 'ǲur', 'ǱUR', 'ǳur', 'ǲur', 'ǱUR', 'ǳur', 'ǳur', 'ǳur',  ],
 242 [ 'ǱUR', 'ǳur', 'ǲur', 'ǱUR', 'ǳur', 'ǲur', 'ǱUR', 'ǳur', 'ǳur', 'ǳur',  ],
 243 [ 'ǳur mountain', 'ǳur mountain', 'ǲur Mountain', 'ǱUR MOUNTAIN', 'ǳur mountain', 'ǲur Mountain', 'ǱUR MOUNTAIN', 'ǳur mountain', 'ǳur mountain', 'ǳur mountain',  ],
 244 [ 'ǲur Mountain', 'ǳur mountain', 'ǲur Mountain', 'ǱUR MOUNTAIN', 'ǳur mountain', 'ǲur Mountain', 'ǱUR MOUNTAIN', 'ǳur mountain', 'ǳur mountain', 'ǳur mountain',  ],
 245 [ 'ǱUR MOUNTAIN', 'ǳur mountain', 'ǲur Mountain', 'ǱUR MOUNTAIN', 'ǳur mountain', 'ǲur Mountain', 'ǱUR MOUNTAIN', 'ǳur mountain', 'ǳur mountaın', 'ǳur mountain',  ],
 246 [ 'poſt', 'poſt', 'Poſt', 'POST', 'poſt', 'Poſt', 'POST', 'post', 'post', 'post',  ],
 247 [ 'poﬅ', 'poﬅ', 'Poﬅ', 'POﬅ', 'poﬅ', 'Poﬅ', 'POST', 'poﬅ', 'post', 'post',  ],
 248 [ 'ﬅop', 'ﬅop', 'ﬅop', 'ﬅOP', 'ﬅop', 'Stop', 'STOP', 'ﬅop', 'stop', 'stop',  ],
 249 [ 'tschüß', 'tschüß', 'Tschüß', 'TSCHÜß', 'tschüß', 'Tschüß', 'TSCHÜSS', 'tschüß', 'tschüss', 'tschüss',  ],
 250 [ 'TSCHÜẞ', 'tschüß', 'Tschüß', 'TSCHÜẞ', 'tschüß', 'Tschüß', 'TSCHÜẞ', 'tschüß', 'tschüss', 'tschüss',  ],
 251 [ 'weiß', 'weiß', 'Weiß', 'WEIß', 'weiß', 'Weiß', 'WEISS', 'weiß', 'weiss', 'weiss',  ],
 252 [ 'WEIẞ', 'weiß', 'Weiß', 'WEIẞ', 'weiß', 'Weiß', 'WEIẞ', 'weiß', 'weıss', 'weiss',  ],
 253 [ 'ẞIEW', 'ßiew', 'ẞiew', 'ẞIEW', 'ßiew', 'ẞiew', 'ẞIEW', 'ßiew', 'ssıew', 'ssiew',  ],
 254 [ 'ᾲ', 'ᾲ', 'Ὰͅ', 'ᾺΙ', 'ᾲ', 'Ὰͅ', 'ᾺΙ', 'ὰι', 'ὰι', 'ὰι',  ],
 255 [ 'Ὰι', 'ὰι', 'Ὰι', 'ᾺΙ', 'ὰι', 'Ὰι', 'ᾺΙ', 'ὰι', 'ὰι', 'ὰι',  ],
 256 [ 'ᾺΙ', 'ὰι', 'Ὰι', 'ᾺΙ', 'ὰι', 'Ὰι', 'ᾺΙ', 'ὰι', 'ὰι', 'ὰι',  ],
 257 [ 'ᾲ', 'ᾲ', 'ᾲ', 'ᾲ', 'ᾲ', 'Ὰͅ', 'ᾺΙ', 'ᾲ', 'ὰι', 'ὰι',  ],
 258 [ 'Ὰͅ', 'ᾲ', 'Ὰͅ', 'ᾺΙ', 'ᾲ', 'Ὰͅ', 'ᾺΙ', 'ὰι', 'ὰι', 'ὰι',  ],
 259 [ 'ᾺΙ', 'ὰι', 'Ὰι', 'ᾺΙ', 'ὰι', 'Ὰι', 'ᾺΙ', 'ὰι', 'ὰι', 'ὰι',  ],
 260 [ 'ᾲ στο διάολο', 'ᾲ στο διάολο', 'ᾲ Στο Διάολο', 'ᾲ ΣΤΟ ΔΙΆΟΛΟ', 'ᾲ στο διάολο', 'Ὰͅ Στο Διάολο', 'ᾺΙ ΣΤΟ ΔΙΆΟΛΟ', 'ᾲ στο διάολο', 'ὰι στο διάολο', 'ὰι στο διάολο',  ],
 261 [ 'ᾲ στο διάολο', 'ᾲ στο διάολο', 'Ὰͅ Στο Διάολο', 'ᾺΙ ΣΤΟ ΔΙΆΟΛΟ', 'ᾲ στο διάολο', 'Ὰͅ Στο Διάολο', 'ᾺΙ ΣΤΟ ΔΙΆΟΛΟ', 'ὰι στο διάολο', 'ὰι στο διάολο', 'ὰι στο διάολο',  ],
 262 [ '𐐼𐐯𐑅𐐨𐑉𐐯𐐻', '𐐼𐐯𐑅𐐨𐑉𐐯𐐻', '𐐔𐐯𐑅𐐨𐑉𐐯𐐻', '𐐔𐐇𐐝𐐀𐐡𐐇𐐓', '𐐼𐐯𐑅𐐨𐑉𐐯𐐻', '𐐔𐐯𐑅𐐨𐑉𐐯𐐻', '𐐔𐐇𐐝𐐀𐐡𐐇𐐓', '𐐼𐐯𐑅𐐨𐑉𐐯𐐻', '𐐼𐐯𐑅𐐨𐑉𐐯𐐻', '𐐼𐐯𐑅𐐨𐑉𐐯𐐻',  ],
 263 [ '𐐔𐐯𐑅𐐨𐑉𐐯𐐻', '𐐼𐐯𐑅𐐨𐑉𐐯𐐻', '𐐔𐐯𐑅𐐨𐑉𐐯𐐻', '𐐔𐐇𐐝𐐀𐐡𐐇𐐓', '𐐼𐐯𐑅𐐨𐑉𐐯𐐻', '𐐔𐐯𐑅𐐨𐑉𐐯𐐻', '𐐔𐐇𐐝𐐀𐐡𐐇𐐓', '𐐼𐐯𐑅𐐨𐑉𐐯𐐻', '𐐼𐐯𐑅𐐨𐑉𐐯𐐻', '𐐼𐐯𐑅𐐨𐑉𐐯𐐻',  ],
 264 [ '𐐔𐐇𐐝𐐀𐐡𐐇𐐓', '𐐼𐐯𐑅𐐨𐑉𐐯𐐻', '𐐔𐐯𐑅𐐨𐑉𐐯𐐻', '𐐔𐐇𐐝𐐀𐐡𐐇𐐓', '𐐼𐐯𐑅𐐨𐑉𐐯𐐻', '𐐔𐐯𐑅𐐨𐑉𐐯𐐻', '𐐔𐐇𐐝𐐀𐐡𐐇𐐓', '𐐼𐐯𐑅𐐨𐑉𐐯𐐻', '𐐼𐐯𐑅𐐨𐑉𐐯𐐻', '𐐼𐐯𐑅𐐨𐑉𐐯𐐻',  ],
 265 [ 'henry ⅷ', 'henry ⅷ', 'Henry Ⅷ', 'HENRY Ⅷ', 'henry ⅷ', 'Henry Ⅷ', 'HENRY Ⅷ', 'henry ⅷ', 'henry ⅷ', 'henry ⅷ',  ],
 266 [ 'Henry Ⅷ', 'henry ⅷ', 'Henry Ⅷ', 'HENRY Ⅷ', 'henry ⅷ', 'Henry Ⅷ', 'HENRY Ⅷ', 'henry ⅷ', 'henry ⅷ', 'henry ⅷ',  ],
 267 [ 'HENRY Ⅷ', 'henry ⅷ', 'Henry Ⅷ', 'HENRY Ⅷ', 'henry ⅷ', 'Henry Ⅷ', 'HENRY Ⅷ', 'henry ⅷ', 'henry ⅷ', 'henry ⅷ',  ],
 268 [ 'i work at ⓚ', 'i work at ⓚ', 'I Work At Ⓚ', 'I WORK AT Ⓚ', 'i work at ⓚ', 'I Work At Ⓚ', 'I WORK AT Ⓚ', 'i work at ⓚ', 'i work at ⓚ', 'i work at ⓚ',  ],
 269 [ 'I Work At Ⓚ', 'i work at ⓚ', 'I Work At Ⓚ', 'I WORK AT Ⓚ', 'i work at ⓚ', 'I Work At Ⓚ', 'I WORK AT Ⓚ', 'i work at ⓚ', 'ı work at ⓚ', 'i work at ⓚ',  ],
 270 [ 'I WORK AT Ⓚ', 'i work at ⓚ', 'I Work At Ⓚ', 'I WORK AT Ⓚ', 'i work at ⓚ', 'I Work At Ⓚ', 'I WORK AT Ⓚ', 'i work at ⓚ', 'ı work at ⓚ', 'i work at ⓚ',  ],
 271 [ 'istambul', 'istambul', 'Istambul', 'ISTAMBUL', 'istambul', 'Istambul', 'ISTAMBUL', 'istambul', 'istambul', 'istambul',  ],
 272 [ 'i̇stanbul', 'i̇stanbul', 'İstanbul', 'İSTANBUL', 'i̇stanbul', 'İstanbul', 'İSTANBUL', 'i̇stanbul', 'i̇stanbul', 'i̇stanbul',  ],
 273 [ 'İstanbul', 'i̇stanbul', 'İstanbul', 'İSTANBUL', 'i̇stanbul', 'İstanbul', 'İSTANBUL', 'i̇stanbul', 'ı̇stanbul', 'i̇stanbul',  ],
 274 [ 'İSTANBUL', 'istanbul', 'İstanbul', 'İSTANBUL', 'i̇stanbul', 'İstanbul', 'İSTANBUL', 'İstanbul', 'istanbul', 'i̇stanbul',  ],
 275 [ 'στιγμας', 'στιγμας', 'Στιγμας', 'ΣΤΙΓΜΑΣ', 'στιγμας', 'Στιγμας', 'ΣΤΙΓΜΑΣ', 'στιγμασ', 'στιγμασ', 'στιγμασ',  ],
 276 [ 'στιγμασ', 'στιγμασ', 'Στιγμασ', 'ΣΤΙΓΜΑΣ', 'στιγμασ', 'Στιγμασ', 'ΣΤΙΓΜΑΣ', 'στιγμασ', 'στιγμασ', 'στιγμασ',  ],
 277 [ 'ΣΤΙΓΜΑΣ', 'στιγμασ', 'Στιγμασ', 'ΣΤΙΓΜΑΣ', 'στιγμασ', 'Στιγμασ', 'ΣΤΙΓΜΑΣ', 'στιγμασ', 'στιγμασ', 'στιγμασ',  ],
 278 [ 'ʀᴀʀᴇ', 'ʀᴀʀᴇ', 'Ʀᴀʀᴇ', 'ƦᴀƦᴇ', 'ʀᴀʀᴇ', 'Ʀᴀʀᴇ', 'ƦᴀƦᴇ', 'ʀᴀʀᴇ', 'ʀᴀʀᴇ', 'ʀᴀʀᴇ',  ],
 279 [ 'Ʀᴀʀᴇ', 'ʀᴀʀᴇ', 'Ʀᴀʀᴇ', 'ƦᴀƦᴇ', 'ʀᴀʀᴇ', 'Ʀᴀʀᴇ', 'ƦᴀƦᴇ', 'ʀᴀʀᴇ', 'ʀᴀʀᴇ', 'ʀᴀʀᴇ',  ],
 280 [ 'ƦᴀƦᴇ', 'ʀᴀʀᴇ', 'Ʀᴀʀᴇ', 'ƦᴀƦᴇ', 'ʀᴀʀᴇ', 'Ʀᴀʀᴇ', 'ƦᴀƦᴇ', 'ʀᴀʀᴇ', 'ʀᴀʀᴇ', 'ʀᴀʀᴇ',  ],
 281 [ 'Ԧԧ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'ԧԧ', 'ԧԧ',  ],
 282 [ 'ԧԧ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'ԧԧ', 'ԧԧ',  ],
 283 [ 'Ԧԧ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'ԧԧ', 'ԧԧ',  ],
 284 [ 'ԦԦ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'Ԧԧ', 'ԦԦ', 'ԧԧ', 'ԧԧ', 'ԧԧ',  ],
 285 [ "þǽr rihtes", "þǽr rihtes", "Þǽr Rihtes", "ÞǼR RIHTES", "þǽr rihtes", "Þǽr Rihtes", "ÞǼR RIHTES", "þǽr rihtes", "þǽr rihtes", "þǽr rihtes",  ],
 286 [ "duȝeðlice", "duȝeðlice", "Duȝeðlice", "DUȜEÐLICE", "duȝeðlice", "Duȝeðlice", "DUȜEÐLICE", "duȝeðlice", "duȝeðlice", "duȝeðlice",  ],
 287 [ "Van Dĳke", "van dĳke", "Van Dĳke", "VAN DĲKE", "van dĳke", "Van Dĳke", "VAN DĲKE", "van dĳke", "van dĳke", "van dĳke",  ],
 288 [ "ﬁ", "ﬁ", "ﬁ", "ﬁ", "ﬁ", "Fi", "FI", "ﬁ", "fi", "fi",  ],
 289 [ "ﬁlesystem", "ﬁlesystem", "ﬁlesystem", "ﬁLESYSTEM", "ﬁlesystem", "Filesystem", "FILESYSTEM", "ﬁlesystem", "filesystem", "filesystem",  ],
 290 [ "eﬃcient", "eﬃcient", "Eﬃcient", "EﬃCIENT", "eﬃcient", "Eﬃcient", "EFFICIENT", "eﬃcient", "efficient", "efficient",  ],
 291 [ "ﬂour and water", "ﬂour and water", "ﬂour And Water", "ﬂOUR AND WATER", "ﬂour and water", "Flour And Water", "FLOUR AND WATER", "ﬂour and water", "flour and water", "flour and water",  ],
 292 [ "ǳ", "ǳ", "ǲ", "Ǳ", "ǳ", "ǲ", "Ǳ", "ǳ", "ǳ", "ǳ",  ],
 293 [ "ǳur mountain", "ǳur mountain", "ǲur Mountain", "ǱUR MOUNTAIN", "ǳur mountain", "ǲur Mountain", "ǱUR MOUNTAIN", "ǳur mountain", "ǳur mountain", "ǳur mountain",  ],
 294 [ "poſt", "poſt", "Poſt", "POST", "poſt", "Poſt", "POST", "post", "post", "post",  ],
 295 [ "poﬅ", "poﬅ", "Poﬅ", "POﬅ", "poﬅ", "Poﬅ", "POST", "poﬅ", "post", "post",  ],
 296 [ "ﬅop", "ﬅop", "ﬅop", "ﬅOP", "ﬅop", "Stop", "STOP", "ﬅop", "stop", "stop",  ],
 297 [ "tschüß", "tschüß", "Tschüß", "TSCHÜß", "tschüß", "Tschüß", "TSCHÜSS", "tschüß", "tschüss", "tschüss",  ],
 298 [ "TSCHÜẞ", "tschüß", "Tschüß", "TSCHÜẞ", "tschüß", "Tschüß", "TSCHÜẞ", "tschüß", "tschüss", "tschüss",  ],
 299 [ "rußland", "rußland", "Rußland", "RUßLAND", "rußland", "Rußland", "RUSSLAND", "rußland", "russland", "russland",  ],
 300 [ "RUẞLAND", "rußland", "Rußland", "RUẞLAND", "rußland", "Rußland", "RUẞLAND", "rußland", "russland", "russland",  ],
 301 [ "weiß", "weiß", "Weiß", "WEIß", "weiß", "Weiß", "WEISS", "weiß", "weiss", "weiss",  ],
 302 [ "WEIẞ", "weiß", "Weiß", "WEIẞ", "weiß", "Weiß", "WEIẞ", "weiß", "weıss", "weiss",  ],
 303 [ "ẞIEW", "ßiew", "ẞiew", "ẞIEW", "ßiew", "ẞiew", "ẞIEW", "ßiew", "ssıew", "ssiew",  ],
 304 [ "ͅ", "ͅ", "Ι", "Ι", "ͅ", "Ι", "Ι", "ι", "ι", "ι",  ],
 305 [ "ᾲ", "ᾲ", "Ὰͅ", "ᾺΙ", "ᾲ", "Ὰͅ", "ᾺΙ", "ὰι", "ὰι", "ὰι",  ],
 306 [ "Ὰι", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "ὰι", "ὰι",  ],
 307 [ "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "ὰι", "ὰι",  ],
 308 [ "ᾲ", "ᾲ", "ᾲ", "ᾲ", "ᾲ", "Ὰͅ", "ᾺΙ", "ᾲ", "ὰι", "ὰι",  ],
 309 [ "Ὰͅ", "ᾲ", "Ὰͅ", "ᾺΙ", "ᾲ", "Ὰͅ", "ᾺΙ", "ὰι", "ὰι", "ὰι",  ],
 310 [ "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "Ὰι", "ᾺΙ", "ὰι", "ὰι", "ὰι",  ],
 311 [ "ᾲ στο διάολο", "ᾲ στο διάολο", "ᾲ Στο Διάολο", "ᾲ ΣΤΟ ΔΙΆΟΛΟ", "ᾲ στο διάολο", "Ὰͅ Στο Διάολο", "ᾺΙ ΣΤΟ ΔΙΆΟΛΟ", "ᾲ στο διάολο", "ὰι στο διάολο", "ὰι στο διάολο",  ],
 312 [ "ᾲ στο διάολο", "ᾲ στο διάολο", "Ὰͅ Στο Διάολο", "ᾺΙ ΣΤΟ ΔΙΆΟΛΟ", "ᾲ στο διάολο", "Ὰͅ Στο Διάολο", "ᾺΙ ΣΤΟ ΔΙΆΟΛΟ", "ὰι στο διάολο", "ὰι στο διάολο", "ὰι στο διάολο",  ],
 313 [ "ⅷ", "ⅷ", "Ⅷ", "Ⅷ", "ⅷ", "Ⅷ", "Ⅷ", "ⅷ", "ⅷ", "ⅷ",  ],
 314 [ "henry ⅷ", "henry ⅷ", "Henry Ⅷ", "HENRY Ⅷ", "henry ⅷ", "Henry Ⅷ", "HENRY Ⅷ", "henry ⅷ", "henry ⅷ", "henry ⅷ",  ],
 315 [ "ⓚ", "ⓚ", "Ⓚ", "Ⓚ", "ⓚ", "Ⓚ", "Ⓚ", "ⓚ", "ⓚ", "ⓚ",  ],
 316 [ "i work at ⓚ", "i work at ⓚ", "I Work At Ⓚ", "I WORK AT Ⓚ", "i work at ⓚ", "I Work At Ⓚ", "I WORK AT Ⓚ", "i work at ⓚ", "i work at ⓚ", "i work at ⓚ",  ],
 317 [ "istambul", "istambul", "Istambul", "ISTAMBUL", "istambul", "Istambul", "ISTAMBUL", "istambul", "istambul", "istambul",  ],
 318 [ "i̇stanbul", "i̇stanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "i̇stanbul", "i̇stanbul",  ],
 319 [ "İstanbul", "i̇stanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "ı̇stanbul", "i̇stanbul",  ],
 320 [ "İSTANBUL", "istanbul", "İstanbul", "İSTANBUL", "i̇stanbul", "İstanbul", "İSTANBUL", "İstanbul", "istanbul", "i̇stanbul",  ],
 321 [ "στιγμας", "στιγμας", "Στιγμας", "ΣΤΙΓΜΑΣ", "στιγμας", "Στιγμας", "ΣΤΙΓΜΑΣ", "στιγμασ", "στιγμασ", "στιγμασ",  ],
 322 [ "στιγμασ", "στιγμασ", "Στιγμασ", "ΣΤΙΓΜΑΣ", "στιγμασ", "Στιγμασ", "ΣΤΙΓΜΑΣ", "στιγμασ", "στιγμασ", "στιγμασ",  ],
 323 [ "ΣΤΙΓΜΑΣ", "στιγμασ", "Στιγμασ", "ΣΤΙΓΜΑΣ", "στιγμασ", "Στιγμασ", "ΣΤΙΓΜΑΣ", "στιγμασ", "στιγμασ", "στιγμασ",  ],
 324 [ "ʀᴀʀᴇ", "ʀᴀʀᴇ", "Ʀᴀʀᴇ", "ƦᴀƦᴇ", "ʀᴀʀᴇ", "Ʀᴀʀᴇ", "ƦᴀƦᴇ", "ʀᴀʀᴇ", "ʀᴀʀᴇ", "ʀᴀʀᴇ",  ],
 325 [ "𐐼𐐯𐑅𐐨𐑉𐐯𐐻", "𐐼𐐯𐑅𐐨𐑉𐐯𐐻", "𐐔𐐯𐑅𐐨𐑉𐐯𐐻", "𐐔𐐇𐐝𐐀𐐡𐐇𐐓", "𐐼𐐯𐑅𐐨𐑉𐐯𐐻", "𐐔𐐯𐑅𐐨𐑉𐐯𐐻", "𐐔𐐇𐐝𐐀𐐡𐐇𐐓", "𐐼𐐯𐑅𐐨𐑉𐐯𐐻", "𐐼𐐯𐑅𐐨𐑉𐐯𐐻", "𐐼𐐯𐑅𐐨𐑉𐐯𐐻",  ],
 326 [ "Ԧԧ", "ԧԧ", "Ԧԧ", "ԦԦ", "ԧԧ", "Ԧԧ", "ԦԦ", "ԧԧ", "ԧԧ", "ԧԧ",  ],
 327 [ "ﬓﬔﬕﬖﬗ", "ﬓﬔﬕﬖﬗ", "ﬓﬔﬕﬖﬗ", "ﬓﬔﬕﬖﬗ", "ﬓﬔﬕﬖﬗ", "Մնﬔﬕﬖﬗ", "ՄՆՄԵՄԻՎՆՄԽ", "ﬓﬔﬕﬖﬗ", "մնմեմիվնմխ", "մնմեմիվնմխ",  ],
 328 [ "ŉ groot", "ŉ groot", "ŉ Groot", "ŉ GROOT", "ŉ groot", "ʼN Groot", "ʼN GROOT", "ŉ groot", "ʼn groot", "ʼn groot",  ],
 329 [ "ẚ", "ẚ", "ẚ", "ẚ", "ẚ", "Aʾ", "Aʾ", "ẚ", "aʾ", "aʾ",  ],
 330 [ "ﬀ", "ﬀ", "ﬀ", "ﬀ", "ﬀ", "Ff", "FF", "ﬀ", "ff", "ff",  ],
 331 [ "ǰ", "ǰ", "ǰ", "ǰ", "ǰ", "J̌", "J̌", "ǰ", "ǰ", "ǰ",  ],
 332 [ "550 nm or Å", "550 nm or å", "550 Nm Or Å", "550 NM OR Å", "550 nm or å", "550 Nm Or Å", "550 NM OR Å", "550 nm or å", "550 nm or å", "550 nm or å",  ],
 333 );
 334
 335     use feature qw(fc);
 336
 337     for (@test_table) {
 338         my ($simple_lc, $simple_tc, $simple_uc, $simple_fc) = @{$_}[1, 2, 3, 7];
 339         my ($orig, $lower, $titlecase, $upper, $fc_turkic, $fc_full) = @{$_}[0,4,5,6,8,9];
 340
 341         if ($orig =~ /(\P{Assigned})/) {   # So can fail gracefully in earlier
 342                                            # Unicode versions
 343             fail(sprintf "because U+%04X is unassigned", ord($1));
 344             next;
 345         }
 346         is( fc($orig), $fc_full, "fc('$orig') returns '$fc_full'" );
 347         is( "\F$orig", $fc_full, '\F works' );
 348         is( lc($orig), $lower,   "lc('$orig') returns '$lower'" );
 349         is( "\L$orig", $lower,   '\L works' );
 350         is( uc($orig), $upper,   "uc('$orig') returns '$upper'" );
 351         is( "\U$orig", $upper,   '\U works' );
 352     }
 353 }
 354
 355 {
 356     use feature qw(fc);
 357     package Eeyup  { use overload q{""} => sub { "\x{df}"   }, fallback => 1 }
 358     package Uunope { use overload q{""} => sub { "\x{30cb}" }, fallback => 1 }
 359     package Undef  { use overload q{""} => sub {   undef    }, fallback => 1 }
 360
 361     my $obj = bless {}, "Eeyup";
 362     is(fc($obj), "ss", "fc() works on overloaded objects returning latin-1");
 363     $obj = bless {}, "Eeyup";
 364     is("\F$obj", "ss", '\F works on overloaded objects returning latin-1');
 365
 366     $obj = bless {}, "Uunope";
 367     is(fc($obj), "\x{30cb}", "fc() works on overloaded objects returning UTF-8");
 368     $obj = bless {}, "Uunope";
 369     is("\F$obj", "\x{30cb}", '\F works on overloaded objects returning UTF-8');
 370
 371     $obj = bless {}, "Undef";
 372     my $warnings;
 373     {
 374         no warnings;
 375         use warnings "uninitialized";
 376         local $SIG{__WARN__} = sub { $warnings++; like(shift, qr/Use of uninitialized value (?:\$obj )?in fc/) };
 377         fc(undef);
 378         fc($obj);
 379     }
 380     is( $warnings, 2, "correct number of warnings" );
 381
 382     my $fetched = 0;
 383     package Derpy { sub TIESCALAR { bless {}, shift } sub FETCH { $fetched++; "\x{df}" } }
 384
 385     tie my $x, "Derpy";
 386
 387     is( fc($x), "ss", "fc() works on tied values" );
 388     is( $fetched, 1, "and only calls the magic once" );
 389
 390 }
 391
 392 {
 393     use feature qw( fc );
 394     my $troublesome1 = "\xdf" x 11; #SvLEN should be 12, SvCUR should be 11
 395                                     #So this should force fc() to grow the string.
 396
 397     is( fc($troublesome1), "ss" x 11, "fc() grows the string" );
 398
 399     my $troublesome2 = "abcdef:\x{df}:fjksjs"; #SvLEN should be 16, SvCUR should be 15
 400     is( fc($troublesome2), "abcdef:ss:fjksjs", "fc() expands \\x{DF} in the middle of a string that needs to grow" );
 401
 402     my $troublesome3 = ":\x{df}:";
 403     is( fc($troublesome3), ":ss:", "fc() expands \\x{DF} in the middle of a string" );
 404
 405
 406     my $troublesome4 = "\x{B5}"; #\N{MICRON SIGN} is latin-1, but its foldcase is in UTF-8
 407
 408     is( fc($troublesome4), "\x{3BC}", "fc() for a latin-1 \x{B5} returns UTF-8" );
 409     ok( !utf8::is_utf8($troublesome4), "fc() doesn't upgrade the original string" );
 410
 411
 412     my $troublesome5 = "\x{C9}abda\x{B5}aaf\x{C8}"; # Up until foldcasing \x{B5}, the string
 413                                                     # was in Latin-1. This tests that the
 414                                                     # results don't have illegal UTF-8
 415                                                     # (i.e. leftover latin-1) in them
 416
 417     is( fc($troublesome5), "\x{E9}abda\x{3BC}aaf\x{E8}" );
 418 }
 419
 420
 421 SKIP: {
 422     use feature qw( fc unicode_strings );
 423
 424     eval { require POSIX; import POSIX 'locale_h'; };
 425     unless (defined &POSIX::LC_ALL) {
 426        skip "no POSIX (or no Fcntl, or no dynamic loading)", 256;
 427     }
 428
 429     setlocale(&POSIX::LC_ALL, "C") if $Config{d_setlocale};
 430
 431     # This tests both code paths in pp_fc
 432
 433     for (0..0xff) {
 434         my $latin1 = chr;
 435         my $utf8   = $latin1;
 436         utf8::downgrade($latin1); #No-op, but doesn't hurt
 437         utf8::upgrade($utf8);
 438         is(fc($latin1), fc($utf8), "fc() gives the same results for \\x{$_} in Latin-1 and UTF-8 under unicode_strings");
 439         SKIP: {
 440               skip 'No locale testing without d_setlocale', 2 if(!$Config{d_setlocale});
 441               BEGIN {
 442                   if($Config{d_setlocale}) {
 443                       require locale; import locale;
 444                   }
 445               }
 446             is(fc($latin1), lc($latin1), "use locale; fc(qq{\\x{$_}}), lc(qq{\\x{$_}}) when qq{\\x{$_}} is in latin-1");
 447             is(fc($utf8), lc($utf8), "use locale; fc(qq{\\x{$_}}), lc(qq{\\x{$_}}) when qq{\\x{$_}} is in latin-1");
 448         }
 449         {
 450             no feature 'unicode_strings';
 451             is(fc($latin1), lc($latin1), "under nothing, fc() for <256 is the same as lc");
 452         }
 453     }
 454 }
 455
 456 my $utf8_locale = find_utf8_ctype_locale();
 457
 458 {
 459     use feature qw( fc );
 460     use locale;
 461     is(fc("\x{1E9E}"), fc("\x{17F}\x{17F}"), 'fc("\x{1E9E}") eq fc("\x{17F}\x{17F}")');
 462     SKIP: {
 463         skip 'Can\'t find a UTF-8 locale', 1 unless defined $utf8_locale;
 464         setlocale(&LC_CTYPE, $utf8_locale);
 465         is(fc("\x{1E9E}"), "ss", 'fc("\x{1E9E}") eq "ss" in a UTF-8 locale)');
 466     }
 467 }
 468
 469 SKIP: {
 470     skip 'Can\'t find a UTF-8 locale', 256 unless defined $utf8_locale;
 471
 472     use feature qw( fc unicode_strings );
 473
 474     # Get the official fc values outside locale.
 475     no locale;
 476     my @unicode_fc;
 477     for (0..0xff) {
 478         push @unicode_fc, fc(chr);
 479     }
 480
 481     # These should match the UTF-8 locale values
 482     setlocale(&LC_CTYPE, $utf8_locale);
 483     use locale;
 484     for (0..0xff) {
 485         is(fc(chr), $unicode_fc[$_], "In a UTF-8 locale, fc(chr $_) is the same as official Unicode");
 486     }
 487 }
 488
 489
 490 my $num_tests = curr_test() - 1;
 491
 492 plan($num_tests);