lib/feature/unicode_strings.t

   1 use warnings;
   2 use strict;
   3
   4 BEGIN {
   5     chdir 't' if -d 't';
   6     @INC = '../lib';
   7     require './test.pl';
   8 }
   9
  10 plan(20736);    # Determined by experimentation
  11
  12 # In this section, test the upper/lower/title case mappings for all characters
  13 # 0-255.
  14
  15 # First compute the case mappings without resorting to the functions we're
  16 # testing.
  17
  18 # Initialize the arrays so each $i maps to itself.
  19 my @posix_to_upper;
  20 for my $i (0 .. 255) {
  21     $posix_to_upper[$i] = chr($i);
  22 }
  23 my @posix_to_lower
  24 = my @posix_to_title
  25 = my @latin1_to_upper
  26 = my @latin1_to_lower
  27 = my @latin1_to_title
  28 = @posix_to_upper;
  29
  30 # Override the elements in the to_lower arrays that have different standard
  31 # lower case mappings.  (standard meaning they are 32 numbers apart)
  32 for my $i (0x41 .. 0x5A, 0xC0 .. 0xD6, 0xD8 .. 0xDE) {
  33     my $upper_ord = utf8::unicode_to_native $i;
  34     my $lower_ord = utf8::unicode_to_native($i + 32);
  35
  36     $latin1_to_lower[$upper_ord] = chr($lower_ord);
  37
  38     next if $i > 127;
  39
  40     $posix_to_lower[$upper_ord] = chr($lower_ord);
  41 }
  42
  43 # Same for upper and title
  44 for my $i (0x61 .. 0x7A, 0xE0 .. 0xF6, 0xF8 .. 0xFE) {
  45     my $lower_ord = utf8::unicode_to_native $i;
  46     my $upper_ord = utf8::unicode_to_native($i - 32);
  47
  48     $latin1_to_upper[$lower_ord] = chr($upper_ord);
  49     $latin1_to_title[$lower_ord] = chr($upper_ord);
  50
  51     next if $i > 127;
  52
  53     $posix_to_upper[$lower_ord] = chr($upper_ord);
  54     $posix_to_title[$lower_ord] = chr($upper_ord);
  55 }
  56
  57 # Override the abnormal cases.
  58 $latin1_to_upper[utf8::unicode_to_native 0xB5] = chr(0x39C);
  59 $latin1_to_title[utf8::unicode_to_native 0xB5] = chr(0x39C);
  60 $latin1_to_upper[utf8::unicode_to_native 0xDF] = 'SS';
  61 $latin1_to_title[utf8::unicode_to_native 0xDF] = 'Ss';
  62 $latin1_to_upper[utf8::unicode_to_native 0xFF] = chr(0x178);
  63 $latin1_to_title[utf8::unicode_to_native 0xFF] = chr(0x178);
  64
  65 my $repeat = 25;    # Length to make strings.
  66
  67 # Create hashes of strings in several ranges, both for uc and lc.
  68 my %posix;
  69 $posix{'uc'} = 'A' x $repeat;
  70 $posix{'lc'} = 'a' x $repeat ;
  71
  72 my %cyrillic;
  73 $cyrillic{'uc'} = chr(0x42F) x $repeat;
  74 $cyrillic{'lc'} = chr(0x44F) x $repeat;
  75
  76 my %latin1;
  77 $latin1{'uc'} = chr(utf8::unicode_to_native 0xD8) x $repeat;
  78 $latin1{'lc'} = chr(utf8::unicode_to_native 0xF8) x $repeat;
  79
  80 my %empty;
  81 $empty{'lc'} = $empty{'uc'} = "";
  82
  83 # Loop so prefix each character being tested with nothing, and the various
  84 # strings; then loop for suffixes of those strings as well.
  85 for my  $prefix (\%empty, \%posix, \%cyrillic, \%latin1) {
  86     for my  $suffix (\%empty, \%posix, \%cyrillic, \%latin1) {
  87         for my $i (0 .. 255) {  # For each possible posix or latin1 character
  88             my $cp = sprintf "U+%04X", $i;
  89
  90             # First try using latin1 (Unicode) semantics.
  91             use feature "unicode_strings";
  92
  93             my $phrase = 'in uni8bit';
  94             my $char = chr($i);
  95             my $pre_lc = $prefix->{'lc'};
  96             my $pre_uc = $prefix->{'uc'};
  97             my $post_lc = $suffix->{'lc'};
  98             my $post_uc = $suffix->{'uc'};
  99             my $to_upper = $pre_lc . $char . $post_lc;
 100             my $expected_upper = $pre_uc . $latin1_to_upper[$i] . $post_uc;
 101             my $to_lower = $pre_uc . $char . $post_uc;
 102             my $expected_lower = $pre_lc . $latin1_to_lower[$i] . $post_lc;
 103
 104             is (uc($to_upper), $expected_upper,
 105                 display("$cp: $phrase: Verify uc($to_upper) eq $expected_upper"));
 106             is (lc($to_lower), $expected_lower,
 107                 display("$cp: $phrase: Verify lc($to_lower) eq $expected_lower"));
 108
 109             if ($pre_uc eq "") {    # Title case if null prefix.
 110                 my $expected_title = $latin1_to_title[$i] . $post_lc;
 111                 is (ucfirst($to_upper), $expected_title,
 112                     display("$cp: $phrase: Verify ucfirst($to_upper) eq $expected_title"));
 113                 my $expected_lcfirst = $latin1_to_lower[$i] . $post_uc;
 114                 is (lcfirst($to_lower), $expected_lcfirst,
 115                     display("$cp: $phrase: Verify lcfirst($to_lower) eq $expected_lcfirst"));
 116             }
 117
 118             # Then try with posix semantics.
 119             no feature "unicode_strings";
 120             $phrase = 'no uni8bit';
 121
 122             # These don't contribute anything in this case.
 123             next if $suffix == \%cyrillic;
 124             next if $suffix == \%latin1;
 125             next if $prefix == \%cyrillic;
 126             next if $prefix == \%latin1;
 127
 128             $expected_upper = $pre_uc . $posix_to_upper[$i] . $post_uc;
 129             $expected_lower = $pre_lc . $posix_to_lower[$i] . $post_lc;
 130
 131             is (uc($to_upper), $expected_upper,
 132                 display("$cp: $phrase: Verify uc($to_upper) eq $expected_upper"));
 133             is (lc($to_lower), $expected_lower,
 134                 display("$cp: $phrase: Verify lc($to_lower) eq $expected_lower"));
 135
 136             if ($pre_uc eq "") {
 137                 my $expected_title = $posix_to_title[$i] . $post_lc;
 138                 is (ucfirst($to_upper), $expected_title,
 139                     display("$cp: $phrase: Verify ucfirst($to_upper) eq $expected_title"));
 140                 my $expected_lcfirst = $posix_to_lower[$i] . $post_uc;
 141                 is (lcfirst($to_lower), $expected_lcfirst,
 142                     display("$cp: $phrase: Verify lcfirst($to_lower) eq $expected_lcfirst"));
 143             }
 144         }
 145     }
 146 }
 147
 148 # In this section test that \w, \s, and \b (and complements) work correctly.
 149 # These are the only character classes affected by this pragma.  Above ASCII
 150 # range Latin-1 characters are in \w and \s iff the pragma is on.
 151
 152 # Construct the expected full Latin1 values without using anything we're
 153 # testing.  All these were determined manually by looking in the manual.
 154 # Boolean: is w[$i] a \w character?
 155 my @w = (0) x 256;
 156 for my $i ( 0x30 .. 0x39,   # 0-9
 157             0x41 .. 0x5a,   # A-Z
 158             0x61 .. 0x7a,   # a-z
 159             0x5F,           # _
 160             0xAA,           # FEMININE ORDINAL INDICATOR
 161             0xB5,           # MICRO SIGN
 162             0xBA,           # MASCULINE ORDINAL INDICATOR
 163             0xC0 .. 0xD6,   # various
 164             0xD8 .. 0xF6,   # various
 165             0xF8 .. 0xFF,   # various
 166         )
 167 {
 168     $w[utf8::unicode_to_native $i] = 1;
 169 }
 170
 171 # Boolean: is s[$i] a \s character?
 172 my @s = (0) x 256;
 173 $s[utf8::unicode_to_native 0x09] = 1;   # Tab
 174 $s[utf8::unicode_to_native 0x0A] = 1;   # LF
 175 $s[utf8::unicode_to_native 0x0B] = 1;   # VT
 176 $s[utf8::unicode_to_native 0x0C] = 1;   # FF
 177 $s[utf8::unicode_to_native 0x0D] = 1;   # CR
 178 $s[utf8::unicode_to_native 0x20] = 1;   # SPACE
 179 $s[utf8::unicode_to_native 0x85] = 1;   # NEL
 180 $s[utf8::unicode_to_native 0xA0] = 1;   # NO BREAK SPACE
 181
 182 for my $i (0 .. 255) {
 183     my $char = chr($i);
 184     my $hex_i = sprintf "%02X", $i;
 185     foreach my $which (\@s, \@w) {
 186         my $basic_name;
 187         if ($which == \@s) {
 188             $basic_name = 's';
 189         } else {
 190             $basic_name = 'w'
 191         }
 192
 193         # Test \w \W \s \S
 194         foreach my $complement (0, 1) {
 195             my $name = '\\' . (($complement) ? uc($basic_name) : $basic_name);
 196
 197             # in and out of [...]
 198             foreach my $charclass (0, 1) {
 199
 200                 # And like [^...] or just plain [...]
 201                 foreach my $complement_class (0, 1) {
 202                     next if ! $charclass && $complement_class;
 203
 204                     # Start with the boolean as to if the character is in the
 205                     # class, and then complement as needed.
 206                     my $expect_success = $which->[$i];
 207                     $expect_success = ! $expect_success if $complement;
 208                     $expect_success = ! $expect_success if $complement_class;
 209
 210                     my $test = $name;
 211                     $test = "^$test" if $complement_class;
 212                     $test = "[$test]" if $charclass;
 213                     $test = "^$test\$";
 214
 215                     use feature 'unicode_strings';
 216                     my $prefix = "in uni8bit; Verify chr(0x$hex_i)";
 217                     if ($expect_success) {
 218                         like($char, qr/$test/, display("$prefix =~ qr/$test/"));
 219                     } else {
 220                         unlike($char, qr/$test/, display("$prefix !~ qr/$test/"));
 221                     }
 222
 223                     no feature 'unicode_strings';
 224                     $prefix = "no uni8bit; Verify chr(0x$hex_i)";
 225
 226                     # With the legacy, nothing above 128 should be in the
 227                     # class
 228                     if (utf8::native_to_unicode($i) >= 128) {
 229                         $expect_success = 0;
 230                         $expect_success = ! $expect_success if $complement;
 231                         $expect_success = ! $expect_success if $complement_class;
 232                     }
 233                     if ($expect_success) {
 234                         like($char, qr/$test/, display("$prefix =~ qr/$test/"));
 235                     } else {
 236                         unlike($char, qr/$test/, display("$prefix !~ qr/$test/"));
 237                     }
 238                 }
 239             }
 240         }
 241     }
 242
 243     # Similarly for \b and \B.
 244     foreach my $complement (0, 1) {
 245         my $name = '\\' . (($complement) ? 'B' : 'b');
 246         my $expect_success = ! $w[$i];  # \b is complement of \w
 247         $expect_success = ! $expect_success if $complement;
 248
 249         my $string = "a$char";
 250         my $test = "(^a$name\\x{$hex_i}\$)";
 251
 252         use feature 'unicode_strings';
 253         my $prefix = "in uni8bit; Verify $string";
 254         if ($expect_success) {
 255             like($string, qr/$test/, display("$prefix =~ qr/$test/"));
 256         } else {
 257             unlike($string, qr/$test/, display("$prefix !~ qr/$test/"));
 258         }
 259
 260         no feature 'unicode_strings';
 261         $prefix = "no uni8bit; Verify $string";
 262         if (utf8::native_to_unicode($i) >= 128) {
 263             $expect_success = 1;
 264             $expect_success = ! $expect_success if $complement;
 265         }
 266         if ($expect_success) {
 267             like($string, qr/$test/, display("$prefix =~ qr/$test/"));
 268             like($string, qr/$test/, display("$prefix =~ qr/$test/"));
 269         } else {
 270             unlike($string, qr/$test/, display("$prefix !~ qr/$test/"));
 271         }
 272     }
 273 }