t/re/charset.t

   1 # Test the /a, /d, etc regex modifiers
   2
   3 BEGIN {
   4     chdir 't' if -d 't';
   5     require './test.pl';
   6     set_up_inc('../lib', '../dist/if');
   7     require './loc_tools.pl';
   8 }
   9
  10 use strict;
  11 use warnings;
  12 no warnings 'locale';   # Some /l tests use above-latin1 chars to make sure
  13                         # they work, even though they warn.
  14 use Config;
  15
  16 plan('no_plan');
  17
  18 # Each case is a valid element of its hash key.  Choose, where available, an
  19 # ASCII-range, Latin-1 non-ASCII range, and above Latin1 range code point.
  20 my %testcases = (
  21     '\w' => [ ord("A"), utf8::unicode_to_native(0xE2), 0x16B ],   # Below expects these to all be alpha
  22     '\d' => [ ord("0"), 0x0662 ],
  23     '\s' => [ ord("\t"), utf8::unicode_to_native(0xA0), 0x1680 ],  # Below expects these to be [:blank:]
  24     '[:cntrl:]' => [ utf8::unicode_to_native(0x00), utf8::unicode_to_native(0x88) ],
  25     '[:graph:]' => [ ord("&"), utf8::unicode_to_native(0xF7), 0x02C7 ], # Below expects these to be
  26                                                                      # [:print:]
  27     '[:lower:]' => [ ord("g"), utf8::unicode_to_native(0xE3), 0x0127 ],
  28     '[:punct:]' => [ ord('`'), ord('^'), ord('~'), ord('<'), ord('='), ord('>'), ord('|'), ord('-'), ord(','), ord(';'), ord(':'), ord('!'), ord('?'), ord('/'), ord('.'), ord('"'), ord('('), ord(')'), ord('['), ord(']'), ord('{'), ord('}'), ord('@'), ord('$'), ord('*'), ord('\\'), ord('&'), ord('#'), ord('%'), ord('+'), ord("'"), utf8::unicode_to_native(0xBF), 0x055C ],
  29     '[:upper:]' => [ ord("G"), utf8::unicode_to_native(0xC3), 0x0126 ],
  30     '[:xdigit:]' => [ ord("4"), 0xFF15 ],
  31 );
  32
  33 $testcases{'[:digit:]'} = $testcases{'\d'};
  34 $testcases{'[:alnum:]'} = $testcases{'\w'};
  35 $testcases{'[:alpha:]'} = $testcases{'\w'};
  36 $testcases{'[:blank:]'} = $testcases{'\s'};
  37 $testcases{'[:print:]'} = $testcases{'[:graph:]'};
  38 $testcases{'[:space:]'} = $testcases{'\s'};
  39 $testcases{'[:word:]'} = $testcases{'\w'};
  40
  41 my $utf8_locale;
  42
  43 my @charsets = qw(a d u aa);
  44 my $locales_ok = eval { locales_enabled('LC_CTYPE'); 1 };
  45 if (! is_miniperl() && $locales_ok) {
  46     require POSIX;
  47     my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // "";
  48     if ($current_locale eq 'C') {
  49
  50         # test for d_setlocale is repeated here because this one is compile
  51         # time, and the one above is run time
  52         use if $Config{d_setlocale}, 'locale';
  53
  54         # Some implementations don't have the 128-255 range characters all
  55         # mean nothing under the C locale (an example being VMS).  This is
  56         # legal, but since we don't know what the right answers should be,
  57         # skip the locale tests in that situation.
  58         for my $i (128 .. 255) {
  59             goto skip_adding_C_locale
  60                               if chr(utf8::unicode_to_native($i)) =~ /[[:print:]]/;
  61         }
  62         push @charsets, 'l';
  63
  64     skip_adding_C_locale:
  65
  66         # Use a pseudo-modifier 'L' to indicate to use /l with a UTF-8 locale
  67         $utf8_locale = find_utf8_ctype_locale();
  68         push @charsets, 'L' if defined $utf8_locale;
  69     }
  70 }
  71
  72 # For each possible character set...
  73 foreach my $charset (@charsets) {
  74     my $locale;
  75     my $charset_mod = lc $charset;
  76     my $charset_display;
  77     if ($charset_mod eq 'l') {
  78         $locale = POSIX::setlocale(&POSIX::LC_ALL, ($charset eq 'l')
  79                                             ? "C"
  80                                             : $utf8_locale
  81                            );
  82         die "Couldn't change locale" unless $locale;
  83         $charset_display = $charset_mod . " ($locale)";
  84     }
  85     else {
  86         $charset_display = $charset_mod;
  87     }
  88
  89     # And in utf8 or not
  90     foreach my $upgrade ("", 'utf8::upgrade($a); ') {
  91
  92         # reverse gets the, \w, \s, \d first.
  93         for my $class (reverse sort keys %testcases) {
  94
  95             # The complement of \w is \W; of [:posix:] is [:^posix:]
  96             my $complement = $class;
  97             if ($complement !~ s/ ( \[: ) /$1^/x) {
  98                 $complement = uc($class);
  99             }
 100
 101             # For each test case
 102             foreach my $ord (@{$testcases{$class}}) {
 103                 my $char = chr($ord);
 104                 $char = ($char eq '$') ? '\$' : display($char);
 105
 106                 # > 255 already implies upgraded.  Skip the ones that don't
 107                 # have an explicit upgrade.  This shows more clearly in the
 108                 # output which tests are in utf8, or not.
 109                 next if $ord > 255 && ! $upgrade;
 110
 111                 my $reason = "";    # Explanation output with each test
 112                 my $neg_reason = "";
 113                 my $match = 1;      # Calculated whether test regex should
 114                                     # match or not
 115
 116                 # Everything always matches in ASCII, or under /u, or under /l
 117                 # with a UTF-8 locale
 118                 if (utf8::native_to_unicode($ord) < 128
 119                     || $charset eq 'u'
 120                     || $charset eq 'L')
 121                 {
 122                     $reason = "\"$char\" is a $class under /$charset_display";
 123                     $neg_reason = "\"$char\" is not a $complement under /$charset_display";
 124                 }
 125                 elsif ($charset eq "a" || $charset eq "aa") {
 126                     $match = 0;
 127                     $reason = "\"$char\" is non-ASCII, which can't be a $class under /$charset_display";
 128                     $neg_reason = "\"$char\" is non-ASCII, which is a $complement under /$charset_display";
 129                 }
 130                 elsif ($ord > 255) {
 131                     $reason = "\"$char\" is a $class under /$charset_display";
 132                     $neg_reason = "\"$char\" is not a $complement under /$charset_display";
 133                 }
 134                 elsif ($charset eq 'l') {
 135
 136                     # We are using the C locale, which is essentially ASCII,
 137                     # but under utf8, the above-latin1 chars are treated as
 138                     # Unicode)
 139                     $reason = "\"$char\" is not a $class in the C locale under /$charset_mod";
 140                     $neg_reason = "\"$char\" is a $complement in the C locale under /$charset_mod";
 141                     $match = 0;
 142                 }
 143                 elsif ($upgrade) {
 144                     $reason = "\"$char\" is a $class in utf8 under /$charset_display";
 145                     $neg_reason = "\"$char\" is not a $complement in utf8 under /$charset_display";
 146                 }
 147                 else {
 148                     $reason = "\"$char\" is above-ASCII latin1, which requires utf8 to be a $class under /$charset_display";
 149                     $neg_reason = "\"$char\" is above-ASCII latin1, which is a $complement under /$charset_display (unless in utf8)";
 150                     $match = 0;
 151                 }
 152                 $reason = "; $reason" if $reason;
 153                 $neg_reason = "; $neg_reason" if $neg_reason;
 154
 155                 my $op;
 156                 my $neg_op;
 157                 if ($match) {
 158                     $op = '=~';
 159                     $neg_op = '!~';
 160                 }
 161                 else {
 162                     $op = '!~';
 163                     $neg_op = '=~';
 164                 }
 165
 166                 # In [...] or not
 167                 foreach my $bracketed (0, 1) {
 168                     my $lb = "";
 169                     my $rb = "";
 170                     if ($bracketed) {
 171
 172                         # Adds an extra char to the character class to make sure
 173                         # that the class doesn't get optimized away.
 174                         $lb = ($bracketed) ? '[_' : "";
 175                         $rb = ($bracketed) ? ']' : "";
 176                     }
 177                     else {  # [:posix:] must be inside outer [ ]
 178                         next if $class =~ /\[/;
 179                     }
 180
 181                     my $length = 10;    # For regexec.c regrepeat() cases by
 182                                         # matching more than one item
 183                     # Test both class and its complement, and with one or more
 184                     # than one item to match.
 185                     foreach my $eval (
 186                         qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb ) /x],
 187                         qq[my \$a = "$char" x $length; $upgrade\$a $op qr/ (?$charset_mod: $lb$class$rb\{$length} ) /x],
 188                     ) {
 189                         ok (eval $eval, $eval . $reason);
 190                     }
 191                     foreach my $eval (
 192                         qq[my \$a = "$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb ) /x],
 193                         qq[my \$a = "$char" x $length; $upgrade\$a $neg_op qr/ (?$charset_mod: $lb$complement$rb\{$length} ) /x],
 194                     ) {
 195                         ok (eval $eval, $eval . $neg_reason);
 196                     }
 197                 }
 198
 199                 next if $class ne '\w';
 200
 201                 # Test \b, \B at beginning and end of string
 202                 foreach my $eval (
 203                     qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: ^ \\b . ) /x],
 204                     qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b \$) /x],
 205                 ) {
 206                     ok (eval $eval, $eval . $reason);
 207                 }
 208                 foreach my $eval (
 209                     qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: ^ \\B . ) /x],
 210                     qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset_mod: . \\B \$ ) /x],
 211                 ) {
 212                     ok (eval $eval, $eval . $neg_reason);
 213                 }
 214
 215                 # Test \b, \B adjacent to a non-word char, both before it and
 216                 # after.  We test with ASCII, Latin1 and Unicode non-word chars
 217                 foreach my $space_ord (@{$testcases{'\s'}}) {
 218
 219                     # Useless to try to test non-utf8 when the ord itself
 220                     # forces utf8
 221                     next if $space_ord > 255 && ! $upgrade;
 222
 223                     my $space = display(chr $space_ord);
 224
 225                     foreach my $eval (
 226                         qq[my \$a = "$space$char"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x],
 227                         qq[my \$a = "$char$space"; $upgrade\$a $op qr/ (?$charset_mod: . \\b . ) /x],
 228                     ) {
 229                         ok (eval $eval, $eval . $reason . "; \"$space\" is not a \\w");
 230                     }
 231                     foreach my $eval (
 232                         qq[my \$a = "$space$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x],
 233                         qq[my \$a = "$char$space"; $upgrade\$a $neg_op qr/ (?$charset_mod: . \\B . ) /x],
 234                     ) {
 235                         ok (eval $eval, $eval . $neg_reason . "; \"$space\" is not a \\w");
 236                     }
 237                 }
 238
 239                 # Test \b, \B in the middle of two nominally word chars, but
 240                 # one or both may be considered non-word depending on range
 241                 # and charset.
 242                 foreach my $other_ord (@{$testcases{'\w'}}) {
 243                     next if $other_ord > 255 && ! $upgrade;
 244                     my $other = display(chr $other_ord);
 245
 246                     # Determine if the other char is a word char in current
 247                     # circumstances
 248                     my $other_is_word = 1;
 249                     my $other_reason = "\"$other\" is a $class under /$charset_display";
 250                     my $other_neg_reason = "\"$other\" is not a $complement under /$charset_display";
 251                     if (utf8::native_to_unicode($other_ord) > 127
 252                         && $charset ne 'u' && $charset ne 'L'
 253                         && (($charset eq "a" || $charset eq "aa")
 254                             || ($other_ord < 256 && ($charset eq 'l' || ! $upgrade))))
 255                     {
 256                         $other_is_word = 0;
 257                         $other_reason = "\"$other\" is not a $class under /$charset_display";
 258                         $other_neg_reason = "\"$other\" is a $complement under /$charset_display";
 259                     }
 260                     my $both_reason = $reason;
 261                     $both_reason .= "; $other_reason" if $other_ord != $ord;
 262                     my $both_neg_reason = $neg_reason;
 263                     $both_neg_reason .= "; $other_neg_reason" if $other_ord != $ord;
 264
 265                     # If both are the same wordness, then \b will fail; \B
 266                     # succeed
 267                     if ($match == $other_is_word) {
 268                         $op = '!~';
 269                         $neg_op = '=~';
 270                     }
 271                     else {
 272                         $op = '=~';
 273                         $neg_op = '!~';
 274                     }
 275
 276                     foreach my $eval (
 277                         qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: $other \\b $char ) /x],
 278                         qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: $char \\b $other ) /x],
 279                     ) {
 280                         ok (eval $eval, $eval . $both_reason);
 281                     }
 282                     foreach my $eval (
 283                         qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: $other \\B $char ) /x],
 284                         qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: $char \\B $other ) /x],
 285                     ) {
 286                         ok (eval $eval, $eval . $both_neg_reason);
 287                     }
 288
 289                     next if $other_ord == $ord;
 290
 291                     # These start with the \b or \B.  They are included, based
 292                     # on source code analysis, to force the testing of the FBC
 293                     # (find_by_class) portions of regexec.c.
 294                     foreach my $eval (
 295                         qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset_mod: \\b $char ) /x],
 296                         qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset_mod: \\b $other ) /x],
 297                     ) {
 298                         ok (eval $eval, $eval . $both_reason);
 299                     }
 300                     foreach my $eval (
 301                         qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $char ) /x],
 302                         qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset_mod: \\B $other ) /x],
 303                     ) {
 304                         ok (eval $eval, $eval . $both_neg_reason);
 305                     }
 306                 }
 307             } # End of each test case in a class
 308         } # End of \w, \s, ...
 309     } # End of utf8 upgraded or not
 310 }
 311
 312 plan(curr_test() - 1);