t/re/charset.t

   1 # Test the /a, /d, etc. regex modifiers
   2
   3 BEGIN {                      # Set up paths before anything else is compiled
   4     chdir 't' if -d 't';     # Run from the t/ directory when it exists
   5     @INC = '../lib';         # Replace the module search path with ../lib only
   6     require './test.pl';     # Presumably supplies plan(), ok(), display(), etc.
   7 }
   8
   9 use strict;
  10 use warnings;
  11
  12 plan('no_plan');  # Count isn't known up front; plan() is called again at the end
  13
  14 # Maps each class to code points that are members of it.  Chosen, where
  15 # available, are an ASCII, a Latin-1 non-ASCII, and an above-Latin-1 code point.
  16 my %testcases = (
  17     '\w' => [ ord("A"), 0xE2, 0x16B ],   # Below expects these to all be alpha
  18     '\d' => [ ord("0"), 0x0662 ],        # (no Latin-1 non-ASCII entry here)
  19     '\s' => [ ord("\t"), 0xA0, 0x1680 ],  # Below expects these to be [:blank:]
  20     '[:cntrl:]' => [ 0x00, 0x88 ],
  21     '[:graph:]' => [ ord("&"), 0xF7, 0x02C7 ], # Below expects these to be
  22                                                # [:print:]
  23     '[:lower:]' => [ ord("g"), 0xE3, 0x0127 ],
  24     '[:punct:]' => [ ord("!"), 0xBF, 0x055C ],
  25     '[:upper:]' => [ ord("G"), 0xC3, 0x0126 ],
  26     '[:xdigit:]' => [ ord("4"), 0xFF15 ],
  27 );
  28 # The classes below reuse (share the array ref of) another class's test cases
  29 $testcases{'[:digit:]'} = $testcases{'\d'};
  30 $testcases{'[:alnum:]'} = $testcases{'\w'};
  31 $testcases{'[:alpha:]'} = $testcases{'\w'};
  32 $testcases{'[:blank:]'} = $testcases{'\s'};
  33 $testcases{'[:print:]'} = $testcases{'[:graph:]'};
  34 $testcases{'[:space:]'} = $testcases{'\s'};
  35 $testcases{'[:word:]'} = $testcases{'\w'};
  36 # Character set modifiers to test.  'l' is added only if the probe below passes.
  37 my @charsets = qw(a d u aa);
  38 if (! is_miniperl()) {      # The locale probe is skipped when is_miniperl() is true
  39     require POSIX;          # The '// ""' below guards against an undef return
  40     my $current_locale = POSIX::setlocale( &POSIX::LC_ALL, "C") // "";
  41     if ($current_locale eq 'C') {
  42         use locale;         # So that the match in the probe uses locale rules
  43
  44         # In some locale implementations, not all of the 128-255 characters
  45         # are meaningless (some are printable).  Skip the locale tests then.
  46         for my $i (128 .. 255) {
  47             goto bad_locale if chr($i) =~ /[[:print:]]/;
  48         }
  49         push @charsets, 'l';
  50     bad_locale:             # Jumped to from the probe, bypassing the push above
  51     }
  52 }
  53 # Main driver.  Each test is built as a string of Perl source, eval'ed, and
  54 # its result passed to ok().  For each possible character set...
  55 foreach my $charset (@charsets) {
  56
  57     # ...with the string upgraded to utf8 or not ($upgrade is spliced into evals)
  58     foreach my $upgrade ("", 'utf8::upgrade($a); ') {
  59
  60         # Reverse sorting puts \w, \s, \d before the [:posix:] classes.
  61         for my $class (reverse sort keys %testcases) {
  62
  63             # The complement of \w is \W; of [:posix:] is [:^posix:]
  64             my $complement = $class;
  65             if ($complement !~ s/ ( \[: ) /$1^/x) {
  66                 $complement = uc($class);   # No '[:' found, so e.g. \w -> \W
  67             }
  68
  69             # For each test case (a code point listed for $class)
  70             foreach my $ord (@{$testcases{$class}}) {
  71                 my $char = display(chr($ord));  # display() is defined elsewhere
  72
  73                 # > 255 already implies upgraded.  Skip the ones that don't
  74                 # have an explicit upgrade.  This shows more clearly in the
  75                 # output which tests are in utf8, or not.
  76                 next if $ord > 255 && ! $upgrade;
  77
  78                 my $reason = "";    # Explanation output with each test
  79                 my $neg_reason = "";    # Likewise, for the complement tests
  80                 my $match = 1;      # Whether the class is expected to match
  81                                     # the character in these circumstances
  82
  83                 # Everything always matches in ASCII, or under /u
  84                 if ($ord < 128 || $charset eq 'u') {
  85                     $reason = "\"$char\" is a $class under /$charset";
  86                     $neg_reason = "\"$char\" is not a $complement under /$charset";
  87                 }
  88                 elsif ($charset eq "a" || $charset eq "aa") {
  89                     $match = 0;     # (messages below say "/a" even for "aa")
  90                     $reason = "\"$char\" is non-ASCII, which can't be a $class under /a";
  91                     $neg_reason = "\"$char\" is non-ASCII, which is a $complement under /a";
  92                 }
  93                 elsif ($ord > 255) {    # Only /d and /l get this far
  94                     $reason = "\"$char\" is a $class under /$charset";
  95                     $neg_reason = "\"$char\" is not a $complement under /$charset";
  96                 }
  97                 elsif ($charset eq 'l') {
  98
  99                     # We are using the C locale, which is essentially ASCII
 100                     # (but under utf8, the above-latin1 chars are treated as
 101                     # Unicode, which the branch above handles)
 102                     $reason = "\"$char\" is not a $class in this locale under /l";
 103                     $neg_reason = "\"$char\" is a $complement in this locale under /l";
 104                     $match = 0;
 105                 }
 106                 elsif ($upgrade) {      # /d, Latin-1 non-ASCII, upgraded
 107                     $reason = "\"$char\" is a $class in utf8 under /d";
 108                     $neg_reason = "\"$char\" is not a $complement in utf8 under /d";
 109                 }
 110                 else {                  # /d, Latin-1 non-ASCII, not upgraded
 111                     $reason = "\"$char\" is above-ASCII latin1, which requires utf8 to be a $class under /d";
 112                     $neg_reason = "\"$char\" is above-ASCII latin1, which is a $complement under /d (unless in utf8)";
 113                     $match = 0;
 114                 }
 115                 $reason = "; $reason" if $reason;   # Separator before appending
 116                 $neg_reason = "; $neg_reason" if $neg_reason;
 117
 118                 my $op;         # Operator that makes the class test pass
 119                 my $neg_op;     # Operator that makes the complement test pass
 120                 if ($match) {
 121                     $op = '=~';
 122                     $neg_op = '!~';
 123                 }
 124                 else {
 125                     $op = '!~';
 126                     $neg_op = '=~';
 127                 }
 128
 129                 # Test the class both bare and inside a bracketed [...]
 130                 foreach my $bracketed (0, 1) {
 131                     my $lb = "";
 132                     my $rb = "";
 133                     if ($bracketed) {
 134                         # Adds an extra char to the character class to make sure
 135                         # that the class doesn't get optimized away.  ($bracketed is
 136                         # known true here, so each ternary yields its first arm.)
 137                         $lb = ($bracketed) ? '[_' : "";
 138                         $rb = ($bracketed) ? ']' : "";
 139                     }
 140                     else {  # [:posix:] must be inside outer [ ]
 141                         next if $class =~ /\[/;
 142                     }
 143
 144                     my $length = 10;    # For regexec.c regrepeat() cases by
 145                                         # matching more than one item
 146                     # Test both the class and its complement, matching a single
 147                     # item and then $length repeated items.
 148                     foreach my $eval (
 149                         qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: $lb$class$rb ) /x],
 150                         qq[my \$a = "$char" x $length; $upgrade\$a $op qr/ (?$charset: $lb$class$rb\{$length} ) /x],
 151                     ) {
 152                         ok (eval $eval, $eval . $reason);
 153                     }
 154                     foreach my $eval (
 155                         qq[my \$a = "$char"; $upgrade\$a $neg_op qr/ (?$charset: $lb$complement$rb ) /x],
 156                         qq[my \$a = "$char" x $length; $upgrade\$a $neg_op qr/ (?$charset: $lb$complement$rb\{$length} ) /x],
 157                     ) {
 158                         ok (eval $eval, $eval . $neg_reason);
 159                     }
 160                 }
 161
 162                 next if $class ne '\w';     # The \b, \B tests below run only for \w
 163
 164                 # Test \b, \B at beginning and end of string
 165                 foreach my $eval (
 166                     qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: ^ \\b . ) /x],
 167                     qq[my \$a = "$char"; $upgrade\$a $op qr/ (?$charset: . \\b \$) /x],
 168                 ) {
 169                     ok (eval $eval, $eval . $reason);
 170                 }
 171                 foreach my $eval (
 172                     qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset: ^ \\B . ) /x],
 173                     qq[my \$a = "$char"; $upgrade\$a $neg_op qr/(?$charset: . \\B \$ ) /x],
 174                 ) {
 175                     ok (eval $eval, $eval . $neg_reason);
 176                 }
 177
 178                 # Test \b, \B adjacent to a non-word char, both before it and
 179                 # after.  We test with ASCII, Latin1 and Unicode non-word chars
 180                 foreach my $space_ord (@{$testcases{'\s'}}) {
 181
 182                     # Useless to try to test non-utf8 when the ord itself
 183                     # forces utf8
 184                     next if $space_ord > 255 && ! $upgrade;
 185
 186                     my $space = display(chr $space_ord);
 187
 188                     foreach my $eval (
 189                         qq[my \$a = "$space$char"; $upgrade\$a $op qr/ (?$charset: . \\b . ) /x],
 190                         qq[my \$a = "$char$space"; $upgrade\$a $op qr/ (?$charset: . \\b . ) /x],
 191                     ) {
 192                         ok (eval $eval, $eval . $reason . "; \"$space\" is not a \\w");
 193                     }
 194                     foreach my $eval (
 195                         qq[my \$a = "$space$char"; $upgrade\$a $neg_op qr/ (?$charset: . \\B . ) /x],
 196                         qq[my \$a = "$char$space"; $upgrade\$a $neg_op qr/ (?$charset: . \\B . ) /x],
 197                     ) {
 198                         ok (eval $eval, $eval . $neg_reason . "; \"$space\" is not a \\w");
 199                     }
 200                 }
 201
 202                 # Test \b, \B in the middle of two nominally word chars, but
 203                 # one or both may be considered non-word depending on range
 204                 # and charset.
 205                 foreach my $other_ord (@{$testcases{'\w'}}) {
 206                     next if $other_ord > 255 && ! $upgrade;
 207                     my $other = display(chr $other_ord);
 208                     # Determine if the other char is a word char in current
 209                     # circumstances: a non-ASCII one isn't under /a or /aa, nor is a
 210                     # Latin-1 one under /l or when not upgraded (under /u it always is)
 211                     my $other_is_word = 1;
 212                     my $other_reason = "\"$other\" is a $class under /$charset";
 213                     my $other_neg_reason = "\"$other\" is not a $complement under /$charset";
 214                     if ($other_ord > 127
 215                         && $charset ne 'u'
 216                         && (($charset eq "a" || $charset eq "aa")
 217                             || ($other_ord < 256 && ($charset eq 'l' || ! $upgrade))))
 218                     {
 219                         $other_is_word = 0;
 220                         $other_reason = "\"$other\" is not a $class under /$charset";
 221                         $other_neg_reason = "\"$other\" is a $complement under /$charset";
 222                     }
 223                     my $both_reason = $reason;
 224                     $both_reason .= "; $other_reason" if $other_ord != $ord;
 225                     my $both_neg_reason = $neg_reason;
 226                     $both_neg_reason .= "; $other_neg_reason" if $other_ord != $ord;
 227
 228                     # If both are the same wordness, then \b will fail; \B
 229                     # succeed
 230                     if ($match == $other_is_word) {
 231                         $op = '!~';     # Overwrites the per-$ord value set above
 232                         $neg_op = '=~';
 233                     }
 234                     else {
 235                         $op = '=~';
 236                         $neg_op = '!~';
 237                     }
 238
 239                     foreach my $eval (
 240                         qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset: $other \\b $char ) /x],
 241                         qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset: $char \\b $other ) /x],
 242                     ) {
 243                         ok (eval $eval, $eval . $both_reason);
 244                     }
 245                     foreach my $eval (
 246                         qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset: $other \\B $char ) /x],
 247                         qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset: $char \\B $other ) /x],
 248                     ) {
 249                         ok (eval $eval, $eval . $both_neg_reason);
 250                     }
 251
 252                     next if $other_ord == $ord;
 253
 254                     # These start with the \b or \B.  They are included, based
 255                     # on source code analysis, to force the testing of the FBC
 256                     # (find_by_class) portions of regexec.c.
 257                     foreach my $eval (
 258                         qq[my \$a = "$other$char"; $upgrade\$a $op qr/ (?$charset: \\b $char ) /x],
 259                         qq[my \$a = "$char$other"; $upgrade\$a $op qr/ (?$charset: \\b $other ) /x],
 260                     ) {
 261                         ok (eval $eval, $eval . $both_reason);
 262                     }
 263                     foreach my $eval (
 264                         qq[my \$a = "$other$char"; $upgrade\$a $neg_op qr/ (?$charset: \\B $char ) /x],
 265                         qq[my \$a = "$char$other"; $upgrade\$a $neg_op qr/ (?$charset: \\B $other ) /x],
 266                     ) {
 267                         ok (eval $eval, $eval . $both_neg_reason);
 268                     }
 269                 }
 270             } # End of each test case in a class
 271         } # End of \w, \s, ...
 272     } # End of utf8 upgraded or not
 273 } # End of each character set
 274 # The run began with plan('no_plan'); now that all tests have run, set the plan.
 275 plan(curr_test() - 1);  # presumably curr_test() is the next test number - confirm