t/re/reg_fold.t

   1 #!perl
   2
   3 BEGIN {
   4     chdir 't' if -d 't';
   5     @INC = '../lib';
   6     require './test.pl';
   7     skip_all_if_miniperl("no dynamic loading on miniperl, no File::Spec");
   8 }
   9
  10 use strict;
  11 use warnings;
  12 my @tests;
  13
  14 my $file="../lib/unicore/CaseFolding.txt";
  15 my @folds;
  16 use Unicode::UCD;
  17
  18 # Use the Unicode data file if we are on an ASCII platform (which its data is
  19 # for), and it is in the modern format (starting in Unicode 3.1.0) and it is
  20 # available.  This avoids being affected by potential bugs introduced by other
  21 # layers of Perl
  22 if (ord('A') == 65
  23     && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
  24     && open my $fh, "<", $file)
  25 {
  26     @folds = <$fh>;
  27 }
  28 else {
  29     my ($invlist_ref, $invmap_ref, undef, $default)
  30                                     = Unicode::UCD::prop_invmap('Case_Folding');
  31     for my $i (0 .. @$invlist_ref - 1 - 1) {
  32         next if $invmap_ref->[$i] == $default;
  33         my $adjust = -1;
  34         for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
  35             $adjust++;
  36
  37             # Single-code point maps go to a 'C' type
  38             if (! ref $invmap_ref->[$i]) {
  39                 push @folds, sprintf("%04X; C; %04X\n",
  40                                      $j,
  41                                      $invmap_ref->[$i] + $adjust);
  42             }
  43             else {  # Multi-code point maps go to 'F'.  prop_invmap()
  44                     # guarantees that no adjustment is needed for these,
  45                     # as the range will contain just one element
  46                 push @folds, sprintf("%04X; F; %s\n",
  47                                     $j,
  48                                     join " ", map { sprintf "%04X", $_ }
  49                                                     @{$invmap_ref->[$i]});
  50             }
  51         }
  52     }
  53 }
  54
  55 for  (@folds) {
  56     chomp;
  57     my ($line,$comment)= split/\s+#\s+/, $_;
  58     $comment = "" unless defined $comment;
  59     my ($cp,$type,@folded)=split/[\s;]+/,$line||'';
  60     next unless $type and ($type eq 'F' or $type eq 'C');
  61     my $fold_above_latin1 = grep { hex("0x$_") > 255 } @folded;
  62     $_="\\x{$_}" for @folded;
  63     my $cpv=hex("0x$cp");
  64     my $chr="\\x{$cp}";
  65     my @str;
  66     foreach my $swap (0, 1) {   # swap lhs and rhs, or not.
  67         foreach my $charclass (0, 1) {   # Put rhs in [...], or not
  68             my $lhs;
  69             my $rhs;
  70             if ($swap) {
  71                 $lhs = join "", @folded;
  72                 $rhs = $chr;
  73                 $rhs = "[$rhs]" if $charclass;
  74             } else {
  75                 #next if $charclass && @folded > 1;
  76                 $lhs = $chr;
  77                 $rhs = "";
  78                 foreach my $rhs_char (@folded) {
  79
  80                     # The colon is an unrelated character to the rest of the
  81                     # class, and makes sure no optimization into an EXACTish
  82                     # node occurs.
  83                     $rhs .= '[:' if $charclass;
  84                     $rhs .=  $rhs_char;
  85                     $rhs .= ']' if $charclass;
  86                 }
  87             }
  88             $lhs = "\"$lhs\"";
  89             $rhs = "/^$rhs\$/iu";
  90
  91             # Try both Latin1 and Unicode for code points below 256
  92             foreach my $upgrade ("", 'utf8::upgrade($c); ') {
  93                 if ($upgrade) { # No need to upgrade if already must be in
  94                                 # utf8
  95                     next if $swap && $fold_above_latin1;
  96                     next if !$swap && $cpv > 255;
  97                 }
  98                 my $eval = "my \$c = $lhs; $upgrade\$c =~ $rhs";
  99                 #print __LINE__, ": $eval\n";
 100                 push @tests, qq[ok(eval '$eval', '$eval - $comment')];
 101                 if (! $swap && $charclass && @folded > 1)
 102                 {
 103                     $tests[-1]="TODO: { local \$::TODO='A multi-char fold \"foo\", doesnt work for /[f][o][o]/i';\n$tests[-1] }"
 104                 }
 105             }
 106         }
 107     }
 108 }
 109
 110 # Now verify the case folding tables.  First compute the mappings without
 111 # resorting to the functions we're testing.
 112
 113 # Initialize the array so each $i maps to itself.
 114 my @fold_ascii;
 115 for my $i (0 .. 255) {
 116     $fold_ascii[$i] = $i;
 117 }
 118 my @fold_latin1 = @fold_ascii;
 119
 120 # Override the uppercase elements to fold to their lower case equivalents,
 121 # using the fact that 'A' in ASCII is 0x41, 'a' is 0x41+32, 'B' is 0x42, and
 122 # so on.  The same paradigm applies for most of the Latin1 range cased
 123 # characters, but in posix anything outside ASCII maps to itself, as we've
 124 # already set up.
 125 for my $i (0x41 .. 0x5A, 0xC0 .. 0xD6, 0xD8 .. 0xDE) {
 126     my $upper_ord = ord_latin1_to_native($i);
 127     my $lower_ord = ord_latin1_to_native($i + 32);
 128
 129     $fold_latin1[$upper_ord] = $lower_ord;
 130
 131     next if $i > 127;
 132     $fold_ascii[$upper_ord] = $lower_ord;
 133 }
 134
 135 # Same for folding lower to the upper equivalents
 136 for my $i (0x61 .. 0x7A, 0xE0 .. 0xF6, 0xF8 .. 0xFE) {
 137     my $lower_ord = ord_latin1_to_native($i);
 138     my $upper_ord = ord_latin1_to_native($i - 32);
 139
 140     $fold_latin1[$lower_ord] = $upper_ord;
 141
 142     next if $i > 127;
 143     $fold_ascii[$lower_ord] = $upper_ord;
 144 }
 145
 146 # Test every latin1 character for the correct values in both /u and /d
 147 for my $i (0 .. 255) {
 148     my $chr = sprintf "\\x%02X", $i;
 149     my $hex_fold_ascii = sprintf "0x%02X", $fold_ascii[$i];
 150     my $hex_fold_latin1 = sprintf "0x%02X", $fold_latin1[$i];
 151     push @tests, qq[like chr($hex_fold_ascii), qr/(?d:$chr)/i, 'chr($hex_fold_ascii) =~ qr/(?d:$chr)/i'];
 152     push @tests, qq[like chr($hex_fold_latin1), qr/(?u:$chr)/i, 'chr($hex_fold_latin1) =~ qr/(?u:$chr)/i'];
 153 }
 154
 155
 156 push @tests, qq[like chr(0x0430), qr/[=\x{0410}-\x{0411}]/i, 'Bug #71752 Unicode /i char in a range'];
 157 push @tests, qq[like 'a', qr/\\p{Upper}/i, "'a' =~ /\\\\p{Upper}/i"];
 158 push @tests, q[my $c = "\x{212A}"; my $p = qr/(?:^[K_]+$)/i; utf8::upgrade($p); like $c, $p, 'Bug #78994: my $c = "\x{212A}"; my $p = qr/(?:^[K_]+$)/i; utf8::upgrade($p); $c =~ $p'];
 159
 160 use charnames ":full";
 161 my $e_grave = latin1_to_native("\xE8");
 162 push @tests, q[my $re1 = "\N{WHITE SMILING FACE}";like $e_grave, qr/[\w$re1]/, 'my $re = "\N{WHITE SMILING FACE}"; $e_grave =~ qr/[\w$re]/'];
 163 push @tests, q[my $re2 = "\N{WHITE SMILING FACE}";like $e_grave, qr/\w|$re2/, 'my $re = "\N{WHITE SMILING FACE}"; $e_grave =~ qr/\w|$re/'];
 164
 165 eval join ";\n","plan tests=>". (scalar @tests), @tests, "1"
 166     or die $@;
 167 __DATA__