t/re/reg_fold.t

   1 #!perl
   2
   3 BEGIN {   # Compile-time setup: runs before the remainder of the file is compiled.
   4     chdir 't' if -d 't';   # Work from inside t/ when that directory exists, so the relative paths below resolve.
   5     @INC = '../lib';   # Replace the whole module search path with the single entry ../lib.
   6     require './test.pl';   # Load the helper script; presumably the source of ok/like/plan used by the generated tests.
   7     skip_all_if_miniperl("no dynamic loading on miniperl, no File::Spec");
   8 }
   9
  10 use strict;
  11 use warnings;
  12 my @tests;   # Collects test statements as source text; they are joined and eval'ed at the end of the file.
  13
  14 my %todo_pass = map { $_ => 1 }   # Code points exempted from the last TODO branch of the generator loop.
  15             qw(00DF 1E9E FB00 FB01 FB02 FB03 FB04 FB05 FB06);
  16
  17 my $file="../lib/unicore/CaseFolding.txt";   # Path is relative to the directory selected in BEGIN.
  18 open my $fh,"<",$file or die "Failed to read '$file': $!";
  19 while (<$fh>) {   # Process the data file one line at a time; the current line is in $_.
  20     chomp;
  21     my ($line,$comment)= split/\s+#\s+/, $_;   # Separate the data fields from the text that follows the '#' marker.
  22     my ($cp,$type,@folded)=split/[\s;]+/,$line||'';   # Fields are delimited by runs of whitespace and/or ';'.
  23     next unless $type and ($type eq 'F' or $type eq 'C');   # Ignore lines whose type is neither 'F' nor 'C'.
  24     next if $type eq 'C';   # 'C' tests now done by fold_grind.t
  25     my $fold_above_latin1 = grep { hex("0x$_") > 255 } @folded;   # Count of folded code points above 255.
  26     $_="\\x{$_}" for @folded;   # Rewrite each folded code point as \x{...} escape text for the generated code.
  27     my $cpv=hex("0x$cp");   # Numeric value of the source code point.
  28     my $chr="\\x{$cp}";   # The source code point as \x{...} escape text.
  29     my @str;   # NOTE(review): never used in the code visible here.
  30     foreach my $swap (0, 1) {   # swap lhs and rhs, or not.
  31         foreach my $charclass (0) {   # Put rhs in [...], or not.  Only 0 is listed, so every "$charclass" condition below is currently inert.
  32             my $lhs;
  33             my $rhs;
  34             if ($swap) {   # Swapped: the folded sequence is the string, the original character is the pattern.
  35                 $lhs = join "", @folded;
  36                 $rhs = $chr;
  37                 $rhs = "[$rhs]" if $charclass;
  38             } else {   # Not swapped: the original character is the string, the folded sequence is the pattern.
  39                 $lhs = $chr;
  40                 $rhs = "";
  41                 foreach my $rhs_char (@folded) {
  42                     $rhs .= '[' if $charclass;
  43                     $rhs .=  $rhs_char;
  44                     $rhs .= ']' if $charclass;
  45                 }
  46             }
  47             $lhs = "\"$lhs\"";   # Double-quoted literal, so the \x{...} escapes are decoded when the test text is eval'ed.
  48             $rhs = "/^$rhs\$/i";   # Anchored, case-insensitive match.
  49
  50             # Try both Latin1 and Unicode for code points below 256
  51             foreach my $upgrade ("", 'utf8::upgrade($c); ') {
  52                 if ($upgrade) {   # Skip the upgraded variant when the string side already holds a code point above 255.
  53                     next if $swap && $fold_above_latin1;
  54                     next if !$swap && $cpv > 255;
  55                 }
  56                 my $eval = "my \$c = $lhs; $upgrade\$c =~ $rhs";   # Source text of one match attempt against $c.
  57                 #print __LINE__, ": $eval\n";
  58                 push @tests, qq[ok(eval '$eval', '$eval - $comment')];   # The test name repeats the code plus the data file's comment text.
  59                 if ($charclass && @folded > 1 && $swap && ! $upgrade && ! $fold_above_latin1) {   # Each branch wraps the test just pushed in a TODO block.
  60                     $tests[-1]="TODO: { local \$::TODO='Multi-char, non-utf8 folded inside character class [ ] doesnt work';\n$tests[-1] }"
  61                 } elsif (! $upgrade && $cpv >= 128 && $cpv <= 255 && $cpv != 0xb5) {
  62                     $tests[-1]="TODO: { local \$::TODO='Most non-utf8 latin1 doesnt work';\n$tests[-1] }"
  63                 } elsif (! $swap && $charclass && @folded > 1
  64                     && ! $todo_pass{$cp})
  65                 {
  66                     # There are a few of these that pass; most fail.  NOTE(review): 'f8' in the message below looks like a typo for 'utf8'; it is a runtime string and is left as is.
  67                     $tests[-1]="TODO: { local \$::TODO='Some multi-char, f8 folded inside character class [ ] doesnt work';\n$tests[-1] }"
  68                 }
  69             }
  70         }
  71     }
  72 }
  73
  74 # Now verify the case folding tables.  First compute the mappings without
  75 # resorting to the functions we're testing.
  76
  77 # Initialize the array so each $i maps to itself.
  78 my @fold_ascii;
  79 for my $i (0 .. 255) {
  80     $fold_ascii[$i] = $i;
  81 }
  82 my @fold_latin1 = @fold_ascii;   # Independent copy; both tables start out as the identity mapping.
  83
  84 # Override the uppercase elements to fold to their lower case equivalents,
  85 # using the fact that 'A' in ASCII is 0x41, 'a' is 0x41+32, 'B' is 0x42, and
  86 # so on.  The same paradigm applies for most of the Latin1 range cased
  87 # characters, but in posix anything outside ASCII maps to itself, as we've
  88 # already set up.
  89 for my $i (0x41 .. 0x5A, 0xC0 .. 0xD6, 0xD8 .. 0xDE) {   # 0xD7 is deliberately absent from the ranges.
  90     my $upper_ord = ord_latin1_to_native($i);   # NOTE(review): helper is not defined in this file; presumably supplied by test.pl -- confirm.
  91     my $lower_ord = ord_latin1_to_native($i + 32);
  92
  93     $fold_latin1[$upper_ord] = $lower_ord;
  94
  95     next if $i > 127;   # Above 127 only the Latin-1 table is changed; the ASCII table keeps the identity mapping.
  96     $fold_ascii[$upper_ord] = $lower_ord;
  97 }
  98
  99 # Same for folding lower to the upper equivalents
 100 for my $i (0x61 .. 0x7A, 0xE0 .. 0xF6, 0xF8 .. 0xFE) {   # 0xF7 is deliberately absent from the ranges.
 101     my $lower_ord = ord_latin1_to_native($i);
 102     my $upper_ord = ord_latin1_to_native($i - 32);
 103
 104     $fold_latin1[$lower_ord] = $upper_ord;
 105
 106     next if $i > 127;   # Same split as above: the ASCII table is only touched for code points up to 127.
 107     $fold_ascii[$lower_ord] = $upper_ord;
 108 }
 109
 110 # Test that every latin1 character has the correct values in both /u and /d
 111 for my $i (0 .. 255) {
 112     my $chr = sprintf "\\x%02X", $i;   # Pattern text: a \xNN escape for the character under test.
 113     my $hex_fold_ascii = sprintf "0x%02X", $fold_ascii[$i];   # Expected partner from the ASCII-only table, as a hex literal.
 114     my $hex_fold_latin1 = sprintf "0x%02X", $fold_latin1[$i];   # Expected partner from the Latin-1 table, as a hex literal.
 115     push @tests, qq[like chr($hex_fold_ascii), qr/(?d:$chr)/i, 'chr($hex_fold_ascii) =~ qr/(?d:$chr)/i'];   # (?d:...) is checked against the ASCII-only table.
 116     push @tests, qq[like chr($hex_fold_latin1), qr/(?u:$chr)/i, 'chr($hex_fold_latin1) =~ qr/(?u:$chr)/i'];   # (?u:...) is checked against the Latin-1 table.
 117 }
 118
 119 # Individual regression tests.  In the qq[] strings single-backslash escapes are processed right here; the q[] strings keep them verbatim until the final eval.
 120 push @tests, qq[like chr(0x0430), qr/[=\x{0410}-\x{0411}]/i, 'Bug #71752 Unicode /i char in a range'];
 121 push @tests, qq[like 'a', qr/\\p{Upper}/i, "'a' =~ /\\\\p{Upper}/i"];
 122 push @tests, q[my $c = "\x{212A}"; my $p = qr/(?:^[\x{004B}_]+$)/i; utf8::upgrade($p); like $c, $p, 'Bug #78994: my $c = "\x{212A}"; my $p = qr/(?:^[\x{004B}_]+$)/i; utf8::upgrade($p); $c =~ $p'];
 123 # charnames is loaded here, presumably so that the \N{...} names in the two tests below resolve when eval'ed.
 124 use charnames ":full";
 125 push @tests, q[my $re1 = "\N{WHITE SMILING FACE}";like "\xE8", qr/[\w$re1]/, 'my $re = "\N{WHITE SMILING FACE}"; "\xE8" =~ qr/[\w$re]/'];
 126 push @tests, q[my $re2 = "\N{WHITE SMILING FACE}";like "\xE8", qr/\w|$re2/, 'my $re = "\N{WHITE SMILING FACE}"; "\xE8" =~ qr/\w|$re/'];
 127 # Run everything: the plan line, each collected test statement and a final "1" are joined with ";\n" and eval'ed as one program.
 128 eval join ";\n","plan tests=>". (scalar @tests), @tests, "1"
 129     or die $@;   # The trailing "1" makes the eval true on success; on failure the error held in $@ is rethrown.
 130 __DATA__