This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
new perldelta
[perl5.git] / t / re / reg_fold.t
CommitLineData
24df86f6
RGS
1#!perl
2
3BEGIN {
4 chdir 't' if -d 't';
7bc44f18 5 require './test.pl';
624c42e2 6 set_up_inc('../lib');
164766b2 7 skip_all_if_miniperl("no dynamic loading on miniperl, no File::Spec");
24df86f6
RGS
8}
9
a0a388a1
YO
10use strict;
11use warnings;
a0a388a1 12my @tests;
a0a388a1
YO
13
14my $file="../lib/unicore/CaseFolding.txt";
10329c47
KW
15my @folds;
16use Unicode::UCD;
17
18# Use the Unicode data file if we are on an ASCII platform (which its data is
19# for), and it is in the modern format (starting in Unicode 3.1.0) and it is
20# available. This avoids being affected by potential bugs introduced by other
21# layers of Perl
22if (ord('A') == 65
5fede848 23 && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0
10329c47
KW
24 && open my $fh, "<", $file)
25{
26 @folds = <$fh>;
27}
28else {
29 my ($invlist_ref, $invmap_ref, undef, $default)
30 = Unicode::UCD::prop_invmap('Case_Folding');
31 for my $i (0 .. @$invlist_ref - 1 - 1) {
32 next if $invmap_ref->[$i] == $default;
33 my $adjust = -1;
34 for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) {
35 $adjust++;
36
37 # Single-code point maps go to a 'C' type
38 if (! ref $invmap_ref->[$i]) {
39 push @folds, sprintf("%04X; C; %04X\n",
40 $j,
41 $invmap_ref->[$i] + $adjust);
42 }
43 else { # Multi-code point maps go to 'F'. prop_invmap()
44 # guarantees that no adjustment is needed for these,
45 # as the range will contain just one element
46 push @folds, sprintf("%04X; F; %s\n",
47 $j,
48 join " ", map { sprintf "%04X", $_ }
49 @{$invmap_ref->[$i]});
50 }
51 }
52 }
53}
54
55for (@folds) {
a0a388a1
YO
56 chomp;
57 my ($line,$comment)= split/\s+#\s+/, $_;
10329c47 58 $comment = "" unless defined $comment;
1443f10d 59 my ($cp,$type,@folded)=split/[\s;]+/,$line||'';
a0a388a1 60 next unless $type and ($type eq 'F' or $type eq 'C');
1443f10d
KW
61 my $fold_above_latin1 = grep { hex("0x$_") > 255 } @folded;
62 $_="\\x{$_}" for @folded;
a0a388a1 63 my $cpv=hex("0x$cp");
1443f10d 64 my $chr="\\x{$cp}";
a0a388a1 65 my @str;
1443f10d 66 foreach my $swap (0, 1) { # swap lhs and rhs, or not.
8edd0c7f 67 foreach my $charclass (0, 1) { # Put rhs in [...], or not
1443f10d
KW
68 my $lhs;
69 my $rhs;
70 if ($swap) {
71 $lhs = join "", @folded;
72 $rhs = $chr;
73 $rhs = "[$rhs]" if $charclass;
74 } else {
8edd0c7f 75 #next if $charclass && @folded > 1;
1443f10d
KW
76 $lhs = $chr;
77 $rhs = "";
78 foreach my $rhs_char (@folded) {
31ae3604
KW
79
80 # The colon is an unrelated character to the rest of the
81 # class, and makes sure no optimization into an EXACTish
82 # node occurs.
83 $rhs .= '[:' if $charclass;
1443f10d
KW
84 $rhs .= $rhs_char;
85 $rhs .= ']' if $charclass;
86 }
87 }
88 $lhs = "\"$lhs\"";
6cdc48f8 89 $rhs = "/^$rhs\$/iu";
24df86f6 90
1443f10d
KW
91 # Try both Latin1 and Unicode for code points below 256
92 foreach my $upgrade ("", 'utf8::upgrade($c); ') {
59aef945
KW
93 if ($upgrade) { # No need to upgrade if already must be in
94 # utf8
1443f10d
KW
95 next if $swap && $fold_above_latin1;
96 next if !$swap && $cpv > 255;
97 }
98 my $eval = "my \$c = $lhs; $upgrade\$c =~ $rhs";
99 #print __LINE__, ": $eval\n";
100 push @tests, qq[ok(eval '$eval', '$eval - $comment')];
67fcf67c 101 if (! $swap && $charclass && @folded > 1)
2f7760b5 102 {
67fcf67c 103 $tests[-1]="TODO: { local \$::TODO='A multi-char fold \"foo\", doesnt work for /[f][o][o]/i';\n$tests[-1] }"
1443f10d 104 }
1443f10d
KW
105 }
106 }
a0a388a1 107 }
24df86f6 108}
2726813d 109
fad448f4
KW
110# Now verify the case folding tables. First compute the mappings without
111# resorting to the functions we're testing.
112
113# Initialize the array so each $i maps to itself.
114my @fold_ascii;
115for my $i (0 .. 255) {
116 $fold_ascii[$i] = $i;
117}
118my @fold_latin1 = @fold_ascii;
119
120# Override the uppercase elements to fold to their lower case equivalents,
121# using the fact that 'A' in ASCII is 0x41, 'a' is 0x41+32, 'B' is 0x42, and
122# so on. The same paradigm applies for most of the Latin1 range cased
123# characters, but in posix anything outside ASCII maps to itself, as we've
124# already set up.
125for my $i (0x41 .. 0x5A, 0xC0 .. 0xD6, 0xD8 .. 0xDE) {
6deb7a5e
KW
126 my $upper_ord = utf8::unicode_to_native($i);
127 my $lower_ord = utf8::unicode_to_native($i + 32);
fad448f4
KW
128
129 $fold_latin1[$upper_ord] = $lower_ord;
130
131 next if $i > 127;
132 $fold_ascii[$upper_ord] = $lower_ord;
133}
134
135# Same for folding lower to the upper equivalents
136for my $i (0x61 .. 0x7A, 0xE0 .. 0xF6, 0xF8 .. 0xFE) {
6deb7a5e
KW
137 my $lower_ord = utf8::unicode_to_native($i);
138 my $upper_ord = utf8::unicode_to_native($i - 32);
fad448f4
KW
139
140 $fold_latin1[$lower_ord] = $upper_ord;
141
142 next if $i > 127;
143 $fold_ascii[$lower_ord] = $upper_ord;
144}
145
59aef945 146# Test every latin1 character for the correct values in both /u and /d
fad448f4
KW
147for my $i (0 .. 255) {
148 my $chr = sprintf "\\x%02X", $i;
149 my $hex_fold_ascii = sprintf "0x%02X", $fold_ascii[$i];
150 my $hex_fold_latin1 = sprintf "0x%02X", $fold_latin1[$i];
151 push @tests, qq[like chr($hex_fold_ascii), qr/(?d:$chr)/i, 'chr($hex_fold_ascii) =~ qr/(?d:$chr)/i'];
fad448f4 152 push @tests, qq[like chr($hex_fold_latin1), qr/(?u:$chr)/i, 'chr($hex_fold_latin1) =~ qr/(?u:$chr)/i'];
fad448f4
KW
153}
154
155
2726813d 156push @tests, qq[like chr(0x0430), qr/[=\x{0410}-\x{0411}]/i, 'Bug #71752 Unicode /i char in a range'];
2726813d 157push @tests, qq[like 'a', qr/\\p{Upper}/i, "'a' =~ /\\\\p{Upper}/i"];
aaa63dae 158push @tests, q[my $c = "\x{212A}"; my $p = qr/(?:^[K_]+$)/i; utf8::upgrade($p); like $c, qr/$p/, 'Bug #78994: my $c = "\x{212A}"; my $p = qr/(?:^[K_]+$)/i; utf8::upgrade($p); $c =~ $p'];
2726813d 159
7b98bc43 160use charnames ":full";
7bc44f18 161my $e_grave = chr utf8::unicode_to_native(0xE8);
ce0a582e
KW
162push @tests, q[my $re1 = "\N{WHITE SMILING FACE}";like $e_grave, qr/[\w$re1]/, 'my $re = "\N{WHITE SMILING FACE}"; $e_grave =~ qr/[\w$re]/'];
163push @tests, q[my $re2 = "\N{WHITE SMILING FACE}";like $e_grave, qr/\w|$re2/, 'my $re = "\N{WHITE SMILING FACE}"; $e_grave =~ qr/\w|$re/'];
7b98bc43 164
b2a1b324 165eval join ";\n","plan tests=>". (scalar @tests), @tests, "1"
a0a388a1
YO
166 or die $@;
167__DATA__