Commit | Line | Data |
---|---|---|
24df86f6 RGS |
1 | #!perl |
2 | ||
3 | BEGIN { | |
4 | chdir 't' if -d 't'; | |
7bc44f18 | 5 | require './test.pl'; |
624c42e2 | 6 | set_up_inc('../lib'); |
164766b2 | 7 | skip_all_if_miniperl("no dynamic loading on miniperl, no File::Spec"); |
24df86f6 RGS |
8 | } |
9 | ||
a0a388a1 YO |
10 | use strict; |
11 | use warnings; | |
a0a388a1 | 12 | my @tests; |
a0a388a1 YO |
13 | |
14 | my $file="../lib/unicore/CaseFolding.txt"; | |
10329c47 KW |
15 | my @folds; |
16 | use Unicode::UCD; | |
17 | ||
18 | # Use the Unicode data file if we are on an ASCII platform (which its data is | |
19 | # for), and it is in the modern format (starting in Unicode 3.1.0) and it is | |
20 | # available. This avoids being affected by potential bugs introduced by other | |
21 | # layers of Perl | |
22 | if (ord('A') == 65 | |
5fede848 | 23 | && pack("C*", split /\./, Unicode::UCD::UnicodeVersion()) ge v3.1.0 |
10329c47 KW |
24 | && open my $fh, "<", $file) |
25 | { | |
26 | @folds = <$fh>; | |
27 | } | |
28 | else { | |
29 | my ($invlist_ref, $invmap_ref, undef, $default) | |
30 | = Unicode::UCD::prop_invmap('Case_Folding'); | |
31 | for my $i (0 .. @$invlist_ref - 1 - 1) { | |
32 | next if $invmap_ref->[$i] == $default; | |
33 | my $adjust = -1; | |
34 | for my $j ($invlist_ref->[$i] .. $invlist_ref->[$i+1] -1) { | |
35 | $adjust++; | |
36 | ||
37 | # Single-code point maps go to a 'C' type | |
38 | if (! ref $invmap_ref->[$i]) { | |
39 | push @folds, sprintf("%04X; C; %04X\n", | |
40 | $j, | |
41 | $invmap_ref->[$i] + $adjust); | |
42 | } | |
43 | else { # Multi-code point maps go to 'F'. prop_invmap() | |
44 | # guarantees that no adjustment is needed for these, | |
45 | # as the range will contain just one element | |
46 | push @folds, sprintf("%04X; F; %s\n", | |
47 | $j, | |
48 | join " ", map { sprintf "%04X", $_ } | |
49 | @{$invmap_ref->[$i]}); | |
50 | } | |
51 | } | |
52 | } | |
53 | } | |
54 | ||
55 | for (@folds) { | |
a0a388a1 YO |
56 | chomp; |
57 | my ($line,$comment)= split/\s+#\s+/, $_; | |
10329c47 | 58 | $comment = "" unless defined $comment; |
1443f10d | 59 | my ($cp,$type,@folded)=split/[\s;]+/,$line||''; |
a0a388a1 | 60 | next unless $type and ($type eq 'F' or $type eq 'C'); |
1443f10d KW |
61 | my $fold_above_latin1 = grep { hex("0x$_") > 255 } @folded; |
62 | $_="\\x{$_}" for @folded; | |
a0a388a1 | 63 | my $cpv=hex("0x$cp"); |
1443f10d | 64 | my $chr="\\x{$cp}"; |
a0a388a1 | 65 | my @str; |
1443f10d | 66 | foreach my $swap (0, 1) { # swap lhs and rhs, or not. |
8edd0c7f | 67 | foreach my $charclass (0, 1) { # Put rhs in [...], or not |
1443f10d KW |
68 | my $lhs; |
69 | my $rhs; | |
70 | if ($swap) { | |
71 | $lhs = join "", @folded; | |
72 | $rhs = $chr; | |
73 | $rhs = "[$rhs]" if $charclass; | |
74 | } else { | |
8edd0c7f | 75 | #next if $charclass && @folded > 1; |
1443f10d KW |
76 | $lhs = $chr; |
77 | $rhs = ""; | |
78 | foreach my $rhs_char (@folded) { | |
31ae3604 KW |
79 | |
80 | # The colon is an unrelated character to the rest of the | |
81 | # class, and makes sure no optimization into an EXACTish | |
82 | # node occurs. | |
83 | $rhs .= '[:' if $charclass; | |
1443f10d KW |
84 | $rhs .= $rhs_char; |
85 | $rhs .= ']' if $charclass; | |
86 | } | |
87 | } | |
88 | $lhs = "\"$lhs\""; | |
6cdc48f8 | 89 | $rhs = "/^$rhs\$/iu"; |
24df86f6 | 90 | |
1443f10d KW |
91 | # Try both Latin1 and Unicode for code points below 256 |
92 | foreach my $upgrade ("", 'utf8::upgrade($c); ') { | |
59aef945 KW |
93 | if ($upgrade) { # No need to upgrade if already must be in |
94 | # utf8 | |
1443f10d KW |
95 | next if $swap && $fold_above_latin1; |
96 | next if !$swap && $cpv > 255; | |
97 | } | |
98 | my $eval = "my \$c = $lhs; $upgrade\$c =~ $rhs"; | |
99 | #print __LINE__, ": $eval\n"; | |
100 | push @tests, qq[ok(eval '$eval', '$eval - $comment')]; | |
67fcf67c | 101 | if (! $swap && $charclass && @folded > 1) |
2f7760b5 | 102 | { |
67fcf67c | 103 | $tests[-1]="TODO: { local \$::TODO='A multi-char fold \"foo\", doesnt work for /[f][o][o]/i';\n$tests[-1] }" |
1443f10d | 104 | } |
1443f10d KW |
105 | } |
106 | } | |
a0a388a1 | 107 | } |
24df86f6 | 108 | } |
2726813d | 109 | |
fad448f4 KW |
110 | # Now verify the case folding tables. First compute the mappings without |
111 | # resorting to the functions we're testing. | |
112 | ||
113 | # Initialize the array so each $i maps to itself. | |
114 | my @fold_ascii; | |
115 | for my $i (0 .. 255) { | |
116 | $fold_ascii[$i] = $i; | |
117 | } | |
118 | my @fold_latin1 = @fold_ascii; | |
119 | ||
120 | # Override the uppercase elements to fold to their lower case equivalents, | |
121 | # using the fact that 'A' in ASCII is 0x41, 'a' is 0x41+32, 'B' is 0x42, and | |
122 | # so on. The same paradigm applies for most of the Latin1 range cased | |
123 | # characters, but in posix anything outside ASCII maps to itself, as we've | |
124 | # already set up. | |
125 | for my $i (0x41 .. 0x5A, 0xC0 .. 0xD6, 0xD8 .. 0xDE) { | |
6deb7a5e KW |
126 | my $upper_ord = utf8::unicode_to_native($i); |
127 | my $lower_ord = utf8::unicode_to_native($i + 32); | |
fad448f4 KW |
128 | |
129 | $fold_latin1[$upper_ord] = $lower_ord; | |
130 | ||
131 | next if $i > 127; | |
132 | $fold_ascii[$upper_ord] = $lower_ord; | |
133 | } | |
134 | ||
135 | # Same for folding lower to the upper equivalents | |
136 | for my $i (0x61 .. 0x7A, 0xE0 .. 0xF6, 0xF8 .. 0xFE) { | |
6deb7a5e KW |
137 | my $lower_ord = utf8::unicode_to_native($i); |
138 | my $upper_ord = utf8::unicode_to_native($i - 32); | |
fad448f4 KW |
139 | |
140 | $fold_latin1[$lower_ord] = $upper_ord; | |
141 | ||
142 | next if $i > 127; | |
143 | $fold_ascii[$lower_ord] = $upper_ord; | |
144 | } | |
145 | ||
59aef945 | 146 | # Test every latin1 character for the correct values in both /u and /d |
fad448f4 KW |
147 | for my $i (0 .. 255) { |
148 | my $chr = sprintf "\\x%02X", $i; | |
149 | my $hex_fold_ascii = sprintf "0x%02X", $fold_ascii[$i]; | |
150 | my $hex_fold_latin1 = sprintf "0x%02X", $fold_latin1[$i]; | |
151 | push @tests, qq[like chr($hex_fold_ascii), qr/(?d:$chr)/i, 'chr($hex_fold_ascii) =~ qr/(?d:$chr)/i']; | |
fad448f4 | 152 | push @tests, qq[like chr($hex_fold_latin1), qr/(?u:$chr)/i, 'chr($hex_fold_latin1) =~ qr/(?u:$chr)/i']; |
fad448f4 KW |
153 | } |
154 | ||
155 | ||
2726813d | 156 | push @tests, qq[like chr(0x0430), qr/[=\x{0410}-\x{0411}]/i, 'Bug #71752 Unicode /i char in a range']; |
2726813d | 157 | push @tests, qq[like 'a', qr/\\p{Upper}/i, "'a' =~ /\\\\p{Upper}/i"]; |
aaa63dae | 158 | push @tests, q[my $c = "\x{212A}"; my $p = qr/(?:^[K_]+$)/i; utf8::upgrade($p); like $c, qr/$p/, 'Bug #78994: my $c = "\x{212A}"; my $p = qr/(?:^[K_]+$)/i; utf8::upgrade($p); $c =~ $p']; |
2726813d | 159 | |
7b98bc43 | 160 | use charnames ":full"; |
7bc44f18 | 161 | my $e_grave = chr utf8::unicode_to_native(0xE8); |
ce0a582e KW |
162 | push @tests, q[my $re1 = "\N{WHITE SMILING FACE}";like $e_grave, qr/[\w$re1]/, 'my $re = "\N{WHITE SMILING FACE}"; $e_grave =~ qr/[\w$re]/']; |
163 | push @tests, q[my $re2 = "\N{WHITE SMILING FACE}";like $e_grave, qr/\w|$re2/, 'my $re = "\N{WHITE SMILING FACE}"; $e_grave =~ qr/\w|$re/']; | |
7b98bc43 | 164 | |
b2a1b324 | 165 | eval join ";\n","plan tests=>". (scalar @tests), @tests, "1" |
a0a388a1 YO |
166 | or die $@; |
167 | __DATA__ |