Commit | Line | Data |
---|---|---|
84281c31 A |
1 | #!./perl |
2 | # | |
3 | # Tests that have to do with checking whether characters have (or not have) | |
4 | # certain Unicode properties; belong (or not belong) to blocks, scripts, etc. | |
5 | # | |
6 | ||
7 | use strict; | |
8 | use warnings; | |
9 | use 5.010; | |
10 | ||
11 | my $IS_EBCDIC = ord ('A') == 193; | |
12 | ||
13 | sub run_tests; | |
14 | ||
15 | # | |
16 | # This is the data to test. | |
17 | # | |
18 | # This is a hash; keys are the property to test. | |
19 | # Values are arrays containing characters to test. The characters can | |
20 | # have the following formats: | |
21 | # '\N{CHARACTER NAME}' - Use character with that name | |
22 | # '\x{1234}' - Use character with that hex escape | |
23 | # '0x1234' - Use chr() to get that character | |
24 | # "a" - Character to use | |
25 | # | |
26 | # If a character entry starts with ! the character does not belong to the class | |
27 | # | |
28 | # If the class is just single letter, we use both \pL and \p{L} | |
29 | # | |
30 | ||
31 | use charnames ':full'; | |
32 | ||
33 | my @CLASSES = ( | |
34 | L => ["a", "A"], | |
35 | Ll => ["b", "!B"], | |
36 | Lu => ["!c", "C"], | |
37 | IsLl => ["d", "!D"], | |
38 | IsLu => ["!e", "E"], | |
39 | LC => ["f", "!1"], | |
40 | 'L&' => ["g", "!2"], | |
41 | 'Lowercase Letter' => ["h", "!H"], | |
42 | ||
43 | Common => ["!i", "3"], | |
44 | Inherited => ["!j", '\x{300}'], | |
45 | ||
46 | InBasicLatin => ['\N{LATIN CAPITAL LETTER A}'], | |
47 | InLatin1Supplement => ['\N{LATIN CAPITAL LETTER A WITH GRAVE}'], | |
48 | InLatinExtendedA => ['\N{LATIN CAPITAL LETTER A WITH MACRON}'], | |
49 | InLatinExtendedB => ['\N{LATIN SMALL LETTER B WITH STROKE}'], | |
50 | InKatakana => ['\N{KATAKANA LETTER SMALL A}'], | |
51 | IsLatin => ["0x100", "0x212b"], | |
52 | IsHebrew => ["0x5d0", "0xfb4f"], | |
53 | IsGreek => ["0x37a", "0x386", "!0x387", "0x388", | |
54 | "0x38a", "!0x38b", "0x38c"], | |
55 | HangulSyllables => ['\x{AC00}'], | |
56 | 'Script=Latin' => ['\x{0100}'], | |
57 | 'Block=LatinExtendedA' => ['\x{0100}'], | |
58 | 'Category=UppercaseLetter' => ['\x{0100}'], | |
59 | ||
60 | # | |
61 | # It's ok to repeat class names. | |
62 | # | |
63 | InLatin1Supplement => | |
64 | $IS_EBCDIC ? ['!\x{7f}', '\x{80}', '!\x{100}'] | |
65 | : ['!\x{7f}', '\x{80}', '\x{ff}', '!\x{100}'], | |
66 | InLatinExtendedA => | |
67 | ['!\x{7f}', '!\x{80}', '!\x{ff}', '\x{100}'], | |
68 | ||
69 | # | |
70 | # Properties are case-insensitive, and may have whitespace, | |
71 | # dashes and underscores. | |
72 | # | |
73 | 'in-latin1_SUPPLEMENT' => ['\x{80}', | |
74 | '\N{LATIN SMALL LETTER Y WITH DIAERESIS}'], | |
75 | ' ^ In Latin 1 Supplement ' | |
76 | => ['!\x{80}', '\N{COFFIN}'], | |
77 | 'latin-1 supplement' => ['\x{80}', "0xDF"], | |
78 | ||
79 | ); | |
80 | ||
81 | my @USER_DEFINED_PROPERTIES = ( | |
82 | # | |
83 | # User defined properties | |
84 | # | |
85 | InKana1 => ['\x{3040}', '!\x{303F}'], | |
86 | InKana2 => ['\x{3040}', '!\x{303F}'], | |
87 | InKana3 => ['\x{3041}', '!\x{3040}'], | |
88 | InNotKana => ['\x{3040}', '!\x{3041}'], | |
89 | InConsonant => ['d', '!e'], | |
90 | IsSyriac1 => ['\x{0712}', '!\x{072F}'], | |
91 | Syriac1 => ['\x{0712}', '!\x{072F}'], | |
92 | '# User-defined character properties my lack \n at the end', | |
93 | InGreekSmall => ['\N{GREEK SMALL LETTER PI}', | |
94 | '\N{GREEK SMALL LETTER FINAL SIGMA}'], | |
95 | InGreekCapital => ['\N{GREEK CAPITAL LETTER PI}', '!\x{03A2}'], | |
96 | Dash => ['-'], | |
97 | ASCII_Hex_Digit => ['!-', 'A'], | |
98 | AsciiHexAndDash => ['-', 'A'], | |
99 | ); | |
100 | ||
101 | ||
102 | # | |
103 | # From the short properties we populate POSIX-like classes. | |
104 | # | |
105 | my %SHORT_PROPERTIES = ( | |
106 | 'Ll' => ['m', '\N{CYRILLIC SMALL LETTER A}'], | |
107 | 'Lu' => ['M', '\N{GREEK CAPITAL LETTER ALPHA}'], | |
108 | 'Lo' => ['\N{HIRAGANA LETTER SMALL A}'], | |
109 | 'Mn' => ['\N{COMBINING GRAVE ACCENT}'], | |
110 | 'Nd' => ["0", '\N{ARABIC-INDIC DIGIT ZERO}'], | |
111 | 'Pc' => ["_"], | |
112 | 'Po' => ["!"], | |
113 | 'Zs' => [" "], | |
114 | 'Cc' => ['\x{00}'], | |
115 | ); | |
116 | ||
117 | # | |
118 | # Illegal properties | |
119 | # | |
120 | my @ILLEGAL_PROPERTIES = qw [q qrst]; | |
121 | ||
122 | my %d; | |
123 | ||
124 | while (my ($class, $chars) = each %SHORT_PROPERTIES) { | |
125 | push @{$d {IsAlpha}} => map {$class =~ /^[LM]/ ? $_ : "!$_"} @$chars; | |
126 | push @{$d {IsAlnum}} => map {$class =~ /^[LMN]./ ? $_ : "!$_"} @$chars; | |
127 | push @{$d {IsASCII}} => map {length ($_) == 1 || $_ eq '\x{00}' | |
128 | ? $_ : "!$_"} @$chars; | |
129 | push @{$d {IsCntrl}} => map {$class =~ /^C/ ? $_ : "!$_"} @$chars; | |
130 | push @{$d {IsBlank}} => map {$class =~ /^Z[lps]/ ? $_ : "!$_"} @$chars; | |
131 | push @{$d {IsDigit}} => map {$class =~ /^Nd$/ ? $_ : "!$_"} @$chars; | |
132 | push @{$d {IsGraph}} => map {$class =~ /^([LMNPS]|Co)/ | |
133 | ? $_ : "!$_"} @$chars; | |
134 | push @{$d {IsPrint}} => map {$class =~ /^([LMNPS]|Co|Zs)/ | |
135 | ? $_ : "!$_"} @$chars; | |
136 | push @{$d {IsLower}} => map {$class =~ /^Ll$/ ? $_ : "!$_"} @$chars; | |
137 | push @{$d {IsUpper}} => map {$class =~ /^L[ut]/ ? $_ : "!$_"} @$chars; | |
138 | push @{$d {IsPunct}} => map {$class =~ /^P/ ? $_ : "!$_"} @$chars; | |
139 | push @{$d {IsWord}} => map {$class =~ /^[LMN]/ || $_ eq "_" | |
140 | ? $_ : "!$_"} @$chars; | |
141 | push @{$d {IsSpace}} => map {$class =~ /^Z/ || | |
142 | length ($_) == 1 && ord ($_) >= 0x09 | |
143 | && ord ($_) <= 0x0D | |
144 | ? $_ : "!$_"} @$chars; | |
145 | } | |
146 | ||
147 | delete $d {IsASCII} if $IS_EBCDIC; | |
148 | ||
149 | push @CLASSES => "# Short properties" => %SHORT_PROPERTIES, | |
150 | "# POSIX like properties" => %d, | |
151 | "# User defined properties" => @USER_DEFINED_PROPERTIES; | |
152 | ||
153 | ||
154 | # | |
155 | # Calculate the number of tests. | |
156 | # | |
157 | my $count = 0; | |
158 | for (my $i = 0; $i < @CLASSES; $i += 2) { | |
159 | $i ++, redo if $CLASSES [$i] =~ /^\h*#\h*(.*)/; | |
160 | $count += (length $CLASSES [$i] == 1 ? 4 : 2) * @{$CLASSES [$i + 1]}; | |
161 | } | |
162 | $count += 2 * @ILLEGAL_PROPERTIES; | |
163 | $count += 2 * grep {length $_ == 1} @ILLEGAL_PROPERTIES; | |
164 | ||
165 | my $tests = 0; | |
166 | ||
167 | say "1..$count"; | |
168 | ||
169 | run_tests unless caller (); | |
170 | ||
171 | sub match { | |
172 | my ($char, $match, $nomatch) = @_; | |
173 | ||
174 | my ($str, $name); | |
175 | ||
176 | given ($char) { | |
177 | when (/^\\/) { | |
178 | $str = eval qq ["$char"]; | |
179 | $name = qq ["$char"]; | |
180 | } | |
181 | when (/^0x([0-9A-Fa-f]+)$/) { | |
182 | $str = chr hex $1; | |
183 | $name = "chr ($char)"; | |
184 | } | |
185 | default { | |
186 | $str = $char; | |
187 | $name = qq ["$char"]; | |
188 | } | |
189 | } | |
190 | ||
191 | print "not " unless $str =~ /$match/; | |
192 | print "ok ", ++ $tests, " - $name =~ /$match/\n"; | |
193 | print "not " unless $str !~ /$nomatch/; | |
194 | print "ok ", ++ $tests, " - $name !~ /$nomatch/\n"; | |
195 | } | |
196 | ||
197 | sub run_tests { | |
198 | ||
199 | while (@CLASSES) { | |
200 | my $class = shift @CLASSES; | |
201 | if ($class =~ /^\h*#\h*(.*)/) { | |
202 | print "# $1\n"; | |
203 | next; | |
204 | } | |
205 | last unless @CLASSES; | |
206 | my $chars = shift @CLASSES; | |
207 | my @in = grep {!/^!./} @$chars; | |
208 | my @out = map {s/^!(?=.)//; $_} grep { /^!./} @$chars; | |
209 | my $in_pat = eval qq ['\\p{$class}']; | |
210 | my $out_pat = eval qq ['\\P{$class}']; | |
211 | ||
212 | match $_, $in_pat, $out_pat for @in; | |
213 | match $_, $out_pat, $in_pat for @out; | |
214 | ||
215 | if (1 == length $class) { | |
216 | my $in_pat = eval qq ['\\p$class']; | |
217 | my $out_pat = eval qq ['\\P$class']; | |
218 | ||
219 | match $_, $in_pat, $out_pat for @in; | |
220 | match $_, $out_pat, $in_pat for @out; | |
221 | } | |
222 | } | |
223 | ||
224 | ||
225 | my $pat = qr /^Can't find Unicode property definition/; | |
226 | print "# Illegal properties\n"; | |
227 | foreach my $p (@ILLEGAL_PROPERTIES) { | |
228 | undef $@; | |
229 | my $r = eval "'a' =~ /\\p{$p}/; 1"; | |
230 | print "not " unless !$r && $@ && $@ =~ $pat; | |
231 | print "ok ", ++ $tests, " - Unknown Unicode property \\p{$p}\n"; | |
232 | undef $@; | |
233 | my $s = eval "'a' =~ /\\P{$p}/; 1"; | |
234 | print "not " unless !$s && $@ && $@ =~ $pat; | |
235 | print "ok ", ++ $tests, " - Unknown Unicode property \\P{$p}\n"; | |
236 | if (length $p == 1) { | |
237 | undef $@; | |
238 | my $r = eval "'a' =~ /\\p$p/; 1"; | |
239 | print "not " unless !$r && $@ && $@ =~ $pat; | |
240 | print "ok ", ++ $tests, " - Unknown Unicode property \\p$p\n"; | |
241 | undef $@; | |
242 | my $s = eval "'a' =~ /\\P$p/; 1"; | |
243 | print "not " unless !$s && $@ && $@ =~ $pat; | |
244 | print "ok ", ++ $tests, " - Unknown Unicode property \\P$p\n"; | |
245 | } | |
246 | } | |
247 | } | |
248 | ||
249 | ||
250 | # | |
251 | # User defined properties | |
252 | # | |
253 | ||
254 | sub InKana1 {<<'--'} | |
255 | 3040 309F | |
256 | 30A0 30FF | |
257 | -- | |
258 | ||
259 | sub InKana2 {<<'--'} | |
260 | +utf8::InHiragana | |
261 | +utf8::InKatakana | |
262 | -- | |
263 | ||
264 | sub InKana3 {<<'--'} | |
265 | +utf8::InHiragana | |
266 | +utf8::InKatakana | |
267 | -utf8::IsCn | |
268 | -- | |
269 | ||
270 | sub InNotKana {<<'--'} | |
271 | !utf8::InHiragana | |
272 | -utf8::InKatakana | |
273 | +utf8::IsCn | |
274 | -- | |
275 | ||
276 | sub InConsonant {<<'--'} # Not EBCDIC-aware. | |
277 | 0061 007f | |
278 | -0061 | |
279 | -0065 | |
280 | -0069 | |
281 | -006f | |
282 | -0075 | |
283 | -- | |
284 | ||
285 | sub IsSyriac1 {<<'--'} | |
286 | 0712 072C | |
287 | 0730 074A | |
288 | -- | |
289 | ||
290 | sub Syriac1 {<<'--'} | |
291 | 0712 072C | |
292 | 0730 074A | |
293 | -- | |
294 | ||
295 | sub InGreekSmall {return "03B1\t03C9"} | |
296 | sub InGreekCapital {return "0391\t03A9\n-03A2"} | |
297 | ||
298 | sub AsciiHexAndDash {<<'--'} | |
299 | +utf8::ASCII_Hex_Digit | |
300 | +utf8::Dash | |
301 | -- | |
302 | ||
303 | __END__ |