Commit | Line | Data |
---|---|---|
84281c31 A |
1 | #!./perl |
2 | # | |
3 | # Tests that have to do with checking whether characters have (or not have) | |
4 | # certain Unicode properties; belong (or not belong) to blocks, scripts, etc. | |
5 | # | |
6 | ||
7 | use strict; | |
8 | use warnings; | |
9 | use 5.010; | |
10 | ||
f3b02925 NC |
11 | BEGIN { |
12 | require './test.pl'; | |
13 | skip_all_if_miniperl("no dynamic loading on miniperl, no File::Spec (used by charnames)"); | |
14 | } | |
84281c31 A |
15 | |
16 | sub run_tests; | |
17 | ||
18 | # | |
19 | # This is the data to test. | |
20 | # | |
21 | # This is a hash; keys are the property to test. | |
22 | # Values are arrays containing characters to test. The characters can | |
23 | # have the following formats: | |
24 | # '\N{CHARACTER NAME}' - Use character with that name | |
25 | # '\x{1234}' - Use character with that hex escape | |
26 | # '0x1234' - Use chr() to get that character | |
27 | # "a" - Character to use | |
28 | # | |
29 | # If a character entry starts with ! the character does not belong to the class | |
30 | # | |
31 | # If the class is just single letter, we use both \pL and \p{L} | |
32 | # | |
33 | ||
34 | use charnames ':full'; | |
35 | ||
36 | my @CLASSES = ( | |
37 | L => ["a", "A"], | |
38 | Ll => ["b", "!B"], | |
39 | Lu => ["!c", "C"], | |
40 | IsLl => ["d", "!D"], | |
41 | IsLu => ["!e", "E"], | |
42 | LC => ["f", "!1"], | |
43 | 'L&' => ["g", "!2"], | |
44 | 'Lowercase Letter' => ["h", "!H"], | |
45 | ||
46 | Common => ["!i", "3"], | |
47 | Inherited => ["!j", '\x{300}'], | |
48 | ||
49 | InBasicLatin => ['\N{LATIN CAPITAL LETTER A}'], | |
50 | InLatin1Supplement => ['\N{LATIN CAPITAL LETTER A WITH GRAVE}'], | |
51 | InLatinExtendedA => ['\N{LATIN CAPITAL LETTER A WITH MACRON}'], | |
52 | InLatinExtendedB => ['\N{LATIN SMALL LETTER B WITH STROKE}'], | |
53 | InKatakana => ['\N{KATAKANA LETTER SMALL A}'], | |
54 | IsLatin => ["0x100", "0x212b"], | |
55 | IsHebrew => ["0x5d0", "0xfb4f"], | |
56 | IsGreek => ["0x37a", "0x386", "!0x387", "0x388", | |
57 | "0x38a", "!0x38b", "0x38c"], | |
58 | HangulSyllables => ['\x{AC00}'], | |
59 | 'Script=Latin' => ['\x{0100}'], | |
60 | 'Block=LatinExtendedA' => ['\x{0100}'], | |
61 | 'Category=UppercaseLetter' => ['\x{0100}'], | |
62 | ||
63 | # | |
64 | # It's ok to repeat class names. | |
65 | # | |
66 | InLatin1Supplement => | |
f3b02925 NC |
67 | $::IS_EBCDIC ? ['!\x{7f}', '\x{80}', '!\x{100}'] |
68 | : ['!\x{7f}', '\x{80}', '\x{ff}', '!\x{100}'], | |
84281c31 A |
69 | InLatinExtendedA => |
70 | ['!\x{7f}', '!\x{80}', '!\x{ff}', '\x{100}'], | |
71 | ||
72 | # | |
73 | # Properties are case-insensitive, and may have whitespace, | |
74 | # dashes and underscores. | |
75 | # | |
76 | 'in-latin1_SUPPLEMENT' => ['\x{80}', | |
77 | '\N{LATIN SMALL LETTER Y WITH DIAERESIS}'], | |
78 | ' ^ In Latin 1 Supplement ' | |
79 | => ['!\x{80}', '\N{COFFIN}'], | |
80 | 'latin-1 supplement' => ['\x{80}', "0xDF"], | |
81 | ||
82 | ); | |
83 | ||
84 | my @USER_DEFINED_PROPERTIES = ( | |
85 | # | |
86 | # User defined properties | |
87 | # | |
88 | InKana1 => ['\x{3040}', '!\x{303F}'], | |
89 | InKana2 => ['\x{3040}', '!\x{303F}'], | |
90 | InKana3 => ['\x{3041}', '!\x{3040}'], | |
91 | InNotKana => ['\x{3040}', '!\x{3041}'], | |
92 | InConsonant => ['d', '!e'], | |
93 | IsSyriac1 => ['\x{0712}', '!\x{072F}'], | |
9c8ea558 | 94 | '# User-defined character properties may lack \n at the end', |
84281c31 A |
95 | InGreekSmall => ['\N{GREEK SMALL LETTER PI}', |
96 | '\N{GREEK SMALL LETTER FINAL SIGMA}'], | |
97 | InGreekCapital => ['\N{GREEK CAPITAL LETTER PI}', '!\x{03A2}'], | |
98 | Dash => ['-'], | |
99 | ASCII_Hex_Digit => ['!-', 'A'], | |
d658a8a8 | 100 | IsAsciiHexAndDash => ['-', 'A'], |
84281c31 A |
101 | ); |
102 | ||
19c4061a KW |
103 | my @USER_CASELESS_PROPERTIES = ( |
104 | # | |
105 | # User defined properties which differ depending on /i. Second entry is | |
106 | # false regularly, true under /i | |
107 | # | |
108 | 'IsMyUpper' => ["M", "!m" ], | |
109 | ); | |
110 | ||
84281c31 A |
111 | |
112 | # | |
113 | # From the short properties we populate POSIX-like classes. | |
114 | # | |
115 | my %SHORT_PROPERTIES = ( | |
116 | 'Ll' => ['m', '\N{CYRILLIC SMALL LETTER A}'], | |
117 | 'Lu' => ['M', '\N{GREEK CAPITAL LETTER ALPHA}'], | |
118 | 'Lo' => ['\N{HIRAGANA LETTER SMALL A}'], | |
99870f4d KW |
119 | # is also in other alphabetic |
120 | 'Mn' => ['\N{HEBREW POINT RAFE}'], | |
84281c31 A |
121 | 'Nd' => ["0", '\N{ARABIC-INDIC DIGIT ZERO}'], |
122 | 'Pc' => ["_"], | |
123 | 'Po' => ["!"], | |
124 | 'Zs' => [" "], | |
125 | 'Cc' => ['\x{00}'], | |
126 | ); | |
127 | ||
128 | # | |
129 | # Illegal properties | |
130 | # | |
d658a8a8 DM |
131 | my @ILLEGAL_PROPERTIES = |
132 | qw[q qrst f foo isfoo infoo ISfoo INfoo Is::foo In::foo]; | |
84281c31 A |
133 | |
134 | my %d; | |
135 | ||
136 | while (my ($class, $chars) = each %SHORT_PROPERTIES) { | |
137 | push @{$d {IsAlpha}} => map {$class =~ /^[LM]/ ? $_ : "!$_"} @$chars; | |
138 | push @{$d {IsAlnum}} => map {$class =~ /^[LMN]./ ? $_ : "!$_"} @$chars; | |
139 | push @{$d {IsASCII}} => map {length ($_) == 1 || $_ eq '\x{00}' | |
140 | ? $_ : "!$_"} @$chars; | |
141 | push @{$d {IsCntrl}} => map {$class =~ /^C/ ? $_ : "!$_"} @$chars; | |
142 | push @{$d {IsBlank}} => map {$class =~ /^Z[lps]/ ? $_ : "!$_"} @$chars; | |
143 | push @{$d {IsDigit}} => map {$class =~ /^Nd$/ ? $_ : "!$_"} @$chars; | |
144 | push @{$d {IsGraph}} => map {$class =~ /^([LMNPS]|Co)/ | |
145 | ? $_ : "!$_"} @$chars; | |
146 | push @{$d {IsPrint}} => map {$class =~ /^([LMNPS]|Co|Zs)/ | |
147 | ? $_ : "!$_"} @$chars; | |
148 | push @{$d {IsLower}} => map {$class =~ /^Ll$/ ? $_ : "!$_"} @$chars; | |
149 | push @{$d {IsUpper}} => map {$class =~ /^L[ut]/ ? $_ : "!$_"} @$chars; | |
150 | push @{$d {IsPunct}} => map {$class =~ /^P/ ? $_ : "!$_"} @$chars; | |
151 | push @{$d {IsWord}} => map {$class =~ /^[LMN]/ || $_ eq "_" | |
152 | ? $_ : "!$_"} @$chars; | |
153 | push @{$d {IsSpace}} => map {$class =~ /^Z/ || | |
154 | length ($_) == 1 && ord ($_) >= 0x09 | |
155 | && ord ($_) <= 0x0D | |
156 | ? $_ : "!$_"} @$chars; | |
157 | } | |
158 | ||
f3b02925 | 159 | delete $d {IsASCII} if $::IS_EBCDIC; |
84281c31 A |
160 | |
161 | push @CLASSES => "# Short properties" => %SHORT_PROPERTIES, | |
162 | "# POSIX like properties" => %d, | |
163 | "# User defined properties" => @USER_DEFINED_PROPERTIES; | |
164 | ||
165 | ||
166 | # | |
167 | # Calculate the number of tests. | |
168 | # | |
169 | my $count = 0; | |
170 | for (my $i = 0; $i < @CLASSES; $i += 2) { | |
171 | $i ++, redo if $CLASSES [$i] =~ /^\h*#\h*(.*)/; | |
72d70e56 | 172 | $count += 2 * (length $CLASSES [$i] == 1 ? 4 : 2) * @{$CLASSES [$i + 1]}; |
84281c31 | 173 | } |
72d70e56 NC |
174 | $count += 4 * @ILLEGAL_PROPERTIES; |
175 | $count += 4 * grep {length $_ == 1} @ILLEGAL_PROPERTIES; | |
176 | $count += 8 * @USER_CASELESS_PROPERTIES; | |
84281c31 | 177 | |
72d70e56 | 178 | plan(tests => $count); |
84281c31 A |
179 | |
180 | run_tests unless caller (); | |
181 | ||
182 | sub match { | |
19c4061a KW |
183 | my ($char, $match, $nomatch, $caseless) = @_; |
184 | $caseless = "" unless defined $caseless; | |
185 | $caseless = 'i' if $caseless; | |
84281c31 A |
186 | |
187 | my ($str, $name); | |
188 | ||
189 | given ($char) { | |
190 | when (/^\\/) { | |
191 | $str = eval qq ["$char"]; | |
192 | $name = qq ["$char"]; | |
193 | } | |
194 | when (/^0x([0-9A-Fa-f]+)$/) { | |
195 | $str = chr hex $1; | |
196 | $name = "chr ($char)"; | |
197 | } | |
198 | default { | |
199 | $str = $char; | |
200 | $name = qq ["$char"]; | |
201 | } | |
202 | } | |
203 | ||
19c4061a KW |
204 | undef $@; |
205 | my $match_pat = eval "qr/$match/$caseless"; | |
72d70e56 NC |
206 | is($@, '', "$name compiled correctly to a regexp"); |
207 | like($str, $match_pat, "$name correctly matched"); | |
19c4061a KW |
208 | |
209 | undef $@; | |
210 | my $nomatch_pat = eval "qr/$nomatch/$caseless"; | |
72d70e56 NC |
211 | is($@, '', "$name compiled correctly to a regexp"); |
212 | unlike($str, $nomatch_pat, "$name correctly did not match"); | |
84281c31 A |
213 | } |
214 | ||
215 | sub run_tests { | |
216 | ||
217 | while (@CLASSES) { | |
218 | my $class = shift @CLASSES; | |
219 | if ($class =~ /^\h*#\h*(.*)/) { | |
220 | print "# $1\n"; | |
221 | next; | |
222 | } | |
223 | last unless @CLASSES; | |
224 | my $chars = shift @CLASSES; | |
225 | my @in = grep {!/^!./} @$chars; | |
226 | my @out = map {s/^!(?=.)//; $_} grep { /^!./} @$chars; | |
227 | my $in_pat = eval qq ['\\p{$class}']; | |
228 | my $out_pat = eval qq ['\\P{$class}']; | |
229 | ||
230 | match $_, $in_pat, $out_pat for @in; | |
231 | match $_, $out_pat, $in_pat for @out; | |
232 | ||
9c8ea558 | 233 | if (1 == length $class) { # Repeat without braces if name length 1 |
84281c31 A |
234 | my $in_pat = eval qq ['\\p$class']; |
235 | my $out_pat = eval qq ['\\P$class']; | |
236 | ||
237 | match $_, $in_pat, $out_pat for @in; | |
238 | match $_, $out_pat, $in_pat for @out; | |
239 | } | |
240 | } | |
241 | ||
242 | ||
243 | my $pat = qr /^Can't find Unicode property definition/; | |
244 | print "# Illegal properties\n"; | |
245 | foreach my $p (@ILLEGAL_PROPERTIES) { | |
246 | undef $@; | |
247 | my $r = eval "'a' =~ /\\p{$p}/; 1"; | |
72d70e56 NC |
248 | is($r, undef, "Unknown Unicode property \\p{$p}"); |
249 | like($@, $pat, "Unknown Unicode property \\p{$p}"); | |
84281c31 A |
250 | undef $@; |
251 | my $s = eval "'a' =~ /\\P{$p}/; 1"; | |
72d70e56 NC |
252 | is($s, undef, "Unknown Unicode property \\p{$p}"); |
253 | like($@, $pat, "Unknown Unicode property \\p{$p}"); | |
84281c31 A |
254 | if (length $p == 1) { |
255 | undef $@; | |
256 | my $r = eval "'a' =~ /\\p$p/; 1"; | |
72d70e56 NC |
257 | is($r, undef, "Unknown Unicode property \\p$p"); |
258 | like($@, $pat, "Unknown Unicode property \\p$p"); | |
84281c31 A |
259 | undef $@; |
260 | my $s = eval "'a' =~ /\\P$p/; 1"; | |
72d70e56 NC |
261 | is($r, undef, "Unknown Unicode property \\P$p"); |
262 | like($@, $pat, "Unknown Unicode property \\P$p"); | |
84281c31 A |
263 | } |
264 | } | |
19c4061a KW |
265 | |
266 | print "# User-defined properties with /i differences\n"; | |
267 | foreach my $class (shift @USER_CASELESS_PROPERTIES) { | |
268 | my $chars_ref = shift @USER_CASELESS_PROPERTIES; | |
269 | my @in = grep {!/^!./} @$chars_ref; | |
270 | my @out = map {s/^!(?=.)//; $_} grep { /^!./} @$chars_ref; | |
271 | my $in_pat = eval qq ['\\p{$class}']; | |
272 | my $out_pat = eval qq ['\\P{$class}']; | |
273 | ||
274 | # Verify works as regularly for not /i | |
275 | match $_, $in_pat, $out_pat for @in; | |
276 | match $_, $out_pat, $in_pat for @out; | |
277 | ||
278 | # Verify that adding /i doesn't change the in set. | |
279 | match $_, $in_pat, $out_pat, 'i' for @in; | |
280 | ||
281 | # Verify that adding /i does change the out set to match. | |
282 | match $_, $in_pat, $out_pat, 'i' for @out; | |
283 | } | |
84281c31 A |
284 | } |
285 | ||
286 | ||
287 | # | |
288 | # User defined properties | |
289 | # | |
290 | ||
291 | sub InKana1 {<<'--'} | |
292 | 3040 309F | |
293 | 30A0 30FF | |
294 | -- | |
295 | ||
296 | sub InKana2 {<<'--'} | |
297 | +utf8::InHiragana | |
298 | +utf8::InKatakana | |
299 | -- | |
300 | ||
301 | sub InKana3 {<<'--'} | |
302 | +utf8::InHiragana | |
303 | +utf8::InKatakana | |
304 | -utf8::IsCn | |
305 | -- | |
306 | ||
307 | sub InNotKana {<<'--'} | |
308 | !utf8::InHiragana | |
309 | -utf8::InKatakana | |
310 | +utf8::IsCn | |
311 | -- | |
312 | ||
313 | sub InConsonant {<<'--'} # Not EBCDIC-aware. | |
314 | 0061 007f | |
315 | -0061 | |
316 | -0065 | |
317 | -0069 | |
318 | -006f | |
319 | -0075 | |
320 | -- | |
321 | ||
322 | sub IsSyriac1 {<<'--'} | |
323 | 0712 072C | |
324 | 0730 074A | |
325 | -- | |
326 | ||
84281c31 A |
327 | sub InGreekSmall {return "03B1\t03C9"} |
328 | sub InGreekCapital {return "0391\t03A9\n-03A2"} | |
329 | ||
d658a8a8 | 330 | sub IsAsciiHexAndDash {<<'--'} |
84281c31 A |
331 | +utf8::ASCII_Hex_Digit |
332 | +utf8::Dash | |
333 | -- | |
334 | ||
19c4061a KW |
335 | sub IsMyUpper { |
336 | my $caseless = shift; | |
337 | if ($caseless) { | |
338 | return "0041\t005A\n0061\t007A" | |
339 | } | |
340 | else { | |
341 | return "0041\t005A" | |
342 | } | |
343 | } | |
344 | ||
d658a8a8 DM |
345 | # fake user-defined properties; these subs shouldn't be called, because |
346 | # their names don't start with In or Is | |
347 | ||
348 | sub f { die } | |
349 | sub foo { die } | |
350 | sub isfoo { die } | |
351 | sub infoo { die } | |
352 | sub ISfoo { die } | |
353 | sub INfoo { die } | |
354 | sub Is::foo { die } | |
355 | sub In::foo { die } | |
84281c31 | 356 | __END__ |