Commit | Line | Data |
---|---|---|
bdd8600f KW |
1 | #!perl -w |
2 | ||
31f05a37 | 3 | BEGIN { |
9b0711ee | 4 | require 'loc_tools.pl'; # Contains find_utf8_ctype_locale() |
31f05a37 KW |
5 | } |
6 | ||
bdd8600f KW |
7 | use strict; |
8 | use Test::More; | |
569f7fc5 | 9 | use Config; |
bdd8600f KW |
10 | |
11 | use XS::APItest; | |
12 | ||
2e1414ce | 13 | use Unicode::UCD qw(prop_invlist prop_invmap); |
c9c05358 | 14 | |
5073ffbd | 15 | sub truth($) { # Converts values so is() works |
e2efe419 | 16 | return (shift) ? 1 : 0; |
c9c05358 KW |
17 | } |
18 | ||
569f7fc5 | 19 | my $locale; |
31f05a37 | 20 | my $utf8_locale; |
569f7fc5 JR |
21 | if($Config{d_setlocale}) { |
22 | require POSIX; | |
23 | $locale = POSIX::setlocale( &POSIX::LC_ALL, "C"); | |
24 | if (defined $locale && $locale eq 'C') { | |
25 | BEGIN { | |
26 | if($Config{d_setlocale}) { | |
27 | require locale; import locale; # make \w work right in non-ASCII lands | |
28 | } | |
29 | } | |
30 | ||
31 | # Some locale implementations don't have the 128-255 characters all | |
32 | # mean nothing. Skip the locale tests in that situation | |
33 | for my $i (128 .. 255) { | |
34 | if (chr($i) =~ /[[:print:]]/) { | |
35 | undef $locale; | |
36 | last; | |
37 | } | |
981746b9 | 38 | } |
31f05a37 | 39 | |
9b0711ee | 40 | $utf8_locale = find_utf8_ctype_locale(); |
981746b9 KW |
41 | } |
42 | } | |
43 | ||
e2efe419 KW |
44 | my %properties = ( |
45 | # name => Lookup-property name | |
46 | alnum => 'Word', | |
981746b9 | 47 | wordchar => 'Word', |
15861f94 | 48 | alphanumeric => 'Alnum', |
e2efe419 KW |
49 | alpha => 'Alpha', |
50 | ascii => 'ASCII', | |
51 | blank => 'Blank', | |
52 | cntrl => 'Control', | |
53 | digit => 'Digit', | |
54 | graph => 'Graph', | |
55 | idfirst => '_Perl_IDStart', | |
eba68aa0 | 56 | idcont => '_Perl_IDCont', |
e2efe419 KW |
57 | lower => 'Lower', |
58 | print => 'Print', | |
59 | psxspc => 'XPosixSpace', | |
60 | punct => 'XPosixPunct', | |
61 | quotemeta => '_Perl_Quotemeta', | |
62 | space => 'XPerlSpace', | |
840f8e92 | 63 | vertws => 'VertSpace', |
e2efe419 KW |
64 | upper => 'Upper', |
65 | xdigit => 'XDigit', | |
66 | ); | |
67 | ||
5073ffbd KW |
68 | my @warnings; |
69 | local $SIG{__WARN__} = sub { push @warnings, @_ }; | |
70 | ||
c9c05358 | 71 | use charnames (); |
e2efe419 KW |
72 | foreach my $name (sort keys %properties) { |
73 | my $property = $properties{$name}; | |
74 | my @invlist = prop_invlist($property, '_perl_core_internal_ok'); | |
75 | if (! @invlist) { | |
76 | fail("No inversion list found for $property"); | |
77 | next; | |
78 | } | |
79 | ||
80 | # Include all the Latin1 code points, plus 0x100. | |
81 | my @code_points = (0 .. 256); | |
82 | ||
83 | # Then include the next few boundaries above those from this property | |
84 | my $above_latins = 0; | |
85 | foreach my $range_start (@invlist) { | |
86 | next if $range_start < 257; | |
87 | push @code_points, $range_start - 1, $range_start; | |
88 | $above_latins++; | |
89 | last if $above_latins > 5; | |
90 | } | |
91 | ||
eba68aa0 KW |
92 | # This makes sure we are using the Perl definition of idfirst and idcont, |
93 | # and not the Unicode. There are a few differences. | |
94 | push @code_points, ord "\N{ESTIMATED SYMBOL}" if $name =~ /^id(first|cont)/; | |
2f4622f0 KW |
95 | if ($name eq "idcont") { # And some that are continuation but not start |
96 | push @code_points, ord("\N{GREEK ANO TELEIA}"), | |
97 | ord("\N{COMBINING GRAVE ACCENT}"); | |
98 | } | |
f91dcd13 | 99 | |
e2efe419 KW |
100 | # And finally one non-Unicode code point. |
101 | push @code_points, 0x110000; # Above Unicode, no prop should match | |
4f6ef7cc | 102 | no warnings 'non_unicode'; |
e2efe419 | 103 | |
2e1414ce KW |
104 | for my $j (@code_points) { |
105 | my $i = utf8::native_to_unicode($j); | |
c9c05358 | 106 | my $function = uc($name); |
c9c05358 | 107 | |
1fdd5e53 | 108 | my $matches = Unicode::UCD::search_invlist(\@invlist, $i); |
e2efe419 KW |
109 | if (! defined $matches) { |
110 | $matches = 0; | |
111 | } | |
112 | else { | |
113 | $matches = truth(! ($matches % 2)); | |
114 | } | |
5073ffbd | 115 | |
e2efe419 KW |
116 | my $ret; |
117 | my $char_name = charnames::viacode($i) // "No name"; | |
94a62bdc | 118 | my $display_name = sprintf "\\x{%02X, %s}", $i, $char_name; |
e2efe419 KW |
119 | |
120 | if ($name eq 'quotemeta') { # There is only one macro for this, and is | |
121 | # defined only for Latin1 range | |
122 | $ret = truth eval "test_is${function}($i)"; | |
123 | if ($@) { | |
124 | fail $@; | |
125 | } | |
126 | else { | |
127 | my $truth = truth($matches && $i < 256); | |
128 | is ($ret, $truth, "is${function}( $display_name ) == $truth"); | |
129 | } | |
c9c05358 KW |
130 | next; |
131 | } | |
981746b9 KW |
132 | |
133 | # vertws is always all of Unicode; ALNUM_A and ALNUM_L1 are not | |
134 | # defined as they were added later, after WORDCHAR was created to be a | |
135 | # clearer synonym for ALNUM | |
840f8e92 | 136 | if ($name ne 'vertws') { |
981746b9 | 137 | if ($name ne 'alnum') { |
195d0f3b KW |
138 | $ret = truth eval "test_is${function}_A($i)"; |
139 | if ($@) { | |
140 | fail($@); | |
141 | } | |
142 | else { | |
143 | my $truth = truth($matches && $i < 128); | |
144 | is ($ret, $truth, "is${function}_A( $display_name ) == $truth"); | |
145 | } | |
146 | $ret = truth eval "test_is${function}_L1($i)"; | |
147 | if ($@) { | |
148 | fail($@); | |
149 | } | |
150 | else { | |
151 | my $truth = truth($matches && $i < 256); | |
152 | is ($ret, $truth, "is${function}_L1( $display_name ) == $truth"); | |
153 | } | |
981746b9 KW |
154 | } |
155 | ||
8ff203a1 | 156 | if (defined $locale) { |
569f7fc5 | 157 | require locale; import locale; |
981746b9 | 158 | |
31f05a37 | 159 | POSIX::setlocale( &POSIX::LC_ALL, "C"); |
31a09021 KW |
160 | $ret = truth eval "test_is${function}_LC($i)"; |
161 | if ($@) { | |
162 | fail($@); | |
163 | } | |
164 | else { | |
165 | my $truth = truth($matches && $i < 128); | |
31f05a37 KW |
166 | is ($ret, $truth, "is${function}_LC( $display_name ) == $truth (C locale)"); |
167 | } | |
168 | } | |
169 | ||
170 | if (defined $utf8_locale) { | |
171 | use locale; | |
172 | ||
173 | POSIX::setlocale( &POSIX::LC_ALL, $utf8_locale); | |
174 | $ret = truth eval "test_is${function}_LC($i)"; | |
175 | if ($@) { | |
176 | fail($@); | |
177 | } | |
178 | else { | |
179 | ||
180 | # UTF-8 locale works on full range 0-255 | |
181 | my $truth = truth($matches && $i < 256); | |
182 | is ($ret, $truth, "is${function}_LC( $display_name ) == $truth ($utf8_locale)"); | |
31a09021 | 183 | } |
8ff203a1 | 184 | } |
840f8e92 | 185 | } |
c9c05358 | 186 | |
e2efe419 KW |
187 | $ret = truth eval "test_is${function}_uni($i)"; |
188 | if ($@) { | |
189 | fail($@); | |
190 | } | |
191 | else { | |
192 | is ($ret, $matches, "is${function}_uni( $display_name ) == $matches"); | |
193 | } | |
c9c05358 | 194 | |
981746b9 | 195 | if (defined $locale && $name ne 'vertws') { |
569f7fc5 | 196 | require locale; import locale; |
981746b9 | 197 | |
31f05a37 | 198 | POSIX::setlocale( &POSIX::LC_ALL, "C"); |
981746b9 KW |
199 | $ret = truth eval "test_is${function}_LC_uvchr('$i')"; |
200 | if ($@) { | |
201 | fail($@); | |
202 | } | |
203 | else { | |
204 | my $truth = truth($matches && ($i < 128 || $i > 255)); | |
31f05a37 KW |
205 | is ($ret, $truth, "is${function}_LC_uvchr( $display_name ) == $truth (C locale)"); |
206 | } | |
207 | } | |
208 | ||
209 | if (defined $utf8_locale && $name ne 'vertws') { | |
210 | use locale; | |
211 | ||
212 | POSIX::setlocale( &POSIX::LC_ALL, $utf8_locale); | |
213 | $ret = truth eval "test_is${function}_LC_uvchr('$i')"; | |
214 | if ($@) { | |
215 | fail($@); | |
216 | } | |
217 | else { | |
218 | my $truth = truth($matches); | |
219 | is ($ret, $truth, "is${function}_LC_uvchr( $display_name ) == $truth ($utf8_locale)"); | |
981746b9 KW |
220 | } |
221 | } | |
222 | ||
c9c05358 KW |
223 | my $char = chr($i); |
224 | utf8::upgrade($char); | |
225 | $char = quotemeta $char if $char eq '\\' || $char eq "'"; | |
e2efe419 KW |
226 | $ret = truth eval "test_is${function}_utf8('$char')"; |
227 | if ($@) { | |
228 | fail($@); | |
229 | } | |
230 | else { | |
231 | is ($ret, $matches, "is${function}_utf8( $display_name ) == $matches"); | |
232 | } | |
981746b9 | 233 | |
8ff203a1 | 234 | if ($name ne 'vertws' && defined $locale) { |
569f7fc5 | 235 | require locale; import locale; |
981746b9 | 236 | |
31f05a37 | 237 | POSIX::setlocale( &POSIX::LC_ALL, "C"); |
31a09021 KW |
238 | $ret = truth eval "test_is${function}_LC_utf8('$char')"; |
239 | if ($@) { | |
240 | fail($@); | |
241 | } | |
242 | else { | |
243 | my $truth = truth($matches && ($i < 128 || $i > 255)); | |
31f05a37 KW |
244 | is ($ret, $truth, "is${function}_LC_utf8( $display_name ) == $truth (C locale)"); |
245 | } | |
246 | } | |
247 | ||
248 | if ($name ne 'vertws' && defined $utf8_locale) { | |
249 | use locale; | |
250 | ||
251 | POSIX::setlocale( &POSIX::LC_ALL, $utf8_locale); | |
252 | $ret = truth eval "test_is${function}_LC_utf8('$char')"; | |
253 | if ($@) { | |
254 | fail($@); | |
255 | } | |
256 | else { | |
257 | my $truth = truth($matches); | |
258 | is ($ret, $truth, "is${function}_LC_utf8( $display_name ) == $truth ($utf8_locale)"); | |
31a09021 | 259 | } |
8ff203a1 | 260 | } |
c9c05358 KW |
261 | } |
262 | } | |
263 | ||
2e1414ce KW |
264 | my %to_properties = ( |
265 | FOLD => 'Case_Folding', | |
266 | LOWER => 'Lowercase_Mapping', | |
267 | TITLE => 'Titlecase_Mapping', | |
268 | UPPER => 'Uppercase_Mapping', | |
269 | ); | |
270 | ||
271 | ||
272 | foreach my $name (sort keys %to_properties) { | |
273 | my $property = $to_properties{$name}; | |
274 | my ($list_ref, $map_ref, $format, $missing) | |
275 | = prop_invmap($property, ); | |
276 | if (! $list_ref || ! $map_ref) { | |
277 | fail("No inversion map found for $property"); | |
278 | next; | |
279 | } | |
280 | if ($format ne "al") { | |
281 | fail("Unexpected inversion map format ('$format') found for $property"); | |
282 | next; | |
283 | } | |
284 | ||
285 | # Include all the Latin1 code points, plus 0x100. | |
286 | my @code_points = (0 .. 256); | |
287 | ||
288 | # Then include the next few multi-char folds above those from this | |
289 | # property, and include the next few single folds as well | |
290 | my $above_latins = 0; | |
291 | my $multi_char = 0; | |
292 | for my $i (0 .. @{$list_ref} - 1) { | |
293 | my $range_start = $list_ref->[$i]; | |
294 | next if $range_start < 257; | |
295 | if (ref $map_ref->[$i] && $multi_char < 5) { | |
296 | push @code_points, $range_start - 1 if $code_points[-1] != $range_start - 1; | |
297 | push @code_points, $range_start; | |
298 | $multi_char++; | |
299 | } | |
300 | elsif ($above_latins < 5) { | |
301 | push @code_points, $range_start - 1 if $code_points[-1] != $range_start - 1; | |
302 | push @code_points, $range_start; | |
303 | $above_latins++; | |
304 | } | |
305 | last if $above_latins >= 5 && $multi_char >= 5; | |
306 | } | |
307 | ||
308 | # And finally one non-Unicode code point. | |
309 | push @code_points, 0x110000; # Above Unicode, no prop should match | |
310 | no warnings 'non_unicode'; | |
311 | ||
312 | # $j is native; $i unicode. | |
313 | for my $j (@code_points) { | |
314 | my $i = utf8::native_to_unicode($j); | |
315 | my $function = $name; | |
316 | ||
1fdd5e53 | 317 | my $index = Unicode::UCD::search_invlist(\@{$list_ref}, $j); |
2e1414ce KW |
318 | |
319 | my $ret; | |
320 | my $char_name = charnames::viacode($j) // "No name"; | |
321 | my $display_name = sprintf "\\N{U+%02X, %s}", $i, $char_name; | |
322 | ||
323 | # Test the base function | |
324 | $ret = eval "test_to${function}($j)"; | |
325 | if ($@) { | |
326 | fail($@); | |
327 | } | |
328 | else { | |
329 | my $should_be = ($i < 128 && $map_ref->[$index] != $missing) | |
330 | ? $map_ref->[$index] + $j - $list_ref->[$index] | |
331 | : $j; | |
332 | is ($ret, $should_be, sprintf("to${function}( $display_name ) == 0x%02X", $should_be)); | |
333 | } | |
334 | ||
335 | # Test _L1 | |
336 | if ($name eq 'LOWER') { | |
337 | $ret = eval "test_to${function}_L1($j)"; | |
338 | if ($@) { | |
339 | fail($@); | |
340 | } | |
341 | else { | |
342 | my $should_be = ($i < 256 && $map_ref->[$index] != $missing) | |
343 | ? $map_ref->[$index] + $j - $list_ref->[$index] | |
344 | : $j; | |
345 | is ($ret, $should_be, sprintf("to${function}_L1( $display_name ) == 0x%02X", $should_be)); | |
346 | } | |
347 | } | |
348 | ||
31f05a37 KW |
349 | if ($name ne 'TITLE') { # Test _LC; titlecase is not defined in locales. |
350 | if (defined $locale) { | |
baa60164 | 351 | require locale; import locale; |
2e1414ce | 352 | |
baa60164 KW |
353 | POSIX::setlocale( &POSIX::LC_ALL, "C"); |
354 | $ret = eval "test_to${function}_LC($j)"; | |
355 | if ($@) { | |
356 | fail($@); | |
357 | } | |
358 | else { | |
359 | my $should_be = ($i < 128 && $map_ref->[$index] != $missing) | |
2e1414ce KW |
360 | ? $map_ref->[$index] + $j - $list_ref->[$index] |
361 | : $j; | |
baa60164 KW |
362 | is ($ret, $should_be, sprintf("to${function}_LC( $display_name ) == 0x%02X (C locale)", $should_be)); |
363 | } | |
31f05a37 KW |
364 | } |
365 | ||
366 | if (defined $utf8_locale) { | |
367 | use locale; | |
368 | ||
369 | SKIP: { | |
370 | skip "to${property}_LC does not work for LATIN SMALL LETTER SHARP S", 1 | |
371 | if $j == 0xDF && ($name eq 'FOLD' || $name eq 'UPPER'); | |
372 | ||
373 | POSIX::setlocale( &POSIX::LC_ALL, $utf8_locale); | |
374 | $ret = eval "test_to${function}_LC($j)"; | |
375 | if ($@) { | |
376 | fail($@); | |
377 | } | |
378 | else { | |
379 | my $should_be = ($i < 256 | |
380 | && ! ref $map_ref->[$index] | |
381 | && $map_ref->[$index] != $missing | |
382 | ) | |
383 | ? $map_ref->[$index] + $j - $list_ref->[$index] | |
384 | : $j; | |
385 | is ($ret, $should_be, sprintf("to${function}_LC( $display_name ) == 0x%02X ($utf8_locale)", $should_be)); | |
386 | } | |
387 | } | |
2e1414ce KW |
388 | } |
389 | } | |
390 | ||
391 | # The _uni and _utf8 functions return both the ordinal of the first | |
392 | # code point of the result, and the result in utf8. The .xs tests | |
393 | # return these in an array, in [0] and [1] respectively, with [2] the | |
394 | # length of the utf8 in bytes. | |
395 | my $utf8_should_be = ""; | |
396 | my $first_ord_should_be; | |
397 | if (ref $map_ref->[$index]) { # A multi-char result | |
398 | for my $j (0 .. @{$map_ref->[$index]} - 1) { | |
399 | $utf8_should_be .= chr $map_ref->[$index][$j]; | |
400 | } | |
401 | ||
402 | $first_ord_should_be = $map_ref->[$index][0]; | |
403 | } | |
404 | else { # A single-char result | |
405 | $first_ord_should_be = ($map_ref->[$index] != $missing) | |
406 | ? $map_ref->[$index] + $j - $list_ref->[$index] | |
407 | : $j; | |
408 | $utf8_should_be = chr $first_ord_should_be; | |
409 | } | |
410 | utf8::upgrade($utf8_should_be); | |
411 | ||
412 | # Test _uni | |
413 | my $s; | |
414 | my $len; | |
415 | $ret = eval "test_to${function}_uni($j)"; | |
416 | if ($@) { | |
417 | fail($@); | |
418 | } | |
419 | else { | |
420 | is ($ret->[0], $first_ord_should_be, sprintf("to${function}_uni( $display_name ) == 0x%02X", $first_ord_should_be)); | |
421 | is ($ret->[1], $utf8_should_be, sprintf("utf8 of to${function}_uni( $display_name )")); | |
422 | use bytes; | |
423 | is ($ret->[2], length $utf8_should_be, sprintf("number of bytes in utf8 of to${function}_uni( $display_name )")); | |
424 | } | |
425 | ||
426 | # Test _utf8 | |
427 | my $char = chr($j); | |
428 | utf8::upgrade($char); | |
429 | $char = quotemeta $char if $char eq '\\' || $char eq "'"; | |
430 | $ret = eval "test_to${function}_utf8('$char')"; | |
431 | if ($@) { | |
432 | fail($@); | |
433 | } | |
434 | else { | |
435 | is ($ret->[0], $first_ord_should_be, sprintf("to${function}_utf8( $display_name ) == 0x%02X", $first_ord_should_be)); | |
436 | is ($ret->[1], $utf8_should_be, sprintf("utf8 of to${function}_utf8( $display_name )")); | |
437 | use bytes; | |
438 | is ($ret->[2], length $utf8_should_be, sprintf("number of bytes in utf8 of to${function}_uni( $display_name )")); | |
439 | } | |
440 | ||
441 | } | |
442 | } | |
443 | ||
5073ffbd KW |
444 | # This is primarily to make sure that no non-Unicode warnings get generated |
445 | is(scalar @warnings, 0, "No warnings were generated " . join ", ", @warnings); | |
446 | ||
bdd8600f | 447 | done_testing; |