4 require 'loc_tools.pl'; # Contains find_utf8_ctype_locale()
13 use Unicode::UCD qw(prop_invlist prop_invmap);
15 sub truth($) { # Converts values so is() works
16 return (shift) ? 1 : 0;
21 if($Config{d_setlocale}) {
23 $locale = POSIX::setlocale( &POSIX::LC_ALL, "C");
24 if (defined $locale && $locale eq 'C') {
26 if($Config{d_setlocale}) {
27 require locale; import locale; # make \w work right in non-ASCII lands
31 # Some locale implementations don't have the 128-255 characters all
32 # mean nothing. Skip the locale tests in that situation
33 for my $i (128 .. 255) {
34 if (chr($i) =~ /[[:print:]]/) {
40 $utf8_locale = find_utf8_ctype_locale();
45 # name => Lookup-property name
48 alphanumeric => 'Alnum',
55 idfirst => '_Perl_IDStart',
56 idcont => '_Perl_IDCont',
59 psxspc => 'XPosixSpace',
60 punct => 'XPosixPunct',
61 quotemeta => '_Perl_Quotemeta',
62 space => 'XPerlSpace',
63 vertws => 'VertSpace',
69 local $SIG{__WARN__} = sub { push @warnings, @_ };
72 foreach my $name (sort keys %properties) {
73 my $property = $properties{$name};
74 my @invlist = prop_invlist($property, '_perl_core_internal_ok');
76 fail("No inversion list found for $property");
80 # Include all the Latin1 code points, plus 0x100.
81 my @code_points = (0 .. 256);
83 # Then include the next few boundaries above those from this property
85 foreach my $range_start (@invlist) {
86 next if $range_start < 257;
87 push @code_points, $range_start - 1, $range_start;
89 last if $above_latins > 5;
92 # This makes sure we are using the Perl definition of idfirst and idcont,
93 # and not the Unicode. There are a few differences.
94 push @code_points, ord "\N{ESTIMATED SYMBOL}" if $name =~ /^id(first|cont)/;
95 if ($name eq "idcont") { # And some that are continuation but not start
96 push @code_points, ord("\N{GREEK ANO TELEIA}"),
97 ord("\N{COMBINING GRAVE ACCENT}");
100 # And finally one non-Unicode code point.
101 push @code_points, 0x110000; # Above Unicode, no prop should match
102 no warnings 'non_unicode';
104 for my $j (@code_points) {
105 my $i = utf8::native_to_unicode($j);
106 my $function = uc($name);
108 my $matches = Unicode::UCD::search_invlist(\@invlist, $i);
109 if (! defined $matches) {
113 $matches = truth(! ($matches % 2));
117 my $char_name = charnames::viacode($i) // "No name";
118 my $display_name = sprintf "\\x{%02X, %s}", $i, $char_name;
120 if ($name eq 'quotemeta') { # There is only one macro for this, and is
121 # defined only for Latin1 range
122 $ret = truth eval "test_is${function}($i)";
127 my $truth = truth($matches && $i < 256);
128 is ($ret, $truth, "is${function}( $display_name ) == $truth");
133 # vertws is always all of Unicode; ALNUM_A and ALNUM_L1 are not
134 # defined as they were added later, after WORDCHAR was created to be a
135 # clearer synonym for ALNUM
136 if ($name ne 'vertws') {
137 if ($name ne 'alnum') {
138 $ret = truth eval "test_is${function}_A($i)";
143 my $truth = truth($matches && $i < 128);
144 is ($ret, $truth, "is${function}_A( $display_name ) == $truth");
146 $ret = truth eval "test_is${function}_L1($i)";
151 my $truth = truth($matches && $i < 256);
152 is ($ret, $truth, "is${function}_L1( $display_name ) == $truth");
156 if (defined $locale) {
157 require locale; import locale;
159 POSIX::setlocale( &POSIX::LC_ALL, "C");
160 $ret = truth eval "test_is${function}_LC($i)";
165 my $truth = truth($matches && $i < 128);
166 is ($ret, $truth, "is${function}_LC( $display_name ) == $truth (C locale)");
170 if (defined $utf8_locale) {
173 POSIX::setlocale( &POSIX::LC_ALL, $utf8_locale);
174 $ret = truth eval "test_is${function}_LC($i)";
180 # UTF-8 locale works on full range 0-255
181 my $truth = truth($matches && $i < 256);
182 is ($ret, $truth, "is${function}_LC( $display_name ) == $truth ($utf8_locale)");
187 $ret = truth eval "test_is${function}_uni($i)";
192 is ($ret, $matches, "is${function}_uni( $display_name ) == $matches");
195 if (defined $locale && $name ne 'vertws') {
196 require locale; import locale;
198 POSIX::setlocale( &POSIX::LC_ALL, "C");
199 $ret = truth eval "test_is${function}_LC_uvchr('$i')";
204 my $truth = truth($matches && ($i < 128 || $i > 255));
205 is ($ret, $truth, "is${function}_LC_uvchr( $display_name ) == $truth (C locale)");
209 if (defined $utf8_locale && $name ne 'vertws') {
212 POSIX::setlocale( &POSIX::LC_ALL, $utf8_locale);
213 $ret = truth eval "test_is${function}_LC_uvchr('$i')";
218 my $truth = truth($matches);
219 is ($ret, $truth, "is${function}_LC_uvchr( $display_name ) == $truth ($utf8_locale)");
224 utf8::upgrade($char);
225 $char = quotemeta $char if $char eq '\\' || $char eq "'";
226 $ret = truth eval "test_is${function}_utf8('$char')";
231 is ($ret, $matches, "is${function}_utf8( $display_name ) == $matches");
234 if ($name ne 'vertws' && defined $locale) {
235 require locale; import locale;
237 POSIX::setlocale( &POSIX::LC_ALL, "C");
238 $ret = truth eval "test_is${function}_LC_utf8('$char')";
243 my $truth = truth($matches && ($i < 128 || $i > 255));
244 is ($ret, $truth, "is${function}_LC_utf8( $display_name ) == $truth (C locale)");
248 if ($name ne 'vertws' && defined $utf8_locale) {
251 POSIX::setlocale( &POSIX::LC_ALL, $utf8_locale);
252 $ret = truth eval "test_is${function}_LC_utf8('$char')";
257 my $truth = truth($matches);
258 is ($ret, $truth, "is${function}_LC_utf8( $display_name ) == $truth ($utf8_locale)");
264 my %to_properties = (
265 FOLD => 'Case_Folding',
266 LOWER => 'Lowercase_Mapping',
267 TITLE => 'Titlecase_Mapping',
268 UPPER => 'Uppercase_Mapping',
272 foreach my $name (sort keys %to_properties) {
273 my $property = $to_properties{$name};
274 my ($list_ref, $map_ref, $format, $missing)
275 = prop_invmap($property, );
276 if (! $list_ref || ! $map_ref) {
277 fail("No inversion map found for $property");
280 if ($format ne "al") {
281 fail("Unexpected inversion map format ('$format') found for $property");
285 # Include all the Latin1 code points, plus 0x100.
286 my @code_points = (0 .. 256);
288 # Then include the next few multi-char folds above those from this
289 # property, and include the next few single folds as well
290 my $above_latins = 0;
292 for my $i (0 .. @{$list_ref} - 1) {
293 my $range_start = $list_ref->[$i];
294 next if $range_start < 257;
295 if (ref $map_ref->[$i] && $multi_char < 5) {
296 push @code_points, $range_start - 1 if $code_points[-1] != $range_start - 1;
297 push @code_points, $range_start;
300 elsif ($above_latins < 5) {
301 push @code_points, $range_start - 1 if $code_points[-1] != $range_start - 1;
302 push @code_points, $range_start;
305 last if $above_latins >= 5 && $multi_char >= 5;
308 # And finally one non-Unicode code point.
309 push @code_points, 0x110000; # Above Unicode, no prop should match
310 no warnings 'non_unicode';
312 # $j is native; $i unicode.
313 for my $j (@code_points) {
314 my $i = utf8::native_to_unicode($j);
315 my $function = $name;
317 my $index = Unicode::UCD::search_invlist(\@{$list_ref}, $j);
320 my $char_name = charnames::viacode($j) // "No name";
321 my $display_name = sprintf "\\N{U+%02X, %s}", $i, $char_name;
323 # Test the base function
324 $ret = eval "test_to${function}($j)";
329 my $should_be = ($i < 128 && $map_ref->[$index] != $missing)
330 ? $map_ref->[$index] + $j - $list_ref->[$index]
332 is ($ret, $should_be, sprintf("to${function}( $display_name ) == 0x%02X", $should_be));
336 if ($name eq 'LOWER') {
337 $ret = eval "test_to${function}_L1($j)";
342 my $should_be = ($i < 256 && $map_ref->[$index] != $missing)
343 ? $map_ref->[$index] + $j - $list_ref->[$index]
345 is ($ret, $should_be, sprintf("to${function}_L1( $display_name ) == 0x%02X", $should_be));
349 if ($name ne 'TITLE') { # Test _LC; titlecase is not defined in locales.
350 if (defined $locale) {
351 require locale; import locale;
353 POSIX::setlocale( &POSIX::LC_ALL, "C");
354 $ret = eval "test_to${function}_LC($j)";
359 my $should_be = ($i < 128 && $map_ref->[$index] != $missing)
360 ? $map_ref->[$index] + $j - $list_ref->[$index]
362 is ($ret, $should_be, sprintf("to${function}_LC( $display_name ) == 0x%02X (C locale)", $should_be));
366 if (defined $utf8_locale) {
370 skip "to${property}_LC does not work for LATIN SMALL LETTER SHARP S", 1
371 if $j == 0xDF && ($name eq 'FOLD' || $name eq 'UPPER');
373 POSIX::setlocale( &POSIX::LC_ALL, $utf8_locale);
374 $ret = eval "test_to${function}_LC($j)";
379 my $should_be = ($i < 256
380 && ! ref $map_ref->[$index]
381 && $map_ref->[$index] != $missing
383 ? $map_ref->[$index] + $j - $list_ref->[$index]
385 is ($ret, $should_be, sprintf("to${function}_LC( $display_name ) == 0x%02X ($utf8_locale)", $should_be));
391 # The _uni and _utf8 functions return both the ordinal of the first
392 # code point of the result, and the result in utf8. The .xs tests
393 # return these in an array, in [0] and [1] respectively, with [2] the
394 # length of the utf8 in bytes.
395 my $utf8_should_be = "";
396 my $first_ord_should_be;
397 if (ref $map_ref->[$index]) { # A multi-char result
398 for my $j (0 .. @{$map_ref->[$index]} - 1) {
399 $utf8_should_be .= chr $map_ref->[$index][$j];
402 $first_ord_should_be = $map_ref->[$index][0];
404 else { # A single-char result
405 $first_ord_should_be = ($map_ref->[$index] != $missing)
406 ? $map_ref->[$index] + $j - $list_ref->[$index]
408 $utf8_should_be = chr $first_ord_should_be;
410 utf8::upgrade($utf8_should_be);
415 $ret = eval "test_to${function}_uni($j)";
420 is ($ret->[0], $first_ord_should_be, sprintf("to${function}_uni( $display_name ) == 0x%02X", $first_ord_should_be));
421 is ($ret->[1], $utf8_should_be, sprintf("utf8 of to${function}_uni( $display_name )"));
423 is ($ret->[2], length $utf8_should_be, sprintf("number of bytes in utf8 of to${function}_uni( $display_name )"));
428 utf8::upgrade($char);
429 $char = quotemeta $char if $char eq '\\' || $char eq "'";
430 $ret = eval "test_to${function}_utf8('$char')";
435 is ($ret->[0], $first_ord_should_be, sprintf("to${function}_utf8( $display_name ) == 0x%02X", $first_ord_should_be));
436 is ($ret->[1], $utf8_should_be, sprintf("utf8 of to${function}_utf8( $display_name )"));
438 is ($ret->[2], length $utf8_should_be, sprintf("number of bytes in utf8 of to${function}_uni( $display_name )"));
444 # This is primarily to make sure that no non-Unicode warnings get generated
445 is(scalar @warnings, 0, "No warnings were generated " . join ", ", @warnings);