9 use Unicode::UCD qw(prop_invlist prop_invmap);
11 sub truth($) { # Converts values so is() works
12 return (shift) ? 1 : 0;
16 if($Config{d_setlocale}) {
18 $locale = POSIX::setlocale( &POSIX::LC_ALL, "C");
19 if (defined $locale && $locale eq 'C') {
21 if($Config{d_setlocale}) {
22 require locale; import locale; # make \w work right in non-ASCII lands
26 # Some locale implementations don't have the 128-255 characters all
27 # mean nothing. Skip the locale tests in that situation
28 for my $i (128 .. 255) {
29 if (chr($i) =~ /[[:print:]]/) {
38 # name => Lookup-property name
41 alphanumeric => 'Alnum',
48 idfirst => '_Perl_IDStart',
49 idcont => '_Perl_IDCont',
52 psxspc => 'XPosixSpace',
53 punct => 'XPosixPunct',
54 quotemeta => '_Perl_Quotemeta',
55 space => 'XPerlSpace',
56 vertws => 'VertSpace',
62 local $SIG{__WARN__} = sub { push @warnings, @_ };
65 foreach my $name (sort keys %properties) {
66 my $property = $properties{$name};
67 my @invlist = prop_invlist($property, '_perl_core_internal_ok');
69 fail("No inversion list found for $property");
73 # Include all the Latin1 code points, plus 0x100.
74 my @code_points = (0 .. 256);
76 # Then include the next few boundaries above those from this property
78 foreach my $range_start (@invlist) {
79 next if $range_start < 257;
80 push @code_points, $range_start - 1, $range_start;
82 last if $above_latins > 5;
85 # This makes sure we are using the Perl definition of idfirst and idcont,
86 # and not the Unicode. There are a few differences.
87 push @code_points, ord "\N{ESTIMATED SYMBOL}" if $name =~ /^id(first|cont)/;
88 if ($name eq "idcont") { # And some that are continuation but not start
89 push @code_points, ord("\N{GREEK ANO TELEIA}"),
90 ord("\N{COMBINING GRAVE ACCENT}");
93 # And finally one non-Unicode code point.
94 push @code_points, 0x110000; # Above Unicode, no prop should match
95 no warnings 'non_unicode';
97 for my $j (@code_points) {
98 my $i = utf8::native_to_unicode($j);
99 my $function = uc($name);
101 my $matches = Unicode::UCD::_search_invlist(\@invlist, $i);
102 if (! defined $matches) {
106 $matches = truth(! ($matches % 2));
110 my $char_name = charnames::viacode($i) // "No name";
111 my $display_name = sprintf "\\N{U+%02X, %s}", $i, $char_name;
113 if ($name eq 'quotemeta') { # There is only one macro for this, and is
114 # defined only for Latin1 range
115 $ret = truth eval "test_is${function}($i)";
120 my $truth = truth($matches && $i < 256);
121 is ($ret, $truth, "is${function}( $display_name ) == $truth");
126 # vertws is always all of Unicode; ALNUM_A and ALNUM_L1 are not
127 # defined as they were added later, after WORDCHAR was created to be a
128 # clearer synonym for ALNUM
129 if ($name ne 'vertws') {
130 if ($name ne 'alnum') {
131 $ret = truth eval "test_is${function}_A($i)";
136 my $truth = truth($matches && $i < 128);
137 is ($ret, $truth, "is${function}_A( $display_name ) == $truth");
139 $ret = truth eval "test_is${function}_L1($i)";
144 my $truth = truth($matches && $i < 256);
145 is ($ret, $truth, "is${function}_L1( $display_name ) == $truth");
149 if (defined $locale) {
150 require locale; import locale;
152 $ret = truth eval "test_is${function}_LC($i)";
157 my $truth = truth($matches && $i < 128);
158 is ($ret, $truth, "is${function}_LC( $display_name ) == $truth");
163 $ret = truth eval "test_is${function}_uni($i)";
168 is ($ret, $matches, "is${function}_uni( $display_name ) == $matches");
171 if (defined $locale && $name ne 'vertws') {
172 require locale; import locale;
174 $ret = truth eval "test_is${function}_LC_uvchr('$i')";
179 my $truth = truth($matches && ($i < 128 || $i > 255));
180 is ($ret, $truth, "is${function}_LC_uvchr( $display_name ) == $truth");
185 utf8::upgrade($char);
186 $char = quotemeta $char if $char eq '\\' || $char eq "'";
187 $ret = truth eval "test_is${function}_utf8('$char')";
192 is ($ret, $matches, "is${function}_utf8( $display_name ) == $matches");
195 if ($name ne 'vertws' && defined $locale) {
196 require locale; import locale;
198 $ret = truth eval "test_is${function}_LC_utf8('$char')";
203 my $truth = truth($matches && ($i < 128 || $i > 255));
204 is ($ret, $truth, "is${function}_LC_utf8( $display_name ) == $truth");
210 my %to_properties = (
211 FOLD => 'Case_Folding',
212 LOWER => 'Lowercase_Mapping',
213 TITLE => 'Titlecase_Mapping',
214 UPPER => 'Uppercase_Mapping',
218 foreach my $name (sort keys %to_properties) {
219 my $property = $to_properties{$name};
220 my ($list_ref, $map_ref, $format, $missing)
221 = prop_invmap($property, );
222 if (! $list_ref || ! $map_ref) {
223 fail("No inversion map found for $property");
226 if ($format ne "al") {
227 fail("Unexpected inversion map format ('$format') found for $property");
231 # Include all the Latin1 code points, plus 0x100.
232 my @code_points = (0 .. 256);
234 # Then include the next few multi-char folds above those from this
235 # property, and include the next few single folds as well
236 my $above_latins = 0;
238 for my $i (0 .. @{$list_ref} - 1) {
239 my $range_start = $list_ref->[$i];
240 next if $range_start < 257;
241 if (ref $map_ref->[$i] && $multi_char < 5) {
242 push @code_points, $range_start - 1 if $code_points[-1] != $range_start - 1;
243 push @code_points, $range_start;
246 elsif ($above_latins < 5) {
247 push @code_points, $range_start - 1 if $code_points[-1] != $range_start - 1;
248 push @code_points, $range_start;
251 last if $above_latins >= 5 && $multi_char >= 5;
254 # And finally one non-Unicode code point.
255 push @code_points, 0x110000; # Above Unicode, no prop should match
256 no warnings 'non_unicode';
258 # $j is native; $i unicode.
259 for my $j (@code_points) {
260 my $i = utf8::native_to_unicode($j);
261 my $function = $name;
263 my $index = Unicode::UCD::_search_invlist(\@{$list_ref}, $j);
266 my $char_name = charnames::viacode($j) // "No name";
267 my $display_name = sprintf "\\N{U+%02X, %s}", $i, $char_name;
269 # Test the base function
270 $ret = eval "test_to${function}($j)";
275 my $should_be = ($i < 128 && $map_ref->[$index] != $missing)
276 ? $map_ref->[$index] + $j - $list_ref->[$index]
278 is ($ret, $should_be, sprintf("to${function}( $display_name ) == 0x%02X", $should_be));
282 if ($name eq 'LOWER') {
283 $ret = eval "test_to${function}_L1($j)";
288 my $should_be = ($i < 256 && $map_ref->[$index] != $missing)
289 ? $map_ref->[$index] + $j - $list_ref->[$index]
291 is ($ret, $should_be, sprintf("to${function}_L1( $display_name ) == 0x%02X", $should_be));
295 if ($name ne 'TITLE' && defined $locale) {
296 require locale; import locale;
298 # Test _LC; titlecase is not defined in locales.
299 $ret = eval "test_to${function}_LC($j)";
304 my $should_be = ($i < 128 && $map_ref->[$index] != $missing)
305 ? $map_ref->[$index] + $j - $list_ref->[$index]
307 is ($ret, $should_be, sprintf("to${function}_LC( $display_name ) == 0x%02X", $should_be));
311 # The _uni and _utf8 functions return both the ordinal of the first
312 # code point of the result, and the result in utf8. The .xs tests
313 # return these in an array, in [0] and [1] respectively, with [2] the
314 # length of the utf8 in bytes.
315 my $utf8_should_be = "";
316 my $first_ord_should_be;
317 if (ref $map_ref->[$index]) { # A multi-char result
318 for my $j (0 .. @{$map_ref->[$index]} - 1) {
319 $utf8_should_be .= chr $map_ref->[$index][$j];
322 $first_ord_should_be = $map_ref->[$index][0];
324 else { # A single-char result
325 $first_ord_should_be = ($map_ref->[$index] != $missing)
326 ? $map_ref->[$index] + $j - $list_ref->[$index]
328 $utf8_should_be = chr $first_ord_should_be;
330 utf8::upgrade($utf8_should_be);
335 $ret = eval "test_to${function}_uni($j)";
340 is ($ret->[0], $first_ord_should_be, sprintf("to${function}_uni( $display_name ) == 0x%02X", $first_ord_should_be));
341 is ($ret->[1], $utf8_should_be, sprintf("utf8 of to${function}_uni( $display_name )"));
343 is ($ret->[2], length $utf8_should_be, sprintf("number of bytes in utf8 of to${function}_uni( $display_name )"));
348 utf8::upgrade($char);
349 $char = quotemeta $char if $char eq '\\' || $char eq "'";
350 $ret = eval "test_to${function}_utf8('$char')";
355 is ($ret->[0], $first_ord_should_be, sprintf("to${function}_utf8( $display_name ) == 0x%02X", $first_ord_should_be));
356 is ($ret->[1], $utf8_should_be, sprintf("utf8 of to${function}_utf8( $display_name )"));
358 is ($ret->[2], length $utf8_should_be, sprintf("number of bytes in utf8 of to${function}_uni( $display_name )"));
364 # This is primarily to make sure that no non-Unicode warnings get generated
365 is(scalar @warnings, 0, "No warnings were generated " . join ", ", @warnings);