This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
(perl #130335) fix numeric comparison for sort's built-in compare
[perl5.git] / ext / XS-APItest / t / handy.t
CommitLineData
bdd8600f
KW
1#!perl -w
2
31f05a37 3BEGIN {
629eeaee
KW
4 require 'loc_tools.pl'; # Contains locales_enabled() and
5 # find_utf8_ctype_locale()
31f05a37
KW
6}
7
bdd8600f
KW
8use strict;
9use Test::More;
569f7fc5 10use Config;
bdd8600f
KW
11
12use XS::APItest;
13
2e1414ce 14use Unicode::UCD qw(prop_invlist prop_invmap);
c9c05358 15
5073ffbd 16sub truth($) { # Converts values so is() works
e2efe419 17 return (shift) ? 1 : 0;
c9c05358
KW
18}
19
569f7fc5 20my $locale;
31f05a37 21my $utf8_locale;
629eeaee 22if(locales_enabled('LC_ALL')) {
569f7fc5
JR
23 require POSIX;
24 $locale = POSIX::setlocale( &POSIX::LC_ALL, "C");
25 if (defined $locale && $locale eq 'C') {
5f1269ab 26 use locale; # make \w work right in non-ASCII lands
569f7fc5
JR
27
28 # Some locale implementations don't have the 128-255 characters all
29 # mean nothing. Skip the locale tests in that situation
30 for my $i (128 .. 255) {
b5c3a14a 31 if (chr(utf8::unicode_to_native($i)) =~ /[[:print:]]/) {
569f7fc5
JR
32 undef $locale;
33 last;
34 }
981746b9 35 }
31f05a37 36
9b0711ee 37 $utf8_locale = find_utf8_ctype_locale();
981746b9
KW
38 }
39}
40
e2efe419
KW
41my %properties = (
42 # name => Lookup-property name
43 alnum => 'Word',
981746b9 44 wordchar => 'Word',
15861f94 45 alphanumeric => 'Alnum',
ebdbc726 46 alpha => 'XPosixAlpha',
e2efe419
KW
47 ascii => 'ASCII',
48 blank => 'Blank',
49 cntrl => 'Control',
50 digit => 'Digit',
51 graph => 'Graph',
52 idfirst => '_Perl_IDStart',
eba68aa0 53 idcont => '_Perl_IDCont',
ebdbc726 54 lower => 'XPosixLower',
e2efe419
KW
55 print => 'Print',
56 psxspc => 'XPosixSpace',
57 punct => 'XPosixPunct',
58 quotemeta => '_Perl_Quotemeta',
59 space => 'XPerlSpace',
840f8e92 60 vertws => 'VertSpace',
ebdbc726 61 upper => 'XPosixUpper',
e2efe419
KW
62 xdigit => 'XDigit',
63 );
64
5073ffbd
KW
65my @warnings;
66local $SIG{__WARN__} = sub { push @warnings, @_ };
67
c9c05358 68use charnames ();
e2efe419
KW
69foreach my $name (sort keys %properties) {
70 my $property = $properties{$name};
71 my @invlist = prop_invlist($property, '_perl_core_internal_ok');
72 if (! @invlist) {
ebdbc726
KW
73
74 # An empty return could mean an unknown property, or merely that it is
75 # empty. Call in scalar context to differentiate
76 if (! prop_invlist($property, '_perl_core_internal_ok')) {
77 fail("No inversion list found for $property");
78 next;
79 }
e2efe419
KW
80 }
81
82 # Include all the Latin1 code points, plus 0x100.
83 my @code_points = (0 .. 256);
84
85 # Then include the next few boundaries above those from this property
86 my $above_latins = 0;
87 foreach my $range_start (@invlist) {
88 next if $range_start < 257;
89 push @code_points, $range_start - 1, $range_start;
90 $above_latins++;
91 last if $above_latins > 5;
92 }
93
eba68aa0
KW
94 # This makes sure we are using the Perl definition of idfirst and idcont,
95 # and not the Unicode. There are a few differences.
96 push @code_points, ord "\N{ESTIMATED SYMBOL}" if $name =~ /^id(first|cont)/;
2f4622f0
KW
97 if ($name eq "idcont") { # And some that are continuation but not start
98 push @code_points, ord("\N{GREEK ANO TELEIA}"),
99 ord("\N{COMBINING GRAVE ACCENT}");
100 }
f91dcd13 101
e2efe419
KW
102 # And finally one non-Unicode code point.
103 push @code_points, 0x110000; # Above Unicode, no prop should match
4f6ef7cc 104 no warnings 'non_unicode';
e2efe419 105
2e1414ce
KW
106 for my $j (@code_points) {
107 my $i = utf8::native_to_unicode($j);
c9c05358 108 my $function = uc($name);
c9c05358 109
1fdd5e53 110 my $matches = Unicode::UCD::search_invlist(\@invlist, $i);
e2efe419
KW
111 if (! defined $matches) {
112 $matches = 0;
113 }
114 else {
115 $matches = truth(! ($matches % 2));
116 }
5073ffbd 117
e2efe419
KW
118 my $ret;
119 my $char_name = charnames::viacode($i) // "No name";
94a62bdc 120 my $display_name = sprintf "\\x{%02X, %s}", $i, $char_name;
e2efe419
KW
121
122 if ($name eq 'quotemeta') { # There is only one macro for this, and is
123 # defined only for Latin1 range
124 $ret = truth eval "test_is${function}($i)";
125 if ($@) {
126 fail $@;
127 }
128 else {
129 my $truth = truth($matches && $i < 256);
130 is ($ret, $truth, "is${function}( $display_name ) == $truth");
131 }
c9c05358
KW
132 next;
133 }
981746b9
KW
134
135 # vertws is always all of Unicode; ALNUM_A and ALNUM_L1 are not
136 # defined as they were added later, after WORDCHAR was created to be a
137 # clearer synonym for ALNUM
840f8e92 138 if ($name ne 'vertws') {
981746b9 139 if ($name ne 'alnum') {
195d0f3b
KW
140 $ret = truth eval "test_is${function}_A($i)";
141 if ($@) {
142 fail($@);
143 }
144 else {
b5c3a14a 145 my $truth = truth($matches && utf8::native_to_unicode($i) < 128);
195d0f3b
KW
146 is ($ret, $truth, "is${function}_A( $display_name ) == $truth");
147 }
148 $ret = truth eval "test_is${function}_L1($i)";
149 if ($@) {
150 fail($@);
151 }
152 else {
153 my $truth = truth($matches && $i < 256);
154 is ($ret, $truth, "is${function}_L1( $display_name ) == $truth");
155 }
981746b9
KW
156 }
157
8ff203a1 158 if (defined $locale) {
5f1269ab 159 use locale;
31f05a37 160 POSIX::setlocale( &POSIX::LC_ALL, "C");
31a09021
KW
161 $ret = truth eval "test_is${function}_LC($i)";
162 if ($@) {
163 fail($@);
164 }
165 else {
b5c3a14a 166 my $truth = truth($matches && utf8::native_to_unicode($i) < 128);
31f05a37
KW
167 is ($ret, $truth, "is${function}_LC( $display_name ) == $truth (C locale)");
168 }
169 }
170
171 if (defined $utf8_locale) {
172 use locale;
173
174 POSIX::setlocale( &POSIX::LC_ALL, $utf8_locale);
175 $ret = truth eval "test_is${function}_LC($i)";
176 if ($@) {
177 fail($@);
178 }
179 else {
180
181 # UTF-8 locale works on full range 0-255
182 my $truth = truth($matches && $i < 256);
183 is ($ret, $truth, "is${function}_LC( $display_name ) == $truth ($utf8_locale)");
31a09021 184 }
8ff203a1 185 }
840f8e92 186 }
c9c05358 187
e2efe419
KW
188 $ret = truth eval "test_is${function}_uni($i)";
189 if ($@) {
190 fail($@);
191 }
192 else {
193 is ($ret, $matches, "is${function}_uni( $display_name ) == $matches");
194 }
c9c05358 195
981746b9 196 if (defined $locale && $name ne 'vertws') {
5f1269ab 197 use locale;
31f05a37 198 POSIX::setlocale( &POSIX::LC_ALL, "C");
981746b9
KW
199 $ret = truth eval "test_is${function}_LC_uvchr('$i')";
200 if ($@) {
201 fail($@);
202 }
203 else {
b5c3a14a 204 my $truth = truth($matches && (utf8::native_to_unicode($i) < 128 || $i > 255));
31f05a37
KW
205 is ($ret, $truth, "is${function}_LC_uvchr( $display_name ) == $truth (C locale)");
206 }
207 }
208
209 if (defined $utf8_locale && $name ne 'vertws') {
210 use locale;
211
212 POSIX::setlocale( &POSIX::LC_ALL, $utf8_locale);
213 $ret = truth eval "test_is${function}_LC_uvchr('$i')";
214 if ($@) {
215 fail($@);
216 }
217 else {
218 my $truth = truth($matches);
219 is ($ret, $truth, "is${function}_LC_uvchr( $display_name ) == $truth ($utf8_locale)");
981746b9
KW
220 }
221 }
222
c9c05358
KW
223 my $char = chr($i);
224 utf8::upgrade($char);
225 $char = quotemeta $char if $char eq '\\' || $char eq "'";
e2efe419
KW
226 $ret = truth eval "test_is${function}_utf8('$char')";
227 if ($@) {
228 fail($@);
229 }
230 else {
231 is ($ret, $matches, "is${function}_utf8( $display_name ) == $matches");
232 }
981746b9 233
8ff203a1 234 if ($name ne 'vertws' && defined $locale) {
5f1269ab 235 use locale;
31f05a37 236 POSIX::setlocale( &POSIX::LC_ALL, "C");
31a09021
KW
237 $ret = truth eval "test_is${function}_LC_utf8('$char')";
238 if ($@) {
239 fail($@);
240 }
241 else {
b5c3a14a 242 my $truth = truth($matches && (utf8::native_to_unicode($i) < 128 || $i > 255));
31f05a37
KW
243 is ($ret, $truth, "is${function}_LC_utf8( $display_name ) == $truth (C locale)");
244 }
a9aff56e
KW
245 }
246
247 if ($name ne 'vertws' && defined $utf8_locale) {
248 use locale;
31f05a37
KW
249
250 POSIX::setlocale( &POSIX::LC_ALL, $utf8_locale);
251 $ret = truth eval "test_is${function}_LC_utf8('$char')";
252 if ($@) {
253 fail($@);
254 }
255 else {
256 my $truth = truth($matches);
257 is ($ret, $truth, "is${function}_LC_utf8( $display_name ) == $truth ($utf8_locale)");
31a09021 258 }
8ff203a1 259 }
c9c05358
KW
260 }
261}
262
8c67216a
KW
263# Test isOCTAL()
264for my $i (0 .. 256, 0x110000) {
265 my $char_name = charnames::viacode($i) // "No name";
266 my $display_name = sprintf "\\N{U+%02X, %s}", $i, $char_name;
267 my $truth = truth($i >= ord('0') && $i <= ord('7'));
268
269 my $ret = truth test_isOCTAL_A($i);
270 is($ret, $truth, "isOCTAL_A( $display_name ) == $truth");
271
272 $ret = truth test_isOCTAL_L1($i);
273 is($ret, $truth, "isOCTAL_L1( $display_name ) == $truth");
274}
275
2e1414ce
KW
276my %to_properties = (
277 FOLD => 'Case_Folding',
278 LOWER => 'Lowercase_Mapping',
279 TITLE => 'Titlecase_Mapping',
280 UPPER => 'Uppercase_Mapping',
281 );
282
283
284foreach my $name (sort keys %to_properties) {
285 my $property = $to_properties{$name};
286 my ($list_ref, $map_ref, $format, $missing)
287 = prop_invmap($property, );
288 if (! $list_ref || ! $map_ref) {
289 fail("No inversion map found for $property");
290 next;
291 }
ebdbc726 292 if ($format !~ / ^ a l? $ /x) {
2e1414ce
KW
293 fail("Unexpected inversion map format ('$format') found for $property");
294 next;
295 }
296
297 # Include all the Latin1 code points, plus 0x100.
298 my @code_points = (0 .. 256);
299
300 # Then include the next few multi-char folds above those from this
301 # property, and include the next few single folds as well
302 my $above_latins = 0;
303 my $multi_char = 0;
304 for my $i (0 .. @{$list_ref} - 1) {
305 my $range_start = $list_ref->[$i];
306 next if $range_start < 257;
307 if (ref $map_ref->[$i] && $multi_char < 5) {
308 push @code_points, $range_start - 1 if $code_points[-1] != $range_start - 1;
309 push @code_points, $range_start;
310 $multi_char++;
311 }
312 elsif ($above_latins < 5) {
313 push @code_points, $range_start - 1 if $code_points[-1] != $range_start - 1;
314 push @code_points, $range_start;
315 $above_latins++;
316 }
317 last if $above_latins >= 5 && $multi_char >= 5;
318 }
319
320 # And finally one non-Unicode code point.
321 push @code_points, 0x110000; # Above Unicode, no prop should match
322 no warnings 'non_unicode';
323
324 # $j is native; $i unicode.
325 for my $j (@code_points) {
326 my $i = utf8::native_to_unicode($j);
327 my $function = $name;
328
1fdd5e53 329 my $index = Unicode::UCD::search_invlist(\@{$list_ref}, $j);
2e1414ce
KW
330
331 my $ret;
332 my $char_name = charnames::viacode($j) // "No name";
333 my $display_name = sprintf "\\N{U+%02X, %s}", $i, $char_name;
334
335 # Test the base function
336 $ret = eval "test_to${function}($j)";
337 if ($@) {
338 fail($@);
339 }
340 else {
341 my $should_be = ($i < 128 && $map_ref->[$index] != $missing)
342 ? $map_ref->[$index] + $j - $list_ref->[$index]
343 : $j;
344 is ($ret, $should_be, sprintf("to${function}( $display_name ) == 0x%02X", $should_be));
345 }
346
347 # Test _L1
348 if ($name eq 'LOWER') {
349 $ret = eval "test_to${function}_L1($j)";
350 if ($@) {
351 fail($@);
352 }
353 else {
354 my $should_be = ($i < 256 && $map_ref->[$index] != $missing)
355 ? $map_ref->[$index] + $j - $list_ref->[$index]
356 : $j;
357 is ($ret, $should_be, sprintf("to${function}_L1( $display_name ) == 0x%02X", $should_be));
358 }
359 }
360
31f05a37
KW
361 if ($name ne 'TITLE') { # Test _LC; titlecase is not defined in locales.
362 if (defined $locale) {
5f1269ab
KW
363 use locale;
364 POSIX::setlocale( &POSIX::LC_ALL, "C");
baa60164
KW
365 $ret = eval "test_to${function}_LC($j)";
366 if ($@) {
367 fail($@);
368 }
369 else {
370 my $should_be = ($i < 128 && $map_ref->[$index] != $missing)
2e1414ce
KW
371 ? $map_ref->[$index] + $j - $list_ref->[$index]
372 : $j;
baa60164
KW
373 is ($ret, $should_be, sprintf("to${function}_LC( $display_name ) == 0x%02X (C locale)", $should_be));
374 }
31f05a37
KW
375 }
376
377 if (defined $utf8_locale) {
378 use locale;
379
380 SKIP: {
381 skip "to${property}_LC does not work for LATIN SMALL LETTER SHARP S", 1
382 if $j == 0xDF && ($name eq 'FOLD' || $name eq 'UPPER');
383
384 POSIX::setlocale( &POSIX::LC_ALL, $utf8_locale);
385 $ret = eval "test_to${function}_LC($j)";
386 if ($@) {
387 fail($@);
388 }
389 else {
390 my $should_be = ($i < 256
391 && ! ref $map_ref->[$index]
392 && $map_ref->[$index] != $missing
393 )
394 ? $map_ref->[$index] + $j - $list_ref->[$index]
395 : $j;
396 is ($ret, $should_be, sprintf("to${function}_LC( $display_name ) == 0x%02X ($utf8_locale)", $should_be));
397 }
398 }
2e1414ce
KW
399 }
400 }
401
402 # The _uni and _utf8 functions return both the ordinal of the first
403 # code point of the result, and the result in utf8. The .xs tests
404 # return these in an array, in [0] and [1] respectively, with [2] the
405 # length of the utf8 in bytes.
406 my $utf8_should_be = "";
407 my $first_ord_should_be;
408 if (ref $map_ref->[$index]) { # A multi-char result
409 for my $j (0 .. @{$map_ref->[$index]} - 1) {
410 $utf8_should_be .= chr $map_ref->[$index][$j];
411 }
412
413 $first_ord_should_be = $map_ref->[$index][0];
414 }
415 else { # A single-char result
416 $first_ord_should_be = ($map_ref->[$index] != $missing)
417 ? $map_ref->[$index] + $j - $list_ref->[$index]
418 : $j;
419 $utf8_should_be = chr $first_ord_should_be;
420 }
421 utf8::upgrade($utf8_should_be);
422
423 # Test _uni
424 my $s;
425 my $len;
426 $ret = eval "test_to${function}_uni($j)";
427 if ($@) {
428 fail($@);
429 }
430 else {
431 is ($ret->[0], $first_ord_should_be, sprintf("to${function}_uni( $display_name ) == 0x%02X", $first_ord_should_be));
432 is ($ret->[1], $utf8_should_be, sprintf("utf8 of to${function}_uni( $display_name )"));
433 use bytes;
434 is ($ret->[2], length $utf8_should_be, sprintf("number of bytes in utf8 of to${function}_uni( $display_name )"));
435 }
436
437 # Test _utf8
438 my $char = chr($j);
439 utf8::upgrade($char);
440 $char = quotemeta $char if $char eq '\\' || $char eq "'";
441 $ret = eval "test_to${function}_utf8('$char')";
442 if ($@) {
443 fail($@);
444 }
445 else {
446 is ($ret->[0], $first_ord_should_be, sprintf("to${function}_utf8( $display_name ) == 0x%02X", $first_ord_should_be));
447 is ($ret->[1], $utf8_should_be, sprintf("utf8 of to${function}_utf8( $display_name )"));
448 use bytes;
449 is ($ret->[2], length $utf8_should_be, sprintf("number of bytes in utf8 of to${function}_uni( $display_name )"));
450 }
451
452 }
453}
454
5073ffbd 455# This is primarily to make sure that no non-Unicode warnings get generated
5c335666
KW
456unless (is(scalar @warnings, 0, "No warnings were generated")) {
457 diag @warnings;
458}
5073ffbd 459
bdd8600f 460done_testing;