This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
APItest/t/handy.t: Add more tests
[perl5.git] / ext / XS-APItest / t / handy.t
CommitLineData
bdd8600f
KW
1#!perl -w
2
31f05a37 3BEGIN {
629eeaee
KW
4 require 'loc_tools.pl'; # Contains locales_enabled() and
5 # find_utf8_ctype_locale()
31f05a37
KW
6}
7
bdd8600f
KW
8use strict;
9use Test::More;
569f7fc5 10use Config;
bdd8600f
KW
11
12use XS::APItest;
13
01a11ab9
KW
14my $tab = " " x 4; # Indent subsidiary tests this much
15
4acecd39
KW
16use Unicode::UCD qw(search_invlist prop_invmap prop_invlist);
17my ($charname_list, $charname_map, $format, $default) = prop_invmap("Name Alias");
18
19sub get_charname($) {
20 my $cp = shift;
21
22 # If there is a an abbreviation for the code point name, use it
23 my $name_index = search_invlist(\@{$charname_list}, $cp);
24 if (defined $name_index) {
25 my $synonyms = $charname_map->[$name_index];
26 if (ref $synonyms) {
27 my $pat = qr/: abbreviation/;
28 my @abbreviations = grep { $_ =~ $pat } @$synonyms;
29 if (@abbreviations) {
30 return $abbreviations[0] =~ s/$pat//r;
31 }
32 }
33 }
34
35 # Otherwise, use the full name
36 use charnames ();
37 return charnames::viacode($cp) // "No name";
38}
39
5073ffbd 40sub truth($) { # Converts values so is() works
e2efe419 41 return (shift) ? 1 : 0;
c9c05358
KW
42}
43
01a11ab9 44my $base_locale;
31f05a37 45my $utf8_locale;
629eeaee 46if(locales_enabled('LC_ALL')) {
569f7fc5 47 require POSIX;
01a11ab9
KW
48 $base_locale = POSIX::setlocale( &POSIX::LC_ALL, "C");
49 if (defined $base_locale && $base_locale eq 'C') {
5f1269ab 50 use locale; # make \w work right in non-ASCII lands
569f7fc5
JR
51
52 # Some locale implementations don't have the 128-255 characters all
53 # mean nothing. Skip the locale tests in that situation
54 for my $i (128 .. 255) {
b5c3a14a 55 if (chr(utf8::unicode_to_native($i)) =~ /[[:print:]]/) {
01a11ab9 56 undef $base_locale;
569f7fc5
JR
57 last;
58 }
981746b9 59 }
31f05a37 60
01a11ab9 61 $utf8_locale = find_utf8_ctype_locale() if $base_locale;
981746b9
KW
62 }
63}
64
01a11ab9
KW
65sub get_display_locale_or_skip($$) {
66
67 # Helper function intimately tied to its callers. It knows the loop
68 # iterates with a locale of "", meaning don't use locale; $base_locale
69 # meaning to use a non-UTF-8 locale; and $utf8_locale.
70 #
71 # It checks to see if the current test should be skipped or executed,
72 # returning an empty list for the former, and for the latter:
73 # ( 'locale display name',
74 # bool of is this a UTF-8 locale )
75 #
76 # The display name is the empty string if not using locale. Functions
77 # with _LC in their name are skipped unless in locale, and functions
78 # without _LC are executed only outside locale. However, if no locales at
79 # all are on the system, the _LC functions are executed outside locale.
80
81 my ($locale, $suffix) = @_;
82
83 # The test should be skipped if the input is for a non-existent locale
84 return unless defined $locale;
85
86 # Here the input is defined, either a locale name or "". If the test is
87 # for not using locales, we want to do the test for non-LC functions,
88 # and skip it for LC ones (except if there are no locales on the system,
89 # we do it for LC ones as if they weren't LC).
90 if ($locale eq "") {
91 return ("", 0) if $suffix !~ /LC/ || ! defined $base_locale;
92 return;
93 }
94
95 # Here the input is for a real locale. We don't test the non-LC functions
96 # for locales.
97 return if $suffix !~ /LC/;
98
99 # Here is for a LC function and a real locale. The base locale is not
100 # UTF-8.
101 return (" ($locale locale)", 0) if $locale eq $base_locale;
102
103 # The only other possibility is that we have a UTF-8 locale
104 return (" ($locale)", 1);
105}
106
e2efe419
KW
107my %properties = (
108 # name => Lookup-property name
109 alnum => 'Word',
981746b9 110 wordchar => 'Word',
15861f94 111 alphanumeric => 'Alnum',
ebdbc726 112 alpha => 'XPosixAlpha',
e2efe419
KW
113 ascii => 'ASCII',
114 blank => 'Blank',
115 cntrl => 'Control',
116 digit => 'Digit',
117 graph => 'Graph',
118 idfirst => '_Perl_IDStart',
eba68aa0 119 idcont => '_Perl_IDCont',
ebdbc726 120 lower => 'XPosixLower',
e2efe419
KW
121 print => 'Print',
122 psxspc => 'XPosixSpace',
123 punct => 'XPosixPunct',
124 quotemeta => '_Perl_Quotemeta',
125 space => 'XPerlSpace',
840f8e92 126 vertws => 'VertSpace',
ebdbc726 127 upper => 'XPosixUpper',
e2efe419
KW
128 xdigit => 'XDigit',
129 );
130
5073ffbd
KW
131my @warnings;
132local $SIG{__WARN__} = sub { push @warnings, @_ };
133
4acecd39 134
8dc91396
KW
135foreach my $name (sort keys %properties, 'octal') {
136 my @invlist;
137 if ($name eq 'octal') {
138 # Hand-roll an inversion list with 0-7 in it and nothing else.
139 push @invlist, ord "0", ord "8";
140 }
141 else {
9a009363
KW
142 my $property = $properties{$name};
143 @invlist = prop_invlist($property, '_perl_core_internal_ok');
144 if (! @invlist) {
145
146 # An empty return could mean an unknown property, or merely that
147 # it is empty. Call in scalar context to differentiate
148 if (! prop_invlist($property, '_perl_core_internal_ok')) {
149 fail("No inversion list found for $property");
150 next;
151 }
ebdbc726 152 }
e2efe419
KW
153 }
154
155 # Include all the Latin1 code points, plus 0x100.
156 my @code_points = (0 .. 256);
157
158 # Then include the next few boundaries above those from this property
159 my $above_latins = 0;
160 foreach my $range_start (@invlist) {
161 next if $range_start < 257;
162 push @code_points, $range_start - 1, $range_start;
163 $above_latins++;
164 last if $above_latins > 5;
165 }
166
eba68aa0
KW
167 # This makes sure we are using the Perl definition of idfirst and idcont,
168 # and not the Unicode. There are a few differences.
169 push @code_points, ord "\N{ESTIMATED SYMBOL}" if $name =~ /^id(first|cont)/;
2f4622f0
KW
170 if ($name eq "idcont") { # And some that are continuation but not start
171 push @code_points, ord("\N{GREEK ANO TELEIA}"),
172 ord("\N{COMBINING GRAVE ACCENT}");
173 }
f91dcd13 174
e2efe419
KW
175 # And finally one non-Unicode code point.
176 push @code_points, 0x110000; # Above Unicode, no prop should match
4f6ef7cc 177 no warnings 'non_unicode';
e2efe419 178
2e1414ce
KW
179 for my $j (@code_points) {
180 my $i = utf8::native_to_unicode($j);
c9c05358 181 my $function = uc($name);
c9c05358 182
01a11ab9
KW
183 is (@warnings, 0, "Got no unexpected warnings in previous iteration")
184 or diag("@warnings");
185 undef @warnings;
186
4acecd39 187 my $matches = search_invlist(\@invlist, $i);
e2efe419
KW
188 if (! defined $matches) {
189 $matches = 0;
190 }
191 else {
192 $matches = truth(! ($matches % 2));
193 }
5073ffbd 194
e2efe419 195 my $ret;
4acecd39 196 my $char_name = get_charname($j);
94a62bdc 197 my $display_name = sprintf "\\x{%02X, %s}", $i, $char_name;
01a11ab9 198 my $display_call = "is${function}( $display_name )";
e2efe419
KW
199
200 if ($name eq 'quotemeta') { # There is only one macro for this, and is
201 # defined only for Latin1 range
202 $ret = truth eval "test_is${function}($i)";
01a11ab9 203 if (is ($@, "", "$display_call didn't give error")) {
e2efe419 204 my $truth = truth($matches && $i < 256);
01a11ab9 205 is ($ret, $truth, "${tab}And returns $truth");
e2efe419 206 }
c9c05358
KW
207 next;
208 }
981746b9 209
a7fe8528
KW
210 foreach my $suffix ("", "_A", "_L1", "_LC", "_uni", "_uvchr",
211 "_LC_uvchr", "_utf8", "_LC_utf8")
ee9e5f10 212 {
31f05a37 213
01a11ab9
KW
214 # Not all possible macros have been defined
215 if ($name eq 'vertws') {
31f05a37 216
01a11ab9 217 # vertws is always all of Unicode
a7fe8528 218 next if $suffix !~ / ^ _ ( uni | uvchr | utf8 ) $ /x;
981746b9 219 }
01a11ab9 220 elsif ($name eq 'alnum') {
981746b9 221
a7fe8528
KW
222 # ALNUM_A, ALNUM_L1, and ALNUM_uvchr are not defined as these
223 # suffixes were added later, after WORDCHAR was created to be
224 # a clearer synonym for ALNUM
225 next if $suffix eq '_A'
226 || $suffix eq '_L1'
227 || $suffix eq '_uvchr';
31a09021 228 }
8dc91396 229 elsif ($name eq 'octal') {
ee9e5f10 230 next if $suffix ne "" && $suffix ne '_A' && $suffix ne '_L1';
8dc91396 231 }
a9aff56e 232
01a11ab9
KW
233 foreach my $locale ("", $base_locale, $utf8_locale) {
234
235 my ($display_locale, $locale_is_utf8)
236 = get_display_locale_or_skip($locale, $suffix);
237 next unless defined $display_locale;
238
239 use if $locale, "locale";
240 POSIX::setlocale( &POSIX::LC_ALL, $locale) if $locale;
241
242 if ($suffix !~ /utf8/) { # _utf8 has to handled specially
243 my $display_call
244 = "is${function}$suffix( $display_name )$display_locale";
245 $ret = truth eval "test_is${function}$suffix($i)";
246 if (is ($@, "", "$display_call didn't give error")) {
247 my $truth = $matches;
248 if ($truth) {
249
250 # The single byte functions are false for
251 # above-Latin1
252 if ($i >= 256) {
ee9e5f10
KW
253 $truth = 0
254 if $suffix=~ / ^ ( _A | _L [1C] )? $ /x;
01a11ab9
KW
255 }
256 elsif (utf8::native_to_unicode($i) >= 128) {
257
ee9e5f10 258 # The no-suffix and _A functions are false
01a11ab9
KW
259 # for non-ASCII. So are _LC functions on a
260 # non-UTF-8 locale
261 $truth = 0 if $suffix eq "_A"
ee9e5f10 262 || $suffix eq ""
01a11ab9
KW
263 || ( $suffix =~ /LC/
264 && ! $locale_is_utf8);
265 }
266 }
267
268 is ($ret, $truth, "${tab}And correctly returns $truth");
269 }
270 }
271 else { # _utf8 suffix
272 my $char = chr($i);
273 utf8::upgrade($char);
274 $char = quotemeta $char if $char eq '\\' || $char eq "'";
275 my $truth;
276 if ( $suffix =~ /LC/
277 && ! $locale_is_utf8
278 && $i < 256
279 && utf8::native_to_unicode($i) >= 128)
280 { # The C-locale _LC function returns FALSE for Latin1
281 # above ASCII
282 $truth = 0;
283 }
284 else {
285 $truth = $matches;
286 }
31f05a37 287
01a11ab9
KW
288 my $display_call = "is${function}$suffix("
289 . " $display_name )$display_locale";
290 $ret = truth eval "test_is${function}$suffix('$char')";
291 if (is ($@, "", "$display_call didn't give error")) {
292 is ($ret, $truth,
293 "${tab}And correctly returned $truth");
294 }
295 }
31a09021 296 }
8ff203a1 297 }
c9c05358
KW
298 }
299}
300
2e1414ce 301my %to_properties = (
01a11ab9 302 FOLD => 'Case_Folding',
2e1414ce
KW
303 LOWER => 'Lowercase_Mapping',
304 TITLE => 'Titlecase_Mapping',
305 UPPER => 'Uppercase_Mapping',
306 );
307
308
309foreach my $name (sort keys %to_properties) {
310 my $property = $to_properties{$name};
311 my ($list_ref, $map_ref, $format, $missing)
312 = prop_invmap($property, );
313 if (! $list_ref || ! $map_ref) {
314 fail("No inversion map found for $property");
315 next;
316 }
ebdbc726 317 if ($format !~ / ^ a l? $ /x) {
2e1414ce
KW
318 fail("Unexpected inversion map format ('$format') found for $property");
319 next;
320 }
321
322 # Include all the Latin1 code points, plus 0x100.
323 my @code_points = (0 .. 256);
324
325 # Then include the next few multi-char folds above those from this
326 # property, and include the next few single folds as well
327 my $above_latins = 0;
328 my $multi_char = 0;
329 for my $i (0 .. @{$list_ref} - 1) {
330 my $range_start = $list_ref->[$i];
331 next if $range_start < 257;
332 if (ref $map_ref->[$i] && $multi_char < 5) {
01a11ab9
KW
333 push @code_points, $range_start - 1
334 if $code_points[-1] != $range_start - 1;
2e1414ce
KW
335 push @code_points, $range_start;
336 $multi_char++;
337 }
338 elsif ($above_latins < 5) {
01a11ab9
KW
339 push @code_points, $range_start - 1
340 if $code_points[-1] != $range_start - 1;
2e1414ce
KW
341 push @code_points, $range_start;
342 $above_latins++;
343 }
344 last if $above_latins >= 5 && $multi_char >= 5;
345 }
346
347 # And finally one non-Unicode code point.
348 push @code_points, 0x110000; # Above Unicode, no prop should match
349 no warnings 'non_unicode';
350
351 # $j is native; $i unicode.
352 for my $j (@code_points) {
353 my $i = utf8::native_to_unicode($j);
354 my $function = $name;
355
4acecd39 356 my $index = search_invlist(\@{$list_ref}, $j);
2e1414ce
KW
357
358 my $ret;
4acecd39 359 my $char_name = get_charname($j);
2e1414ce
KW
360 my $display_name = sprintf "\\N{U+%02X, %s}", $i, $char_name;
361
01a11ab9 362 foreach my $suffix ("", "_L1", "_LC") {
2e1414ce 363
01a11ab9
KW
364 # This is the only macro defined for L1
365 next if $suffix eq "_L1" && $function ne "LOWER";
2e1414ce 366
01a11ab9
KW
367 SKIP:
368 foreach my $locale ("", $base_locale, $utf8_locale) {
31f05a37 369
01a11ab9
KW
370 # titlecase is not defined in locales.
371 next if $name eq 'TITLE' && $suffix eq "_LC";
31f05a37 372
01a11ab9
KW
373 my ($display_locale, $locale_is_utf8)
374 = get_display_locale_or_skip($locale, $suffix);
375 next unless defined $display_locale;
31f05a37 376
01a11ab9
KW
377 skip("to${name}_LC does not work for LATIN SMALL LETTER SHARP S"
378 . "$display_locale", 1)
379 if $i == 0xDF && $name =~ / FOLD | UPPER /x
380 && $suffix eq "_LC" && $locale_is_utf8;
381
382 use if $locale, "locale";
383 POSIX::setlocale( &POSIX::LC_ALL, $locale) if $locale;
384
385 my $display_call = "to${function}$suffix("
386 . " $display_name )$display_locale";
387 $ret = eval "test_to${function}$suffix($j)";
388 if (is ($@, "", "$display_call didn't give error")) {
389 my $should_be;
390 if ($i > 255) {
391 $should_be = $j;
392 }
393 elsif ( $i > 127
394 && ( $suffix eq ""
395 || ($suffix eq "_LC" && ! $locale_is_utf8)))
396 {
397 $should_be = $j;
398 }
399 elsif ($map_ref->[$index] != $missing) {
400 $should_be = $map_ref->[$index] + $j - $list_ref->[$index]
31f05a37
KW
401 }
402 else {
01a11ab9 403 $should_be = $j;
31f05a37 404 }
01a11ab9
KW
405
406 is ($ret, $should_be,
407 sprintf("${tab}And correctly returned 0x%02X",
408 $should_be));
31f05a37 409 }
2e1414ce
KW
410 }
411 }
412
a7fe8528
KW
413 # The _uni, uvchr, and _utf8 functions return both the ordinal of the
414 # first code point of the result, and the result in utf8. The .xs
415 # tests return these in an array, in [0] and [1] respectively, with
416 # [2] the length of the utf8 in bytes.
2e1414ce
KW
417 my $utf8_should_be = "";
418 my $first_ord_should_be;
419 if (ref $map_ref->[$index]) { # A multi-char result
420 for my $j (0 .. @{$map_ref->[$index]} - 1) {
421 $utf8_should_be .= chr $map_ref->[$index][$j];
422 }
423
424 $first_ord_should_be = $map_ref->[$index][0];
425 }
426 else { # A single-char result
427 $first_ord_should_be = ($map_ref->[$index] != $missing)
01a11ab9
KW
428 ? $map_ref->[$index] + $j
429 - $list_ref->[$index]
2e1414ce
KW
430 : $j;
431 $utf8_should_be = chr $first_ord_should_be;
432 }
433 utf8::upgrade($utf8_should_be);
434
a7fe8528
KW
435 # Test _uni, uvchr
436 foreach my $suffix ('_uni', '_uvchr') {
2e1414ce
KW
437 my $s;
438 my $len;
a7fe8528
KW
439 my $display_call = "to${function}$suffix( $display_name )";
440 $ret = eval "test_to${function}$suffix($j)";
01a11ab9
KW
441 if (is ($@, "", "$display_call didn't give error")) {
442 is ($ret->[0], $first_ord_should_be,
443 sprintf("${tab}And correctly returned 0x%02X",
444 $first_ord_should_be));
445 is ($ret->[1], $utf8_should_be, "${tab}Got correct utf8");
2e1414ce 446 use bytes;
01a11ab9
KW
447 is ($ret->[2], length $utf8_should_be,
448 "${tab}Got correct number of bytes for utf8 length");
2e1414ce 449 }
a7fe8528 450 }
2e1414ce
KW
451
452 # Test _utf8
453 my $char = chr($j);
454 utf8::upgrade($char);
455 $char = quotemeta $char if $char eq '\\' || $char eq "'";
01a11ab9
KW
456 {
457 my $display_call = "to${function}_utf8($display_name )";
458 $ret = eval "test_to${function}_utf8('$char')";
459 if (is ($@, "", "$display_call didn't give error")) {
460 is ($ret->[0], $first_ord_should_be,
461 sprintf("${tab}And correctly returned 0x%02X",
462 $first_ord_should_be));
463 is ($ret->[1], $utf8_should_be, "${tab}Got correct utf8");
464 use bytes;
465 is ($ret->[2], length $utf8_should_be,
466 "${tab}Got correct number of bytes for utf8 length");
467 }
2e1414ce 468 }
2e1414ce
KW
469 }
470}
471
5073ffbd 472# This is primarily to make sure that no non-Unicode warnings get generated
01a11ab9
KW
473is(scalar @warnings, 0, "No unexpected warnings were generated in the tests")
474 or diag @warnings;
5073ffbd 475
bdd8600f 476done_testing;