This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
APItest/t/handy.t: Slightly simplify
[perl5.git] / ext / XS-APItest / t / handy.t
CommitLineData
bdd8600f
KW
1#!perl -w
2
31f05a37 3BEGIN {
629eeaee
KW
4 require 'loc_tools.pl'; # Contains locales_enabled() and
5 # find_utf8_ctype_locale()
31f05a37
KW
6}
7
bdd8600f
KW
8use strict;
9use Test::More;
569f7fc5 10use Config;
bdd8600f
KW
11
12use XS::APItest;
13
2e1414ce 14use Unicode::UCD qw(prop_invlist prop_invmap);
c9c05358 15
5073ffbd 16sub truth($) { # Converts values so is() works
e2efe419 17 return (shift) ? 1 : 0;
c9c05358
KW
18}
19
569f7fc5 20my $locale;
31f05a37 21my $utf8_locale;
629eeaee 22if(locales_enabled('LC_ALL')) {
569f7fc5
JR
23 require POSIX;
24 $locale = POSIX::setlocale( &POSIX::LC_ALL, "C");
25 if (defined $locale && $locale eq 'C') {
5f1269ab 26 use locale; # make \w work right in non-ASCII lands
569f7fc5
JR
27
28 # Some locale implementations don't have the 128-255 characters all
29 # mean nothing. Skip the locale tests in that situation
30 for my $i (128 .. 255) {
b5c3a14a 31 if (chr(utf8::unicode_to_native($i)) =~ /[[:print:]]/) {
569f7fc5
JR
32 undef $locale;
33 last;
34 }
981746b9 35 }
31f05a37 36
9b0711ee 37 $utf8_locale = find_utf8_ctype_locale();
981746b9
KW
38 }
39}
40
e2efe419
KW
41my %properties = (
42 # name => Lookup-property name
43 alnum => 'Word',
981746b9 44 wordchar => 'Word',
15861f94 45 alphanumeric => 'Alnum',
ebdbc726 46 alpha => 'XPosixAlpha',
e2efe419
KW
47 ascii => 'ASCII',
48 blank => 'Blank',
49 cntrl => 'Control',
50 digit => 'Digit',
51 graph => 'Graph',
52 idfirst => '_Perl_IDStart',
eba68aa0 53 idcont => '_Perl_IDCont',
ebdbc726 54 lower => 'XPosixLower',
e2efe419
KW
55 print => 'Print',
56 psxspc => 'XPosixSpace',
57 punct => 'XPosixPunct',
58 quotemeta => '_Perl_Quotemeta',
59 space => 'XPerlSpace',
840f8e92 60 vertws => 'VertSpace',
ebdbc726 61 upper => 'XPosixUpper',
e2efe419
KW
62 xdigit => 'XDigit',
63 );
64
5073ffbd
KW
65my @warnings;
66local $SIG{__WARN__} = sub { push @warnings, @_ };
67
c9c05358 68use charnames ();
e2efe419
KW
69foreach my $name (sort keys %properties) {
70 my $property = $properties{$name};
71 my @invlist = prop_invlist($property, '_perl_core_internal_ok');
72 if (! @invlist) {
ebdbc726
KW
73
74 # An empty return could mean an unknown property, or merely that it is
75 # empty. Call in scalar context to differentiate
76 if (! prop_invlist($property, '_perl_core_internal_ok')) {
77 fail("No inversion list found for $property");
78 next;
79 }
e2efe419
KW
80 }
81
82 # Include all the Latin1 code points, plus 0x100.
83 my @code_points = (0 .. 256);
84
85 # Then include the next few boundaries above those from this property
86 my $above_latins = 0;
87 foreach my $range_start (@invlist) {
88 next if $range_start < 257;
89 push @code_points, $range_start - 1, $range_start;
90 $above_latins++;
91 last if $above_latins > 5;
92 }
93
eba68aa0
KW
94 # This makes sure we are using the Perl definition of idfirst and idcont,
95 # and not the Unicode. There are a few differences.
96 push @code_points, ord "\N{ESTIMATED SYMBOL}" if $name =~ /^id(first|cont)/;
2f4622f0
KW
97 if ($name eq "idcont") { # And some that are continuation but not start
98 push @code_points, ord("\N{GREEK ANO TELEIA}"),
99 ord("\N{COMBINING GRAVE ACCENT}");
100 }
f91dcd13 101
e2efe419
KW
102 # And finally one non-Unicode code point.
103 push @code_points, 0x110000; # Above Unicode, no prop should match
4f6ef7cc 104 no warnings 'non_unicode';
e2efe419 105
2e1414ce
KW
106 for my $j (@code_points) {
107 my $i = utf8::native_to_unicode($j);
c9c05358 108 my $function = uc($name);
c9c05358 109
1fdd5e53 110 my $matches = Unicode::UCD::search_invlist(\@invlist, $i);
e2efe419
KW
111 if (! defined $matches) {
112 $matches = 0;
113 }
114 else {
115 $matches = truth(! ($matches % 2));
116 }
5073ffbd 117
e2efe419
KW
118 my $ret;
119 my $char_name = charnames::viacode($i) // "No name";
94a62bdc 120 my $display_name = sprintf "\\x{%02X, %s}", $i, $char_name;
e2efe419
KW
121
122 if ($name eq 'quotemeta') { # There is only one macro for this, and is
123 # defined only for Latin1 range
124 $ret = truth eval "test_is${function}($i)";
125 if ($@) {
126 fail $@;
127 }
128 else {
129 my $truth = truth($matches && $i < 256);
130 is ($ret, $truth, "is${function}( $display_name ) == $truth");
131 }
c9c05358
KW
132 next;
133 }
981746b9
KW
134
135 # vertws is always all of Unicode; ALNUM_A and ALNUM_L1 are not
136 # defined as they were added later, after WORDCHAR was created to be a
137 # clearer synonym for ALNUM
840f8e92 138 if ($name ne 'vertws') {
981746b9 139 if ($name ne 'alnum') {
195d0f3b
KW
140 $ret = truth eval "test_is${function}_A($i)";
141 if ($@) {
142 fail($@);
143 }
144 else {
b5c3a14a 145 my $truth = truth($matches && utf8::native_to_unicode($i) < 128);
195d0f3b
KW
146 is ($ret, $truth, "is${function}_A( $display_name ) == $truth");
147 }
148 $ret = truth eval "test_is${function}_L1($i)";
149 if ($@) {
150 fail($@);
151 }
152 else {
153 my $truth = truth($matches && $i < 256);
154 is ($ret, $truth, "is${function}_L1( $display_name ) == $truth");
155 }
981746b9
KW
156 }
157
8ff203a1 158 if (defined $locale) {
5f1269ab 159 use locale;
31f05a37 160 POSIX::setlocale( &POSIX::LC_ALL, "C");
31a09021
KW
161 $ret = truth eval "test_is${function}_LC($i)";
162 if ($@) {
163 fail($@);
164 }
165 else {
b5c3a14a 166 my $truth = truth($matches && utf8::native_to_unicode($i) < 128);
31f05a37
KW
167 is ($ret, $truth, "is${function}_LC( $display_name ) == $truth (C locale)");
168 }
169 }
170
171 if (defined $utf8_locale) {
172 use locale;
173
174 POSIX::setlocale( &POSIX::LC_ALL, $utf8_locale);
175 $ret = truth eval "test_is${function}_LC($i)";
176 if ($@) {
177 fail($@);
178 }
179 else {
180
181 # UTF-8 locale works on full range 0-255
182 my $truth = truth($matches && $i < 256);
183 is ($ret, $truth, "is${function}_LC( $display_name ) == $truth ($utf8_locale)");
31a09021 184 }
8ff203a1 185 }
840f8e92 186 }
c9c05358 187
e2efe419
KW
188 $ret = truth eval "test_is${function}_uni($i)";
189 if ($@) {
190 fail($@);
191 }
192 else {
193 is ($ret, $matches, "is${function}_uni( $display_name ) == $matches");
194 }
c9c05358 195
981746b9 196 if (defined $locale && $name ne 'vertws') {
5f1269ab 197 use locale;
31f05a37 198 POSIX::setlocale( &POSIX::LC_ALL, "C");
981746b9
KW
199 $ret = truth eval "test_is${function}_LC_uvchr('$i')";
200 if ($@) {
201 fail($@);
202 }
203 else {
b5c3a14a 204 my $truth = truth($matches && (utf8::native_to_unicode($i) < 128 || $i > 255));
31f05a37
KW
205 is ($ret, $truth, "is${function}_LC_uvchr( $display_name ) == $truth (C locale)");
206 }
207 }
208
209 if (defined $utf8_locale && $name ne 'vertws') {
210 use locale;
211
212 POSIX::setlocale( &POSIX::LC_ALL, $utf8_locale);
213 $ret = truth eval "test_is${function}_LC_uvchr('$i')";
214 if ($@) {
215 fail($@);
216 }
217 else {
218 my $truth = truth($matches);
219 is ($ret, $truth, "is${function}_LC_uvchr( $display_name ) == $truth ($utf8_locale)");
981746b9
KW
220 }
221 }
222
c9c05358
KW
223 my $char = chr($i);
224 utf8::upgrade($char);
225 $char = quotemeta $char if $char eq '\\' || $char eq "'";
e2efe419
KW
226 $ret = truth eval "test_is${function}_utf8('$char')";
227 if ($@) {
228 fail($@);
229 }
230 else {
231 is ($ret, $matches, "is${function}_utf8( $display_name ) == $matches");
232 }
981746b9 233
8ff203a1 234 if ($name ne 'vertws' && defined $locale) {
5f1269ab 235 use locale;
31f05a37 236 POSIX::setlocale( &POSIX::LC_ALL, "C");
31a09021
KW
237 $ret = truth eval "test_is${function}_LC_utf8('$char')";
238 if ($@) {
239 fail($@);
240 }
241 else {
b5c3a14a 242 my $truth = truth($matches && (utf8::native_to_unicode($i) < 128 || $i > 255));
31f05a37
KW
243 is ($ret, $truth, "is${function}_LC_utf8( $display_name ) == $truth (C locale)");
244 }
31f05a37
KW
245
246 POSIX::setlocale( &POSIX::LC_ALL, $utf8_locale);
247 $ret = truth eval "test_is${function}_LC_utf8('$char')";
248 if ($@) {
249 fail($@);
250 }
251 else {
252 my $truth = truth($matches);
253 is ($ret, $truth, "is${function}_LC_utf8( $display_name ) == $truth ($utf8_locale)");
31a09021 254 }
8ff203a1 255 }
c9c05358
KW
256 }
257}
258
8c67216a
KW
259# Test isOCTAL()
260for my $i (0 .. 256, 0x110000) {
261 my $char_name = charnames::viacode($i) // "No name";
262 my $display_name = sprintf "\\N{U+%02X, %s}", $i, $char_name;
263 my $truth = truth($i >= ord('0') && $i <= ord('7'));
264
265 my $ret = truth test_isOCTAL_A($i);
266 is($ret, $truth, "isOCTAL_A( $display_name ) == $truth");
267
268 $ret = truth test_isOCTAL_L1($i);
269 is($ret, $truth, "isOCTAL_L1( $display_name ) == $truth");
270}
271
2e1414ce
KW
272my %to_properties = (
273 FOLD => 'Case_Folding',
274 LOWER => 'Lowercase_Mapping',
275 TITLE => 'Titlecase_Mapping',
276 UPPER => 'Uppercase_Mapping',
277 );
278
279
280foreach my $name (sort keys %to_properties) {
281 my $property = $to_properties{$name};
282 my ($list_ref, $map_ref, $format, $missing)
283 = prop_invmap($property, );
284 if (! $list_ref || ! $map_ref) {
285 fail("No inversion map found for $property");
286 next;
287 }
ebdbc726 288 if ($format !~ / ^ a l? $ /x) {
2e1414ce
KW
289 fail("Unexpected inversion map format ('$format') found for $property");
290 next;
291 }
292
293 # Include all the Latin1 code points, plus 0x100.
294 my @code_points = (0 .. 256);
295
296 # Then include the next few multi-char folds above those from this
297 # property, and include the next few single folds as well
298 my $above_latins = 0;
299 my $multi_char = 0;
300 for my $i (0 .. @{$list_ref} - 1) {
301 my $range_start = $list_ref->[$i];
302 next if $range_start < 257;
303 if (ref $map_ref->[$i] && $multi_char < 5) {
304 push @code_points, $range_start - 1 if $code_points[-1] != $range_start - 1;
305 push @code_points, $range_start;
306 $multi_char++;
307 }
308 elsif ($above_latins < 5) {
309 push @code_points, $range_start - 1 if $code_points[-1] != $range_start - 1;
310 push @code_points, $range_start;
311 $above_latins++;
312 }
313 last if $above_latins >= 5 && $multi_char >= 5;
314 }
315
316 # And finally one non-Unicode code point.
317 push @code_points, 0x110000; # Above Unicode, no prop should match
318 no warnings 'non_unicode';
319
320 # $j is native; $i unicode.
321 for my $j (@code_points) {
322 my $i = utf8::native_to_unicode($j);
323 my $function = $name;
324
1fdd5e53 325 my $index = Unicode::UCD::search_invlist(\@{$list_ref}, $j);
2e1414ce
KW
326
327 my $ret;
328 my $char_name = charnames::viacode($j) // "No name";
329 my $display_name = sprintf "\\N{U+%02X, %s}", $i, $char_name;
330
331 # Test the base function
332 $ret = eval "test_to${function}($j)";
333 if ($@) {
334 fail($@);
335 }
336 else {
337 my $should_be = ($i < 128 && $map_ref->[$index] != $missing)
338 ? $map_ref->[$index] + $j - $list_ref->[$index]
339 : $j;
340 is ($ret, $should_be, sprintf("to${function}( $display_name ) == 0x%02X", $should_be));
341 }
342
343 # Test _L1
344 if ($name eq 'LOWER') {
345 $ret = eval "test_to${function}_L1($j)";
346 if ($@) {
347 fail($@);
348 }
349 else {
350 my $should_be = ($i < 256 && $map_ref->[$index] != $missing)
351 ? $map_ref->[$index] + $j - $list_ref->[$index]
352 : $j;
353 is ($ret, $should_be, sprintf("to${function}_L1( $display_name ) == 0x%02X", $should_be));
354 }
355 }
356
31f05a37
KW
357 if ($name ne 'TITLE') { # Test _LC; titlecase is not defined in locales.
358 if (defined $locale) {
5f1269ab
KW
359 use locale;
360 POSIX::setlocale( &POSIX::LC_ALL, "C");
baa60164
KW
361 $ret = eval "test_to${function}_LC($j)";
362 if ($@) {
363 fail($@);
364 }
365 else {
366 my $should_be = ($i < 128 && $map_ref->[$index] != $missing)
2e1414ce
KW
367 ? $map_ref->[$index] + $j - $list_ref->[$index]
368 : $j;
baa60164
KW
369 is ($ret, $should_be, sprintf("to${function}_LC( $display_name ) == 0x%02X (C locale)", $should_be));
370 }
31f05a37
KW
371 }
372
373 if (defined $utf8_locale) {
374 use locale;
375
376 SKIP: {
377 skip "to${property}_LC does not work for LATIN SMALL LETTER SHARP S", 1
378 if $j == 0xDF && ($name eq 'FOLD' || $name eq 'UPPER');
379
380 POSIX::setlocale( &POSIX::LC_ALL, $utf8_locale);
381 $ret = eval "test_to${function}_LC($j)";
382 if ($@) {
383 fail($@);
384 }
385 else {
386 my $should_be = ($i < 256
387 && ! ref $map_ref->[$index]
388 && $map_ref->[$index] != $missing
389 )
390 ? $map_ref->[$index] + $j - $list_ref->[$index]
391 : $j;
392 is ($ret, $should_be, sprintf("to${function}_LC( $display_name ) == 0x%02X ($utf8_locale)", $should_be));
393 }
394 }
2e1414ce
KW
395 }
396 }
397
398 # The _uni and _utf8 functions return both the ordinal of the first
399 # code point of the result, and the result in utf8. The .xs tests
400 # return these in an array, in [0] and [1] respectively, with [2] the
401 # length of the utf8 in bytes.
402 my $utf8_should_be = "";
403 my $first_ord_should_be;
404 if (ref $map_ref->[$index]) { # A multi-char result
405 for my $j (0 .. @{$map_ref->[$index]} - 1) {
406 $utf8_should_be .= chr $map_ref->[$index][$j];
407 }
408
409 $first_ord_should_be = $map_ref->[$index][0];
410 }
411 else { # A single-char result
412 $first_ord_should_be = ($map_ref->[$index] != $missing)
413 ? $map_ref->[$index] + $j - $list_ref->[$index]
414 : $j;
415 $utf8_should_be = chr $first_ord_should_be;
416 }
417 utf8::upgrade($utf8_should_be);
418
419 # Test _uni
420 my $s;
421 my $len;
422 $ret = eval "test_to${function}_uni($j)";
423 if ($@) {
424 fail($@);
425 }
426 else {
427 is ($ret->[0], $first_ord_should_be, sprintf("to${function}_uni( $display_name ) == 0x%02X", $first_ord_should_be));
428 is ($ret->[1], $utf8_should_be, sprintf("utf8 of to${function}_uni( $display_name )"));
429 use bytes;
430 is ($ret->[2], length $utf8_should_be, sprintf("number of bytes in utf8 of to${function}_uni( $display_name )"));
431 }
432
433 # Test _utf8
434 my $char = chr($j);
435 utf8::upgrade($char);
436 $char = quotemeta $char if $char eq '\\' || $char eq "'";
437 $ret = eval "test_to${function}_utf8('$char')";
438 if ($@) {
439 fail($@);
440 }
441 else {
442 is ($ret->[0], $first_ord_should_be, sprintf("to${function}_utf8( $display_name ) == 0x%02X", $first_ord_should_be));
443 is ($ret->[1], $utf8_should_be, sprintf("utf8 of to${function}_utf8( $display_name )"));
444 use bytes;
445 is ($ret->[2], length $utf8_should_be, sprintf("number of bytes in utf8 of to${function}_uni( $display_name )"));
446 }
447
448 }
449}
450
5073ffbd 451# This is primarily to make sure that no non-Unicode warnings get generated
5c335666
KW
452unless (is(scalar @warnings, 0, "No warnings were generated")) {
453 diag @warnings;
454}
5073ffbd 455
bdd8600f 456done_testing;