This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
APItest/t/handy.t: White-space only
[perl5.git] / ext / XS-APItest / t / handy.t
CommitLineData
bdd8600f
KW
1#!perl -w
2
31f05a37 3BEGIN {
629eeaee
KW
4 require 'loc_tools.pl'; # Contains locales_enabled() and
5 # find_utf8_ctype_locale()
31f05a37
KW
6}
7
bdd8600f
KW
8use strict;
9use Test::More;
569f7fc5 10use Config;
bdd8600f
KW
11
12use XS::APItest;
13
2e1414ce 14use Unicode::UCD qw(prop_invlist prop_invmap);
c9c05358 15
01a11ab9
KW
16my $tab = " " x 4; # Indent subsidiary tests this much
17
5073ffbd 18sub truth($) { # Converts values so is() works
e2efe419 19 return (shift) ? 1 : 0;
c9c05358
KW
20}
21
01a11ab9 22my $base_locale;
31f05a37 23my $utf8_locale;
629eeaee 24if(locales_enabled('LC_ALL')) {
569f7fc5 25 require POSIX;
01a11ab9
KW
26 $base_locale = POSIX::setlocale( &POSIX::LC_ALL, "C");
27 if (defined $base_locale && $base_locale eq 'C') {
5f1269ab 28 use locale; # make \w work right in non-ASCII lands
569f7fc5
JR
29
30 # Some locale implementations don't have the 128-255 characters all
31 # mean nothing. Skip the locale tests in that situation
32 for my $i (128 .. 255) {
b5c3a14a 33 if (chr(utf8::unicode_to_native($i)) =~ /[[:print:]]/) {
01a11ab9 34 undef $base_locale;
569f7fc5
JR
35 last;
36 }
981746b9 37 }
31f05a37 38
01a11ab9 39 $utf8_locale = find_utf8_ctype_locale() if $base_locale;
981746b9
KW
40 }
41}
42
01a11ab9
KW
43sub get_display_locale_or_skip($$) {
44
45 # Helper function intimately tied to its callers. It knows the loop
46 # iterates with a locale of "", meaning don't use locale; $base_locale
47 # meaning to use a non-UTF-8 locale; and $utf8_locale.
48 #
49 # It checks to see if the current test should be skipped or executed,
50 # returning an empty list for the former, and for the latter:
51 # ( 'locale display name',
52 # bool of is this a UTF-8 locale )
53 #
54 # The display name is the empty string if not using locale. Functions
55 # with _LC in their name are skipped unless in locale, and functions
56 # without _LC are executed only outside locale. However, if no locales at
57 # all are on the system, the _LC functions are executed outside locale.
58
59 my ($locale, $suffix) = @_;
60
61 # The test should be skipped if the input is for a non-existent locale
62 return unless defined $locale;
63
64 # Here the input is defined, either a locale name or "". If the test is
65 # for not using locales, we want to do the test for non-LC functions,
66 # and skip it for LC ones (except if there are no locales on the system,
67 # we do it for LC ones as if they weren't LC).
68 if ($locale eq "") {
69 return ("", 0) if $suffix !~ /LC/ || ! defined $base_locale;
70 return;
71 }
72
73 # Here the input is for a real locale. We don't test the non-LC functions
74 # for locales.
75 return if $suffix !~ /LC/;
76
77 # Here is for a LC function and a real locale. The base locale is not
78 # UTF-8.
79 return (" ($locale locale)", 0) if $locale eq $base_locale;
80
81 # The only other possibility is that we have a UTF-8 locale
82 return (" ($locale)", 1);
83}
84
e2efe419
KW
85my %properties = (
86 # name => Lookup-property name
87 alnum => 'Word',
981746b9 88 wordchar => 'Word',
15861f94 89 alphanumeric => 'Alnum',
ebdbc726 90 alpha => 'XPosixAlpha',
e2efe419
KW
91 ascii => 'ASCII',
92 blank => 'Blank',
93 cntrl => 'Control',
94 digit => 'Digit',
95 graph => 'Graph',
96 idfirst => '_Perl_IDStart',
eba68aa0 97 idcont => '_Perl_IDCont',
ebdbc726 98 lower => 'XPosixLower',
e2efe419
KW
99 print => 'Print',
100 psxspc => 'XPosixSpace',
101 punct => 'XPosixPunct',
102 quotemeta => '_Perl_Quotemeta',
103 space => 'XPerlSpace',
840f8e92 104 vertws => 'VertSpace',
ebdbc726 105 upper => 'XPosixUpper',
e2efe419
KW
106 xdigit => 'XDigit',
107 );
108
5073ffbd
KW
109my @warnings;
110local $SIG{__WARN__} = sub { push @warnings, @_ };
111
c9c05358 112use charnames ();
8dc91396
KW
113foreach my $name (sort keys %properties, 'octal') {
114 my @invlist;
115 if ($name eq 'octal') {
116 # Hand-roll an inversion list with 0-7 in it and nothing else.
117 push @invlist, ord "0", ord "8";
118 }
119 else {
9a009363
KW
120 my $property = $properties{$name};
121 @invlist = prop_invlist($property, '_perl_core_internal_ok');
122 if (! @invlist) {
123
124 # An empty return could mean an unknown property, or merely that
125 # it is empty. Call in scalar context to differentiate
126 if (! prop_invlist($property, '_perl_core_internal_ok')) {
127 fail("No inversion list found for $property");
128 next;
129 }
ebdbc726 130 }
e2efe419
KW
131 }
132
133 # Include all the Latin1 code points, plus 0x100.
134 my @code_points = (0 .. 256);
135
136 # Then include the next few boundaries above those from this property
137 my $above_latins = 0;
138 foreach my $range_start (@invlist) {
139 next if $range_start < 257;
140 push @code_points, $range_start - 1, $range_start;
141 $above_latins++;
142 last if $above_latins > 5;
143 }
144
eba68aa0
KW
145 # This makes sure we are using the Perl definition of idfirst and idcont,
146 # and not the Unicode. There are a few differences.
147 push @code_points, ord "\N{ESTIMATED SYMBOL}" if $name =~ /^id(first|cont)/;
2f4622f0
KW
148 if ($name eq "idcont") { # And some that are continuation but not start
149 push @code_points, ord("\N{GREEK ANO TELEIA}"),
150 ord("\N{COMBINING GRAVE ACCENT}");
151 }
f91dcd13 152
e2efe419
KW
153 # And finally one non-Unicode code point.
154 push @code_points, 0x110000; # Above Unicode, no prop should match
4f6ef7cc 155 no warnings 'non_unicode';
e2efe419 156
2e1414ce
KW
157 for my $j (@code_points) {
158 my $i = utf8::native_to_unicode($j);
c9c05358 159 my $function = uc($name);
c9c05358 160
01a11ab9
KW
161 is (@warnings, 0, "Got no unexpected warnings in previous iteration")
162 or diag("@warnings");
163 undef @warnings;
164
1fdd5e53 165 my $matches = Unicode::UCD::search_invlist(\@invlist, $i);
e2efe419
KW
166 if (! defined $matches) {
167 $matches = 0;
168 }
169 else {
170 $matches = truth(! ($matches % 2));
171 }
5073ffbd 172
e2efe419
KW
173 my $ret;
174 my $char_name = charnames::viacode($i) // "No name";
94a62bdc 175 my $display_name = sprintf "\\x{%02X, %s}", $i, $char_name;
01a11ab9 176 my $display_call = "is${function}( $display_name )";
e2efe419
KW
177
178 if ($name eq 'quotemeta') { # There is only one macro for this, and is
179 # defined only for Latin1 range
180 $ret = truth eval "test_is${function}($i)";
01a11ab9 181 if (is ($@, "", "$display_call didn't give error")) {
e2efe419 182 my $truth = truth($matches && $i < 256);
01a11ab9 183 is ($ret, $truth, "${tab}And returns $truth");
e2efe419 184 }
c9c05358
KW
185 next;
186 }
981746b9 187
01a11ab9 188 foreach my $suffix (qw(_A _L1 _LC _uni _LC_uvchr _utf8 _LC_utf8)) {
31f05a37 189
01a11ab9
KW
190 # Not all possible macros have been defined
191 if ($name eq 'vertws') {
31f05a37 192
01a11ab9
KW
193 # vertws is always all of Unicode
194 next if $suffix ne "_uni" && $suffix ne "_utf8";
981746b9 195 }
01a11ab9 196 elsif ($name eq 'alnum') {
981746b9 197
01a11ab9
KW
198 # ALNUM_A and ALNUM_L1 are not defined as they were added
199 # later, after WORDCHAR was created to be a clearer synonym
200 # for ALNUM
201 next if $suffix eq '_A' || $suffix eq '_L1';
31a09021 202 }
8dc91396
KW
203 elsif ($name eq 'octal') {
204 next if $suffix ne '_A' && $suffix ne '_L1';
205 }
a9aff56e 206
01a11ab9
KW
207 foreach my $locale ("", $base_locale, $utf8_locale) {
208
209 my ($display_locale, $locale_is_utf8)
210 = get_display_locale_or_skip($locale, $suffix);
211 next unless defined $display_locale;
212
213 use if $locale, "locale";
214 POSIX::setlocale( &POSIX::LC_ALL, $locale) if $locale;
215
216 if ($suffix !~ /utf8/) { # _utf8 has to handled specially
217 my $display_call
218 = "is${function}$suffix( $display_name )$display_locale";
219 $ret = truth eval "test_is${function}$suffix($i)";
220 if (is ($@, "", "$display_call didn't give error")) {
221 my $truth = $matches;
222 if ($truth) {
223
224 # The single byte functions are false for
225 # above-Latin1
226 if ($i >= 256) {
227 $truth = 0 if $suffix =~ / ^ _A | _L [1C] $ /x;
228 }
229 elsif (utf8::native_to_unicode($i) >= 128) {
230
231 # The _A functions are false
232 # for non-ASCII. So are _LC functions on a
233 # non-UTF-8 locale
234 $truth = 0 if $suffix eq "_A"
235 || ( $suffix =~ /LC/
236 && ! $locale_is_utf8);
237 }
238 }
239
240 is ($ret, $truth, "${tab}And correctly returns $truth");
241 }
242 }
243 else { # _utf8 suffix
244 my $char = chr($i);
245 utf8::upgrade($char);
246 $char = quotemeta $char if $char eq '\\' || $char eq "'";
247 my $truth;
248 if ( $suffix =~ /LC/
249 && ! $locale_is_utf8
250 && $i < 256
251 && utf8::native_to_unicode($i) >= 128)
252 { # The C-locale _LC function returns FALSE for Latin1
253 # above ASCII
254 $truth = 0;
255 }
256 else {
257 $truth = $matches;
258 }
31f05a37 259
01a11ab9
KW
260 my $display_call = "is${function}$suffix("
261 . " $display_name )$display_locale";
262 $ret = truth eval "test_is${function}$suffix('$char')";
263 if (is ($@, "", "$display_call didn't give error")) {
264 is ($ret, $truth,
265 "${tab}And correctly returned $truth");
266 }
267 }
31a09021 268 }
8ff203a1 269 }
c9c05358
KW
270 }
271}
272
2e1414ce 273my %to_properties = (
01a11ab9 274 FOLD => 'Case_Folding',
2e1414ce
KW
275 LOWER => 'Lowercase_Mapping',
276 TITLE => 'Titlecase_Mapping',
277 UPPER => 'Uppercase_Mapping',
278 );
279
280
281foreach my $name (sort keys %to_properties) {
282 my $property = $to_properties{$name};
283 my ($list_ref, $map_ref, $format, $missing)
284 = prop_invmap($property, );
285 if (! $list_ref || ! $map_ref) {
286 fail("No inversion map found for $property");
287 next;
288 }
ebdbc726 289 if ($format !~ / ^ a l? $ /x) {
2e1414ce
KW
290 fail("Unexpected inversion map format ('$format') found for $property");
291 next;
292 }
293
294 # Include all the Latin1 code points, plus 0x100.
295 my @code_points = (0 .. 256);
296
297 # Then include the next few multi-char folds above those from this
298 # property, and include the next few single folds as well
299 my $above_latins = 0;
300 my $multi_char = 0;
301 for my $i (0 .. @{$list_ref} - 1) {
302 my $range_start = $list_ref->[$i];
303 next if $range_start < 257;
304 if (ref $map_ref->[$i] && $multi_char < 5) {
01a11ab9
KW
305 push @code_points, $range_start - 1
306 if $code_points[-1] != $range_start - 1;
2e1414ce
KW
307 push @code_points, $range_start;
308 $multi_char++;
309 }
310 elsif ($above_latins < 5) {
01a11ab9
KW
311 push @code_points, $range_start - 1
312 if $code_points[-1] != $range_start - 1;
2e1414ce
KW
313 push @code_points, $range_start;
314 $above_latins++;
315 }
316 last if $above_latins >= 5 && $multi_char >= 5;
317 }
318
319 # And finally one non-Unicode code point.
320 push @code_points, 0x110000; # Above Unicode, no prop should match
321 no warnings 'non_unicode';
322
323 # $j is native; $i unicode.
324 for my $j (@code_points) {
325 my $i = utf8::native_to_unicode($j);
326 my $function = $name;
327
1fdd5e53 328 my $index = Unicode::UCD::search_invlist(\@{$list_ref}, $j);
2e1414ce
KW
329
330 my $ret;
331 my $char_name = charnames::viacode($j) // "No name";
332 my $display_name = sprintf "\\N{U+%02X, %s}", $i, $char_name;
333
01a11ab9 334 foreach my $suffix ("", "_L1", "_LC") {
2e1414ce 335
01a11ab9
KW
336 # This is the only macro defined for L1
337 next if $suffix eq "_L1" && $function ne "LOWER";
2e1414ce 338
01a11ab9
KW
339 SKIP:
340 foreach my $locale ("", $base_locale, $utf8_locale) {
31f05a37 341
01a11ab9
KW
342 # titlecase is not defined in locales.
343 next if $name eq 'TITLE' && $suffix eq "_LC";
31f05a37 344
01a11ab9
KW
345 my ($display_locale, $locale_is_utf8)
346 = get_display_locale_or_skip($locale, $suffix);
347 next unless defined $display_locale;
31f05a37 348
01a11ab9
KW
349 skip("to${name}_LC does not work for LATIN SMALL LETTER SHARP S"
350 . "$display_locale", 1)
351 if $i == 0xDF && $name =~ / FOLD | UPPER /x
352 && $suffix eq "_LC" && $locale_is_utf8;
353
354 use if $locale, "locale";
355 POSIX::setlocale( &POSIX::LC_ALL, $locale) if $locale;
356
357 my $display_call = "to${function}$suffix("
358 . " $display_name )$display_locale";
359 $ret = eval "test_to${function}$suffix($j)";
360 if (is ($@, "", "$display_call didn't give error")) {
361 my $should_be;
362 if ($i > 255) {
363 $should_be = $j;
364 }
365 elsif ( $i > 127
366 && ( $suffix eq ""
367 || ($suffix eq "_LC" && ! $locale_is_utf8)))
368 {
369 $should_be = $j;
370 }
371 elsif ($map_ref->[$index] != $missing) {
372 $should_be = $map_ref->[$index] + $j - $list_ref->[$index]
31f05a37
KW
373 }
374 else {
01a11ab9 375 $should_be = $j;
31f05a37 376 }
01a11ab9
KW
377
378 is ($ret, $should_be,
379 sprintf("${tab}And correctly returned 0x%02X",
380 $should_be));
31f05a37 381 }
2e1414ce
KW
382 }
383 }
384
385 # The _uni and _utf8 functions return both the ordinal of the first
386 # code point of the result, and the result in utf8. The .xs tests
387 # return these in an array, in [0] and [1] respectively, with [2] the
388 # length of the utf8 in bytes.
389 my $utf8_should_be = "";
390 my $first_ord_should_be;
391 if (ref $map_ref->[$index]) { # A multi-char result
392 for my $j (0 .. @{$map_ref->[$index]} - 1) {
393 $utf8_should_be .= chr $map_ref->[$index][$j];
394 }
395
396 $first_ord_should_be = $map_ref->[$index][0];
397 }
398 else { # A single-char result
399 $first_ord_should_be = ($map_ref->[$index] != $missing)
01a11ab9
KW
400 ? $map_ref->[$index] + $j
401 - $list_ref->[$index]
2e1414ce
KW
402 : $j;
403 $utf8_should_be = chr $first_ord_should_be;
404 }
405 utf8::upgrade($utf8_should_be);
406
407 # Test _uni
408 my $s;
409 my $len;
01a11ab9 410 my $display_call = "to${function}_uni( $display_name )";
2e1414ce 411 $ret = eval "test_to${function}_uni($j)";
01a11ab9
KW
412 if (is ($@, "", "$display_call didn't give error")) {
413 is ($ret->[0], $first_ord_should_be,
414 sprintf("${tab}And correctly returned 0x%02X",
415 $first_ord_should_be));
416 is ($ret->[1], $utf8_should_be, "${tab}Got correct utf8");
2e1414ce 417 use bytes;
01a11ab9
KW
418 is ($ret->[2], length $utf8_should_be,
419 "${tab}Got correct number of bytes for utf8 length");
2e1414ce
KW
420 }
421
422 # Test _utf8
423 my $char = chr($j);
424 utf8::upgrade($char);
425 $char = quotemeta $char if $char eq '\\' || $char eq "'";
01a11ab9
KW
426 {
427 my $display_call = "to${function}_utf8($display_name )";
428 $ret = eval "test_to${function}_utf8('$char')";
429 if (is ($@, "", "$display_call didn't give error")) {
430 is ($ret->[0], $first_ord_should_be,
431 sprintf("${tab}And correctly returned 0x%02X",
432 $first_ord_should_be));
433 is ($ret->[1], $utf8_should_be, "${tab}Got correct utf8");
434 use bytes;
435 is ($ret->[2], length $utf8_should_be,
436 "${tab}Got correct number of bytes for utf8 length");
437 }
2e1414ce 438 }
2e1414ce
KW
439 }
440}
441
5073ffbd 442# This is primarily to make sure that no non-Unicode warnings get generated
01a11ab9
KW
443is(scalar @warnings, 0, "No unexpected warnings were generated in the tests")
444 or diag @warnings;
5073ffbd 445
bdd8600f 446done_testing;