This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
APItest/t/handy.t: Skip locale tests when not in locale
[perl5.git] / ext / XS-APItest / t / handy.t
... / ...
CommitLineData
1#!perl -w
2
3BEGIN {
4 require 'loc_tools.pl'; # Contains locales_enabled() and
5 # find_utf8_ctype_locale()
6}
7
8use strict;
9use Test::More;
10use Config;
11
12use XS::APItest;
13
# Descriptions of subsidiary tests are prefixed with this string so that they
# stand out from the primary test they belong to.
my $tab = " " x 4;

use Unicode::UCD qw(search_invlist prop_invmap prop_invlist);

# Inversion map of the Name_Alias property.  get_charname() searches the list
# half and reads the corresponding entry of the map half.
my ($charname_list, $charname_map, $format, $default)
                                            = prop_invmap("Name Alias");
18
sub get_charname($) {

    # Returns a printable name for the code point passed in: the first
    # abbreviation alias recorded for it in the Name_Alias inversion map when
    # there is one, otherwise the full character name, or "No name" when
    # charnames knows no name for it.

    my ($code_point) = @_;

    # Prefer an abbreviation for the code point name, if there is one
    my $slot = search_invlist(\@{$charname_list}, $code_point);
    if (defined $slot) {
        my $aliases = $charname_map->[$slot];

        # Only entries that are references are scanned for aliases
        if (ref $aliases) {
            my $marker = qr/: abbreviation/;
            foreach my $alias (@$aliases) {
                next unless $alias =~ $marker;

                # First abbreviation wins; strip the marker from it
                return $alias =~ s/$marker//r;
            }
        }
    }

    # No abbreviation, so fall back to the full name
    use charnames ();
    my $full_name = charnames::viacode($code_point);
    return $full_name // "No name";
}
39
sub truth($) {

    # Normalizes any value to exactly 1 or 0, so that boolean results can be
    # compared with is().
    my ($value) = @_;

    return 1 if $value;
    return 0;
}
43
# $base_locale ends up as "C" when the locale tests can be run against the C
# locale, and $utf8_locale as whatever find_utf8_ctype_locale() returns.
# Either is left undefined when it cannot be used.
my ($base_locale, $utf8_locale);

if (locales_enabled('LC_ALL')) {
    require POSIX;
    $base_locale = POSIX::setlocale( &POSIX::LC_ALL, "C");

    if (defined $base_locale && $base_locale eq 'C') {
        use locale;     # make \w work right in non-ASCII lands

        # Some locale implementations don't have the 128-255 characters all
        # mean nothing.  Look for any of them being considered printable.
        my $upper_half_is_printable = 0;
        for my $code (128 .. 255) {
            next unless chr(utf8::unicode_to_native($code)) =~ /[[:print:]]/;
            $upper_half_is_printable = 1;
            last;
        }

        if ($upper_half_is_printable) {

            # Skip the locale tests in that situation
            undef $base_locale;
        }
        else {
            $utf8_locale = find_utf8_ctype_locale();
        }
    }
}
64
sub get_display_locale_or_skip($$) {

    # Helper tightly coupled to the loops that call it.  Those loops iterate
    # over three locale values: "" (don't use locale), $base_locale (a
    # non-UTF-8 locale), and $utf8_locale; the latter two are undefined when
    # unavailable.
    #
    # Parameters:
    #   $locale   the loop's current locale value
    #   $suffix   the suffix of the macro under test, e.g. "_A" or "_LC_utf8"
    #
    # Returns an empty list when the combination should be skipped, and
    # otherwise
    #       ( 'locale display name',
    #         bool of is this a UTF-8 locale )
    #
    # The display name is the empty string when not using locale.  Macros
    # with LC in their suffix are run only under a real locale; macros without
    # it are run only outside locale.

    my ($locale, $suffix) = @_;

    # An undefined locale means it doesn't exist on this platform: skip
    return unless defined $locale;

    my $is_LC_function = $suffix =~ /LC/;

    # Not using locale: run the non-LC functions, skip the LC ones
    if ($locale eq "") {
        if ($is_LC_function) {
            return;
        }
        return ("", 0);
    }

    # A real locale: the non-LC functions aren't tested under it
    if (! $is_LC_function) {
        return;
    }

    # An LC function and a real locale.  The base locale is not UTF-8.
    if ($locale eq $base_locale) {
        return (" ($locale locale)", 0);
    }

    # The only other possibility is the UTF-8 locale
    return (" ($locale)", 1);
}
104
sub try_malforming($$$)
{
    # Decides whether the malformed-UTF-8 variant of a test is worth running.
    # The .xs code creates a malformation by pretending the string is shorter
    # than it really is, which isn't possible for every character, and for
    # some macros the implementation is known not to look far enough to
    # notice one.
    #
    # Parameters:
    #   $i              the Unicode code point under test
    #   $function       upper-cased macro base name, e.g. "ALPHA", "CNTRL"
    #   $using_locale   true when an _LC variant is being tested
    #
    # Returns 1 if the malformed test should be tried, else 0.

    my ($i, $function, $using_locale) = @_;

    # Code points below this are single bytes, which can't be malformed.
    # The boundary depends on whether the platform is ASCII.
    my $first_multi_byte = (ord("A") == 65) ? 128 : 160;

    if ($i < $first_multi_byte) {
        return 0;
    }
    elsif ($function eq "ASCII") {

        # ASCII never needs to look beyond the first byte.
        return 0;
    }
    elsif ($function eq "CNTRL" && $i > 255) {

        # No controls above 255, so the code doesn't look at those
        return 0;
    }
    elsif ($function =~ /X?DIGIT/ && $i < 256 && ! $using_locale) {

        # No non-ASCII digits below 256, except if using locales.
        return 0;
    }

    return 1;
}
130
# Maps the lower-cased base name of each classification macro to the name of
# the property whose inversion list says which code points should match.
my %properties = (
    #   name            Lookup-property name
        alnum        => 'Word',
        wordchar     => 'Word',
        alphanumeric => 'Alnum',
        alpha        => 'XPosixAlpha',
        ascii        => 'ASCII',
        blank        => 'Blank',
        cntrl        => 'Control',
        digit        => 'Digit',
        graph        => 'Graph',
        idfirst      => '_Perl_IDStart',
        idcont       => '_Perl_IDCont',
        lower        => 'XPosixLower',
        print        => 'Print',
        psxspc       => 'XPosixSpace',
        punct        => 'XPosixPunct',
        quotemeta    => '_Perl_Quotemeta',
        space        => 'XPerlSpace',
        vertws       => 'VertSpace',
        upper        => 'XPosixUpper',
        xdigit       => 'XDigit',
);

# Records which functions have already been called in their deprecated form
my %seen;

# Every warning raised while the tests run is collected here rather than
# printed, so that the tests can examine them.
my @warnings;
local $SIG{__WARN__} = sub { push @warnings, @_ };

# Numeric code handed to the XS test function for each kind of UTF-8 call.
# A positive code makes the test expect an error; a negative one makes it
# check for a deprecation warning.
my %utf8_param_code = (
        "_safe"              =>  0,
        "_safe, malformed"   =>  1,
        "deprecated unsafe"  => -1,
        "deprecated mathoms" => -2,
);
165
# Test the isFOO() family of macros.  For each property in %properties (plus
# the hand-rolled 'octal'), a set of interesting code points is built, and for
# each of those every defined suffix variant of the macro is called through
# its test_isFOO...() wrapper, in each applicable locale.  The result is
# compared against what the property's inversion list says.
foreach my $name (sort keys %properties, 'octal') {
    my @invlist;
    if ($name eq 'octal') {

        # Hand-roll an inversion list with 0-7 in it and nothing else.
        push @invlist, ord("0"), ord("8");
    }
    else {
        my $property = $properties{$name};
        @invlist = prop_invlist($property, '_perl_core_internal_ok');
        if (! @invlist) {

            # An empty return could mean an unknown property, or merely that
            # it is empty.  Call in scalar context to differentiate
            if (! prop_invlist($property, '_perl_core_internal_ok')) {
                fail("No inversion list found for $property");
                next;
            }
        }
    }

    # Include all the Latin1 code points, plus 0x100.
    my @code_points = (0 .. 256);

    # Then include the next few boundaries above those from this property
    my $above_latins = 0;
    foreach my $range_start (@invlist) {
        next if $range_start < 257;
        push @code_points, $range_start - 1, $range_start;
        $above_latins++;
        last if $above_latins > 5;
    }

    # This makes sure we are using the Perl definition of idfirst and idcont,
    # and not the Unicode.  There are a few differences.
    push @code_points, ord "\N{ESTIMATED SYMBOL}" if $name =~ /^id(first|cont)/;
    if ($name eq "idcont") {    # And some that are continuation but not start
        push @code_points, ord("\N{GREEK ANO TELEIA}"),
                           ord("\N{COMBINING GRAVE ACCENT}");
    }

    # And finally one non-Unicode code point.
    push @code_points, 0x110000;    # Above Unicode, no prop should match
    no warnings 'non_unicode';

    # $j is native; $i unicode.
    for my $j (@code_points) {
        my $i = utf8::native_to_unicode($j);
        my $function = uc($name);

        is (@warnings, 0, "Got no unexpected warnings in previous iteration")
           or diag("@warnings");
        undef @warnings;

        # An even index into the inversion list means $j is in a range that
        # matches the property; undef means $j is below the first range.
        my $matches = search_invlist(\@invlist, $j);
        if (! defined $matches) {
            $matches = 0;
        }
        else {
            $matches = truth(! ($matches % 2));
        }

        my $ret;
        my $char_name = get_charname($j);
        my $display_name = sprintf "\\x{%02X, %s}", $j, $char_name;

        foreach my $suffix ("", "_A", "_L1", "_LC", "_uni", "_uvchr",
                            "_LC_uvchr", "_utf8", "_LC_utf8")
        {

            # Not all possible macros have been defined
            if ($name eq 'vertws') {

                # vertws is always all of Unicode
                next if $suffix !~ / ^ _ ( uni | uvchr | utf8 ) $ /x;
            }
            elsif ($name eq 'alnum') {

                # ALNUM_A, ALNUM_L1, and ALNUM_uvchr are not defined as these
                # suffixes were added later, after WORDCHAR was created to be
                # a clearer synonym for ALNUM
                next if    $suffix eq '_A'
                        || $suffix eq '_L1'
                        || $suffix eq '_uvchr';
            }
            elsif ($name eq 'octal') {
                next if $suffix ne "" && $suffix ne '_A' && $suffix ne '_L1';
            }
            elsif ($name eq 'quotemeta') {

                # There is only one macro for this, and is defined only for
                # Latin1 range
                next if $suffix ne ""
            }

            foreach my $locale ("", $base_locale, $utf8_locale) {

                my ($display_locale, $locale_is_utf8)
                                = get_display_locale_or_skip($locale, $suffix);
                next unless defined $display_locale;

                # NOTE(review): 'use' is processed at compile time, before
                # $locale has been assigned a value, so this statement looks
                # like it never actually loads the locale pragma.  It is kept
                # as-is; the setlocale() call below is what changes the
                # locale at run time.  Confirm the intent before relying on
                # it.
                use if $locale, "locale";
                POSIX::setlocale( &POSIX::LC_ALL, $locale) if $locale;

                if ($suffix !~ /utf8/) {  # _utf8 has to be handled specially
                    my $display_call
                       = "is${function}$suffix( $display_name )$display_locale";
                    $ret = truth eval "test_is${function}$suffix($j)";
                    if (is ($@, "", "$display_call didn't give error")) {
                        my $truth = $matches;
                        if ($truth) {

                            # The single byte functions are false for
                            # above-Latin1
                            if ($j >= 256) {
                                $truth = 0
                                    if $suffix=~ / ^ ( _A | _L [1C] )? $ /x;
                            }
                            elsif (   $i >= 128
                                   && $name ne 'quotemeta')
                            {

                                # The no-suffix and _A functions are false
                                # for non-ASCII.  So are  _LC functions on a
                                # non-UTF-8 locale
                                $truth = 0 if    $suffix eq "_A"
                                              || $suffix eq ""
                                              || (     $suffix =~ /LC/
                                                  && ! $locale_is_utf8);
                            }
                        }

                        is ($ret, $truth, "${tab}And correctly returns $truth");
                    }
                }
                else {  # _utf8 suffix

                    # The character is passed to the XS function as a
                    # single-quoted literal inside the eval'd string, so the
                    # two characters special there need escaping.
                    my $char = chr($j);
                    utf8::upgrade($char);
                    $char = quotemeta $char if $char eq '\\' || $char eq "'";
                    my $truth;
                    if (     $suffix =~ /LC/
                        && ! $locale_is_utf8
                        &&   $j < 256
                        &&   $i >= 128)
                    {   # The C-locale _LC function returns FALSE for Latin1
                        # above ASCII
                        $truth = 0;
                    }
                    else {
                        $truth = $matches;
                    }

                    foreach my $utf8_param("_safe",
                                           "_safe, malformed",
                                           "deprecated unsafe"
                                          )
                    {
                        my $utf8_param_code = $utf8_param_code{$utf8_param};
                        my $expect_error = $utf8_param_code > 0;
                        next if      $expect_error
                                && ! try_malforming($i, $function,
                                                    $suffix =~ /LC/);

                        my $display_call = "is${function}$suffix( $display_name"
                                         . ", $utf8_param )$display_locale";
                        $ret = truth eval "test_is${function}$suffix('$char',"
                                        . " $utf8_param_code)";
                        if ($expect_error) {
                            isnt ($@, "",
                                    "expected and got error in $display_call");
                            like($@, qr/Malformed UTF-8 character/,
                                "${tab}And got expected message");
                            if (is (@warnings, 1,
                                           "${tab}Got a single warning besides"))
                            {
                                like($warnings[0],
                                     qr/Malformed UTF-8 character.*short/,
                                     "${tab}Got expected warning");
                            }
                            else {
                                diag("@warnings");
                            }
                            undef @warnings;
                        }
                        elsif (is ($@, "", "$display_call didn't give error")) {
                            is ($ret, $truth,
                                "${tab}And correctly returned $truth");
                            if ($utf8_param_code < 0) {

                                # A deprecated form is expected to warn, but
                                # only the first time each function is called
                                my $warnings_ok;
                                my $unique_function = "is" . $function . $suffix;
                                if (! $seen{$unique_function}++) {
                                    $warnings_ok = is(@warnings, 1,
                                        "${tab}This is first call to"
                                      . " $unique_function; Got a single"
                                      . " warning");
                                    if ($warnings_ok) {
                                        $warnings_ok = like($warnings[0],
                qr/starting in Perl .* will require an additional parameter/,
                                            "${tab}The warning was the expected"
                                          . " deprecation one");
                                    }
                                }
                                else {
                                    $warnings_ok = is(@warnings, 0,
                                        "${tab}This subsequent call to"
                                      . " $unique_function did not warn");
                                }
                                $warnings_ok or diag("@warnings");
                                undef @warnings;
                            }
                        }
                    }
                }
            }
        }
    }
}
381
# Maps the upper-cased base name of each case-changing macro to the name of
# the property whose inversion map gives the expected results.
my %to_properties = (
                FOLD  => 'Case_Folding',
                LOWER => 'Lowercase_Mapping',
                TITLE => 'Titlecase_Mapping',
                UPPER => 'Uppercase_Mapping',
            );

# Test the toFOO() family of macros.  For each mapping in %to_properties, a
# set of interesting code points is built, and for each of those the suffix
# variants of the macro are called through their test_toFOO...() wrappers and
# compared against what the property's inversion map says.
foreach my $name (sort keys %to_properties) {
    my $property = $to_properties{$name};
    my ($list_ref, $map_ref, $format, $missing) = prop_invmap($property);
    if (! $list_ref || ! $map_ref) {
        fail("No inversion map found for $property");
        next;
    }
    if ($format !~ / ^ a l? $ /x) {
        fail("Unexpected inversion map format ('$format') found for $property");
        next;
    }

    # Include all the Latin1 code points, plus 0x100.
    my @code_points = (0 .. 256);

    # Then include the next few multi-char folds above those from this
    # property, and include the next few single folds as well
    my $above_latins = 0;
    my $multi_char = 0;
    for my $i (0 .. $#{$list_ref}) {
        my $range_start = $list_ref->[$i];
        next if $range_start < 257;
        if (ref $map_ref->[$i] && $multi_char < 5)  {
            push @code_points, $range_start - 1
                                        if $code_points[-1] != $range_start - 1;
            push @code_points, $range_start;
            $multi_char++;
        }
        elsif ($above_latins < 5) {
            push @code_points, $range_start - 1
                                        if $code_points[-1] != $range_start - 1;
            push @code_points, $range_start;
            $above_latins++;
        }
        last if $above_latins >= 5 && $multi_char >= 5;
    }

    # And finally one non-Unicode code point.
    push @code_points, 0x110000;    # Above Unicode, no prop should match
    no warnings 'non_unicode';

    # $j is native; $i unicode.
    for my $j (@code_points) {
        my $i = utf8::native_to_unicode($j);
        my $function = $name;

        my $index = search_invlist(\@{$list_ref}, $j);

        my $ret;
        my $char_name = get_charname($j);
        my $display_name = sprintf "\\N{U+%02X, %s}", $j, $char_name;

        foreach my $suffix ("", "_L1", "_LC") {

            # This is the only macro defined for L1
            next if $suffix eq "_L1" && $function ne "LOWER";

          SKIP:
            foreach my $locale ("", $base_locale, $utf8_locale) {

                # titlecase is not defined in locales.
                next if $name eq 'TITLE' && $suffix eq "_LC";

                my ($display_locale, $locale_is_utf8)
                                = get_display_locale_or_skip($locale, $suffix);
                next unless defined $display_locale;

                skip("to${name}_LC does not work for LATIN SMALL LETTER SHARP S"
                  . "$display_locale", 1)
                            if  $i == 0xDF && $name =~ / FOLD | UPPER /x
                             && $suffix eq "_LC" && $locale_is_utf8;

                # NOTE(review): 'use' is processed at compile time, before
                # $locale has been assigned a value, so this statement looks
                # like it never actually loads the locale pragma.  It is kept
                # as-is; the setlocale() call below is what changes the
                # locale at run time.  Confirm the intent before relying on
                # it.
                use if $locale, "locale";
                POSIX::setlocale( &POSIX::LC_ALL, $locale) if $locale;

                my $display_call = "to${function}$suffix("
                                 . " $display_name )$display_locale";
                $ret = eval "test_to${function}$suffix($j)";
                if (is ($@, "", "$display_call didn't give error")) {
                    my $should_be;
                    if ($j > 255) {

                        # These single-byte macros leave above-Latin1 alone
                        $should_be = $j;
                    }
                    elsif (     $i > 127
                            && (   $suffix eq ""
                                || ($suffix eq "_LC" && ! $locale_is_utf8)))
                    {

                        # Non-ASCII is unchanged by the no-suffix macro, and
                        # by the _LC one in a non-UTF-8 locale
                        $should_be = $j;
                    }
                    elsif ($map_ref->[$index] != $missing) {

                        # The map entry is for the start of the range; adjust
                        # by how far into the range $j is
                        $should_be = $map_ref->[$index] + $j - $list_ref->[$index]
                    }
                    else {
                        $should_be = $j;
                    }

                    is ($ret, $should_be,
                        sprintf("${tab}And correctly returned 0x%02X",
                                                              $should_be));
                }
            }
        }

        # The _uni, uvchr, and _utf8 functions return both the ordinal of the
        # first code point of the result, and the result in utf8.  The .xs
        # tests return these in an array, in [0] and [1] respectively, with
        # [2] the length of the utf8 in bytes.
        my $utf8_should_be = "";
        my $first_ord_should_be;
        if (ref $map_ref->[$index]) {   # A multi-char result
            $utf8_should_be = join "", map { chr } @{$map_ref->[$index]};
            $first_ord_should_be = $map_ref->[$index][0];
        }
        else {  # A single-char result
            $first_ord_should_be = ($map_ref->[$index] != $missing)
                                    ? $map_ref->[$index] + $j
                                                         - $list_ref->[$index]
                                    : $j;
            $utf8_should_be = chr $first_ord_should_be;
        }
        utf8::upgrade($utf8_should_be);

        # Test _uni, uvchr
        foreach my $suffix ('_uni', '_uvchr') {
            my $display_call = "to${function}$suffix( $display_name )";
            $ret = eval "test_to${function}$suffix($j)";
            if (is ($@, "", "$display_call didn't give error")) {
                is ($ret->[0], $first_ord_should_be,
                    sprintf("${tab}And correctly returned 0x%02X",
                                                    $first_ord_should_be));
                is ($ret->[1], $utf8_should_be, "${tab}Got correct utf8");
                use bytes;  # So that length() counts bytes
                is ($ret->[2], length $utf8_should_be,
                    "${tab}Got correct number of bytes for utf8 length");
            }
        }

        # Test _utf8.  The character is passed to the XS function as a
        # single-quoted literal inside the eval'd string, so the two
        # characters special there need escaping.
        my $char = chr($j);
        utf8::upgrade($char);
        $char = quotemeta $char if $char eq '\\' || $char eq "'";
        foreach my $utf8_param("_safe",
                                "_safe, malformed",
                                "deprecated unsafe",
                                "deprecated mathoms",
                                )
        {
            next if    $utf8_param eq 'deprecated mathoms'
                    && $Config{'ccflags'} =~ /-DNO_MATHOMS/;

            my $utf8_param_code = $utf8_param_code{$utf8_param};
            my $expect_error = $utf8_param_code > 0;

            # Skip if can't malform (because is a UTF-8 invariant)
            next if $expect_error && $i < ((ord "A" == 65) ? 128 : 160);

            my $display_call = "to${function}_utf8($display_name, $utf8_param )";
            $ret = eval "test_to${function}_utf8('$char', $utf8_param_code)";
            if ($expect_error) {
                isnt ($@, "", "expected and got error in $display_call");
                like($@, qr/Malformed UTF-8 character/,
                     "${tab}And got expected message");

                # Any warnings raised along with the error are discarded
                # without being examined
                undef @warnings;
            }
            elsif (is ($@, "", "$display_call didn't give error")) {
                is ($ret->[0], $first_ord_should_be,
                    sprintf("${tab}And correctly returned 0x%02X",
                                                    $first_ord_should_be));
                is ($ret->[1], $utf8_should_be, "${tab}Got correct utf8");
                use bytes;  # So that length() counts bytes
                is ($ret->[2], length $utf8_should_be,
                    "${tab}Got correct number of bytes for utf8 length");
                if ($utf8_param_code < 0) {

                    # A deprecated form is expected to warn, but only the
                    # first time each function is called that way
                    my $warnings_ok;
                    if (! $seen{"${function}_utf8$utf8_param"}++) {
                        $warnings_ok = is(@warnings, 1,
                                                   "${tab}Got a single warning");
                        if ($warnings_ok) {
                            my $expected;
                            if ($utf8_param_code == -2) {
                                my $lc_func = lc $function;
                                $expected
                = qr/starting in Perl .* to_utf8_$lc_func\(\) will be removed/;
                            }
                            else {
                                $expected
                = qr/starting in Perl .* will require an additional parameter/;
                            }
                            $warnings_ok = like($warnings[0], $expected,
                                      "${tab}Got expected deprecation warning");
                        }
                    }
                    else {
                        $warnings_ok = is(@warnings, 0,
                                  "${tab}Deprecation warned only the one time");
                    }
                    $warnings_ok or diag("@warnings");
                    undef @warnings;
                }
            }
        }
    }
}
600
# Last line of defense: anything still in @warnings at this point was not
# consumed by a test above.  This is primarily to make sure that no
# non-Unicode warnings get generated.
unless (is(scalar @warnings, 0,
           "No unexpected warnings were generated in the tests"))
{
    diag @warnings;
}

done_testing;