4 require 'loc_tools.pl'; # Contains locales_enabled() and
5 # find_utf8_ctype_locale()
14 my $tab = " " x 4; # Indent subsidiary tests this much
16 use Unicode::UCD qw(search_invlist prop_invmap prop_invlist);
17 my ($charname_list, $charname_map, $format, $default) = prop_invmap("Name Alias");
22 # If there is a an abbreviation for the code point name, use it
23 my $name_index = search_invlist(\@{$charname_list}, $cp);
24 if (defined $name_index) {
25 my $synonyms = $charname_map->[$name_index];
27 my $pat = qr/: abbreviation/;
28 my @abbreviations = grep { $_ =~ $pat } @$synonyms;
30 return $abbreviations[0] =~ s/$pat//r;
35 # Otherwise, use the full name
37 return charnames::viacode($cp) // "No name";
40 sub truth($) { # Converts values so is() works
41 return (shift) ? 1 : 0;
46 if(locales_enabled('LC_ALL')) {
48 $base_locale = POSIX::setlocale( &POSIX::LC_ALL, "C");
49 if (defined $base_locale && $base_locale eq 'C') {
50 use locale; # make \w work right in non-ASCII lands
52 # Some locale implementations don't have the 128-255 characters all
53 # mean nothing. Skip the locale tests in that situation
54 for my $u (128 .. 255) {
55 if (chr(utf8::unicode_to_native($u)) =~ /[[:print:]]/) {
61 $utf8_locale = find_utf8_ctype_locale() if $base_locale;
65 sub get_display_locale_or_skip($$) {
67 # Helper function intimately tied to its callers. It knows the loop
68 # iterates with a locale of "", meaning don't use locale; $base_locale
69 # meaning to use a non-UTF-8 locale; and $utf8_locale.
71 # It checks to see if the current test should be skipped or executed,
72 # returning an empty list for the former, and for the latter:
73 # ( 'locale display name',
74 # bool of is this a UTF-8 locale )
76 # The display name is the empty string if not using locale. Functions
77 # with _LC in their name are skipped unless in locale, and functions
78 # without _LC are executed only outside locale.
80 my ($locale, $suffix) = @_;
82 # The test should be skipped if the input is for a non-existent locale
83 return unless defined $locale;
85 # Here the input is defined, either a locale name or "". If the test is
86 # for not using locales, we want to do the test for non-LC functions,
87 # and skip it for LC ones.
89 return ("", 0) if $suffix !~ /LC/;
93 # Here the input is for a real locale. We don't test the non-LC functions
95 return if $suffix !~ /LC/;
97 # Here is for a LC function and a real locale. The base locale is not
99 return (" ($locale locale)", 0) if $locale eq $base_locale;
101 # The only other possibility is that we have a UTF-8 locale
102 return (" ($locale)", 1);
105 sub try_malforming($$$)
107 # Determines if the tests for malformed UTF-8 should be done. When done,
108 # the .xs code creates malformations by pretending the length is shorter
109 # than it actually is. Some things can't be malformed, and sometimes this
110 # test knows that the current code doesn't look for a malformation under
111 # various circumstances.
113 my ($u, $function, $using_locale) = @_;
114 # $u is unicode code point;
116 # Single bytes can't be malformed
117 return 0 if $u < ((ord "A" == 65) ? 128 : 160);
119 # ASCII doesn't need to ever look beyond the first byte.
120 return 0 if $function eq "ASCII";
122 # Nor, on EBCDIC systems, does CNTRL
123 return 0 if ord "A" != 65 && $function eq "CNTRL";
125 # No controls above 255, so the code doesn't look at those
126 return 0 if $u > 255 && $function eq "CNTRL";
128 # No non-ASCII digits below 256, except if using locales.
129 return 0 if $u < 256 && ! $using_locale && $function =~ /X?DIGIT/;
135 # name => Lookup-property name
138 alphanumeric => 'Alnum',
139 alpha => 'XPosixAlpha',
145 idfirst => '_Perl_IDStart',
146 idcont => '_Perl_IDCont',
147 lower => 'XPosixLower',
149 psxspc => 'XPosixSpace',
150 punct => 'XPosixPunct',
151 quotemeta => '_Perl_Quotemeta',
152 space => 'XPerlSpace',
153 vertws => 'VertSpace',
154 upper => 'XPosixUpper',
160 local $SIG{__WARN__} = sub { push @warnings, @_ };
162 my %utf8_param_code = (
164 "_safe, malformed" => 1,
165 "deprecated unsafe" => -1,
166 "deprecated mathoms" => -2,
169 # This test is split into this number of files.
170 my $num_test_files = $ENV{TEST_JOBS} || 1;
171 $::TEST_CHUNK = 0 if $num_test_files == 1 && ! defined $::TEST_CHUNK;
172 $num_test_files = 10 if $num_test_files > 10;
174 my $property_count = -1;
175 foreach my $name (sort keys %properties, 'octal') {
177 # We test every nth property in this run so that this test is split into
178 # smaller chunks to minimize test suite elapsed time when run in parallel.
180 next if $property_count % $num_test_files != $::TEST_CHUNK;
183 if ($name eq 'octal') {
184 # Hand-roll an inversion list with 0-7 in it and nothing else.
185 push @invlist, ord "0", ord "8";
188 my $property = $properties{$name};
189 @invlist = prop_invlist($property, '_perl_core_internal_ok');
192 # An empty return could mean an unknown property, or merely that
193 # it is empty. Call in scalar context to differentiate
194 if (! prop_invlist($property, '_perl_core_internal_ok')) {
195 fail("No inversion list found for $property");
201 # Include all the Latin1 code points, plus 0x100.
202 my @code_points = (0 .. 256);
204 # Then include the next few boundaries above those from this property
205 my $above_latins = 0;
206 foreach my $range_start (@invlist) {
207 next if $range_start < 257;
208 push @code_points, $range_start - 1, $range_start;
210 last if $above_latins > 5;
213 # This makes sure we are using the Perl definition of idfirst and idcont,
214 # and not the Unicode. There are a few differences.
215 push @code_points, ord "\N{ESTIMATED SYMBOL}" if $name =~ /^id(first|cont)/;
216 if ($name eq "idcont") { # And some that are continuation but not start
217 push @code_points, ord("\N{GREEK ANO TELEIA}"),
218 ord("\N{COMBINING GRAVE ACCENT}");
221 # And finally one non-Unicode code point.
222 push @code_points, 0x110000; # Above Unicode, no prop should match
223 no warnings 'non_unicode';
225 for my $n (@code_points) {
226 my $u = utf8::native_to_unicode($n);
227 my $function = uc($name);
229 is (@warnings, 0, "Got no unexpected warnings in previous iteration")
230 or diag("@warnings");
233 my $matches = search_invlist(\@invlist, $n);
234 if (! defined $matches) {
238 $matches = truth(! ($matches % 2));
242 my $char_name = get_charname($n);
243 my $display_name = sprintf "\\x{%02X, %s}", $n, $char_name;
244 my $display_call = "is${function}( $display_name )";
246 foreach my $suffix ("", "_A", "_L1", "_LC", "_uni", "_uvchr",
247 "_LC_uvchr", "_utf8", "_LC_utf8")
250 # Not all possible macros have been defined
251 if ($name eq 'vertws') {
253 # vertws is always all of Unicode
254 next if $suffix !~ / ^ _ ( uni | uvchr | utf8 ) $ /x;
256 elsif ($name eq 'alnum') {
258 # ALNUM_A, ALNUM_L1, and ALNUM_uvchr are not defined as these
259 # suffixes were added later, after WORDCHAR was created to be
260 # a clearer synonym for ALNUM
261 next if $suffix eq '_A'
263 || $suffix eq '_uvchr';
265 elsif ($name eq 'octal') {
266 next if $suffix ne "" && $suffix ne '_A' && $suffix ne '_L1';
268 elsif ($name eq 'quotemeta') {
269 # There is only one macro for this, and is defined only for
271 next if $suffix ne ""
274 foreach my $locale ("", $base_locale, $utf8_locale) {
276 my ($display_locale, $locale_is_utf8)
277 = get_display_locale_or_skip($locale, $suffix);
278 next unless defined $display_locale;
280 use if $locale, "locale";
281 POSIX::setlocale( &POSIX::LC_ALL, $locale) if $locale;
283 if ($suffix !~ /utf8/) { # _utf8 has to handled specially
285 = "is${function}$suffix( $display_name )$display_locale";
286 $ret = truth eval "test_is${function}$suffix($n)";
287 if (is ($@, "", "$display_call didn't give error")) {
288 my $truth = $matches;
291 # The single byte functions are false for
295 if $suffix=~ / ^ ( _A | _L [1C] )? $ /x;
298 && $name ne 'quotemeta')
301 # The no-suffix and _A functions are false
302 # for non-ASCII. So are _LC functions on a
304 $truth = 0 if $suffix eq "_A"
307 && ! $locale_is_utf8);
311 is ($ret, $truth, "${tab}And correctly returns $truth");
314 else { # _utf8 suffix
316 utf8::upgrade($char);
317 $char = quotemeta $char if $char eq '\\' || $char eq "'";
323 { # The C-locale _LC function returns FALSE for Latin1
331 foreach my $utf8_param("_safe",
336 my $utf8_param_code = $utf8_param_code{$utf8_param};
337 my $expect_error = $utf8_param_code > 0;
338 next if $expect_error
339 && ! try_malforming($u, $function,
342 my $display_call = "is${function}$suffix( $display_name"
343 . ", $utf8_param )$display_locale";
344 $ret = truth eval "test_is${function}$suffix('$char',"
345 . " $utf8_param_code)";
348 "expected and got error in $display_call");
349 like($@, qr/Malformed UTF-8 character/,
350 "${tab}And got expected message");
351 if (is (@warnings, 1,
352 "${tab}Got a single warning besides"))
355 qr/Malformed UTF-8 character.*short/,
356 "${tab}Got expected warning");
363 elsif (is ($@, "", "$display_call didn't give error")) {
365 "${tab}And correctly returned $truth");
366 if ($utf8_param_code < 0) {
368 my $unique_function = "is" . $function . $suffix;
369 if (! $seen{$unique_function}++) {
370 $warnings_ok = is(@warnings, 1,
371 "${tab}This is first call to"
372 . " $unique_function; Got a single"
375 $warnings_ok = like($warnings[0],
376 qr/starting in Perl .* will require an additional parameter/,
377 "${tab}The warning was the expected"
378 . " deprecation one");
382 $warnings_ok = is(@warnings, 0,
383 "${tab}This subsequent call to"
384 . " $unique_function did not warn");
386 $warnings_ok or diag("@warnings");
397 my %to_properties = (
398 FOLD => 'Case_Folding',
399 LOWER => 'Lowercase_Mapping',
400 TITLE => 'Titlecase_Mapping',
401 UPPER => 'Uppercase_Mapping',
404 $property_count = -1;
405 foreach my $name (sort keys %to_properties) {
408 next if $property_count % $num_test_files != $::TEST_CHUNK;
410 my $property = $to_properties{$name};
411 my ($list_ref, $map_ref, $format, $missing)
412 = prop_invmap($property, );
413 if (! $list_ref || ! $map_ref) {
414 fail("No inversion map found for $property");
417 if ($format !~ / ^ a l? $ /x) {
418 fail("Unexpected inversion map format ('$format') found for $property");
422 # Include all the Latin1 code points, plus 0x100.
423 my @code_points = (0 .. 256);
425 # Then include the next few multi-char folds above those from this
426 # property, and include the next few single folds as well
427 my $above_latins = 0;
429 for my $i (0 .. @{$list_ref} - 1) {
430 my $range_start = $list_ref->[$i];
431 next if $range_start < 257;
432 if (ref $map_ref->[$i] && $multi_char < 5) {
433 push @code_points, $range_start - 1
434 if $code_points[-1] != $range_start - 1;
435 push @code_points, $range_start;
438 elsif ($above_latins < 5) {
439 push @code_points, $range_start - 1
440 if $code_points[-1] != $range_start - 1;
441 push @code_points, $range_start;
444 last if $above_latins >= 5 && $multi_char >= 5;
447 # And finally one non-Unicode code point.
448 push @code_points, 0x110000; # Above Unicode, no prop should match
449 no warnings 'non_unicode';
451 # $n is native; $u unicode.
452 for my $n (@code_points) {
453 my $u = utf8::native_to_unicode($n);
454 my $function = $name;
456 my $index = search_invlist(\@{$list_ref}, $n);
459 my $char_name = get_charname($n);
460 my $display_name = sprintf "\\N{U+%02X, %s}", $n, $char_name;
462 foreach my $suffix ("", "_L1", "_LC") {
464 # This is the only macro defined for L1
465 next if $suffix eq "_L1" && $function ne "LOWER";
468 foreach my $locale ("", $base_locale, $utf8_locale) {
470 # titlecase is not defined in locales.
471 next if $name eq 'TITLE' && $suffix eq "_LC";
473 my ($display_locale, $locale_is_utf8)
474 = get_display_locale_or_skip($locale, $suffix);
475 next unless defined $display_locale;
477 skip("to${name}_LC does not work for LATIN SMALL LETTER SHARP S"
478 . "$display_locale", 1)
479 if $u == 0xDF && $name =~ / FOLD | UPPER /x
480 && $suffix eq "_LC" && $locale_is_utf8;
482 use if $locale, "locale";
483 POSIX::setlocale( &POSIX::LC_ALL, $locale) if $locale;
485 my $display_call = "to${function}$suffix("
486 . " $display_name )$display_locale";
487 $ret = eval "test_to${function}$suffix($n)";
488 if (is ($@, "", "$display_call didn't give error")) {
495 || ($suffix eq "_LC" && ! $locale_is_utf8)))
499 elsif ($map_ref->[$index] != $missing) {
500 $should_be = $map_ref->[$index] + $n - $list_ref->[$index]
506 is ($ret, $should_be,
507 sprintf("${tab}And correctly returned 0x%02X",
513 # The _uni, uvchr, and _utf8 functions return both the ordinal of the
514 # first code point of the result, and the result in utf8. The .xs
515 # tests return these in an array, in [0] and [1] respectively, with
516 # [2] the length of the utf8 in bytes.
517 my $utf8_should_be = "";
518 my $first_ord_should_be;
519 if (ref $map_ref->[$index]) { # A multi-char result
520 for my $n (0 .. @{$map_ref->[$index]} - 1) {
521 $utf8_should_be .= chr $map_ref->[$index][$n];
524 $first_ord_should_be = $map_ref->[$index][0];
526 else { # A single-char result
527 $first_ord_should_be = ($map_ref->[$index] != $missing)
528 ? $map_ref->[$index] + $n
529 - $list_ref->[$index]
531 $utf8_should_be = chr $first_ord_should_be;
533 utf8::upgrade($utf8_should_be);
536 foreach my $suffix ('_uni', '_uvchr') {
539 my $display_call = "to${function}$suffix( $display_name )";
540 $ret = eval "test_to${function}$suffix($n)";
541 if (is ($@, "", "$display_call didn't give error")) {
542 is ($ret->[0], $first_ord_should_be,
543 sprintf("${tab}And correctly returned 0x%02X",
544 $first_ord_should_be));
545 is ($ret->[1], $utf8_should_be, "${tab}Got correct utf8");
547 is ($ret->[2], length $utf8_should_be,
548 "${tab}Got correct number of bytes for utf8 length");
554 utf8::upgrade($char);
555 $char = quotemeta $char if $char eq '\\' || $char eq "'";
556 foreach my $utf8_param("_safe",
559 "deprecated mathoms",
563 next if $utf8_param eq 'deprecated mathoms'
564 && $Config{'ccflags'} =~ /-DNO_MATHOMS/;
566 my $utf8_param_code = $utf8_param_code{$utf8_param};
567 my $expect_error = $utf8_param_code > 0;
569 # Skip if can't malform (because is a UTF-8 invariant)
570 next if $expect_error && $u < ((ord "A" == 65) ? 128 : 160);
572 my $display_call = "to${function}_utf8($display_name, $utf8_param )";
573 $ret = eval "test_to${function}_utf8('$char', $utf8_param_code)";
575 isnt ($@, "", "expected and got error in $display_call");
576 like($@, qr/Malformed UTF-8 character/,
577 "${tab}And got expected message");
580 elsif (is ($@, "", "$display_call didn't give error")) {
581 is ($ret->[0], $first_ord_should_be,
582 sprintf("${tab}And correctly returned 0x%02X",
583 $first_ord_should_be));
584 is ($ret->[1], $utf8_should_be, "${tab}Got correct utf8");
586 is ($ret->[2], length $utf8_should_be,
587 "${tab}Got correct number of bytes for utf8 length");
588 if ($utf8_param_code < 0) {
590 if (! $seen{"${function}_utf8$utf8_param"}++) {
591 $warnings_ok = is(@warnings, 1,
592 "${tab}Got a single warning");
595 if ($utf8_param_code == -2) {
596 my $lc_func = lc $function;
598 = qr/starting in Perl .* to_utf8_$lc_func\(\) will be removed/;
602 = qr/starting in Perl .* will require an additional parameter/;
604 $warnings_ok = like($warnings[0], $expected,
605 "${tab}Got expected deprecation warning");
609 $warnings_ok = is(@warnings, 0,
610 "${tab}Deprecation warned only the one time");
612 $warnings_ok or diag("@warnings");
620 # This is primarily to make sure that no non-Unicode warnings get generated
621 is(scalar @warnings, 0, "No unexpected warnings were generated in the tests")