ext/XS-APItest/t/handy_base.pl

   1 #!perl -w
   2
   3 BEGIN {
   4     require 'loc_tools.pl';   # Contains locales_enabled() and
   5                               # find_utf8_ctype_locale()
   6 }
   7
   8 use strict;
   9 use Test::More;
  10 use Config;
  11
  12 use XS::APItest;
  13
  14 my $tab = " " x 4;  # Indent subsidiary tests this much
  15
  16 use Unicode::UCD qw(search_invlist prop_invmap prop_invlist);
  17 my ($charname_list, $charname_map, $format, $default) = prop_invmap("Name Alias");
  18
  19 sub get_charname($) {
  20     my $cp = shift;
  21
  22     # If there is a an abbreviation for the code point name, use it
  23     my $name_index = search_invlist(\@{$charname_list}, $cp);
  24     if (defined $name_index) {
  25         my $synonyms = $charname_map->[$name_index];
  26         if (ref $synonyms) {
  27             my $pat = qr/: abbreviation/;
  28             my @abbreviations = grep { $_ =~ $pat } @$synonyms;
  29             if (@abbreviations) {
  30                 return $abbreviations[0] =~ s/$pat//r;
  31             }
  32         }
  33     }
  34
  35     # Otherwise, use the full name
  36     use charnames ();
  37     return charnames::viacode($cp) // "No name";
  38 }
  39
  40 sub truth($) {  # Converts values so is() works
  41     return (shift) ? 1 : 0;
  42 }
  43
  44 my $base_locale;
  45 my $utf8_locale;
  46 if(locales_enabled('LC_ALL')) {
  47     require POSIX;
  48     $base_locale = POSIX::setlocale( &POSIX::LC_ALL, "C");
  49     if (defined $base_locale && $base_locale eq 'C') {
  50         use locale; # make \w work right in non-ASCII lands
  51
  52         # Some locale implementations don't have the 128-255 characters all
  53         # mean nothing.  Skip the locale tests in that situation
  54         for my $u (128 .. 255) {
  55             if (chr(utf8::unicode_to_native($u)) =~ /[[:print:]]/) {
  56                 undef $base_locale;
  57                 last;
  58             }
  59         }
  60
  61         $utf8_locale = find_utf8_ctype_locale() if $base_locale;
  62     }
  63 }
  64
  65 sub get_display_locale_or_skip($$) {
  66
  67     # Helper function intimately tied to its callers.  It knows the loop
  68     # iterates with a locale of "", meaning don't use locale; $base_locale
  69     # meaning to use a non-UTF-8 locale; and $utf8_locale.
  70     #
  71     # It checks to see if the current test should be skipped or executed,
  72     # returning an empty list for the former, and for the latter:
  73     #   ( 'locale display name',
  74     #     bool of is this a UTF-8 locale )
  75     #
  76     # The display name is the empty string if not using locale.  Functions
  77     # with _LC in their name are skipped unless in locale, and functions
  78     # without _LC are executed only outside locale.
  79
  80     my ($locale, $suffix) = @_;
  81
  82     # The test should be skipped if the input is for a non-existent locale
  83     return unless defined $locale;
  84
  85     # Here the input is defined, either a locale name or "".  If the test is
  86     # for not using locales, we want to do the test for non-LC functions,
  87     # and skip it for LC ones.
  88     if ($locale eq "") {
  89         return ("", 0) if $suffix !~ /LC/;
  90         return;
  91     }
  92
  93     # Here the input is for a real locale.  We don't test the non-LC functions
  94     # for locales.
  95     return if $suffix !~ /LC/;
  96
  97     # Here is for a LC function and a real locale.  The base locale is not
  98     # UTF-8.
  99     return (" ($locale locale)", 0) if $locale eq $base_locale;
 100
 101     # The only other possibility is that we have a UTF-8 locale
 102     return (" ($locale)", 1);
 103 }
 104
 105 sub try_malforming($$$)
 106 {
 107     # Determines if the tests for malformed UTF-8 should be done.  When done,
 108     # the .xs code creates malformations by pretending the length is shorter
 109     # than it actually is.  Some things can't be malformed, and sometimes this
 110     # test knows that the current code doesn't look for a malformation under
 111     # various circumstances.
 112
 113     my ($u, $function, $using_locale) = @_;
 114     # $u is unicode code point;
 115
 116     # Single bytes can't be malformed
 117     return 0 if $u < ((ord "A" == 65) ? 128 : 160);
 118
 119     # ASCII doesn't need to ever look beyond the first byte.
 120     return 0 if $function eq "ASCII";
 121
 122     # Nor, on EBCDIC systems, does CNTRL
 123     return 0 if ord "A" != 65 && $function eq "CNTRL";
 124
 125     # No controls above 255, so the code doesn't look at those
 126     return 0 if $u > 255 && $function eq "CNTRL";
 127
 128     # No non-ASCII digits below 256, except if using locales.
 129     return 0 if $u < 256 && ! $using_locale && $function =~ /X?DIGIT/;
 130
 131     return 1;
 132 }
 133
 134 my %properties = (
 135                    # name => Lookup-property name
 136                    alnum => 'Word',
 137                    wordchar => 'Word',
 138                    alphanumeric => 'Alnum',
 139                    alpha => 'XPosixAlpha',
 140                    ascii => 'ASCII',
 141                    blank => 'Blank',
 142                    cntrl => 'Control',
 143                    digit => 'Digit',
 144                    graph => 'Graph',
 145                    idfirst => '_Perl_IDStart',
 146                    idcont => '_Perl_IDCont',
 147                    lower => 'XPosixLower',
 148                    print => 'Print',
 149                    psxspc => 'XPosixSpace',
 150                    punct => 'XPosixPunct',
 151                    quotemeta => '_Perl_Quotemeta',
 152                    space => 'XPerlSpace',
 153                    vertws => 'VertSpace',
 154                    upper => 'XPosixUpper',
 155                    xdigit => 'XDigit',
 156                 );
 157
 158 my %seen;
 159 my @warnings;
 160 local $SIG{__WARN__} = sub { push @warnings, @_ };
 161
 162 my %utf8_param_code = (
 163                         "_safe"                 =>  0,
 164                         "_safe, malformed"      =>  1,
 165                         "deprecated unsafe"     => -1,
 166                         "deprecated mathoms"    => -2,
 167                       );
 168
 169 # This test is split into this number of files.
 170 my $num_test_files = $ENV{TEST_JOBS} || 1;
 171 $num_test_files = 10 if $num_test_files > 10;
 172
 173 my $property_count = -1;
 174 foreach my $name (sort keys %properties, 'octal') {
 175
 176     # We test every nth property in this run so that this test is split into
 177     # smaller chunks to minimize test suite elapsed time when run in parallel.
 178     $property_count++;
 179     next if $property_count % $num_test_files != $::TEST_CHUNK;
 180
 181     my @invlist;
 182     if ($name eq 'octal') {
 183         # Hand-roll an inversion list with 0-7 in it and nothing else.
 184         push @invlist, ord "0", ord "8";
 185     }
 186     else {
 187         my $property = $properties{$name};
 188         @invlist = prop_invlist($property, '_perl_core_internal_ok');
 189         if (! @invlist) {
 190
 191             # An empty return could mean an unknown property, or merely that
 192             # it is empty.  Call in scalar context to differentiate
 193             if (! prop_invlist($property, '_perl_core_internal_ok')) {
 194                 fail("No inversion list found for $property");
 195                 next;
 196             }
 197         }
 198     }
 199
 200     # Include all the Latin1 code points, plus 0x100.
 201     my @code_points = (0 .. 256);
 202
 203     # Then include the next few boundaries above those from this property
 204     my $above_latins = 0;
 205     foreach my $range_start (@invlist) {
 206         next if $range_start < 257;
 207         push @code_points, $range_start - 1, $range_start;
 208         $above_latins++;
 209         last if $above_latins > 5;
 210     }
 211
 212     # This makes sure we are using the Perl definition of idfirst and idcont,
 213     # and not the Unicode.  There are a few differences.
 214     push @code_points, ord "\N{ESTIMATED SYMBOL}" if $name =~ /^id(first|cont)/;
 215     if ($name eq "idcont") {    # And some that are continuation but not start
 216         push @code_points, ord("\N{GREEK ANO TELEIA}"),
 217                            ord("\N{COMBINING GRAVE ACCENT}");
 218     }
 219
 220     # And finally one non-Unicode code point.
 221     push @code_points, 0x110000;    # Above Unicode, no prop should match
 222     no warnings 'non_unicode';
 223
 224     for my $n (@code_points) {
 225         my $u = utf8::native_to_unicode($n);
 226         my $function = uc($name);
 227
 228         is (@warnings, 0, "Got no unexpected warnings in previous iteration")
 229            or diag("@warnings");
 230         undef @warnings;
 231
 232         my $matches = search_invlist(\@invlist, $n);
 233         if (! defined $matches) {
 234             $matches = 0;
 235         }
 236         else {
 237             $matches = truth(! ($matches % 2));
 238         }
 239
 240         my $ret;
 241         my $char_name = get_charname($n);
 242         my $display_name = sprintf "\\x{%02X, %s}", $n, $char_name;
 243         my $display_call = "is${function}( $display_name )";
 244
 245         foreach my $suffix ("", "_A", "_L1", "_LC", "_uni", "_uvchr",
 246                             "_LC_uvchr", "_utf8", "_LC_utf8")
 247         {
 248
 249             # Not all possible macros have been defined
 250             if ($name eq 'vertws') {
 251
 252                 # vertws is always all of Unicode
 253                 next if $suffix !~ / ^ _ ( uni | uvchr | utf8 ) $ /x;
 254             }
 255             elsif ($name eq 'alnum') {
 256
 257                 # ALNUM_A, ALNUM_L1, and ALNUM_uvchr are not defined as these
 258                 # suffixes were added later, after WORDCHAR was created to be
 259                 # a clearer synonym for ALNUM
 260                 next if    $suffix eq '_A'
 261                         || $suffix eq '_L1'
 262                         || $suffix eq '_uvchr';
 263             }
 264             elsif ($name eq 'octal') {
 265                 next if $suffix ne ""  && $suffix ne '_A' && $suffix ne '_L1';
 266             }
 267             elsif ($name eq 'quotemeta') {
 268                 # There is only one macro for this, and is defined only for
 269                 # Latin1 range
 270                 next if $suffix ne ""
 271             }
 272
 273             foreach my $locale ("", $base_locale, $utf8_locale) {
 274
 275                 my ($display_locale, $locale_is_utf8)
 276                                 = get_display_locale_or_skip($locale, $suffix);
 277                 next unless defined $display_locale;
 278
 279                 use if $locale, "locale";
 280                 POSIX::setlocale( &POSIX::LC_ALL, $locale) if $locale;
 281
 282                 if ($suffix !~ /utf8/) {    # _utf8 has to handled specially
 283                     my $display_call
 284                        = "is${function}$suffix( $display_name )$display_locale";
 285                     $ret = truth eval "test_is${function}$suffix($n)";
 286                     if (is ($@, "", "$display_call didn't give error")) {
 287                         my $truth = $matches;
 288                         if ($truth) {
 289
 290                             # The single byte functions are false for
 291                             # above-Latin1
 292                             if ($n >= 256) {
 293                                 $truth = 0
 294                                         if $suffix=~ / ^ ( _A | _L [1C] )? $ /x;
 295                             }
 296                             elsif (   $u >= 128
 297                                    && $name ne 'quotemeta')
 298                             {
 299
 300                                 # The no-suffix and _A functions are false
 301                                 # for non-ASCII.  So are  _LC  functions on a
 302                                 # non-UTF-8 locale
 303                                 $truth = 0 if    $suffix eq "_A"
 304                                               || $suffix eq ""
 305                                               || (     $suffix =~ /LC/
 306                                                   && ! $locale_is_utf8);
 307                             }
 308                         }
 309
 310                         is ($ret, $truth, "${tab}And correctly returns $truth");
 311                     }
 312                 }
 313                 else {  # _utf8 suffix
 314                     my $char = chr($n);
 315                     utf8::upgrade($char);
 316                     $char = quotemeta $char if $char eq '\\' || $char eq "'";
 317                     my $truth;
 318                     if (   $suffix =~ /LC/
 319                         && ! $locale_is_utf8
 320                         && $n < 256
 321                         && $u >= 128)
 322                     {   # The C-locale _LC function returns FALSE for Latin1
 323                         # above ASCII
 324                         $truth = 0;
 325                     }
 326                     else {
 327                         $truth = $matches;
 328                     }
 329
 330                     foreach my $utf8_param("_safe",
 331                                            "_safe, malformed",
 332                                            "deprecated unsafe"
 333                                           )
 334                     {
 335                         my $utf8_param_code = $utf8_param_code{$utf8_param};
 336                         my $expect_error = $utf8_param_code > 0;
 337                         next if      $expect_error
 338                                 && ! try_malforming($u, $function,
 339                                                     $suffix =~ /LC/);
 340
 341                         my $display_call = "is${function}$suffix( $display_name"
 342                                          . ", $utf8_param )$display_locale";
 343                         $ret = truth eval "test_is${function}$suffix('$char',"
 344                                         . " $utf8_param_code)";
 345                         if ($expect_error) {
 346                             isnt ($@, "",
 347                                     "expected and got error in $display_call");
 348                             like($@, qr/Malformed UTF-8 character/,
 349                                 "${tab}And got expected message");
 350                             if (is (@warnings, 1,
 351                                            "${tab}Got a single warning besides"))
 352                             {
 353                                 like($warnings[0],
 354                                      qr/Malformed UTF-8 character.*short/,
 355                                      "${tab}Got expected warning");
 356                             }
 357                             else {
 358                                 diag("@warnings");
 359                             }
 360                             undef @warnings;
 361                         }
 362                         elsif (is ($@, "", "$display_call didn't give error")) {
 363                             is ($ret, $truth,
 364                                 "${tab}And correctly returned $truth");
 365                             if ($utf8_param_code < 0) {
 366                                 my $warnings_ok;
 367                                 my $unique_function = "is" . $function . $suffix;
 368                                 if (! $seen{$unique_function}++) {
 369                                     $warnings_ok = is(@warnings, 1,
 370                                         "${tab}This is first call to"
 371                                       . " $unique_function; Got a single"
 372                                       . " warning");
 373                                     if ($warnings_ok) {
 374                                         $warnings_ok = like($warnings[0],
 375                 qr/starting in Perl .* will require an additional parameter/,
 376                                             "${tab}The warning was the expected"
 377                                           . " deprecation one");
 378                                     }
 379                                 }
 380                                 else {
 381                                     $warnings_ok = is(@warnings, 0,
 382                                         "${tab}This subsequent call to"
 383                                       . " $unique_function did not warn");
 384                                 }
 385                                 $warnings_ok or diag("@warnings");
 386                                 undef @warnings;
 387                             }
 388                         }
 389                     }
 390                 }
 391             }
 392         }
 393     }
 394 }
 395
 396 my %to_properties = (
 397                 FOLD  => 'Case_Folding',
 398                 LOWER => 'Lowercase_Mapping',
 399                 TITLE => 'Titlecase_Mapping',
 400                 UPPER => 'Uppercase_Mapping',
 401             );
 402
 403 $property_count = -1;
 404 foreach my $name (sort keys %to_properties) {
 405
 406     $property_count++;
 407     next if $property_count % $num_test_files != $::TEST_CHUNK;
 408
 409     my $property = $to_properties{$name};
 410     my ($list_ref, $map_ref, $format, $missing)
 411                                       = prop_invmap($property, );
 412     if (! $list_ref || ! $map_ref) {
 413         fail("No inversion map found for $property");
 414         next;
 415     }
 416     if ($format !~ / ^ a l? $ /x) {
 417         fail("Unexpected inversion map format ('$format') found for $property");
 418         next;
 419     }
 420
 421     # Include all the Latin1 code points, plus 0x100.
 422     my @code_points = (0 .. 256);
 423
 424     # Then include the next few multi-char folds above those from this
 425     # property, and include the next few single folds as well
 426     my $above_latins = 0;
 427     my $multi_char = 0;
 428     for my $i (0 .. @{$list_ref} - 1) {
 429         my $range_start = $list_ref->[$i];
 430         next if $range_start < 257;
 431         if (ref $map_ref->[$i] && $multi_char < 5)  {
 432             push @code_points, $range_start - 1
 433                                         if $code_points[-1] != $range_start - 1;
 434             push @code_points, $range_start;
 435             $multi_char++;
 436         }
 437         elsif ($above_latins < 5) {
 438             push @code_points, $range_start - 1
 439                                         if $code_points[-1] != $range_start - 1;
 440             push @code_points, $range_start;
 441             $above_latins++;
 442         }
 443         last if $above_latins >= 5 && $multi_char >= 5;
 444     }
 445
 446     # And finally one non-Unicode code point.
 447     push @code_points, 0x110000;    # Above Unicode, no prop should match
 448     no warnings 'non_unicode';
 449
 450     # $n is native; $u unicode.
 451     for my $n (@code_points) {
 452         my $u = utf8::native_to_unicode($n);
 453         my $function = $name;
 454
 455         my $index = search_invlist(\@{$list_ref}, $n);
 456
 457         my $ret;
 458         my $char_name = get_charname($n);
 459         my $display_name = sprintf "\\N{U+%02X, %s}", $n, $char_name;
 460
 461         foreach my $suffix ("", "_L1", "_LC") {
 462
 463             # This is the only macro defined for L1
 464             next if $suffix eq "_L1" && $function ne "LOWER";
 465
 466           SKIP:
 467             foreach my $locale ("", $base_locale, $utf8_locale) {
 468
 469                 # titlecase is not defined in locales.
 470                 next if $name eq 'TITLE' && $suffix eq "_LC";
 471
 472                 my ($display_locale, $locale_is_utf8)
 473                                 = get_display_locale_or_skip($locale, $suffix);
 474                 next unless defined $display_locale;
 475
 476                 skip("to${name}_LC does not work for LATIN SMALL LETTER SHARP S"
 477                   . "$display_locale", 1)
 478                             if  $u == 0xDF && $name =~ / FOLD | UPPER /x
 479                              && $suffix eq "_LC" && $locale_is_utf8;
 480
 481                 use if $locale, "locale";
 482                 POSIX::setlocale( &POSIX::LC_ALL, $locale) if $locale;
 483
 484                 my $display_call = "to${function}$suffix("
 485                                  . " $display_name )$display_locale";
 486                 $ret = eval "test_to${function}$suffix($n)";
 487                 if (is ($@, "", "$display_call didn't give error")) {
 488                     my $should_be;
 489                     if ($n > 255) {
 490                         $should_be = $n;
 491                     }
 492                     elsif (     $u > 127
 493                             && (   $suffix eq ""
 494                                 || ($suffix eq "_LC" && ! $locale_is_utf8)))
 495                     {
 496                         $should_be = $n;
 497                     }
 498                     elsif ($map_ref->[$index] != $missing) {
 499                         $should_be = $map_ref->[$index] + $n - $list_ref->[$index]
 500                     }
 501                     else {
 502                         $should_be = $n;
 503                     }
 504
 505                     is ($ret, $should_be,
 506                         sprintf("${tab}And correctly returned 0x%02X",
 507                                                               $should_be));
 508                 }
 509             }
 510         }
 511
 512         # The _uni, uvchr, and _utf8 functions return both the ordinal of the
 513         # first code point of the result, and the result in utf8.  The .xs
 514         # tests return these in an array, in [0] and [1] respectively, with
 515         # [2] the length of the utf8 in bytes.
 516         my $utf8_should_be = "";
 517         my $first_ord_should_be;
 518         if (ref $map_ref->[$index]) {   # A multi-char result
 519             for my $n (0 .. @{$map_ref->[$index]} - 1) {
 520                 $utf8_should_be .= chr $map_ref->[$index][$n];
 521             }
 522
 523             $first_ord_should_be = $map_ref->[$index][0];
 524         }
 525         else {  # A single-char result
 526             $first_ord_should_be = ($map_ref->[$index] != $missing)
 527                                     ? $map_ref->[$index] + $n
 528                                                          - $list_ref->[$index]
 529                                     : $n;
 530             $utf8_should_be = chr $first_ord_should_be;
 531         }
 532         utf8::upgrade($utf8_should_be);
 533
 534         # Test _uni, uvchr
 535         foreach my $suffix ('_uni', '_uvchr') {
 536             my $s;
 537             my $len;
 538             my $display_call = "to${function}$suffix( $display_name )";
 539             $ret = eval "test_to${function}$suffix($n)";
 540             if (is ($@, "", "$display_call didn't give error")) {
 541                 is ($ret->[0], $first_ord_should_be,
 542                     sprintf("${tab}And correctly returned 0x%02X",
 543                                                     $first_ord_should_be));
 544                 is ($ret->[1], $utf8_should_be, "${tab}Got correct utf8");
 545                 use bytes;
 546                 is ($ret->[2], length $utf8_should_be,
 547                     "${tab}Got correct number of bytes for utf8 length");
 548             }
 549         }
 550
 551         # Test _utf8
 552         my $char = chr($n);
 553         utf8::upgrade($char);
 554         $char = quotemeta $char if $char eq '\\' || $char eq "'";
 555         foreach my $utf8_param("_safe",
 556                                 "_safe, malformed",
 557                                 "deprecated unsafe",
 558                                 "deprecated mathoms",
 559                                 )
 560         {
 561             use Config;
 562             next if    $utf8_param eq 'deprecated mathoms'
 563                     && $Config{'ccflags'} =~ /-DNO_MATHOMS/;
 564
 565             my $utf8_param_code = $utf8_param_code{$utf8_param};
 566             my $expect_error = $utf8_param_code > 0;
 567
 568             # Skip if can't malform (because is a UTF-8 invariant)
 569             next if $expect_error && $u < ((ord "A" == 65) ? 128 : 160);
 570
 571             my $display_call = "to${function}_utf8($display_name, $utf8_param )";
 572             $ret = eval   "test_to${function}_utf8('$char', $utf8_param_code)";
 573             if ($expect_error) {
 574                 isnt ($@, "", "expected and got error in $display_call");
 575                 like($@, qr/Malformed UTF-8 character/,
 576                      "${tab}And got expected message");
 577                 undef @warnings;
 578             }
 579             elsif (is ($@, "", "$display_call didn't give error")) {
 580                 is ($ret->[0], $first_ord_should_be,
 581                     sprintf("${tab}And correctly returned 0x%02X",
 582                                                     $first_ord_should_be));
 583                 is ($ret->[1], $utf8_should_be, "${tab}Got correct utf8");
 584                 use bytes;
 585                 is ($ret->[2], length $utf8_should_be,
 586                     "${tab}Got correct number of bytes for utf8 length");
 587                 if ($utf8_param_code < 0) {
 588                     my $warnings_ok;
 589                     if (! $seen{"${function}_utf8$utf8_param"}++) {
 590                         $warnings_ok = is(@warnings, 1,
 591                                                    "${tab}Got a single warning");
 592                         if ($warnings_ok) {
 593                             my $expected;
 594                             if ($utf8_param_code == -2) {
 595                                 my $lc_func = lc $function;
 596                                 $expected
 597                 = qr/starting in Perl .* to_utf8_$lc_func\(\) will be removed/;
 598                             }
 599                             else {
 600                                 $expected
 601                 = qr/starting in Perl .* will require an additional parameter/;
 602                             }
 603                             $warnings_ok = like($warnings[0], $expected,
 604                                       "${tab}Got expected deprecation warning");
 605                         }
 606                     }
 607                     else {
 608                         $warnings_ok = is(@warnings, 0,
 609                                   "${tab}Deprecation warned only the one time");
 610                     }
 611                     $warnings_ok or diag("@warnings");
 612                     undef @warnings;
 613                 }
 614             }
 615         }
 616     }
 617 }
 618
 619 # This is primarily to make sure that no non-Unicode warnings get generated
 620 is(scalar @warnings, 0, "No unexpected warnings were generated in the tests")
 621   or diag @warnings;
 622
 623 done_testing;