ext/XS-APItest/t/handy.t

   1 #!perl -w
   2
   3 BEGIN {
   4     require 'loc_tools.pl';   # Contains locales_enabled() and
   5                               # find_utf8_ctype_locale()
   6 }
   7
   8 use strict;
   9 use Test::More;
  10 use Config;
  11
  12 use XS::APItest;
  13
  14 my $tab = " " x 4;  # Indent subsidiary tests this much
  15
  16 use Unicode::UCD qw(search_invlist prop_invmap prop_invlist);
  17 my ($charname_list, $charname_map, $format, $default) = prop_invmap("Name Alias");
  18
  19 sub get_charname($) {
  20     my $cp = shift;
  21
  22     # If there is a an abbreviation for the code point name, use it
  23     my $name_index = search_invlist(\@{$charname_list}, $cp);
  24     if (defined $name_index) {
  25         my $synonyms = $charname_map->[$name_index];
  26         if (ref $synonyms) {
  27             my $pat = qr/: abbreviation/;
  28             my @abbreviations = grep { $_ =~ $pat } @$synonyms;
  29             if (@abbreviations) {
  30                 return $abbreviations[0] =~ s/$pat//r;
  31             }
  32         }
  33     }
  34
  35     # Otherwise, use the full name
  36     use charnames ();
  37     return charnames::viacode($cp) // "No name";
  38 }
  39
  40 sub truth($) {  # Converts values so is() works
  41     return (shift) ? 1 : 0;
  42 }
  43
  44 my $base_locale;
  45 my $utf8_locale;
  46 if(locales_enabled('LC_ALL')) {
  47     require POSIX;
  48     $base_locale = POSIX::setlocale( &POSIX::LC_ALL, "C");
  49     if (defined $base_locale && $base_locale eq 'C') {
  50         use locale; # make \w work right in non-ASCII lands
  51
  52         # Some locale implementations don't have the 128-255 characters all
  53         # mean nothing.  Skip the locale tests in that situation
  54         for my $i (128 .. 255) {
  55             if (chr(utf8::unicode_to_native($i)) =~ /[[:print:]]/) {
  56                 undef $base_locale;
  57                 last;
  58             }
  59         }
  60
  61         $utf8_locale = find_utf8_ctype_locale() if $base_locale;
  62     }
  63 }
  64
  65 sub get_display_locale_or_skip($$) {
  66
  67     # Helper function intimately tied to its callers.  It knows the loop
  68     # iterates with a locale of "", meaning don't use locale; $base_locale
  69     # meaning to use a non-UTF-8 locale; and $utf8_locale.
  70     #
  71     # It checks to see if the current test should be skipped or executed,
  72     # returning an empty list for the former, and for the latter:
  73     #   ( 'locale display name',
  74     #     bool of is this a UTF-8 locale )
  75     #
  76     # The display name is the empty string if not using locale.  Functions
  77     # with _LC in their name are skipped unless in locale, and functions
  78     # without _LC are executed only outside locale.  However, if no locales at
  79     # all are on the system, the _LC functions are executed outside locale.
  80
  81     my ($locale, $suffix) = @_;
  82
  83     # The test should be skipped if the input is for a non-existent locale
  84     return unless defined $locale;
  85
  86     # Here the input is defined, either a locale name or "".  If the test is
  87     # for not using locales, we want to do the test for non-LC functions,
  88     # and skip it for LC ones (except if there are no locales on the system,
  89     # we do it for LC ones as if they weren't LC).
  90     if ($locale eq "") {
  91         return ("", 0) if $suffix !~ /LC/ || ! defined $base_locale;
  92         return;
  93     }
  94
  95     # Here the input is for a real locale.  We don't test the non-LC functions
  96     # for locales.
  97     return if $suffix !~ /LC/;
  98
  99     # Here is for a LC function and a real locale.  The base locale is not
 100     # UTF-8.
 101     return (" ($locale locale)", 0) if $locale eq $base_locale;
 102
 103     # The only other possibility is that we have a UTF-8 locale
 104     return (" ($locale)", 1);
 105 }
 106
 107 sub try_malforming($$$)
 108 {
 109     # Determines if the tests for malformed UTF-8 should be done.  When done,
 110     # the .xs code creates malformations by pretending the length is shorter
 111     # than it actually is.  Some things can't be malformed, and sometimes this
 112     # test knows that the current code doesn't look for a malformation under
 113     # various circumstances.
 114
 115     my ($i, $function, $using_locale) = @_;
 116
 117     # Single bytes can't be malformed
 118     return 0 if $i < ((ord "A" == 65) ? 128 : 160);
 119
 120     # ASCII doesn't need to ever look beyond the first byte.
 121     return 0 if $function eq "ASCII";
 122
 123     # No controls above 255, so the code doesn't look at those
 124     return 0 if $i > 255 && $function eq "CNTRL";
 125
 126     # No non-ASCII digits below 256, except if using locales.
 127     return 0 if $i < 256 && ! $using_locale && $function =~ /X?DIGIT/;
 128
 129     return 1;
 130 }
 131
 132 my %properties = (
 133                    # name => Lookup-property name
 134                    alnum => 'Word',
 135                    wordchar => 'Word',
 136                    alphanumeric => 'Alnum',
 137                    alpha => 'XPosixAlpha',
 138                    ascii => 'ASCII',
 139                    blank => 'Blank',
 140                    cntrl => 'Control',
 141                    digit => 'Digit',
 142                    graph => 'Graph',
 143                    idfirst => '_Perl_IDStart',
 144                    idcont => '_Perl_IDCont',
 145                    lower => 'XPosixLower',
 146                    print => 'Print',
 147                    psxspc => 'XPosixSpace',
 148                    punct => 'XPosixPunct',
 149                    quotemeta => '_Perl_Quotemeta',
 150                    space => 'XPerlSpace',
 151                    vertws => 'VertSpace',
 152                    upper => 'XPosixUpper',
 153                    xdigit => 'XDigit',
 154                 );
 155
 156 my @warnings;
 157 local $SIG{__WARN__} = sub { push @warnings, @_ };
 158
 159 my %utf8_param_code = (
 160                         "_safe"                 =>  0,
 161                         "_safe, malformed"      =>  1,
 162                         "unsafe"                => -1,
 163                       );
 164
 165 foreach my $name (sort keys %properties, 'octal') {
 166     my @invlist;
 167     if ($name eq 'octal') {
 168         # Hand-roll an inversion list with 0-7 in it and nothing else.
 169         push @invlist, ord "0", ord "8";
 170     }
 171     else {
 172         my $property = $properties{$name};
 173         @invlist = prop_invlist($property, '_perl_core_internal_ok');
 174         if (! @invlist) {
 175
 176             # An empty return could mean an unknown property, or merely that
 177             # it is empty.  Call in scalar context to differentiate
 178             if (! prop_invlist($property, '_perl_core_internal_ok')) {
 179                 fail("No inversion list found for $property");
 180                 next;
 181             }
 182         }
 183     }
 184
 185     # Include all the Latin1 code points, plus 0x100.
 186     my @code_points = (0 .. 256);
 187
 188     # Then include the next few boundaries above those from this property
 189     my $above_latins = 0;
 190     foreach my $range_start (@invlist) {
 191         next if $range_start < 257;
 192         push @code_points, $range_start - 1, $range_start;
 193         $above_latins++;
 194         last if $above_latins > 5;
 195     }
 196
 197     # This makes sure we are using the Perl definition of idfirst and idcont,
 198     # and not the Unicode.  There are a few differences.
 199     push @code_points, ord "\N{ESTIMATED SYMBOL}" if $name =~ /^id(first|cont)/;
 200     if ($name eq "idcont") {    # And some that are continuation but not start
 201         push @code_points, ord("\N{GREEK ANO TELEIA}"),
 202                            ord("\N{COMBINING GRAVE ACCENT}");
 203     }
 204
 205     # And finally one non-Unicode code point.
 206     push @code_points, 0x110000;    # Above Unicode, no prop should match
 207     no warnings 'non_unicode';
 208
 209     for my $j (@code_points) {
 210         my $i = utf8::native_to_unicode($j);
 211         my $function = uc($name);
 212
 213         is (@warnings, 0, "Got no unexpected warnings in previous iteration")
 214            or diag("@warnings");
 215         undef @warnings;
 216
 217         my $matches = search_invlist(\@invlist, $i);
 218         if (! defined $matches) {
 219             $matches = 0;
 220         }
 221         else {
 222             $matches = truth(! ($matches % 2));
 223         }
 224
 225         my $ret;
 226         my $char_name = get_charname($j);
 227         my $display_name = sprintf "\\x{%02X, %s}", $i, $char_name;
 228         my $display_call = "is${function}( $display_name )";
 229
 230         foreach my $suffix ("", "_A", "_L1", "_LC", "_uni", "_uvchr",
 231                             "_LC_uvchr", "_utf8", "_LC_utf8")
 232         {
 233
 234             # Not all possible macros have been defined
 235             if ($name eq 'vertws') {
 236
 237                 # vertws is always all of Unicode
 238                 next if $suffix !~ / ^ _ ( uni | uvchr | utf8 ) $ /x;
 239             }
 240             elsif ($name eq 'alnum') {
 241
 242                 # ALNUM_A, ALNUM_L1, and ALNUM_uvchr are not defined as these
 243                 # suffixes were added later, after WORDCHAR was created to be
 244                 # a clearer synonym for ALNUM
 245                 next if    $suffix eq '_A'
 246                         || $suffix eq '_L1'
 247                         || $suffix eq '_uvchr';
 248             }
 249             elsif ($name eq 'octal') {
 250                 next if $suffix ne ""  && $suffix ne '_A' && $suffix ne '_L1';
 251             }
 252             elsif ($name eq 'quotemeta') {
 253                 # There is only one macro for this, and is defined only for
 254                 # Latin1 range
 255                 next if $suffix ne ""
 256             }
 257
 258             foreach my $locale ("", $base_locale, $utf8_locale) {
 259
 260                 my ($display_locale, $locale_is_utf8)
 261                                 = get_display_locale_or_skip($locale, $suffix);
 262                 next unless defined $display_locale;
 263
 264                 use if $locale, "locale";
 265                 POSIX::setlocale( &POSIX::LC_ALL, $locale) if $locale;
 266
 267                 if ($suffix !~ /utf8/) {    # _utf8 has to handled specially
 268                     my $display_call
 269                        = "is${function}$suffix( $display_name )$display_locale";
 270                     $ret = truth eval "test_is${function}$suffix($i)";
 271                     if (is ($@, "", "$display_call didn't give error")) {
 272                         my $truth = $matches;
 273                         if ($truth) {
 274
 275                             # The single byte functions are false for
 276                             # above-Latin1
 277                             if ($i >= 256) {
 278                                 $truth = 0
 279                                         if $suffix=~ / ^ ( _A | _L [1C] )? $ /x;
 280                             }
 281                             elsif (   utf8::native_to_unicode($i) >= 128
 282                                    && $name ne 'quotemeta')
 283                             {
 284
 285                                 # The no-suffix and _A functions are false
 286                                 # for non-ASCII.  So are  _LC  functions on a
 287                                 # non-UTF-8 locale
 288                                 $truth = 0 if    $suffix eq "_A"
 289                                               || $suffix eq ""
 290                                               || (     $suffix =~ /LC/
 291                                                   && ! $locale_is_utf8);
 292                             }
 293                         }
 294
 295                         is ($ret, $truth, "${tab}And correctly returns $truth");
 296                     }
 297                 }
 298                 else {  # _utf8 suffix
 299                     my $char = chr($i);
 300                     utf8::upgrade($char);
 301                     $char = quotemeta $char if $char eq '\\' || $char eq "'";
 302                     my $truth;
 303                     if (   $suffix =~ /LC/
 304                         && ! $locale_is_utf8
 305                         && $i < 256
 306                         && utf8::native_to_unicode($i) >= 128)
 307                     {   # The C-locale _LC function returns FALSE for Latin1
 308                         # above ASCII
 309                         $truth = 0;
 310                     }
 311                     else {
 312                         $truth = $matches;
 313                     }
 314
 315                     foreach my $utf8_param("_safe",
 316                                            "_safe, malformed",
 317                                            "unsafe"
 318                                           )
 319                     {
 320                         my $utf8_param_code = $utf8_param_code{$utf8_param};
 321                         my $expect_error = $utf8_param_code > 0;
 322                         next if      $expect_error
 323                                 && ! try_malforming($i, $function, $suffix =~ /LC/);
 324
 325                         my $display_call = "is${function}$suffix( $display_name"
 326                                          . ", $utf8_param )$display_locale";
 327                         $ret = truth eval "test_is${function}$suffix('$char',"
 328                                         . " $utf8_param_code)";
 329                         if ($expect_error) {
 330                             isnt ($@, "",
 331                                     "expected and got error in $display_call");
 332                             like($@, qr/Malformed UTF-8 character/,
 333                                 "${tab}And got expected message");
 334                             if (is (@warnings, 1,
 335                                            "${tab}Got a single warning besides"))
 336                             {
 337                                 like($warnings[0],
 338                                      qr/Malformed UTF-8 character.*short/,
 339                                      "${tab}Got expected warning");
 340                             }
 341                             else {
 342                                 diag("@warnings");
 343                             }
 344                             undef @warnings;
 345                         }
 346                         elsif (is ($@, "", "$display_call didn't give error")) {
 347                             is ($ret, $truth,
 348                                 "${tab}And correctly returned $truth");
 349                         }
 350                     }
 351                 }
 352             }
 353         }
 354     }
 355 }
 356
 357 my %to_properties = (
 358                 FOLD  => 'Case_Folding',
 359                 LOWER => 'Lowercase_Mapping',
 360                 TITLE => 'Titlecase_Mapping',
 361                 UPPER => 'Uppercase_Mapping',
 362             );
 363
 364
 365 foreach my $name (sort keys %to_properties) {
 366     my $property = $to_properties{$name};
 367     my ($list_ref, $map_ref, $format, $missing)
 368                                       = prop_invmap($property, );
 369     if (! $list_ref || ! $map_ref) {
 370         fail("No inversion map found for $property");
 371         next;
 372     }
 373     if ($format !~ / ^ a l? $ /x) {
 374         fail("Unexpected inversion map format ('$format') found for $property");
 375         next;
 376     }
 377
 378     # Include all the Latin1 code points, plus 0x100.
 379     my @code_points = (0 .. 256);
 380
 381     # Then include the next few multi-char folds above those from this
 382     # property, and include the next few single folds as well
 383     my $above_latins = 0;
 384     my $multi_char = 0;
 385     for my $i (0 .. @{$list_ref} - 1) {
 386         my $range_start = $list_ref->[$i];
 387         next if $range_start < 257;
 388         if (ref $map_ref->[$i] && $multi_char < 5)  {
 389             push @code_points, $range_start - 1
 390                                         if $code_points[-1] != $range_start - 1;
 391             push @code_points, $range_start;
 392             $multi_char++;
 393         }
 394         elsif ($above_latins < 5) {
 395             push @code_points, $range_start - 1
 396                                         if $code_points[-1] != $range_start - 1;
 397             push @code_points, $range_start;
 398             $above_latins++;
 399         }
 400         last if $above_latins >= 5 && $multi_char >= 5;
 401     }
 402
 403     # And finally one non-Unicode code point.
 404     push @code_points, 0x110000;    # Above Unicode, no prop should match
 405     no warnings 'non_unicode';
 406
 407     # $j is native; $i unicode.
 408     for my $j (@code_points) {
 409         my $i = utf8::native_to_unicode($j);
 410         my $function = $name;
 411
 412         my $index = search_invlist(\@{$list_ref}, $j);
 413
 414         my $ret;
 415         my $char_name = get_charname($j);
 416         my $display_name = sprintf "\\N{U+%02X, %s}", $i, $char_name;
 417
 418         foreach my $suffix ("", "_L1", "_LC") {
 419
 420             # This is the only macro defined for L1
 421             next if $suffix eq "_L1" && $function ne "LOWER";
 422
 423           SKIP:
 424             foreach my $locale ("", $base_locale, $utf8_locale) {
 425
 426                 # titlecase is not defined in locales.
 427                 next if $name eq 'TITLE' && $suffix eq "_LC";
 428
 429                 my ($display_locale, $locale_is_utf8)
 430                                 = get_display_locale_or_skip($locale, $suffix);
 431                 next unless defined $display_locale;
 432
 433                 skip("to${name}_LC does not work for LATIN SMALL LETTER SHARP S"
 434                   . "$display_locale", 1)
 435                             if  $i == 0xDF && $name =~ / FOLD | UPPER /x
 436                              && $suffix eq "_LC" && $locale_is_utf8;
 437
 438                 use if $locale, "locale";
 439                 POSIX::setlocale( &POSIX::LC_ALL, $locale) if $locale;
 440
 441                 my $display_call = "to${function}$suffix("
 442                                  . " $display_name )$display_locale";
 443                 $ret = eval "test_to${function}$suffix($j)";
 444                 if (is ($@, "", "$display_call didn't give error")) {
 445                     my $should_be;
 446                     if ($i > 255) {
 447                         $should_be = $j;
 448                     }
 449                     elsif (    $i > 127
 450                             && (   $suffix eq ""
 451                                 || ($suffix eq "_LC" && ! $locale_is_utf8)))
 452                     {
 453                         $should_be = $j;
 454                     }
 455                     elsif ($map_ref->[$index] != $missing) {
 456                         $should_be = $map_ref->[$index] + $j - $list_ref->[$index]
 457                     }
 458                     else {
 459                         $should_be = $j;
 460                     }
 461
 462                     is ($ret, $should_be,
 463                         sprintf("${tab}And correctly returned 0x%02X",
 464                                                               $should_be));
 465                 }
 466             }
 467         }
 468
 469         # The _uni, uvchr, and _utf8 functions return both the ordinal of the
 470         # first code point of the result, and the result in utf8.  The .xs
 471         # tests return these in an array, in [0] and [1] respectively, with
 472         # [2] the length of the utf8 in bytes.
 473         my $utf8_should_be = "";
 474         my $first_ord_should_be;
 475         if (ref $map_ref->[$index]) {   # A multi-char result
 476             for my $j (0 .. @{$map_ref->[$index]} - 1) {
 477                 $utf8_should_be .= chr $map_ref->[$index][$j];
 478             }
 479
 480             $first_ord_should_be = $map_ref->[$index][0];
 481         }
 482         else {  # A single-char result
 483             $first_ord_should_be = ($map_ref->[$index] != $missing)
 484                                     ? $map_ref->[$index] + $j
 485                                                          - $list_ref->[$index]
 486                                     : $j;
 487             $utf8_should_be = chr $first_ord_should_be;
 488         }
 489         utf8::upgrade($utf8_should_be);
 490
 491         # Test _uni, uvchr
 492         foreach my $suffix ('_uni', '_uvchr') {
 493             my $s;
 494             my $len;
 495             my $display_call = "to${function}$suffix( $display_name )";
 496             $ret = eval "test_to${function}$suffix($j)";
 497             if (is ($@, "", "$display_call didn't give error")) {
 498                 is ($ret->[0], $first_ord_should_be,
 499                     sprintf("${tab}And correctly returned 0x%02X",
 500                                                     $first_ord_should_be));
 501                 is ($ret->[1], $utf8_should_be, "${tab}Got correct utf8");
 502                 use bytes;
 503                 is ($ret->[2], length $utf8_should_be,
 504                     "${tab}Got correct number of bytes for utf8 length");
 505             }
 506         }
 507
 508         # Test _utf8
 509         my $char = chr($j);
 510         utf8::upgrade($char);
 511         $char = quotemeta $char if $char eq '\\' || $char eq "'";
 512         {
 513             my $display_call = "to${function}_utf8($display_name )";
 514             $ret = eval   "test_to${function}_utf8('$char')";
 515             if (is ($@, "", "$display_call didn't give error")) {
 516                 is ($ret->[0], $first_ord_should_be,
 517                     sprintf("${tab}And correctly returned 0x%02X",
 518                                                     $first_ord_should_be));
 519                 is ($ret->[1], $utf8_should_be, "${tab}Got correct utf8");
 520                 use bytes;
 521                 is ($ret->[2], length $utf8_should_be,
 522                     "${tab}Got correct number of bytes for utf8 length");
 523             }
 524         }
 525     }
 526 }
 527
 528 # This is primarily to make sure that no non-Unicode warnings get generated
 529 is(scalar @warnings, 0, "No unexpected warnings were generated in the tests")
 530   or diag @warnings;
 531
 532 done_testing;