ext/XS-APItest/t/utf8_warn_base.pl

   1 #!perl -w
   2
   3 # This is a base file to be used by various .t's in its directory
   4 # It tests various code points that are "problematic", and verifies that the
   5 # correct warnings/flags etc are generated when using them.  It also takes the
   6 # UTF-8 for some of them and perturbs it to be malformed in various ways, and
   7 # tests that this gets appropriately detected.
   8
   9 use strict;
  10 use Test::More;
  11
  12 BEGIN {
  13     use_ok('XS::APItest');
  14     require 'charset_tools.pl';
  15     require './t/utf8_setup.pl';
  16 };
  17
  18 $|=1;
  19
  20 no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
  21                           # machines, and that is tested elsewhere
  22 use XS::APItest;
  23
  24 my @warnings_gotten;
  25
  26 use warnings 'utf8';
  27 local $SIG{__WARN__} = sub { my @copy = @_;
  28                              push @warnings_gotten, map { chomp; $_ } @copy;
  29                            };
  30
  31 my $native_lowest_continuation_chr = I8_to_native(chr $::lowest_continuation);
  32
  33 sub requires_extended_utf8($) {
  34
  35     # Returns a boolean as to whether or not the code point parameter fits
  36     # into 31 bits, subject to the convention that a negative code point
  37     # stands for one that overflows the word size, so won't fit in 31 bits.
  38
  39     my $cp = shift;
  40     return $cp > 0x7FFFFFFF;
  41 }
  42
  43 my @tests;
  44 {
  45     no warnings qw(portable overflow);
  46     @tests = (
  47         # $testname,
  48         # $bytes,                  UTF-8 string
  49         # $allowed_uv,             code point $bytes evaluates to; -1 if
  50         #                          overflows
  51         # $needed_to_discern_len   optional, how long an initial substring do
  52         #                          we need to tell that the string must be for
  53         #                          a code point in the category it falls in,
  54         #                          like being a surrogate; 0 indicates we need
  55         #                          the whole string.  Some categories have a
  56         #                          default that is used if this is omitted.
  57         [ "lowest surrogate",
  58             (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
  59             0xD800,
  60         ],
  61         [ "a middle surrogate",
  62             (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
  63             0xD90D,
  64         ],
  65         [ "highest surrogate",
  66             (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
  67             0xDFFF,
  68         ],
  69         [ "first of 32 consecutive non-character code points",
  70             (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
  71             0xFDD0,
  72         ],
  73         [ "a mid non-character code point of the 32 consecutive ones",
  74             (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
  75             0xFDE0,
  76         ],
  77         [ "final of 32 consecutive non-character code points",
  78             (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
  79             0xFDEF,
  80         ],
  81         [ "non-character code point U+FFFE",
  82             (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
  83             0xFFFE,
  84         ],
  85         [ "non-character code point U+FFFF",
  86             (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
  87             0xFFFF,
  88         ],
  89         [ "non-character code point U+1FFFE",
  90             (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
  91             0x1FFFE,
  92         ],
  93         [ "non-character code point U+1FFFF",
  94             (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
  95             0x1FFFF,
  96         ],
  97         [ "non-character code point U+2FFFE",
  98             (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
  99             0x2FFFE,
 100         ],
 101         [ "non-character code point U+2FFFF",
 102             (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
 103             0x2FFFF,
 104         ],
 105         [ "non-character code point U+3FFFE",
 106             (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
 107             0x3FFFE,
 108         ],
 109         [ "non-character code point U+3FFFF",
 110             (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
 111             0x3FFFF,
 112         ],
 113         [ "non-character code point U+4FFFE",
 114             (isASCII)
 115             ?               "\xf1\x8f\xbf\xbe"
 116             : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
 117             0x4FFFE,
 118         ],
 119         [ "non-character code point U+4FFFF",
 120             (isASCII)
 121             ?               "\xf1\x8f\xbf\xbf"
 122             : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
 123             0x4FFFF,
 124         ],
 125         [ "non-character code point U+5FFFE",
 126             (isASCII)
 127             ?              "\xf1\x9f\xbf\xbe"
 128             : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
 129             0x5FFFE,
 130         ],
 131         [ "non-character code point U+5FFFF",
 132             (isASCII)
 133             ?              "\xf1\x9f\xbf\xbf"
 134             : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
 135             0x5FFFF,
 136         ],
 137         [ "non-character code point U+6FFFE",
 138             (isASCII)
 139             ?              "\xf1\xaf\xbf\xbe"
 140             : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
 141             0x6FFFE,
 142         ],
 143         [ "non-character code point U+6FFFF",
 144             (isASCII)
 145             ?              "\xf1\xaf\xbf\xbf"
 146             : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
 147             0x6FFFF,
 148         ],
 149         [ "non-character code point U+7FFFE",
 150             (isASCII)
 151             ?              "\xf1\xbf\xbf\xbe"
 152             : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
 153             0x7FFFE,
 154         ],
 155         [ "non-character code point U+7FFFF",
 156             (isASCII)
 157             ?              "\xf1\xbf\xbf\xbf"
 158             : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
 159             0x7FFFF,
 160         ],
 161         [ "non-character code point U+8FFFE",
 162             (isASCII)
 163             ?              "\xf2\x8f\xbf\xbe"
 164             : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
 165             0x8FFFE,
 166         ],
 167         [ "non-character code point U+8FFFF",
 168             (isASCII)
 169             ?              "\xf2\x8f\xbf\xbf"
 170             : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
 171             0x8FFFF,
 172         ],
 173         [ "non-character code point U+9FFFE",
 174             (isASCII)
 175             ?              "\xf2\x9f\xbf\xbe"
 176             : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
 177             0x9FFFE,
 178         ],
 179         [ "non-character code point U+9FFFF",
 180             (isASCII)
 181             ?              "\xf2\x9f\xbf\xbf"
 182             : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
 183             0x9FFFF,
 184         ],
 185         [ "non-character code point U+AFFFE",
 186             (isASCII)
 187             ?              "\xf2\xaf\xbf\xbe"
 188             : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
 189             0xAFFFE,
 190         ],
 191         [ "non-character code point U+AFFFF",
 192             (isASCII)
 193             ?              "\xf2\xaf\xbf\xbf"
 194             : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
 195             0xAFFFF,
 196         ],
 197         [ "non-character code point U+BFFFE",
 198             (isASCII)
 199             ?              "\xf2\xbf\xbf\xbe"
 200             : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
 201             0xBFFFE,
 202         ],
 203         [ "non-character code point U+BFFFF",
 204             (isASCII)
 205             ?              "\xf2\xbf\xbf\xbf"
 206             : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
 207             0xBFFFF,
 208         ],
 209         [ "non-character code point U+CFFFE",
 210             (isASCII)
 211             ?              "\xf3\x8f\xbf\xbe"
 212             : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
 213             0xCFFFE,
 214         ],
 215         [ "non-character code point U+CFFFF",
 216             (isASCII)
 217             ?              "\xf3\x8f\xbf\xbf"
 218             : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
 219             0xCFFFF,
 220         ],
 221         [ "non-character code point U+DFFFE",
 222             (isASCII)
 223             ?              "\xf3\x9f\xbf\xbe"
 224             : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
 225             0xDFFFE,
 226         ],
 227         [ "non-character code point U+DFFFF",
 228             (isASCII)
 229             ?              "\xf3\x9f\xbf\xbf"
 230             : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
 231             0xDFFFF,
 232         ],
 233         [ "non-character code point U+EFFFE",
 234             (isASCII)
 235             ?              "\xf3\xaf\xbf\xbe"
 236             : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
 237             0xEFFFE,
 238         ],
 239         [ "non-character code point U+EFFFF",
 240             (isASCII)
 241             ?              "\xf3\xaf\xbf\xbf"
 242             : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
 243             0xEFFFF,
 244         ],
 245         [ "non-character code point U+FFFFE",
 246             (isASCII)
 247             ?              "\xf3\xbf\xbf\xbe"
 248             : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
 249             0xFFFFE,
 250         ],
 251         [ "non-character code point U+FFFFF",
 252             (isASCII)
 253             ?              "\xf3\xbf\xbf\xbf"
 254             : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
 255             0xFFFFF,
 256         ],
 257         [ "non-character code point U+10FFFE",
 258             (isASCII)
 259             ?              "\xf4\x8f\xbf\xbe"
 260             : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
 261             0x10FFFE,
 262         ],
 263         [ "non-character code point U+10FFFF",
 264             (isASCII)
 265             ?              "\xf4\x8f\xbf\xbf"
 266             : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
 267             0x10FFFF,
 268         ],
 269         [ "first non_unicode",
 270             (isASCII)
 271             ?              "\xf4\x90\x80\x80"
 272             : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
 273             0x110000,
 274             2,
 275         ],
 276         [ "non_unicode whose first byte tells that",
 277             (isASCII)
 278             ?              "\xf5\x80\x80\x80"
 279             : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
 280             (isASCII) ? 0x140000 : 0x200000,
 281             1,
 282         ],
 283         [ "lowest 32 bit code point",
 284             (isASCII)
 285             ?  "\xfe\x82\x80\x80\x80\x80\x80"
 286             : I8_to_native(
 287                 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
 288             0x80000000,
 289             (isASCII) ? 1 : 8,
 290         ],
 291         [ "highest 32 bit code point",
 292             (isASCII)
 293             ?  "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
 294             : I8_to_native(
 295                "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
 296             0xFFFFFFFF,
 297             (isASCII) ? 1 : 8,
 298         ],
 299         [ "Lowest 33 bit code point",
 300             (isASCII)
 301             ?  "\xfe\x84\x80\x80\x80\x80\x80"
 302             : I8_to_native(
 303                 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
 304             ($::is64bit) ? 0x100000000 : -1,   # Overflows on 32-bit systems
 305             (isASCII && ! $::is64bit) ? 2 : 1,
 306         ],
 307     );
 308
 309     if (! $::is64bit) {
 310         if (isASCII) {
 311             push @tests,
 312                 [ "overflow that old algorithm failed to detect",
 313                     "\xfe\x86\x80\x80\x80\x80\x80",
 314                     -1,
 315                     2,
 316                 ];
 317         }
 318     }
 319
 320     if ($::is64bit) {
 321         push @tests,
 322             [ "highest 64 bit code point",
 323               (isASCII)
 324               ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
 325               : I8_to_native(
 326                 "\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"),
 327               0xFFFFFFFFFFFFFFFF,
 328               (isASCII) ? 1 : 2,
 329             ],
 330             [ "first 65 bit code point",
 331               (isASCII)
 332               ? "\xff\x80\x9f\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
 333               : I8_to_native(
 334                 "\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
 335               -1,
 336               (isASCII) ? 3 : 2,
 337             ];
 338         if (isASCII) {
 339             push @tests,
 340                 [ "Lowest code point requiring 13 bytes to represent",
 341                     "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
 342                     0x1000000000,
 343                     1,
 344                 ],
 345                 [ "overflow that old algorithm failed to detect",
 346                     "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
 347                     -1,
 348                     3,
 349                 ];
 350         }
 351         else {
 352             push @tests,    # These could falsely show wrongly in a naive
 353                             # implementation
 354                 [ "requires at least 32 bits",
 355                     I8_to_native(
 356                     "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
 357                     0x800000000,
 358                     7,
 359                 ],
 360                 [ "requires at least 32 bits",
 361                     I8_to_native(
 362                     "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
 363                     0x10000000000,
 364                     6,
 365                 ],
 366                 [ "requires at least 32 bits",
 367                     I8_to_native(
 368                     "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
 369                     0x200000000000,
 370                     5,
 371                 ],
 372                 [ "requires at least 32 bits",
 373                     I8_to_native(
 374                     "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
 375                     0x4000000000000,
 376                     4,
 377                 ],
 378                 [ "requires at least 32 bits",
 379                     I8_to_native(
 380                     "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
 381                     0x80000000000000,
 382                     3,
 383                 ],
 384                 [ "requires at least 32 bits",
 385                     I8_to_native(
 386                     "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
 387                     0x1000000000000000,
 388                     2,
 389                 ];
 390         }
 391     }
 392 }
 393
 394 sub flags_to_text($$)
 395 {
 396     my ($flags, $flags_to_text_ref) = @_;
 397
 398     # Returns a string containing a mnemonic representation of the bits that
 399     # are set in the $flags.  These are assumed to be flag bits.  The return
 400     # looks like "FOO|BAR|BAZ".  The second parameter is a reference to an
 401     # array that gives the textual representation of all the possible flags.
 402     # Element 0 is the text for the bit 0 flag; element 1 for bit 1; ....  If
 403     # no bits at all are set the string "0" is returned;
 404
 405     my @flag_text;
 406     my $shift = 0;
 407
 408     return "0" if $flags == 0;
 409
 410     while ($flags) {
 411         #diag sprintf "%x", $flags;
 412         if ($flags & 1) {
 413             push @flag_text, $flags_to_text_ref->[$shift];
 414         }
 415         $shift++;
 416         $flags >>= 1;
 417     }
 418
 419     return join "|", @flag_text;
 420 }
 421
 422 # Possible flag returns from utf8n_to_uvchr_error().  These should have G_,
 423 # instead of A_, D_, but the prefixes will be used in a a later commit, so
 424 # minimize churn by having them here.
 425 my @utf8n_flags_to_text =  ( qw(
 426         A_EMPTY
 427         A_CONTINUATION
 428         A_NON_CONTINUATION
 429         A_SHORT
 430         A_LONG
 431         A_LONG_AND_ITS_VALUE
 432         PLACEHOLDER
 433         A_OVERFLOW
 434         D_SURROGATE
 435         W_SURROGATE
 436         D_NONCHAR
 437         W_NONCHAR
 438         D_SUPER
 439         W_SUPER
 440         D_ABOVE_31_BIT
 441         W_ABOVE_31_BIT
 442         CHECK_ONLY
 443         NO_CONFIDENCE_IN_CURLEN_
 444     ) );
 445
 446 sub utf8n_display_call($)
 447 {
 448     # Converts an eval string that calls test_utf8n_to_uvchr into a more human
 449     # readable form, and returns it.  Doesn't work if the byte string contains
 450     # an apostrophe.  The return will look something like:
 451     #   test_utf8n_to_uvchr_error('$bytes', $length, $flags)
 452     #diag $_[0];
 453
 454     $_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x;
 455     my $text1 = $1;     # Everything before the byte string
 456     my $bytes = $2;
 457     my $text2 = $3;     # Includes the length
 458     my $flags = $4;
 459
 460     return $text1
 461          . display_bytes($bytes)
 462          . $text2
 463          . flags_to_text($flags, \@utf8n_flags_to_text)
 464          . ')';
 465 }
 466
 467 sub uvchr_display_call($)
 468 {
 469     # Converts an eval string that calls test_uvchr_to_utf8 into a more human
 470     # readable form, and returns it.  The return will look something like:
 471     #   test_uvchr_to_utf8n_flags($uv, $flags)
 472     #diag $_[0];
 473
 474     my @flags_to_text =  ( qw(
 475             W_SURROGATE
 476             W_NONCHAR
 477             W_SUPER
 478             W_ABOVE_31_BIT
 479             D_SURROGATE
 480             D_NONCHAR
 481             D_SUPER
 482             D_ABOVE_31_BIT
 483        ) );
 484
 485     $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x;
 486     my $text = $1;
 487     my $cp = sprintf "%X", $2;
 488     my $flags = $3;
 489
 490     return "${text}0x$cp, " . flags_to_text($flags, \@flags_to_text) . ')';
 491 }
 492
 493 sub do_warnings_test(@)
 494 {
 495     my @expected_warnings = @_;
 496
 497     # Compares the input expected warnings array with @warnings_gotten,
 498     # generating a pass for each found, removing it from @warnings_gotten.
 499     # Any discrepancies generate test failures.  Returns TRUE if no
 500     # discrepcancies; otherwise FALSE.
 501
 502     my $succeeded = 1;
 503
 504     if (@expected_warnings == 0) {
 505         if (! is(@warnings_gotten, 0, "    Expected and got no warnings")) {
 506             output_warnings(@warnings_gotten);
 507             $succeeded = 0;
 508         }
 509         return $succeeded;
 510     }
 511
 512     # Check that we got all the expected warnings,
 513     # removing each one found
 514   WARNING:
 515     foreach my $expected (@expected_warnings) {
 516         foreach (my $i = 0; $i < @warnings_gotten; $i++) {
 517             if ($warnings_gotten[$i] =~ $expected) {
 518                 pass("    Expected and got warning: "
 519                     . " $warnings_gotten[$i]");
 520                 splice @warnings_gotten, $i, 1;
 521                 next WARNING;
 522             }
 523         }
 524         fail("    Expected a warning that matches "
 525             . $expected . " but didn't get it");
 526         $succeeded = 0;
 527     }
 528
 529     if (! is(@warnings_gotten, 0, "    Got no unexpected warnings")) {
 530         output_warnings(@warnings_gotten);
 531         $succeeded = 0;
 532     }
 533
 534     return $succeeded;
 535 }
 536
 537 # This test is split into this number of files.
 538 my $num_test_files = $ENV{TEST_JOBS} || 1;
 539 $num_test_files = 10 if $num_test_files > 10;
 540
 541 my $test_count = -1;
 542 foreach my $test (@tests) {
 543     $test_count++;
 544     next if $test_count % $num_test_files != $::TEST_CHUNK;
 545
 546     my ($testname, $bytes, $allowed_uv, $needed_to_discern_len) = @$test;
 547
 548     my $length = length $bytes;
 549     my $will_overflow = $allowed_uv < 0;
 550
 551     my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv);
 552     my $display_bytes = display_bytes($bytes);
 553
 554     my $controlling_warning_category;
 555     my $utf8n_flag_to_warn;
 556     my $utf8n_flag_to_disallow;
 557     my $uvchr_flag_to_warn;
 558     my $uvchr_flag_to_disallow;
 559
 560     # We want to test that the independent flags are actually independent.
 561     # For example, that a surrogate doesn't trigger a non-character warning,
 562     # and conversely, turning off an above-Unicode flag doesn't suppress a
 563     # surrogate warning.  Earlier versions of this file used nested loops to
 564     # test all possible combinations.  But that creates lots of tests, making
 565     # this run too long.  What is now done instead is to use the complement of
 566     # the category we are testing to greatly reduce the combinatorial
 567     # explosion.  For example, if we have a surrogate and we aren't expecting
 568     # a warning about it, we set all the flags for non-surrogates to raise
 569     # warnings.  If one shows up, it indicates the flags aren't independent.
 570     my $utf8n_flag_to_warn_complement;
 571     my $utf8n_flag_to_disallow_complement;
 572     my $uvchr_flag_to_warn_complement;
 573     my $uvchr_flag_to_disallow_complement;
 574
 575     # Many of the code points being tested are middling in that if code point
 576     # edge cases work, these are very likely to as well.  Because this test
 577     # file takes a while to execute, we skip testing the edge effects of code
 578     # points deemed middling, while testing their basics and continuing to
 579     # fully test the non-middling code points.
 580     my $skip_most_tests = 0;
 581
 582     my $cp_message_qr;      # Pattern that matches the message raised when
 583                             # that message contains the problematic code
 584                             # point.  The message is the same (currently) both
 585                             # when going from/to utf8.
 586     my $non_cp_trailing_text;   # The suffix text when the message doesn't
 587                                 # contain a code point.  (This is a result of
 588                                 # some sort of malformation that means we
 589                                 # can't get an exact code poin
 590
 591     # Is this test malformed from the beginning?  If so, we know to generally
 592     # expect that the tests will show it isn't valid.
 593     my $initially_malformed = 0;
 594
 595     if ($will_overflow || $allowed_uv > 0x10FFFF) {
 596
 597         # Set the SUPER flags; later, we test for ABOVE_31_BIT as well.
 598         $utf8n_flag_to_warn     = $::UTF8_WARN_SUPER;
 599         $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER;
 600         $uvchr_flag_to_warn     = $::UNICODE_WARN_SUPER;
 601         $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;;
 602
 603         # Below, we add the flags for non-above-31 bit to the code points that
 604         # don't fit that category.  Special tests are done for this category
 605         # in the inner loop.
 606         $utf8n_flag_to_warn_complement     = $::UTF8_WARN_NONCHAR
 607                                             |$::UTF8_WARN_SURROGATE;
 608         $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
 609                                             |$::UTF8_DISALLOW_SURROGATE;
 610         $uvchr_flag_to_warn_complement     = $::UNICODE_WARN_NONCHAR
 611                                             |$::UNICODE_WARN_SURROGATE;
 612         $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
 613                                             |$::UNICODE_DISALLOW_SURROGATE;
 614         $controlling_warning_category = 'non_unicode';
 615
 616         if ($will_overflow) {  # This is realy a malformation
 617             $non_cp_trailing_text = "if you see this, there is an error";
 618             $cp_message_qr = qr/\Q$non_cp_trailing_text\E/;
 619             $initially_malformed = 1;
 620         }
 621         elsif (requires_extended_utf8($allowed_uv)) {
 622             $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
 623                                 \Q and not portable\E/x;
 624             $non_cp_trailing_text = "is for a non-Unicode code point, and is not portable";
 625         }
 626         else {
 627             $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
 628                                 \Q may not be portable\E/x;
 629             $non_cp_trailing_text = "is for a non-Unicode code point, may not"
 630                                 . " be portable";
 631             $utf8n_flag_to_warn_complement     |= $::UTF8_WARN_ABOVE_31_BIT;
 632             $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_ABOVE_31_BIT;
 633             $uvchr_flag_to_warn_complement     |= $::UNICODE_WARN_ABOVE_31_BIT;
 634             $uvchr_flag_to_disallow_complement
 635                                             |= $::UNICODE_DISALLOW_ABOVE_31_BIT;
 636         }
 637     }
 638     elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
 639         $cp_message_qr = qr/UTF-16 surrogate U\+$uv_string/;
 640         $non_cp_trailing_text = "is for a surrogate";
 641         $needed_to_discern_len = 2 unless defined $needed_to_discern_len;
 642         $skip_most_tests = 1 if $allowed_uv > 0xD800 && $allowed_uv < 0xDFFF;
 643
 644         $utf8n_flag_to_warn     = $::UTF8_WARN_SURROGATE;
 645         $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SURROGATE;
 646         $uvchr_flag_to_warn     = $::UNICODE_WARN_SURROGATE;
 647         $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SURROGATE;;
 648
 649         $utf8n_flag_to_warn_complement     = $::UTF8_WARN_NONCHAR
 650                                             |$::UTF8_WARN_SUPER
 651                                             |$::UTF8_WARN_ABOVE_31_BIT;
 652         $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
 653                                             |$::UTF8_DISALLOW_SUPER
 654                                             |$::UTF8_DISALLOW_ABOVE_31_BIT;
 655         $uvchr_flag_to_warn_complement     = $::UNICODE_WARN_NONCHAR
 656                                             |$::UNICODE_WARN_SUPER
 657                                             |$::UNICODE_WARN_ABOVE_31_BIT;
 658         $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
 659                                             |$::UNICODE_DISALLOW_SUPER
 660                                             |$::UNICODE_DISALLOW_ABOVE_31_BIT;
 661         $controlling_warning_category = 'surrogate';
 662     }
 663     elsif (   ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
 664            || ($allowed_uv & 0xFFFE) == 0xFFFE)
 665     {
 666         $cp_message_qr = qr/\QUnicode non-character U+$uv_string\E
 667                             \Q is not recommended for open interchange\E/x;
 668         $non_cp_trailing_text = "if you see this, there is an error";
 669         $needed_to_discern_len = $length unless defined $needed_to_discern_len;
 670         if (   ($allowed_uv > 0xFDD0 && $allowed_uv < 0xFDEF)
 671             || ($allowed_uv > 0xFFFF && $allowed_uv < 0x10FFFE))
 672         {
 673             $skip_most_tests = 1;
 674         }
 675
 676         $utf8n_flag_to_warn     = $::UTF8_WARN_NONCHAR;
 677         $utf8n_flag_to_disallow = $::UTF8_DISALLOW_NONCHAR;
 678         $uvchr_flag_to_warn     = $::UNICODE_WARN_NONCHAR;
 679         $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_NONCHAR;;
 680
 681         $utf8n_flag_to_warn_complement     = $::UTF8_WARN_SURROGATE
 682                                             |$::UTF8_WARN_SUPER
 683                                             |$::UTF8_WARN_ABOVE_31_BIT;
 684         $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE
 685                                             |$::UTF8_DISALLOW_SUPER
 686                                             |$::UTF8_DISALLOW_ABOVE_31_BIT;
 687         $uvchr_flag_to_warn_complement     = $::UNICODE_WARN_SURROGATE
 688                                             |$::UNICODE_WARN_SUPER
 689                                             |$::UNICODE_WARN_ABOVE_31_BIT;
 690         $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_SURROGATE
 691                                             |$::UNICODE_DISALLOW_SUPER
 692                                             |$::UNICODE_DISALLOW_ABOVE_31_BIT;
 693
 694         $controlling_warning_category = 'nonchar';
 695     }
 696     else {
 697         die "Can't figure out what type of warning to test for $testname"
 698     }
 699
 700     die 'Didn\'t set $needed_to_discern_len for ' . $testname
 701                                         unless defined $needed_to_discern_len;
 702
 703     {   # First test the isFOO calls
 704         use warnings; no warnings 'deprecated';   # Make sure these don't raise warnings
 705         undef @warnings_gotten;
 706
 707         my $ret = test_isUTF8_CHAR($bytes, $length);
 708         my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
 709         if ($initially_malformed) {
 710             is($ret, 0, "For $testname: isUTF8_CHAR() returns 0");
 711             is($ret_flags, 0, "    And isUTF8_CHAR_flags() returns 0");
 712         }
 713         else {
 714             is($ret, $length,
 715                "For $testname: isUTF8_CHAR() returns expected length: $length");
 716             is($ret_flags, $length, "    And isUTF8_CHAR_flags(...,0)"
 717                                   . " returns expected length: $length");
 718         }
 719         is(scalar @warnings_gotten, 0,
 720                 "    And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags generated"
 721               . " any warnings")
 722           or output_warnings(@warnings_gotten);
 723
 724         undef @warnings_gotten;
 725         $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
 726         if ($initially_malformed) {
 727             is($ret, 0, "    And isSTRICT_UTF8_CHAR() returns 0");
 728         }
 729         else {
 730             my $expected_ret = (   $testname =~ /surrogate|non-character/
 731                                 || $allowed_uv > 0x10FFFF)
 732                                ? 0
 733                                : $length;
 734             is($ret, $expected_ret, "    And isSTRICT_UTF8_CHAR() returns"
 735                                   . " expected length: $expected_ret");
 736             $ret = test_isUTF8_CHAR_flags($bytes, $length,
 737                                           $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
 738             is($ret, $expected_ret,
 739                     "    And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')"
 740                     . " acts like isSTRICT_UTF8_CHAR");
 741         }
 742         is(scalar @warnings_gotten, 0,
 743                 "    And neither isSTRICT_UTF8_CHAR() nor isUTF8_CHAR_flags"
 744               . " generated any warnings")
 745           or output_warnings(@warnings_gotten);
 746
 747         undef @warnings_gotten;
 748         $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
 749         if ($initially_malformed) {
 750             is($ret, 0, "    And isC9_STRICT_UTF8_CHAR() returns 0");
 751         }
 752         else {
 753             my $expected_ret = (   $testname =~ /surrogate/
 754                                 || $allowed_uv > 0x10FFFF)
 755                                ? 0
 756                                : $length;
 757             is($ret, $expected_ret, "    And isC9_STRICT_UTF8_CHAR()"
 758                                    ." returns expected length: $expected_ret");
 759             $ret = test_isUTF8_CHAR_flags($bytes, $length,
 760                                         $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
 761             is($ret, $expected_ret,
 762                   "    And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')"
 763                 . " acts like isC9_STRICT_UTF8_CHAR");
 764         }
 765         is(scalar @warnings_gotten, 0,
 766                 "    And neither isC9_STRICT_UTF8_CHAR() nor isUTF8_CHAR_flags"
 767               . " generated any warnings")
 768           or output_warnings(@warnings_gotten);
 769
 770         foreach my $disallow_type (0..2) {
 771             # 0 is don't disallow this type of code point
 772             # 1 is do disallow
 773             # 2 is do disallow, but only for above 31 bit
 774
 775             my $disallow_flags;
 776             my $expected_ret;
 777
 778             if ($initially_malformed) {
 779
 780                 # Malformations are by default disallowed, so testing with
 781                 # $disallow_type equal to 0 is sufficicient.
 782                 next if $disallow_type;
 783
 784                 $disallow_flags = 0;
 785                 $expected_ret = 0;
 786             }
 787             elsif ($disallow_type == 1) {
 788                 $disallow_flags = $utf8n_flag_to_disallow;
 789                 $expected_ret = 0;
 790             }
 791             elsif ($disallow_type == 2) {
 792                 next if ! requires_extended_utf8($allowed_uv);
 793                 $disallow_flags = $::UTF8_DISALLOW_ABOVE_31_BIT;
 794                 $expected_ret = 0;
 795             }
 796             else {  # type is 0
 797                 $disallow_flags = $utf8n_flag_to_disallow_complement;
 798                 $expected_ret = $length;
 799             }
 800
 801             $ret = test_isUTF8_CHAR_flags($bytes, $length, $disallow_flags);
 802             is($ret, $expected_ret, "    And isUTF8_CHAR_flags("
 803                                   . "$display_bytes, $disallow_flags) returns "
 804                                   . $expected_ret)
 805              or diag "The flags mean "
 806               . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
 807
 808             is(scalar @warnings_gotten, 0,
 809                     "    And isUTF8_CHAR_flags(...) generated no warnings")
 810             or output_warnings(@warnings_gotten);
 811
 812             # Test partial character handling, for each byte not a full character
 813             my $did_test_partial = 0;
 814             for (my $j = 1; $j < $length - 1; $j++) {
 815                 $did_test_partial = 1;
 816                 my $partial = substr($bytes, 0, $j);
 817                 my $ret_should_be;
 818                 my $comment;
 819                 if ($disallow_type || $initially_malformed) {
 820                     $ret_should_be = 0;
 821                     $comment = "disallowed";
 822                     if ($j < $needed_to_discern_len) {
 823                         $ret_should_be = 1;
 824                         $comment .= ", but need $needed_to_discern_len"
 825                                  . " bytes to discern:";
 826                     }
 827                 }
 828                 else {
 829                     $ret_should_be = 1;
 830                     $comment = "allowed";
 831                 }
 832
 833                 undef @warnings_gotten;
 834
 835                 $ret = test_is_utf8_valid_partial_char_flags($partial, $j,
 836                                                              $disallow_flags);
 837                 is($ret, $ret_should_be,
 838                     "    And is_utf8_valid_partial_char_flags("
 839                     . display_bytes($partial)
 840                     . ", $disallow_flags), $comment: returns $ret_should_be")
 841                  or diag "The flags mean "
 842                   . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
 843             }
 844
 845             if ($did_test_partial) {
 846                 is(scalar @warnings_gotten, 0,
 847                         "    And is_utf8_valid_partial_char_flags()"
 848                         . " generated no warnings for any of the lengths")
 849                     or output_warnings(@warnings_gotten);
 850             }
 851         }
 852     }
 853
 854     # Now test the to/from UTF-8 calls
 855     # This is more complicated than the malformations tested in other files in
 856     # this directory, as there are several orthogonal variables involved.  We
 857     # test most possible combinations
 858
 859     # We try various combinations of malformations that can occur
 860     foreach my $short (0, 1) {
 861       next if $skip_most_tests && $short;
 862       foreach my $unexpected_noncont (0, 1) {
 863         next if $skip_most_tests && $unexpected_noncont;
 864         foreach my $overlong (0, 1) {
 865           next if $overlong && $skip_most_tests;
 866
 867           # If we're creating an overlong, it can't be longer than the
 868           # maximum length, so skip if we're already at that length.
 869           next if $overlong && $length >= $::max_bytes;
 870
 871           foreach my $malformed_allow_type (0..2) {
 872             # 0 don't allow this malformation; ignored if no malformation
 873             # 1 allow, with REPLACEMENT CHARACTER returned
 874             # 2 allow, with intended code point returned.  All malformations
 875             #   other than overlong can't determine the intended code point,
 876             #   so this isn't valid for them.
 877             next if     $malformed_allow_type == 2
 878                     && ($will_overflow || $short || $unexpected_noncont);
 879             next if $skip_most_tests && $malformed_allow_type;
 880
 881             # Here we are in the innermost loop for malformations.  So we
 882             # know which ones are in effect.  Can now change the input to be
 883             # appropriately malformed.  We also can set up certain other
 884             # things now, like whether we expect a return flag from this
 885             # malformation, and which flag.
 886
 887             my $this_bytes = $bytes;
 888             my $this_length = $length;
 889             my $this_expected_len = $length;
 890             my $this_needed_to_discern_len = $needed_to_discern_len;
 891
 892             my @malformation_names;
 893             my @expected_malformation_warnings;
 894             my @expected_malformation_return_flags;
 895
 896             # Contains the flags for any allowed malformations.  Currently no
 897             # combinations of on/off are tested for.  It's either all are
 898             # allowed, or none are.
 899             my $allow_flags = 0;
 900
 901             if ($overlong) {
 902                 my $new_expected_len;
 903
 904                 # To force this malformation, we convert the original start
 905                 # byte into a continuation byte with the same data bits as
 906                 # originally. ...
 907                 my $start_byte = substr($this_bytes, 0, 1);
 908                 my $converted_to_continuation_byte
 909                                             = start_byte_to_cont($start_byte);
 910
 911                 # ... Then we prepend it with a known overlong sequence.  This
 912                 # should evaluate to the exact same code point as the
 913                 # original.  We try to avoid an overlong using Perl extended
 914                 # UTF-8.  The code points are the highest representable as
 915                 # overlongs on the respective platform without using extended
 916                 # UTF-8.
 917                 if (native_to_I8($start_byte) lt "\xFC") {
 918                     $start_byte = I8_to_native("\xFC");
 919                     $new_expected_len = 6;
 920                 }
 921                 elsif (! isASCII && native_to_I8($start_byte) lt "\xFE") {
 922
 923                     # FE is not extended UTF-8 on EBCDIC
 924                     $start_byte = I8_to_native("\xFE");
 925                     $new_expected_len = 7;
 926                 }
 927                 else {  # Must use extended UTF-8.  On ASCII platforms, we
 928                         # could express some overlongs here starting with
 929                         # \xFE, but there's no real reason to do so.
 930                     $start_byte = I8_to_native("\xFF");
 931                     $new_expected_len = $::max_bytes;
 932                 }
 933
 934                 # Splice in the revise continuation byte, preceded by the
 935                 # start byte and the proper number of the lowest continuation
 936                 # bytes.
 937                 $this_bytes =   $start_byte
 938                              . ($native_lowest_continuation_chr
 939                                 x ( $new_expected_len - 1 - length($this_bytes)))
 940                              .  $converted_to_continuation_byte
 941                              .  substr($this_bytes, 1);
 942                 $this_length = length($this_bytes);
 943                 $this_needed_to_discern_len =    $new_expected_len
 944                                             - (  $this_expected_len
 945                                                - $this_needed_to_discern_len);
 946                 $this_expected_len = $new_expected_len;
 947                 push @expected_malformation_return_flags, $::UTF8_GOT_LONG;
 948                 push @malformation_names, 'overlong';
 949
 950                 if ($malformed_allow_type == 2) {
 951                     $allow_flags |= $::UTF8_ALLOW_LONG_AND_ITS_VALUE;
 952                 }
 953                 elsif ($malformed_allow_type) {
 954                     $allow_flags |= $::UTF8_ALLOW_LONG;
 955                 }
 956             }
 957
 958             if ($short) {
 959                 push @malformation_names, 'short';
 960
 961                 # To force this malformation, just tell the test to not look
 962                 # as far as it should into the input.
 963                 $this_length--;
 964                 $this_expected_len--;
 965                 push @expected_malformation_return_flags, $::UTF8_GOT_SHORT;
 966
 967                 $allow_flags |= $::UTF8_ALLOW_SHORT if $malformed_allow_type;
 968             }
 969
 970             if ($unexpected_noncont) {
 971                 push @malformation_names, 'unexpected non-continuation';
 972
 973                 # To force this malformation, change the final continuation
 974                 # byte into a non continuation.
 975                 my $pos = ($short) ? -2 : -1;
 976                 substr($this_bytes, $pos, 1) = '?';
 977                 $this_expected_len--;
 978                 push @expected_malformation_return_flags,
 979                                 $::UTF8_GOT_NON_CONTINUATION;
 980                 $allow_flags |= $::UTF8_ALLOW_NON_CONTINUATION
 981                                                     if $malformed_allow_type;
 982             }
 983
 984             # Here, we've transformed the input with all of the desired
 985             # non-overflow malformations.  We are now in a position to
 986             # construct any potential warnings for those malformations.  But
 987             # it's a pain to get the detailed messages exactly right, so for
 988             # now XXX, only do so for those that return an explicit code
 989             # point.
 990
 991             if ($overlong) {
 992
 993                 # If one of the other malformation types is also in effect, we
 994                 # don't know what the intended code point was.
 995                 if ($short || $unexpected_noncont || $will_overflow) {
 996                     push @expected_malformation_warnings, qr/overlong/;
 997                 }
 998                 else {
 999                     my $wrong_bytes = display_bytes_no_quotes(
1000                                          substr($this_bytes, 0, $this_length));
1001                     my $correct_bytes = display_bytes_no_quotes($bytes);
1002                     my $prefix = ($allowed_uv > 0x10FFFF) ? "0x" : "U+";
1003                     push @expected_malformation_warnings,
1004                             qr/\QMalformed UTF-8 character: $wrong_bytes\E
1005                                \Q (overlong; instead use\E
1006                                \Q $correct_bytes to\E
1007                                \Q represent $prefix$uv_string)/x;
1008                 }
1009             }
1010             if ($short) {
1011                 push @expected_malformation_warnings, qr/too short/;
1012             }
1013             if ($unexpected_noncont) {
1014                 push @expected_malformation_warnings,
1015                                         qr/unexpected non-continuation byte/;
1016             }
1017
1018             # The overflow malformation is done differently than other
1019             # malformations.  It comes from manually typed tests in the test
1020             # array.  We now make it be treated like one of the other
1021             # malformations.  But some has to be deferred until the inner loop
1022             my $overflow_msg_pattern;
1023             if ($will_overflow) {
1024                 push @malformation_names, 'overflow';
1025
1026                 $overflow_msg_pattern = display_bytes_no_quotes(
1027                                     substr($this_bytes, 0, $this_expected_len));
1028                 $overflow_msg_pattern = qr/\QMalformed UTF-8 character:\E
1029                                            \Q $overflow_msg_pattern\E
1030                                            \Q (overflows)\E/x;
1031                 push @expected_malformation_return_flags, $::UTF8_GOT_OVERFLOW;
1032                 $allow_flags |= $::UTF8_ALLOW_OVERFLOW if $malformed_allow_type;
1033             }
1034
1035             # And we can create the malformation-related text for the the test
1036             # names we eventually will generate.
1037             my $malformations_name = "";
1038             if (@malformation_names) {
1039                 $malformations_name .= "dis" unless $malformed_allow_type;
1040                 $malformations_name .= "allowed ";
1041                 $malformations_name .= "malformation";
1042                 $malformations_name .= "s" if @malformation_names > 1;
1043                 $malformations_name .= ": ";
1044                 $malformations_name .=  join "/", @malformation_names;
1045                 $malformations_name =  " ($malformations_name)";
1046             }
1047
1048             # Done setting up the malformation related stuff
1049
1050             foreach my $do_disallow (0, 1) {
1051               next if $skip_most_tests && ! $do_disallow;
1052
1053               # We classify the warnings into certain "interesting" types,
1054               # described later
1055               foreach my $warning_type (0..4) {
1056                 next if $skip_most_tests && $warning_type != 1;
1057                 foreach my $use_warn_flag (0, 1) {
1058                     next if $skip_most_tests && ! $use_warn_flag;
1059
1060                     # Finally, here is the inner loop
1061
1062                     my $this_utf8n_flag_to_warn = $utf8n_flag_to_warn;
1063                     my $this_utf8n_flag_to_disallow = $utf8n_flag_to_disallow;
1064                     my $this_uvchr_flag_to_warn = $uvchr_flag_to_warn;
1065                     my $this_uvchr_flag_to_disallow = $uvchr_flag_to_disallow;
1066
1067                     my $eval_warn;
1068                     my $expect_regular_warnings;
1069                     my $expect_warnings_for_malformed;
1070                     my $expect_warnings_for_overflow;
1071
1072                     if ($warning_type == 0) {
1073                         $eval_warn = "use warnings; no warnings 'deprecated'";
1074                         $expect_regular_warnings = $use_warn_flag;
1075
1076                         # We ordinarily expect overflow warnings here.  But it
1077                         # is somewhat more complicated, and the final
1078                         # determination is deferred to one place in the filw
1079                         # where we handle overflow.
1080                         $expect_warnings_for_overflow = 1;
1081
1082                         # We would ordinarily expect malformed warnings in
1083                         # this case, but not if malformations are allowed.
1084                         $expect_warnings_for_malformed
1085                                                 = $malformed_allow_type == 0;
1086                     }
1087                     elsif ($warning_type == 1) {
1088                         $eval_warn = "no warnings";
1089                         $expect_regular_warnings = 0;
1090                         $expect_warnings_for_overflow = 0;
1091                         $expect_warnings_for_malformed = 0;
1092                     }
1093                     elsif ($warning_type == 2) {
1094                         $eval_warn = "no warnings; use warnings 'utf8'";
1095                         $expect_regular_warnings = $use_warn_flag;
1096                         $expect_warnings_for_overflow = 1;
1097                         $expect_warnings_for_malformed
1098                                                 = $malformed_allow_type == 0;
1099                     }
1100                     elsif ($warning_type == 3) {
1101                         $eval_warn = "no warnings; use warnings"
1102                                    . " '$controlling_warning_category'";
1103                         $expect_regular_warnings = $use_warn_flag;
1104                         $expect_warnings_for_overflow
1105                             = $controlling_warning_category eq 'non_unicode';
1106                         $expect_warnings_for_malformed = 0;
1107                     }
1108                     elsif ($warning_type == 4) {  # Like type 3, but uses the
1109                                                   # above-31-bit flags
1110                         # The complement flags were set up so that the
1111                         # above-31-bit flags have been tested that they don't
1112                         # trigger wrongly for too small code points.  And the
1113                         # flags have been set up so that those small code
1114                         # points are tested for being above Unicode.  What's
1115                         # left to test is that the large code points do
1116                         # trigger the above-31-bit flags.
1117                         next if ! requires_extended_utf8($allowed_uv);
1118                         next if $controlling_warning_category ne 'non_unicode';
1119                         $eval_warn = "no warnings; use warnings 'non_unicode'";
1120                         $expect_regular_warnings = 1;
1121                         $expect_warnings_for_overflow = 1;
1122                         $expect_warnings_for_malformed = 0;
1123                         $this_utf8n_flag_to_warn   = $::UTF8_WARN_ABOVE_31_BIT;
1124                         $this_utf8n_flag_to_disallow
1125                                                 = $::UTF8_DISALLOW_ABOVE_31_BIT;
1126                         $this_uvchr_flag_to_warn = $::UNICODE_WARN_ABOVE_31_BIT;
1127                         $this_uvchr_flag_to_disallow
1128                                              = $::UNICODE_DISALLOW_ABOVE_31_BIT;
1129                     }
1130                     else {
1131                        die "Unexpected warning type '$warning_type'";
1132                     }
1133
1134                     # We only need to test the case where all warnings are
1135                     # enabled (type 0) to see if turning off the warning flag
1136                     # causes things to not be output.  If those pass, then
1137                     # turning on some sub-category of warnings, or turning off
1138                     # warnings altogether are extremely likely to not output
1139                     # warnings either, given how the warnings subsystem is
1140                     # supposed to work, and this file assumes it does work.
1141                     next if $warning_type != 0 && ! $use_warn_flag;
1142
1143                     # The convention is that the 'got' flag is the same value
1144                     # as the disallow one.  If this were violated, the tests
1145                     # here should start failing.
1146                     my $return_flag = $this_utf8n_flag_to_disallow;
1147
1148                     # If we aren't expecting warnings/disallow for this, turn
1149                     # on all the other flags.  That makes sure that they all
1150                     # are independent of this flag, and so we don't need to
1151                     # test them individually.
1152                     my $this_warning_flags  = ($use_warn_flag)
1153                                               ? $this_utf8n_flag_to_warn
1154                                               : $utf8n_flag_to_warn_complement;
1155                     my $this_disallow_flags = ($do_disallow)
1156                                           ? $this_utf8n_flag_to_disallow
1157                                           : $utf8n_flag_to_disallow_complement;
1158                     my $expected_uv = $allowed_uv;
1159                     my $this_uv_string = $uv_string;
1160
1161                     my @expected_return_flags
1162                                         = @expected_malformation_return_flags;
1163                     my @expected_warnings;
1164                     push @expected_warnings, @expected_malformation_warnings
1165                                             if $expect_warnings_for_malformed;
1166
1167                     # The overflow malformation is done differently than other
1168                     # malformations.  It comes from manually typed tests in
1169                     # the test array, but it also is above Unicode and uses
1170                     # Perl extended UTF-8, so affects some of the flags being
1171                     # tested.  We now make it be treated like one of the other
1172                     # generated malformations.
1173                     if ($will_overflow) {
1174
1175                         # An overflow is (way) above Unicode, and overrides
1176                         # everything else.
1177                         $expect_regular_warnings = 0;
1178
1179                         # Earlier, we tentatively calculated whether this
1180                         # should emit a message or not.  It's tentative
1181                         # because, even if we ordinarily would output it, we
1182                         # don't if malformations are allowed -- except an
1183                         # overflow is also a SUPER and ABOVE_31_BIT, and if
1184                         # warnings for those are enabled, the overflow
1185                         # warning does get raised.
1186                         if (   $expect_warnings_for_overflow
1187                             && (    $malformed_allow_type == 0
1188                                 ||   (   $this_warning_flags
1189                                       & ($::UTF8_WARN_SUPER
1190                                         |$::UTF8_WARN_ABOVE_31_BIT))))
1191                         {
1192                             push @expected_warnings, $overflow_msg_pattern;
1193                         }
1194                     }
1195
1196                     # It may be that the malformations have shortened the
1197                     # amount of input we look at so much that we can't tell
1198                     # what the category the code point was in.  Otherwise, set
1199                     # up the expected return flags based on the warnings and
1200                     # disallowments.
1201                     if ($this_expected_len < $this_needed_to_discern_len) {
1202                         $expect_regular_warnings = 0;
1203                     }
1204                     elsif (   ($this_warning_flags & $this_utf8n_flag_to_warn)
1205                            || (  $this_disallow_flags
1206                                & $this_utf8n_flag_to_disallow))
1207                     {
1208                         push @expected_return_flags, $return_flag;
1209                     }
1210
1211                     # Finish setting up the expected warning.
1212                     if ($expect_regular_warnings) {
1213
1214                         # So far the array contains warnings generated by
1215                         # malformations.  Add the expected regular one.
1216                         unshift @expected_warnings, $cp_message_qr;
1217
1218                         # But it may need to be modified, because either of
1219                         # these malformations means we can't determine the
1220                         # expected code point.
1221                         if ($short || $unexpected_noncont) {
1222                             my $first_byte = substr($this_bytes, 0, 1);
1223                             $expected_warnings[0] = display_bytes(
1224                                     substr($this_bytes, 0, $this_expected_len));
1225                             $expected_warnings[0]
1226                                 = qr/[Aa]\Qny UTF-8 sequence that starts with\E
1227                                      \Q $expected_warnings[0]\E
1228                                      \Q $non_cp_trailing_text\E/x;
1229                         }
1230                     }
1231
1232                     # Is effectively disallowed if we've set up a malformation
1233                     # (unless malformations are allowed), even if the flag
1234                     # indicates it is allowed.  Fix up test name to indicate
1235                     # this as well
1236                     my $disallowed = 0;
1237                     if (   $this_disallow_flags & $this_utf8n_flag_to_disallow
1238                         && $this_expected_len >= $this_needed_to_discern_len)
1239                     {
1240                         $disallowed = 1;
1241                     }
1242                     if ($malformations_name) {
1243                         if ($malformed_allow_type == 0) {
1244                             $disallowed = 1;
1245                         }
1246                         elsif ($malformed_allow_type == 1) {
1247
1248                             # Even if allowed, the malformation returns the
1249                             # REPLACEMENT CHARACTER.
1250                             $expected_uv = 0xFFFD;
1251                             $this_uv_string = "0xFFFD"
1252                         }
1253                     }
1254
1255                     my $this_name = "utf8n_to_uvchr_error() $testname: "
1256                                                 . (($disallowed)
1257                                                    ? 'disallowed'
1258                                                    : 'allowed');
1259                     $this_name .= ", $eval_warn";
1260                     $this_name .= ", " . ((  $this_warning_flags
1261                                             & $this_utf8n_flag_to_warn)
1262                                           ? 'with flag for raising warnings'
1263                                           : 'no flag for raising warnings');
1264                     $this_name .= $malformations_name;
1265
1266                     # Do the actual test using an eval
1267                     undef @warnings_gotten;
1268                     my $ret_ref;
1269                     my $this_flags
1270                         = $allow_flags|$this_warning_flags|$this_disallow_flags;
1271                     my $eval_text =      "$eval_warn; \$ret_ref"
1272                             . " = test_utf8n_to_uvchr_error("
1273                             . "'$this_bytes', $this_length, $this_flags)";
1274                     eval "$eval_text";
1275                     if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
1276                     {
1277                         diag "\$@='$@'; call was: "
1278                            . utf8n_display_call($eval_text);
1279                         next;
1280                     }
1281                     if ($disallowed) {
1282                         is($ret_ref->[0], 0, "    And returns 0")
1283                           or diag "Call was: " . utf8n_display_call($eval_text);
1284                     }
1285                     else {
1286                         is($ret_ref->[0], $expected_uv,
1287                                 "    And returns expected uv: "
1288                               . $this_uv_string)
1289                           or diag "Call was: " . utf8n_display_call($eval_text);
1290                     }
1291                     is($ret_ref->[1], $this_expected_len,
1292                                         "    And returns expected length:"
1293                                       . " $this_expected_len")
1294                       or diag "Call was: " . utf8n_display_call($eval_text);
1295
1296                     my $returned_flags = $ret_ref->[2];
1297
1298                     for (my $i = @expected_return_flags - 1; $i >= 0; $i--) {
1299                         if ($expected_return_flags[$i] & $returned_flags) {
1300                             if ($expected_return_flags[$i]
1301                                             == $::UTF8_DISALLOW_ABOVE_31_BIT)
1302                             {
1303                                 pass("    Expected and got return flag for"
1304                                    . " above_31_bit");
1305                             }
1306                                    # The first entries in this are
1307                                    # malformations
1308                             elsif ($i > @malformation_names - 1)  {
1309                                 pass("    Expected and got return flag"
1310                                    . " for " . $controlling_warning_category);
1311                             }
1312                             else {
1313                                 pass("    Expected and got return flag for "
1314                                    . $malformation_names[$i]
1315                                    . " malformation");
1316                             }
1317                             $returned_flags &= ~$expected_return_flags[$i];
1318                             splice @expected_return_flags, $i, 1;
1319                         }
1320                     }
1321
1322                     is($returned_flags, 0,
1323                        "    Got no unexpected return flags")
1324                       or diag "The unexpected flags gotten were: "
1325                            . (flags_to_text($returned_flags,
1326                                             \@utf8n_flags_to_text)
1327                                 # We strip off any prefixes from the flag
1328                                 # names
1329                              =~ s/ \b [A-Z] _ //xgr);
1330                     is (scalar @expected_return_flags, 0,
1331                         "    Got all expected return flags")
1332                         or diag "The expected flags not gotten were: "
1333                            . (flags_to_text(eval join("|",
1334                                                         @expected_return_flags),
1335                                             \@utf8n_flags_to_text)
1336                                 # We strip off any prefixes from the flag
1337                                 # names
1338                              =~ s/ \b [A-Z] _ //xgr);
1339
1340                     do_warnings_test(@expected_warnings)
1341                       or diag "Call was: " . utf8n_display_call($eval_text);
1342                     undef @warnings_gotten;
1343
1344                     # Check CHECK_ONLY results when the input is
1345                     # disallowed.  Do this when actually disallowed,
1346                     # not just when the $this_disallow_flags is set
1347                     if ($disallowed) {
1348                         my $this_flags = $this_disallow_flags|$::UTF8_CHECK_ONLY;
1349                         my $eval_text = "use warnings; no warnings 'deprecated'; \$ret_ref ="
1350                                       . " test_utf8n_to_uvchr_error('"
1351                                       . "$this_bytes', $this_length,"
1352                                       . " $this_flags)";
1353                         eval $eval_text;
1354                         if (! ok ("$@ eq ''",
1355                             "    And eval succeeded with CHECK_ONLY"))
1356                         {
1357                             diag "\$@='$@'; Call was: "
1358                                . utf8n_display_call($eval_text);
1359                             next;
1360                         }
1361                         is($ret_ref->[0], 0, "    CHECK_ONLY: Returns 0")
1362                           or diag "Call was: " . utf8n_display_call($eval_text);
1363                         is($ret_ref->[1], -1,
1364                                        "    CHECK_ONLY: returns -1 for length")
1365                           or diag "Call was: " . utf8n_display_call($eval_text);
1366                         if (! is(scalar @warnings_gotten, 0,
1367                                       "    CHECK_ONLY: no warnings generated"))
1368                         {
1369                             diag "Call was: " . utf8n_display_call($eval_text);
1370                             output_warnings(@warnings_gotten);
1371                         }
1372                     }
1373
1374                     # Now repeat some of the above, but for
1375                     # uvchr_to_utf8_flags().  Since this comes from an
1376                     # existing code point, it hasn't overflowed, and isn't
1377                     # malformed.
1378                     next if @malformation_names;
1379
1380                     $this_warning_flags = ($use_warn_flag)
1381                                           ? $this_uvchr_flag_to_warn
1382                                           : 0;
1383                     $this_disallow_flags = ($do_disallow)
1384                                            ? $this_uvchr_flag_to_disallow
1385                                            : 0;
1386
1387                     $disallowed = $this_disallow_flags
1388                                 & $this_uvchr_flag_to_disallow;
1389                     $this_name .= ", " . ((  $this_warning_flags
1390                                            & $this_utf8n_flag_to_warn)
1391                                           ? 'with flag for raising warnings'
1392                                           : 'no flag for raising warnings');
1393
1394                     $this_name = "uvchr_to_utf8_flags() $testname: "
1395                                             . (($disallowed)
1396                                                 ? 'disallowed'
1397                                                 : 'allowed');
1398                     $this_name .= ", $eval_warn";
1399                     $this_name .= ", " . ((  $this_warning_flags
1400                                            & $this_uvchr_flag_to_warn)
1401                                         ? 'with warning flag'
1402                                         : 'no warning flag');
1403
1404                     undef @warnings_gotten;
1405                     my $ret;
1406                     $this_flags = $this_warning_flags|$this_disallow_flags;
1407                     $eval_text = "$eval_warn; \$ret ="
1408                             . " test_uvchr_to_utf8_flags("
1409                             . "$allowed_uv, $this_flags)";
1410                     eval "$eval_text";
1411                     if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
1412                     {
1413                         diag "\$@='$@'; call was: "
1414                            . uvchr_display_call($eval_text);
1415                         next;
1416                     }
1417                     if ($disallowed) {
1418                         is($ret, undef, "    And returns undef")
1419                           or diag "Call was: " . uvchr_display_call($eval_text);
1420                     }
1421                     else {
1422                         is($ret, $this_bytes, "    And returns expected string")
1423                           or diag "Call was: " . uvchr_display_call($eval_text);
1424                     }
1425
1426                     do_warnings_test(@expected_warnings)
1427                       or diag "Call was: " . uvchr_display_call($eval_text);
1428                 }
1429               }
1430             }
1431           }
1432         }
1433       }
1434     }
1435 }
1436
1437 done_testing;