This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
utf8n_to_uvchr() Properly test for extended UTF-8
[perl5.git] / ext / XS-APItest / t / utf8_warn_base.pl
1 #!perl -w
2
3 # This is a base file to be used by various .t's in its directory
4 # It tests various code points that are "problematic", and verifies that the
5 # correct warnings/flags etc are generated when using them.  It also takes the
6 # UTF-8 for some of them and perturbs it to be malformed in various ways, and
7 # tests that this gets appropriately detected.
8
9 use strict;
10 use Test::More;
11
12 BEGIN {
13     use_ok('XS::APItest');
14     require 'charset_tools.pl';
15     require './t/utf8_setup.pl';
16 };
17
18 $|=1;
19
20 no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
21                           # machines, and that is tested elsewhere
22 use XS::APItest;
23
24 my @warnings_gotten;
25
26 use warnings 'utf8';
27 local $SIG{__WARN__} = sub { my @copy = @_;
28                              push @warnings_gotten, map { chomp; $_ } @copy;
29                            };
30
31 my $highest_non_extended_utf8_cp = (isASCII) ? 0x7FFFFFFF : 0x3FFFFFFF;
32 my $native_lowest_continuation_chr = I8_to_native(chr $::lowest_continuation);
33
34 sub requires_extended_utf8($) {
35
36     # Returns a boolean as to whether or not the code point parameter fits
37     # into 31 bits, subject to the convention that a negative code point
38     # stands for one that overflows the word size, so won't fit in 31 bits.
39
40     return shift > $highest_non_extended_utf8_cp;
41 }
42
43 my @tests;
44 {
45     no warnings qw(portable overflow);
46     @tests = (
47         # $testname,
48         # $bytes,                  UTF-8 string
49         # $allowed_uv,             code point $bytes evaluates to; -1 if
50         #                          overflows
51         # $needed_to_discern_len   optional, how long an initial substring do
52         #                          we need to tell that the string must be for
53         #                          a code point in the category it falls in,
54         #                          like being a surrogate; 0 indicates we need
55         #                          the whole string.  Some categories have a
56         #                          default that is used if this is omitted.
57         [ "lowest surrogate",
58             (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
59             0xD800,
60         ],
61         [ "a middle surrogate",
62             (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
63             0xD90D,
64         ],
65         [ "highest surrogate",
66             (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
67             0xDFFF,
68         ],
69         [ "first of 32 consecutive non-character code points",
70             (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
71             0xFDD0,
72         ],
73         [ "a mid non-character code point of the 32 consecutive ones",
74             (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
75             0xFDE0,
76         ],
77         [ "final of 32 consecutive non-character code points",
78             (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
79             0xFDEF,
80         ],
81         [ "non-character code point U+FFFE",
82             (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
83             0xFFFE,
84         ],
85         [ "non-character code point U+FFFF",
86             (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
87             0xFFFF,
88         ],
89         [ "non-character code point U+1FFFE",
90             (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
91             0x1FFFE,
92         ],
93         [ "non-character code point U+1FFFF",
94             (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
95             0x1FFFF,
96         ],
97         [ "non-character code point U+2FFFE",
98             (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
99             0x2FFFE,
100         ],
101         [ "non-character code point U+2FFFF",
102             (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
103             0x2FFFF,
104         ],
105         [ "non-character code point U+3FFFE",
106             (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
107             0x3FFFE,
108         ],
109         [ "non-character code point U+3FFFF",
110             (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
111             0x3FFFF,
112         ],
113         [ "non-character code point U+4FFFE",
114             (isASCII)
115             ?               "\xf1\x8f\xbf\xbe"
116             : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
117             0x4FFFE,
118         ],
119         [ "non-character code point U+4FFFF",
120             (isASCII)
121             ?               "\xf1\x8f\xbf\xbf"
122             : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
123             0x4FFFF,
124         ],
125         [ "non-character code point U+5FFFE",
126             (isASCII)
127             ?              "\xf1\x9f\xbf\xbe"
128             : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
129             0x5FFFE,
130         ],
131         [ "non-character code point U+5FFFF",
132             (isASCII)
133             ?              "\xf1\x9f\xbf\xbf"
134             : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
135             0x5FFFF,
136         ],
137         [ "non-character code point U+6FFFE",
138             (isASCII)
139             ?              "\xf1\xaf\xbf\xbe"
140             : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
141             0x6FFFE,
142         ],
143         [ "non-character code point U+6FFFF",
144             (isASCII)
145             ?              "\xf1\xaf\xbf\xbf"
146             : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
147             0x6FFFF,
148         ],
149         [ "non-character code point U+7FFFE",
150             (isASCII)
151             ?              "\xf1\xbf\xbf\xbe"
152             : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
153             0x7FFFE,
154         ],
155         [ "non-character code point U+7FFFF",
156             (isASCII)
157             ?              "\xf1\xbf\xbf\xbf"
158             : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
159             0x7FFFF,
160         ],
161         [ "non-character code point U+8FFFE",
162             (isASCII)
163             ?              "\xf2\x8f\xbf\xbe"
164             : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
165             0x8FFFE,
166         ],
167         [ "non-character code point U+8FFFF",
168             (isASCII)
169             ?              "\xf2\x8f\xbf\xbf"
170             : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
171             0x8FFFF,
172         ],
173         [ "non-character code point U+9FFFE",
174             (isASCII)
175             ?              "\xf2\x9f\xbf\xbe"
176             : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
177             0x9FFFE,
178         ],
179         [ "non-character code point U+9FFFF",
180             (isASCII)
181             ?              "\xf2\x9f\xbf\xbf"
182             : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
183             0x9FFFF,
184         ],
185         [ "non-character code point U+AFFFE",
186             (isASCII)
187             ?              "\xf2\xaf\xbf\xbe"
188             : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
189             0xAFFFE,
190         ],
191         [ "non-character code point U+AFFFF",
192             (isASCII)
193             ?              "\xf2\xaf\xbf\xbf"
194             : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
195             0xAFFFF,
196         ],
197         [ "non-character code point U+BFFFE",
198             (isASCII)
199             ?              "\xf2\xbf\xbf\xbe"
200             : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
201             0xBFFFE,
202         ],
203         [ "non-character code point U+BFFFF",
204             (isASCII)
205             ?              "\xf2\xbf\xbf\xbf"
206             : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
207             0xBFFFF,
208         ],
209         [ "non-character code point U+CFFFE",
210             (isASCII)
211             ?              "\xf3\x8f\xbf\xbe"
212             : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
213             0xCFFFE,
214         ],
215         [ "non-character code point U+CFFFF",
216             (isASCII)
217             ?              "\xf3\x8f\xbf\xbf"
218             : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
219             0xCFFFF,
220         ],
221         [ "non-character code point U+DFFFE",
222             (isASCII)
223             ?              "\xf3\x9f\xbf\xbe"
224             : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
225             0xDFFFE,
226         ],
227         [ "non-character code point U+DFFFF",
228             (isASCII)
229             ?              "\xf3\x9f\xbf\xbf"
230             : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
231             0xDFFFF,
232         ],
233         [ "non-character code point U+EFFFE",
234             (isASCII)
235             ?              "\xf3\xaf\xbf\xbe"
236             : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
237             0xEFFFE,
238         ],
239         [ "non-character code point U+EFFFF",
240             (isASCII)
241             ?              "\xf3\xaf\xbf\xbf"
242             : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
243             0xEFFFF,
244         ],
245         [ "non-character code point U+FFFFE",
246             (isASCII)
247             ?              "\xf3\xbf\xbf\xbe"
248             : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
249             0xFFFFE,
250         ],
251         [ "non-character code point U+FFFFF",
252             (isASCII)
253             ?              "\xf3\xbf\xbf\xbf"
254             : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
255             0xFFFFF,
256         ],
257         [ "non-character code point U+10FFFE",
258             (isASCII)
259             ?              "\xf4\x8f\xbf\xbe"
260             : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
261             0x10FFFE,
262         ],
263         [ "non-character code point U+10FFFF",
264             (isASCII)
265             ?              "\xf4\x8f\xbf\xbf"
266             : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
267             0x10FFFF,
268         ],
269         [ "first non_unicode",
270             (isASCII)
271             ?              "\xf4\x90\x80\x80"
272             : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
273             0x110000,
274             2,
275         ],
276         [ "non_unicode whose first byte tells that",
277             (isASCII)
278             ?              "\xf5\x80\x80\x80"
279             : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
280             (isASCII) ? 0x140000 : 0x200000,
281             1,
282         ],
283         [ "lowest 32 bit code point",
284             (isASCII)
285             ?  "\xfe\x82\x80\x80\x80\x80\x80"
286             : I8_to_native(
287                 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
288             0x80000000,
289         ],
290         [ "highest 32 bit code point",
291             (isASCII)
292             ?  "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
293             : I8_to_native(
294                "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
295             0xFFFFFFFF,
296         ],
297         [ "Lowest 33 bit code point",
298             (isASCII)
299             ?  "\xfe\x84\x80\x80\x80\x80\x80"
300             : I8_to_native(
301                 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
302             ($::is64bit) ? 0x100000000 : -1,   # Overflows on 32-bit systems
303             (isASCII && ! $::is64bit) ? 2 : 1,
304         ],
305     );
306
307     if (! $::is64bit) {
308         if (isASCII) {
309             push @tests,
310                 [ "overflow that old algorithm failed to detect",
311                     "\xfe\x86\x80\x80\x80\x80\x80",
312                     -1,
313                     2,
314                 ];
315         }
316     }
317
318     if ($::is64bit) {
319         push @tests,
320             [ "highest 64 bit code point",
321               (isASCII)
322               ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
323               : I8_to_native(
324                 "\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"),
325               0xFFFFFFFFFFFFFFFF,
326               (isASCII) ? 1 : 2,
327             ],
328             [ "first 65 bit code point",
329               (isASCII)
330               ? "\xff\x80\x9f\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
331               : I8_to_native(
332                 "\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
333               -1,
334               (isASCII) ? 3 : 2,
335             ];
336         if (isASCII) {
337             push @tests,
338                 [ "Lowest code point requiring 13 bytes to represent",
339                     "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
340                     0x1000000000,
341                 ],
342                 [ "overflow that old algorithm failed to detect",
343                     "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
344                     -1,
345                     3,
346                 ];
347         }
348         else {
349             push @tests,    # These could falsely show wrongly in a naive
350                             # implementation
351                 [ "requires at least 32 bits",
352                     I8_to_native(
353                     "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
354                     0x800000000,
355                 ],
356                 [ "requires at least 32 bits",
357                     I8_to_native(
358                     "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
359                     0x10000000000,
360                 ],
361                 [ "requires at least 32 bits",
362                     I8_to_native(
363                     "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
364                     0x200000000000,
365                 ],
366                 [ "requires at least 32 bits",
367                     I8_to_native(
368                     "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
369                     0x4000000000000,
370                 ],
371                 [ "requires at least 32 bits",
372                     I8_to_native(
373                     "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
374                     0x80000000000000,
375                 ],
376                 [ "requires at least 32 bits",
377                     I8_to_native(
378                     "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
379                     0x1000000000000000,
380                 ];
381         }
382     }
383 }
384
385 sub flags_to_text($$)
386 {
387     my ($flags, $flags_to_text_ref) = @_;
388
389     # Returns a string containing a mnemonic representation of the bits that
390     # are set in the $flags.  These are assumed to be flag bits.  The return
391     # looks like "FOO|BAR|BAZ".  The second parameter is a reference to an
392     # array that gives the textual representation of all the possible flags.
393     # Element 0 is the text for the bit 0 flag; element 1 for bit 1; ....  If
394     # no bits at all are set the string "0" is returned;
395
396     my @flag_text;
397     my $shift = 0;
398
399     return "0" if $flags == 0;
400
401     while ($flags) {
402         #diag sprintf "%x", $flags;
403         if ($flags & 1) {
404             push @flag_text, $flags_to_text_ref->[$shift];
405         }
406         $shift++;
407         $flags >>= 1;
408     }
409
410     return join "|", @flag_text;
411 }
412
413 # Possible flag returns from utf8n_to_uvchr_error().  These should have G_,
414 # instead of A_, D_, but the prefixes will be used in a a later commit, so
415 # minimize churn by having them here.
416 my @utf8n_flags_to_text =  ( qw(
417         A_EMPTY
418         A_CONTINUATION
419         A_NON_CONTINUATION
420         A_SHORT
421         A_LONG
422         A_LONG_AND_ITS_VALUE
423         PLACEHOLDER
424         A_OVERFLOW
425         D_SURROGATE
426         W_SURROGATE
427         D_NONCHAR
428         W_NONCHAR
429         D_SUPER
430         W_SUPER
431         D_PERL_EXTENDED
432         W_PERL_EXTENDED
433         CHECK_ONLY
434         NO_CONFIDENCE_IN_CURLEN_
435     ) );
436
437 sub utf8n_display_call($)
438 {
439     # Converts an eval string that calls test_utf8n_to_uvchr into a more human
440     # readable form, and returns it.  Doesn't work if the byte string contains
441     # an apostrophe.  The return will look something like:
442     #   test_utf8n_to_uvchr_error('$bytes', $length, $flags)
443     #diag $_[0];
444
445     $_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x;
446     my $text1 = $1;     # Everything before the byte string
447     my $bytes = $2;
448     my $text2 = $3;     # Includes the length
449     my $flags = $4;
450
451     return $text1
452          . display_bytes($bytes)
453          . $text2
454          . flags_to_text($flags, \@utf8n_flags_to_text)
455          . ')';
456 }
457
458 sub uvchr_display_call($)
459 {
460     # Converts an eval string that calls test_uvchr_to_utf8 into a more human
461     # readable form, and returns it.  The return will look something like:
462     #   test_uvchr_to_utf8n_flags($uv, $flags)
463     #diag $_[0];
464
465     my @flags_to_text =  ( qw(
466             W_SURROGATE
467             W_NONCHAR
468             W_SUPER
469             W_PERL_EXTENDED
470             D_SURROGATE
471             D_NONCHAR
472             D_SUPER
473             D_PERL_EXTENDED
474        ) );
475
476     $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x;
477     my $text = $1;
478     my $cp = sprintf "%X", $2;
479     my $flags = $3;
480
481     return "${text}0x$cp, " . flags_to_text($flags, \@flags_to_text) . ')';
482 }
483
484 sub do_warnings_test(@)
485 {
486     my @expected_warnings = @_;
487
488     # Compares the input expected warnings array with @warnings_gotten,
489     # generating a pass for each found, removing it from @warnings_gotten.
490     # Any discrepancies generate test failures.  Returns TRUE if no
491     # discrepcancies; otherwise FALSE.
492
493     my $succeeded = 1;
494
495     if (@expected_warnings == 0) {
496         if (! is(@warnings_gotten, 0, "    Expected and got no warnings")) {
497             output_warnings(@warnings_gotten);
498             $succeeded = 0;
499         }
500         return $succeeded;
501     }
502
503     # Check that we got all the expected warnings,
504     # removing each one found
505   WARNING:
506     foreach my $expected (@expected_warnings) {
507         foreach (my $i = 0; $i < @warnings_gotten; $i++) {
508             if ($warnings_gotten[$i] =~ $expected) {
509                 pass("    Expected and got warning: "
510                     . " $warnings_gotten[$i]");
511                 splice @warnings_gotten, $i, 1;
512                 next WARNING;
513             }
514         }
515         fail("    Expected a warning that matches "
516             . $expected . " but didn't get it");
517         $succeeded = 0;
518     }
519
520     if (! is(@warnings_gotten, 0, "    Got no unexpected warnings")) {
521         output_warnings(@warnings_gotten);
522         $succeeded = 0;
523     }
524
525     return $succeeded;
526 }
527
528 # This test is split into this number of files.
529 my $num_test_files = $ENV{TEST_JOBS} || 1;
530 $num_test_files = 10 if $num_test_files > 10;
531
532 my $test_count = -1;
533 foreach my $test (@tests) {
534     $test_count++;
535     next if $test_count % $num_test_files != $::TEST_CHUNK;
536
537     my ($testname, $bytes, $allowed_uv, $needed_to_discern_len) = @$test;
538
539     my $length = length $bytes;
540     my $will_overflow = $allowed_uv < 0;
541
542     my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv);
543     my $display_bytes = display_bytes($bytes);
544
545     my $controlling_warning_category;
546     my $utf8n_flag_to_warn;
547     my $utf8n_flag_to_disallow;
548     my $uvchr_flag_to_warn;
549     my $uvchr_flag_to_disallow;
550
551     # We want to test that the independent flags are actually independent.
552     # For example, that a surrogate doesn't trigger a non-character warning,
553     # and conversely, turning off an above-Unicode flag doesn't suppress a
554     # surrogate warning.  Earlier versions of this file used nested loops to
555     # test all possible combinations.  But that creates lots of tests, making
556     # this run too long.  What is now done instead is to use the complement of
557     # the category we are testing to greatly reduce the combinatorial
558     # explosion.  For example, if we have a surrogate and we aren't expecting
559     # a warning about it, we set all the flags for non-surrogates to raise
560     # warnings.  If one shows up, it indicates the flags aren't independent.
561     my $utf8n_flag_to_warn_complement;
562     my $utf8n_flag_to_disallow_complement;
563     my $uvchr_flag_to_warn_complement;
564     my $uvchr_flag_to_disallow_complement;
565
566     # Many of the code points being tested are middling in that if code point
567     # edge cases work, these are very likely to as well.  Because this test
568     # file takes a while to execute, we skip testing the edge effects of code
569     # points deemed middling, while testing their basics and continuing to
570     # fully test the non-middling code points.
571     my $skip_most_tests = 0;
572
573     my $cp_message_qr;      # Pattern that matches the message raised when
574                             # that message contains the problematic code
575                             # point.  The message is the same (currently) both
576                             # when going from/to utf8.
577     my $non_cp_trailing_text;   # The suffix text when the message doesn't
578                                 # contain a code point.  (This is a result of
579                                 # some sort of malformation that means we
580                                 # can't get an exact code poin
581     my $extended_cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
582                         \Q requires a Perl extension, and so is not\E
583                         \Q portable\E/x;
584     my $extended_non_cp_trailing_text
585                         = "is a Perl extension, and so is not portable";
586
587     # Is this test malformed from the beginning?  If so, we know to generally
588     # expect that the tests will show it isn't valid.
589     my $initially_malformed = 0;
590
591     if ($will_overflow || $allowed_uv > 0x10FFFF) {
592
593         # Set the SUPER flags; later, we test for PERL_EXTENDED as well.
594         $utf8n_flag_to_warn     = $::UTF8_WARN_SUPER;
595         $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER;
596         $uvchr_flag_to_warn     = $::UNICODE_WARN_SUPER;
597         $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;;
598
599         # Below, we add the flags for non-perl_extended to the code points
600         # that don't fit that category.  Special tests are done for this
601         # category in the inner loop.
602         $utf8n_flag_to_warn_complement     = $::UTF8_WARN_NONCHAR
603                                             |$::UTF8_WARN_SURROGATE;
604         $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
605                                             |$::UTF8_DISALLOW_SURROGATE;
606         $uvchr_flag_to_warn_complement     = $::UNICODE_WARN_NONCHAR
607                                             |$::UNICODE_WARN_SURROGATE;
608         $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
609                                             |$::UNICODE_DISALLOW_SURROGATE;
610         $controlling_warning_category = 'non_unicode';
611
612         if ($will_overflow) {  # This is realy a malformation
613             $non_cp_trailing_text = "if you see this, there is an error";
614             $cp_message_qr = qr/\Q$non_cp_trailing_text\E/;
615             $initially_malformed = 1;
616         }
617         elsif (requires_extended_utf8($allowed_uv)) {
618             $cp_message_qr = $extended_cp_message_qr;
619             $non_cp_trailing_text = $extended_non_cp_trailing_text;
620             $needed_to_discern_len = 1 unless defined $needed_to_discern_len;
621         }
622         else {
623             $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
624                                 \Q may not be portable\E/x;
625             $non_cp_trailing_text = "is for a non-Unicode code point, may not"
626                                 . " be portable";
627             $utf8n_flag_to_warn_complement     |= $::UTF8_WARN_PERL_EXTENDED;
628             $utf8n_flag_to_disallow_complement
629                                            |= $::UTF8_DISALLOW_PERL_EXTENDED;
630             $uvchr_flag_to_warn_complement |= $::UNICODE_WARN_PERL_EXTENDED;
631             $uvchr_flag_to_disallow_complement
632                                         |= $::UNICODE_DISALLOW_PERL_EXTENDED;
633         }
634     }
635     elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
636         $cp_message_qr = qr/UTF-16 surrogate U\+$uv_string/;
637         $non_cp_trailing_text = "is for a surrogate";
638         $needed_to_discern_len = 2 unless defined $needed_to_discern_len;
639         $skip_most_tests = 1 if $allowed_uv > 0xD800 && $allowed_uv < 0xDFFF;
640
641         $utf8n_flag_to_warn     = $::UTF8_WARN_SURROGATE;
642         $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SURROGATE;
643         $uvchr_flag_to_warn     = $::UNICODE_WARN_SURROGATE;
644         $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SURROGATE;;
645
646         $utf8n_flag_to_warn_complement     = $::UTF8_WARN_NONCHAR
647                                             |$::UTF8_WARN_SUPER
648                                             |$::UTF8_WARN_PERL_EXTENDED;
649         $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
650                                             |$::UTF8_DISALLOW_SUPER
651                                             |$::UTF8_DISALLOW_PERL_EXTENDED;
652         $uvchr_flag_to_warn_complement     = $::UNICODE_WARN_NONCHAR
653                                             |$::UNICODE_WARN_SUPER
654                                             |$::UNICODE_WARN_PERL_EXTENDED;
655         $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
656                                             |$::UNICODE_DISALLOW_SUPER
657                                             |$::UNICODE_DISALLOW_PERL_EXTENDED;
658         $controlling_warning_category = 'surrogate';
659     }
660     elsif (   ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
661            || ($allowed_uv & 0xFFFE) == 0xFFFE)
662     {
663         $cp_message_qr = qr/\QUnicode non-character U+$uv_string\E
664                             \Q is not recommended for open interchange\E/x;
665         $non_cp_trailing_text = "if you see this, there is an error";
666         $needed_to_discern_len = $length unless defined $needed_to_discern_len;
667         if (   ($allowed_uv > 0xFDD0 && $allowed_uv < 0xFDEF)
668             || ($allowed_uv > 0xFFFF && $allowed_uv < 0x10FFFE))
669         {
670             $skip_most_tests = 1;
671         }
672
673         $utf8n_flag_to_warn     = $::UTF8_WARN_NONCHAR;
674         $utf8n_flag_to_disallow = $::UTF8_DISALLOW_NONCHAR;
675         $uvchr_flag_to_warn     = $::UNICODE_WARN_NONCHAR;
676         $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_NONCHAR;;
677
678         $utf8n_flag_to_warn_complement     = $::UTF8_WARN_SURROGATE
679                                             |$::UTF8_WARN_SUPER
680                                             |$::UTF8_WARN_PERL_EXTENDED;
681         $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE
682                                             |$::UTF8_DISALLOW_SUPER
683                                             |$::UTF8_DISALLOW_PERL_EXTENDED;
684         $uvchr_flag_to_warn_complement     = $::UNICODE_WARN_SURROGATE
685                                             |$::UNICODE_WARN_SUPER
686                                             |$::UNICODE_WARN_PERL_EXTENDED;
687         $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_SURROGATE
688                                             |$::UNICODE_DISALLOW_SUPER
689                                             |$::UNICODE_DISALLOW_PERL_EXTENDED;
690
691         $controlling_warning_category = 'nonchar';
692     }
693     else {
694         die "Can't figure out what type of warning to test for $testname"
695     }
696
697     die 'Didn\'t set $needed_to_discern_len for ' . $testname
698                                         unless defined $needed_to_discern_len;
699
700     {   # First test the isFOO calls
701         use warnings; no warnings 'deprecated';   # Make sure these don't raise warnings
702         undef @warnings_gotten;
703
704         my $ret = test_isUTF8_CHAR($bytes, $length);
705         my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
706         if ($initially_malformed) {
707             is($ret, 0, "For $testname: isUTF8_CHAR() returns 0");
708             is($ret_flags, 0, "    And isUTF8_CHAR_flags() returns 0");
709         }
710         else {
711             is($ret, $length,
712                "For $testname: isUTF8_CHAR() returns expected length: $length");
713             is($ret_flags, $length, "    And isUTF8_CHAR_flags(...,0)"
714                                   . " returns expected length: $length");
715         }
716         is(scalar @warnings_gotten, 0,
717                 "    And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags generated"
718               . " any warnings")
719           or output_warnings(@warnings_gotten);
720
721         undef @warnings_gotten;
722         $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
723         if ($initially_malformed) {
724             is($ret, 0, "    And isSTRICT_UTF8_CHAR() returns 0");
725         }
726         else {
727             my $expected_ret = (   $testname =~ /surrogate|non-character/
728                                 || $allowed_uv > 0x10FFFF)
729                                ? 0
730                                : $length;
731             is($ret, $expected_ret, "    And isSTRICT_UTF8_CHAR() returns"
732                                   . " expected length: $expected_ret");
733             $ret = test_isUTF8_CHAR_flags($bytes, $length,
734                                           $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
735             is($ret, $expected_ret,
736                     "    And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')"
737                     . " acts like isSTRICT_UTF8_CHAR");
738         }
739         is(scalar @warnings_gotten, 0,
740                 "    And neither isSTRICT_UTF8_CHAR() nor isUTF8_CHAR_flags"
741               . " generated any warnings")
742           or output_warnings(@warnings_gotten);
743
744         undef @warnings_gotten;
745         $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
746         if ($initially_malformed) {
747             is($ret, 0, "    And isC9_STRICT_UTF8_CHAR() returns 0");
748         }
749         else {
750             my $expected_ret = (   $testname =~ /surrogate/
751                                 || $allowed_uv > 0x10FFFF)
752                                ? 0
753                                : $length;
754             is($ret, $expected_ret, "    And isC9_STRICT_UTF8_CHAR()"
755                                    ." returns expected length: $expected_ret");
756             $ret = test_isUTF8_CHAR_flags($bytes, $length,
757                                         $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
758             is($ret, $expected_ret,
759                   "    And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')"
760                 . " acts like isC9_STRICT_UTF8_CHAR");
761         }
762         is(scalar @warnings_gotten, 0,
763                 "    And neither isC9_STRICT_UTF8_CHAR() nor isUTF8_CHAR_flags"
764               . " generated any warnings")
765           or output_warnings(@warnings_gotten);
766
767         foreach my $disallow_type (0..2) {
768             # 0 is don't disallow this type of code point
769             # 1 is do disallow
770             # 2 is do disallow, but only code points requiring
771             #   perl-extended-UTF8
772
773             my $disallow_flags;
774             my $expected_ret;
775
776             if ($initially_malformed) {
777
778                 # Malformations are by default disallowed, so testing with
779                 # $disallow_type equal to 0 is sufficicient.
780                 next if $disallow_type;
781
782                 $disallow_flags = 0;
783                 $expected_ret = 0;
784             }
785             elsif ($disallow_type == 1) {
786                 $disallow_flags = $utf8n_flag_to_disallow;
787                 $expected_ret = 0;
788             }
789             elsif ($disallow_type == 2) {
790                 next if ! requires_extended_utf8($allowed_uv);
791                 $disallow_flags = $::UTF8_DISALLOW_PERL_EXTENDED;
792                 $expected_ret = 0;
793             }
794             else {  # type is 0
795                 $disallow_flags = $utf8n_flag_to_disallow_complement;
796                 $expected_ret = $length;
797             }
798
799             $ret = test_isUTF8_CHAR_flags($bytes, $length, $disallow_flags);
800             is($ret, $expected_ret, "    And isUTF8_CHAR_flags("
801                                   . "$display_bytes, $disallow_flags) returns "
802                                   . $expected_ret)
803              or diag "The flags mean "
804               . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
805
806             is(scalar @warnings_gotten, 0,
807                     "    And isUTF8_CHAR_flags(...) generated no warnings")
808             or output_warnings(@warnings_gotten);
809
810             # Test partial character handling, for each byte not a full character
811             my $did_test_partial = 0;
812             for (my $j = 1; $j < $length - 1; $j++) {
813                 $did_test_partial = 1;
814                 my $partial = substr($bytes, 0, $j);
815                 my $ret_should_be;
816                 my $comment;
817                 if ($disallow_type || $initially_malformed) {
818                     $ret_should_be = 0;
819                     $comment = "disallowed";
820                     if ($j < $needed_to_discern_len) {
821                         $ret_should_be = 1;
822                         $comment .= ", but need $needed_to_discern_len"
823                                  . " bytes to discern:";
824                     }
825                 }
826                 else {
827                     $ret_should_be = 1;
828                     $comment = "allowed";
829                 }
830
831                 undef @warnings_gotten;
832
833                 $ret = test_is_utf8_valid_partial_char_flags($partial, $j,
834                                                              $disallow_flags);
835                 is($ret, $ret_should_be,
836                     "    And is_utf8_valid_partial_char_flags("
837                     . display_bytes($partial)
838                     . ", $disallow_flags), $comment: returns $ret_should_be")
839                  or diag "The flags mean "
840                   . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
841             }
842
843             if ($did_test_partial) {
844                 is(scalar @warnings_gotten, 0,
845                         "    And is_utf8_valid_partial_char_flags()"
846                         . " generated no warnings for any of the lengths")
847                     or output_warnings(@warnings_gotten);
848             }
849         }
850     }
851
852     # Now test the to/from UTF-8 calls
853     # This is more complicated than the malformations tested in other files in
854     # this directory, as there are several orthogonal variables involved.  We
855     # test most possible combinations
856
857     # We try various combinations of malformations that can occur
858     foreach my $short (0, 1) {
859       next if $skip_most_tests && $short;
860       foreach my $unexpected_noncont (0, 1) {
861         next if $skip_most_tests && $unexpected_noncont;
862         foreach my $overlong (0, 1) {
863           next if $overlong && $skip_most_tests;
864
865           # If we're creating an overlong, it can't be longer than the
866           # maximum length, so skip if we're already at that length.
867           next if $overlong && $length >= $::max_bytes;
868
869           my $this_cp_message_qr = $cp_message_qr;
870           my $this_non_cp_trailing_text = $non_cp_trailing_text;
871
872           foreach my $malformed_allow_type (0..2) {
873             # 0 don't allow this malformation; ignored if no malformation
874             # 1 allow, with REPLACEMENT CHARACTER returned
875             # 2 allow, with intended code point returned.  All malformations
876             #   other than overlong can't determine the intended code point,
877             #   so this isn't valid for them.
878             next if     $malformed_allow_type == 2
879                     && ($will_overflow || $short || $unexpected_noncont);
880             next if $skip_most_tests && $malformed_allow_type;
881
882             # Here we are in the innermost loop for malformations.  So we
883             # know which ones are in effect.  Can now change the input to be
884             # appropriately malformed.  We also can set up certain other
885             # things now, like whether we expect a return flag from this
886             # malformation, and which flag.
887
888             my $this_bytes = $bytes;
889             my $this_length = $length;
890             my $this_expected_len = $length;
891             my $this_needed_to_discern_len = $needed_to_discern_len;
892
893             my @malformation_names;
894             my @expected_malformation_warnings;
895             my @expected_malformation_return_flags;
896
897             # Contains the flags for any allowed malformations.  Currently no
898             # combinations of on/off are tested for.  It's either all are
899             # allowed, or none are.
900             my $allow_flags = 0;
901             my $overlong_is_in_perl_extended_utf8 = 0;
902             my $dont_use_overlong_cp = 0;
903
904             if ($overlong) {
905                 my $new_expected_len;
906
907                 # To force this malformation, we convert the original start
908                 # byte into a continuation byte with the same data bits as
909                 # originally. ...
910                 my $start_byte = substr($this_bytes, 0, 1);
911                 my $converted_to_continuation_byte
912                                             = start_byte_to_cont($start_byte);
913
914                 # ... Then we prepend it with a known overlong sequence.  This
915                 # should evaluate to the exact same code point as the
916                 # original.  We try to avoid an overlong using Perl extended
917                 # UTF-8.  The code points are the highest representable as
918                 # overlongs on the respective platform without using extended
919                 # UTF-8.
920                 if (native_to_I8($start_byte) lt "\xFC") {
921                     $start_byte = I8_to_native("\xFC");
922                     $new_expected_len = 6;
923                 }
924                 elsif (! isASCII && native_to_I8($start_byte) lt "\xFE") {
925
926                     # FE is not extended UTF-8 on EBCDIC
927                     $start_byte = I8_to_native("\xFE");
928                     $new_expected_len = 7;
929                 }
930                 else {  # Must use extended UTF-8.  On ASCII platforms, we
931                         # could express some overlongs here starting with
932                         # \xFE, but there's no real reason to do so.
933                     $overlong_is_in_perl_extended_utf8 = 1;
934                     $start_byte = I8_to_native("\xFF");
935                     $new_expected_len = $::max_bytes;
936                     $this_cp_message_qr = $extended_cp_message_qr;
937
938                     # The warning that gets raised doesn't include the code
939                     # point in the message if the code point can be expressed
940                     # without using extended UTF-8, but the particular
941                     # overlong sequence used is in extended UTF-8.  To do
942                     # otherwise would be confusing to the user, as it would
943                     # claim the code point requires extended, when it doesn't.
944                     $dont_use_overlong_cp = 1
945                                     unless requires_extended_utf8($allowed_uv);
946                     $this_non_cp_trailing_text = $extended_non_cp_trailing_text;
947                 }
948
949                 # Splice in the revise continuation byte, preceded by the
950                 # start byte and the proper number of the lowest continuation
951                 # bytes.
952                 $this_bytes =   $start_byte
953                              . ($native_lowest_continuation_chr
954                                 x ( $new_expected_len - 1 - length($this_bytes)))
955                              .  $converted_to_continuation_byte
956                              .  substr($this_bytes, 1);
957                 $this_length = length($this_bytes);
958                 $this_needed_to_discern_len =    $new_expected_len
959                                             - (  $this_expected_len
960                                                - $this_needed_to_discern_len);
961                 $this_expected_len = $new_expected_len;
962                 push @expected_malformation_return_flags, $::UTF8_GOT_LONG;
963                 push @malformation_names, 'overlong';
964
965                 if ($malformed_allow_type == 2) {
966                     $allow_flags |= $::UTF8_ALLOW_LONG_AND_ITS_VALUE;
967                 }
968                 elsif ($malformed_allow_type) {
969                     $allow_flags |= $::UTF8_ALLOW_LONG;
970                 }
971             }
972
973             if ($short) {
974                 push @malformation_names, 'short';
975
976                 # To force this malformation, just tell the test to not look
977                 # as far as it should into the input.
978                 $this_length--;
979                 $this_expected_len--;
980                 push @expected_malformation_return_flags, $::UTF8_GOT_SHORT;
981
982                 $allow_flags |= $::UTF8_ALLOW_SHORT if $malformed_allow_type;
983             }
984
985             if ($unexpected_noncont) {
986                 push @malformation_names, 'unexpected non-continuation';
987
988                 # To force this malformation, change the final continuation
989                 # byte into a non continuation.
990                 my $pos = ($short) ? -2 : -1;
991                 substr($this_bytes, $pos, 1) = '?';
992                 $this_expected_len--;
993                 push @expected_malformation_return_flags,
994                                 $::UTF8_GOT_NON_CONTINUATION;
995                 $allow_flags |= $::UTF8_ALLOW_NON_CONTINUATION
996                                                     if $malformed_allow_type;
997             }
998
999             # Here, we've transformed the input with all of the desired
1000             # non-overflow malformations.  We are now in a position to
1001             # construct any potential warnings for those malformations.  But
1002             # it's a pain to get the detailed messages exactly right, so for
1003             # now XXX, only do so for those that return an explicit code
1004             # point.
1005
1006             if ($overlong) {
1007
1008                 # If one of the other malformation types is also in effect, we
1009                 # don't know what the intended code point was.
1010                 if ($short || $unexpected_noncont || $will_overflow) {
1011                     push @expected_malformation_warnings, qr/overlong/;
1012                 }
1013                 else {
1014                     my $wrong_bytes = display_bytes_no_quotes(
1015                                          substr($this_bytes, 0, $this_length));
1016                     my $correct_bytes = display_bytes_no_quotes($bytes);
1017                     my $prefix = ($allowed_uv > 0x10FFFF) ? "0x" : "U+";
1018                     push @expected_malformation_warnings,
1019                             qr/\QMalformed UTF-8 character: $wrong_bytes\E
1020                                \Q (overlong; instead use\E
1021                                \Q $correct_bytes to\E
1022                                \Q represent $prefix$uv_string)/x;
1023                 }
1024             }
1025             if ($short) {
1026                 push @expected_malformation_warnings, qr/too short/;
1027             }
1028             if ($unexpected_noncont) {
1029                 push @expected_malformation_warnings,
1030                                         qr/unexpected non-continuation byte/;
1031             }
1032
1033             # The overflow malformation is done differently than other
1034             # malformations.  It comes from manually typed tests in the test
1035             # array.  We now make it be treated like one of the other
1036             # malformations.  But some has to be deferred until the inner loop
1037             my $overflow_msg_pattern;
1038             if ($will_overflow) {
1039                 push @malformation_names, 'overflow';
1040
1041                 $overflow_msg_pattern = display_bytes_no_quotes(
1042                                     substr($this_bytes, 0, $this_expected_len));
1043                 $overflow_msg_pattern = qr/\QMalformed UTF-8 character:\E
1044                                            \Q $overflow_msg_pattern\E
1045                                            \Q (overflows)\E/x;
1046                 push @expected_malformation_return_flags, $::UTF8_GOT_OVERFLOW;
1047                 $allow_flags |= $::UTF8_ALLOW_OVERFLOW if $malformed_allow_type;
1048             }
1049
1050             # And we can create the malformation-related text for the the test
1051             # names we eventually will generate.
1052             my $malformations_name = "";
1053             if (@malformation_names) {
1054                 $malformations_name .= "dis" unless $malformed_allow_type;
1055                 $malformations_name .= "allowed ";
1056                 $malformations_name .= "malformation";
1057                 $malformations_name .= "s" if @malformation_names > 1;
1058                 $malformations_name .= ": ";
1059                 $malformations_name .=  join "/", @malformation_names;
1060                 $malformations_name =  " ($malformations_name)";
1061             }
1062
1063             # Done setting up the malformation related stuff
1064
1065             foreach my $do_disallow (0, 1) {
1066               next if $skip_most_tests && ! $do_disallow;
1067
1068               # We classify the warnings into certain "interesting" types,
1069               # described later
1070               foreach my $warning_type (0..4) {
1071                 next if $skip_most_tests && $warning_type != 1;
1072                 foreach my $use_warn_flag (0, 1) {
1073                     next if $skip_most_tests && ! $use_warn_flag;
1074
1075                     # Finally, here is the inner loop
1076
1077                     my $this_utf8n_flag_to_warn = $utf8n_flag_to_warn;
1078                     my $this_utf8n_flag_to_disallow = $utf8n_flag_to_disallow;
1079                     my $this_uvchr_flag_to_warn = $uvchr_flag_to_warn;
1080                     my $this_uvchr_flag_to_disallow = $uvchr_flag_to_disallow;
1081
1082                     my $eval_warn;
1083                     my $expect_regular_warnings;
1084                     my $expect_warnings_for_malformed;
1085                     my $expect_warnings_for_overflow;
1086
1087                     if ($warning_type == 0) {
1088                         $eval_warn = "use warnings; no warnings 'deprecated'";
1089                         $expect_regular_warnings = $use_warn_flag;
1090
1091                         # We ordinarily expect overflow warnings here.  But it
1092                         # is somewhat more complicated, and the final
1093                         # determination is deferred to one place in the filw
1094                         # where we handle overflow.
1095                         $expect_warnings_for_overflow = 1;
1096
1097                         # We would ordinarily expect malformed warnings in
1098                         # this case, but not if malformations are allowed.
1099                         $expect_warnings_for_malformed
1100                                                 = $malformed_allow_type == 0;
1101                     }
1102                     elsif ($warning_type == 1) {
1103                         $eval_warn = "no warnings";
1104                         $expect_regular_warnings = 0;
1105                         $expect_warnings_for_overflow = 0;
1106                         $expect_warnings_for_malformed = 0;
1107                     }
1108                     elsif ($warning_type == 2) {
1109                         $eval_warn = "no warnings; use warnings 'utf8'";
1110                         $expect_regular_warnings = $use_warn_flag;
1111                         $expect_warnings_for_overflow = 1;
1112                         $expect_warnings_for_malformed
1113                                                 = $malformed_allow_type == 0;
1114                     }
1115                     elsif ($warning_type == 3) {
1116                         $eval_warn = "no warnings; use warnings"
1117                                    . " '$controlling_warning_category'";
1118                         $expect_regular_warnings = $use_warn_flag;
1119                         $expect_warnings_for_overflow
1120                             = $controlling_warning_category eq 'non_unicode';
1121                         $expect_warnings_for_malformed = 0;
1122                     }
1123                     elsif ($warning_type == 4) {  # Like type 3, but uses the
1124                                                   # PERL_EXTENDED flags
1125                         # The complement flags were set up so that the
1126                         # PERL_EXTENDED flags have been tested that they don't
1127                         # trigger wrongly for too small code points.  And the
1128                         # flags have been set up so that those small code
1129                         # points are tested for being above Unicode.  What's
1130                         # left to test is that the large code points do
1131                         # trigger the PERL_EXTENDED flags.
1132                         next if ! requires_extended_utf8($allowed_uv);
1133                         next if $controlling_warning_category ne 'non_unicode';
1134                         $eval_warn = "no warnings; use warnings 'non_unicode'";
1135                         $expect_regular_warnings = 1;
1136                         $expect_warnings_for_overflow = 1;
1137                         $expect_warnings_for_malformed = 0;
1138                         $this_utf8n_flag_to_warn = $::UTF8_WARN_PERL_EXTENDED;
1139                         $this_utf8n_flag_to_disallow
1140                                              = $::UTF8_DISALLOW_PERL_EXTENDED;
1141                         $this_uvchr_flag_to_warn
1142                                               = $::UNICODE_WARN_PERL_EXTENDED;
1143                         $this_uvchr_flag_to_disallow
1144                                           = $::UNICODE_DISALLOW_PERL_EXTENDED;
1145                     }
1146                     else {
1147                        die "Unexpected warning type '$warning_type'";
1148                     }
1149
1150                     # We only need to test the case where all warnings are
1151                     # enabled (type 0) to see if turning off the warning flag
1152                     # causes things to not be output.  If those pass, then
1153                     # turning on some sub-category of warnings, or turning off
1154                     # warnings altogether are extremely likely to not output
1155                     # warnings either, given how the warnings subsystem is
1156                     # supposed to work, and this file assumes it does work.
1157                     next if $warning_type != 0 && ! $use_warn_flag;
1158
1159                     # The convention is that the 'got' flag is the same value
1160                     # as the disallow one.  If this were violated, the tests
1161                     # here should start failing.
1162                     my $return_flag = $this_utf8n_flag_to_disallow;
1163
1164                     # If we aren't expecting warnings/disallow for this, turn
1165                     # on all the other flags.  That makes sure that they all
1166                     # are independent of this flag, and so we don't need to
1167                     # test them individually.
1168                     my $this_warning_flags
1169                             = ($use_warn_flag)
1170                               ? $this_utf8n_flag_to_warn
1171                               : ($overlong_is_in_perl_extended_utf8
1172                                 ? ($utf8n_flag_to_warn_complement
1173                                     & ~$::UTF8_WARN_PERL_EXTENDED)
1174                                 :  $utf8n_flag_to_warn_complement);
1175                     my $this_disallow_flags
1176                             = ($do_disallow)
1177                               ? $this_utf8n_flag_to_disallow
1178                               : ($overlong_is_in_perl_extended_utf8
1179                                  ? ($utf8n_flag_to_disallow_complement
1180                                     & ~$::UTF8_DISALLOW_PERL_EXTENDED)
1181                                  :  $utf8n_flag_to_disallow_complement);
1182                     my $expected_uv = $allowed_uv;
1183                     my $this_uv_string = $uv_string;
1184
1185                     my @expected_return_flags
1186                                         = @expected_malformation_return_flags;
1187                     my @expected_warnings;
1188                     push @expected_warnings, @expected_malformation_warnings
1189                                             if $expect_warnings_for_malformed;
1190
1191                     # The overflow malformation is done differently than other
1192                     # malformations.  It comes from manually typed tests in
1193                     # the test array, but it also is above Unicode and uses
1194                     # Perl extended UTF-8, so affects some of the flags being
1195                     # tested.  We now make it be treated like one of the other
1196                     # generated malformations.
1197                     if ($will_overflow) {
1198
1199                         # An overflow is (way) above Unicode, and overrides
1200                         # everything else.
1201                         $expect_regular_warnings = 0;
1202
1203                         # Earlier, we tentatively calculated whether this
1204                         # should emit a message or not.  It's tentative
1205                         # because, even if we ordinarily would output it, we
1206                         # don't if malformations are allowed -- except an
1207                         # overflow is also a SUPER and PERL_EXTENDED, and if
1208                         # warnings for those are enabled, the overflow
1209                         # warning does get raised.
1210                         if (   $expect_warnings_for_overflow
1211                             && (    $malformed_allow_type == 0
1212                                 ||   (   $this_warning_flags
1213                                       & ($::UTF8_WARN_SUPER
1214                                         |$::UTF8_WARN_PERL_EXTENDED))))
1215                         {
1216                             push @expected_warnings, $overflow_msg_pattern;
1217                         }
1218                     }
1219
1220                     # It may be that the malformations have shortened the
1221                     # amount of input we look at so much that we can't tell
1222                     # what the category the code point was in.  Otherwise, set
1223                     # up the expected return flags based on the warnings and
1224                     # disallowments.
1225                     if ($this_expected_len < $this_needed_to_discern_len) {
1226                         $expect_regular_warnings = 0;
1227                     }
1228                     elsif (   ($this_warning_flags & $this_utf8n_flag_to_warn)
1229                            || (  $this_disallow_flags
1230                                & $this_utf8n_flag_to_disallow))
1231                     {
1232                         push @expected_return_flags, $return_flag;
1233                     }
1234
1235                     # Finish setting up the expected warning.
1236                     if ($expect_regular_warnings) {
1237
1238                         # So far the array contains warnings generated by
1239                         # malformations.  Add the expected regular one.
1240                         unshift @expected_warnings, $this_cp_message_qr;
1241
1242                         # But it may need to be modified, because either of
1243                         # these malformations means we can't determine the
1244                         # expected code point.
1245                         if (   $short || $unexpected_noncont
1246                             || $dont_use_overlong_cp)
1247                         {
1248                             my $first_byte = substr($this_bytes, 0, 1);
1249                             $expected_warnings[0] = display_bytes(
1250                                     substr($this_bytes, 0, $this_expected_len));
1251                             $expected_warnings[0]
1252                                 = qr/[Aa]\Qny UTF-8 sequence that starts with\E
1253                                      \Q $expected_warnings[0]\E
1254                                      \Q $this_non_cp_trailing_text\E/x;
1255                         }
1256                     }
1257
1258                     # Is effectively disallowed if we've set up a malformation
1259                     # (unless malformations are allowed), even if the flag
1260                     # indicates it is allowed.  Fix up test name to indicate
1261                     # this as well
1262                     my $disallowed = 0;
1263                     if (   $this_disallow_flags & $this_utf8n_flag_to_disallow
1264                         && $this_expected_len >= $this_needed_to_discern_len)
1265                     {
1266                         $disallowed = 1;
1267                     }
1268                     if ($malformations_name) {
1269                         if ($malformed_allow_type == 0) {
1270                             $disallowed = 1;
1271                         }
1272                         elsif ($malformed_allow_type == 1) {
1273
1274                             # Even if allowed, the malformation returns the
1275                             # REPLACEMENT CHARACTER.
1276                             $expected_uv = 0xFFFD;
1277                             $this_uv_string = "0xFFFD"
1278                         }
1279                     }
1280
1281                     my $this_name = "utf8n_to_uvchr_error() $testname: "
1282                                                 . (($disallowed)
1283                                                    ? 'disallowed'
1284                                                    : 'allowed');
1285                     $this_name .= ", $eval_warn";
1286                     $this_name .= ", " . ((  $this_warning_flags
1287                                             & $this_utf8n_flag_to_warn)
1288                                           ? 'with flag for raising warnings'
1289                                           : 'no flag for raising warnings');
1290                     $this_name .= $malformations_name;
1291
1292                     # Do the actual test using an eval
1293                     undef @warnings_gotten;
1294                     my $ret_ref;
1295                     my $this_flags
1296                         = $allow_flags|$this_warning_flags|$this_disallow_flags;
1297                     my $eval_text =      "$eval_warn; \$ret_ref"
1298                             . " = test_utf8n_to_uvchr_error("
1299                             . "'$this_bytes', $this_length, $this_flags)";
1300                     eval "$eval_text";
1301                     if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
1302                     {
1303                         diag "\$@='$@'; call was: "
1304                            . utf8n_display_call($eval_text);
1305                         next;
1306                     }
1307                     if ($disallowed) {
1308                         is($ret_ref->[0], 0, "    And returns 0")
1309                           or diag "Call was: " . utf8n_display_call($eval_text);
1310                     }
1311                     else {
1312                         is($ret_ref->[0], $expected_uv,
1313                                 "    And returns expected uv: "
1314                               . $this_uv_string)
1315                           or diag "Call was: " . utf8n_display_call($eval_text);
1316                     }
1317                     is($ret_ref->[1], $this_expected_len,
1318                                         "    And returns expected length:"
1319                                       . " $this_expected_len")
1320                       or diag "Call was: " . utf8n_display_call($eval_text);
1321
1322                     my $returned_flags = $ret_ref->[2];
1323
1324                     for (my $i = @expected_return_flags - 1; $i >= 0; $i--) {
1325                         if ($expected_return_flags[$i] & $returned_flags) {
1326                             if ($expected_return_flags[$i]
1327                                                 == $::UTF8_GOT_PERL_EXTENDED)
1328                             {
1329                                 pass("    Expected and got return flag for"
1330                                    . " PERL_EXTENDED");
1331                             }
1332                                    # The first entries in this are
1333                                    # malformations
1334                             elsif ($i > @malformation_names - 1)  {
1335                                 pass("    Expected and got return flag"
1336                                    . " for " . $controlling_warning_category);
1337                             }
1338                             else {
1339                                 pass("    Expected and got return flag for "
1340                                    . $malformation_names[$i]
1341                                    . " malformation");
1342                             }
1343                             $returned_flags &= ~$expected_return_flags[$i];
1344                             splice @expected_return_flags, $i, 1;
1345                         }
1346                     }
1347
1348                     is($returned_flags, 0,
1349                        "    Got no unexpected return flags")
1350                       or diag "The unexpected flags gotten were: "
1351                            . (flags_to_text($returned_flags,
1352                                             \@utf8n_flags_to_text)
1353                                 # We strip off any prefixes from the flag
1354                                 # names
1355                              =~ s/ \b [A-Z] _ //xgr);
1356                     is (scalar @expected_return_flags, 0,
1357                         "    Got all expected return flags")
1358                         or diag "The expected flags not gotten were: "
1359                            . (flags_to_text(eval join("|",
1360                                                         @expected_return_flags),
1361                                             \@utf8n_flags_to_text)
1362                                 # We strip off any prefixes from the flag
1363                                 # names
1364                              =~ s/ \b [A-Z] _ //xgr);
1365
1366                     do_warnings_test(@expected_warnings)
1367                       or diag "Call was: " . utf8n_display_call($eval_text);
1368                     undef @warnings_gotten;
1369
1370                     # Check CHECK_ONLY results when the input is
1371                     # disallowed.  Do this when actually disallowed,
1372                     # not just when the $this_disallow_flags is set
1373                     if ($disallowed) {
1374                         my $this_flags = $this_disallow_flags|$::UTF8_CHECK_ONLY;
1375                         my $eval_text = "use warnings; no warnings 'deprecated'; \$ret_ref ="
1376                                       . " test_utf8n_to_uvchr_error('"
1377                                       . "$this_bytes', $this_length,"
1378                                       . " $this_flags)";
1379                         eval $eval_text;
1380                         if (! ok ("$@ eq ''",
1381                             "    And eval succeeded with CHECK_ONLY"))
1382                         {
1383                             diag "\$@='$@'; Call was: "
1384                                . utf8n_display_call($eval_text);
1385                             next;
1386                         }
1387                         is($ret_ref->[0], 0, "    CHECK_ONLY: Returns 0")
1388                           or diag "Call was: " . utf8n_display_call($eval_text);
1389                         is($ret_ref->[1], -1,
1390                                        "    CHECK_ONLY: returns -1 for length")
1391                           or diag "Call was: " . utf8n_display_call($eval_text);
1392                         if (! is(scalar @warnings_gotten, 0,
1393                                       "    CHECK_ONLY: no warnings generated"))
1394                         {
1395                             diag "Call was: " . utf8n_display_call($eval_text);
1396                             output_warnings(@warnings_gotten);
1397                         }
1398                     }
1399
1400                     # Now repeat some of the above, but for
1401                     # uvchr_to_utf8_flags().  Since this comes from an
1402                     # existing code point, it hasn't overflowed, and isn't
1403                     # malformed.
1404                     next if @malformation_names;
1405
1406                     $this_warning_flags = ($use_warn_flag)
1407                                           ? $this_uvchr_flag_to_warn
1408                                           : 0;
1409                     $this_disallow_flags = ($do_disallow)
1410                                            ? $this_uvchr_flag_to_disallow
1411                                            : 0;
1412
1413                     $disallowed = $this_disallow_flags
1414                                 & $this_uvchr_flag_to_disallow;
1415                     $this_name .= ", " . ((  $this_warning_flags
1416                                            & $this_utf8n_flag_to_warn)
1417                                           ? 'with flag for raising warnings'
1418                                           : 'no flag for raising warnings');
1419
1420                     $this_name = "uvchr_to_utf8_flags() $testname: "
1421                                             . (($disallowed)
1422                                                 ? 'disallowed'
1423                                                 : 'allowed');
1424                     $this_name .= ", $eval_warn";
1425                     $this_name .= ", " . ((  $this_warning_flags
1426                                            & $this_uvchr_flag_to_warn)
1427                                         ? 'with warning flag'
1428                                         : 'no warning flag');
1429
1430                     undef @warnings_gotten;
1431                     my $ret;
1432                     $this_flags = $this_warning_flags|$this_disallow_flags;
1433                     $eval_text = "$eval_warn; \$ret ="
1434                             . " test_uvchr_to_utf8_flags("
1435                             . "$allowed_uv, $this_flags)";
1436                     eval "$eval_text";
1437                     if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
1438                     {
1439                         diag "\$@='$@'; call was: "
1440                            . uvchr_display_call($eval_text);
1441                         next;
1442                     }
1443                     if ($disallowed) {
1444                         is($ret, undef, "    And returns undef")
1445                           or diag "Call was: " . uvchr_display_call($eval_text);
1446                     }
1447                     else {
1448                         is($ret, $this_bytes, "    And returns expected string")
1449                           or diag "Call was: " . uvchr_display_call($eval_text);
1450                     }
1451
1452                     do_warnings_test(@expected_warnings)
1453                       or diag "Call was: " . uvchr_display_call($eval_text);
1454                 }
1455               }
1456             }
1457           }
1458         }
1459       }
1460     }
1461 }
1462
1463 done_testing;