APItest/t/utf8_warn_base.pl: Move some setup code
[perl.git] / ext / XS-APItest / t / utf8_warn_base.pl
1 #!perl -w
2
3 # This is a base file to be used by various .t's in its directory
4 # It tests various code points that are "problematic", and verifies that the
5 # correct warnings/flags etc are generated when using them.  It also takes the
6 # UTF-8 for some of them and perturbs it to be malformed in various ways, and
7 # tests that this gets appropriately detected.
8
9 use strict;
10 use Test::More;
11
12 BEGIN {
13     use_ok('XS::APItest');
14     require 'charset_tools.pl';
15     require './t/utf8_setup.pl';
16 };
17
18 $|=1;
19
20 no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
21                           # machines, and that is tested elsewhere
22 use XS::APItest;
23
24 my @warnings_gotten;
25
26 use warnings 'utf8';
27 local $SIG{__WARN__} = sub { my @copy = @_;
28                              push @warnings_gotten, map { chomp; $_ } @copy;
29                            };
30
31 my $highest_non_extended_utf8_cp = (isASCII) ? 0x7FFFFFFF : 0x3FFFFFFF;
32 my $native_lowest_continuation_chr = I8_to_native(chr $::lowest_continuation);
33
34 sub requires_extended_utf8($) {
35
36     # Returns a boolean as to whether or not the code point parameter fits
37     # into 31 bits, subject to the convention that a negative code point
38     # stands for one that overflows the word size, so won't fit in 31 bits.
39
40     return shift > $highest_non_extended_utf8_cp;
41 }
42
43 sub overflow_discern_len($) {
44
45     # Returns how many bytes are needed to tell if a UTF-8 sequence is for a
46     # code point that won't fit in the platform's word size.  Only the length
47     # of the sequence representing a single code point is needed.
48
49     if (isASCII) {
50         return ($::is64bit) ? 3 : ((shift == $::max_bytes)
51                                    ? 1
52                                    : 2);
53     }
54
55     return ($::is64bit) ? 2 : 8;
56 }
57
58 my @tests;
59 {
60     no warnings qw(portable overflow);
61     @tests = (
62         # $testname,
63         # $bytes,                  UTF-8 string
64         # $allowed_uv,             code point $bytes evaluates to; -1 if
65         #                          overflows
66         # $needed_to_discern_len   optional, how long an initial substring do
67         #                          we need to tell that the string must be for
68         #                          a code point in the category it falls in,
69         #                          like being a surrogate; 0 indicates we need
70         #                          the whole string.  Some categories have a
71         #                          default that is used if this is omitted.
72         [ "lowest surrogate",
73             (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
74             0xD800,
75         ],
76         [ "a middle surrogate",
77             (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
78             0xD90D,
79         ],
80         [ "highest surrogate",
81             (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
82             0xDFFF,
83         ],
84         [ "first of 32 consecutive non-character code points",
85             (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
86             0xFDD0,
87         ],
88         [ "a mid non-character code point of the 32 consecutive ones",
89             (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
90             0xFDE0,
91         ],
92         [ "final of 32 consecutive non-character code points",
93             (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
94             0xFDEF,
95         ],
96         [ "non-character code point U+FFFE",
97             (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
98             0xFFFE,
99         ],
100         [ "non-character code point U+FFFF",
101             (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
102             0xFFFF,
103         ],
104         [ "non-character code point U+1FFFE",
105             (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
106             0x1FFFE,
107         ],
108         [ "non-character code point U+1FFFF",
109             (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
110             0x1FFFF,
111         ],
112         [ "non-character code point U+2FFFE",
113             (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
114             0x2FFFE,
115         ],
116         [ "non-character code point U+2FFFF",
117             (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
118             0x2FFFF,
119         ],
120         [ "non-character code point U+3FFFE",
121             (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
122             0x3FFFE,
123         ],
124         [ "non-character code point U+3FFFF",
125             (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
126             0x3FFFF,
127         ],
128         [ "non-character code point U+4FFFE",
129             (isASCII)
130             ?               "\xf1\x8f\xbf\xbe"
131             : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
132             0x4FFFE,
133         ],
134         [ "non-character code point U+4FFFF",
135             (isASCII)
136             ?               "\xf1\x8f\xbf\xbf"
137             : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
138             0x4FFFF,
139         ],
140         [ "non-character code point U+5FFFE",
141             (isASCII)
142             ?              "\xf1\x9f\xbf\xbe"
143             : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
144             0x5FFFE,
145         ],
146         [ "non-character code point U+5FFFF",
147             (isASCII)
148             ?              "\xf1\x9f\xbf\xbf"
149             : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
150             0x5FFFF,
151         ],
152         [ "non-character code point U+6FFFE",
153             (isASCII)
154             ?              "\xf1\xaf\xbf\xbe"
155             : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
156             0x6FFFE,
157         ],
158         [ "non-character code point U+6FFFF",
159             (isASCII)
160             ?              "\xf1\xaf\xbf\xbf"
161             : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
162             0x6FFFF,
163         ],
164         [ "non-character code point U+7FFFE",
165             (isASCII)
166             ?              "\xf1\xbf\xbf\xbe"
167             : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
168             0x7FFFE,
169         ],
170         [ "non-character code point U+7FFFF",
171             (isASCII)
172             ?              "\xf1\xbf\xbf\xbf"
173             : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
174             0x7FFFF,
175         ],
176         [ "non-character code point U+8FFFE",
177             (isASCII)
178             ?              "\xf2\x8f\xbf\xbe"
179             : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
180             0x8FFFE,
181         ],
182         [ "non-character code point U+8FFFF",
183             (isASCII)
184             ?              "\xf2\x8f\xbf\xbf"
185             : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
186             0x8FFFF,
187         ],
188         [ "non-character code point U+9FFFE",
189             (isASCII)
190             ?              "\xf2\x9f\xbf\xbe"
191             : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
192             0x9FFFE,
193         ],
194         [ "non-character code point U+9FFFF",
195             (isASCII)
196             ?              "\xf2\x9f\xbf\xbf"
197             : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
198             0x9FFFF,
199         ],
200         [ "non-character code point U+AFFFE",
201             (isASCII)
202             ?              "\xf2\xaf\xbf\xbe"
203             : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
204             0xAFFFE,
205         ],
206         [ "non-character code point U+AFFFF",
207             (isASCII)
208             ?              "\xf2\xaf\xbf\xbf"
209             : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
210             0xAFFFF,
211         ],
212         [ "non-character code point U+BFFFE",
213             (isASCII)
214             ?              "\xf2\xbf\xbf\xbe"
215             : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
216             0xBFFFE,
217         ],
218         [ "non-character code point U+BFFFF",
219             (isASCII)
220             ?              "\xf2\xbf\xbf\xbf"
221             : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
222             0xBFFFF,
223         ],
224         [ "non-character code point U+CFFFE",
225             (isASCII)
226             ?              "\xf3\x8f\xbf\xbe"
227             : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
228             0xCFFFE,
229         ],
230         [ "non-character code point U+CFFFF",
231             (isASCII)
232             ?              "\xf3\x8f\xbf\xbf"
233             : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
234             0xCFFFF,
235         ],
236         [ "non-character code point U+DFFFE",
237             (isASCII)
238             ?              "\xf3\x9f\xbf\xbe"
239             : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
240             0xDFFFE,
241         ],
242         [ "non-character code point U+DFFFF",
243             (isASCII)
244             ?              "\xf3\x9f\xbf\xbf"
245             : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
246             0xDFFFF,
247         ],
248         [ "non-character code point U+EFFFE",
249             (isASCII)
250             ?              "\xf3\xaf\xbf\xbe"
251             : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
252             0xEFFFE,
253         ],
254         [ "non-character code point U+EFFFF",
255             (isASCII)
256             ?              "\xf3\xaf\xbf\xbf"
257             : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
258             0xEFFFF,
259         ],
260         [ "non-character code point U+FFFFE",
261             (isASCII)
262             ?              "\xf3\xbf\xbf\xbe"
263             : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
264             0xFFFFE,
265         ],
266         [ "non-character code point U+FFFFF",
267             (isASCII)
268             ?              "\xf3\xbf\xbf\xbf"
269             : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
270             0xFFFFF,
271         ],
272         [ "non-character code point U+10FFFE",
273             (isASCII)
274             ?              "\xf4\x8f\xbf\xbe"
275             : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
276             0x10FFFE,
277         ],
278         [ "non-character code point U+10FFFF",
279             (isASCII)
280             ?              "\xf4\x8f\xbf\xbf"
281             : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
282             0x10FFFF,
283         ],
284         [ "first non_unicode",
285             (isASCII)
286             ?              "\xf4\x90\x80\x80"
287             : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
288             0x110000,
289             2,
290         ],
291         [ "non_unicode whose first byte tells that",
292             (isASCII)
293             ?              "\xf5\x80\x80\x80"
294             : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
295             (isASCII) ? 0x140000 : 0x200000,
296             1,
297         ],
298         [ "lowest 32 bit code point",
299             (isASCII)
300             ?  "\xfe\x82\x80\x80\x80\x80\x80"
301             : I8_to_native(
302                 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
303             0x80000000,
304         ],
305         [ "highest 32 bit code point",
306             (isASCII)
307             ?  "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
308             : I8_to_native(
309                "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
310             0xFFFFFFFF,
311         ],
312         [ "Lowest 33 bit code point",
313             (isASCII)
314             ?  "\xfe\x84\x80\x80\x80\x80\x80"
315             : I8_to_native(
316                 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
317             ($::is64bit) ? 0x100000000 : -1,   # Overflows on 32-bit systems
318         ],
319     );
320
321     if (! $::is64bit) {
322         if (isASCII) {
323             push @tests,
324                 [ "overflow that old algorithm failed to detect",
325                     "\xfe\x86\x80\x80\x80\x80\x80",
326                     -1,
327                 ];
328         }
329     }
330
331     if ($::is64bit) {
332         push @tests,
333             [ "highest 64 bit code point",
334               (isASCII)
335               ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
336               : I8_to_native(
337                 "\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"),
338               0xFFFFFFFFFFFFFFFF,
339               (isASCII) ? 1 : 2,
340             ],
341             [ "first 65 bit code point",
342               (isASCII)
343               ? "\xff\x80\x9f\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
344               : I8_to_native(
345                 "\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
346               -1,
347             ];
348         if (isASCII) {
349             push @tests,
350                 [ "Lowest code point requiring 13 bytes to represent",
351                     "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
352                     0x1000000000,
353                 ],
354                 [ "overflow that old algorithm failed to detect",
355                     "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
356                     -1,
357                 ];
358         }
359         else {
360             push @tests,    # These could falsely show wrongly in a naive
361                             # implementation
362                 [ "requires at least 32 bits",
363                     I8_to_native(
364                     "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
365                     0x800000000,
366                 ],
367                 [ "requires at least 32 bits",
368                     I8_to_native(
369                     "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
370                     0x10000000000,
371                 ],
372                 [ "requires at least 32 bits",
373                     I8_to_native(
374                     "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
375                     0x200000000000,
376                 ],
377                 [ "requires at least 32 bits",
378                     I8_to_native(
379                     "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
380                     0x4000000000000,
381                 ],
382                 [ "requires at least 32 bits",
383                     I8_to_native(
384                     "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
385                     0x80000000000000,
386                 ],
387                 [ "requires at least 32 bits",
388                     I8_to_native(
389                     "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
390                     0x1000000000000000,
391                 ];
392         }
393     }
394 }
395
396 sub flags_to_text($$)
397 {
398     my ($flags, $flags_to_text_ref) = @_;
399
400     # Returns a string containing a mnemonic representation of the bits that
401     # are set in the $flags.  These are assumed to be flag bits.  The return
402     # looks like "FOO|BAR|BAZ".  The second parameter is a reference to an
403     # array that gives the textual representation of all the possible flags.
404     # Element 0 is the text for the bit 0 flag; element 1 for bit 1; ....  If
405     # no bits at all are set the string "0" is returned;
406
407     my @flag_text;
408     my $shift = 0;
409
410     return "0" if $flags == 0;
411
412     while ($flags) {
413         #diag sprintf "%x", $flags;
414         if ($flags & 1) {
415             push @flag_text, $flags_to_text_ref->[$shift];
416         }
417         $shift++;
418         $flags >>= 1;
419     }
420
421     return join "|", @flag_text;
422 }
423
424 # Possible flag returns from utf8n_to_uvchr_error().  These should have G_,
425 # instead of A_, D_, but the prefixes will be used in a a later commit, so
426 # minimize churn by having them here.
427 my @utf8n_flags_to_text =  ( qw(
428         A_EMPTY
429         A_CONTINUATION
430         A_NON_CONTINUATION
431         A_SHORT
432         A_LONG
433         A_LONG_AND_ITS_VALUE
434         PLACEHOLDER
435         A_OVERFLOW
436         D_SURROGATE
437         W_SURROGATE
438         D_NONCHAR
439         W_NONCHAR
440         D_SUPER
441         W_SUPER
442         D_PERL_EXTENDED
443         W_PERL_EXTENDED
444         CHECK_ONLY
445         NO_CONFIDENCE_IN_CURLEN_
446     ) );
447
448 sub utf8n_display_call($)
449 {
450     # Converts an eval string that calls test_utf8n_to_uvchr into a more human
451     # readable form, and returns it.  Doesn't work if the byte string contains
452     # an apostrophe.  The return will look something like:
453     #   test_utf8n_to_uvchr_error('$bytes', $length, $flags)
454     #diag $_[0];
455
456     $_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x;
457     my $text1 = $1;     # Everything before the byte string
458     my $bytes = $2;
459     my $text2 = $3;     # Includes the length
460     my $flags = $4;
461
462     return $text1
463          . display_bytes($bytes)
464          . $text2
465          . flags_to_text($flags, \@utf8n_flags_to_text)
466          . ')';
467 }
468
469 sub uvchr_display_call($)
470 {
471     # Converts an eval string that calls test_uvchr_to_utf8 into a more human
472     # readable form, and returns it.  The return will look something like:
473     #   test_uvchr_to_utf8n_flags($uv, $flags)
474     #diag $_[0];
475
476     my @flags_to_text =  ( qw(
477             W_SURROGATE
478             W_NONCHAR
479             W_SUPER
480             W_PERL_EXTENDED
481             D_SURROGATE
482             D_NONCHAR
483             D_SUPER
484             D_PERL_EXTENDED
485        ) );
486
487     $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x;
488     my $text = $1;
489     my $cp = sprintf "%X", $2;
490     my $flags = $3;
491
492     return "${text}0x$cp, " . flags_to_text($flags, \@flags_to_text) . ')';
493 }
494
495 sub do_warnings_test(@)
496 {
497     my @expected_warnings = @_;
498
499     # Compares the input expected warnings array with @warnings_gotten,
500     # generating a pass for each found, removing it from @warnings_gotten.
501     # Any discrepancies generate test failures.  Returns TRUE if no
502     # discrepcancies; otherwise FALSE.
503
504     my $succeeded = 1;
505
506     if (@expected_warnings == 0) {
507         if (! is(@warnings_gotten, 0, "    Expected and got no warnings")) {
508             output_warnings(@warnings_gotten);
509             $succeeded = 0;
510         }
511         return $succeeded;
512     }
513
514     # Check that we got all the expected warnings,
515     # removing each one found
516   WARNING:
517     foreach my $expected (@expected_warnings) {
518         foreach (my $i = 0; $i < @warnings_gotten; $i++) {
519             if ($warnings_gotten[$i] =~ $expected) {
520                 pass("    Expected and got warning: "
521                     . " $warnings_gotten[$i]");
522                 splice @warnings_gotten, $i, 1;
523                 next WARNING;
524             }
525         }
526         fail("    Expected a warning that matches "
527             . $expected . " but didn't get it");
528         $succeeded = 0;
529     }
530
531     if (! is(@warnings_gotten, 0, "    Got no unexpected warnings")) {
532         output_warnings(@warnings_gotten);
533         $succeeded = 0;
534     }
535
536     return $succeeded;
537 }
538
539 # This test is split into this number of files.
540 my $num_test_files = $ENV{TEST_JOBS} || 1;
541 $num_test_files = 10 if $num_test_files > 10;
542
543 my $test_count = -1;
544 foreach my $test (@tests) {
545     $test_count++;
546     next if $test_count % $num_test_files != $::TEST_CHUNK;
547
548     my ($testname, $bytes, $allowed_uv, $needed_to_discern_len) = @$test;
549
550     my $length = length $bytes;
551     my $will_overflow = $allowed_uv < 0;
552
553     my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv);
554     my $display_bytes = display_bytes($bytes);
555
556     my $controlling_warning_category;
557     my $utf8n_flag_to_warn;
558     my $utf8n_flag_to_disallow;
559     my $uvchr_flag_to_warn;
560     my $uvchr_flag_to_disallow;
561
562     # We want to test that the independent flags are actually independent.
563     # For example, that a surrogate doesn't trigger a non-character warning,
564     # and conversely, turning off an above-Unicode flag doesn't suppress a
565     # surrogate warning.  Earlier versions of this file used nested loops to
566     # test all possible combinations.  But that creates lots of tests, making
567     # this run too long.  What is now done instead is to use the complement of
568     # the category we are testing to greatly reduce the combinatorial
569     # explosion.  For example, if we have a surrogate and we aren't expecting
570     # a warning about it, we set all the flags for non-surrogates to raise
571     # warnings.  If one shows up, it indicates the flags aren't independent.
572     my $utf8n_flag_to_warn_complement;
573     my $utf8n_flag_to_disallow_complement;
574     my $uvchr_flag_to_warn_complement;
575     my $uvchr_flag_to_disallow_complement;
576
577     # Many of the code points being tested are middling in that if code point
578     # edge cases work, these are very likely to as well.  Because this test
579     # file takes a while to execute, we skip testing the edge effects of code
580     # points deemed middling, while testing their basics and continuing to
581     # fully test the non-middling code points.
582     my $skip_most_tests = 0;
583
584     my $cp_message_qr;      # Pattern that matches the message raised when
585                             # that message contains the problematic code
586                             # point.  The message is the same (currently) both
587                             # when going from/to utf8.
588     my $non_cp_trailing_text;   # The suffix text when the message doesn't
589                                 # contain a code point.  (This is a result of
590                                 # some sort of malformation that means we
591                                 # can't get an exact code poin
592     my $extended_cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
593                         \Q requires a Perl extension, and so is not\E
594                         \Q portable\E/x;
595     my $extended_non_cp_trailing_text
596                         = "is a Perl extension, and so is not portable";
597
598     # Is this test malformed from the beginning?  If so, we know to generally
599     # expect that the tests will show it isn't valid.
600     my $initially_malformed = 0;
601
602     if ($will_overflow || $allowed_uv > 0x10FFFF) {
603
604         # Set the SUPER flags; later, we test for PERL_EXTENDED as well.
605         $utf8n_flag_to_warn     = $::UTF8_WARN_SUPER;
606         $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER;
607         $uvchr_flag_to_warn     = $::UNICODE_WARN_SUPER;
608         $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;;
609
610         # Below, we add the flags for non-perl_extended to the code points
611         # that don't fit that category.  Special tests are done for this
612         # category in the inner loop.
613         $utf8n_flag_to_warn_complement     = $::UTF8_WARN_NONCHAR
614                                             |$::UTF8_WARN_SURROGATE;
615         $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
616                                             |$::UTF8_DISALLOW_SURROGATE;
617         $uvchr_flag_to_warn_complement     = $::UNICODE_WARN_NONCHAR
618                                             |$::UNICODE_WARN_SURROGATE;
619         $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
620                                             |$::UNICODE_DISALLOW_SURROGATE;
621         $controlling_warning_category = 'non_unicode';
622
623         if ($will_overflow) {  # This is realy a malformation
624             $non_cp_trailing_text = "if you see this, there is an error";
625             $cp_message_qr = qr/\Q$non_cp_trailing_text\E/;
626             $initially_malformed = 1;
627             if (! defined $needed_to_discern_len) {
628                 $needed_to_discern_len = overflow_discern_len($length);
629             }
630         }
631         elsif (requires_extended_utf8($allowed_uv)) {
632             $cp_message_qr = $extended_cp_message_qr;
633             $non_cp_trailing_text = $extended_non_cp_trailing_text;
634             $needed_to_discern_len = 1 unless defined $needed_to_discern_len;
635         }
636         else {
637             $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
638                                 \Q may not be portable\E/x;
639             $non_cp_trailing_text = "is for a non-Unicode code point, may not"
640                                 . " be portable";
641             $utf8n_flag_to_warn_complement     |= $::UTF8_WARN_PERL_EXTENDED;
642             $utf8n_flag_to_disallow_complement
643                                            |= $::UTF8_DISALLOW_PERL_EXTENDED;
644             $uvchr_flag_to_warn_complement |= $::UNICODE_WARN_PERL_EXTENDED;
645             $uvchr_flag_to_disallow_complement
646                                         |= $::UNICODE_DISALLOW_PERL_EXTENDED;
647         }
648     }
649     elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
650         $cp_message_qr = qr/UTF-16 surrogate U\+$uv_string/;
651         $non_cp_trailing_text = "is for a surrogate";
652         $needed_to_discern_len = 2 unless defined $needed_to_discern_len;
653         $skip_most_tests = 1 if $allowed_uv > 0xD800 && $allowed_uv < 0xDFFF;
654
655         $utf8n_flag_to_warn     = $::UTF8_WARN_SURROGATE;
656         $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SURROGATE;
657         $uvchr_flag_to_warn     = $::UNICODE_WARN_SURROGATE;
658         $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SURROGATE;;
659
660         $utf8n_flag_to_warn_complement     = $::UTF8_WARN_NONCHAR
661                                             |$::UTF8_WARN_SUPER
662                                             |$::UTF8_WARN_PERL_EXTENDED;
663         $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
664                                             |$::UTF8_DISALLOW_SUPER
665                                             |$::UTF8_DISALLOW_PERL_EXTENDED;
666         $uvchr_flag_to_warn_complement     = $::UNICODE_WARN_NONCHAR
667                                             |$::UNICODE_WARN_SUPER
668                                             |$::UNICODE_WARN_PERL_EXTENDED;
669         $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
670                                             |$::UNICODE_DISALLOW_SUPER
671                                             |$::UNICODE_DISALLOW_PERL_EXTENDED;
672         $controlling_warning_category = 'surrogate';
673     }
674     elsif (   ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
675            || ($allowed_uv & 0xFFFE) == 0xFFFE)
676     {
677         $cp_message_qr = qr/\QUnicode non-character U+$uv_string\E
678                             \Q is not recommended for open interchange\E/x;
679         $non_cp_trailing_text = "if you see this, there is an error";
680         $needed_to_discern_len = $length unless defined $needed_to_discern_len;
681         if (   ($allowed_uv > 0xFDD0 && $allowed_uv < 0xFDEF)
682             || ($allowed_uv > 0xFFFF && $allowed_uv < 0x10FFFE))
683         {
684             $skip_most_tests = 1;
685         }
686
687         $utf8n_flag_to_warn     = $::UTF8_WARN_NONCHAR;
688         $utf8n_flag_to_disallow = $::UTF8_DISALLOW_NONCHAR;
689         $uvchr_flag_to_warn     = $::UNICODE_WARN_NONCHAR;
690         $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_NONCHAR;;
691
692         $utf8n_flag_to_warn_complement     = $::UTF8_WARN_SURROGATE
693                                             |$::UTF8_WARN_SUPER
694                                             |$::UTF8_WARN_PERL_EXTENDED;
695         $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE
696                                             |$::UTF8_DISALLOW_SUPER
697                                             |$::UTF8_DISALLOW_PERL_EXTENDED;
698         $uvchr_flag_to_warn_complement     = $::UNICODE_WARN_SURROGATE
699                                             |$::UNICODE_WARN_SUPER
700                                             |$::UNICODE_WARN_PERL_EXTENDED;
701         $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_SURROGATE
702                                             |$::UNICODE_DISALLOW_SUPER
703                                             |$::UNICODE_DISALLOW_PERL_EXTENDED;
704
705         $controlling_warning_category = 'nonchar';
706     }
707     else {
708         die "Can't figure out what type of warning to test for $testname"
709     }
710
711     die 'Didn\'t set $needed_to_discern_len for ' . $testname
712                                         unless defined $needed_to_discern_len;
713
714     {   # First test the isFOO calls
715         use warnings; no warnings 'deprecated';   # Make sure these don't raise warnings
716         undef @warnings_gotten;
717
718         my $ret = test_isUTF8_CHAR($bytes, $length);
719         my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
720         if ($initially_malformed) {
721             is($ret, 0, "For $testname: isUTF8_CHAR() returns 0");
722             is($ret_flags, 0, "    And isUTF8_CHAR_flags() returns 0");
723         }
724         else {
725             is($ret, $length,
726                "For $testname: isUTF8_CHAR() returns expected length: $length");
727             is($ret_flags, $length, "    And isUTF8_CHAR_flags(...,0)"
728                                   . " returns expected length: $length");
729         }
730         is(scalar @warnings_gotten, 0,
731                 "    And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags generated"
732               . " any warnings")
733           or output_warnings(@warnings_gotten);
734
735         undef @warnings_gotten;
736         $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
737         if ($initially_malformed) {
738             is($ret, 0, "    And isSTRICT_UTF8_CHAR() returns 0");
739         }
740         else {
741             my $expected_ret = (   $testname =~ /surrogate|non-character/
742                                 || $allowed_uv > 0x10FFFF)
743                                ? 0
744                                : $length;
745             is($ret, $expected_ret, "    And isSTRICT_UTF8_CHAR() returns"
746                                   . " expected length: $expected_ret");
747             $ret = test_isUTF8_CHAR_flags($bytes, $length,
748                                           $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
749             is($ret, $expected_ret,
750                     "    And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')"
751                     . " acts like isSTRICT_UTF8_CHAR");
752         }
753         is(scalar @warnings_gotten, 0,
754                 "    And neither isSTRICT_UTF8_CHAR() nor isUTF8_CHAR_flags"
755               . " generated any warnings")
756           or output_warnings(@warnings_gotten);
757
758         undef @warnings_gotten;
759         $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
760         if ($initially_malformed) {
761             is($ret, 0, "    And isC9_STRICT_UTF8_CHAR() returns 0");
762         }
763         else {
764             my $expected_ret = (   $testname =~ /surrogate/
765                                 || $allowed_uv > 0x10FFFF)
766                                ? 0
767                                : $length;
768             is($ret, $expected_ret, "    And isC9_STRICT_UTF8_CHAR()"
769                                    ." returns expected length: $expected_ret");
770             $ret = test_isUTF8_CHAR_flags($bytes, $length,
771                                         $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
772             is($ret, $expected_ret,
773                   "    And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')"
774                 . " acts like isC9_STRICT_UTF8_CHAR");
775         }
776         is(scalar @warnings_gotten, 0,
777                 "    And neither isC9_STRICT_UTF8_CHAR() nor isUTF8_CHAR_flags"
778               . " generated any warnings")
779           or output_warnings(@warnings_gotten);
780
781         foreach my $disallow_type (0..2) {
782             # 0 is don't disallow this type of code point
783             # 1 is do disallow
784             # 2 is do disallow, but only code points requiring
785             #   perl-extended-UTF8
786
787             my $disallow_flags;
788             my $expected_ret;
789
790             if ($initially_malformed) {
791
792                 # Malformations are by default disallowed, so testing with
793                 # $disallow_type equal to 0 is sufficicient.
794                 next if $disallow_type;
795
796                 $disallow_flags = 0;
797                 $expected_ret = 0;
798             }
799             elsif ($disallow_type == 1) {
800                 $disallow_flags = $utf8n_flag_to_disallow;
801                 $expected_ret = 0;
802             }
803             elsif ($disallow_type == 2) {
804                 next if ! requires_extended_utf8($allowed_uv);
805                 $disallow_flags = $::UTF8_DISALLOW_PERL_EXTENDED;
806                 $expected_ret = 0;
807             }
808             else {  # type is 0
809                 $disallow_flags = $utf8n_flag_to_disallow_complement;
810                 $expected_ret = $length;
811             }
812
813             $ret = test_isUTF8_CHAR_flags($bytes, $length, $disallow_flags);
814             is($ret, $expected_ret, "    And isUTF8_CHAR_flags("
815                                   . "$display_bytes, $disallow_flags) returns "
816                                   . $expected_ret)
817              or diag "The flags mean "
818               . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
819
820             is(scalar @warnings_gotten, 0,
821                     "    And isUTF8_CHAR_flags(...) generated no warnings")
822             or output_warnings(@warnings_gotten);
823
824             # Test partial character handling, for each byte not a full character
825             my $did_test_partial = 0;
826             for (my $j = 1; $j < $length - 1; $j++) {
827                 $did_test_partial = 1;
828                 my $partial = substr($bytes, 0, $j);
829                 my $ret_should_be;
830                 my $comment;
831                 if ($disallow_type || $initially_malformed) {
832                     $ret_should_be = 0;
833                     $comment = "disallowed";
834                     if ($j < $needed_to_discern_len) {
835                         $ret_should_be = 1;
836                         $comment .= ", but need $needed_to_discern_len"
837                                  . " bytes to discern:";
838                     }
839                 }
840                 else {
841                     $ret_should_be = 1;
842                     $comment = "allowed";
843                 }
844
845                 undef @warnings_gotten;
846
847                 $ret = test_is_utf8_valid_partial_char_flags($partial, $j,
848                                                              $disallow_flags);
849                 is($ret, $ret_should_be,
850                     "    And is_utf8_valid_partial_char_flags("
851                     . display_bytes($partial)
852                     . ", $disallow_flags), $comment: returns $ret_should_be")
853                  or diag "The flags mean "
854                   . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
855             }
856
857             if ($did_test_partial) {
858                 is(scalar @warnings_gotten, 0,
859                         "    And is_utf8_valid_partial_char_flags()"
860                         . " generated no warnings for any of the lengths")
861                     or output_warnings(@warnings_gotten);
862             }
863         }
864     }
865
866     # Now test the to/from UTF-8 calls
867     # This is more complicated than the malformations tested in other files in
868     # this directory, as there are several orthogonal variables involved.  We
869     # test most possible combinations
870
871     # We try various combinations of malformations that can occur
872     foreach my $short (0, 1) {
873       next if $skip_most_tests && $short;
874       foreach my $unexpected_noncont (0, 1) {
875         next if $skip_most_tests && $unexpected_noncont;
876         foreach my $overlong (0, 1) {
877           next if $overlong && $skip_most_tests;
878
879           # If we're creating an overlong, it can't be longer than the
880           # maximum length, so skip if we're already at that length.
881           next if $overlong && $length >= $::max_bytes;
882
883           my $this_cp_message_qr = $cp_message_qr;
884           my $this_non_cp_trailing_text = $non_cp_trailing_text;
885
886           foreach my $malformed_allow_type (0..2) {
887             # 0 don't allow this malformation; ignored if no malformation
888             # 1 allow, with REPLACEMENT CHARACTER returned
889             # 2 allow, with intended code point returned.  All malformations
890             #   other than overlong can't determine the intended code point,
891             #   so this isn't valid for them.
892             next if     $malformed_allow_type == 2
893                     && ($will_overflow || $short || $unexpected_noncont);
894             next if $skip_most_tests && $malformed_allow_type;
895
896             # Here we are in the innermost loop for malformations.  So we
897             # know which ones are in effect.  Can now change the input to be
898             # appropriately malformed.  We also can set up certain other
899             # things now, like whether we expect a return flag from this
900             # malformation, and which flag.
901
902             my $this_bytes = $bytes;
903             my $this_length = $length;
904             my $this_expected_len = $length;
905             my $this_needed_to_discern_len = $needed_to_discern_len;
906
907             my @malformation_names;
908             my @expected_malformation_warnings;
909             my @expected_malformation_return_flags;
910
911             # Contains the flags for any allowed malformations.  Currently no
912             # combinations of on/off are tested for.  It's either all are
913             # allowed, or none are.
914             my $allow_flags = 0;
915             my $overlong_is_in_perl_extended_utf8 = 0;
916             my $dont_use_overlong_cp = 0;
917
918             if ($overlong) {
919                 my $new_expected_len;
920
921                 # To force this malformation, we convert the original start
922                 # byte into a continuation byte with the same data bits as
923                 # originally. ...
924                 my $start_byte = substr($this_bytes, 0, 1);
925                 my $converted_to_continuation_byte
926                                             = start_byte_to_cont($start_byte);
927
928                 # ... Then we prepend it with a known overlong sequence.  This
929                 # should evaluate to the exact same code point as the
930                 # original.  We try to avoid an overlong using Perl extended
931                 # UTF-8.  The code points are the highest representable as
932                 # overlongs on the respective platform without using extended
933                 # UTF-8.
934                 if (native_to_I8($start_byte) lt "\xFC") {
935                     $start_byte = I8_to_native("\xFC");
936                     $new_expected_len = 6;
937                 }
938                 elsif (! isASCII && native_to_I8($start_byte) lt "\xFE") {
939
940                     # FE is not extended UTF-8 on EBCDIC
941                     $start_byte = I8_to_native("\xFE");
942                     $new_expected_len = 7;
943                 }
944                 else {  # Must use extended UTF-8.  On ASCII platforms, we
945                         # could express some overlongs here starting with
946                         # \xFE, but there's no real reason to do so.
947                     $overlong_is_in_perl_extended_utf8 = 1;
948                     $start_byte = I8_to_native("\xFF");
949                     $new_expected_len = $::max_bytes;
950                     $this_cp_message_qr = $extended_cp_message_qr;
951
952                     # The warning that gets raised doesn't include the code
953                     # point in the message if the code point can be expressed
954                     # without using extended UTF-8, but the particular
955                     # overlong sequence used is in extended UTF-8.  To do
956                     # otherwise would be confusing to the user, as it would
957                     # claim the code point requires extended, when it doesn't.
958                     $dont_use_overlong_cp = 1
959                                     unless requires_extended_utf8($allowed_uv);
960                     $this_non_cp_trailing_text = $extended_non_cp_trailing_text;
961                 }
962
963                 # Splice in the revise continuation byte, preceded by the
964                 # start byte and the proper number of the lowest continuation
965                 # bytes.
966                 $this_bytes =   $start_byte
967                              . ($native_lowest_continuation_chr
968                                 x ( $new_expected_len - 1 - length($this_bytes)))
969                              .  $converted_to_continuation_byte
970                              .  substr($this_bytes, 1);
971                 $this_length = length($this_bytes);
972                 $this_needed_to_discern_len =    $new_expected_len
973                                             - (  $this_expected_len
974                                                - $this_needed_to_discern_len);
975                 $this_expected_len = $new_expected_len;
976             }
977
978             if ($short) {
979
980                 # To force this malformation, just tell the test to not look
981                 # as far as it should into the input.
982                 $this_length--;
983                 $this_expected_len--;
984
985                 $allow_flags |= $::UTF8_ALLOW_SHORT if $malformed_allow_type;
986             }
987
988             if ($unexpected_noncont) {
989
990                 # To force this malformation, change the final continuation
991                 # byte into a non continuation.
992                 my $pos = ($short) ? -2 : -1;
993                 substr($this_bytes, $pos, 1) = '?';
994                 $this_expected_len--;
995             }
996
997             # The whole point of a test that is malformed from the beginning
998             # is to test for that malformation.  If we've modified things so
999             # much that we don't have enough information to detect that
1000             # malformation, there's no point in testing.
1001             next if    $initially_malformed
1002                     && $this_expected_len < $this_needed_to_discern_len;
1003
1004             # Here, we've transformed the input with all of the desired
1005             # non-overflow malformations.  We are now in a position to
1006             # construct any potential warnings for those malformations.  But
1007             # it's a pain to get the detailed messages exactly right, so for
1008             # now XXX, only do so for those that return an explicit code
1009             # point.
1010
1011             if ($overlong) {
1012                 push @malformation_names, 'overlong';
1013                 push @expected_malformation_return_flags, $::UTF8_GOT_LONG;
1014
1015                 # If one of the other malformation types is also in effect, we
1016                 # don't know what the intended code point was.
1017                 if ($short || $unexpected_noncont || $will_overflow) {
1018                     push @expected_malformation_warnings, qr/overlong/;
1019                 }
1020                 else {
1021                     my $wrong_bytes = display_bytes_no_quotes(
1022                                          substr($this_bytes, 0, $this_length));
1023                     my $correct_bytes = display_bytes_no_quotes($bytes);
1024                     my $prefix = ($allowed_uv > 0x10FFFF) ? "0x" : "U+";
1025                     push @expected_malformation_warnings,
1026                             qr/\QMalformed UTF-8 character: $wrong_bytes\E
1027                                \Q (overlong; instead use\E
1028                                \Q $correct_bytes to\E
1029                                \Q represent $prefix$uv_string)/x;
1030                 }
1031
1032                 if ($malformed_allow_type == 2) {
1033                     $allow_flags |= $::UTF8_ALLOW_LONG_AND_ITS_VALUE;
1034                 }
1035                 elsif ($malformed_allow_type) {
1036                     $allow_flags |= $::UTF8_ALLOW_LONG;
1037                 }
1038             }
1039             if ($short) {
1040                 push @malformation_names, 'short';
1041                 push @expected_malformation_return_flags, $::UTF8_GOT_SHORT;
1042                 push @expected_malformation_warnings, qr/too short/;
1043             }
1044             if ($unexpected_noncont) {
1045                 push @malformation_names, 'unexpected non-continuation';
1046                 push @expected_malformation_return_flags,
1047                                 $::UTF8_GOT_NON_CONTINUATION;
1048                 $allow_flags |= $::UTF8_ALLOW_NON_CONTINUATION
1049                                                     if $malformed_allow_type;
1050                 push @expected_malformation_warnings,
1051                                         qr/unexpected non-continuation byte/;
1052             }
1053
1054             # The overflow malformation is done differently than other
1055             # malformations.  It comes from manually typed tests in the test
1056             # array.  We now make it be treated like one of the other
1057             # malformations.  But some has to be deferred until the inner loop
1058             my $overflow_msg_pattern;
1059             if ($will_overflow) {
1060                 push @malformation_names, 'overflow';
1061
1062                 $overflow_msg_pattern = display_bytes_no_quotes(
1063                                     substr($this_bytes, 0, $this_expected_len));
1064                 $overflow_msg_pattern = qr/\QMalformed UTF-8 character:\E
1065                                            \Q $overflow_msg_pattern\E
1066                                            \Q (overflows)\E/x;
1067                 push @expected_malformation_return_flags, $::UTF8_GOT_OVERFLOW;
1068                 $allow_flags |= $::UTF8_ALLOW_OVERFLOW if $malformed_allow_type;
1069             }
1070
1071             # And we can create the malformation-related text for the the test
1072             # names we eventually will generate.
1073             my $malformations_name = "";
1074             if (@malformation_names) {
1075                 $malformations_name .= "dis" unless $malformed_allow_type;
1076                 $malformations_name .= "allowed ";
1077                 $malformations_name .= "malformation";
1078                 $malformations_name .= "s" if @malformation_names > 1;
1079                 $malformations_name .= ": ";
1080                 $malformations_name .=  join "/", @malformation_names;
1081                 $malformations_name =  " ($malformations_name)";
1082             }
1083
1084             # Done setting up the malformation related stuff
1085
1086             foreach my $do_disallow (0, 1) {
1087               next if $skip_most_tests && ! $do_disallow;
1088
1089               # We classify the warnings into certain "interesting" types,
1090               # described later
1091               foreach my $warning_type (0..4) {
1092                 next if $skip_most_tests && $warning_type != 1;
1093                 foreach my $use_warn_flag (0, 1) {
1094                     next if $skip_most_tests && ! $use_warn_flag;
1095
1096                     # Finally, here is the inner loop
1097
1098                     my $this_utf8n_flag_to_warn = $utf8n_flag_to_warn;
1099                     my $this_utf8n_flag_to_disallow = $utf8n_flag_to_disallow;
1100                     my $this_uvchr_flag_to_warn = $uvchr_flag_to_warn;
1101                     my $this_uvchr_flag_to_disallow = $uvchr_flag_to_disallow;
1102
1103                     my $eval_warn;
1104                     my $expect_regular_warnings;
1105                     my $expect_warnings_for_malformed;
1106                     my $expect_warnings_for_overflow;
1107
1108                     if ($warning_type == 0) {
1109                         $eval_warn = "use warnings; no warnings 'deprecated'";
1110                         $expect_regular_warnings = $use_warn_flag;
1111
1112                         # We ordinarily expect overflow warnings here.  But it
1113                         # is somewhat more complicated, and the final
1114                         # determination is deferred to one place in the filw
1115                         # where we handle overflow.
1116                         $expect_warnings_for_overflow = 1;
1117
1118                         # We would ordinarily expect malformed warnings in
1119                         # this case, but not if malformations are allowed.
1120                         $expect_warnings_for_malformed
1121                                                 = $malformed_allow_type == 0;
1122                     }
1123                     elsif ($warning_type == 1) {
1124                         $eval_warn = "no warnings";
1125                         $expect_regular_warnings = 0;
1126                         $expect_warnings_for_overflow = 0;
1127                         $expect_warnings_for_malformed = 0;
1128                     }
1129                     elsif ($warning_type == 2) {
1130                         $eval_warn = "no warnings; use warnings 'utf8'";
1131                         $expect_regular_warnings = $use_warn_flag;
1132                         $expect_warnings_for_overflow = 1;
1133                         $expect_warnings_for_malformed
1134                                                 = $malformed_allow_type == 0;
1135                     }
1136                     elsif ($warning_type == 3) {
1137                         $eval_warn = "no warnings; use warnings"
1138                                    . " '$controlling_warning_category'";
1139                         $expect_regular_warnings = $use_warn_flag;
1140                         $expect_warnings_for_overflow
1141                             = $controlling_warning_category eq 'non_unicode';
1142                         $expect_warnings_for_malformed = 0;
1143                     }
1144                     elsif ($warning_type == 4) {  # Like type 3, but uses the
1145                                                   # PERL_EXTENDED flags
1146                         # The complement flags were set up so that the
1147                         # PERL_EXTENDED flags have been tested that they don't
1148                         # trigger wrongly for too small code points.  And the
1149                         # flags have been set up so that those small code
1150                         # points are tested for being above Unicode.  What's
1151                         # left to test is that the large code points do
1152                         # trigger the PERL_EXTENDED flags.
1153                         next if ! requires_extended_utf8($allowed_uv);
1154                         next if $controlling_warning_category ne 'non_unicode';
1155                         $eval_warn = "no warnings; use warnings 'non_unicode'";
1156                         $expect_regular_warnings = 1;
1157                         $expect_warnings_for_overflow = 1;
1158                         $expect_warnings_for_malformed = 0;
1159                         $this_utf8n_flag_to_warn = $::UTF8_WARN_PERL_EXTENDED;
1160                         $this_utf8n_flag_to_disallow
1161                                              = $::UTF8_DISALLOW_PERL_EXTENDED;
1162                         $this_uvchr_flag_to_warn
1163                                               = $::UNICODE_WARN_PERL_EXTENDED;
1164                         $this_uvchr_flag_to_disallow
1165                                           = $::UNICODE_DISALLOW_PERL_EXTENDED;
1166                     }
1167                     else {
1168                        die "Unexpected warning type '$warning_type'";
1169                     }
1170
1171                     # We only need to test the case where all warnings are
1172                     # enabled (type 0) to see if turning off the warning flag
1173                     # causes things to not be output.  If those pass, then
1174                     # turning on some sub-category of warnings, or turning off
1175                     # warnings altogether are extremely likely to not output
1176                     # warnings either, given how the warnings subsystem is
1177                     # supposed to work, and this file assumes it does work.
1178                     next if $warning_type != 0 && ! $use_warn_flag;
1179
1180                     # The convention is that the 'got' flag is the same value
1181                     # as the disallow one.  If this were violated, the tests
1182                     # here should start failing.
1183                     my $return_flag = $this_utf8n_flag_to_disallow;
1184
1185                     # If we aren't expecting warnings/disallow for this, turn
1186                     # on all the other flags.  That makes sure that they all
1187                     # are independent of this flag, and so we don't need to
1188                     # test them individually.
1189                     my $this_warning_flags
1190                             = ($use_warn_flag)
1191                               ? $this_utf8n_flag_to_warn
1192                               : ($overlong_is_in_perl_extended_utf8
1193                                 ? ($utf8n_flag_to_warn_complement
1194                                     & ~$::UTF8_WARN_PERL_EXTENDED)
1195                                 :  $utf8n_flag_to_warn_complement);
1196                     my $this_disallow_flags
1197                             = ($do_disallow)
1198                               ? $this_utf8n_flag_to_disallow
1199                               : ($overlong_is_in_perl_extended_utf8
1200                                  ? ($utf8n_flag_to_disallow_complement
1201                                     & ~$::UTF8_DISALLOW_PERL_EXTENDED)
1202                                  :  $utf8n_flag_to_disallow_complement);
1203                     my $expected_uv = $allowed_uv;
1204                     my $this_uv_string = $uv_string;
1205
1206                     my @expected_return_flags
1207                                         = @expected_malformation_return_flags;
1208                     my @expected_warnings;
1209                     push @expected_warnings, @expected_malformation_warnings
1210                                             if $expect_warnings_for_malformed;
1211
1212                     # The overflow malformation is done differently than other
1213                     # malformations.  It comes from manually typed tests in
1214                     # the test array, but it also is above Unicode and uses
1215                     # Perl extended UTF-8, so affects some of the flags being
1216                     # tested.  We now make it be treated like one of the other
1217                     # generated malformations.
1218                     if ($will_overflow) {
1219
1220                         # An overflow is (way) above Unicode, and overrides
1221                         # everything else.
1222                         $expect_regular_warnings = 0;
1223
1224                         # Earlier, we tentatively calculated whether this
1225                         # should emit a message or not.  It's tentative
1226                         # because, even if we ordinarily would output it, we
1227                         # don't if malformations are allowed -- except an
1228                         # overflow is also a SUPER and PERL_EXTENDED, and if
1229                         # warnings for those are enabled, the overflow
1230                         # warning does get raised.
1231                         if (   $expect_warnings_for_overflow
1232                             && (    $malformed_allow_type == 0
1233                                 ||   (   $this_warning_flags
1234                                       & ($::UTF8_WARN_SUPER
1235                                         |$::UTF8_WARN_PERL_EXTENDED))))
1236                         {
1237                             push @expected_warnings, $overflow_msg_pattern;
1238                         }
1239                     }
1240
1241                     # It may be that the malformations have shortened the
1242                     # amount of input we look at so much that we can't tell
1243                     # what the category the code point was in.  Otherwise, set
1244                     # up the expected return flags based on the warnings and
1245                     # disallowments.
1246                     if ($this_expected_len < $this_needed_to_discern_len) {
1247                         $expect_regular_warnings = 0;
1248                     }
1249                     elsif (   ($this_warning_flags & $this_utf8n_flag_to_warn)
1250                            || (  $this_disallow_flags
1251                                & $this_utf8n_flag_to_disallow))
1252                     {
1253                         push @expected_return_flags, $return_flag;
1254                     }
1255
1256                     # Finish setting up the expected warning.
1257                     if ($expect_regular_warnings) {
1258
1259                         # So far the array contains warnings generated by
1260                         # malformations.  Add the expected regular one.
1261                         unshift @expected_warnings, $this_cp_message_qr;
1262
1263                         # But it may need to be modified, because either of
1264                         # these malformations means we can't determine the
1265                         # expected code point.
1266                         if (   $short || $unexpected_noncont
1267                             || $dont_use_overlong_cp)
1268                         {
1269                             my $first_byte = substr($this_bytes, 0, 1);
1270                             $expected_warnings[0] = display_bytes(
1271                                     substr($this_bytes, 0, $this_expected_len));
1272                             $expected_warnings[0]
1273                                 = qr/[Aa]\Qny UTF-8 sequence that starts with\E
1274                                      \Q $expected_warnings[0]\E
1275                                      \Q $this_non_cp_trailing_text\E/x;
1276                         }
1277                     }
1278
1279                     # Is effectively disallowed if we've set up a malformation
1280                     # (unless malformations are allowed), even if the flag
1281                     # indicates it is allowed.  Fix up test name to indicate
1282                     # this as well
1283                     my $disallowed = 0;
1284                     if (   $this_disallow_flags & $this_utf8n_flag_to_disallow
1285                         && $this_expected_len >= $this_needed_to_discern_len)
1286                     {
1287                         $disallowed = 1;
1288                     }
1289                     if ($malformations_name) {
1290                         if ($malformed_allow_type == 0) {
1291                             $disallowed = 1;
1292                         }
1293                         elsif ($malformed_allow_type == 1) {
1294
1295                             # Even if allowed, the malformation returns the
1296                             # REPLACEMENT CHARACTER.
1297                             $expected_uv = 0xFFFD;
1298                             $this_uv_string = "0xFFFD"
1299                         }
1300                     }
1301
1302                     my $this_name = "utf8n_to_uvchr_error() $testname: ";
1303                     if (! $initially_malformed) {
1304                         $this_name .= ($disallowed)
1305                                        ? 'disallowed, '
1306                                        : 'allowed, ';
1307                     }
1308                     $this_name .= "$eval_warn";
1309                     $this_name .= ", " . ((  $this_warning_flags
1310                                             & $this_utf8n_flag_to_warn)
1311                                           ? 'with flag for raising warnings'
1312                                           : 'no flag for raising warnings');
1313                     $this_name .= $malformations_name;
1314
1315                     # Do the actual test using an eval
1316                     undef @warnings_gotten;
1317                     my $ret_ref;
1318                     my $this_flags
1319                         = $allow_flags|$this_warning_flags|$this_disallow_flags;
1320                     my $eval_text =      "$eval_warn; \$ret_ref"
1321                             . " = test_utf8n_to_uvchr_error("
1322                             . "'$this_bytes', $this_length, $this_flags)";
1323                     eval "$eval_text";
1324                     if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
1325                     {
1326                         diag "\$@='$@'; call was: "
1327                            . utf8n_display_call($eval_text);
1328                         next;
1329                     }
1330                     if ($disallowed) {
1331                         is($ret_ref->[0], 0, "    And returns 0")
1332                           or diag "Call was: " . utf8n_display_call($eval_text);
1333                     }
1334                     else {
1335                         is($ret_ref->[0], $expected_uv,
1336                                 "    And returns expected uv: "
1337                               . $this_uv_string)
1338                           or diag "Call was: " . utf8n_display_call($eval_text);
1339                     }
1340                     is($ret_ref->[1], $this_expected_len,
1341                                         "    And returns expected length:"
1342                                       . " $this_expected_len")
1343                       or diag "Call was: " . utf8n_display_call($eval_text);
1344
1345                     my $returned_flags = $ret_ref->[2];
1346
1347                     for (my $i = @expected_return_flags - 1; $i >= 0; $i--) {
1348                         if ($expected_return_flags[$i] & $returned_flags) {
1349                             if ($expected_return_flags[$i]
1350                                                 == $::UTF8_GOT_PERL_EXTENDED)
1351                             {
1352                                 pass("    Expected and got return flag for"
1353                                    . " PERL_EXTENDED");
1354                             }
1355                                    # The first entries in this are
1356                                    # malformations
1357                             elsif ($i > @malformation_names - 1)  {
1358                                 pass("    Expected and got return flag"
1359                                    . " for " . $controlling_warning_category);
1360                             }
1361                             else {
1362                                 pass("    Expected and got return flag for "
1363                                    . $malformation_names[$i]
1364                                    . " malformation");
1365                             }
1366                             $returned_flags &= ~$expected_return_flags[$i];
1367                             splice @expected_return_flags, $i, 1;
1368                         }
1369                     }
1370
1371                     is($returned_flags, 0,
1372                        "    Got no unexpected return flags")
1373                       or diag "The unexpected flags gotten were: "
1374                            . (flags_to_text($returned_flags,
1375                                             \@utf8n_flags_to_text)
1376                                 # We strip off any prefixes from the flag
1377                                 # names
1378                              =~ s/ \b [A-Z] _ //xgr);
1379                     is (scalar @expected_return_flags, 0,
1380                         "    Got all expected return flags")
1381                         or diag "The expected flags not gotten were: "
1382                            . (flags_to_text(eval join("|",
1383                                                         @expected_return_flags),
1384                                             \@utf8n_flags_to_text)
1385                                 # We strip off any prefixes from the flag
1386                                 # names
1387                              =~ s/ \b [A-Z] _ //xgr);
1388
1389                     do_warnings_test(@expected_warnings)
1390                       or diag "Call was: " . utf8n_display_call($eval_text);
1391                     undef @warnings_gotten;
1392
1393                     # Check CHECK_ONLY results when the input is
1394                     # disallowed.  Do this when actually disallowed,
1395                     # not just when the $this_disallow_flags is set
1396                     if ($disallowed) {
1397                         my $this_flags = $this_disallow_flags|$::UTF8_CHECK_ONLY;
1398                         my $eval_text = "use warnings; no warnings 'deprecated'; \$ret_ref ="
1399                                       . " test_utf8n_to_uvchr_error('"
1400                                       . "$this_bytes', $this_length,"
1401                                       . " $this_flags)";
1402                         eval $eval_text;
1403                         if (! ok ("$@ eq ''",
1404                             "    And eval succeeded with CHECK_ONLY"))
1405                         {
1406                             diag "\$@='$@'; Call was: "
1407                                . utf8n_display_call($eval_text);
1408                             next;
1409                         }
1410                         is($ret_ref->[0], 0, "    CHECK_ONLY: Returns 0")
1411                           or diag "Call was: " . utf8n_display_call($eval_text);
1412                         is($ret_ref->[1], -1,
1413                                        "    CHECK_ONLY: returns -1 for length")
1414                           or diag "Call was: " . utf8n_display_call($eval_text);
1415                         if (! is(scalar @warnings_gotten, 0,
1416                                       "    CHECK_ONLY: no warnings generated"))
1417                         {
1418                             diag "Call was: " . utf8n_display_call($eval_text);
1419                             output_warnings(@warnings_gotten);
1420                         }
1421                     }
1422
1423                     # Now repeat some of the above, but for
1424                     # uvchr_to_utf8_flags().  Since this comes from an
1425                     # existing code point, it hasn't overflowed, and isn't
1426                     # malformed.
1427                     next if @malformation_names;
1428
1429                     $this_warning_flags = ($use_warn_flag)
1430                                           ? $this_uvchr_flag_to_warn
1431                                           : 0;
1432                     $this_disallow_flags = ($do_disallow)
1433                                            ? $this_uvchr_flag_to_disallow
1434                                            : 0;
1435
1436                     $disallowed = $this_disallow_flags
1437                                 & $this_uvchr_flag_to_disallow;
1438                     $this_name .= ", " . ((  $this_warning_flags
1439                                            & $this_utf8n_flag_to_warn)
1440                                           ? 'with flag for raising warnings'
1441                                           : 'no flag for raising warnings');
1442
1443                     $this_name = "uvchr_to_utf8_flags() $testname: "
1444                                             . (($disallowed)
1445                                                 ? 'disallowed'
1446                                                 : 'allowed');
1447                     $this_name .= ", $eval_warn";
1448                     $this_name .= ", " . ((  $this_warning_flags
1449                                            & $this_uvchr_flag_to_warn)
1450                                         ? 'with warning flag'
1451                                         : 'no warning flag');
1452
1453                     undef @warnings_gotten;
1454                     my $ret;
1455                     $this_flags = $this_warning_flags|$this_disallow_flags;
1456                     $eval_text = "$eval_warn; \$ret ="
1457                             . " test_uvchr_to_utf8_flags("
1458                             . "$allowed_uv, $this_flags)";
1459                     eval "$eval_text";
1460                     if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
1461                     {
1462                         diag "\$@='$@'; call was: "
1463                            . uvchr_display_call($eval_text);
1464                         next;
1465                     }
1466                     if ($disallowed) {
1467                         is($ret, undef, "    And returns undef")
1468                           or diag "Call was: " . uvchr_display_call($eval_text);
1469                     }
1470                     else {
1471                         is($ret, $this_bytes, "    And returns expected string")
1472                           or diag "Call was: " . uvchr_display_call($eval_text);
1473                     }
1474
1475                     do_warnings_test(@expected_warnings)
1476                       or diag "Call was: " . uvchr_display_call($eval_text);
1477                 }
1478               }
1479             }
1480           }
1481         }
1482       }
1483     }
1484 }
1485
1486 done_testing;