This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
APItest/t/utf8_warn_base.pl: Improve some more diagnostics
[perl5.git] / ext / XS-APItest / t / utf8_warn_base.pl
1 #!perl -w
2
3 # This is a base file to be used by various .t's in its directory
4
5 use strict;
6 use Test::More;
7
8 BEGIN {
9     use_ok('XS::APItest');
10     require 'charset_tools.pl';
11     require './t/utf8_setup.pl';
12 };
13
14 $|=1;
15
16 no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
17                           # machines, and that is tested elsewhere
18
19 use XS::APItest;
20 use Data::Dumper;
21
22 my @warnings_gotten;
23
24 use warnings 'utf8';
25 local $SIG{__WARN__} = sub { push @warnings_gotten, @_ };
26
27 sub nonportable_regex ($) {
28
29     # Returns a pattern that matches the non-portable message raised either
30     # for the specific input code point, or the one generated when there
31     # is some malformation that precludes the message containing the specific
32     # code point
33
34     my $code_point = shift;
35
36     my $string = sprintf '(Code point 0x%X is not Unicode, and'
37                        . '|Any UTF-8 sequence that starts with'
38                        . ' "(\\\x[[:xdigit:]]{2})+" is for a'
39                        . ' non-Unicode code point, and is) not portable',
40                     $code_point;
41     return qr/$string/;
42 }
43
44 # Now test the cases where a legal code point is generated, but may or may not
45 # be allowed/warned on.
46 my @tests = (
47      # ($testname, $bytes, $disallow_flags, $controlling_warning_category,
48      #  $allowed_uv, $needed_to_discern_len )
49     [ "lowest surrogate",
50         (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
51         $::UTF8_DISALLOW_SURROGATE,
52         'surrogate', 0xD800,
53     ],
54     [ "a middle surrogate",
55         (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
56         $::UTF8_DISALLOW_SURROGATE,
57         'surrogate', 0xD90D,
58     ],
59     [ "highest surrogate",
60         (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
61         $::UTF8_DISALLOW_SURROGATE,
62         'surrogate', 0xDFFF,
63     ],
64     [ "first non_unicode",
65         (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
66         $::UTF8_DISALLOW_SUPER,
67         'non_unicode', 0x110000,
68         2,
69     ],
70     [ "non_unicode whose first byte tells that",
71         (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
72         $::UTF8_DISALLOW_SUPER,
73         'non_unicode',
74         (isASCII) ? 0x140000 : 0x200000,
75         1,
76     ],
77     [ "first of 32 consecutive non-character code points",
78         (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
79         $::UTF8_DISALLOW_NONCHAR,
80         'nonchar', 0xFDD0,
81     ],
82     [ "a mid non-character code point of the 32 consecutive ones",
83         (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
84         $::UTF8_DISALLOW_NONCHAR,
85         'nonchar', 0xFDE0,
86     ],
87     [ "final of 32 consecutive non-character code points",
88         (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
89         $::UTF8_DISALLOW_NONCHAR,
90         'nonchar', 0xFDEF,
91     ],
92     [ "non-character code point U+FFFE",
93         (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
94         $::UTF8_DISALLOW_NONCHAR,
95         'nonchar', 0xFFFE,
96     ],
97     [ "non-character code point U+FFFF",
98         (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
99         $::UTF8_DISALLOW_NONCHAR,
100         'nonchar', 0xFFFF,
101     ],
102     [ "non-character code point U+1FFFE",
103         (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
104         $::UTF8_DISALLOW_NONCHAR,
105         'nonchar', 0x1FFFE,
106     ],
107     [ "non-character code point U+1FFFF",
108         (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
109         $::UTF8_DISALLOW_NONCHAR,
110         'nonchar', 0x1FFFF,
111     ],
112     [ "non-character code point U+2FFFE",
113         (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
114         $::UTF8_DISALLOW_NONCHAR,
115         'nonchar', 0x2FFFE,
116     ],
117     [ "non-character code point U+2FFFF",
118         (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
119         $::UTF8_DISALLOW_NONCHAR,
120         'nonchar', 0x2FFFF,
121     ],
122     [ "non-character code point U+3FFFE",
123         (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
124         $::UTF8_DISALLOW_NONCHAR,
125         'nonchar', 0x3FFFE,
126     ],
127     [ "non-character code point U+3FFFF",
128         (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
129         $::UTF8_DISALLOW_NONCHAR,
130         'nonchar', 0x3FFFF,
131     ],
132     [ "non-character code point U+4FFFE",
133         (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
134         $::UTF8_DISALLOW_NONCHAR,
135         'nonchar', 0x4FFFE,
136     ],
137     [ "non-character code point U+4FFFF",
138         (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
139         $::UTF8_DISALLOW_NONCHAR,
140         'nonchar', 0x4FFFF,
141     ],
142     [ "non-character code point U+5FFFE",
143         (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
144         $::UTF8_DISALLOW_NONCHAR,
145         'nonchar', 0x5FFFE,
146     ],
147     [ "non-character code point U+5FFFF",
148         (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
149         $::UTF8_DISALLOW_NONCHAR,
150         'nonchar', 0x5FFFF,
151     ],
152     [ "non-character code point U+6FFFE",
153         (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
154         $::UTF8_DISALLOW_NONCHAR,
155         'nonchar', 0x6FFFE,
156     ],
157     [ "non-character code point U+6FFFF",
158         (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
159         $::UTF8_DISALLOW_NONCHAR,
160         'nonchar', 0x6FFFF,
161     ],
162     [ "non-character code point U+7FFFE",
163         (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
164         $::UTF8_DISALLOW_NONCHAR,
165         'nonchar', 0x7FFFE,
166     ],
167     [ "non-character code point U+7FFFF",
168         (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
169         $::UTF8_DISALLOW_NONCHAR,
170         'nonchar', 0x7FFFF,
171     ],
172     [ "non-character code point U+8FFFE",
173         (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
174         $::UTF8_DISALLOW_NONCHAR,
175         'nonchar', 0x8FFFE,
176     ],
177     [ "non-character code point U+8FFFF",
178         (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
179         $::UTF8_DISALLOW_NONCHAR,
180         'nonchar', 0x8FFFF,
181     ],
182     [ "non-character code point U+9FFFE",
183         (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
184         $::UTF8_DISALLOW_NONCHAR,
185         'nonchar', 0x9FFFE,
186     ],
187     [ "non-character code point U+9FFFF",
188         (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
189         $::UTF8_DISALLOW_NONCHAR,
190         'nonchar', 0x9FFFF,
191     ],
192     [ "non-character code point U+AFFFE",
193         (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
194         $::UTF8_DISALLOW_NONCHAR,
195         'nonchar', 0xAFFFE,
196     ],
197     [ "non-character code point U+AFFFF",
198         (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
199         $::UTF8_DISALLOW_NONCHAR,
200         'nonchar', 0xAFFFF,
201     ],
202     [ "non-character code point U+BFFFE",
203         (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
204         $::UTF8_DISALLOW_NONCHAR,
205         'nonchar', 0xBFFFE,
206     ],
207     [ "non-character code point U+BFFFF",
208         (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
209         $::UTF8_DISALLOW_NONCHAR,
210         'nonchar', 0xBFFFF,
211     ],
212     [ "non-character code point U+CFFFE",
213         (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
214         $::UTF8_DISALLOW_NONCHAR,
215         'nonchar', 0xCFFFE,
216     ],
217     [ "non-character code point U+CFFFF",
218         (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
219         $::UTF8_DISALLOW_NONCHAR,
220         'nonchar', 0xCFFFF,
221     ],
222     [ "non-character code point U+DFFFE",
223         (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
224         $::UTF8_DISALLOW_NONCHAR,
225         'nonchar', 0xDFFFE,
226     ],
227     [ "non-character code point U+DFFFF",
228         (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
229         $::UTF8_DISALLOW_NONCHAR,
230         'nonchar', 0xDFFFF,
231     ],
232     [ "non-character code point U+EFFFE",
233         (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
234         $::UTF8_DISALLOW_NONCHAR,
235         'nonchar', 0xEFFFE,
236     ],
237     [ "non-character code point U+EFFFF",
238         (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
239         $::UTF8_DISALLOW_NONCHAR,
240         'nonchar', 0xEFFFF,
241     ],
242     [ "non-character code point U+FFFFE",
243         (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
244         $::UTF8_DISALLOW_NONCHAR,
245         'nonchar', 0xFFFFE,
246     ],
247     [ "non-character code point U+FFFFF",
248         (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
249         $::UTF8_DISALLOW_NONCHAR,
250         'nonchar', 0xFFFFF,
251     ],
252     [ "non-character code point U+10FFFE",
253         (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
254         $::UTF8_DISALLOW_NONCHAR,
255         'nonchar', 0x10FFFE,
256     ],
257     [ "non-character code point U+10FFFF",
258         (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
259         $::UTF8_DISALLOW_NONCHAR,
260         'nonchar', 0x10FFFF,
261     ],
262     [ "requires at least 32 bits",
263         (isASCII)
264          ?  "\xfe\x82\x80\x80\x80\x80\x80"
265          : I8_to_native(
266             "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
267         # This code point is chosen so that it is representable in a UV on
268         # 32-bit machines
269         $::UTF8_DISALLOW_ABOVE_31_BIT,
270         'utf8', 0x80000000,
271         (isASCII) ? 1 : 8,
272     ],
273     [ "highest 32 bit code point",
274         (isASCII)
275          ?  "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
276          : I8_to_native(
277             "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
278         $::UTF8_DISALLOW_ABOVE_31_BIT,
279         'utf8', 0xFFFFFFFF,
280         (isASCII) ? 1 : 8,
281     ],
282     [ "requires at least 32 bits, and use SUPER-type flags, instead of"
283     . " ABOVE_31_BIT",
284         (isASCII)
285          ? "\xfe\x82\x80\x80\x80\x80\x80"
286          : I8_to_native(
287            "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
288         $::UTF8_DISALLOW_SUPER,
289         'utf8', 0x80000000,
290         1,
291     ],
292     [ "overflow with warnings/disallow for more than 31 bits",
293         # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT
294         # with overflow.  The overflow malformation is never allowed, so
295         # preventing it takes precedence if the ABOVE_31_BIT options would
296         # otherwise allow in an overflowing value.  The ASCII code points (1
297         # for 32-bits; 1 for 64) were chosen because the old overflow
298         # detection algorithm did not catch them; this means this test also
299         # checks for that fix.  The EBCDIC are arbitrary overflowing ones
300         # since we have no reports of failures with it.
301        (($::is64bit)
302         ? ((isASCII)
303            ?    "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
304            : I8_to_native(
305                 "\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"))
306         : ((isASCII)
307            ?    "\xfe\x86\x80\x80\x80\x80\x80"
308            : I8_to_native(
309                 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))),
310         $::UTF8_DISALLOW_ABOVE_31_BIT,
311         'utf8', -1,
312         (isASCII || $::is64bit) ? 2 : 8,
313     ],
314 );
315
316 if (! $::is64bit) {
317     if (isASCII) {
318         no warnings qw{portable overflow};
319         push @tests,
320             [ "Lowest 33 bit code point: overflow",
321                 "\xFE\x84\x80\x80\x80\x80\x80",
322                 $::UTF8_DISALLOW_ABOVE_31_BIT,
323                 'utf8', -1,
324                 1,
325             ];
326     }
327 }
328 else {
329     no warnings qw{portable overflow};
330     push @tests,
331         [ "More than 32 bits",
332             (isASCII)
333             ?       "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
334             : I8_to_native(
335                     "\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
336             $::UTF8_DISALLOW_ABOVE_31_BIT,
337             'utf8', 0x1000000000,
338             (isASCII) ? 1 : 7,
339         ];
340     if (! isASCII) {
341         push @tests,   # These could falsely show wrongly in a naive
342                        # implementation
343             [ "requires at least 32 bits",
344                 I8_to_native(
345                     "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
346                 $::UTF8_DISALLOW_ABOVE_31_BIT,
347                 'utf8', 0x800000000,
348                 7,
349             ],
350             [ "requires at least 32 bits",
351                 I8_to_native(
352                     "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
353                 $::UTF8_DISALLOW_ABOVE_31_BIT,
354                 'utf8', 0x10000000000,
355                 6,
356             ],
357             [ "requires at least 32 bits",
358                 I8_to_native(
359                     "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
360                 $::UTF8_DISALLOW_ABOVE_31_BIT,
361                 'utf8', 0x200000000000,
362                 5,
363             ],
364             [ "requires at least 32 bits",
365                 I8_to_native(
366                     "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
367                 $::UTF8_DISALLOW_ABOVE_31_BIT,
368                 'utf8', 0x4000000000000,
369                 4,
370             ],
371             [ "requires at least 32 bits",
372                 I8_to_native(
373                     "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
374                 $::UTF8_DISALLOW_ABOVE_31_BIT,
375                 'utf8', 0x80000000000000,
376                 3,
377             ],
378             [ "requires at least 32 bits",
379                 I8_to_native(
380                     "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
381                 $::UTF8_DISALLOW_ABOVE_31_BIT,
382                 'utf8', 0x1000000000000000,
383                 2,
384             ];
385     }
386 }
387
388 sub flags_to_text($$)
389 {
390     my ($flags, $flags_to_text_ref) = @_;
391
392     # Returns a string containing a mnemonic representation of the bits that
393     # are set in the $flags.  These are assumed to be flag bits.  The return
394     # looks like "FOO|BAR|BAZ".  The second parameter is a reference to an
395     # array that gives the textual representation of all the possible flags.
396     # Element 0 is the text for the bit 0 flag; element 1 for bit 1; ....  If
397     # no bits at all are set the string "0" is returned;
398
399     my @flag_text;
400     my $shift = 0;
401
402     return "0" if $flags == 0;
403
404     while ($flags) {
405         #diag sprintf "%x", $flags;
406         if ($flags & 1) {
407             push @flag_text, $flags_to_text_ref->[$shift];
408         }
409         $shift++;
410         $flags >>= 1;
411     }
412
413     return join "|", @flag_text;
414 }
415
416 # Possible flag returns from utf8n_to_uvchr_error().  These should have G_,
417 # instead of A_, D_, but the prefixes will be used in a a later commit, so
418 # minimize churn by having them here.
419 my @utf8n_flags_to_text =  ( qw(
420         A_EMPTY
421         A_CONTINUATION
422         A_NON_CONTINUATION
423         A_SHORT
424         A_LONG
425         A_LONG_AND_ITS_VALUE
426         PLACEHOLDER
427         A_OVERFLOW
428         D_SURROGATE
429         W_SURROGATE
430         D_NONCHAR
431         W_NONCHAR
432         D_SUPER
433         W_SUPER
434         D_ABOVE_31_BIT
435         W_ABOVE_31_BIT
436         CHECK_ONLY
437         NO_CONFIDENCE_IN_CURLEN_
438     ) );
439
440 sub utf8n_display_call($)
441 {
442     # Converts an eval string that calls test_utf8n_to_uvchr into a more human
443     # readable form, and returns it.  Doesn't work if the byte string contains
444     # an apostrophe.  The return will look something like:
445     #   test_utf8n_to_uvchr_error('$bytes', $length, $flags)
446     #diag $_[0];
447
448     $_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x;
449     my $text1 = $1;     # Everything before the byte string
450     my $bytes = $2;
451     my $text2 = $3;     # Includes the length
452     my $flags = $4;
453
454     return $text1
455          . display_bytes($bytes)
456          . $text2
457          . flags_to_text($flags, \@utf8n_flags_to_text)
458          . ')';
459 }
460
461 sub uvchr_display_call($)
462 {
463     # Converts an eval string that calls test_uvchr_to_utf8 into a more human
464     # readable form, and returns it.  The return will look something like:
465     #   test_uvchr_to_utf8n_flags($uv, $flags)
466     #diag $_[0];
467
468     my @flags_to_text =  ( qw(
469             W_SURROGATE
470             W_NONCHAR
471             W_SUPER
472             W_ABOVE_31_BIT
473             D_SURROGATE
474             D_NONCHAR
475             D_SUPER
476             D_ABOVE_31_BIT
477        ) );
478
479     $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x;
480     my $text = $1;
481     my $cp = sprintf "%X", $2;
482     my $flags = $3;
483
484     return "${text}0x$cp, " . flags_to_text($flags, \@flags_to_text) . ')';
485 }
486
487 # This test is split into this number of files.
488 my $num_test_files = $ENV{TEST_JOBS} || 1;
489 $num_test_files = 10 if $num_test_files > 10;
490
491 my $test_count = -1;
492 foreach my $test (@tests) {
493     $test_count++;
494     next if $test_count % $num_test_files != $::TEST_CHUNK;
495
496     my ($testname, $bytes, $disallow_flags,
497         $controlling_warning_category, $allowed_uv, $needed_to_discern_len
498        ) = @$test;
499
500     my $length = length $bytes;
501     my $will_overflow = $allowed_uv < 0;
502
503     # The convention is that the got flag is the same value as the disallow
504     # one, and the warn flag is the next bit over.  If this were violated, the
505     # tests here should start failing.  We could do an eval under no strict to
506     # be sure.
507     my $expected_error_flags = $disallow_flags;
508     my $warn_flags = $disallow_flags << 1;
509
510     my $message;
511     if ($allowed_uv > 0x7FFFFFFF) {
512         $message = nonportable_regex($allowed_uv);
513     }
514     elsif ($allowed_uv > 0x10FFFF) {
515         $message = qr/(not Unicode|for a non-Unicode code point).* may not be portable/;
516     }
517     elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
518         $message = qr/surrogate/;
519         $needed_to_discern_len = 2 unless defined $needed_to_discern_len;
520     }
521     elsif (   ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
522            || ($allowed_uv & 0xFFFE) == 0xFFFE)
523     {
524         $message = qr/Unicode non-character.*is not recommended for open interchange/;
525         $needed_to_discern_len = $length unless defined $needed_to_discern_len;
526     }
527     elsif ($will_overflow) {
528         $message = qr/overflows/;
529     }
530     else {
531         die "Can't figure out what type of warning to test for $testname"
532     }
533
534     die 'Didn\'t set $needed_to_discern_len for ' . $testname
535                                         unless defined $needed_to_discern_len;
536
537     {
538         use warnings;
539         undef @warnings_gotten;
540         my $ret = test_isUTF8_CHAR($bytes, $length);
541         my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
542         if ($will_overflow) {
543             is($ret, 0, "isUTF8_CHAR() $testname: returns 0");
544             is($ret_flags, 0, "isUTF8_CHAR_flags() $testname: returns 0");
545         }
546         else {
547             is($ret, $length,
548                "isUTF8_CHAR() $testname: returns expected length: $length");
549             is($ret_flags, $length, "isUTF8_CHAR_flags(...,0) $testname:"
550                                   . " returns expected length: $length");
551         }
552         is(scalar @warnings_gotten, 0,
553                 "isUTF8_CHAR() and isUTF8_CHAR()_flags $testname: generated"
554               . " no warnings")
555           or output_warnings(@warnings_gotten);
556
557         undef @warnings_gotten;
558         $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
559         if ($will_overflow) {
560             is($ret, 0, "isSTRICT_UTF8_CHAR() $testname: returns 0");
561         }
562         else {
563             my $expected_ret = (   $testname =~ /surrogate|non-character/
564                                 || $allowed_uv > 0x10FFFF)
565                                ? 0
566                                : $length;
567             is($ret, $expected_ret, "isSTRICT_UTF8_CHAR() $testname: returns"
568                                   . " expected length: $expected_ret");
569             $ret = test_isUTF8_CHAR_flags($bytes, $length,
570                                           $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
571             is($ret, $expected_ret,
572                             "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')"
573                           . " acts like isSTRICT_UTF8_CHAR");
574         }
575         is(scalar @warnings_gotten, 0,
576                 "isSTRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:"
577               . " generated no warnings")
578           or output_warnings(@warnings_gotten);
579
580         undef @warnings_gotten;
581         $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
582         if ($will_overflow) {
583             is($ret, 0, "isC9_STRICT_UTF8_CHAR() $testname: returns 0");
584         }
585         else {
586             my $expected_ret = (   $testname =~ /surrogate/
587                                 || $allowed_uv > 0x10FFFF)
588                                ? 0
589                                : $length;
590             is($ret, $expected_ret, "isC9_STRICT_UTF8_CHAR() $testname:"
591                                    ." returns expected length: $expected_ret");
592             $ret = test_isUTF8_CHAR_flags($bytes, $length,
593                                           $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
594             is($ret, $expected_ret,
595                           "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')"
596                         . " acts like isC9_STRICT_UTF8_CHAR");
597         }
598         is(scalar @warnings_gotten, 0,
599                 "isC9_STRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:"
600               . " generated no warnings")
601           or output_warnings(@warnings_gotten);
602
603         # Test partial character handling, for each byte not a full character
604         for my $j (1.. $length - 1) {
605
606             # Skip the test for the interaction between overflow and above-31
607             # bit.  It is really testing other things than the partial
608             # character tests, for which other tests in this file are
609             # sufficient
610             last if $will_overflow;
611
612             foreach my $disallow_flag (0, $disallow_flags) {
613                 my $partial = substr($bytes, 0, $j);
614                 my $ret_should_be;
615                 my $comment;
616                 if ($disallow_flag) {
617                     $ret_should_be = 0;
618                     $comment = "disallowed";
619                     if ($j < $needed_to_discern_len) {
620                         $ret_should_be = 1;
621                         $comment .= ", but need $needed_to_discern_len bytes"
622                                  .  " to discern:";
623                     }
624                 }
625                 else {
626                     $ret_should_be = 1;
627                     $comment = "allowed";
628                 }
629
630                 undef @warnings_gotten;
631
632                 $ret = test_is_utf8_valid_partial_char_flags($partial, $j,
633                                                              $disallow_flag);
634                 is($ret, $ret_should_be,
635                                 "$testname: is_utf8_valid_partial_char_flags("
636                                         . display_bytes($partial)
637                                         . "), $comment: returns $ret_should_be");
638                 is(scalar @warnings_gotten, 0,
639                         "$testname: is_utf8_valid_partial_char_flags()"
640                       . " generated no warnings")
641                   or output_warnings(@warnings_gotten);
642             }
643         }
644     }
645
646     # This is more complicated than the malformations tested earlier, as there
647     # are several orthogonal variables involved.  We test all the subclasses
648     # of utf8 warnings to verify they work with and without the utf8 class,
649     # and don't have effects on other sublass warnings
650     foreach my $trial_warning_category ('utf8', 'surrogate', 'nonchar', 'non_unicode') {
651       foreach my $warn_flag (0, $warn_flags) {
652         foreach my $disallow_flag (0, $disallow_flags) {
653           foreach my $do_warning (0, 1) {
654
655             # We try each of the above with various combinations of
656             # malformations that can occur on the same input sequence.
657             foreach my $short ("", "short") {
658               foreach my $unexpected_noncont ("",
659                                               "unexpected non-continuation")
660               {
661                 foreach my $overlong ("", "overlong") {
662
663                     # If we're creating an overlong, it can't be longer than
664                     # the maximum length, so skip if we're already at that
665                     # length.
666                     next if $overlong && $length >= $::max_bytes;
667
668                     my @malformations;
669                     my @expected_return_flags;
670                     push @malformations, $short if $short;
671                     push @malformations, $unexpected_noncont
672                                                       if $unexpected_noncont;
673                     push @malformations, $overlong if $overlong;
674
675                     # The overflow malformation test in the input
676                     # array is coerced into being treated like one of
677                     # the others.
678                     if ($will_overflow) {
679                         push @malformations, 'overflow';
680                         push @expected_return_flags, $::UTF8_GOT_OVERFLOW;
681                     }
682
683                     my $malformations_name = join "/", @malformations;
684                     $malformations_name .= " malformation"
685                                                 if $malformations_name;
686                     $malformations_name .= "s" if @malformations > 1;
687                     my $this_bytes = $bytes;
688                     my $this_length = $length;
689                     my $expected_uv = $allowed_uv;
690                     my $this_expected_len = $length;
691                     my $this_needed_to_discern_len = $needed_to_discern_len;
692                     if ($malformations_name) {
693                         $expected_uv = 0;
694
695                         # Coerce the input into the desired
696                         # malformation
697                         if ($malformations_name =~ /overlong/) {
698
699                             # For an overlong, we convert the original
700                             # start byte into a continuation byte with
701                             # the same data bits as originally. ...
702                             substr($this_bytes, 0, 1)
703                                 = start_byte_to_cont(substr($this_bytes,
704                                                             0, 1));
705
706                             # ... Then we prepend it with a known
707                             # overlong sequence.  This should evaluate
708                             # to the exact same code point as the
709                             # original.
710                             $this_bytes
711                             = I8_to_native("\xff")
712                             . (I8_to_native(chr $::lowest_continuation)
713                             x ( $::max_bytes - 1 - length($this_bytes)))
714                             . $this_bytes;
715                             $this_length = length($this_bytes);
716                             $this_needed_to_discern_len
717                                  = $::max_bytes - ($this_expected_len
718                                                - $this_needed_to_discern_len);
719                             $this_expected_len = $::max_bytes;
720                             push @expected_return_flags, $::UTF8_GOT_LONG;
721                         }
722                         if ($malformations_name =~ /short/) {
723
724                             # Just tell the test to not look far
725                             # enough into the input.
726                             $this_length--;
727                             $this_expected_len--;
728                             push @expected_return_flags, $::UTF8_GOT_SHORT;
729                         }
730                         if ($malformations_name
731                                                 =~ /non-continuation/)
732                         {
733                             # Change the final continuation byte into
734                             # a non one.
735                             my $pos = ($short) ? -2 : -1;
736                             substr($this_bytes, $pos, 1) = '?';
737                             $this_expected_len--;
738                             push @expected_return_flags,
739                                             $::UTF8_GOT_NON_CONTINUATION;
740                         }
741                     }
742
743                     my $eval_warn = $do_warning
744                                 ? "use warnings '$trial_warning_category'"
745                                 : $trial_warning_category eq "utf8"
746                                     ? "no warnings 'utf8'"
747                                     : ( "use warnings 'utf8';"
748                                     . " no warnings '$trial_warning_category'");
749
750                     # Is effectively disallowed if we've set up a
751                     # malformation, even if the flag indicates it is
752                     # allowed.  Fix up test name to indicate this as
753                     # well
754                     my $disallowed = $disallow_flag
755                                 || $malformations_name;
756                     my $this_name = "utf8n_to_uvchr_error() $testname: "
757                                                 . (($disallow_flag)
758                                                 ? 'disallowed'
759                                                 : $disallowed
760                                                     ? $disallowed
761                                                     : 'allowed');
762                     $this_name .= ", $eval_warn";
763                     $this_name .= ", " . (($warn_flag)
764                                         ? 'with warning flag'
765                                         : 'no warning flag');
766
767                     undef @warnings_gotten;
768                     my $ret_ref;
769                     my $this_flags = $warn_flag | $disallow_flag;
770                     my $eval_text =      "$eval_warn; \$ret_ref"
771                             . " = test_utf8n_to_uvchr_error("
772                             . "'$this_bytes',"
773                             . " $this_length, $this_flags)";
774                     eval "$eval_text";
775                     if (! ok ("$@ eq ''",
776                         "$this_name: eval succeeded"))
777                     {
778                         diag "\$@='$@'; call was: "
779                            . utf8n_display_call($eval_text);
780                         next;
781                     }
782                     if ($disallowed) {
783                         is($ret_ref->[0], 0, "$this_name: Returns 0")
784                           or diag "Call was: " . utf8n_display_call($eval_text);
785                     }
786                     else {
787                         is($ret_ref->[0], $expected_uv,
788                                 "$this_name: Returns expected uv: "
789                                 . sprintf("0x%04X", $expected_uv))
790                           or diag "Call was: " . utf8n_display_call($eval_text);
791                     }
792                     is($ret_ref->[1], $this_expected_len,
793                                         "$this_name: Returns expected length:"
794                                       . " $this_expected_len")
795                       or diag "Call was: " . utf8n_display_call($eval_text);
796
797                     my $returned_flags = $ret_ref->[2];
798
799                     for (my $i = @expected_return_flags - 1; $i >= 0; $i--) {
800                         if (ok($expected_return_flags[$i] & $returned_flags,
801                             "Expected and got error bit return"
802                             . " for $malformations[$i] malformation"))
803                         {
804                             $returned_flags &= ~$expected_return_flags[$i];
805                         }
806                         splice @expected_return_flags, $i, 1;
807                     }
808                     is(scalar @expected_return_flags, 0,
809                             "Got all the expected malformation errors")
810                       or diag Dumper \@expected_return_flags;
811
812                     if (   $this_expected_len >= $this_needed_to_discern_len
813                         && ($warn_flag || $disallow_flag))
814                     {
815                         is($returned_flags, $expected_error_flags,
816                                 "Got the correct error flag")
817                           or diag "Call was: " . utf8n_display_call($eval_text);
818                     }
819                     else {
820                         is($returned_flags, 0, "Got no other error flag")
821                         or
822
823                         # We strip off any prefixes from the flag names
824                         diag "The unexpected flags were: "
825                            . (flags_to_text($returned_flags,
826                                             \@utf8n_flags_to_text)
827                              =~ s/ \b [A-Z] _ //xgr);
828                     }
829
830                     if (@malformations) {
831                         if (! $do_warning && $trial_warning_category eq 'utf8') {
832                             goto no_warnings_expected;
833                         }
834
835                         # Check that each malformation generates a
836                         # warning, removing that warning if found
837                     MALFORMATION:
838                         foreach my $malformation (@malformations) {
839                             foreach (my $i = 0; $i < @warnings_gotten; $i++) {
840                                 if ($warnings_gotten[$i] =~ /$malformation/) {
841                                     pass("Expected and got"
842                                     . "'$malformation' warning");
843                                     splice @warnings_gotten, $i, 1;
844                                     next MALFORMATION;
845                                 }
846                             }
847                             fail("Expected '$malformation' warning"
848                             . " but didn't get it");
849
850                         }
851                     }
852
853                     # Any overflow will override any super or above-31
854                     # warnings.
855                     goto no_warnings_expected
856                                 if $will_overflow || $this_expected_len
857                                         < $this_needed_to_discern_len;
858
859                     if (    ! $do_warning
860                         && (   $trial_warning_category eq 'utf8'
861                             || $trial_warning_category eq $controlling_warning_category))
862                     {
863                         goto no_warnings_expected;
864                     }
865                     elsif ($warn_flag) {
866                         if (is(scalar @warnings_gotten, 1,
867                             "$this_name: Got a single warning "))
868                         {
869                             like($warnings_gotten[0], $message,
870                                     "$this_name: Got expected warning")
871                                 or diag "Call was: "
872                                  . utf8n_display_call($eval_text);
873                         }
874                         else {
875                             diag "Call was: " . utf8n_display_call($eval_text);
876                             if (scalar @warnings_gotten) {
877                                 output_warnings(@warnings_gotten);
878                             }
879                         }
880                     }
881                     else {
882
883                     no_warnings_expected:
884                         unless (is(scalar @warnings_gotten, 0,
885                                 "$this_name: Got no warnings"))
886                         {
887                             diag "Call was: " . utf8n_display_call($eval_text);
888                             output_warnings(@warnings_gotten);
889                         }
890                     }
891
892                     # Check CHECK_ONLY results when the input is
893                     # disallowed.  Do this when actually disallowed,
894                     # not just when the $disallow_flag is set
895                     if ($disallowed) {
896                         undef @warnings_gotten;
897                         $this_flags = $disallow_flag|$::UTF8_CHECK_ONLY;
898                         $eval_text = "\$ret_ref = test_utf8n_to_uvchr_error("
899                                    . "'$this_bytes', $this_length, $this_flags)";
900                         eval "$eval_text";
901                         if (! ok ("$@ eq ''",
902                             "    And eval succeeded with CHECK_ONLY"))
903                         {
904                             diag "\$@='$@'; Call was: "
905                                . utf8n_display_call($eval_text);
906                             next;
907                         }
908                         is($ret_ref->[0], 0,
909                                         "$this_name, CHECK_ONLY: Returns 0")
910                           or diag "Call was: " . utf8n_display_call($eval_text);
911                         is($ret_ref->[1], -1,
912                             "$this_name: CHECK_ONLY: returns -1 for length")
913                           or diag "Call was: " . utf8n_display_call($eval_text);
914                         if (! is(scalar @warnings_gotten, 0,
915                             "$this_name, CHECK_ONLY: no warnings"
916                         . " generated"))
917                         {
918                             diag "Call was: " . utf8n_display_call($eval_text);
919                             output_warnings(@warnings_gotten);
920                         }
921                     }
922
923                     # Now repeat some of the above, but for
924                     # uvchr_to_utf8_flags().  Since this comes from an
925                     # existing code point, it hasn't overflowed, and
926                     # isn't malformed.
927                     next if @malformations;
928
929                     # The warning and disallow flags passed in are for
930                     # utf8n_to_uvchr_error().  Convert them for
931                     # uvchr_to_utf8_flags().
932                     my $uvchr_warn_flag = 0;
933                     my $uvchr_disallow_flag = 0;
934                     if ($warn_flag) {
935                         if ($warn_flag == $::UTF8_WARN_SURROGATE) {
936                             $uvchr_warn_flag = $::UNICODE_WARN_SURROGATE
937                         }
938                         elsif ($warn_flag == $::UTF8_WARN_NONCHAR) {
939                             $uvchr_warn_flag = $::UNICODE_WARN_NONCHAR
940                         }
941                         elsif ($warn_flag == $::UTF8_WARN_SUPER) {
942                             $uvchr_warn_flag = $::UNICODE_WARN_SUPER
943                         }
944                         elsif ($warn_flag == $::UTF8_WARN_ABOVE_31_BIT) {
945                             $uvchr_warn_flag
946                                         = $::UNICODE_WARN_ABOVE_31_BIT;
947                         }
948                         else {
949                             fail(sprintf "Unexpected warn flag: %x",
950                                 $warn_flag);
951                             next;
952                         }
953                     }
954                     if ($disallow_flag) {
955                         if ($disallow_flag == $::UTF8_DISALLOW_SURROGATE)
956                         {
957                             $uvchr_disallow_flag
958                                         = $::UNICODE_DISALLOW_SURROGATE;
959                         }
960                         elsif ($disallow_flag == $::UTF8_DISALLOW_NONCHAR)
961                         {
962                             $uvchr_disallow_flag
963                                         = $::UNICODE_DISALLOW_NONCHAR;
964                         }
965                         elsif ($disallow_flag == $::UTF8_DISALLOW_SUPER) {
966                             $uvchr_disallow_flag
967                                         = $::UNICODE_DISALLOW_SUPER;
968                         }
969                         elsif ($disallow_flag
970                                         == $::UTF8_DISALLOW_ABOVE_31_BIT)
971                         {
972                             $uvchr_disallow_flag =
973                                         $::UNICODE_DISALLOW_ABOVE_31_BIT;
974                         }
975                         else {
976                             fail(sprintf "Unexpected disallow flag: %x",
977                                 $disallow_flag);
978                             next;
979                         }
980                     }
981
982                     $disallowed = $uvchr_disallow_flag;
983
984                     $this_name = "uvchr_to_utf8_flags() $testname: "
985                                             . (($uvchr_disallow_flag)
986                                                 ? 'disallowed'
987                                                 : ($disallowed)
988                                                 ? 'ABOVE_31_BIT allowed'
989                                                 : 'allowed');
990                     $this_name .= ", $eval_warn";
991                     $this_name .= ", " . (($uvchr_warn_flag)
992                                         ? 'with warning flag'
993                                         : 'no warning flag');
994
995                     undef @warnings_gotten;
996                     my $ret;
997                     $this_flags = $uvchr_warn_flag | $uvchr_disallow_flag;
998                     $eval_text = "$eval_warn; \$ret ="
999                             . " test_uvchr_to_utf8_flags("
1000                             . "$allowed_uv, $this_flags)";
1001                     eval "$eval_text";
1002                     if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
1003                     {
1004                         diag "\$@='$@'; call was: "
1005                            . uvchr_display_call($eval_text);
1006                         next;
1007                     }
1008                     if ($disallowed) {
1009                         is($ret, undef, "$this_name: Returns undef")
1010                           or diag "Call was: " . uvchr_display_call($eval_text);
1011                     }
1012                     else {
1013                         is($ret, $this_bytes, "$this_name: Returns expected string")
1014                           or diag "Call was: " . uvchr_display_call($eval_text);
1015                     }
1016                     if (! $do_warning
1017                         && ($trial_warning_category eq 'utf8' || $trial_warning_category eq $controlling_warning_category))
1018                     {
1019                         if (!is(scalar @warnings_gotten, 0,
1020                                 "$this_name: No warnings generated"))
1021                         {
1022                             diag "Call was: " . uvchr_display_call($eval_text);
1023                             output_warnings(@warnings_gotten);
1024                         }
1025                     }
1026                     elsif (       $uvchr_warn_flag
1027                         && (   $trial_warning_category eq 'utf8'
1028                             || $trial_warning_category eq $controlling_warning_category))
1029                     {
1030                         if (is(scalar @warnings_gotten, 1,
1031                             "$this_name: Got a single warning "))
1032                         {
1033                             like($warnings_gotten[0], $message,
1034                                     "$this_name: Got expected warning")
1035                                 or diag "Call was: "
1036                                       . uvchr_display_call($eval_text);
1037                         }
1038                         else {
1039                             diag "Call was: " . uvchr_display_call($eval_text);
1040                             output_warnings(@warnings_gotten)
1041                                                 if scalar @warnings_gotten;
1042                         }
1043                     }
1044                 }
1045               }
1046             }
1047           }
1048         }
1049       }
1050     }
1051 }
1052
1053 done_testing;