This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
ebca6871f7ad475be9059cc22ce6836718964ca8
[perl5.git] / ext / XS-APItest / t / utf8_warn_base.pl
1 #!perl -w
2
3 # This is a base file to be used by various .t's in its directory
4
5 use strict;
6 use Test::More;
7
8 BEGIN {
9     use_ok('XS::APItest');
10     require 'charset_tools.pl';
11     require './t/utf8_setup.pl';
12 };
13
14 $|=1;
15
16 no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
17                           # machines, and that is tested elsewhere
18
19 use XS::APItest;
20 use Data::Dumper;
21
22 my @warnings_gotten;
23
24 use warnings 'utf8';
25 local $SIG{__WARN__} = sub { push @warnings_gotten, @_ };
26
27 sub nonportable_regex ($) {
28
29     # Returns a pattern that matches the non-portable message raised either
30     # for the specific input code point, or the one generated when there
31     # is some malformation that precludes the message containing the specific
32     # code point
33
34     my $code_point = shift;
35
36     my $string = sprintf '(Code point 0x%X is not Unicode, and'
37                        . '|Any UTF-8 sequence that starts with'
38                        . ' "(\\\x[[:xdigit:]]{2})+" is for a'
39                        . ' non-Unicode code point, and is) not portable',
40                     $code_point;
41     return qr/$string/;
42 }
43
44 # Now test the cases where a legal code point is generated, but may or may not
45 # be allowed/warned on.
46 my @tests = (
47      # ($testname, $bytes, $disallow_flags, $controlling_warning_category,
48      #  $allowed_uv, $needed_to_discern_len )
49     [ "lowest surrogate",
50         (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
51         $::UTF8_DISALLOW_SURROGATE,
52         'surrogate', 0xD800,
53     ],
54     [ "a middle surrogate",
55         (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
56         $::UTF8_DISALLOW_SURROGATE,
57         'surrogate', 0xD90D,
58     ],
59     [ "highest surrogate",
60         (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
61         $::UTF8_DISALLOW_SURROGATE,
62         'surrogate', 0xDFFF,
63     ],
64     [ "first non_unicode",
65         (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
66         $::UTF8_DISALLOW_SUPER,
67         'non_unicode', 0x110000,
68         2,
69     ],
70     [ "non_unicode whose first byte tells that",
71         (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
72         $::UTF8_DISALLOW_SUPER,
73         'non_unicode',
74         (isASCII) ? 0x140000 : 0x200000,
75         1,
76     ],
77     [ "first of 32 consecutive non-character code points",
78         (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
79         $::UTF8_DISALLOW_NONCHAR,
80         'nonchar', 0xFDD0,
81     ],
82     [ "a mid non-character code point of the 32 consecutive ones",
83         (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
84         $::UTF8_DISALLOW_NONCHAR,
85         'nonchar', 0xFDE0,
86     ],
87     [ "final of 32 consecutive non-character code points",
88         (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
89         $::UTF8_DISALLOW_NONCHAR,
90         'nonchar', 0xFDEF,
91     ],
92     [ "non-character code point U+FFFE",
93         (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
94         $::UTF8_DISALLOW_NONCHAR,
95         'nonchar', 0xFFFE,
96     ],
97     [ "non-character code point U+FFFF",
98         (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
99         $::UTF8_DISALLOW_NONCHAR,
100         'nonchar', 0xFFFF,
101     ],
102     [ "non-character code point U+1FFFE",
103         (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
104         $::UTF8_DISALLOW_NONCHAR,
105         'nonchar', 0x1FFFE,
106     ],
107     [ "non-character code point U+1FFFF",
108         (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
109         $::UTF8_DISALLOW_NONCHAR,
110         'nonchar', 0x1FFFF,
111     ],
112     [ "non-character code point U+2FFFE",
113         (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
114         $::UTF8_DISALLOW_NONCHAR,
115         'nonchar', 0x2FFFE,
116     ],
117     [ "non-character code point U+2FFFF",
118         (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
119         $::UTF8_DISALLOW_NONCHAR,
120         'nonchar', 0x2FFFF,
121     ],
122     [ "non-character code point U+3FFFE",
123         (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
124         $::UTF8_DISALLOW_NONCHAR,
125         'nonchar', 0x3FFFE,
126     ],
127     [ "non-character code point U+3FFFF",
128         (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
129         $::UTF8_DISALLOW_NONCHAR,
130         'nonchar', 0x3FFFF,
131     ],
132     [ "non-character code point U+4FFFE",
133         (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
134         $::UTF8_DISALLOW_NONCHAR,
135         'nonchar', 0x4FFFE,
136     ],
137     [ "non-character code point U+4FFFF",
138         (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
139         $::UTF8_DISALLOW_NONCHAR,
140         'nonchar', 0x4FFFF,
141     ],
142     [ "non-character code point U+5FFFE",
143         (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
144         $::UTF8_DISALLOW_NONCHAR,
145         'nonchar', 0x5FFFE,
146     ],
147     [ "non-character code point U+5FFFF",
148         (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
149         $::UTF8_DISALLOW_NONCHAR,
150         'nonchar', 0x5FFFF,
151     ],
152     [ "non-character code point U+6FFFE",
153         (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
154         $::UTF8_DISALLOW_NONCHAR,
155         'nonchar', 0x6FFFE,
156     ],
157     [ "non-character code point U+6FFFF",
158         (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
159         $::UTF8_DISALLOW_NONCHAR,
160         'nonchar', 0x6FFFF,
161     ],
162     [ "non-character code point U+7FFFE",
163         (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
164         $::UTF8_DISALLOW_NONCHAR,
165         'nonchar', 0x7FFFE,
166     ],
167     [ "non-character code point U+7FFFF",
168         (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
169         $::UTF8_DISALLOW_NONCHAR,
170         'nonchar', 0x7FFFF,
171     ],
172     [ "non-character code point U+8FFFE",
173         (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
174         $::UTF8_DISALLOW_NONCHAR,
175         'nonchar', 0x8FFFE,
176     ],
177     [ "non-character code point U+8FFFF",
178         (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
179         $::UTF8_DISALLOW_NONCHAR,
180         'nonchar', 0x8FFFF,
181     ],
182     [ "non-character code point U+9FFFE",
183         (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
184         $::UTF8_DISALLOW_NONCHAR,
185         'nonchar', 0x9FFFE,
186     ],
187     [ "non-character code point U+9FFFF",
188         (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
189         $::UTF8_DISALLOW_NONCHAR,
190         'nonchar', 0x9FFFF,
191     ],
192     [ "non-character code point U+AFFFE",
193         (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
194         $::UTF8_DISALLOW_NONCHAR,
195         'nonchar', 0xAFFFE,
196     ],
197     [ "non-character code point U+AFFFF",
198         (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
199         $::UTF8_DISALLOW_NONCHAR,
200         'nonchar', 0xAFFFF,
201     ],
202     [ "non-character code point U+BFFFE",
203         (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
204         $::UTF8_DISALLOW_NONCHAR,
205         'nonchar', 0xBFFFE,
206     ],
207     [ "non-character code point U+BFFFF",
208         (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
209         $::UTF8_DISALLOW_NONCHAR,
210         'nonchar', 0xBFFFF,
211     ],
212     [ "non-character code point U+CFFFE",
213         (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
214         $::UTF8_DISALLOW_NONCHAR,
215         'nonchar', 0xCFFFE,
216     ],
217     [ "non-character code point U+CFFFF",
218         (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
219         $::UTF8_DISALLOW_NONCHAR,
220         'nonchar', 0xCFFFF,
221     ],
222     [ "non-character code point U+DFFFE",
223         (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
224         $::UTF8_DISALLOW_NONCHAR,
225         'nonchar', 0xDFFFE,
226     ],
227     [ "non-character code point U+DFFFF",
228         (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
229         $::UTF8_DISALLOW_NONCHAR,
230         'nonchar', 0xDFFFF,
231     ],
232     [ "non-character code point U+EFFFE",
233         (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
234         $::UTF8_DISALLOW_NONCHAR,
235         'nonchar', 0xEFFFE,
236     ],
237     [ "non-character code point U+EFFFF",
238         (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
239         $::UTF8_DISALLOW_NONCHAR,
240         'nonchar', 0xEFFFF,
241     ],
242     [ "non-character code point U+FFFFE",
243         (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
244         $::UTF8_DISALLOW_NONCHAR,
245         'nonchar', 0xFFFFE,
246     ],
247     [ "non-character code point U+FFFFF",
248         (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
249         $::UTF8_DISALLOW_NONCHAR,
250         'nonchar', 0xFFFFF,
251     ],
252     [ "non-character code point U+10FFFE",
253         (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
254         $::UTF8_DISALLOW_NONCHAR,
255         'nonchar', 0x10FFFE,
256     ],
257     [ "non-character code point U+10FFFF",
258         (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
259         $::UTF8_DISALLOW_NONCHAR,
260         'nonchar', 0x10FFFF,
261     ],
262     [ "requires at least 32 bits",
263         (isASCII)
264          ?  "\xfe\x82\x80\x80\x80\x80\x80"
265          : I8_to_native(
266             "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
267         # This code point is chosen so that it is representable in a UV on
268         # 32-bit machines
269         $::UTF8_DISALLOW_ABOVE_31_BIT,
270         'utf8', 0x80000000,
271         (isASCII) ? 1 : 8,
272     ],
273     [ "highest 32 bit code point",
274         (isASCII)
275          ?  "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
276          : I8_to_native(
277             "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
278         $::UTF8_DISALLOW_ABOVE_31_BIT,
279         'utf8', 0xFFFFFFFF,
280         (isASCII) ? 1 : 8,
281     ],
282     [ "requires at least 32 bits, and use SUPER-type flags, instead of"
283     . " ABOVE_31_BIT",
284         (isASCII)
285          ? "\xfe\x82\x80\x80\x80\x80\x80"
286          : I8_to_native(
287            "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
288         $::UTF8_DISALLOW_SUPER,
289         'utf8', 0x80000000,
290         1,
291     ],
292     [ "overflow with warnings/disallow for more than 31 bits",
293         # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT
294         # with overflow.  The overflow malformation is never allowed, so
295         # preventing it takes precedence if the ABOVE_31_BIT options would
296         # otherwise allow in an overflowing value.  The ASCII code points (1
297         # for 32-bits; 1 for 64) were chosen because the old overflow
298         # detection algorithm did not catch them; this means this test also
299         # checks for that fix.  The EBCDIC are arbitrary overflowing ones
300         # since we have no reports of failures with it.
301        (($::is64bit)
302         ? ((isASCII)
303            ?    "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
304            : I8_to_native(
305                 "\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"))
306         : ((isASCII)
307            ?    "\xfe\x86\x80\x80\x80\x80\x80"
308            : I8_to_native(
309                 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))),
310         $::UTF8_DISALLOW_ABOVE_31_BIT,
311         'utf8', -1,
312         (isASCII || $::is64bit) ? 2 : 8,
313     ],
314 );
315
316 if (! $::is64bit) {
317     if (isASCII) {
318         no warnings qw{portable overflow};
319         push @tests,
320             [ "Lowest 33 bit code point: overflow",
321                 "\xFE\x84\x80\x80\x80\x80\x80",
322                 $::UTF8_DISALLOW_ABOVE_31_BIT,
323                 'utf8', -1,
324                 1,
325             ];
326     }
327 }
328 else {
329     no warnings qw{portable overflow};
330     push @tests,
331         [ "More than 32 bits",
332             (isASCII)
333             ?       "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
334             : I8_to_native(
335                     "\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
336             $::UTF8_DISALLOW_ABOVE_31_BIT,
337             'utf8', 0x1000000000,
338             (isASCII) ? 1 : 7,
339         ];
340     if (! isASCII) {
341         push @tests,   # These could falsely show wrongly in a naive
342                        # implementation
343             [ "requires at least 32 bits",
344                 I8_to_native(
345                     "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
346                 $::UTF8_DISALLOW_ABOVE_31_BIT,
347                 'utf8', 0x800000000,
348                 7,
349             ],
350             [ "requires at least 32 bits",
351                 I8_to_native(
352                     "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
353                 $::UTF8_DISALLOW_ABOVE_31_BIT,
354                 'utf8', 0x10000000000,
355                 6,
356             ],
357             [ "requires at least 32 bits",
358                 I8_to_native(
359                     "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
360                 $::UTF8_DISALLOW_ABOVE_31_BIT,
361                 'utf8', 0x200000000000,
362                 5,
363             ],
364             [ "requires at least 32 bits",
365                 I8_to_native(
366                     "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
367                 $::UTF8_DISALLOW_ABOVE_31_BIT,
368                 'utf8', 0x4000000000000,
369                 4,
370             ],
371             [ "requires at least 32 bits",
372                 I8_to_native(
373                     "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
374                 $::UTF8_DISALLOW_ABOVE_31_BIT,
375                 'utf8', 0x80000000000000,
376                 3,
377             ],
378             [ "requires at least 32 bits",
379                 I8_to_native(
380                     "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
381                 $::UTF8_DISALLOW_ABOVE_31_BIT,
382                 'utf8', 0x1000000000000000,
383                 2,
384             ];
385     }
386 }
387
388 sub flags_to_text($$)
389 {
390     my ($flags, $flags_to_text_ref) = @_;
391
392     # Returns a string containing a mnemonic representation of the bits that
393     # are set in the $flags.  These are assumed to be flag bits.  The return
394     # looks like "FOO|BAR|BAZ".  The second parameter is a reference to an
395     # array that gives the textual representation of all the possible flags.
396     # Element 0 is the text for the bit 0 flag; element 1 for bit 1; ....  If
397     # no bits at all are set the string "0" is returned;
398
399     my @flag_text;
400     my $shift = 0;
401
402     return "0" if $flags == 0;
403
404     while ($flags) {
405         #diag sprintf "%x", $flags;
406         if ($flags & 1) {
407             push @flag_text, $flags_to_text_ref->[$shift];
408         }
409         $shift++;
410         $flags >>= 1;
411     }
412
413     return join "|", @flag_text;
414 }
415
416 # Possible flag returns from utf8n_to_uvchr_error().  These should have G_,
417 # instead of A_, D_, but the prefixes will be used in a a later commit, so
418 # minimize churn by having them here.
419 my @utf8n_flags_to_text =  ( qw(
420         A_EMPTY
421         A_CONTINUATION
422         A_NON_CONTINUATION
423         A_SHORT
424         A_LONG
425         A_LONG_AND_ITS_VALUE
426         PLACEHOLDER
427         A_OVERFLOW
428         D_SURROGATE
429         W_SURROGATE
430         D_NONCHAR
431         W_NONCHAR
432         D_SUPER
433         W_SUPER
434         D_ABOVE_31_BIT
435         W_ABOVE_31_BIT
436         CHECK_ONLY
437         NO_CONFIDENCE_IN_CURLEN_
438     ) );
439
440 sub uvchr_display_call($)
441 {
442     # Converts an eval string that calls test_uvchr_to_utf8 into a more human
443     # readable form, and returns it.  The return will look something like:
444     #   test_uvchr_to_utf8n_flags($uv, $flags)
445     #diag $_[0];
446
447     my @flags_to_text =  ( qw(
448             W_SURROGATE
449             W_NONCHAR
450             W_SUPER
451             W_ABOVE_31_BIT
452             D_SURROGATE
453             D_NONCHAR
454             D_SUPER
455             D_ABOVE_31_BIT
456        ) );
457
458     $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x;
459     my $text = $1;
460     my $cp = sprintf "%X", $2;
461     my $flags = $3;
462
463     return "${text}0x$cp, " . flags_to_text($flags, \@flags_to_text) . ')';
464 }
465
466 # This test is split into this number of files.
467 my $num_test_files = $ENV{TEST_JOBS} || 1;
468 $num_test_files = 10 if $num_test_files > 10;
469
470 my $test_count = -1;
471 foreach my $test (@tests) {
472     $test_count++;
473     next if $test_count % $num_test_files != $::TEST_CHUNK;
474
475     my ($testname, $bytes, $disallow_flags,
476         $controlling_warning_category, $allowed_uv, $needed_to_discern_len
477        ) = @$test;
478
479     my $length = length $bytes;
480     my $will_overflow = $allowed_uv < 0;
481
482     # The convention is that the got flag is the same value as the disallow
483     # one, and the warn flag is the next bit over.  If this were violated, the
484     # tests here should start failing.  We could do an eval under no strict to
485     # be sure.
486     my $expected_error_flags = $disallow_flags;
487     my $warn_flags = $disallow_flags << 1;
488
489     my $message;
490     if ($allowed_uv > 0x7FFFFFFF) {
491         $message = nonportable_regex($allowed_uv);
492     }
493     elsif ($allowed_uv > 0x10FFFF) {
494         $message = qr/(not Unicode|for a non-Unicode code point).* may not be portable/;
495     }
496     elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
497         $message = qr/surrogate/;
498         $needed_to_discern_len = 2 unless defined $needed_to_discern_len;
499     }
500     elsif (   ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
501            || ($allowed_uv & 0xFFFE) == 0xFFFE)
502     {
503         $message = qr/Unicode non-character.*is not recommended for open interchange/;
504         $needed_to_discern_len = $length unless defined $needed_to_discern_len;
505     }
506     elsif ($will_overflow) {
507         $message = qr/overflows/;
508     }
509     else {
510         die "Can't figure out what type of warning to test for $testname"
511     }
512
513     die 'Didn\'t set $needed_to_discern_len for ' . $testname
514                                         unless defined $needed_to_discern_len;
515
516     {
517         use warnings;
518         undef @warnings_gotten;
519         my $ret = test_isUTF8_CHAR($bytes, $length);
520         my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
521         if ($will_overflow) {
522             is($ret, 0, "isUTF8_CHAR() $testname: returns 0");
523             is($ret_flags, 0, "isUTF8_CHAR_flags() $testname: returns 0");
524         }
525         else {
526             is($ret, $length,
527                "isUTF8_CHAR() $testname: returns expected length: $length");
528             is($ret_flags, $length, "isUTF8_CHAR_flags(...,0) $testname:"
529                                   . " returns expected length: $length");
530         }
531         is(scalar @warnings_gotten, 0,
532                 "isUTF8_CHAR() and isUTF8_CHAR()_flags $testname: generated"
533               . " no warnings")
534           or output_warnings(@warnings_gotten);
535
536         undef @warnings_gotten;
537         $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
538         if ($will_overflow) {
539             is($ret, 0, "isSTRICT_UTF8_CHAR() $testname: returns 0");
540         }
541         else {
542             my $expected_ret = (   $testname =~ /surrogate|non-character/
543                                 || $allowed_uv > 0x10FFFF)
544                                ? 0
545                                : $length;
546             is($ret, $expected_ret, "isSTRICT_UTF8_CHAR() $testname: returns"
547                                   . " expected length: $expected_ret");
548             $ret = test_isUTF8_CHAR_flags($bytes, $length,
549                                           $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
550             is($ret, $expected_ret,
551                             "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')"
552                           . " acts like isSTRICT_UTF8_CHAR");
553         }
554         is(scalar @warnings_gotten, 0,
555                 "isSTRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:"
556               . " generated no warnings")
557           or output_warnings(@warnings_gotten);
558
559         undef @warnings_gotten;
560         $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
561         if ($will_overflow) {
562             is($ret, 0, "isC9_STRICT_UTF8_CHAR() $testname: returns 0");
563         }
564         else {
565             my $expected_ret = (   $testname =~ /surrogate/
566                                 || $allowed_uv > 0x10FFFF)
567                                ? 0
568                                : $length;
569             is($ret, $expected_ret, "isC9_STRICT_UTF8_CHAR() $testname:"
570                                    ." returns expected length: $expected_ret");
571             $ret = test_isUTF8_CHAR_flags($bytes, $length,
572                                           $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
573             is($ret, $expected_ret,
574                           "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')"
575                         . " acts like isC9_STRICT_UTF8_CHAR");
576         }
577         is(scalar @warnings_gotten, 0,
578                 "isC9_STRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:"
579               . " generated no warnings")
580           or output_warnings(@warnings_gotten);
581
582         # Test partial character handling, for each byte not a full character
583         for my $j (1.. $length - 1) {
584
585             # Skip the test for the interaction between overflow and above-31
586             # bit.  It is really testing other things than the partial
587             # character tests, for which other tests in this file are
588             # sufficient
589             last if $will_overflow;
590
591             foreach my $disallow_flag (0, $disallow_flags) {
592                 my $partial = substr($bytes, 0, $j);
593                 my $ret_should_be;
594                 my $comment;
595                 if ($disallow_flag) {
596                     $ret_should_be = 0;
597                     $comment = "disallowed";
598                     if ($j < $needed_to_discern_len) {
599                         $ret_should_be = 1;
600                         $comment .= ", but need $needed_to_discern_len bytes"
601                                  .  " to discern:";
602                     }
603                 }
604                 else {
605                     $ret_should_be = 1;
606                     $comment = "allowed";
607                 }
608
609                 undef @warnings_gotten;
610
611                 $ret = test_is_utf8_valid_partial_char_flags($partial, $j,
612                                                              $disallow_flag);
613                 is($ret, $ret_should_be,
614                                 "$testname: is_utf8_valid_partial_char_flags("
615                                         . display_bytes($partial)
616                                         . "), $comment: returns $ret_should_be");
617                 is(scalar @warnings_gotten, 0,
618                         "$testname: is_utf8_valid_partial_char_flags()"
619                       . " generated no warnings")
620                   or output_warnings(@warnings_gotten);
621             }
622         }
623     }
624
625     # This is more complicated than the malformations tested earlier, as there
626     # are several orthogonal variables involved.  We test all the subclasses
627     # of utf8 warnings to verify they work with and without the utf8 class,
628     # and don't have effects on other sublass warnings
629     foreach my $trial_warning_category ('utf8', 'surrogate', 'nonchar', 'non_unicode') {
630       foreach my $warn_flag (0, $warn_flags) {
631         foreach my $disallow_flag (0, $disallow_flags) {
632           foreach my $do_warning (0, 1) {
633
634             # We try each of the above with various combinations of
635             # malformations that can occur on the same input sequence.
636             foreach my $short ("", "short") {
637               foreach my $unexpected_noncont ("",
638                                               "unexpected non-continuation")
639               {
640                 foreach my $overlong ("", "overlong") {
641
642                     # If we're creating an overlong, it can't be longer than
643                     # the maximum length, so skip if we're already at that
644                     # length.
645                     next if $overlong && $length >= $::max_bytes;
646
647                     my @malformations;
648                     my @expected_return_flags;
649                     push @malformations, $short if $short;
650                     push @malformations, $unexpected_noncont
651                                                       if $unexpected_noncont;
652                     push @malformations, $overlong if $overlong;
653
654                     # The overflow malformation test in the input
655                     # array is coerced into being treated like one of
656                     # the others.
657                     if ($will_overflow) {
658                         push @malformations, 'overflow';
659                         push @expected_return_flags, $::UTF8_GOT_OVERFLOW;
660                     }
661
662                     my $malformations_name = join "/", @malformations;
663                     $malformations_name .= " malformation"
664                                                 if $malformations_name;
665                     $malformations_name .= "s" if @malformations > 1;
666                     my $this_bytes = $bytes;
667                     my $this_length = $length;
668                     my $expected_uv = $allowed_uv;
669                     my $this_expected_len = $length;
670                     my $this_needed_to_discern_len = $needed_to_discern_len;
671                     if ($malformations_name) {
672                         $expected_uv = 0;
673
674                         # Coerce the input into the desired
675                         # malformation
676                         if ($malformations_name =~ /overlong/) {
677
678                             # For an overlong, we convert the original
679                             # start byte into a continuation byte with
680                             # the same data bits as originally. ...
681                             substr($this_bytes, 0, 1)
682                                 = start_byte_to_cont(substr($this_bytes,
683                                                             0, 1));
684
685                             # ... Then we prepend it with a known
686                             # overlong sequence.  This should evaluate
687                             # to the exact same code point as the
688                             # original.
689                             $this_bytes
690                             = I8_to_native("\xff")
691                             . (I8_to_native(chr $::lowest_continuation)
692                             x ( $::max_bytes - 1 - length($this_bytes)))
693                             . $this_bytes;
694                             $this_length = length($this_bytes);
695                             $this_needed_to_discern_len
696                                  = $::max_bytes - ($this_expected_len
697                                                - $this_needed_to_discern_len);
698                             $this_expected_len = $::max_bytes;
699                             push @expected_return_flags, $::UTF8_GOT_LONG;
700                         }
701                         if ($malformations_name =~ /short/) {
702
703                             # Just tell the test to not look far
704                             # enough into the input.
705                             $this_length--;
706                             $this_expected_len--;
707                             push @expected_return_flags, $::UTF8_GOT_SHORT;
708                         }
709                         if ($malformations_name
710                                                 =~ /non-continuation/)
711                         {
712                             # Change the final continuation byte into
713                             # a non one.
714                             my $pos = ($short) ? -2 : -1;
715                             substr($this_bytes, $pos, 1) = '?';
716                             $this_expected_len--;
717                             push @expected_return_flags,
718                                             $::UTF8_GOT_NON_CONTINUATION;
719                         }
720                     }
721
722                     my $eval_warn = $do_warning
723                                 ? "use warnings '$trial_warning_category'"
724                                 : $trial_warning_category eq "utf8"
725                                     ? "no warnings 'utf8'"
726                                     : ( "use warnings 'utf8';"
727                                     . " no warnings '$trial_warning_category'");
728
729                     # Is effectively disallowed if we've set up a
730                     # malformation, even if the flag indicates it is
731                     # allowed.  Fix up test name to indicate this as
732                     # well
733                     my $disallowed = $disallow_flag
734                                 || $malformations_name;
735                     my $this_name = "utf8n_to_uvchr_error() $testname: "
736                                                 . (($disallow_flag)
737                                                 ? 'disallowed'
738                                                 : $disallowed
739                                                     ? $disallowed
740                                                     : 'allowed');
741                     $this_name .= ", $eval_warn";
742                     $this_name .= ", " . (($warn_flag)
743                                         ? 'with warning flag'
744                                         : 'no warning flag');
745
746                     undef @warnings_gotten;
747                     my $ret_ref;
748                     my $display_bytes = display_bytes($this_bytes);
749                     my $call = "    Call was: $eval_warn; \$ret_ref"
750                             . " = test_utf8n_to_uvchr_error("
751                             . "'$display_bytes', $this_length,"
752                             . "$warn_flag"
753                             . "|$disallow_flag)";
754                     my $eval_text =      "$eval_warn; \$ret_ref"
755                             . " = test_utf8n_to_uvchr_error("
756                             . "'$this_bytes',"
757                             . " $this_length, $warn_flag"
758                             . "|$disallow_flag)";
759                     eval "$eval_text";
760                     if (! ok ("$@ eq ''",
761                         "$this_name: eval succeeded"))
762                     {
763                         diag "\$!='$!'; eval'd=\"$call\"";
764                         next;
765                     }
766                     if ($disallowed) {
767                         is($ret_ref->[0], 0, "$this_name: Returns 0")
768                           or diag $call;
769                     }
770                     else {
771                         is($ret_ref->[0], $expected_uv,
772                                 "$this_name: Returns expected uv: "
773                                 . sprintf("0x%04X", $expected_uv))
774                           or diag $call;
775                     }
776                     is($ret_ref->[1], $this_expected_len,
777                                         "$this_name: Returns expected length:"
778                                       . " $this_expected_len")
779                       or diag $call;
780
781                     my $returned_flags = $ret_ref->[2];
782
783                     for (my $i = @expected_return_flags - 1; $i >= 0; $i--) {
784                         if (ok($expected_return_flags[$i] & $returned_flags,
785                             "Expected and got error bit return"
786                             . " for $malformations[$i] malformation"))
787                         {
788                             $returned_flags &= ~$expected_return_flags[$i];
789                         }
790                         splice @expected_return_flags, $i, 1;
791                     }
792                     is(scalar @expected_return_flags, 0,
793                             "Got all the expected malformation errors")
794                       or diag Dumper \@expected_return_flags;
795
796                     if (   $this_expected_len >= $this_needed_to_discern_len
797                         && ($warn_flag || $disallow_flag))
798                     {
799                         is($returned_flags, $expected_error_flags,
800                                 "Got the correct error flag")
801                           or diag $call;
802                     }
803                     else {
804                         is($returned_flags, 0, "Got no other error flag")
805                         or
806
807                         # We strip off any prefixes from the flag names
808                         diag "The unexpected flags were: "
809                            . (flags_to_text($returned_flags,
810                                             \@utf8n_flags_to_text)
811                              =~ s/ \b [A-Z] _ //xgr);
812                     }
813
814                     if (@malformations) {
815                         if (! $do_warning && $trial_warning_category eq 'utf8') {
816                             goto no_warnings_expected;
817                         }
818
819                         # Check that each malformation generates a
820                         # warning, removing that warning if found
821                     MALFORMATION:
822                         foreach my $malformation (@malformations) {
823                             foreach (my $i = 0; $i < @warnings_gotten; $i++) {
824                                 if ($warnings_gotten[$i] =~ /$malformation/) {
825                                     pass("Expected and got"
826                                     . "'$malformation' warning");
827                                     splice @warnings_gotten, $i, 1;
828                                     next MALFORMATION;
829                                 }
830                             }
831                             fail("Expected '$malformation' warning"
832                             . " but didn't get it");
833
834                         }
835                     }
836
837                     # Any overflow will override any super or above-31
838                     # warnings.
839                     goto no_warnings_expected
840                                 if $will_overflow || $this_expected_len
841                                         < $this_needed_to_discern_len;
842
843                     if (    ! $do_warning
844                         && (   $trial_warning_category eq 'utf8'
845                             || $trial_warning_category eq $controlling_warning_category))
846                     {
847                         goto no_warnings_expected;
848                     }
849                     elsif ($warn_flag) {
850                         if (is(scalar @warnings_gotten, 1,
851                             "$this_name: Got a single warning "))
852                         {
853                             like($warnings_gotten[0], $message,
854                                     "$this_name: Got expected warning")
855                                 or diag $call;
856                         }
857                         else {
858                             diag $call;
859                             if (scalar @warnings_gotten) {
860                                 output_warnings(@warnings_gotten);
861                             }
862                         }
863                     }
864                     else {
865
866                     no_warnings_expected:
867                         unless (is(scalar @warnings_gotten, 0,
868                                 "$this_name: Got no warnings"))
869                         {
870                             diag $call;
871                             output_warnings(@warnings_gotten);
872                         }
873                     }
874
875                     # Check CHECK_ONLY results when the input is
876                     # disallowed.  Do this when actually disallowed,
877                     # not just when the $disallow_flag is set
878                     if ($disallowed) {
879                         undef @warnings_gotten;
880                         $ret_ref = test_utf8n_to_uvchr_error(
881                                     $this_bytes, $this_length,
882                                     $disallow_flag|$::UTF8_CHECK_ONLY);
883                         is($ret_ref->[0], 0,
884                                         "$this_name, CHECK_ONLY: Returns 0")
885                           or diag $call;
886                         is($ret_ref->[1], -1,
887                             "$this_name: CHECK_ONLY: returns -1 for length")
888                           or diag $call;
889                         if (! is(scalar @warnings_gotten, 0,
890                             "$this_name, CHECK_ONLY: no warnings"
891                         . " generated"))
892                         {
893                             diag $call;
894                             output_warnings(@warnings_gotten);
895                         }
896                     }
897
898                     # Now repeat some of the above, but for
899                     # uvchr_to_utf8_flags().  Since this comes from an
900                     # existing code point, it hasn't overflowed, and
901                     # isn't malformed.
902                     next if @malformations;
903
904                     # The warning and disallow flags passed in are for
905                     # utf8n_to_uvchr_error().  Convert them for
906                     # uvchr_to_utf8_flags().
907                     my $uvchr_warn_flag = 0;
908                     my $uvchr_disallow_flag = 0;
909                     if ($warn_flag) {
910                         if ($warn_flag == $::UTF8_WARN_SURROGATE) {
911                             $uvchr_warn_flag = $::UNICODE_WARN_SURROGATE
912                         }
913                         elsif ($warn_flag == $::UTF8_WARN_NONCHAR) {
914                             $uvchr_warn_flag = $::UNICODE_WARN_NONCHAR
915                         }
916                         elsif ($warn_flag == $::UTF8_WARN_SUPER) {
917                             $uvchr_warn_flag = $::UNICODE_WARN_SUPER
918                         }
919                         elsif ($warn_flag == $::UTF8_WARN_ABOVE_31_BIT) {
920                             $uvchr_warn_flag
921                                         = $::UNICODE_WARN_ABOVE_31_BIT;
922                         }
923                         else {
924                             fail(sprintf "Unexpected warn flag: %x",
925                                 $warn_flag);
926                             next;
927                         }
928                     }
929                     if ($disallow_flag) {
930                         if ($disallow_flag == $::UTF8_DISALLOW_SURROGATE)
931                         {
932                             $uvchr_disallow_flag
933                                         = $::UNICODE_DISALLOW_SURROGATE;
934                         }
935                         elsif ($disallow_flag == $::UTF8_DISALLOW_NONCHAR)
936                         {
937                             $uvchr_disallow_flag
938                                         = $::UNICODE_DISALLOW_NONCHAR;
939                         }
940                         elsif ($disallow_flag == $::UTF8_DISALLOW_SUPER) {
941                             $uvchr_disallow_flag
942                                         = $::UNICODE_DISALLOW_SUPER;
943                         }
944                         elsif ($disallow_flag
945                                         == $::UTF8_DISALLOW_ABOVE_31_BIT)
946                         {
947                             $uvchr_disallow_flag =
948                                         $::UNICODE_DISALLOW_ABOVE_31_BIT;
949                         }
950                         else {
951                             fail(sprintf "Unexpected disallow flag: %x",
952                                 $disallow_flag);
953                             next;
954                         }
955                     }
956
957                     $disallowed = $uvchr_disallow_flag;
958
959                     $this_name = "uvchr_to_utf8_flags() $testname: "
960                                             . (($uvchr_disallow_flag)
961                                                 ? 'disallowed'
962                                                 : ($disallowed)
963                                                 ? 'ABOVE_31_BIT allowed'
964                                                 : 'allowed');
965                     $this_name .= ", $eval_warn";
966                     $this_name .= ", " . (($uvchr_warn_flag)
967                                         ? 'with warning flag'
968                                         : 'no warning flag');
969
970                     undef @warnings_gotten;
971                     my $ret;
972                     my $this_flags = $uvchr_warn_flag | $uvchr_disallow_flag;
973                     $eval_text = "$eval_warn; \$ret ="
974                             . " test_uvchr_to_utf8_flags("
975                             . "$allowed_uv, $this_flags)";
976                     eval "$eval_text";
977                     if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
978                     {
979                         diag "\$@='$@'; call was: "
980                            . uvchr_display_call($eval_text);
981                         next;
982                     }
983                     if ($disallowed) {
984                         is($ret, undef, "$this_name: Returns undef")
985                           or diag "Call was: " . uvchr_display_call($eval_text);
986                     }
987                     else {
988                         is($ret, $this_bytes, "$this_name: Returns expected string")
989                           or diag "Call was: " . uvchr_display_call($eval_text);
990                     }
991                     if (! $do_warning
992                         && ($trial_warning_category eq 'utf8' || $trial_warning_category eq $controlling_warning_category))
993                     {
994                         if (!is(scalar @warnings_gotten, 0,
995                                 "$this_name: No warnings generated"))
996                         {
997                             diag "Call was: " . uvchr_display_call($eval_text);
998                             output_warnings(@warnings_gotten);
999                         }
1000                     }
1001                     elsif (       $uvchr_warn_flag
1002                         && (   $trial_warning_category eq 'utf8'
1003                             || $trial_warning_category eq $controlling_warning_category))
1004                     {
1005                         if (is(scalar @warnings_gotten, 1,
1006                             "$this_name: Got a single warning "))
1007                         {
1008                             like($warnings_gotten[0], $message,
1009                                     "$this_name: Got expected warning")
1010                                 or diag "Call was: "
1011                                       . uvchr_display_call($eval_text);
1012                         }
1013                         else {
1014                             diag "Call was: " . uvchr_display_call($eval_text);
1015                             output_warnings(@warnings_gotten)
1016                                                 if scalar @warnings_gotten;
1017                         }
1018                     }
1019                 }
1020               }
1021             }
1022           }
1023         }
1024       }
1025     }
1026 }
1027
1028 done_testing;