APItest/t/utf8_warn_base.pl: Make hash element optional
[perl.git] / ext / XS-APItest / t / utf8_warn_base.pl
1 #!perl -w
2
3 # This is a base file to be used by various .t's in its directory
4
5 use strict;
6 use Test::More;
7
8 BEGIN {
9     use_ok('XS::APItest');
10     require 'charset_tools.pl';
11     require './t/utf8_setup.pl';
12 };
13
14 $|=1;
15
16 no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
17                           # machines, and that is tested elsewhere
18
19 use XS::APItest;
20 use Data::Dumper;
21
22 my @warnings;
23
24 use warnings 'utf8';
25 local $SIG{__WARN__} = sub { push @warnings, @_ };
26
27 sub nonportable_regex ($) {
28
29     # Returns a pattern that matches the non-portable message raised either
30     # for the specific input code point, or the one generated when there
31     # is some malformation that precludes the message containing the specific
32     # code point
33
34     my $code_point = shift;
35
36     my $string = sprintf '(Code point 0x%X is not Unicode, and'
37                        . '|Any UTF-8 sequence that starts with'
38                        . ' "(\\\x[[:xdigit:]]{2})+" is for a'
39                        . ' non-Unicode code point, and is) not portable',
40                     $code_point;
41     return qr/$string/;
42 }
43
44 # Now test the cases where a legal code point is generated, but may or may not
45 # be allowed/warned on.
46 my @tests = (
47      # ($testname, $bytes, $disallow_flags, $category,
48      #  $allowed_uv, $needed_to_discern_len )
49     [ "lowest surrogate",
50         (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
51         $::UTF8_DISALLOW_SURROGATE,
52         'surrogate', 0xD800,
53     ],
54     [ "a middle surrogate",
55         (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
56         $::UTF8_DISALLOW_SURROGATE,
57         'surrogate', 0xD90D,
58     ],
59     [ "highest surrogate",
60         (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
61         $::UTF8_DISALLOW_SURROGATE,
62         'surrogate', 0xDFFF,
63     ],
64     [ "first non_unicode",
65         (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
66         $::UTF8_DISALLOW_SUPER,
67         'non_unicode', 0x110000,
68         2,
69     ],
70     [ "non_unicode whose first byte tells that",
71         (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
72         $::UTF8_DISALLOW_SUPER,
73         'non_unicode',
74         (isASCII) ? 0x140000 : 0x200000,
75         1,
76     ],
77     [ "first of 32 consecutive non-character code points",
78         (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
79         $::UTF8_DISALLOW_NONCHAR,
80         'nonchar', 0xFDD0,
81     ],
82     [ "a mid non-character code point of the 32 consecutive ones",
83         (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
84         $::UTF8_DISALLOW_NONCHAR,
85         'nonchar', 0xFDE0,
86     ],
87     [ "final of 32 consecutive non-character code points",
88         (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
89         $::UTF8_DISALLOW_NONCHAR,
90         'nonchar', 0xFDEF,
91     ],
92     [ "non-character code point U+FFFE",
93         (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
94         $::UTF8_DISALLOW_NONCHAR,
95         'nonchar', 0xFFFE,
96     ],
97     [ "non-character code point U+FFFF",
98         (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
99         $::UTF8_DISALLOW_NONCHAR,
100         'nonchar', 0xFFFF,
101     ],
102     [ "non-character code point U+1FFFE",
103         (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
104         $::UTF8_DISALLOW_NONCHAR,
105         'nonchar', 0x1FFFE,
106     ],
107     [ "non-character code point U+1FFFF",
108         (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
109         $::UTF8_DISALLOW_NONCHAR,
110         'nonchar', 0x1FFFF,
111     ],
112     [ "non-character code point U+2FFFE",
113         (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
114         $::UTF8_DISALLOW_NONCHAR,
115         'nonchar', 0x2FFFE,
116     ],
117     [ "non-character code point U+2FFFF",
118         (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
119         $::UTF8_DISALLOW_NONCHAR,
120         'nonchar', 0x2FFFF,
121     ],
122     [ "non-character code point U+3FFFE",
123         (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
124         $::UTF8_DISALLOW_NONCHAR,
125         'nonchar', 0x3FFFE,
126     ],
127     [ "non-character code point U+3FFFF",
128         (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
129         $::UTF8_DISALLOW_NONCHAR,
130         'nonchar', 0x3FFFF,
131     ],
132     [ "non-character code point U+4FFFE",
133         (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
134         $::UTF8_DISALLOW_NONCHAR,
135         'nonchar', 0x4FFFE,
136     ],
137     [ "non-character code point U+4FFFF",
138         (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
139         $::UTF8_DISALLOW_NONCHAR,
140         'nonchar', 0x4FFFF,
141     ],
142     [ "non-character code point U+5FFFE",
143         (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
144         $::UTF8_DISALLOW_NONCHAR,
145         'nonchar', 0x5FFFE,
146     ],
147     [ "non-character code point U+5FFFF",
148         (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
149         $::UTF8_DISALLOW_NONCHAR,
150         'nonchar', 0x5FFFF,
151     ],
152     [ "non-character code point U+6FFFE",
153         (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
154         $::UTF8_DISALLOW_NONCHAR,
155         'nonchar', 0x6FFFE,
156     ],
157     [ "non-character code point U+6FFFF",
158         (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
159         $::UTF8_DISALLOW_NONCHAR,
160         'nonchar', 0x6FFFF,
161     ],
162     [ "non-character code point U+7FFFE",
163         (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
164         $::UTF8_DISALLOW_NONCHAR,
165         'nonchar', 0x7FFFE,
166     ],
167     [ "non-character code point U+7FFFF",
168         (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
169         $::UTF8_DISALLOW_NONCHAR,
170         'nonchar', 0x7FFFF,
171     ],
172     [ "non-character code point U+8FFFE",
173         (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
174         $::UTF8_DISALLOW_NONCHAR,
175         'nonchar', 0x8FFFE,
176     ],
177     [ "non-character code point U+8FFFF",
178         (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
179         $::UTF8_DISALLOW_NONCHAR,
180         'nonchar', 0x8FFFF,
181     ],
182     [ "non-character code point U+9FFFE",
183         (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
184         $::UTF8_DISALLOW_NONCHAR,
185         'nonchar', 0x9FFFE,
186     ],
187     [ "non-character code point U+9FFFF",
188         (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
189         $::UTF8_DISALLOW_NONCHAR,
190         'nonchar', 0x9FFFF,
191     ],
192     [ "non-character code point U+AFFFE",
193         (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
194         $::UTF8_DISALLOW_NONCHAR,
195         'nonchar', 0xAFFFE,
196     ],
197     [ "non-character code point U+AFFFF",
198         (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
199         $::UTF8_DISALLOW_NONCHAR,
200         'nonchar', 0xAFFFF,
201     ],
202     [ "non-character code point U+BFFFE",
203         (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
204         $::UTF8_DISALLOW_NONCHAR,
205         'nonchar', 0xBFFFE,
206     ],
207     [ "non-character code point U+BFFFF",
208         (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
209         $::UTF8_DISALLOW_NONCHAR,
210         'nonchar', 0xBFFFF,
211     ],
212     [ "non-character code point U+CFFFE",
213         (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
214         $::UTF8_DISALLOW_NONCHAR,
215         'nonchar', 0xCFFFE,
216     ],
217     [ "non-character code point U+CFFFF",
218         (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
219         $::UTF8_DISALLOW_NONCHAR,
220         'nonchar', 0xCFFFF,
221     ],
222     [ "non-character code point U+DFFFE",
223         (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
224         $::UTF8_DISALLOW_NONCHAR,
225         'nonchar', 0xDFFFE,
226     ],
227     [ "non-character code point U+DFFFF",
228         (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
229         $::UTF8_DISALLOW_NONCHAR,
230         'nonchar', 0xDFFFF,
231     ],
232     [ "non-character code point U+EFFFE",
233         (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
234         $::UTF8_DISALLOW_NONCHAR,
235         'nonchar', 0xEFFFE,
236     ],
237     [ "non-character code point U+EFFFF",
238         (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
239         $::UTF8_DISALLOW_NONCHAR,
240         'nonchar', 0xEFFFF,
241     ],
242     [ "non-character code point U+FFFFE",
243         (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
244         $::UTF8_DISALLOW_NONCHAR,
245         'nonchar', 0xFFFFE,
246     ],
247     [ "non-character code point U+FFFFF",
248         (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
249         $::UTF8_DISALLOW_NONCHAR,
250         'nonchar', 0xFFFFF,
251     ],
252     [ "non-character code point U+10FFFE",
253         (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
254         $::UTF8_DISALLOW_NONCHAR,
255         'nonchar', 0x10FFFE,
256     ],
257     [ "non-character code point U+10FFFF",
258         (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
259         $::UTF8_DISALLOW_NONCHAR,
260         'nonchar', 0x10FFFF,
261     ],
262     [ "requires at least 32 bits",
263         (isASCII)
264          ?  "\xfe\x82\x80\x80\x80\x80\x80"
265          : I8_to_native(
266             "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
267         # This code point is chosen so that it is representable in a UV on
268         # 32-bit machines
269         $::UTF8_DISALLOW_ABOVE_31_BIT,
270         'utf8', 0x80000000,
271         (isASCII) ? 1 : 8,
272     ],
273     [ "highest 32 bit code point",
274         (isASCII)
275          ?  "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
276          : I8_to_native(
277             "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
278         $::UTF8_DISALLOW_ABOVE_31_BIT,
279         'utf8', 0xFFFFFFFF,
280         (isASCII) ? 1 : 8,
281     ],
282     [ "requires at least 32 bits, and use SUPER-type flags, instead of"
283     . " ABOVE_31_BIT",
284         (isASCII)
285          ? "\xfe\x82\x80\x80\x80\x80\x80"
286          : I8_to_native(
287            "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
288         $::UTF8_DISALLOW_SUPER,
289         'utf8', 0x80000000,
290         1,
291     ],
292     [ "overflow with warnings/disallow for more than 31 bits",
293         # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT
294         # with overflow.  The overflow malformation is never allowed, so
295         # preventing it takes precedence if the ABOVE_31_BIT options would
296         # otherwise allow in an overflowing value.  The ASCII code points (1
297         # for 32-bits; 1 for 64) were chosen because the old overflow
298         # detection algorithm did not catch them; this means this test also
299         # checks for that fix.  The EBCDIC are arbitrary overflowing ones
300         # since we have no reports of failures with it.
301        (($::is64bit)
302         ? ((isASCII)
303            ?    "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
304            : I8_to_native(
305                 "\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"))
306         : ((isASCII)
307            ?    "\xfe\x86\x80\x80\x80\x80\x80"
308            : I8_to_native(
309                 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))),
310         $::UTF8_DISALLOW_ABOVE_31_BIT,
311         'utf8', -1,
312         (isASCII || $::is64bit) ? 2 : 8,
313     ],
314 );
315
316 if (! $::is64bit) {
317     if (isASCII) {
318         no warnings qw{portable overflow};
319         push @tests,
320             [ "Lowest 33 bit code point: overflow",
321                 "\xFE\x84\x80\x80\x80\x80\x80",
322                 $::UTF8_DISALLOW_ABOVE_31_BIT,
323                 'utf8', -1,
324                 1,
325             ];
326     }
327 }
328 else {
329     no warnings qw{portable overflow};
330     push @tests,
331         [ "More than 32 bits",
332             (isASCII)
333             ?       "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
334             : I8_to_native(
335                     "\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
336             $::UTF8_DISALLOW_ABOVE_31_BIT,
337             'utf8', 0x1000000000,
338             (isASCII) ? 1 : 7,
339         ];
340     if (! isASCII) {
341         push @tests,   # These could falsely show wrongly in a naive
342                        # implementation
343             [ "requires at least 32 bits",
344                 I8_to_native(
345                     "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
346                 $::UTF8_DISALLOW_ABOVE_31_BIT,
347                 'utf8', 0x800000000,
348                 7,
349             ],
350             [ "requires at least 32 bits",
351                 I8_to_native(
352                     "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
353                 $::UTF8_DISALLOW_ABOVE_31_BIT,
354                 'utf8', 0x10000000000,
355                 6,
356             ],
357             [ "requires at least 32 bits",
358                 I8_to_native(
359                     "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
360                 $::UTF8_DISALLOW_ABOVE_31_BIT,
361                 'utf8', 0x200000000000,
362                 5,
363             ],
364             [ "requires at least 32 bits",
365                 I8_to_native(
366                     "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
367                 $::UTF8_DISALLOW_ABOVE_31_BIT,
368                 'utf8', 0x4000000000000,
369                 4,
370             ],
371             [ "requires at least 32 bits",
372                 I8_to_native(
373                     "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
374                 $::UTF8_DISALLOW_ABOVE_31_BIT,
375                 'utf8', 0x80000000000000,
376                 3,
377             ],
378             [ "requires at least 32 bits",
379                 I8_to_native(
380                     "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
381                 $::UTF8_DISALLOW_ABOVE_31_BIT,
382                 'utf8', 0x1000000000000000,
383                 2,
384             ];
385     }
386 }
387
388 # This test is split into this number of files.
389 my $num_test_files = $ENV{TEST_JOBS} || 1;
390 $num_test_files = 10 if $num_test_files > 10;
391
392 my $test_count = -1;
393 foreach my $test (@tests) {
394     $test_count++;
395     next if $test_count % $num_test_files != $::TEST_CHUNK;
396
397     my ($testname, $bytes, $disallow_flags,
398         $category, $allowed_uv, $needed_to_discern_len
399        ) = @$test;
400
401     my $length = length $bytes;
402     my $will_overflow = $allowed_uv < 0;
403
404     # The convention is that the got flag is the same value as the disallow
405     # one, and the warn flag is the next bit over.  If this were violated, the
406     # tests here should start failing.  We could do an eval under no strict to
407     # be sure.
408     my $expected_error_flags = $disallow_flags;
409     my $warn_flags = $disallow_flags << 1;
410
411     my $message;
412     if ($allowed_uv > 0x7FFFFFFF) {
413         $message = nonportable_regex($allowed_uv);
414     }
415     elsif ($allowed_uv > 0x10FFFF) {
416         $message = qr/(not Unicode|for a non-Unicode code point).* may not be portable/;
417     }
418     elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
419         $message = qr/surrogate/;
420         $needed_to_discern_len = 2 unless defined $needed_to_discern_len;
421     }
422     elsif (   ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
423            || ($allowed_uv & 0xFFFE) == 0xFFFE)
424     {
425         $message = qr/Unicode non-character.*is not recommended for open interchange/;
426         $needed_to_discern_len = $length unless defined $needed_to_discern_len;
427     }
428     elsif ($will_overflow) {
429         $message = qr/overflows/;
430     }
431     else {
432         die "Can't figure out what type of warning to test for $testname"
433     }
434
435     die 'Didn\'t set $needed_to_discern_len for ' . $testname
436                                         unless defined $needed_to_discern_len;
437
438     {
439         use warnings;
440         undef @warnings;
441         my $ret = test_isUTF8_CHAR($bytes, $length);
442         my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
443         if ($will_overflow) {
444             is($ret, 0, "isUTF8_CHAR() $testname: returns 0");
445             is($ret_flags, 0, "isUTF8_CHAR_flags() $testname: returns 0");
446         }
447         else {
448             is($ret, $length,
449                "isUTF8_CHAR() $testname: returns expected length: $length");
450             is($ret_flags, $length, "isUTF8_CHAR_flags(...,0) $testname:"
451                                   . " returns expected length: $length");
452         }
453         is(scalar @warnings, 0,
454                 "isUTF8_CHAR() and isUTF8_CHAR()_flags $testname: generated"
455               . " no warnings")
456           or output_warnings(@warnings);
457
458         undef @warnings;
459         $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
460         if ($will_overflow) {
461             is($ret, 0, "isSTRICT_UTF8_CHAR() $testname: returns 0");
462         }
463         else {
464             my $expected_ret = (   $testname =~ /surrogate|non-character/
465                                 || $allowed_uv > 0x10FFFF)
466                                ? 0
467                                : $length;
468             is($ret, $expected_ret, "isSTRICT_UTF8_CHAR() $testname: returns"
469                                   . " expected length: $expected_ret");
470             $ret = test_isUTF8_CHAR_flags($bytes, $length,
471                                           $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
472             is($ret, $expected_ret,
473                             "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')"
474                           . " acts like isSTRICT_UTF8_CHAR");
475         }
476         is(scalar @warnings, 0,
477                 "isSTRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:"
478               . " generated no warnings")
479           or output_warnings(@warnings);
480
481         undef @warnings;
482         $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
483         if ($will_overflow) {
484             is($ret, 0, "isC9_STRICT_UTF8_CHAR() $testname: returns 0");
485         }
486         else {
487             my $expected_ret = (   $testname =~ /surrogate/
488                                 || $allowed_uv > 0x10FFFF)
489                                ? 0
490                                : $length;
491             is($ret, $expected_ret, "isC9_STRICT_UTF8_CHAR() $testname:"
492                                    ." returns expected length: $expected_ret");
493             $ret = test_isUTF8_CHAR_flags($bytes, $length,
494                                           $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
495             is($ret, $expected_ret,
496                           "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')"
497                         . " acts like isC9_STRICT_UTF8_CHAR");
498         }
499         is(scalar @warnings, 0,
500                 "isC9_STRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:"
501               . " generated no warnings")
502           or output_warnings(@warnings);
503
504         # Test partial character handling, for each byte not a full character
505         for my $j (1.. $length - 1) {
506
507             # Skip the test for the interaction between overflow and above-31
508             # bit.  It is really testing other things than the partial
509             # character tests, for which other tests in this file are
510             # sufficient
511             last if $will_overflow;
512
513             foreach my $disallow_flag (0, $disallow_flags) {
514                 my $partial = substr($bytes, 0, $j);
515                 my $ret_should_be;
516                 my $comment;
517                 if ($disallow_flag) {
518                     $ret_should_be = 0;
519                     $comment = "disallowed";
520                     if ($j < $needed_to_discern_len) {
521                         $ret_should_be = 1;
522                         $comment .= ", but need $needed_to_discern_len bytes"
523                                  .  " to discern:";
524                     }
525                 }
526                 else {
527                     $ret_should_be = 1;
528                     $comment = "allowed";
529                 }
530
531                 undef @warnings;
532
533                 $ret = test_is_utf8_valid_partial_char_flags($partial, $j,
534                                                              $disallow_flag);
535                 is($ret, $ret_should_be,
536                                 "$testname: is_utf8_valid_partial_char_flags("
537                                         . display_bytes($partial)
538                                         . "), $comment: returns $ret_should_be");
539                 is(scalar @warnings, 0,
540                         "$testname: is_utf8_valid_partial_char_flags()"
541                       . " generated no warnings")
542                   or output_warnings(@warnings);
543             }
544         }
545     }
546
547     # This is more complicated than the malformations tested earlier, as there
548     # are several orthogonal variables involved.  We test all the subclasses
549     # of utf8 warnings to verify they work with and without the utf8 class,
550     # and don't have effects on other sublass warnings
551     foreach my $warning ('utf8', 'surrogate', 'nonchar', 'non_unicode') {
552       foreach my $warn_flag (0, $warn_flags) {
553         foreach my $disallow_flag (0, $disallow_flags) {
554           foreach my $do_warning (0, 1) {
555
556             # We try each of the above with various combinations of
557             # malformations that can occur on the same input sequence.
558             foreach my $short ("", "short") {
559               foreach my $unexpected_noncont ("",
560                                               "unexpected non-continuation")
561               {
562                 foreach my $overlong ("", "overlong") {
563
564                     # If we're creating an overlong, it can't be longer than
565                     # the maximum length, so skip if we're already at that
566                     # length.
567                     next if $overlong && $length >= $::max_bytes;
568
569                     my @malformations;
570                     my @expected_errors;
571                     push @malformations, $short if $short;
572                     push @malformations, $unexpected_noncont
573                                                       if $unexpected_noncont;
574                     push @malformations, $overlong if $overlong;
575
576                     # The overflow malformation test in the input
577                     # array is coerced into being treated like one of
578                     # the others.
579                     if ($will_overflow) {
580                         push @malformations, 'overflow';
581                         push @expected_errors, $::UTF8_GOT_OVERFLOW;
582                     }
583
584                     my $malformations_name = join "/", @malformations;
585                     $malformations_name .= " malformation"
586                                                 if $malformations_name;
587                     $malformations_name .= "s" if @malformations > 1;
588                     my $this_bytes = $bytes;
589                     my $this_length = $length;
590                     my $expected_uv = $allowed_uv;
591                     my $this_expected_len = $length;
592                     my $this_needed_to_discern_len = $needed_to_discern_len;
593                     if ($malformations_name) {
594                         $expected_uv = 0;
595
596                         # Coerce the input into the desired
597                         # malformation
598                         if ($malformations_name =~ /overlong/) {
599
600                             # For an overlong, we convert the original
601                             # start byte into a continuation byte with
602                             # the same data bits as originally. ...
603                             substr($this_bytes, 0, 1)
604                                 = start_byte_to_cont(substr($this_bytes,
605                                                             0, 1));
606
607                             # ... Then we prepend it with a known
608                             # overlong sequence.  This should evaluate
609                             # to the exact same code point as the
610                             # original.
611                             $this_bytes
612                             = I8_to_native("\xff")
613                             . (I8_to_native(chr $::lowest_continuation)
614                             x ( $::max_bytes - 1 - length($this_bytes)))
615                             . $this_bytes;
616                             $this_length = length($this_bytes);
617                             $this_needed_to_discern_len
618                                  = $::max_bytes - ($this_expected_len
619                                                - $this_needed_to_discern_len);
620                             $this_expected_len = $::max_bytes;
621                             push @expected_errors, $::UTF8_GOT_LONG;
622                         }
623                         if ($malformations_name =~ /short/) {
624
625                             # Just tell the test to not look far
626                             # enough into the input.
627                             $this_length--;
628                             $this_expected_len--;
629                             push @expected_errors, $::UTF8_GOT_SHORT;
630                         }
631                         if ($malformations_name
632                                                 =~ /non-continuation/)
633                         {
634                             # Change the final continuation byte into
635                             # a non one.
636                             my $pos = ($short) ? -2 : -1;
637                             substr($this_bytes, $pos, 1) = '?';
638                             $this_expected_len--;
639                             push @expected_errors,
640                                             $::UTF8_GOT_NON_CONTINUATION;
641                         }
642                     }
643
644                     my $eval_warn = $do_warning
645                                 ? "use warnings '$warning'"
646                                 : $warning eq "utf8"
647                                     ? "no warnings 'utf8'"
648                                     : ( "use warnings 'utf8';"
649                                     . " no warnings '$warning'");
650
651                     # Is effectively disallowed if we've set up a
652                     # malformation, even if the flag indicates it is
653                     # allowed.  Fix up test name to indicate this as
654                     # well
655                     my $disallowed = $disallow_flag
656                                 || $malformations_name;
657                     my $this_name = "utf8n_to_uvchr_error() $testname: "
658                                                 . (($disallow_flag)
659                                                 ? 'disallowed'
660                                                 : $disallowed
661                                                     ? $disallowed
662                                                     : 'allowed');
663                     $this_name .= ", $eval_warn";
664                     $this_name .= ", " . (($warn_flag)
665                                         ? 'with warning flag'
666                                         : 'no warning flag');
667
668                     undef @warnings;
669                     my $ret_ref;
670                     my $display_bytes = display_bytes($this_bytes);
671                     my $call = "    Call was: $eval_warn; \$ret_ref"
672                             . " = test_utf8n_to_uvchr_error("
673                             . "'$display_bytes', $this_length,"
674                             . "$warn_flag"
675                             . "|$disallow_flag)";
676                     my $eval_text =      "$eval_warn; \$ret_ref"
677                             . " = test_utf8n_to_uvchr_error("
678                             . "'$this_bytes',"
679                             . " $this_length, $warn_flag"
680                             . "|$disallow_flag)";
681                     eval "$eval_text";
682                     if (! ok ("$@ eq ''",
683                         "$this_name: eval succeeded"))
684                     {
685                         diag "\$!='$!'; eval'd=\"$call\"";
686                         next;
687                     }
688                     if ($disallowed) {
689                         is($ret_ref->[0], 0, "$this_name: Returns 0")
690                           or diag $call;
691                     }
692                     else {
693                         is($ret_ref->[0], $expected_uv,
694                                 "$this_name: Returns expected uv: "
695                                 . sprintf("0x%04X", $expected_uv))
696                           or diag $call;
697                     }
698                     is($ret_ref->[1], $this_expected_len,
699                                         "$this_name: Returns expected length:"
700                                       . " $this_expected_len")
701                       or diag $call;
702
703                     my $errors = $ret_ref->[2];
704
705                     for (my $i = @expected_errors - 1; $i >= 0; $i--) {
706                         if (ok($expected_errors[$i] & $errors,
707                             "Expected and got error bit return"
708                             . " for $malformations[$i] malformation"))
709                         {
710                             $errors &= ~$expected_errors[$i];
711                         }
712                         splice @expected_errors, $i, 1;
713                     }
714                     is(scalar @expected_errors, 0,
715                             "Got all the expected malformation errors")
716                       or diag Dumper \@expected_errors;
717
718                     if (   $this_expected_len >= $this_needed_to_discern_len
719                         && ($warn_flag || $disallow_flag))
720                     {
721                         is($errors, $expected_error_flags,
722                                 "Got the correct error flag")
723                           or diag $call;
724                     }
725                     else {
726                         is($errors, 0, "Got no other error flag");
727                     }
728
729                     if (@malformations) {
730                         if (! $do_warning && $warning eq 'utf8') {
731                             goto no_warnings_expected;
732                         }
733
734                         # Check that each malformation generates a
735                         # warning, removing that warning if found
736                     MALFORMATION:
737                         foreach my $malformation (@malformations) {
738                             foreach (my $i = 0; $i < @warnings; $i++) {
739                                 if ($warnings[$i] =~ /$malformation/) {
740                                     pass("Expected and got"
741                                     . "'$malformation' warning");
742                                     splice @warnings, $i, 1;
743                                     next MALFORMATION;
744                                 }
745                             }
746                             fail("Expected '$malformation' warning"
747                             . " but didn't get it");
748
749                         }
750                     }
751
752                     # Any overflow will override any super or above-31
753                     # warnings.
754                     goto no_warnings_expected
755                                 if $will_overflow || $this_expected_len
756                                         < $this_needed_to_discern_len;
757
758                     if (    ! $do_warning
759                         && (   $warning eq 'utf8'
760                             || $warning eq $category))
761                     {
762                         goto no_warnings_expected;
763                     }
764                     elsif ($warn_flag) {
765                         if (is(scalar @warnings, 1,
766                             "$this_name: Got a single warning "))
767                         {
768                             like($warnings[0], $message,
769                                     "$this_name: Got expected warning")
770                                 or diag $call;
771                         }
772                         else {
773                             diag $call;
774                             if (scalar @warnings) {
775                                 output_warnings(@warnings);
776                             }
777                         }
778                     }
779                     else {
780                     no_warnings_expected:
781                         unless (is(scalar @warnings, 0,
782                                 "$this_name: Got no warnings"))
783                         {
784                             diag $call;
785                             output_warnings(@warnings);
786                         }
787                     }
788
789                     # Check CHECK_ONLY results when the input is
790                     # disallowed.  Do this when actually disallowed,
791                     # not just when the $disallow_flag is set
792                     if ($disallowed) {
793                         undef @warnings;
794                         $ret_ref = test_utf8n_to_uvchr_error(
795                                     $this_bytes, $this_length,
796                                     $disallow_flag|$::UTF8_CHECK_ONLY);
797                         is($ret_ref->[0], 0,
798                                         "$this_name, CHECK_ONLY: Returns 0")
799                           or diag $call;
800                         is($ret_ref->[1], -1,
801                             "$this_name: CHECK_ONLY: returns -1 for length")
802                           or diag $call;
803                         if (! is(scalar @warnings, 0,
804                             "$this_name, CHECK_ONLY: no warnings"
805                         . " generated"))
806                         {
807                             diag $call;
808                             output_warnings(@warnings);
809                         }
810                     }
811
812                     # Now repeat some of the above, but for
813                     # uvchr_to_utf8_flags().  Since this comes from an
814                     # existing code point, it hasn't overflowed, and
815                     # isn't malformed.
816                     next if @malformations;
817
818                     # The warning and disallow flags passed in are for
819                     # utf8n_to_uvchr_error().  Convert them for
820                     # uvchr_to_utf8_flags().
821                     my $uvchr_warn_flag = 0;
822                     my $uvchr_disallow_flag = 0;
823                     if ($warn_flag) {
824                         if ($warn_flag == $::UTF8_WARN_SURROGATE) {
825                             $uvchr_warn_flag = $::UNICODE_WARN_SURROGATE
826                         }
827                         elsif ($warn_flag == $::UTF8_WARN_NONCHAR) {
828                             $uvchr_warn_flag = $::UNICODE_WARN_NONCHAR
829                         }
830                         elsif ($warn_flag == $::UTF8_WARN_SUPER) {
831                             $uvchr_warn_flag = $::UNICODE_WARN_SUPER
832                         }
833                         elsif ($warn_flag == $::UTF8_WARN_ABOVE_31_BIT) {
834                             $uvchr_warn_flag
835                                         = $::UNICODE_WARN_ABOVE_31_BIT;
836                         }
837                         else {
838                             fail(sprintf "Unexpected warn flag: %x",
839                                 $warn_flag);
840                             next;
841                         }
842                     }
843                     if ($disallow_flag) {
844                         if ($disallow_flag == $::UTF8_DISALLOW_SURROGATE)
845                         {
846                             $uvchr_disallow_flag
847                                         = $::UNICODE_DISALLOW_SURROGATE;
848                         }
849                         elsif ($disallow_flag == $::UTF8_DISALLOW_NONCHAR)
850                         {
851                             $uvchr_disallow_flag
852                                         = $::UNICODE_DISALLOW_NONCHAR;
853                         }
854                         elsif ($disallow_flag == $::UTF8_DISALLOW_SUPER) {
855                             $uvchr_disallow_flag
856                                         = $::UNICODE_DISALLOW_SUPER;
857                         }
858                         elsif ($disallow_flag
859                                         == $::UTF8_DISALLOW_ABOVE_31_BIT)
860                         {
861                             $uvchr_disallow_flag =
862                                         $::UNICODE_DISALLOW_ABOVE_31_BIT;
863                         }
864                         else {
865                             fail(sprintf "Unexpected disallow flag: %x",
866                                 $disallow_flag);
867                             next;
868                         }
869                     }
870
871                     $disallowed = $uvchr_disallow_flag;
872
873                     $this_name = "uvchr_to_utf8_flags() $testname: "
874                                             . (($uvchr_disallow_flag)
875                                                 ? 'disallowed'
876                                                 : ($disallowed)
877                                                 ? 'ABOVE_31_BIT allowed'
878                                                 : 'allowed');
879                     $this_name .= ", $eval_warn";
880                     $this_name .= ", " . (($uvchr_warn_flag)
881                                         ? 'with warning flag'
882                                         : 'no warning flag');
883
884                     undef @warnings;
885                     my $ret;
886                     my $warn_flag = sprintf "0x%x", $uvchr_warn_flag;
887                     my $disallow_flag = sprintf "0x%x",
888                                                 $uvchr_disallow_flag;
889                     $call = sprintf("    Call was: $eval_warn; \$ret"
890                                 . " = test_uvchr_to_utf8_flags("
891                                 . " 0x%x, $warn_flag|$disallow_flag)",
892                                 $allowed_uv);
893                     $eval_text = "$eval_warn; \$ret ="
894                             . " test_uvchr_to_utf8_flags("
895                             . "$allowed_uv, $warn_flag|"
896                             . "$disallow_flag)";
897                     eval "$eval_text";
898                     if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
899                     {
900                         diag "\$!='$!'; eval'd=\"$eval_text\"";
901                         next;
902                     }
903                     if ($disallowed) {
904                         is($ret, undef, "$this_name: Returns undef")
905                           or diag $call;
906                     }
907                     else {
908                         is($ret, $bytes, "$this_name: Returns expected string")
909                           or diag $call;
910                     }
911                     if (! $do_warning
912                         && ($warning eq 'utf8' || $warning eq $category))
913                     {
914                         if (!is(scalar @warnings, 0,
915                                 "$this_name: No warnings generated"))
916                         {
917                             diag $call;
918                             output_warnings(@warnings);
919                         }
920                     }
921                     elsif (       $uvchr_warn_flag
922                         && (   $warning eq 'utf8'
923                             || $warning eq $category))
924                     {
925                         if (is(scalar @warnings, 1,
926                             "$this_name: Got a single warning "))
927                         {
928                             like($warnings[0], $message,
929                                     "$this_name: Got expected warning")
930                                 or diag $call;
931                         }
932                         else {
933                             diag $call;
934                             output_warnings(@warnings)
935                                                 if scalar @warnings;
936                         }
937                     }
938                 }
939               }
940             }
941           }
942         }
943       }
944     }
945 }
946
947 done_testing;