d469038c342cbf8804cfe7d6ae913b8739f8a1e0
[perl.git] / ext / XS-APItest / t / utf8_warn_base.pl
1 #!perl -w
2
3 # This is a base file to be used by various .t's in its directory
4
5 use strict;
6 use Test::More;
7
8 BEGIN {
9     use_ok('XS::APItest');
10     require 'charset_tools.pl';
11     require './t/utf8_setup.pl';
12 };
13
14 $|=1;
15
16 no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
17                           # machines, and that is tested elsewhere
18
19 use XS::APItest;
20 use Data::Dumper;
21
22 my @warnings;
23
24 use warnings 'utf8';
25 local $SIG{__WARN__} = sub { push @warnings, @_ };
26
27 sub nonportable_regex ($) {
28
29     # Returns a pattern that matches the non-portable message raised either
30     # for the specific input code point, or the one generated when there
31     # is some malformation that precludes the message containing the specific
32     # code point
33
34     my $code_point = shift;
35
36     my $string = sprintf '(Code point 0x%X is not Unicode, and'
37                        . '|Any UTF-8 sequence that starts with'
38                        . ' "(\\\x[[:xdigit:]]{2})+" is for a'
39                        . ' non-Unicode code point, and is) not portable',
40                     $code_point;
41     return qr/$string/;
42 }
43
44 # Now test the cases where a legal code point is generated, but may or may not
45 # be allowed/warned on.
46 my @tests = (
47      # ($testname, $bytes, $disallow_flags, $category,
48      #  $allowed_uv, $needed_to_discern_len )
49     [ "lowest surrogate",
50         (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
51         $::UTF8_DISALLOW_SURROGATE,
52         'surrogate', 0xD800,
53         2,
54     ],
55     [ "a middle surrogate",
56         (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
57         $::UTF8_DISALLOW_SURROGATE,
58         'surrogate', 0xD90D,
59         2,
60     ],
61     [ "highest surrogate",
62         (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
63         $::UTF8_DISALLOW_SURROGATE,
64         'surrogate', 0xDFFF,
65         2,
66     ],
67     [ "first non_unicode",
68         (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
69         $::UTF8_DISALLOW_SUPER,
70         'non_unicode', 0x110000,
71         2,
72     ],
73     [ "non_unicode whose first byte tells that",
74         (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
75         $::UTF8_DISALLOW_SUPER,
76         'non_unicode',
77         (isASCII) ? 0x140000 : 0x200000,
78         1,
79     ],
80     [ "first of 32 consecutive non-character code points",
81         (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
82         $::UTF8_DISALLOW_NONCHAR,
83         'nonchar', 0xFDD0,
84         (isASCII) ? 3 : 4,
85     ],
86     [ "a mid non-character code point of the 32 consecutive ones",
87         (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
88         $::UTF8_DISALLOW_NONCHAR,
89         'nonchar', 0xFDE0,
90         (isASCII) ? 3 : 4,
91     ],
92     [ "final of 32 consecutive non-character code points",
93         (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
94         $::UTF8_DISALLOW_NONCHAR,
95         'nonchar', 0xFDEF,
96         (isASCII) ? 3 : 4,
97     ],
98     [ "non-character code point U+FFFE",
99         (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
100         $::UTF8_DISALLOW_NONCHAR,
101         'nonchar', 0xFFFE,
102         (isASCII) ? 3 : 4,
103     ],
104     [ "non-character code point U+FFFF",
105         (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
106         $::UTF8_DISALLOW_NONCHAR,
107         'nonchar', 0xFFFF,
108         (isASCII) ? 3 : 4,
109     ],
110     [ "non-character code point U+1FFFE",
111         (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
112         $::UTF8_DISALLOW_NONCHAR,
113         'nonchar', 0x1FFFE,
114         4,
115     ],
116     [ "non-character code point U+1FFFF",
117         (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
118         $::UTF8_DISALLOW_NONCHAR,
119         'nonchar', 0x1FFFF,
120         4,
121     ],
122     [ "non-character code point U+2FFFE",
123         (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
124         $::UTF8_DISALLOW_NONCHAR,
125         'nonchar', 0x2FFFE,
126         4,
127     ],
128     [ "non-character code point U+2FFFF",
129         (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
130         $::UTF8_DISALLOW_NONCHAR,
131         'nonchar', 0x2FFFF,
132         4,
133     ],
134     [ "non-character code point U+3FFFE",
135         (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
136         $::UTF8_DISALLOW_NONCHAR,
137         'nonchar', 0x3FFFE,
138         4,
139     ],
140     [ "non-character code point U+3FFFF",
141         (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
142         $::UTF8_DISALLOW_NONCHAR,
143         'nonchar', 0x3FFFF,
144         4,
145     ],
146     [ "non-character code point U+4FFFE",
147         (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
148         $::UTF8_DISALLOW_NONCHAR,
149         'nonchar', 0x4FFFE,
150         (isASCII) ? 4 : 5,
151     ],
152     [ "non-character code point U+4FFFF",
153         (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
154         $::UTF8_DISALLOW_NONCHAR,
155         'nonchar', 0x4FFFF,
156         (isASCII) ? 4 : 5,
157     ],
158     [ "non-character code point U+5FFFE",
159         (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
160         $::UTF8_DISALLOW_NONCHAR,
161         'nonchar', 0x5FFFE,
162         (isASCII) ? 4 : 5,
163     ],
164     [ "non-character code point U+5FFFF",
165         (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
166         $::UTF8_DISALLOW_NONCHAR,
167         'nonchar', 0x5FFFF,
168         (isASCII) ? 4 : 5,
169     ],
170     [ "non-character code point U+6FFFE",
171         (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
172         $::UTF8_DISALLOW_NONCHAR,
173         'nonchar', 0x6FFFE,
174         (isASCII) ? 4 : 5,
175     ],
176     [ "non-character code point U+6FFFF",
177         (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
178         $::UTF8_DISALLOW_NONCHAR,
179         'nonchar', 0x6FFFF,
180         (isASCII) ? 4 : 5,
181     ],
182     [ "non-character code point U+7FFFE",
183         (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
184         $::UTF8_DISALLOW_NONCHAR,
185         'nonchar', 0x7FFFE,
186         (isASCII) ? 4 : 5,
187     ],
188     [ "non-character code point U+7FFFF",
189         (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
190         $::UTF8_DISALLOW_NONCHAR,
191         'nonchar', 0x7FFFF,
192         (isASCII) ? 4 : 5,
193     ],
194     [ "non-character code point U+8FFFE",
195         (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
196         $::UTF8_DISALLOW_NONCHAR,
197         'nonchar', 0x8FFFE,
198         (isASCII) ? 4 : 5,
199     ],
200     [ "non-character code point U+8FFFF",
201         (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
202         $::UTF8_DISALLOW_NONCHAR,
203         'nonchar', 0x8FFFF,
204         (isASCII) ? 4 : 5,
205     ],
206     [ "non-character code point U+9FFFE",
207         (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
208         $::UTF8_DISALLOW_NONCHAR,
209         'nonchar', 0x9FFFE,
210         (isASCII) ? 4 : 5,
211     ],
212     [ "non-character code point U+9FFFF",
213         (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
214         $::UTF8_DISALLOW_NONCHAR,
215         'nonchar', 0x9FFFF,
216         (isASCII) ? 4 : 5,
217     ],
218     [ "non-character code point U+AFFFE",
219         (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
220         $::UTF8_DISALLOW_NONCHAR,
221         'nonchar', 0xAFFFE,
222         (isASCII) ? 4 : 5,
223     ],
224     [ "non-character code point U+AFFFF",
225         (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
226         $::UTF8_DISALLOW_NONCHAR,
227         'nonchar', 0xAFFFF,
228         (isASCII) ? 4 : 5,
229     ],
230     [ "non-character code point U+BFFFE",
231         (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
232         $::UTF8_DISALLOW_NONCHAR,
233         'nonchar', 0xBFFFE,
234         (isASCII) ? 4 : 5,
235     ],
236     [ "non-character code point U+BFFFF",
237         (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
238         $::UTF8_DISALLOW_NONCHAR,
239         'nonchar', 0xBFFFF,
240         (isASCII) ? 4 : 5,
241     ],
242     [ "non-character code point U+CFFFE",
243         (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
244         $::UTF8_DISALLOW_NONCHAR,
245         'nonchar', 0xCFFFE,
246         (isASCII) ? 4 : 5,
247     ],
248     [ "non-character code point U+CFFFF",
249         (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
250         $::UTF8_DISALLOW_NONCHAR,
251         'nonchar', 0xCFFFF,
252         (isASCII) ? 4 : 5,
253     ],
254     [ "non-character code point U+DFFFE",
255         (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
256         $::UTF8_DISALLOW_NONCHAR,
257         'nonchar', 0xDFFFE,
258         (isASCII) ? 4 : 5,
259     ],
260     [ "non-character code point U+DFFFF",
261         (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
262         $::UTF8_DISALLOW_NONCHAR,
263         'nonchar', 0xDFFFF,
264         (isASCII) ? 4 : 5,
265     ],
266     [ "non-character code point U+EFFFE",
267         (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
268         $::UTF8_DISALLOW_NONCHAR,
269         'nonchar', 0xEFFFE,
270         (isASCII) ? 4 : 5,
271     ],
272     [ "non-character code point U+EFFFF",
273         (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
274         $::UTF8_DISALLOW_NONCHAR,
275         'nonchar', 0xEFFFF,
276         (isASCII) ? 4 : 5,
277     ],
278     [ "non-character code point U+FFFFE",
279         (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
280         $::UTF8_DISALLOW_NONCHAR,
281         'nonchar', 0xFFFFE,
282         (isASCII) ? 4 : 5,
283     ],
284     [ "non-character code point U+FFFFF",
285         (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
286         $::UTF8_DISALLOW_NONCHAR,
287         'nonchar', 0xFFFFF,
288         (isASCII) ? 4 : 5,
289     ],
290     [ "non-character code point U+10FFFE",
291         (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
292         $::UTF8_DISALLOW_NONCHAR,
293         'nonchar', 0x10FFFE,
294         (isASCII) ? 4 : 5,
295     ],
296     [ "non-character code point U+10FFFF",
297         (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
298         $::UTF8_DISALLOW_NONCHAR,
299         'nonchar', 0x10FFFF,
300         (isASCII) ? 4 : 5,
301     ],
302     [ "requires at least 32 bits",
303         (isASCII)
304          ?  "\xfe\x82\x80\x80\x80\x80\x80"
305          : I8_to_native(
306             "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
307         # This code point is chosen so that it is representable in a UV on
308         # 32-bit machines
309         $::UTF8_DISALLOW_ABOVE_31_BIT,
310         'utf8', 0x80000000,
311         (isASCII) ? 1 : 8,
312     ],
313     [ "highest 32 bit code point",
314         (isASCII)
315          ?  "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
316          : I8_to_native(
317             "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
318         $::UTF8_DISALLOW_ABOVE_31_BIT,
319         'utf8', 0xFFFFFFFF,
320         (isASCII) ? 1 : 8,
321     ],
322     [ "requires at least 32 bits, and use SUPER-type flags, instead of"
323     . " ABOVE_31_BIT",
324         (isASCII)
325          ? "\xfe\x82\x80\x80\x80\x80\x80"
326          : I8_to_native(
327            "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
328         $::UTF8_DISALLOW_SUPER,
329         'utf8', 0x80000000,
330         1,
331     ],
332     [ "overflow with warnings/disallow for more than 31 bits",
333         # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT
334         # with overflow.  The overflow malformation is never allowed, so
335         # preventing it takes precedence if the ABOVE_31_BIT options would
336         # otherwise allow in an overflowing value.  The ASCII code points (1
337         # for 32-bits; 1 for 64) were chosen because the old overflow
338         # detection algorithm did not catch them; this means this test also
339         # checks for that fix.  The EBCDIC are arbitrary overflowing ones
340         # since we have no reports of failures with it.
341        (($::is64bit)
342         ? ((isASCII)
343            ?    "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
344            : I8_to_native(
345                 "\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"))
346         : ((isASCII)
347            ?    "\xfe\x86\x80\x80\x80\x80\x80"
348            : I8_to_native(
349                 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))),
350         $::UTF8_DISALLOW_ABOVE_31_BIT,
351         'utf8', -1,
352         (isASCII || $::is64bit) ? 2 : 8,
353     ],
354 );
355
356 if (! $::is64bit) {
357     if (isASCII) {
358         no warnings qw{portable overflow};
359         push @tests,
360             [ "Lowest 33 bit code point: overflow",
361                 "\xFE\x84\x80\x80\x80\x80\x80",
362                 $::UTF8_DISALLOW_ABOVE_31_BIT,
363                 'utf8', -1,
364                 1,
365             ];
366     }
367 }
368 else {
369     no warnings qw{portable overflow};
370     push @tests,
371         [ "More than 32 bits",
372             (isASCII)
373             ?       "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
374             : I8_to_native(
375                     "\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
376             $::UTF8_DISALLOW_ABOVE_31_BIT,
377             'utf8', 0x1000000000,
378             (isASCII) ? 1 : 7,
379         ];
380     if (! isASCII) {
381         push @tests,   # These could falsely show wrongly in a naive
382                        # implementation
383             [ "requires at least 32 bits",
384                 I8_to_native(
385                     "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
386                 $::UTF8_DISALLOW_ABOVE_31_BIT,
387                 'utf8', 0x800000000,
388                 7,
389             ],
390             [ "requires at least 32 bits",
391                 I8_to_native(
392                     "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
393                 $::UTF8_DISALLOW_ABOVE_31_BIT,
394                 'utf8', 0x10000000000,
395                 6,
396             ],
397             [ "requires at least 32 bits",
398                 I8_to_native(
399                     "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
400                 $::UTF8_DISALLOW_ABOVE_31_BIT,
401                 'utf8', 0x200000000000,
402                 5,
403             ],
404             [ "requires at least 32 bits",
405                 I8_to_native(
406                     "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
407                 $::UTF8_DISALLOW_ABOVE_31_BIT,
408                 'utf8', 0x4000000000000,
409                 4,
410             ],
411             [ "requires at least 32 bits",
412                 I8_to_native(
413                     "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
414                 $::UTF8_DISALLOW_ABOVE_31_BIT,
415                 'utf8', 0x80000000000000,
416                 3,
417             ],
418             [ "requires at least 32 bits",
419                 I8_to_native(
420                     "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
421                 $::UTF8_DISALLOW_ABOVE_31_BIT,
422                 'utf8', 0x1000000000000000,
423                 2,
424             ];
425     }
426 }
427
428 # This test is split into this number of files.
429 my $num_test_files = $ENV{TEST_JOBS} || 1;
430 $num_test_files = 10 if $num_test_files > 10;
431
432 my $test_count = -1;
433 foreach my $test (@tests) {
434     $test_count++;
435     next if $test_count % $num_test_files != $::TEST_CHUNK;
436
437     my ($testname, $bytes, $disallow_flags,
438         $category, $allowed_uv, $needed_to_discern_len
439        ) = @$test;
440
441     my $length = length $bytes;
442     my $will_overflow = $allowed_uv < 0;
443
444     # The convention is that the got flag is the same value as the disallow
445     # one, and the warn flag is the next bit over.  If this were violated, the
446     # tests here should start failing.  We could do an eval under no strict to
447     # be sure.
448     my $expected_error_flags = $disallow_flags;
449     my $warn_flags = $disallow_flags << 1;
450
451     my $message;
452     if ($allowed_uv > 0x7FFFFFFF) {
453         $message = nonportable_regex($allowed_uv);
454     }
455     elsif ($allowed_uv > 0x10FFFF) {
456         $message = qr/(not Unicode|for a non-Unicode code point).* may not be portable/;
457     }
458     elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
459         $message = qr/surrogate/;
460     }
461     elsif (   ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
462            || ($allowed_uv & 0xFFFE) == 0xFFFE)
463     {
464         $message = qr/Unicode non-character.*is not recommended for open interchange/;
465     }
466     elsif ($will_overflow) {
467         $message = qr/overflows/;
468     }
469     else {
470         die "Can't figure out what type of warning to test for $testname"
471     }
472
473     {
474         use warnings;
475         undef @warnings;
476         my $ret = test_isUTF8_CHAR($bytes, $length);
477         my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
478         if ($will_overflow) {
479             is($ret, 0, "isUTF8_CHAR() $testname: returns 0");
480             is($ret_flags, 0, "isUTF8_CHAR_flags() $testname: returns 0");
481         }
482         else {
483             is($ret, $length,
484                "isUTF8_CHAR() $testname: returns expected length: $length");
485             is($ret_flags, $length, "isUTF8_CHAR_flags(...,0) $testname:"
486                                   . " returns expected length: $length");
487         }
488         is(scalar @warnings, 0,
489                 "isUTF8_CHAR() and isUTF8_CHAR()_flags $testname: generated"
490               . " no warnings")
491           or output_warnings(@warnings);
492
493         undef @warnings;
494         $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
495         if ($will_overflow) {
496             is($ret, 0, "isSTRICT_UTF8_CHAR() $testname: returns 0");
497         }
498         else {
499             my $expected_ret = (   $testname =~ /surrogate|non-character/
500                                 || $allowed_uv > 0x10FFFF)
501                                ? 0
502                                : $length;
503             is($ret, $expected_ret, "isSTRICT_UTF8_CHAR() $testname: returns"
504                                   . " expected length: $expected_ret");
505             $ret = test_isUTF8_CHAR_flags($bytes, $length,
506                                           $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
507             is($ret, $expected_ret,
508                             "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')"
509                           . " acts like isSTRICT_UTF8_CHAR");
510         }
511         is(scalar @warnings, 0,
512                 "isSTRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:"
513               . " generated no warnings")
514           or output_warnings(@warnings);
515
516         undef @warnings;
517         $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
518         if ($will_overflow) {
519             is($ret, 0, "isC9_STRICT_UTF8_CHAR() $testname: returns 0");
520         }
521         else {
522             my $expected_ret = (   $testname =~ /surrogate/
523                                 || $allowed_uv > 0x10FFFF)
524                                ? 0
525                                : $length;
526             is($ret, $expected_ret, "isC9_STRICT_UTF8_CHAR() $testname:"
527                                    ." returns expected length: $expected_ret");
528             $ret = test_isUTF8_CHAR_flags($bytes, $length,
529                                           $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
530             is($ret, $expected_ret,
531                           "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')"
532                         . " acts like isC9_STRICT_UTF8_CHAR");
533         }
534         is(scalar @warnings, 0,
535                 "isC9_STRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:"
536               . " generated no warnings")
537           or output_warnings(@warnings);
538
539         # Test partial character handling, for each byte not a full character
540         for my $j (1.. $length - 1) {
541
542             # Skip the test for the interaction between overflow and above-31
543             # bit.  It is really testing other things than the partial
544             # character tests, for which other tests in this file are
545             # sufficient
546             last if $will_overflow;
547
548             foreach my $disallow_flag (0, $disallow_flags) {
549                 my $partial = substr($bytes, 0, $j);
550                 my $ret_should_be;
551                 my $comment;
552                 if ($disallow_flag) {
553                     $ret_should_be = 0;
554                     $comment = "disallowed";
555                     if ($j < $needed_to_discern_len) {
556                         $ret_should_be = 1;
557                         $comment .= ", but need $needed_to_discern_len bytes"
558                                  .  " to discern:";
559                     }
560                 }
561                 else {
562                     $ret_should_be = 1;
563                     $comment = "allowed";
564                 }
565
566                 undef @warnings;
567
568                 $ret = test_is_utf8_valid_partial_char_flags($partial, $j,
569                                                              $disallow_flag);
570                 is($ret, $ret_should_be,
571                                 "$testname: is_utf8_valid_partial_char_flags("
572                                         . display_bytes($partial)
573                                         . "), $comment: returns $ret_should_be");
574                 is(scalar @warnings, 0,
575                         "$testname: is_utf8_valid_partial_char_flags()"
576                       . " generated no warnings")
577                   or output_warnings(@warnings);
578             }
579         }
580     }
581
582     # This is more complicated than the malformations tested earlier, as there
583     # are several orthogonal variables involved.  We test all the subclasses
584     # of utf8 warnings to verify they work with and without the utf8 class,
585     # and don't have effects on other sublass warnings
586     foreach my $warning ('utf8', 'surrogate', 'nonchar', 'non_unicode') {
587       foreach my $warn_flag (0, $warn_flags) {
588         foreach my $disallow_flag (0, $disallow_flags) {
589           foreach my $do_warning (0, 1) {
590
591             # We try each of the above with various combinations of
592             # malformations that can occur on the same input sequence.
593             foreach my $short ("", "short") {
594               foreach my $unexpected_noncont ("",
595                                               "unexpected non-continuation")
596               {
597                 foreach my $overlong ("", "overlong") {
598
599                     # If we're creating an overlong, it can't be longer than
600                     # the maximum length, so skip if we're already at that
601                     # length.
602                     next if $overlong && $length >= $::max_bytes;
603
604                     my @malformations;
605                     my @expected_errors;
606                     push @malformations, $short if $short;
607                     push @malformations, $unexpected_noncont
608                                                       if $unexpected_noncont;
609                     push @malformations, $overlong if $overlong;
610
611                     # The overflow malformation test in the input
612                     # array is coerced into being treated like one of
613                     # the others.
614                     if ($will_overflow) {
615                         push @malformations, 'overflow';
616                         push @expected_errors, $::UTF8_GOT_OVERFLOW;
617                     }
618
619                     my $malformations_name = join "/", @malformations;
620                     $malformations_name .= " malformation"
621                                                 if $malformations_name;
622                     $malformations_name .= "s" if @malformations > 1;
623                     my $this_bytes = $bytes;
624                     my $this_length = $length;
625                     my $expected_uv = $allowed_uv;
626                     my $this_expected_len = $length;
627                     my $this_needed_to_discern_len = $needed_to_discern_len;
628                     if ($malformations_name) {
629                         $expected_uv = 0;
630
631                         # Coerce the input into the desired
632                         # malformation
633                         if ($malformations_name =~ /overlong/) {
634
635                             # For an overlong, we convert the original
636                             # start byte into a continuation byte with
637                             # the same data bits as originally. ...
638                             substr($this_bytes, 0, 1)
639                                 = start_byte_to_cont(substr($this_bytes,
640                                                             0, 1));
641
642                             # ... Then we prepend it with a known
643                             # overlong sequence.  This should evaluate
644                             # to the exact same code point as the
645                             # original.
646                             $this_bytes
647                             = I8_to_native("\xff")
648                             . (I8_to_native(chr $::lowest_continuation)
649                             x ( $::max_bytes - 1 - length($this_bytes)))
650                             . $this_bytes;
651                             $this_length = length($this_bytes);
652                             $this_needed_to_discern_len
653                                  = $::max_bytes - ($this_expected_len
654                                                - $this_needed_to_discern_len);
655                             $this_expected_len = $::max_bytes;
656                             push @expected_errors, $::UTF8_GOT_LONG;
657                         }
658                         if ($malformations_name =~ /short/) {
659
660                             # Just tell the test to not look far
661                             # enough into the input.
662                             $this_length--;
663                             $this_expected_len--;
664                             push @expected_errors, $::UTF8_GOT_SHORT;
665                         }
666                         if ($malformations_name
667                                                 =~ /non-continuation/)
668                         {
669                             # Change the final continuation byte into
670                             # a non one.
671                             my $pos = ($short) ? -2 : -1;
672                             substr($this_bytes, $pos, 1) = '?';
673                             $this_expected_len--;
674                             push @expected_errors,
675                                             $::UTF8_GOT_NON_CONTINUATION;
676                         }
677                     }
678
679                     my $eval_warn = $do_warning
680                                 ? "use warnings '$warning'"
681                                 : $warning eq "utf8"
682                                     ? "no warnings 'utf8'"
683                                     : ( "use warnings 'utf8';"
684                                     . " no warnings '$warning'");
685
686                     # Is effectively disallowed if we've set up a
687                     # malformation, even if the flag indicates it is
688                     # allowed.  Fix up test name to indicate this as
689                     # well
690                     my $disallowed = $disallow_flag
691                                 || $malformations_name;
692                     my $this_name = "utf8n_to_uvchr_error() $testname: "
693                                                 . (($disallow_flag)
694                                                 ? 'disallowed'
695                                                 : $disallowed
696                                                     ? $disallowed
697                                                     : 'allowed');
698                     $this_name .= ", $eval_warn";
699                     $this_name .= ", " . (($warn_flag)
700                                         ? 'with warning flag'
701                                         : 'no warning flag');
702
703                     undef @warnings;
704                     my $ret_ref;
705                     my $display_bytes = display_bytes($this_bytes);
706                     my $call = "    Call was: $eval_warn; \$ret_ref"
707                             . " = test_utf8n_to_uvchr_error("
708                             . "'$display_bytes', $this_length,"
709                             . "$warn_flag"
710                             . "|$disallow_flag)";
711                     my $eval_text =      "$eval_warn; \$ret_ref"
712                             . " = test_utf8n_to_uvchr_error("
713                             . "'$this_bytes',"
714                             . " $this_length, $warn_flag"
715                             . "|$disallow_flag)";
716                     eval "$eval_text";
717                     if (! ok ("$@ eq ''",
718                         "$this_name: eval succeeded"))
719                     {
720                         diag "\$!='$!'; eval'd=\"$call\"";
721                         next;
722                     }
723                     if ($disallowed) {
724                         is($ret_ref->[0], 0, "$this_name: Returns 0")
725                           or diag $call;
726                     }
727                     else {
728                         is($ret_ref->[0], $expected_uv,
729                                 "$this_name: Returns expected uv: "
730                                 . sprintf("0x%04X", $expected_uv))
731                           or diag $call;
732                     }
733                     is($ret_ref->[1], $this_expected_len,
734                                         "$this_name: Returns expected length:"
735                                       . " $this_expected_len")
736                       or diag $call;
737
738                     my $errors = $ret_ref->[2];
739
740                     for (my $i = @expected_errors - 1; $i >= 0; $i--) {
741                         if (ok($expected_errors[$i] & $errors,
742                             "Expected and got error bit return"
743                             . " for $malformations[$i] malformation"))
744                         {
745                             $errors &= ~$expected_errors[$i];
746                         }
747                         splice @expected_errors, $i, 1;
748                     }
749                     is(scalar @expected_errors, 0,
750                             "Got all the expected malformation errors")
751                       or diag Dumper \@expected_errors;
752
753                     if (   $this_expected_len >= $this_needed_to_discern_len
754                         && ($warn_flag || $disallow_flag))
755                     {
756                         is($errors, $expected_error_flags,
757                                 "Got the correct error flag")
758                           or diag $call;
759                     }
760                     else {
761                         is($errors, 0, "Got no other error flag");
762                     }
763
764                     if (@malformations) {
765                         if (! $do_warning && $warning eq 'utf8') {
766                             goto no_warnings_expected;
767                         }
768
769                         # Check that each malformation generates a
770                         # warning, removing that warning if found
771                     MALFORMATION:
772                         foreach my $malformation (@malformations) {
773                             foreach (my $i = 0; $i < @warnings; $i++) {
774                                 if ($warnings[$i] =~ /$malformation/) {
775                                     pass("Expected and got"
776                                     . "'$malformation' warning");
777                                     splice @warnings, $i, 1;
778                                     next MALFORMATION;
779                                 }
780                             }
781                             fail("Expected '$malformation' warning"
782                             . " but didn't get it");
783
784                         }
785                     }
786
787                     # Any overflow will override any super or above-31
788                     # warnings.
789                     goto no_warnings_expected
790                                 if $will_overflow || $this_expected_len
791                                         < $this_needed_to_discern_len;
792
793                     if (    ! $do_warning
794                         && (   $warning eq 'utf8'
795                             || $warning eq $category))
796                     {
797                         goto no_warnings_expected;
798                     }
799                     elsif ($warn_flag) {
800                         if (is(scalar @warnings, 1,
801                             "$this_name: Got a single warning "))
802                         {
803                             like($warnings[0], $message,
804                                     "$this_name: Got expected warning")
805                                 or diag $call;
806                         }
807                         else {
808                             diag $call;
809                             if (scalar @warnings) {
810                                 output_warnings(@warnings);
811                             }
812                         }
813                     }
814                     else {
815                     no_warnings_expected:
816                         unless (is(scalar @warnings, 0,
817                                 "$this_name: Got no warnings"))
818                         {
819                             diag $call;
820                             output_warnings(@warnings);
821                         }
822                     }
823
824                     # Check CHECK_ONLY results when the input is
825                     # disallowed.  Do this when actually disallowed,
826                     # not just when the $disallow_flag is set
827                     if ($disallowed) {
828                         undef @warnings;
829                         $ret_ref = test_utf8n_to_uvchr_error(
830                                     $this_bytes, $this_length,
831                                     $disallow_flag|$::UTF8_CHECK_ONLY);
832                         is($ret_ref->[0], 0,
833                                         "$this_name, CHECK_ONLY: Returns 0")
834                           or diag $call;
835                         is($ret_ref->[1], -1,
836                             "$this_name: CHECK_ONLY: returns -1 for length")
837                           or diag $call;
838                         if (! is(scalar @warnings, 0,
839                             "$this_name, CHECK_ONLY: no warnings"
840                         . " generated"))
841                         {
842                             diag $call;
843                             output_warnings(@warnings);
844                         }
845                     }
846
847                     # Now repeat some of the above, but for
848                     # uvchr_to_utf8_flags().  Since this comes from an
849                     # existing code point, it hasn't overflowed, and
850                     # isn't malformed.
851                     next if @malformations;
852
853                     # The warning and disallow flags passed in are for
854                     # utf8n_to_uvchr_error().  Convert them for
855                     # uvchr_to_utf8_flags().
856                     my $uvchr_warn_flag = 0;
857                     my $uvchr_disallow_flag = 0;
858                     if ($warn_flag) {
859                         if ($warn_flag == $::UTF8_WARN_SURROGATE) {
860                             $uvchr_warn_flag = $::UNICODE_WARN_SURROGATE
861                         }
862                         elsif ($warn_flag == $::UTF8_WARN_NONCHAR) {
863                             $uvchr_warn_flag = $::UNICODE_WARN_NONCHAR
864                         }
865                         elsif ($warn_flag == $::UTF8_WARN_SUPER) {
866                             $uvchr_warn_flag = $::UNICODE_WARN_SUPER
867                         }
868                         elsif ($warn_flag == $::UTF8_WARN_ABOVE_31_BIT) {
869                             $uvchr_warn_flag
870                                         = $::UNICODE_WARN_ABOVE_31_BIT;
871                         }
872                         else {
873                             fail(sprintf "Unexpected warn flag: %x",
874                                 $warn_flag);
875                             next;
876                         }
877                     }
878                     if ($disallow_flag) {
879                         if ($disallow_flag == $::UTF8_DISALLOW_SURROGATE)
880                         {
881                             $uvchr_disallow_flag
882                                         = $::UNICODE_DISALLOW_SURROGATE;
883                         }
884                         elsif ($disallow_flag == $::UTF8_DISALLOW_NONCHAR)
885                         {
886                             $uvchr_disallow_flag
887                                         = $::UNICODE_DISALLOW_NONCHAR;
888                         }
889                         elsif ($disallow_flag == $::UTF8_DISALLOW_SUPER) {
890                             $uvchr_disallow_flag
891                                         = $::UNICODE_DISALLOW_SUPER;
892                         }
893                         elsif ($disallow_flag
894                                         == $::UTF8_DISALLOW_ABOVE_31_BIT)
895                         {
896                             $uvchr_disallow_flag =
897                                         $::UNICODE_DISALLOW_ABOVE_31_BIT;
898                         }
899                         else {
900                             fail(sprintf "Unexpected disallow flag: %x",
901                                 $disallow_flag);
902                             next;
903                         }
904                     }
905
906                     $disallowed = $uvchr_disallow_flag;
907
908                     $this_name = "uvchr_to_utf8_flags() $testname: "
909                                             . (($uvchr_disallow_flag)
910                                                 ? 'disallowed'
911                                                 : ($disallowed)
912                                                 ? 'ABOVE_31_BIT allowed'
913                                                 : 'allowed');
914                     $this_name .= ", $eval_warn";
915                     $this_name .= ", " . (($uvchr_warn_flag)
916                                         ? 'with warning flag'
917                                         : 'no warning flag');
918
919                     undef @warnings;
920                     my $ret;
921                     my $warn_flag = sprintf "0x%x", $uvchr_warn_flag;
922                     my $disallow_flag = sprintf "0x%x",
923                                                 $uvchr_disallow_flag;
924                     $call = sprintf("    Call was: $eval_warn; \$ret"
925                                 . " = test_uvchr_to_utf8_flags("
926                                 . " 0x%x, $warn_flag|$disallow_flag)",
927                                 $allowed_uv);
928                     $eval_text = "$eval_warn; \$ret ="
929                             . " test_uvchr_to_utf8_flags("
930                             . "$allowed_uv, $warn_flag|"
931                             . "$disallow_flag)";
932                     eval "$eval_text";
933                     if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
934                     {
935                         diag "\$!='$!'; eval'd=\"$eval_text\"";
936                         next;
937                     }
938                     if ($disallowed) {
939                         is($ret, undef, "$this_name: Returns undef")
940                           or diag $call;
941                     }
942                     else {
943                         is($ret, $bytes, "$this_name: Returns expected string")
944                           or diag $call;
945                     }
946                     if (! $do_warning
947                         && ($warning eq 'utf8' || $warning eq $category))
948                     {
949                         if (!is(scalar @warnings, 0,
950                                 "$this_name: No warnings generated"))
951                         {
952                             diag $call;
953                             output_warnings(@warnings);
954                         }
955                     }
956                     elsif (       $uvchr_warn_flag
957                         && (   $warning eq 'utf8'
958                             || $warning eq $category))
959                     {
960                         if (is(scalar @warnings, 1,
961                             "$this_name: Got a single warning "))
962                         {
963                             like($warnings[0], $message,
964                                     "$this_name: Got expected warning")
965                                 or diag $call;
966                         }
967                         else {
968                             diag $call;
969                             output_warnings(@warnings)
970                                                 if scalar @warnings;
971                         }
972                     }
973                 }
974               }
975             }
976           }
977         }
978       }
979     }
980 }
981
982 done_testing;