This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
68d60f6a2e52f248ec763520f387e5d31031dad6
[perl5.git] / ext / XS-APItest / t / utf8_warn_base.pl
1 #!perl -w
2
3 # This is a base file to be used by various .t's in its directory
4
5 use strict;
6 use Test::More;
7
8 BEGIN {
9     use_ok('XS::APItest');
10     require 'charset_tools.pl';
11     require './t/utf8_setup.pl';
12 };
13
14 $|=1;
15
16 no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
17                           # machines, and that is tested elsewhere
18
19 use XS::APItest;
20 use Data::Dumper;
21
22 my @warnings;
23
24 use warnings 'utf8';
25 local $SIG{__WARN__} = sub { push @warnings, @_ };
26
27 sub nonportable_regex ($) {
28
29     # Returns a pattern that matches the non-portable message raised either
30     # for the specific input code point, or the one generated when there
31     # is some malformation that precludes the message containing the specific
32     # code point
33
34     my $code_point = shift;
35
36     my $string = sprintf '(Code point 0x%X is not Unicode, and'
37                        . '|Any UTF-8 sequence that starts with'
38                        . ' "(\\\x[[:xdigit:]]{2})+" is for a'
39                        . ' non-Unicode code point, and is) not portable',
40                     $code_point;
41     return qr/$string/;
42 }
43
44 # Now test the cases where a legal code point is generated, but may or may not
45 # be allowed/warned on.
46 my @tests = (
47      # ($testname, $bytes, $warn_flags, $disallow_flags, $expected_error_flags,
48      #  $category, $allowed_uv, $needed_to_discern_len )
49     [ "lowest surrogate",
50         (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
51         $::UTF8_WARN_SURROGATE, $::UTF8_DISALLOW_SURROGATE, $::UTF8_GOT_SURROGATE,
52         'surrogate', 0xD800,
53         2,
54     ],
55     [ "a middle surrogate",
56         (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
57         $::UTF8_WARN_SURROGATE, $::UTF8_DISALLOW_SURROGATE, $::UTF8_GOT_SURROGATE,
58         'surrogate', 0xD90D,
59         2,
60     ],
61     [ "highest surrogate",
62         (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
63         $::UTF8_WARN_SURROGATE, $::UTF8_DISALLOW_SURROGATE, $::UTF8_GOT_SURROGATE,
64         'surrogate', 0xDFFF,
65         2,
66     ],
67     [ "first non_unicode",
68         (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
69         $::UTF8_WARN_SUPER, $::UTF8_DISALLOW_SUPER, $::UTF8_GOT_SUPER,
70         'non_unicode', 0x110000,
71         2,
72     ],
73     [ "non_unicode whose first byte tells that",
74         (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
75         $::UTF8_WARN_SUPER, $::UTF8_DISALLOW_SUPER, $::UTF8_GOT_SUPER,
76         'non_unicode',
77         (isASCII) ? 0x140000 : 0x200000,
78         1,
79     ],
80     [ "first of 32 consecutive non-character code points",
81         (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
82         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
83         'nonchar', 0xFDD0,
84         (isASCII) ? 3 : 4,
85     ],
86     [ "a mid non-character code point of the 32 consecutive ones",
87         (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
88         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
89         'nonchar', 0xFDE0,
90         (isASCII) ? 3 : 4,
91     ],
92     [ "final of 32 consecutive non-character code points",
93         (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
94         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
95         'nonchar', 0xFDEF,
96         (isASCII) ? 3 : 4,
97     ],
98     [ "non-character code point U+FFFE",
99         (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
100         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
101         'nonchar', 0xFFFE,
102         (isASCII) ? 3 : 4,
103     ],
104     [ "non-character code point U+FFFF",
105         (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
106         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
107         'nonchar', 0xFFFF,
108         (isASCII) ? 3 : 4,
109     ],
110     [ "non-character code point U+1FFFE",
111         (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
112         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
113         'nonchar', 0x1FFFE,
114         4,
115     ],
116     [ "non-character code point U+1FFFF",
117         (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
118         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
119         'nonchar', 0x1FFFF,
120         4,
121     ],
122     [ "non-character code point U+2FFFE",
123         (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
124         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
125         'nonchar', 0x2FFFE,
126         4,
127     ],
128     [ "non-character code point U+2FFFF",
129         (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
130         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
131         'nonchar', 0x2FFFF,
132         4,
133     ],
134     [ "non-character code point U+3FFFE",
135         (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
136         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
137         'nonchar', 0x3FFFE,
138         4,
139     ],
140     [ "non-character code point U+3FFFF",
141         (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
142         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
143         'nonchar', 0x3FFFF,
144         4,
145     ],
146     [ "non-character code point U+4FFFE",
147         (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
148         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
149         'nonchar', 0x4FFFE,
150         (isASCII) ? 4 : 5,
151     ],
152     [ "non-character code point U+4FFFF",
153         (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
154         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
155         'nonchar', 0x4FFFF,
156         (isASCII) ? 4 : 5,
157     ],
158     [ "non-character code point U+5FFFE",
159         (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
160         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
161         'nonchar', 0x5FFFE,
162         (isASCII) ? 4 : 5,
163     ],
164     [ "non-character code point U+5FFFF",
165         (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
166         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
167         'nonchar', 0x5FFFF,
168         (isASCII) ? 4 : 5,
169     ],
170     [ "non-character code point U+6FFFE",
171         (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
172         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
173         'nonchar', 0x6FFFE,
174         (isASCII) ? 4 : 5,
175     ],
176     [ "non-character code point U+6FFFF",
177         (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
178         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
179         'nonchar', 0x6FFFF,
180         (isASCII) ? 4 : 5,
181     ],
182     [ "non-character code point U+7FFFE",
183         (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
184         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
185         'nonchar', 0x7FFFE,
186         (isASCII) ? 4 : 5,
187     ],
188     [ "non-character code point U+7FFFF",
189         (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
190         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
191         'nonchar', 0x7FFFF,
192         (isASCII) ? 4 : 5,
193     ],
194     [ "non-character code point U+8FFFE",
195         (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
196         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
197         'nonchar', 0x8FFFE,
198         (isASCII) ? 4 : 5,
199     ],
200     [ "non-character code point U+8FFFF",
201         (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
202         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
203         'nonchar', 0x8FFFF,
204         (isASCII) ? 4 : 5,
205     ],
206     [ "non-character code point U+9FFFE",
207         (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
208         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
209         'nonchar', 0x9FFFE,
210         (isASCII) ? 4 : 5,
211     ],
212     [ "non-character code point U+9FFFF",
213         (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
214         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
215         'nonchar', 0x9FFFF,
216         (isASCII) ? 4 : 5,
217     ],
218     [ "non-character code point U+AFFFE",
219         (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
220         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
221         'nonchar', 0xAFFFE,
222         (isASCII) ? 4 : 5,
223     ],
224     [ "non-character code point U+AFFFF",
225         (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
226         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
227         'nonchar', 0xAFFFF,
228         (isASCII) ? 4 : 5,
229     ],
230     [ "non-character code point U+BFFFE",
231         (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
232         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
233         'nonchar', 0xBFFFE,
234         (isASCII) ? 4 : 5,
235     ],
236     [ "non-character code point U+BFFFF",
237         (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
238         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
239         'nonchar', 0xBFFFF,
240         (isASCII) ? 4 : 5,
241     ],
242     [ "non-character code point U+CFFFE",
243         (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
244         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
245         'nonchar', 0xCFFFE,
246         (isASCII) ? 4 : 5,
247     ],
248     [ "non-character code point U+CFFFF",
249         (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
250         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
251         'nonchar', 0xCFFFF,
252         (isASCII) ? 4 : 5,
253     ],
254     [ "non-character code point U+DFFFE",
255         (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
256         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
257         'nonchar', 0xDFFFE,
258         (isASCII) ? 4 : 5,
259     ],
260     [ "non-character code point U+DFFFF",
261         (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
262         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
263         'nonchar', 0xDFFFF,
264         (isASCII) ? 4 : 5,
265     ],
266     [ "non-character code point U+EFFFE",
267         (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
268         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
269         'nonchar', 0xEFFFE,
270         (isASCII) ? 4 : 5,
271     ],
272     [ "non-character code point U+EFFFF",
273         (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
274         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
275         'nonchar', 0xEFFFF,
276         (isASCII) ? 4 : 5,
277     ],
278     [ "non-character code point U+FFFFE",
279         (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
280         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
281         'nonchar', 0xFFFFE,
282         (isASCII) ? 4 : 5,
283     ],
284     [ "non-character code point U+FFFFF",
285         (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
286         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
287         'nonchar', 0xFFFFF,
288         (isASCII) ? 4 : 5,
289     ],
290     [ "non-character code point U+10FFFE",
291         (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
292         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
293         'nonchar', 0x10FFFE,
294         (isASCII) ? 4 : 5,
295     ],
296     [ "non-character code point U+10FFFF",
297         (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
298         $::UTF8_WARN_NONCHAR, $::UTF8_DISALLOW_NONCHAR, $::UTF8_GOT_NONCHAR,
299         'nonchar', 0x10FFFF,
300         (isASCII) ? 4 : 5,
301     ],
302     [ "requires at least 32 bits",
303         (isASCII)
304          ?  "\xfe\x82\x80\x80\x80\x80\x80"
305          : I8_to_native(
306             "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
307         # This code point is chosen so that it is representable in a UV on
308         # 32-bit machines
309         $::UTF8_WARN_ABOVE_31_BIT, $::UTF8_DISALLOW_ABOVE_31_BIT,
310         $::UTF8_GOT_ABOVE_31_BIT,
311         'utf8', 0x80000000,
312         (isASCII) ? 1 : 8,
313     ],
314     [ "highest 32 bit code point",
315         (isASCII)
316          ?  "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
317          : I8_to_native(
318             "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
319         $::UTF8_WARN_ABOVE_31_BIT, $::UTF8_DISALLOW_ABOVE_31_BIT,
320         $::UTF8_GOT_ABOVE_31_BIT,
321         'utf8', 0xFFFFFFFF,
322         (isASCII) ? 1 : 8,
323     ],
324     [ "requires at least 32 bits, and use SUPER-type flags, instead of"
325     . " ABOVE_31_BIT",
326         (isASCII)
327          ? "\xfe\x82\x80\x80\x80\x80\x80"
328          : I8_to_native(
329            "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
330         $::UTF8_WARN_SUPER, $::UTF8_DISALLOW_SUPER, $::UTF8_GOT_SUPER,
331         'utf8', 0x80000000,
332         1,
333     ],
334     [ "overflow with warnings/disallow for more than 31 bits",
335         # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT
336         # with overflow.  The overflow malformation is never allowed, so
337         # preventing it takes precedence if the ABOVE_31_BIT options would
338         # otherwise allow in an overflowing value.  The ASCII code points (1
339         # for 32-bits; 1 for 64) were chosen because the old overflow
340         # detection algorithm did not catch them; this means this test also
341         # checks for that fix.  The EBCDIC are arbitrary overflowing ones
342         # since we have no reports of failures with it.
343        (($::is64bit)
344         ? ((isASCII)
345            ?    "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
346            : I8_to_native(
347                 "\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"))
348         : ((isASCII)
349            ?    "\xfe\x86\x80\x80\x80\x80\x80"
350            : I8_to_native(
351                 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))),
352         $::UTF8_WARN_ABOVE_31_BIT,
353         $::UTF8_DISALLOW_ABOVE_31_BIT,
354         $::UTF8_GOT_ABOVE_31_BIT,
355         'utf8', -1,
356         (isASCII || $::is64bit) ? 2 : 8,
357     ],
358 );
359
360 if (! $::is64bit) {
361     if (isASCII) {
362         no warnings qw{portable overflow};
363         push @tests,
364             [ "Lowest 33 bit code point: overflow",
365                 "\xFE\x84\x80\x80\x80\x80\x80",
366                 $::UTF8_WARN_ABOVE_31_BIT, $::UTF8_DISALLOW_ABOVE_31_BIT,
367                 $::UTF8_GOT_ABOVE_31_BIT,
368                 'utf8', -1,
369                 1,
370             ];
371     }
372 }
373 else {
374     no warnings qw{portable overflow};
375     push @tests,
376         [ "More than 32 bits",
377             (isASCII)
378             ?       "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
379             : I8_to_native(
380                     "\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
381             $::UTF8_WARN_ABOVE_31_BIT, $::UTF8_DISALLOW_ABOVE_31_BIT,
382             $::UTF8_GOT_ABOVE_31_BIT,
383             'utf8', 0x1000000000,
384             (isASCII) ? 1 : 7,
385         ];
386     if (! isASCII) {
387         push @tests,   # These could falsely show wrongly in a naive
388                        # implementation
389             [ "requires at least 32 bits",
390                 I8_to_native(
391                     "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
392                 $::UTF8_WARN_ABOVE_31_BIT,$::UTF8_DISALLOW_ABOVE_31_BIT,
393                 $::UTF8_GOT_ABOVE_31_BIT,
394                 'utf8', 0x800000000,
395                 7,
396             ],
397             [ "requires at least 32 bits",
398                 I8_to_native(
399                     "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
400                 $::UTF8_WARN_ABOVE_31_BIT,$::UTF8_DISALLOW_ABOVE_31_BIT,
401                 $::UTF8_GOT_ABOVE_31_BIT,
402                 'utf8', 0x10000000000,
403                 6,
404             ],
405             [ "requires at least 32 bits",
406                 I8_to_native(
407                     "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
408                 $::UTF8_WARN_ABOVE_31_BIT,$::UTF8_DISALLOW_ABOVE_31_BIT,
409                 $::UTF8_GOT_ABOVE_31_BIT,
410                 'utf8', 0x200000000000,
411                 5,
412             ],
413             [ "requires at least 32 bits",
414                 I8_to_native(
415                     "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
416                 $::UTF8_WARN_ABOVE_31_BIT,$::UTF8_DISALLOW_ABOVE_31_BIT,
417                 $::UTF8_GOT_ABOVE_31_BIT,
418                 'utf8', 0x4000000000000,
419                 4,
420             ],
421             [ "requires at least 32 bits",
422                 I8_to_native(
423                     "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
424                 $::UTF8_WARN_ABOVE_31_BIT,$::UTF8_DISALLOW_ABOVE_31_BIT,
425                 $::UTF8_GOT_ABOVE_31_BIT,
426                 'utf8', 0x80000000000000,
427                 3,
428             ],
429             [ "requires at least 32 bits",
430                 I8_to_native(
431                     "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
432                 $::UTF8_WARN_ABOVE_31_BIT,$::UTF8_DISALLOW_ABOVE_31_BIT,
433                 $::UTF8_GOT_ABOVE_31_BIT,
434                 'utf8', 0x1000000000000000,
435                 2,
436             ];
437     }
438 }
439
440 # This test is split into this number of files.
441 my $num_test_files = $ENV{TEST_JOBS} || 1;
442 $num_test_files = 10 if $num_test_files > 10;
443
444 my $test_count = -1;
445 foreach my $test (@tests) {
446     $test_count++;
447     next if $test_count % $num_test_files != $::TEST_CHUNK;
448
449     my ($testname, $bytes, $warn_flags, $disallow_flags, $expected_error_flags,
450         $category, $allowed_uv, $needed_to_discern_len
451        ) = @$test;
452
453     my $length = length $bytes;
454     my $will_overflow = $allowed_uv < 0;
455
456     my $message;
457     if ($allowed_uv > 0x7FFFFFFF) {
458         $message = nonportable_regex($allowed_uv);
459     }
460     elsif ($allowed_uv > 0x10FFFF) {
461         $message = qr/(not Unicode|for a non-Unicode code point).* may not be portable/;
462     }
463     elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
464         $message = qr/surrogate/;
465     }
466     elsif (   ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
467            || ($allowed_uv & 0xFFFE) == 0xFFFE)
468     {
469         $message = qr/Unicode non-character.*is not recommended for open interchange/;
470     }
471     elsif ($will_overflow) {
472         $message = qr/overflows/;
473     }
474     else {
475         die "Can't figure out what type of warning to test for $testname"
476     }
477
478     {
479         use warnings;
480         undef @warnings;
481         my $ret = test_isUTF8_CHAR($bytes, $length);
482         my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
483         if ($will_overflow) {
484             is($ret, 0, "isUTF8_CHAR() $testname: returns 0");
485             is($ret_flags, 0, "isUTF8_CHAR_flags() $testname: returns 0");
486         }
487         else {
488             is($ret, $length,
489                "isUTF8_CHAR() $testname: returns expected length: $length");
490             is($ret_flags, $length, "isUTF8_CHAR_flags(...,0) $testname:"
491                                   . " returns expected length: $length");
492         }
493         is(scalar @warnings, 0,
494                 "isUTF8_CHAR() and isUTF8_CHAR()_flags $testname: generated"
495               . " no warnings")
496           or output_warnings(@warnings);
497
498         undef @warnings;
499         $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
500         if ($will_overflow) {
501             is($ret, 0, "isSTRICT_UTF8_CHAR() $testname: returns 0");
502         }
503         else {
504             my $expected_ret = (   $testname =~ /surrogate|non-character/
505                                 || $allowed_uv > 0x10FFFF)
506                                ? 0
507                                : $length;
508             is($ret, $expected_ret, "isSTRICT_UTF8_CHAR() $testname: returns"
509                                   . " expected length: $expected_ret");
510             $ret = test_isUTF8_CHAR_flags($bytes, $length,
511                                           $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
512             is($ret, $expected_ret,
513                             "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')"
514                           . " acts like isSTRICT_UTF8_CHAR");
515         }
516         is(scalar @warnings, 0,
517                 "isSTRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:"
518               . " generated no warnings")
519           or output_warnings(@warnings);
520
521         undef @warnings;
522         $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
523         if ($will_overflow) {
524             is($ret, 0, "isC9_STRICT_UTF8_CHAR() $testname: returns 0");
525         }
526         else {
527             my $expected_ret = (   $testname =~ /surrogate/
528                                 || $allowed_uv > 0x10FFFF)
529                                ? 0
530                                : $length;
531             is($ret, $expected_ret, "isC9_STRICT_UTF8_CHAR() $testname:"
532                                    ." returns expected length: $expected_ret");
533             $ret = test_isUTF8_CHAR_flags($bytes, $length,
534                                           $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
535             is($ret, $expected_ret,
536                           "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')"
537                         . " acts like isC9_STRICT_UTF8_CHAR");
538         }
539         is(scalar @warnings, 0,
540                 "isC9_STRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:"
541               . " generated no warnings")
542           or output_warnings(@warnings);
543
544         # Test partial character handling, for each byte not a full character
545         for my $j (1.. $length - 1) {
546
547             # Skip the test for the interaction between overflow and above-31
548             # bit.  It is really testing other things than the partial
549             # character tests, for which other tests in this file are
550             # sufficient
551             last if $will_overflow;
552
553             foreach my $disallow_flag (0, $disallow_flags) {
554                 my $partial = substr($bytes, 0, $j);
555                 my $ret_should_be;
556                 my $comment;
557                 if ($disallow_flag) {
558                     $ret_should_be = 0;
559                     $comment = "disallowed";
560                     if ($j < $needed_to_discern_len) {
561                         $ret_should_be = 1;
562                         $comment .= ", but need $needed_to_discern_len bytes"
563                                  .  " to discern:";
564                     }
565                 }
566                 else {
567                     $ret_should_be = 1;
568                     $comment = "allowed";
569                 }
570
571                 undef @warnings;
572
573                 $ret = test_is_utf8_valid_partial_char_flags($partial, $j,
574                                                              $disallow_flag);
575                 is($ret, $ret_should_be,
576                                 "$testname: is_utf8_valid_partial_char_flags("
577                                         . display_bytes($partial)
578                                         . "), $comment: returns $ret_should_be");
579                 is(scalar @warnings, 0,
580                         "$testname: is_utf8_valid_partial_char_flags()"
581                       . " generated no warnings")
582                   or output_warnings(@warnings);
583             }
584         }
585     }
586
587     # This is more complicated than the malformations tested earlier, as there
588     # are several orthogonal variables involved.  We test all the subclasses
589     # of utf8 warnings to verify they work with and without the utf8 class,
590     # and don't have effects on other sublass warnings
591     foreach my $warning ('utf8', 'surrogate', 'nonchar', 'non_unicode') {
592       foreach my $warn_flag (0, $warn_flags) {
593         foreach my $disallow_flag (0, $disallow_flags) {
594           foreach my $do_warning (0, 1) {
595
596             # We try each of the above with various combinations of
597             # malformations that can occur on the same input sequence.
598             foreach my $short ("", "short") {
599               foreach my $unexpected_noncont ("",
600                                               "unexpected non-continuation")
601               {
602                 foreach my $overlong ("", "overlong") {
603
604                     # If we're creating an overlong, it can't be longer than
605                     # the maximum length, so skip if we're already at that
606                     # length.
607                     next if $overlong && $length >= $::max_bytes;
608
609                     my @malformations;
610                     my @expected_errors;
611                     push @malformations, $short if $short;
612                     push @malformations, $unexpected_noncont
613                                                       if $unexpected_noncont;
614                     push @malformations, $overlong if $overlong;
615
616                     # The overflow malformation test in the input
617                     # array is coerced into being treated like one of
618                     # the others.
619                     if ($will_overflow) {
620                         push @malformations, 'overflow';
621                         push @expected_errors, $::UTF8_GOT_OVERFLOW;
622                     }
623
624                     my $malformations_name = join "/", @malformations;
625                     $malformations_name .= " malformation"
626                                                 if $malformations_name;
627                     $malformations_name .= "s" if @malformations > 1;
628                     my $this_bytes = $bytes;
629                     my $this_length = $length;
630                     my $expected_uv = $allowed_uv;
631                     my $this_expected_len = $length;
632                     my $this_needed_to_discern_len = $needed_to_discern_len;
633                     if ($malformations_name) {
634                         $expected_uv = 0;
635
636                         # Coerce the input into the desired
637                         # malformation
638                         if ($malformations_name =~ /overlong/) {
639
640                             # For an overlong, we convert the original
641                             # start byte into a continuation byte with
642                             # the same data bits as originally. ...
643                             substr($this_bytes, 0, 1)
644                                 = start_byte_to_cont(substr($this_bytes,
645                                                             0, 1));
646
647                             # ... Then we prepend it with a known
648                             # overlong sequence.  This should evaluate
649                             # to the exact same code point as the
650                             # original.
651                             $this_bytes
652                             = I8_to_native("\xff")
653                             . (I8_to_native(chr $::lowest_continuation)
654                             x ( $::max_bytes - 1 - length($this_bytes)))
655                             . $this_bytes;
656                             $this_length = length($this_bytes);
657                             $this_needed_to_discern_len
658                                  = $::max_bytes - ($this_expected_len
659                                                - $this_needed_to_discern_len);
660                             $this_expected_len = $::max_bytes;
661                             push @expected_errors, $::UTF8_GOT_LONG;
662                         }
663                         if ($malformations_name =~ /short/) {
664
665                             # Just tell the test to not look far
666                             # enough into the input.
667                             $this_length--;
668                             $this_expected_len--;
669                             push @expected_errors, $::UTF8_GOT_SHORT;
670                         }
671                         if ($malformations_name
672                                                 =~ /non-continuation/)
673                         {
674                             # Change the final continuation byte into
675                             # a non one.
676                             my $pos = ($short) ? -2 : -1;
677                             substr($this_bytes, $pos, 1) = '?';
678                             $this_expected_len--;
679                             push @expected_errors,
680                                             $::UTF8_GOT_NON_CONTINUATION;
681                         }
682                     }
683
684                     my $eval_warn = $do_warning
685                                 ? "use warnings '$warning'"
686                                 : $warning eq "utf8"
687                                     ? "no warnings 'utf8'"
688                                     : ( "use warnings 'utf8';"
689                                     . " no warnings '$warning'");
690
691                     # Is effectively disallowed if we've set up a
692                     # malformation, even if the flag indicates it is
693                     # allowed.  Fix up test name to indicate this as
694                     # well
695                     my $disallowed = $disallow_flag
696                                 || $malformations_name;
697                     my $this_name = "utf8n_to_uvchr_error() $testname: "
698                                                 . (($disallow_flag)
699                                                 ? 'disallowed'
700                                                 : $disallowed
701                                                     ? $disallowed
702                                                     : 'allowed');
703                     $this_name .= ", $eval_warn";
704                     $this_name .= ", " . (($warn_flag)
705                                         ? 'with warning flag'
706                                         : 'no warning flag');
707
708                     undef @warnings;
709                     my $ret_ref;
710                     my $display_bytes = display_bytes($this_bytes);
711                     my $call = "    Call was: $eval_warn; \$ret_ref"
712                             . " = test_utf8n_to_uvchr_error("
713                             . "'$display_bytes', $this_length,"
714                             . "$warn_flag"
715                             . "|$disallow_flag)";
716                     my $eval_text =      "$eval_warn; \$ret_ref"
717                             . " = test_utf8n_to_uvchr_error("
718                             . "'$this_bytes',"
719                             . " $this_length, $warn_flag"
720                             . "|$disallow_flag)";
721                     eval "$eval_text";
722                     if (! ok ("$@ eq ''",
723                         "$this_name: eval succeeded"))
724                     {
725                         diag "\$!='$!'; eval'd=\"$call\"";
726                         next;
727                     }
728                     if ($disallowed) {
729                         is($ret_ref->[0], 0, "$this_name: Returns 0")
730                           or diag $call;
731                     }
732                     else {
733                         is($ret_ref->[0], $expected_uv,
734                                 "$this_name: Returns expected uv: "
735                                 . sprintf("0x%04X", $expected_uv))
736                           or diag $call;
737                     }
738                     is($ret_ref->[1], $this_expected_len,
739                                         "$this_name: Returns expected length:"
740                                       . " $this_expected_len")
741                       or diag $call;
742
743                     my $errors = $ret_ref->[2];
744
745                     for (my $i = @expected_errors - 1; $i >= 0; $i--) {
746                         if (ok($expected_errors[$i] & $errors,
747                             "Expected and got error bit return"
748                             . " for $malformations[$i] malformation"))
749                         {
750                             $errors &= ~$expected_errors[$i];
751                         }
752                         splice @expected_errors, $i, 1;
753                     }
754                     is(scalar @expected_errors, 0,
755                             "Got all the expected malformation errors")
756                       or diag Dumper \@expected_errors;
757
758                     if (   $this_expected_len >= $this_needed_to_discern_len
759                         && ($warn_flag || $disallow_flag))
760                     {
761                         is($errors, $expected_error_flags,
762                                 "Got the correct error flag")
763                           or diag $call;
764                     }
765                     else {
766                         is($errors, 0, "Got no other error flag");
767                     }
768
769                     if (@malformations) {
770                         if (! $do_warning && $warning eq 'utf8') {
771                             goto no_warnings_expected;
772                         }
773
774                         # Check that each malformation generates a
775                         # warning, removing that warning if found
776                     MALFORMATION:
777                         foreach my $malformation (@malformations) {
778                             foreach (my $i = 0; $i < @warnings; $i++) {
779                                 if ($warnings[$i] =~ /$malformation/) {
780                                     pass("Expected and got"
781                                     . "'$malformation' warning");
782                                     splice @warnings, $i, 1;
783                                     next MALFORMATION;
784                                 }
785                             }
786                             fail("Expected '$malformation' warning"
787                             . " but didn't get it");
788
789                         }
790                     }
791
792                     # Any overflow will override any super or above-31
793                     # warnings.
794                     goto no_warnings_expected
795                                 if $will_overflow || $this_expected_len
796                                         < $this_needed_to_discern_len;
797
798                     if (    ! $do_warning
799                         && (   $warning eq 'utf8'
800                             || $warning eq $category))
801                     {
802                         goto no_warnings_expected;
803                     }
804                     elsif ($warn_flag) {
805                         if (is(scalar @warnings, 1,
806                             "$this_name: Got a single warning "))
807                         {
808                             like($warnings[0], $message,
809                                     "$this_name: Got expected warning")
810                                 or diag $call;
811                         }
812                         else {
813                             diag $call;
814                             if (scalar @warnings) {
815                                 output_warnings(@warnings);
816                             }
817                         }
818                     }
819                     else {
820                     no_warnings_expected:
821                         unless (is(scalar @warnings, 0,
822                                 "$this_name: Got no warnings"))
823                         {
824                             diag $call;
825                             output_warnings(@warnings);
826                         }
827                     }
828
829                     # Check CHECK_ONLY results when the input is
830                     # disallowed.  Do this when actually disallowed,
831                     # not just when the $disallow_flag is set
832                     if ($disallowed) {
833                         undef @warnings;
834                         $ret_ref = test_utf8n_to_uvchr_error(
835                                     $this_bytes, $this_length,
836                                     $disallow_flag|$::UTF8_CHECK_ONLY);
837                         is($ret_ref->[0], 0,
838                                         "$this_name, CHECK_ONLY: Returns 0")
839                           or diag $call;
840                         is($ret_ref->[1], -1,
841                             "$this_name: CHECK_ONLY: returns -1 for length")
842                           or diag $call;
843                         if (! is(scalar @warnings, 0,
844                             "$this_name, CHECK_ONLY: no warnings"
845                         . " generated"))
846                         {
847                             diag $call;
848                             output_warnings(@warnings);
849                         }
850                     }
851
852                     # Now repeat some of the above, but for
853                     # uvchr_to_utf8_flags().  Since this comes from an
854                     # existing code point, it hasn't overflowed, and
855                     # isn't malformed.
856                     next if @malformations;
857
858                     # The warning and disallow flags passed in are for
859                     # utf8n_to_uvchr_error().  Convert them for
860                     # uvchr_to_utf8_flags().
861                     my $uvchr_warn_flag = 0;
862                     my $uvchr_disallow_flag = 0;
863                     if ($warn_flag) {
864                         if ($warn_flag == $::UTF8_WARN_SURROGATE) {
865                             $uvchr_warn_flag = $::UNICODE_WARN_SURROGATE
866                         }
867                         elsif ($warn_flag == $::UTF8_WARN_NONCHAR) {
868                             $uvchr_warn_flag = $::UNICODE_WARN_NONCHAR
869                         }
870                         elsif ($warn_flag == $::UTF8_WARN_SUPER) {
871                             $uvchr_warn_flag = $::UNICODE_WARN_SUPER
872                         }
873                         elsif ($warn_flag == $::UTF8_WARN_ABOVE_31_BIT) {
874                             $uvchr_warn_flag
875                                         = $::UNICODE_WARN_ABOVE_31_BIT;
876                         }
877                         else {
878                             fail(sprintf "Unexpected warn flag: %x",
879                                 $warn_flag);
880                             next;
881                         }
882                     }
883                     if ($disallow_flag) {
884                         if ($disallow_flag == $::UTF8_DISALLOW_SURROGATE)
885                         {
886                             $uvchr_disallow_flag
887                                         = $::UNICODE_DISALLOW_SURROGATE;
888                         }
889                         elsif ($disallow_flag == $::UTF8_DISALLOW_NONCHAR)
890                         {
891                             $uvchr_disallow_flag
892                                         = $::UNICODE_DISALLOW_NONCHAR;
893                         }
894                         elsif ($disallow_flag == $::UTF8_DISALLOW_SUPER) {
895                             $uvchr_disallow_flag
896                                         = $::UNICODE_DISALLOW_SUPER;
897                         }
898                         elsif ($disallow_flag
899                                         == $::UTF8_DISALLOW_ABOVE_31_BIT)
900                         {
901                             $uvchr_disallow_flag =
902                                         $::UNICODE_DISALLOW_ABOVE_31_BIT;
903                         }
904                         else {
905                             fail(sprintf "Unexpected disallow flag: %x",
906                                 $disallow_flag);
907                             next;
908                         }
909                     }
910
911                     $disallowed = $uvchr_disallow_flag;
912
913                     $this_name = "uvchr_to_utf8_flags() $testname: "
914                                             . (($uvchr_disallow_flag)
915                                                 ? 'disallowed'
916                                                 : ($disallowed)
917                                                 ? 'ABOVE_31_BIT allowed'
918                                                 : 'allowed');
919                     $this_name .= ", $eval_warn";
920                     $this_name .= ", " . (($uvchr_warn_flag)
921                                         ? 'with warning flag'
922                                         : 'no warning flag');
923
924                     undef @warnings;
925                     my $ret;
926                     my $warn_flag = sprintf "0x%x", $uvchr_warn_flag;
927                     my $disallow_flag = sprintf "0x%x",
928                                                 $uvchr_disallow_flag;
929                     $call = sprintf("    Call was: $eval_warn; \$ret"
930                                 . " = test_uvchr_to_utf8_flags("
931                                 . " 0x%x, $warn_flag|$disallow_flag)",
932                                 $allowed_uv);
933                     $eval_text = "$eval_warn; \$ret ="
934                             . " test_uvchr_to_utf8_flags("
935                             . "$allowed_uv, $warn_flag|"
936                             . "$disallow_flag)";
937                     eval "$eval_text";
938                     if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
939                     {
940                         diag "\$!='$!'; eval'd=\"$eval_text\"";
941                         next;
942                     }
943                     if ($disallowed) {
944                         is($ret, undef, "$this_name: Returns undef")
945                           or diag $call;
946                     }
947                     else {
948                         is($ret, $bytes, "$this_name: Returns expected string")
949                           or diag $call;
950                     }
951                     if (! $do_warning
952                         && ($warning eq 'utf8' || $warning eq $category))
953                     {
954                         if (!is(scalar @warnings, 0,
955                                 "$this_name: No warnings generated"))
956                         {
957                             diag $call;
958                             output_warnings(@warnings);
959                         }
960                     }
961                     elsif (       $uvchr_warn_flag
962                         && (   $warning eq 'utf8'
963                             || $warning eq $category))
964                     {
965                         if (is(scalar @warnings, 1,
966                             "$this_name: Got a single warning "))
967                         {
968                             like($warnings[0], $message,
969                                     "$this_name: Got expected warning")
970                                 or diag $call;
971                         }
972                         else {
973                             diag $call;
974                             output_warnings(@warnings)
975                                                 if scalar @warnings;
976                         }
977                     }
978                 }
979               }
980             }
981           }
982         }
983       }
984     }
985 }
986
987 done_testing;