This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
APItest/t/utf8_warn_base.pl: Do formatting outside loop
[perl5.git] / ext / XS-APItest / t / utf8_warn_base.pl
1 #!perl -w
2
3 # This is a base file to be used by various .t's in its directory
4
5 use strict;
6 use Test::More;
7
8 BEGIN {
9     use_ok('XS::APItest');
10     require 'charset_tools.pl';
11     require './t/utf8_setup.pl';
12 };
13
14 $|=1;
15
16 no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
17                           # machines, and that is tested elsewhere
18
19 use XS::APItest;
20 use Data::Dumper;
21
22 my @warnings_gotten;
23
24 use warnings 'utf8';
25 local $SIG{__WARN__} = sub { push @warnings_gotten, @_ };
26
27 sub nonportable_regex ($) {
28
29     # Returns a pattern that matches the non-portable message raised either
30     # for the specific input code point, or the one generated when there
31     # is some malformation that precludes the message containing the specific
32     # code point
33
34     my $code_point = shift;
35
36     my $string = sprintf '(Code point 0x%X is not Unicode, and'
37                        . '|Any UTF-8 sequence that starts with'
38                        . ' "(\\\x[[:xdigit:]]{2})+" is for a'
39                        . ' non-Unicode code point, and is) not portable',
40                     $code_point;
41     return qr/$string/;
42 }
43
44 # Now test the cases where a legal code point is generated, but may or may not
45 # be allowed/warned on.
46 my @tests = (
47      # ($testname, $bytes, $disallow_flags, $controlling_warning_category,
48      #  $allowed_uv, $needed_to_discern_len )
49     [ "lowest surrogate",
50         (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
51         $::UTF8_DISALLOW_SURROGATE,
52         'surrogate', 0xD800,
53     ],
54     [ "a middle surrogate",
55         (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
56         $::UTF8_DISALLOW_SURROGATE,
57         'surrogate', 0xD90D,
58     ],
59     [ "highest surrogate",
60         (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
61         $::UTF8_DISALLOW_SURROGATE,
62         'surrogate', 0xDFFF,
63     ],
64     [ "first non_unicode",
65         (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
66         $::UTF8_DISALLOW_SUPER,
67         'non_unicode', 0x110000,
68         2,
69     ],
70     [ "non_unicode whose first byte tells that",
71         (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
72         $::UTF8_DISALLOW_SUPER,
73         'non_unicode',
74         (isASCII) ? 0x140000 : 0x200000,
75         1,
76     ],
77     [ "first of 32 consecutive non-character code points",
78         (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
79         $::UTF8_DISALLOW_NONCHAR,
80         'nonchar', 0xFDD0,
81     ],
82     [ "a mid non-character code point of the 32 consecutive ones",
83         (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
84         $::UTF8_DISALLOW_NONCHAR,
85         'nonchar', 0xFDE0,
86     ],
87     [ "final of 32 consecutive non-character code points",
88         (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
89         $::UTF8_DISALLOW_NONCHAR,
90         'nonchar', 0xFDEF,
91     ],
92     [ "non-character code point U+FFFE",
93         (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
94         $::UTF8_DISALLOW_NONCHAR,
95         'nonchar', 0xFFFE,
96     ],
97     [ "non-character code point U+FFFF",
98         (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
99         $::UTF8_DISALLOW_NONCHAR,
100         'nonchar', 0xFFFF,
101     ],
102     [ "non-character code point U+1FFFE",
103         (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
104         $::UTF8_DISALLOW_NONCHAR,
105         'nonchar', 0x1FFFE,
106     ],
107     [ "non-character code point U+1FFFF",
108         (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
109         $::UTF8_DISALLOW_NONCHAR,
110         'nonchar', 0x1FFFF,
111     ],
112     [ "non-character code point U+2FFFE",
113         (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
114         $::UTF8_DISALLOW_NONCHAR,
115         'nonchar', 0x2FFFE,
116     ],
117     [ "non-character code point U+2FFFF",
118         (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
119         $::UTF8_DISALLOW_NONCHAR,
120         'nonchar', 0x2FFFF,
121     ],
122     [ "non-character code point U+3FFFE",
123         (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
124         $::UTF8_DISALLOW_NONCHAR,
125         'nonchar', 0x3FFFE,
126     ],
127     [ "non-character code point U+3FFFF",
128         (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
129         $::UTF8_DISALLOW_NONCHAR,
130         'nonchar', 0x3FFFF,
131     ],
132     [ "non-character code point U+4FFFE",
133         (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
134         $::UTF8_DISALLOW_NONCHAR,
135         'nonchar', 0x4FFFE,
136     ],
137     [ "non-character code point U+4FFFF",
138         (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
139         $::UTF8_DISALLOW_NONCHAR,
140         'nonchar', 0x4FFFF,
141     ],
142     [ "non-character code point U+5FFFE",
143         (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
144         $::UTF8_DISALLOW_NONCHAR,
145         'nonchar', 0x5FFFE,
146     ],
147     [ "non-character code point U+5FFFF",
148         (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
149         $::UTF8_DISALLOW_NONCHAR,
150         'nonchar', 0x5FFFF,
151     ],
152     [ "non-character code point U+6FFFE",
153         (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
154         $::UTF8_DISALLOW_NONCHAR,
155         'nonchar', 0x6FFFE,
156     ],
157     [ "non-character code point U+6FFFF",
158         (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
159         $::UTF8_DISALLOW_NONCHAR,
160         'nonchar', 0x6FFFF,
161     ],
162     [ "non-character code point U+7FFFE",
163         (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
164         $::UTF8_DISALLOW_NONCHAR,
165         'nonchar', 0x7FFFE,
166     ],
167     [ "non-character code point U+7FFFF",
168         (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
169         $::UTF8_DISALLOW_NONCHAR,
170         'nonchar', 0x7FFFF,
171     ],
172     [ "non-character code point U+8FFFE",
173         (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
174         $::UTF8_DISALLOW_NONCHAR,
175         'nonchar', 0x8FFFE,
176     ],
177     [ "non-character code point U+8FFFF",
178         (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
179         $::UTF8_DISALLOW_NONCHAR,
180         'nonchar', 0x8FFFF,
181     ],
182     [ "non-character code point U+9FFFE",
183         (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
184         $::UTF8_DISALLOW_NONCHAR,
185         'nonchar', 0x9FFFE,
186     ],
187     [ "non-character code point U+9FFFF",
188         (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
189         $::UTF8_DISALLOW_NONCHAR,
190         'nonchar', 0x9FFFF,
191     ],
192     [ "non-character code point U+AFFFE",
193         (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
194         $::UTF8_DISALLOW_NONCHAR,
195         'nonchar', 0xAFFFE,
196     ],
197     [ "non-character code point U+AFFFF",
198         (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
199         $::UTF8_DISALLOW_NONCHAR,
200         'nonchar', 0xAFFFF,
201     ],
202     [ "non-character code point U+BFFFE",
203         (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
204         $::UTF8_DISALLOW_NONCHAR,
205         'nonchar', 0xBFFFE,
206     ],
207     [ "non-character code point U+BFFFF",
208         (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
209         $::UTF8_DISALLOW_NONCHAR,
210         'nonchar', 0xBFFFF,
211     ],
212     [ "non-character code point U+CFFFE",
213         (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
214         $::UTF8_DISALLOW_NONCHAR,
215         'nonchar', 0xCFFFE,
216     ],
217     [ "non-character code point U+CFFFF",
218         (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
219         $::UTF8_DISALLOW_NONCHAR,
220         'nonchar', 0xCFFFF,
221     ],
222     [ "non-character code point U+DFFFE",
223         (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
224         $::UTF8_DISALLOW_NONCHAR,
225         'nonchar', 0xDFFFE,
226     ],
227     [ "non-character code point U+DFFFF",
228         (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
229         $::UTF8_DISALLOW_NONCHAR,
230         'nonchar', 0xDFFFF,
231     ],
232     [ "non-character code point U+EFFFE",
233         (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
234         $::UTF8_DISALLOW_NONCHAR,
235         'nonchar', 0xEFFFE,
236     ],
237     [ "non-character code point U+EFFFF",
238         (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
239         $::UTF8_DISALLOW_NONCHAR,
240         'nonchar', 0xEFFFF,
241     ],
242     [ "non-character code point U+FFFFE",
243         (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
244         $::UTF8_DISALLOW_NONCHAR,
245         'nonchar', 0xFFFFE,
246     ],
247     [ "non-character code point U+FFFFF",
248         (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
249         $::UTF8_DISALLOW_NONCHAR,
250         'nonchar', 0xFFFFF,
251     ],
252     [ "non-character code point U+10FFFE",
253         (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
254         $::UTF8_DISALLOW_NONCHAR,
255         'nonchar', 0x10FFFE,
256     ],
257     [ "non-character code point U+10FFFF",
258         (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
259         $::UTF8_DISALLOW_NONCHAR,
260         'nonchar', 0x10FFFF,
261     ],
262     [ "requires at least 32 bits",
263         (isASCII)
264          ?  "\xfe\x82\x80\x80\x80\x80\x80"
265          : I8_to_native(
266             "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
267         # This code point is chosen so that it is representable in a UV on
268         # 32-bit machines
269         $::UTF8_DISALLOW_ABOVE_31_BIT,
270         'utf8', 0x80000000,
271         (isASCII) ? 1 : 8,
272     ],
273     [ "highest 32 bit code point",
274         (isASCII)
275          ?  "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
276          : I8_to_native(
277             "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
278         $::UTF8_DISALLOW_ABOVE_31_BIT,
279         'utf8', 0xFFFFFFFF,
280         (isASCII) ? 1 : 8,
281     ],
282     [ "requires at least 32 bits, and use SUPER-type flags, instead of"
283     . " ABOVE_31_BIT",
284         (isASCII)
285          ? "\xfe\x82\x80\x80\x80\x80\x80"
286          : I8_to_native(
287            "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
288         $::UTF8_DISALLOW_SUPER,
289         'utf8', 0x80000000,
290         1,
291     ],
292     [ "overflow with warnings/disallow for more than 31 bits",
293         # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT
294         # with overflow.  The overflow malformation is never allowed, so
295         # preventing it takes precedence if the ABOVE_31_BIT options would
296         # otherwise allow in an overflowing value.  The ASCII code points (1
297         # for 32-bits; 1 for 64) were chosen because the old overflow
298         # detection algorithm did not catch them; this means this test also
299         # checks for that fix.  The EBCDIC are arbitrary overflowing ones
300         # since we have no reports of failures with it.
301        (($::is64bit)
302         ? ((isASCII)
303            ?    "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
304            : I8_to_native(
305                 "\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"))
306         : ((isASCII)
307            ?    "\xfe\x86\x80\x80\x80\x80\x80"
308            : I8_to_native(
309                 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))),
310         $::UTF8_DISALLOW_ABOVE_31_BIT,
311         'utf8', -1,
312         (isASCII || $::is64bit) ? 2 : 8,
313     ],
314 );
315
316 if (! $::is64bit) {
317     if (isASCII) {
318         no warnings qw{portable overflow};
319         push @tests,
320             [ "Lowest 33 bit code point: overflow",
321                 "\xFE\x84\x80\x80\x80\x80\x80",
322                 $::UTF8_DISALLOW_ABOVE_31_BIT,
323                 'utf8', -1,
324                 1,
325             ];
326     }
327 }
328 else {
329     no warnings qw{portable overflow};
330     push @tests,
331         [ "More than 32 bits",
332             (isASCII)
333             ?       "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
334             : I8_to_native(
335                     "\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
336             $::UTF8_DISALLOW_ABOVE_31_BIT,
337             'utf8', 0x1000000000,
338             (isASCII) ? 1 : 7,
339         ];
340     if (! isASCII) {
341         push @tests,   # These could falsely show wrongly in a naive
342                        # implementation
343             [ "requires at least 32 bits",
344                 I8_to_native(
345                     "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
346                 $::UTF8_DISALLOW_ABOVE_31_BIT,
347                 'utf8', 0x800000000,
348                 7,
349             ],
350             [ "requires at least 32 bits",
351                 I8_to_native(
352                     "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
353                 $::UTF8_DISALLOW_ABOVE_31_BIT,
354                 'utf8', 0x10000000000,
355                 6,
356             ],
357             [ "requires at least 32 bits",
358                 I8_to_native(
359                     "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
360                 $::UTF8_DISALLOW_ABOVE_31_BIT,
361                 'utf8', 0x200000000000,
362                 5,
363             ],
364             [ "requires at least 32 bits",
365                 I8_to_native(
366                     "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
367                 $::UTF8_DISALLOW_ABOVE_31_BIT,
368                 'utf8', 0x4000000000000,
369                 4,
370             ],
371             [ "requires at least 32 bits",
372                 I8_to_native(
373                     "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
374                 $::UTF8_DISALLOW_ABOVE_31_BIT,
375                 'utf8', 0x80000000000000,
376                 3,
377             ],
378             [ "requires at least 32 bits",
379                 I8_to_native(
380                     "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
381                 $::UTF8_DISALLOW_ABOVE_31_BIT,
382                 'utf8', 0x1000000000000000,
383                 2,
384             ];
385     }
386 }
387
388 sub flags_to_text($$)
389 {
390     my ($flags, $flags_to_text_ref) = @_;
391
392     # Returns a string containing a mnemonic representation of the bits that
393     # are set in the $flags.  These are assumed to be flag bits.  The return
394     # looks like "FOO|BAR|BAZ".  The second parameter is a reference to an
395     # array that gives the textual representation of all the possible flags.
396     # Element 0 is the text for the bit 0 flag; element 1 for bit 1; ....  If
397     # no bits at all are set the string "0" is returned;
398
399     my @flag_text;
400     my $shift = 0;
401
402     return "0" if $flags == 0;
403
404     while ($flags) {
405         #diag sprintf "%x", $flags;
406         if ($flags & 1) {
407             push @flag_text, $flags_to_text_ref->[$shift];
408         }
409         $shift++;
410         $flags >>= 1;
411     }
412
413     return join "|", @flag_text;
414 }
415
416 # Possible flag returns from utf8n_to_uvchr_error().  These should have G_,
417 # instead of A_, D_, but the prefixes will be used in a a later commit, so
418 # minimize churn by having them here.
419 my @utf8n_flags_to_text =  ( qw(
420         A_EMPTY
421         A_CONTINUATION
422         A_NON_CONTINUATION
423         A_SHORT
424         A_LONG
425         A_LONG_AND_ITS_VALUE
426         PLACEHOLDER
427         A_OVERFLOW
428         D_SURROGATE
429         W_SURROGATE
430         D_NONCHAR
431         W_NONCHAR
432         D_SUPER
433         W_SUPER
434         D_ABOVE_31_BIT
435         W_ABOVE_31_BIT
436         CHECK_ONLY
437         NO_CONFIDENCE_IN_CURLEN_
438     ) );
439
440 sub utf8n_display_call($)
441 {
442     # Converts an eval string that calls test_utf8n_to_uvchr into a more human
443     # readable form, and returns it.  Doesn't work if the byte string contains
444     # an apostrophe.  The return will look something like:
445     #   test_utf8n_to_uvchr_error('$bytes', $length, $flags)
446     #diag $_[0];
447
448     $_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x;
449     my $text1 = $1;     # Everything before the byte string
450     my $bytes = $2;
451     my $text2 = $3;     # Includes the length
452     my $flags = $4;
453
454     return $text1
455          . display_bytes($bytes)
456          . $text2
457          . flags_to_text($flags, \@utf8n_flags_to_text)
458          . ')';
459 }
460
461 sub uvchr_display_call($)
462 {
463     # Converts an eval string that calls test_uvchr_to_utf8 into a more human
464     # readable form, and returns it.  The return will look something like:
465     #   test_uvchr_to_utf8n_flags($uv, $flags)
466     #diag $_[0];
467
468     my @flags_to_text =  ( qw(
469             W_SURROGATE
470             W_NONCHAR
471             W_SUPER
472             W_ABOVE_31_BIT
473             D_SURROGATE
474             D_NONCHAR
475             D_SUPER
476             D_ABOVE_31_BIT
477        ) );
478
479     $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x;
480     my $text = $1;
481     my $cp = sprintf "%X", $2;
482     my $flags = $3;
483
484     return "${text}0x$cp, " . flags_to_text($flags, \@flags_to_text) . ')';
485 }
486
487 # This test is split into this number of files.
488 my $num_test_files = $ENV{TEST_JOBS} || 1;
489 $num_test_files = 10 if $num_test_files > 10;
490
491 my $test_count = -1;
492 foreach my $test (@tests) {
493     $test_count++;
494     next if $test_count % $num_test_files != $::TEST_CHUNK;
495
496     my ($testname, $bytes, $disallow_flags,
497         $controlling_warning_category, $allowed_uv, $needed_to_discern_len
498        ) = @$test;
499
500     my $length = length $bytes;
501     my $will_overflow = $allowed_uv < 0;
502
503     my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv);
504
505     # The convention is that the got flag is the same value as the disallow
506     # one, and the warn flag is the next bit over.  If this were violated, the
507     # tests here should start failing.  We could do an eval under no strict to
508     # be sure.
509     my $expected_error_flags = $disallow_flags;
510     my $warn_flags = $disallow_flags << 1;
511
512     my $message;
513     if ($allowed_uv > 0x7FFFFFFF) {
514         $message = nonportable_regex($allowed_uv);
515     }
516     elsif ($allowed_uv > 0x10FFFF) {
517         $message = qr/(not Unicode|for a non-Unicode code point).* may not be portable/;
518     }
519     elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
520         $message = qr/surrogate/;
521         $needed_to_discern_len = 2 unless defined $needed_to_discern_len;
522     }
523     elsif (   ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
524            || ($allowed_uv & 0xFFFE) == 0xFFFE)
525     {
526         $message = qr/Unicode non-character.*is not recommended for open interchange/;
527         $needed_to_discern_len = $length unless defined $needed_to_discern_len;
528     }
529     elsif ($will_overflow) {
530         $message = qr/overflows/;
531     }
532     else {
533         die "Can't figure out what type of warning to test for $testname"
534     }
535
536     die 'Didn\'t set $needed_to_discern_len for ' . $testname
537                                         unless defined $needed_to_discern_len;
538
539     {
540         use warnings;
541         undef @warnings_gotten;
542         my $ret = test_isUTF8_CHAR($bytes, $length);
543         my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
544         if ($will_overflow) {
545             is($ret, 0, "isUTF8_CHAR() $testname: returns 0");
546             is($ret_flags, 0, "isUTF8_CHAR_flags() $testname: returns 0");
547         }
548         else {
549             is($ret, $length,
550                "isUTF8_CHAR() $testname: returns expected length: $length");
551             is($ret_flags, $length, "isUTF8_CHAR_flags(...,0) $testname:"
552                                   . " returns expected length: $length");
553         }
554         is(scalar @warnings_gotten, 0,
555                 "isUTF8_CHAR() and isUTF8_CHAR()_flags $testname: generated"
556               . " no warnings")
557           or output_warnings(@warnings_gotten);
558
559         undef @warnings_gotten;
560         $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
561         if ($will_overflow) {
562             is($ret, 0, "isSTRICT_UTF8_CHAR() $testname: returns 0");
563         }
564         else {
565             my $expected_ret = (   $testname =~ /surrogate|non-character/
566                                 || $allowed_uv > 0x10FFFF)
567                                ? 0
568                                : $length;
569             is($ret, $expected_ret, "isSTRICT_UTF8_CHAR() $testname: returns"
570                                   . " expected length: $expected_ret");
571             $ret = test_isUTF8_CHAR_flags($bytes, $length,
572                                           $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
573             is($ret, $expected_ret,
574                             "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')"
575                           . " acts like isSTRICT_UTF8_CHAR");
576         }
577         is(scalar @warnings_gotten, 0,
578                 "isSTRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:"
579               . " generated no warnings")
580           or output_warnings(@warnings_gotten);
581
582         undef @warnings_gotten;
583         $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
584         if ($will_overflow) {
585             is($ret, 0, "isC9_STRICT_UTF8_CHAR() $testname: returns 0");
586         }
587         else {
588             my $expected_ret = (   $testname =~ /surrogate/
589                                 || $allowed_uv > 0x10FFFF)
590                                ? 0
591                                : $length;
592             is($ret, $expected_ret, "isC9_STRICT_UTF8_CHAR() $testname:"
593                                    ." returns expected length: $expected_ret");
594             $ret = test_isUTF8_CHAR_flags($bytes, $length,
595                                           $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
596             is($ret, $expected_ret,
597                           "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')"
598                         . " acts like isC9_STRICT_UTF8_CHAR");
599         }
600         is(scalar @warnings_gotten, 0,
601                 "isC9_STRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:"
602               . " generated no warnings")
603           or output_warnings(@warnings_gotten);
604
605         # Test partial character handling, for each byte not a full character
606         for my $j (1.. $length - 1) {
607
608             # Skip the test for the interaction between overflow and above-31
609             # bit.  It is really testing other things than the partial
610             # character tests, for which other tests in this file are
611             # sufficient
612             last if $will_overflow;
613
614             foreach my $disallow_flag (0, $disallow_flags) {
615                 my $partial = substr($bytes, 0, $j);
616                 my $ret_should_be;
617                 my $comment;
618                 if ($disallow_flag) {
619                     $ret_should_be = 0;
620                     $comment = "disallowed";
621                     if ($j < $needed_to_discern_len) {
622                         $ret_should_be = 1;
623                         $comment .= ", but need $needed_to_discern_len bytes"
624                                  .  " to discern:";
625                     }
626                 }
627                 else {
628                     $ret_should_be = 1;
629                     $comment = "allowed";
630                 }
631
632                 undef @warnings_gotten;
633
634                 $ret = test_is_utf8_valid_partial_char_flags($partial, $j,
635                                                              $disallow_flag);
636                 is($ret, $ret_should_be,
637                                 "$testname: is_utf8_valid_partial_char_flags("
638                                         . display_bytes($partial)
639                                         . "), $comment: returns $ret_should_be");
640                 is(scalar @warnings_gotten, 0,
641                         "$testname: is_utf8_valid_partial_char_flags()"
642                       . " generated no warnings")
643                   or output_warnings(@warnings_gotten);
644             }
645         }
646     }
647
648     # This is more complicated than the malformations tested earlier, as there
649     # are several orthogonal variables involved.  We test all the subclasses
650     # of utf8 warnings to verify they work with and without the utf8 class,
651     # and don't have effects on other sublass warnings
652     foreach my $trial_warning_category ('utf8', 'surrogate', 'nonchar', 'non_unicode') {
653       foreach my $warn_flag (0, $warn_flags) {
654         foreach my $disallow_flag (0, $disallow_flags) {
655           foreach my $do_warning (0, 1) {
656
657             # We try each of the above with various combinations of
658             # malformations that can occur on the same input sequence.
659             foreach my $short ("", "short") {
660               foreach my $unexpected_noncont ("",
661                                               "unexpected non-continuation")
662               {
663                 foreach my $overlong ("", "overlong") {
664
665                     # If we're creating an overlong, it can't be longer than
666                     # the maximum length, so skip if we're already at that
667                     # length.
668                     next if $overlong && $length >= $::max_bytes;
669
670                     my @malformations;
671                     my @expected_return_flags;
672                     push @malformations, $short if $short;
673                     push @malformations, $unexpected_noncont
674                                                       if $unexpected_noncont;
675                     push @malformations, $overlong if $overlong;
676
677                     # The overflow malformation test in the input
678                     # array is coerced into being treated like one of
679                     # the others.
680                     if ($will_overflow) {
681                         push @malformations, 'overflow';
682                         push @expected_return_flags, $::UTF8_GOT_OVERFLOW;
683                     }
684
685                     my $malformations_name = join "/", @malformations;
686                     $malformations_name .= " malformation"
687                                                 if $malformations_name;
688                     $malformations_name .= "s" if @malformations > 1;
689                     my $this_bytes = $bytes;
690                     my $this_length = $length;
691                     my $expected_uv = $allowed_uv;
692                     my $this_expected_len = $length;
693                     my $this_needed_to_discern_len = $needed_to_discern_len;
694                     if ($malformations_name) {
695                         $expected_uv = 0;
696
697                         # Coerce the input into the desired
698                         # malformation
699                         if ($malformations_name =~ /overlong/) {
700
701                             # For an overlong, we convert the original
702                             # start byte into a continuation byte with
703                             # the same data bits as originally. ...
704                             substr($this_bytes, 0, 1)
705                                 = start_byte_to_cont(substr($this_bytes,
706                                                             0, 1));
707
708                             # ... Then we prepend it with a known
709                             # overlong sequence.  This should evaluate
710                             # to the exact same code point as the
711                             # original.
712                             $this_bytes
713                             = I8_to_native("\xff")
714                             . (I8_to_native(chr $::lowest_continuation)
715                             x ( $::max_bytes - 1 - length($this_bytes)))
716                             . $this_bytes;
717                             $this_length = length($this_bytes);
718                             $this_needed_to_discern_len
719                                  = $::max_bytes - ($this_expected_len
720                                                - $this_needed_to_discern_len);
721                             $this_expected_len = $::max_bytes;
722                             push @expected_return_flags, $::UTF8_GOT_LONG;
723                         }
724                         if ($malformations_name =~ /short/) {
725
726                             # Just tell the test to not look far
727                             # enough into the input.
728                             $this_length--;
729                             $this_expected_len--;
730                             push @expected_return_flags, $::UTF8_GOT_SHORT;
731                         }
732                         if ($malformations_name
733                                                 =~ /non-continuation/)
734                         {
735                             # Change the final continuation byte into
736                             # a non one.
737                             my $pos = ($short) ? -2 : -1;
738                             substr($this_bytes, $pos, 1) = '?';
739                             $this_expected_len--;
740                             push @expected_return_flags,
741                                             $::UTF8_GOT_NON_CONTINUATION;
742                         }
743                     }
744
745                     my $eval_warn = $do_warning
746                                 ? "use warnings '$trial_warning_category'"
747                                 : $trial_warning_category eq "utf8"
748                                     ? "no warnings 'utf8'"
749                                     : ( "use warnings 'utf8';"
750                                     . " no warnings '$trial_warning_category'");
751
752                     # Is effectively disallowed if we've set up a
753                     # malformation, even if the flag indicates it is
754                     # allowed.  Fix up test name to indicate this as
755                     # well
756                     my $disallowed = $disallow_flag
757                                 || $malformations_name;
758                     my $this_name = "utf8n_to_uvchr_error() $testname: "
759                                                 . (($disallow_flag)
760                                                 ? 'disallowed'
761                                                 : $disallowed
762                                                     ? $disallowed
763                                                     : 'allowed');
764                     $this_name .= ", $eval_warn";
765                     $this_name .= ", " . (($warn_flag)
766                                         ? 'with warning flag'
767                                         : 'no warning flag');
768
769                     undef @warnings_gotten;
770                     my $ret_ref;
771                     my $this_flags = $warn_flag | $disallow_flag;
772                     my $eval_text =      "$eval_warn; \$ret_ref"
773                             . " = test_utf8n_to_uvchr_error("
774                             . "'$this_bytes',"
775                             . " $this_length, $this_flags)";
776                     eval "$eval_text";
777                     if (! ok ("$@ eq ''",
778                         "$this_name: eval succeeded"))
779                     {
780                         diag "\$@='$@'; call was: "
781                            . utf8n_display_call($eval_text);
782                         next;
783                     }
784                     if ($disallowed) {
785                         is($ret_ref->[0], 0, "$this_name: Returns 0")
786                           or diag "Call was: " . utf8n_display_call($eval_text);
787                     }
788                     else {
789                         is($ret_ref->[0], $expected_uv,
790                                 "$this_name: Returns expected uv: "
791                                 . $uv_string)
792                           or diag "Call was: " . utf8n_display_call($eval_text);
793                     }
794                     is($ret_ref->[1], $this_expected_len,
795                                         "$this_name: Returns expected length:"
796                                       . " $this_expected_len")
797                       or diag "Call was: " . utf8n_display_call($eval_text);
798
799                     my $returned_flags = $ret_ref->[2];
800
801                     for (my $i = @expected_return_flags - 1; $i >= 0; $i--) {
802                         if (ok($expected_return_flags[$i] & $returned_flags,
803                             "Expected and got error bit return"
804                             . " for $malformations[$i] malformation"))
805                         {
806                             $returned_flags &= ~$expected_return_flags[$i];
807                         }
808                         splice @expected_return_flags, $i, 1;
809                     }
810                     is(scalar @expected_return_flags, 0,
811                             "Got all the expected malformation errors")
812                       or diag Dumper \@expected_return_flags;
813
814                     if (   $this_expected_len >= $this_needed_to_discern_len
815                         && ($warn_flag || $disallow_flag))
816                     {
817                         is($returned_flags, $expected_error_flags,
818                                 "Got the correct error flag")
819                           or diag "Call was: " . utf8n_display_call($eval_text);
820                     }
821                     else {
822                         is($returned_flags, 0, "Got no other error flag")
823                         or
824
825                         # We strip off any prefixes from the flag names
826                         diag "The unexpected flags were: "
827                            . (flags_to_text($returned_flags,
828                                             \@utf8n_flags_to_text)
829                              =~ s/ \b [A-Z] _ //xgr);
830                     }
831
832                     if (@malformations) {
833                         if (! $do_warning && $trial_warning_category eq 'utf8') {
834                             goto no_warnings_expected;
835                         }
836
837                         # Check that each malformation generates a
838                         # warning, removing that warning if found
839                     MALFORMATION:
840                         foreach my $malformation (@malformations) {
841                             foreach (my $i = 0; $i < @warnings_gotten; $i++) {
842                                 if ($warnings_gotten[$i] =~ /$malformation/) {
843                                     pass("Expected and got"
844                                     . "'$malformation' warning");
845                                     splice @warnings_gotten, $i, 1;
846                                     next MALFORMATION;
847                                 }
848                             }
849                             fail("Expected '$malformation' warning"
850                             . " but didn't get it");
851
852                         }
853                     }
854
855                     # Any overflow will override any super or above-31
856                     # warnings.
857                     goto no_warnings_expected
858                                 if $will_overflow || $this_expected_len
859                                         < $this_needed_to_discern_len;
860
861                     if (    ! $do_warning
862                         && (   $trial_warning_category eq 'utf8'
863                             || $trial_warning_category eq $controlling_warning_category))
864                     {
865                         goto no_warnings_expected;
866                     }
867                     elsif ($warn_flag) {
868                         if (is(scalar @warnings_gotten, 1,
869                             "$this_name: Got a single warning "))
870                         {
871                             like($warnings_gotten[0], $message,
872                                     "$this_name: Got expected warning")
873                                 or diag "Call was: "
874                                  . utf8n_display_call($eval_text);
875                         }
876                         else {
877                             diag "Call was: " . utf8n_display_call($eval_text);
878                             if (scalar @warnings_gotten) {
879                                 output_warnings(@warnings_gotten);
880                             }
881                         }
882                     }
883                     else {
884
885                     no_warnings_expected:
886                         unless (is(scalar @warnings_gotten, 0,
887                                 "$this_name: Got no warnings"))
888                         {
889                             diag "Call was: " . utf8n_display_call($eval_text);
890                             output_warnings(@warnings_gotten);
891                         }
892                     }
893
894                     # Check CHECK_ONLY results when the input is
895                     # disallowed.  Do this when actually disallowed,
896                     # not just when the $disallow_flag is set
897                     if ($disallowed) {
898                         undef @warnings_gotten;
899                         $this_flags = $disallow_flag|$::UTF8_CHECK_ONLY;
900                         $eval_text = "\$ret_ref = test_utf8n_to_uvchr_error("
901                                    . "'$this_bytes', $this_length, $this_flags)";
902                         eval "$eval_text";
903                         if (! ok ("$@ eq ''",
904                             "    And eval succeeded with CHECK_ONLY"))
905                         {
906                             diag "\$@='$@'; Call was: "
907                                . utf8n_display_call($eval_text);
908                             next;
909                         }
910                         is($ret_ref->[0], 0,
911                                         "$this_name, CHECK_ONLY: Returns 0")
912                           or diag "Call was: " . utf8n_display_call($eval_text);
913                         is($ret_ref->[1], -1,
914                             "$this_name: CHECK_ONLY: returns -1 for length")
915                           or diag "Call was: " . utf8n_display_call($eval_text);
916                         if (! is(scalar @warnings_gotten, 0,
917                             "$this_name, CHECK_ONLY: no warnings"
918                         . " generated"))
919                         {
920                             diag "Call was: " . utf8n_display_call($eval_text);
921                             output_warnings(@warnings_gotten);
922                         }
923                     }
924
925                     # Now repeat some of the above, but for
926                     # uvchr_to_utf8_flags().  Since this comes from an
927                     # existing code point, it hasn't overflowed, and
928                     # isn't malformed.
929                     next if @malformations;
930
931                     # The warning and disallow flags passed in are for
932                     # utf8n_to_uvchr_error().  Convert them for
933                     # uvchr_to_utf8_flags().
934                     my $uvchr_warn_flag = 0;
935                     my $uvchr_disallow_flag = 0;
936                     if ($warn_flag) {
937                         if ($warn_flag == $::UTF8_WARN_SURROGATE) {
938                             $uvchr_warn_flag = $::UNICODE_WARN_SURROGATE
939                         }
940                         elsif ($warn_flag == $::UTF8_WARN_NONCHAR) {
941                             $uvchr_warn_flag = $::UNICODE_WARN_NONCHAR
942                         }
943                         elsif ($warn_flag == $::UTF8_WARN_SUPER) {
944                             $uvchr_warn_flag = $::UNICODE_WARN_SUPER
945                         }
946                         elsif ($warn_flag == $::UTF8_WARN_ABOVE_31_BIT) {
947                             $uvchr_warn_flag
948                                         = $::UNICODE_WARN_ABOVE_31_BIT;
949                         }
950                         else {
951                             fail(sprintf "Unexpected warn flag: %x",
952                                 $warn_flag);
953                             next;
954                         }
955                     }
956                     if ($disallow_flag) {
957                         if ($disallow_flag == $::UTF8_DISALLOW_SURROGATE)
958                         {
959                             $uvchr_disallow_flag
960                                         = $::UNICODE_DISALLOW_SURROGATE;
961                         }
962                         elsif ($disallow_flag == $::UTF8_DISALLOW_NONCHAR)
963                         {
964                             $uvchr_disallow_flag
965                                         = $::UNICODE_DISALLOW_NONCHAR;
966                         }
967                         elsif ($disallow_flag == $::UTF8_DISALLOW_SUPER) {
968                             $uvchr_disallow_flag
969                                         = $::UNICODE_DISALLOW_SUPER;
970                         }
971                         elsif ($disallow_flag
972                                         == $::UTF8_DISALLOW_ABOVE_31_BIT)
973                         {
974                             $uvchr_disallow_flag =
975                                         $::UNICODE_DISALLOW_ABOVE_31_BIT;
976                         }
977                         else {
978                             fail(sprintf "Unexpected disallow flag: %x",
979                                 $disallow_flag);
980                             next;
981                         }
982                     }
983
984                     $disallowed = $uvchr_disallow_flag;
985
986                     $this_name = "uvchr_to_utf8_flags() $testname: "
987                                             . (($uvchr_disallow_flag)
988                                                 ? 'disallowed'
989                                                 : ($disallowed)
990                                                 ? 'ABOVE_31_BIT allowed'
991                                                 : 'allowed');
992                     $this_name .= ", $eval_warn";
993                     $this_name .= ", " . (($uvchr_warn_flag)
994                                         ? 'with warning flag'
995                                         : 'no warning flag');
996
997                     undef @warnings_gotten;
998                     my $ret;
999                     $this_flags = $uvchr_warn_flag | $uvchr_disallow_flag;
1000                     $eval_text = "$eval_warn; \$ret ="
1001                             . " test_uvchr_to_utf8_flags("
1002                             . "$allowed_uv, $this_flags)";
1003                     eval "$eval_text";
1004                     if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
1005                     {
1006                         diag "\$@='$@'; call was: "
1007                            . uvchr_display_call($eval_text);
1008                         next;
1009                     }
1010                     if ($disallowed) {
1011                         is($ret, undef, "$this_name: Returns undef")
1012                           or diag "Call was: " . uvchr_display_call($eval_text);
1013                     }
1014                     else {
1015                         is($ret, $this_bytes, "$this_name: Returns expected string")
1016                           or diag "Call was: " . uvchr_display_call($eval_text);
1017                     }
1018                     if (! $do_warning
1019                         && ($trial_warning_category eq 'utf8' || $trial_warning_category eq $controlling_warning_category))
1020                     {
1021                         if (!is(scalar @warnings_gotten, 0,
1022                                 "$this_name: No warnings generated"))
1023                         {
1024                             diag "Call was: " . uvchr_display_call($eval_text);
1025                             output_warnings(@warnings_gotten);
1026                         }
1027                     }
1028                     elsif (       $uvchr_warn_flag
1029                         && (   $trial_warning_category eq 'utf8'
1030                             || $trial_warning_category eq $controlling_warning_category))
1031                     {
1032                         if (is(scalar @warnings_gotten, 1,
1033                             "$this_name: Got a single warning "))
1034                         {
1035                             like($warnings_gotten[0], $message,
1036                                     "$this_name: Got expected warning")
1037                                 or diag "Call was: "
1038                                       . uvchr_display_call($eval_text);
1039                         }
1040                         else {
1041                             diag "Call was: " . uvchr_display_call($eval_text);
1042                             output_warnings(@warnings_gotten)
1043                                                 if scalar @warnings_gotten;
1044                         }
1045                     }
1046                 }
1047               }
1048             }
1049           }
1050         }
1051       }
1052     }
1053 }
1054
1055 done_testing;