This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
APItest/t/utf8_warn_base.pl: Add a test
[perl5.git] / ext / XS-APItest / t / utf8_warn_base.pl
1 #!perl -w
2
3 # This is a base file to be used by various .t's in its directory
4 # It tests various malformed UTF-8 sequences and some code points that are
5 # "problematic", and verifies that the correct warnings/flags etc are
6 # generated when using them.  For the code points, it also takes the UTF-8 and
7 # perturbs it to be malformed in various ways, and tests that this gets
8 # appropriately detected.
9
10 use strict;
11 use Test::More;
12
13 BEGIN {
14     use_ok('XS::APItest');
15     require 'charset_tools.pl';
16     require './t/utf8_setup.pl';
17 };
18
19 $|=1;
20
21 no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
22                           # machines, and that is tested elsewhere
23 use XS::APItest;
24
25 my @warnings_gotten;
26
27 use warnings 'utf8';
28 local $SIG{__WARN__} = sub { my @copy = @_;
29                              push @warnings_gotten, map { chomp; $_ } @copy;
30                            };
31
32 my $highest_non_extended_utf8_cp = (isASCII) ? 0x7FFFFFFF : 0x3FFFFFFF;
33 my $native_lowest_continuation_chr = I8_to_native(chr $::lowest_continuation);
34
35 sub requires_extended_utf8($) {
36
37     # Returns a boolean as to whether or not the code point parameter fits
38     # into 31 bits, subject to the convention that a negative code point
39     # stands for one that overflows the word size, so won't fit in 31 bits.
40
41     return shift > $highest_non_extended_utf8_cp;
42 }
43
44 sub is_extended_utf8($) {
45
46     # Returns a boolean as to whether or not the input UTF-8 sequence uses
47     # Perl extended UTF-8.
48
49     my $byte = substr(shift, 0, 1);
50     return ord $byte >= 0xFE if isASCII;
51     return $byte == I8_to_native("\xFF");
52 }
53
54 sub overflow_discern_len($) {
55
56     # Returns how many bytes are needed to tell if a non-overlong UTF-8
57     # sequence is for a code point that won't fit in the platform's word size.
58     # Only the length of the sequence representing a single code point is
59     # needed.
60
61     if (isASCII) {
62         return ($::is64bit) ? 3 : ((shift == $::max_bytes)
63                                    ? 1
64                                    : 2);
65     }
66
67     return ($::is64bit) ? 2 : 8;
68 }
69
70 sub overlong_discern_len($) {
71
72     # Returns how many bytes are needed to tell if the input UTF-8 sequence
73     # for a code point is overlong
74
75     my $string = shift;
76     my $length = length $string;
77     my $byte = ord native_to_I8(substr($string, 0, 1));
78     if (isASCII) {
79         return ($length == $::max_bytes)
80                   # This is constrained to 1 on 32-bit machines, as it
81                   # overflows there
82                 ? (($::is64bit) ? 7 : 1)
83                 : (($length == 2) ? 1 : 2);
84     }
85
86     return ($length == $::max_bytes) ? 8 : (($length <= 3) ? 1 : 2);
87 }
88
89 my @tests;
90 {
91     no warnings qw(portable overflow);
92     @tests = (
93         # $testname,
94         # $bytes,                  UTF-8 string
95         # $allowed_uv,             code point $bytes evaluates to; -1 if
96         #                          overflows
97         # $needed_to_discern_len   optional, how long an initial substring do
98         #                          we need to tell that the string must be for
99         #                          a code point in the category it falls in,
100         #                          like being a surrogate; 0 indicates we need
101         #                          the whole string.  Some categories have a
102         #                          default that is used if this is omitted.
103         [ "overlong malformation, lowest 2-byte",
104             (isASCII) ? "\xc0\x80" : I8_to_native("\xc0\xa0"),
105             0,   # NUL
106         ],
107         [ "overlong malformation, highest 2-byte",
108             (isASCII) ? "\xc1\xbf" : I8_to_native("\xc4\xbf"),
109             (isASCII) ? 0x7F : utf8::unicode_to_native(0x9F),
110         ],
111         [ "overlong malformation, lowest 3-byte",
112             (isASCII) ? "\xe0\x80\x80" : I8_to_native("\xe0\xa0\xa0"),
113             0,   # NUL
114         ],
115         [ "overlong malformation, highest 3-byte",
116             (isASCII) ? "\xe0\x9f\xbf" : I8_to_native("\xe0\xbf\xbf"),
117             (isASCII) ? 0x7FF : 0x3FF,
118         ],
119         [ "lowest surrogate",
120             (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
121             0xD800,
122         ],
123         [ "a middle surrogate",
124             (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
125             0xD90D,
126         ],
127         [ "highest surrogate",
128             (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
129             0xDFFF,
130         ],
131         [ "first of 32 consecutive non-character code points",
132             (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
133             0xFDD0,
134         ],
135         [ "a mid non-character code point of the 32 consecutive ones",
136             (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
137             0xFDE0,
138         ],
139         [ "final of 32 consecutive non-character code points",
140             (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
141             0xFDEF,
142         ],
143         [ "non-character code point U+FFFE",
144             (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
145             0xFFFE,
146         ],
147         [ "non-character code point U+FFFF",
148             (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
149             0xFFFF,
150         ],
151         [ "overlong malformation, lowest 4-byte",
152             (isASCII) ? "\xf0\x80\x80\x80" : I8_to_native("\xf0\xa0\xa0\xa0"),
153             0,   # NUL
154         ],
155         [ "overlong malformation, highest 4-byte",
156             (isASCII) ? "\xf0\x8F\xbf\xbf" : I8_to_native("\xf0\xaf\xbf\xbf"),
157             (isASCII) ? 0xFFFF : 0x3FFF,
158         ],
159         [ "non-character code point U+1FFFE",
160             (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
161             0x1FFFE,
162         ],
163         [ "non-character code point U+1FFFF",
164             (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
165             0x1FFFF,
166         ],
167         [ "non-character code point U+2FFFE",
168             (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
169             0x2FFFE,
170         ],
171         [ "non-character code point U+2FFFF",
172             (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
173             0x2FFFF,
174         ],
175         [ "non-character code point U+3FFFE",
176             (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
177             0x3FFFE,
178         ],
179         [ "non-character code point U+3FFFF",
180             (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
181             0x3FFFF,
182         ],
183         [ "non-character code point U+4FFFE",
184             (isASCII)
185             ?               "\xf1\x8f\xbf\xbe"
186             : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
187             0x4FFFE,
188         ],
189         [ "non-character code point U+4FFFF",
190             (isASCII)
191             ?               "\xf1\x8f\xbf\xbf"
192             : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
193             0x4FFFF,
194         ],
195         [ "non-character code point U+5FFFE",
196             (isASCII)
197             ?              "\xf1\x9f\xbf\xbe"
198             : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
199             0x5FFFE,
200         ],
201         [ "non-character code point U+5FFFF",
202             (isASCII)
203             ?              "\xf1\x9f\xbf\xbf"
204             : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
205             0x5FFFF,
206         ],
207         [ "non-character code point U+6FFFE",
208             (isASCII)
209             ?              "\xf1\xaf\xbf\xbe"
210             : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
211             0x6FFFE,
212         ],
213         [ "non-character code point U+6FFFF",
214             (isASCII)
215             ?              "\xf1\xaf\xbf\xbf"
216             : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
217             0x6FFFF,
218         ],
219         [ "non-character code point U+7FFFE",
220             (isASCII)
221             ?              "\xf1\xbf\xbf\xbe"
222             : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
223             0x7FFFE,
224         ],
225         [ "non-character code point U+7FFFF",
226             (isASCII)
227             ?              "\xf1\xbf\xbf\xbf"
228             : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
229             0x7FFFF,
230         ],
231         [ "non-character code point U+8FFFE",
232             (isASCII)
233             ?              "\xf2\x8f\xbf\xbe"
234             : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
235             0x8FFFE,
236         ],
237         [ "non-character code point U+8FFFF",
238             (isASCII)
239             ?              "\xf2\x8f\xbf\xbf"
240             : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
241             0x8FFFF,
242         ],
243         [ "non-character code point U+9FFFE",
244             (isASCII)
245             ?              "\xf2\x9f\xbf\xbe"
246             : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
247             0x9FFFE,
248         ],
249         [ "non-character code point U+9FFFF",
250             (isASCII)
251             ?              "\xf2\x9f\xbf\xbf"
252             : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
253             0x9FFFF,
254         ],
255         [ "non-character code point U+AFFFE",
256             (isASCII)
257             ?              "\xf2\xaf\xbf\xbe"
258             : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
259             0xAFFFE,
260         ],
261         [ "non-character code point U+AFFFF",
262             (isASCII)
263             ?              "\xf2\xaf\xbf\xbf"
264             : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
265             0xAFFFF,
266         ],
267         [ "non-character code point U+BFFFE",
268             (isASCII)
269             ?              "\xf2\xbf\xbf\xbe"
270             : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
271             0xBFFFE,
272         ],
273         [ "non-character code point U+BFFFF",
274             (isASCII)
275             ?              "\xf2\xbf\xbf\xbf"
276             : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
277             0xBFFFF,
278         ],
279         [ "non-character code point U+CFFFE",
280             (isASCII)
281             ?              "\xf3\x8f\xbf\xbe"
282             : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
283             0xCFFFE,
284         ],
285         [ "non-character code point U+CFFFF",
286             (isASCII)
287             ?              "\xf3\x8f\xbf\xbf"
288             : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
289             0xCFFFF,
290         ],
291         [ "non-character code point U+DFFFE",
292             (isASCII)
293             ?              "\xf3\x9f\xbf\xbe"
294             : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
295             0xDFFFE,
296         ],
297         [ "non-character code point U+DFFFF",
298             (isASCII)
299             ?              "\xf3\x9f\xbf\xbf"
300             : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
301             0xDFFFF,
302         ],
303         [ "non-character code point U+EFFFE",
304             (isASCII)
305             ?              "\xf3\xaf\xbf\xbe"
306             : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
307             0xEFFFE,
308         ],
309         [ "non-character code point U+EFFFF",
310             (isASCII)
311             ?              "\xf3\xaf\xbf\xbf"
312             : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
313             0xEFFFF,
314         ],
315         [ "non-character code point U+FFFFE",
316             (isASCII)
317             ?              "\xf3\xbf\xbf\xbe"
318             : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
319             0xFFFFE,
320         ],
321         [ "non-character code point U+FFFFF",
322             (isASCII)
323             ?              "\xf3\xbf\xbf\xbf"
324             : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
325             0xFFFFF,
326         ],
327         [ "non-character code point U+10FFFE",
328             (isASCII)
329             ?              "\xf4\x8f\xbf\xbe"
330             : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
331             0x10FFFE,
332         ],
333         [ "non-character code point U+10FFFF",
334             (isASCII)
335             ?              "\xf4\x8f\xbf\xbf"
336             : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
337             0x10FFFF,
338         ],
339         [ "first non_unicode",
340             (isASCII)
341             ?              "\xf4\x90\x80\x80"
342             : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
343             0x110000,
344             2,
345         ],
346         [ "non_unicode whose first byte tells that",
347             (isASCII)
348             ?              "\xf5\x80\x80\x80"
349             : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
350             (isASCII) ? 0x140000 : 0x200000,
351             1,
352         ],
353         [ "overlong malformation, lowest 5-byte",
354             (isASCII)
355             ?              "\xf8\x80\x80\x80\x80"
356             : I8_to_native("\xf8\xa0\xa0\xa0\xa0"),
357             0,   # NUL
358         ],
359         [ "overlong malformation, highest 5-byte",
360             (isASCII)
361             ?              "\xf8\x87\xbf\xbf\xbf"
362             : I8_to_native("\xf8\xa7\xbf\xbf\xbf"),
363             (isASCII) ? 0x1FFFFF : 0x3FFFF,
364         ],
365         [ "overlong malformation, lowest 6-byte",
366             (isASCII)
367             ?              "\xfc\x80\x80\x80\x80\x80"
368             : I8_to_native("\xfc\xa0\xa0\xa0\xa0\xa0"),
369             0,   # NUL
370         ],
371         [ "overlong malformation, highest 6-byte",
372             (isASCII)
373             ?              "\xfc\x83\xbf\xbf\xbf\xbf"
374             : I8_to_native("\xfc\xa3\xbf\xbf\xbf\xbf"),
375             (isASCII) ? 0x3FFFFFF : 0x3FFFFF,
376         ],
377         [ "overlong malformation, lowest 7-byte",
378             (isASCII)
379             ?              "\xfe\x80\x80\x80\x80\x80\x80"
380             : I8_to_native("\xfe\xa0\xa0\xa0\xa0\xa0\xa0"),
381             0,   # NUL
382         ],
383         [ "overlong malformation, highest 7-byte",
384             (isASCII)
385             ?              "\xfe\x81\xbf\xbf\xbf\xbf\xbf"
386             : I8_to_native("\xfe\xa1\xbf\xbf\xbf\xbf\xbf"),
387             (isASCII) ? 0x7FFFFFFF : 0x3FFFFFF,
388         ],
389         [ "lowest 32 bit code point",
390             (isASCII)
391             ?  "\xfe\x82\x80\x80\x80\x80\x80"
392             : I8_to_native(
393                 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
394             0x80000000,
395         ],
396         [ "highest 32 bit code point",
397             (isASCII)
398             ?  "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
399             : I8_to_native(
400                "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
401             0xFFFFFFFF,
402         ],
403         [ "Lowest 33 bit code point",
404             (isASCII)
405             ?  "\xfe\x84\x80\x80\x80\x80\x80"
406             : I8_to_native(
407                 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
408             ($::is64bit) ? 0x100000000 : -1,   # Overflows on 32-bit systems
409         ],
410     );
411
412     if (! $::is64bit) {
413         if (isASCII) {
414             push @tests,
415                 [ "overlong malformation, but naively looks like overflow",
416                     "\xff\x80\x80\x80\x80\x80\x80\x83\xbf\xbf\xbf\xbf\xbf",
417                     0xFFFFFFFF,
418                 ],
419                 [ "overflow that old algorithm failed to detect",
420                     "\xfe\x86\x80\x80\x80\x80\x80",
421                     -1,
422                 ];
423         }
424     }
425
426     push @tests,
427         [ "overlong malformation, lowest max-byte",
428             (isASCII)
429              ?      "\xff\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
430              : I8_to_native(
431                     "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
432             0,   # NUL
433         ],
434         [ "overlong malformation, highest max-byte",
435             (isASCII)    # 2**36-1 on ASCII; 2**30-1 on EBCDIC
436              ?      "\xff\x80\x80\x80\x80\x80\x80\xbf\xbf\xbf\xbf\xbf\xbf"
437              : I8_to_native(
438                     "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xbf\xbf\xbf\xbf\xbf\xbf"),
439             (isASCII) ? (($::is64bit) ? 0xFFFFFFFFF : -1) : 0x3FFFFFFF,
440         ];
441
442     if (isASCII) {
443         push @tests,
444             [ "Lowest code point requiring 13 bytes to represent", # 2**36
445                 "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
446                 ($::is64bit) ? 0x1000000000 : -1,    # overflows on 32bit
447             ],
448     };
449
450     if ($::is64bit) {
451         push @tests,
452             [ "highest 64 bit code point",
453               (isASCII)
454               ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
455               : I8_to_native(
456                 "\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"),
457               0xFFFFFFFFFFFFFFFF,
458               (isASCII) ? 1 : 2,
459             ],
460             [ "first 65 bit code point",
461               (isASCII)
462               ? "\xff\x80\x9f\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
463               : I8_to_native(
464                 "\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
465               -1,
466             ];
467         if (isASCII) {
468             push @tests,
469                 [ "overflow that old algorithm failed to detect",
470                     "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
471                     -1,
472                 ];
473         }
474         else {
475             push @tests,    # These could falsely show wrongly in a naive
476                             # implementation
477                 [ "requires at least 32 bits",
478                     I8_to_native(
479                     "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
480                     0x800000000,
481                 ],
482                 [ "requires at least 32 bits",
483                     I8_to_native(
484                     "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
485                     0x10000000000,
486                 ],
487                 [ "requires at least 32 bits",
488                     I8_to_native(
489                     "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
490                     0x200000000000,
491                 ],
492                 [ "requires at least 32 bits",
493                     I8_to_native(
494                     "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
495                     0x4000000000000,
496                 ],
497                 [ "requires at least 32 bits",
498                     I8_to_native(
499                     "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
500                     0x80000000000000,
501                 ],
502                 [ "requires at least 32 bits",
503                     I8_to_native(
504                     "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
505                     0x1000000000000000,
506                 ];
507         }
508     }
509 }
510
511 sub flags_to_text($$)
512 {
513     my ($flags, $flags_to_text_ref) = @_;
514
515     # Returns a string containing a mnemonic representation of the bits that
516     # are set in the $flags.  These are assumed to be flag bits.  The return
517     # looks like "FOO|BAR|BAZ".  The second parameter is a reference to an
518     # array that gives the textual representation of all the possible flags.
519     # Element 0 is the text for the bit 0 flag; element 1 for bit 1; ....  If
520     # no bits at all are set the string "0" is returned;
521
522     my @flag_text;
523     my $shift = 0;
524
525     return "0" if $flags == 0;
526
527     while ($flags) {
528         #diag sprintf "%x", $flags;
529         if ($flags & 1) {
530             push @flag_text, $flags_to_text_ref->[$shift];
531         }
532         $shift++;
533         $flags >>= 1;
534     }
535
536     return join "|", @flag_text;
537 }
538
539 # Possible flag returns from utf8n_to_uvchr_error().  These should have G_,
540 # instead of A_, D_, but the prefixes will be used in a a later commit, so
541 # minimize churn by having them here.
542 my @utf8n_flags_to_text =  ( qw(
543         A_EMPTY
544         A_CONTINUATION
545         A_NON_CONTINUATION
546         A_SHORT
547         A_LONG
548         A_LONG_AND_ITS_VALUE
549         PLACEHOLDER
550         A_OVERFLOW
551         D_SURROGATE
552         W_SURROGATE
553         D_NONCHAR
554         W_NONCHAR
555         D_SUPER
556         W_SUPER
557         D_PERL_EXTENDED
558         W_PERL_EXTENDED
559         CHECK_ONLY
560         NO_CONFIDENCE_IN_CURLEN_
561     ) );
562
563 sub utf8n_display_call($)
564 {
565     # Converts an eval string that calls test_utf8n_to_uvchr into a more human
566     # readable form, and returns it.  Doesn't work if the byte string contains
567     # an apostrophe.  The return will look something like:
568     #   test_utf8n_to_uvchr_error('$bytes', $length, $flags)
569     #diag $_[0];
570
571     $_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x;
572     my $text1 = $1;     # Everything before the byte string
573     my $bytes = $2;
574     my $text2 = $3;     # Includes the length
575     my $flags = $4;
576
577     return $text1
578          . display_bytes($bytes)
579          . $text2
580          . flags_to_text($flags, \@utf8n_flags_to_text)
581          . ')';
582 }
583
584 sub uvchr_display_call($)
585 {
586     # Converts an eval string that calls test_uvchr_to_utf8 into a more human
587     # readable form, and returns it.  The return will look something like:
588     #   test_uvchr_to_utf8n_flags($uv, $flags)
589     #diag $_[0];
590
591     my @flags_to_text =  ( qw(
592             W_SURROGATE
593             W_NONCHAR
594             W_SUPER
595             W_PERL_EXTENDED
596             D_SURROGATE
597             D_NONCHAR
598             D_SUPER
599             D_PERL_EXTENDED
600        ) );
601
602     $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x;
603     my $text = $1;
604     my $cp = sprintf "%X", $2;
605     my $flags = $3;
606
607     return "${text}0x$cp, " . flags_to_text($flags, \@flags_to_text) . ')';
608 }
609
610 sub do_warnings_test(@)
611 {
612     my @expected_warnings = @_;
613
614     # Compares the input expected warnings array with @warnings_gotten,
615     # generating a pass for each found, removing it from @warnings_gotten.
616     # Any discrepancies generate test failures.  Returns TRUE if no
617     # discrepcancies; otherwise FALSE.
618
619     my $succeeded = 1;
620
621     if (@expected_warnings == 0) {
622         if (! is(@warnings_gotten, 0, "    Expected and got no warnings")) {
623             output_warnings(@warnings_gotten);
624             $succeeded = 0;
625         }
626         return $succeeded;
627     }
628
629     # Check that we got all the expected warnings,
630     # removing each one found
631   WARNING:
632     foreach my $expected (@expected_warnings) {
633         foreach (my $i = 0; $i < @warnings_gotten; $i++) {
634             if ($warnings_gotten[$i] =~ $expected) {
635                 pass("    Expected and got warning: "
636                     . " $warnings_gotten[$i]");
637                 splice @warnings_gotten, $i, 1;
638                 next WARNING;
639             }
640         }
641         fail("    Expected a warning that matches "
642             . $expected . " but didn't get it");
643         $succeeded = 0;
644     }
645
646     if (! is(@warnings_gotten, 0, "    Got no unexpected warnings")) {
647         output_warnings(@warnings_gotten);
648         $succeeded = 0;
649     }
650
651     return $succeeded;
652 }
653
654 # This test is split into this number of files.
655 my $num_test_files = $ENV{TEST_JOBS} || 1;
656 $num_test_files = 10 if $num_test_files > 10;
657
658 my $test_count = -1;
659 foreach my $test (@tests) {
660     $test_count++;
661     next if $test_count % $num_test_files != $::TEST_CHUNK;
662
663     my ($testname, $bytes, $allowed_uv, $needed_to_discern_len) = @$test;
664
665     my $length = length $bytes;
666     my $initially_overlong = $testname =~ /overlong/;
667     my $will_overflow = $allowed_uv < 0;
668
669     my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv);
670     my $display_bytes = display_bytes($bytes);
671
672     my $controlling_warning_category;
673     my $utf8n_flag_to_warn;
674     my $utf8n_flag_to_disallow;
675     my $uvchr_flag_to_warn;
676     my $uvchr_flag_to_disallow;
677
678     # We want to test that the independent flags are actually independent.
679     # For example, that a surrogate doesn't trigger a non-character warning,
680     # and conversely, turning off an above-Unicode flag doesn't suppress a
681     # surrogate warning.  Earlier versions of this file used nested loops to
682     # test all possible combinations.  But that creates lots of tests, making
683     # this run too long.  What is now done instead is to use the complement of
684     # the category we are testing to greatly reduce the combinatorial
685     # explosion.  For example, if we have a surrogate and we aren't expecting
686     # a warning about it, we set all the flags for non-surrogates to raise
687     # warnings.  If one shows up, it indicates the flags aren't independent.
688     my $utf8n_flag_to_warn_complement;
689     my $utf8n_flag_to_disallow_complement;
690     my $uvchr_flag_to_warn_complement;
691     my $uvchr_flag_to_disallow_complement;
692
693     # Many of the code points being tested are middling in that if code point
694     # edge cases work, these are very likely to as well.  Because this test
695     # file takes a while to execute, we skip testing the edge effects of code
696     # points deemed middling, while testing their basics and continuing to
697     # fully test the non-middling code points.
698     my $skip_most_tests = 0;
699
700     my $cp_message_qr;      # Pattern that matches the message raised when
701                             # that message contains the problematic code
702                             # point.  The message is the same (currently) both
703                             # when going from/to utf8.
704     my $non_cp_trailing_text;   # The suffix text when the message doesn't
705                                 # contain a code point.  (This is a result of
706                                 # some sort of malformation that means we
707                                 # can't get an exact code poin
708     my $extended_cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
709                         \Q requires a Perl extension, and so is not\E
710                         \Q portable\E/x;
711     my $extended_non_cp_trailing_text
712                         = "is a Perl extension, and so is not portable";
713
714     # What bytes should have been used to specify a code point that has been
715     # specified as an overlong.
716     my $correct_bytes_for_overlong;
717
718     # Is this test malformed from the beginning?  If so, we know to generally
719     # expect that the tests will show it isn't valid.
720     my $initially_malformed = 0;
721
722     if ($initially_overlong) {
723         $non_cp_trailing_text = "if you see this, there is an error";
724         $cp_message_qr = qr/\Q$non_cp_trailing_text\E/;
725         if (! defined $needed_to_discern_len) {
726             $needed_to_discern_len = overlong_discern_len($bytes);
727         }
728         $initially_malformed = 1;
729         $correct_bytes_for_overlong = display_bytes_no_quotes(chr $allowed_uv);
730         $utf8n_flag_to_warn     = 0;
731         $utf8n_flag_to_disallow = 0;
732
733         $utf8n_flag_to_warn_complement =     $::UTF8_WARN_SURROGATE;
734         $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE;
735         if (! $will_overflow && $allowed_uv <= 0x10FFFF) {
736             $utf8n_flag_to_warn_complement     |= $::UTF8_WARN_SUPER;
737             $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_SUPER;
738             if (($allowed_uv & 0xFFFF) != 0xFFFF) {
739                 $utf8n_flag_to_warn_complement      |= $::UTF8_WARN_NONCHAR;
740                 $utf8n_flag_to_disallow_complement  |= $::UTF8_DISALLOW_NONCHAR;
741             }
742         }
743         if (! is_extended_utf8($bytes)) {
744             $utf8n_flag_to_warn_complement |= $::UTF8_WARN_PERL_EXTENDED;
745             $utf8n_flag_to_disallow_complement  |= $::UTF8_DISALLOW_PERL_EXTENDED;
746         }
747         $controlling_warning_category = 'utf8';
748     }
749     elsif($will_overflow || $allowed_uv > 0x10FFFF) {
750
751         # Set the SUPER flags; later, we test for PERL_EXTENDED as well.
752         $utf8n_flag_to_warn     = $::UTF8_WARN_SUPER;
753         $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER;
754         $uvchr_flag_to_warn     = $::UNICODE_WARN_SUPER;
755         $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;;
756
757         # Below, we add the flags for non-perl_extended to the code points
758         # that don't fit that category.  Special tests are done for this
759         # category in the inner loop.
760         $utf8n_flag_to_warn_complement     = $::UTF8_WARN_NONCHAR
761                                             |$::UTF8_WARN_SURROGATE;
762         $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
763                                             |$::UTF8_DISALLOW_SURROGATE;
764         $uvchr_flag_to_warn_complement     = $::UNICODE_WARN_NONCHAR
765                                             |$::UNICODE_WARN_SURROGATE;
766         $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
767                                             |$::UNICODE_DISALLOW_SURROGATE;
768         $controlling_warning_category = 'non_unicode';
769
770         if ($will_overflow) {  # This is realy a malformation
771             $non_cp_trailing_text = "if you see this, there is an error";
772             $cp_message_qr = qr/\Q$non_cp_trailing_text\E/;
773             $initially_malformed = 1;
774             if (! defined $needed_to_discern_len) {
775                 $needed_to_discern_len = overflow_discern_len($length);
776             }
777         }
778         elsif (requires_extended_utf8($allowed_uv)) {
779             $cp_message_qr = $extended_cp_message_qr;
780             $non_cp_trailing_text = $extended_non_cp_trailing_text;
781             $needed_to_discern_len = 1 unless defined $needed_to_discern_len;
782         }
783         else {
784             $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
785                                 \Q may not be portable\E/x;
786             $non_cp_trailing_text = "is for a non-Unicode code point, may not"
787                                 . " be portable";
788             $utf8n_flag_to_warn_complement     |= $::UTF8_WARN_PERL_EXTENDED;
789             $utf8n_flag_to_disallow_complement
790                                            |= $::UTF8_DISALLOW_PERL_EXTENDED;
791             $uvchr_flag_to_warn_complement |= $::UNICODE_WARN_PERL_EXTENDED;
792             $uvchr_flag_to_disallow_complement
793                                         |= $::UNICODE_DISALLOW_PERL_EXTENDED;
794         }
795     }
796     elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
797         $cp_message_qr = qr/UTF-16 surrogate U\+$uv_string/;
798         $non_cp_trailing_text = "is for a surrogate";
799         $needed_to_discern_len = 2 unless defined $needed_to_discern_len;
800         $skip_most_tests = 1 if $allowed_uv > 0xD800 && $allowed_uv < 0xDFFF;
801
802         $utf8n_flag_to_warn     = $::UTF8_WARN_SURROGATE;
803         $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SURROGATE;
804         $uvchr_flag_to_warn     = $::UNICODE_WARN_SURROGATE;
805         $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SURROGATE;;
806
807         $utf8n_flag_to_warn_complement     = $::UTF8_WARN_NONCHAR
808                                             |$::UTF8_WARN_SUPER
809                                             |$::UTF8_WARN_PERL_EXTENDED;
810         $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
811                                             |$::UTF8_DISALLOW_SUPER
812                                             |$::UTF8_DISALLOW_PERL_EXTENDED;
813         $uvchr_flag_to_warn_complement     = $::UNICODE_WARN_NONCHAR
814                                             |$::UNICODE_WARN_SUPER
815                                             |$::UNICODE_WARN_PERL_EXTENDED;
816         $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
817                                             |$::UNICODE_DISALLOW_SUPER
818                                             |$::UNICODE_DISALLOW_PERL_EXTENDED;
819         $controlling_warning_category = 'surrogate';
820     }
821     elsif (   ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
822            || ($allowed_uv & 0xFFFE) == 0xFFFE)
823     {
824         $cp_message_qr = qr/\QUnicode non-character U+$uv_string\E
825                             \Q is not recommended for open interchange\E/x;
826         $non_cp_trailing_text = "if you see this, there is an error";
827         $needed_to_discern_len = $length unless defined $needed_to_discern_len;
828         if (   ($allowed_uv > 0xFDD0 && $allowed_uv < 0xFDEF)
829             || ($allowed_uv > 0xFFFF && $allowed_uv < 0x10FFFE))
830         {
831             $skip_most_tests = 1;
832         }
833
834         $utf8n_flag_to_warn     = $::UTF8_WARN_NONCHAR;
835         $utf8n_flag_to_disallow = $::UTF8_DISALLOW_NONCHAR;
836         $uvchr_flag_to_warn     = $::UNICODE_WARN_NONCHAR;
837         $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_NONCHAR;;
838
839         $utf8n_flag_to_warn_complement     = $::UTF8_WARN_SURROGATE
840                                             |$::UTF8_WARN_SUPER
841                                             |$::UTF8_WARN_PERL_EXTENDED;
842         $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE
843                                             |$::UTF8_DISALLOW_SUPER
844                                             |$::UTF8_DISALLOW_PERL_EXTENDED;
845         $uvchr_flag_to_warn_complement     = $::UNICODE_WARN_SURROGATE
846                                             |$::UNICODE_WARN_SUPER
847                                             |$::UNICODE_WARN_PERL_EXTENDED;
848         $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_SURROGATE
849                                             |$::UNICODE_DISALLOW_SUPER
850                                             |$::UNICODE_DISALLOW_PERL_EXTENDED;
851
852         $controlling_warning_category = 'nonchar';
853     }
854     else {
855         die "Can't figure out what type of warning to test for $testname"
856     }
857
858     die 'Didn\'t set $needed_to_discern_len for ' . $testname
859                                         unless defined $needed_to_discern_len;
860
861     # We try various combinations of malformations that can occur
862     foreach my $short (0, 1) {
863       next if $skip_most_tests && $short;
864       foreach my $unexpected_noncont (0, 1) {
865         next if $skip_most_tests && $unexpected_noncont;
866         foreach my $overlong (0, 1) {
867           next if $overlong && $skip_most_tests;
868           next if $initially_overlong && ! $overlong;
869
870           # If we're creating an overlong, it can't be longer than the
871           # maximum length, so skip if we're already at that length.
872           next if   (! $initially_overlong && $overlong)
873                    &&  $length >= $::max_bytes;
874
875           my $this_cp_message_qr = $cp_message_qr;
876           my $this_non_cp_trailing_text = $non_cp_trailing_text;
877
878           foreach my $malformed_allow_type (0..2) {
879             # 0 don't allow this malformation; ignored if no malformation
880             # 1 allow, with REPLACEMENT CHARACTER returned
881             # 2 allow, with intended code point returned.  All malformations
882             #   other than overlong can't determine the intended code point,
883             #   so this isn't valid for them.
884             next if     $malformed_allow_type == 2
885                     && ($will_overflow || $short || $unexpected_noncont);
886             next if $skip_most_tests && $malformed_allow_type;
887
888             # Here we are in the innermost loop for malformations.  So we
889             # know which ones are in effect.  Can now change the input to be
890             # appropriately malformed.  We also can set up certain other
891             # things now, like whether we expect a return flag from this
892             # malformation, and which flag.
893
894             my $this_bytes = $bytes;
895             my $this_length = $length;
896             my $this_expected_len = $length;
897             my $this_needed_to_discern_len = $needed_to_discern_len;
898
899             my @malformation_names;
900             my @expected_malformation_warnings;
901             my @expected_malformation_return_flags;
902
903             # Contains the flags for any allowed malformations.  Currently no
904             # combinations of on/off are tested for.  It's either all are
905             # allowed, or none are.
906             my $allow_flags = 0;
907             my $overlong_is_in_perl_extended_utf8 = 0;
908             my $dont_use_overlong_cp = 0;
909
910             if ($overlong) {
911                 if (! $initially_overlong) {
912                 my $new_expected_len;
913
914                 # To force this malformation, we convert the original start
915                 # byte into a continuation byte with the same data bits as
916                 # originally. ...
917                 my $start_byte = substr($this_bytes, 0, 1);
918                 my $converted_to_continuation_byte
919                                             = start_byte_to_cont($start_byte);
920
921                 # ... Then we prepend it with a known overlong sequence.  This
922                 # should evaluate to the exact same code point as the
923                 # original.  We try to avoid an overlong using Perl extended
924                 # UTF-8.  The code points are the highest representable as
925                 # overlongs on the respective platform without using extended
926                 # UTF-8.
927                 if (native_to_I8($start_byte) lt "\xFC") {
928                     $start_byte = I8_to_native("\xFC");
929                     $new_expected_len = 6;
930                 }
931                 elsif (! isASCII && native_to_I8($start_byte) lt "\xFE") {
932
933                     # FE is not extended UTF-8 on EBCDIC
934                     $start_byte = I8_to_native("\xFE");
935                     $new_expected_len = 7;
936                 }
937                 else {  # Must use extended UTF-8.  On ASCII platforms, we
938                         # could express some overlongs here starting with
939                         # \xFE, but there's no real reason to do so.
940                     $overlong_is_in_perl_extended_utf8 = 1;
941                     $start_byte = I8_to_native("\xFF");
942                     $new_expected_len = $::max_bytes;
943                     $this_cp_message_qr = $extended_cp_message_qr;
944
945                     # The warning that gets raised doesn't include the code
946                     # point in the message if the code point can be expressed
947                     # without using extended UTF-8, but the particular
948                     # overlong sequence used is in extended UTF-8.  To do
949                     # otherwise would be confusing to the user, as it would
950                     # claim the code point requires extended, when it doesn't.
951                     $dont_use_overlong_cp = 1
952                                     unless requires_extended_utf8($allowed_uv);
953                     $this_non_cp_trailing_text = $extended_non_cp_trailing_text;
954                 }
955
956                 # Splice in the revise continuation byte, preceded by the
957                 # start byte and the proper number of the lowest continuation
958                 # bytes.
959                 $this_bytes =   $start_byte
960                              . ($native_lowest_continuation_chr
961                                 x ( $new_expected_len - 1 - length($this_bytes)))
962                              .  $converted_to_continuation_byte
963                              .  substr($this_bytes, 1);
964                 $this_length = length($this_bytes);
965                 $this_needed_to_discern_len =    $new_expected_len
966                                             - (  $this_expected_len
967                                                - $this_needed_to_discern_len);
968                 $this_expected_len = $new_expected_len;
969                 }
970             }
971
972             if ($short) {
973
974                 # To force this malformation, just tell the test to not look
975                 # as far as it should into the input.
976                 $this_length--;
977                 $this_expected_len--;
978
979                 $allow_flags |= $::UTF8_ALLOW_SHORT if $malformed_allow_type;
980             }
981
982             if ($unexpected_noncont) {
983
984                 # To force this malformation, change the final continuation
985                 # byte into a non continuation.
986                 my $pos = ($short) ? -2 : -1;
987                 substr($this_bytes, $pos, 1) = '?';
988                 $this_expected_len--;
989             }
990
991             # The whole point of a test that is malformed from the beginning
992             # is to test for that malformation.  If we've modified things so
993             # much that we don't have enough information to detect that
994             # malformation, there's no point in testing.
995             next if    $initially_malformed
996                     && $this_expected_len < $this_needed_to_discern_len;
997
998             # Here, we've transformed the input with all of the desired
999             # non-overflow malformations.  We are now in a position to
1000             # construct any potential warnings for those malformations.  But
1001             # it's a pain to get the detailed messages exactly right, so for
1002             # now XXX, only do so for those that return an explicit code
1003             # point.
1004
1005             if ($overlong) {
1006                 push @malformation_names, 'overlong';
1007                 push @expected_malformation_return_flags, $::UTF8_GOT_LONG;
1008
1009                 # If one of the other malformation types is also in effect, we
1010                 # don't know what the intended code point was.
1011                 if ($short || $unexpected_noncont || $will_overflow) {
1012                     push @expected_malformation_warnings, qr/overlong/;
1013                 }
1014                 else {
1015                     my $wrong_bytes = display_bytes_no_quotes(
1016                                          substr($this_bytes, 0, $this_length));
1017                     if (! defined $correct_bytes_for_overlong) {
1018                         $correct_bytes_for_overlong
1019                                             = display_bytes_no_quotes($bytes);
1020                     }
1021                     my $prefix = ($allowed_uv > 0x10FFFF) ? "0x" : "U+";
1022                     push @expected_malformation_warnings,
1023                             qr/\QMalformed UTF-8 character: $wrong_bytes\E
1024                                \Q (overlong; instead use\E
1025                                \Q $correct_bytes_for_overlong to\E
1026                                \Q represent $prefix$uv_string)/x;
1027                 }
1028
1029                 if ($malformed_allow_type == 2) {
1030                     $allow_flags |= $::UTF8_ALLOW_LONG_AND_ITS_VALUE;
1031                 }
1032                 elsif ($malformed_allow_type) {
1033                     $allow_flags |= $::UTF8_ALLOW_LONG;
1034                 }
1035             }
1036             if ($short) {
1037                 push @malformation_names, 'short';
1038                 push @expected_malformation_return_flags, $::UTF8_GOT_SHORT;
1039                 push @expected_malformation_warnings, qr/too short/;
1040             }
1041             if ($unexpected_noncont) {
1042                 push @malformation_names, 'unexpected non-continuation';
1043                 push @expected_malformation_return_flags,
1044                                 $::UTF8_GOT_NON_CONTINUATION;
1045                 $allow_flags |= $::UTF8_ALLOW_NON_CONTINUATION
1046                                                     if $malformed_allow_type;
1047                 push @expected_malformation_warnings,
1048                                         qr/unexpected non-continuation byte/;
1049             }
1050
1051             # The overflow malformation is done differently than other
1052             # malformations.  It comes from manually typed tests in the test
1053             # array.  We now make it be treated like one of the other
1054             # malformations.  But some has to be deferred until the inner loop
1055             my $overflow_msg_pattern;
1056             if ($will_overflow) {
1057                 push @malformation_names, 'overflow';
1058
1059                 $overflow_msg_pattern = display_bytes_no_quotes(
1060                                     substr($this_bytes, 0, $this_expected_len));
1061                 $overflow_msg_pattern = qr/\QMalformed UTF-8 character:\E
1062                                            \Q $overflow_msg_pattern\E
1063                                            \Q (overflows)\E/x;
1064                 push @expected_malformation_return_flags, $::UTF8_GOT_OVERFLOW;
1065                 $allow_flags |= $::UTF8_ALLOW_OVERFLOW if $malformed_allow_type;
1066             }
1067
1068             # And we can create the malformation-related text for the the test
1069             # names we eventually will generate.
1070             my $malformations_name = "";
1071             if (@malformation_names) {
1072                 $malformations_name .= "dis" unless $malformed_allow_type;
1073                 $malformations_name .= "allowed ";
1074                 $malformations_name .= "malformation";
1075                 $malformations_name .= "s" if @malformation_names > 1;
1076                 $malformations_name .= ": ";
1077                 $malformations_name .=  join "/", @malformation_names;
1078                 $malformations_name =  " ($malformations_name)";
1079             }
1080
1081             # Done setting up the malformation related stuff
1082
1083             {   # First test the isFOO calls
1084                 use warnings; # XXX no warnings 'deprecated';   # Make sure these don't raise warnings
1085                 undef @warnings_gotten;
1086
1087                 my $ret = test_isUTF8_CHAR($this_bytes, $this_length);
1088                 my $ret_flags
1089                         = test_isUTF8_CHAR_flags($this_bytes, $this_length, 0);
1090                 if ($malformations_name) {
1091                     is($ret, 0, "For $testname$malformations_name: isUTF8_CHAR() returns 0");
1092                     is($ret_flags, 0, "    And isUTF8_CHAR_flags() returns 0");
1093                 }
1094                 else {
1095                     is($ret, $this_length, "For $testname: isUTF8_CHAR() returns"
1096                                          . " expected length: $this_length");
1097                     is($ret_flags, $this_length,
1098                        "    And isUTF8_CHAR_flags(...,0) returns expected"
1099                      . " length: $this_length");
1100                 }
1101                 is(scalar @warnings_gotten, 0,
1102                    "    And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags"
1103                  . " generated any warnings")
1104                 or output_warnings(@warnings_gotten);
1105
1106                 undef @warnings_gotten;
1107                 $ret = test_isSTRICT_UTF8_CHAR($this_bytes, $this_length);
1108                 if ($malformations_name) {
1109                     is($ret, 0, "    And isSTRICT_UTF8_CHAR() returns 0");
1110                 }
1111                 else {
1112                     my $expected_ret
1113                                 = (   $testname =~ /surrogate|non-character/
1114                                    || $allowed_uv > 0x10FFFF)
1115                                   ? 0
1116                                   : $this_length;
1117                     is($ret, $expected_ret,
1118                         "    And isSTRICT_UTF8_CHAR() returns expected"
1119                       . " length: $expected_ret");
1120                     $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length,
1121                                         $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
1122                     is($ret, $expected_ret,
1123                        "    And isUTF8_CHAR_flags('"
1124                      . "DISALLOW_ILLEGAL_INTERCHANGE') acts like"
1125                      . " isSTRICT_UTF8_CHAR");
1126                 }
1127                 is(scalar @warnings_gotten, 0,
1128                         "    And neither isSTRICT_UTF8_CHAR() nor"
1129                       . " isUTF8_CHAR_flags generated any warnings")
1130                 or output_warnings(@warnings_gotten);
1131
1132                 undef @warnings_gotten;
1133                 $ret = test_isC9_STRICT_UTF8_CHAR($this_bytes, $this_length);
1134                 if ($malformations_name) {
1135                     is($ret, 0, "    And isC9_STRICT_UTF8_CHAR() returns 0");
1136                 }
1137                 else {
1138                     my $expected_ret = (   $testname =~ /surrogate/
1139                                         || $allowed_uv > 0x10FFFF)
1140                                        ? 0
1141                                        : $this_expected_len;
1142                     is($ret, $expected_ret, "    And isC9_STRICT_UTF8_CHAR()"
1143                                           . " returns expected length:"
1144                                           . " $expected_ret");
1145                     $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length,
1146                                     $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
1147                     is($ret, $expected_ret,
1148                        "    And isUTF8_CHAR_flags('"
1149                      . "DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like"
1150                      . " isC9_STRICT_UTF8_CHAR");
1151                 }
1152                 is(scalar @warnings_gotten, 0,
1153                         "    And neither isC9_STRICT_UTF8_CHAR() nor"
1154                       . " isUTF8_CHAR_flags generated any warnings")
1155                 or output_warnings(@warnings_gotten);
1156
1157                 foreach my $disallow_type (0..2) {
1158                     # 0 is don't disallow this type of code point
1159                     # 1 is do disallow
1160                     # 2 is do disallow, but only code points requiring
1161                     #   perl-extended-UTF8
1162
1163                     my $disallow_flags;
1164                     my $expected_ret;
1165
1166                     if ($malformations_name) {
1167
1168                         # Malformations are by default disallowed, so testing
1169                         # with $disallow_type equal to 0 is sufficicient.
1170                         next if $disallow_type;
1171
1172                         $disallow_flags = 0;
1173                         $expected_ret = 0;
1174                     }
1175                     elsif ($disallow_type == 1) {
1176                         $disallow_flags = $utf8n_flag_to_disallow;
1177                         $expected_ret = 0;
1178                     }
1179                     elsif ($disallow_type == 2) {
1180                         next if ! requires_extended_utf8($allowed_uv);
1181                         $disallow_flags = $::UTF8_DISALLOW_PERL_EXTENDED;
1182                         $expected_ret = 0;
1183                     }
1184                     else {  # type is 0
1185                         $disallow_flags = $utf8n_flag_to_disallow_complement;
1186                         $expected_ret = $this_length;
1187                     }
1188
1189                     $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length,
1190                                                   $disallow_flags);
1191                     is($ret, $expected_ret,
1192                              "    And isUTF8_CHAR_flags($display_bytes,"
1193                            . " $disallow_flags) returns $expected_ret")
1194                       or diag "The flags mean "
1195                             . flags_to_text($disallow_flags,
1196                                             \@utf8n_flags_to_text);
1197                     is(scalar @warnings_gotten, 0,
1198                             "    And isUTF8_CHAR_flags(...) generated"
1199                           . " no warnings")
1200                       or output_warnings(@warnings_gotten);
1201
1202                     # Test partial character handling, for each byte not a
1203                     # full character
1204                     my $did_test_partial = 0;
1205                     for (my $j = 1; $j < $this_length - 1; $j++) {
1206                         $did_test_partial = 1;
1207                         my $partial = substr($this_bytes, 0, $j);
1208                         my $ret_should_be;
1209                         my $comment;
1210                         if ($disallow_type || $malformations_name) {
1211                             $ret_should_be = 0;
1212                             $comment = "disallowed";
1213
1214                             # The number of bytes required to tell if a
1215                             # sequence has something wrong is the smallest of
1216                             # all the things wrong with it.  We start with the
1217                             # number for this type of code point, if that is
1218                             # disallowed; or the whole length if not.  The
1219                             # latter is what a couple of the malformations
1220                             # require.
1221                             my $needed_to_tell = ($disallow_type)
1222                                                   ? $this_needed_to_discern_len
1223                                                   : $this_expected_len;
1224
1225                             # Then we see if the malformations that are
1226                             # detectable early in the string are present.
1227                             if ($overlong) {
1228                                 my $dl = overlong_discern_len($this_bytes);
1229                                 $needed_to_tell = $dl if $dl < $needed_to_tell;
1230                             }
1231                             if ($will_overflow) {
1232                                 my $dl = overflow_discern_len($length);
1233                                 $needed_to_tell = $dl if $dl < $needed_to_tell;
1234                             }
1235
1236                             if ($j < $needed_to_tell) {
1237                                 $ret_should_be = 1;
1238                                 $comment .= ", but need $needed_to_tell"
1239                                           . " bytes to discern:";
1240                             }
1241                         }
1242                         else {
1243                             $ret_should_be = 1;
1244                             $comment = "allowed";
1245                         }
1246
1247                         undef @warnings_gotten;
1248
1249                         $ret = test_is_utf8_valid_partial_char_flags($partial,
1250                                                         $j, $disallow_flags);
1251                         is($ret, $ret_should_be,
1252                             "    And is_utf8_valid_partial_char_flags("
1253                             . display_bytes($partial)
1254                             . ", $disallow_flags), $comment: returns"
1255                             . " $ret_should_be")
1256                         or diag "The flags mean "
1257                         . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
1258                     }
1259
1260                     if ($did_test_partial) {
1261                         is(scalar @warnings_gotten, 0,
1262                             "    And is_utf8_valid_partial_char_flags()"
1263                             . " generated no warnings for any of the lengths")
1264                           or output_warnings(@warnings_gotten);
1265                     }
1266                 }
1267             }
1268
1269             # Now test the to/from UTF-8 calls.  There are several orthogonal
1270             # variables involved.  We test most possible combinations
1271
1272             foreach my $do_disallow (0, 1) {
1273               if ($do_disallow) {
1274                 next if $initially_overlong;
1275               }
1276               else {
1277                 next if $skip_most_tests;
1278             }
1279
1280               # We classify the warnings into certain "interesting" types,
1281               # described later
1282               foreach my $warning_type (0..4) {
1283                 next if $skip_most_tests && $warning_type != 1;
1284                 foreach my $use_warn_flag (0, 1) {
1285                     if ($use_warn_flag) {
1286                         next if $initially_overlong;
1287                     }
1288                     else {
1289                         next if $skip_most_tests;
1290                     }
1291
1292                     # Finally, here is the inner loop
1293
1294                     my $this_utf8n_flag_to_warn = $utf8n_flag_to_warn;
1295                     my $this_utf8n_flag_to_disallow = $utf8n_flag_to_disallow;
1296                     my $this_uvchr_flag_to_warn = $uvchr_flag_to_warn;
1297                     my $this_uvchr_flag_to_disallow = $uvchr_flag_to_disallow;
1298
1299                     my $eval_warn;
1300                     my $expect_regular_warnings;
1301                     my $expect_warnings_for_malformed;
1302                     my $expect_warnings_for_overflow;
1303
1304                     if ($warning_type == 0) {
1305                         $eval_warn = "use warnings; no warnings 'deprecated'";
1306                         $expect_regular_warnings = $use_warn_flag;
1307
1308                         # We ordinarily expect overflow warnings here.  But it
1309                         # is somewhat more complicated, and the final
1310                         # determination is deferred to one place in the filw
1311                         # where we handle overflow.
1312                         $expect_warnings_for_overflow = 1;
1313
1314                         # We would ordinarily expect malformed warnings in
1315                         # this case, but not if malformations are allowed.
1316                         $expect_warnings_for_malformed
1317                                                 = $malformed_allow_type == 0;
1318                     }
1319                     elsif ($warning_type == 1) {
1320                         $eval_warn = "no warnings";
1321                         $expect_regular_warnings = 0;
1322                         $expect_warnings_for_overflow = 0;
1323                         $expect_warnings_for_malformed = 0;
1324                     }
1325                     elsif ($warning_type == 2) {
1326                         $eval_warn = "no warnings; use warnings 'utf8'";
1327                         $expect_regular_warnings = $use_warn_flag;
1328                         $expect_warnings_for_overflow = 1;
1329                         $expect_warnings_for_malformed
1330                                                 = $malformed_allow_type == 0;
1331                     }
1332                     elsif ($warning_type == 3) {
1333                         $eval_warn = "no warnings; use warnings"
1334                                    . " '$controlling_warning_category'";
1335                         $expect_regular_warnings = $use_warn_flag;
1336                         $expect_warnings_for_overflow
1337                             = $controlling_warning_category eq 'non_unicode';
1338                         $expect_warnings_for_malformed = 0;
1339                     }
1340                     elsif ($warning_type == 4) {  # Like type 3, but uses the
1341                                                   # PERL_EXTENDED flags
1342                         # The complement flags were set up so that the
1343                         # PERL_EXTENDED flags have been tested that they don't
1344                         # trigger wrongly for too small code points.  And the
1345                         # flags have been set up so that those small code
1346                         # points are tested for being above Unicode.  What's
1347                         # left to test is that the large code points do
1348                         # trigger the PERL_EXTENDED flags.
1349                         next if ! requires_extended_utf8($allowed_uv);
1350                         next if $controlling_warning_category ne 'non_unicode';
1351                         $eval_warn = "no warnings; use warnings 'non_unicode'";
1352                         $expect_regular_warnings = 1;
1353                         $expect_warnings_for_overflow = 1;
1354                         $expect_warnings_for_malformed = 0;
1355                         $this_utf8n_flag_to_warn = $::UTF8_WARN_PERL_EXTENDED;
1356                         $this_utf8n_flag_to_disallow
1357                                              = $::UTF8_DISALLOW_PERL_EXTENDED;
1358                         $this_uvchr_flag_to_warn
1359                                               = $::UNICODE_WARN_PERL_EXTENDED;
1360                         $this_uvchr_flag_to_disallow
1361                                           = $::UNICODE_DISALLOW_PERL_EXTENDED;
1362                     }
1363                     else {
1364                        die "Unexpected warning type '$warning_type'";
1365                     }
1366
1367                     # We only need to test the case where all warnings are
1368                     # enabled (type 0) to see if turning off the warning flag
1369                     # causes things to not be output.  If those pass, then
1370                     # turning on some sub-category of warnings, or turning off
1371                     # warnings altogether are extremely likely to not output
1372                     # warnings either, given how the warnings subsystem is
1373                     # supposed to work, and this file assumes it does work.
1374                     next if $warning_type != 0 && ! $use_warn_flag;
1375
1376                     # The convention is that the 'got' flag is the same value
1377                     # as the disallow one.  If this were violated, the tests
1378                     # here should start failing.
1379                     my $return_flag = $this_utf8n_flag_to_disallow;
1380
1381                     # If we aren't expecting warnings/disallow for this, turn
1382                     # on all the other flags.  That makes sure that they all
1383                     # are independent of this flag, and so we don't need to
1384                     # test them individually.
1385                     my $this_warning_flags
1386                             = ($use_warn_flag)
1387                               ? $this_utf8n_flag_to_warn
1388                               : ($overlong_is_in_perl_extended_utf8
1389                                 ? ($utf8n_flag_to_warn_complement
1390                                     & ~$::UTF8_WARN_PERL_EXTENDED)
1391                                 :  $utf8n_flag_to_warn_complement);
1392                     my $this_disallow_flags
1393                             = ($do_disallow)
1394                               ? $this_utf8n_flag_to_disallow
1395                               : ($overlong_is_in_perl_extended_utf8
1396                                  ? ($utf8n_flag_to_disallow_complement
1397                                     & ~$::UTF8_DISALLOW_PERL_EXTENDED)
1398                                  :  $utf8n_flag_to_disallow_complement);
1399                     my $expected_uv = $allowed_uv;
1400                     my $this_uv_string = $uv_string;
1401
1402                     my @expected_return_flags
1403                                         = @expected_malformation_return_flags;
1404                     my @expected_warnings;
1405                     push @expected_warnings, @expected_malformation_warnings
1406                                             if $expect_warnings_for_malformed;
1407
1408                     # The overflow malformation is done differently than other
1409                     # malformations.  It comes from manually typed tests in
1410                     # the test array, but it also is above Unicode and uses
1411                     # Perl extended UTF-8, so affects some of the flags being
1412                     # tested.  We now make it be treated like one of the other
1413                     # generated malformations.
1414                     if ($will_overflow) {
1415
1416                         # An overflow is (way) above Unicode, and overrides
1417                         # everything else.
1418                         $expect_regular_warnings = 0;
1419
1420                         # Earlier, we tentatively calculated whether this
1421                         # should emit a message or not.  It's tentative
1422                         # because, even if we ordinarily would output it, we
1423                         # don't if malformations are allowed -- except an
1424                         # overflow is also a SUPER and PERL_EXTENDED, and if
1425                         # warnings for those are enabled, the overflow
1426                         # warning does get raised.
1427                         if (   $expect_warnings_for_overflow
1428                             && (    $malformed_allow_type == 0
1429                                 ||   (   $this_warning_flags
1430                                       & ($::UTF8_WARN_SUPER
1431                                         |$::UTF8_WARN_PERL_EXTENDED))))
1432                         {
1433                             push @expected_warnings, $overflow_msg_pattern;
1434                         }
1435                     }
1436
1437                     # It may be that the malformations have shortened the
1438                     # amount of input we look at so much that we can't tell
1439                     # what the category the code point was in.  Otherwise, set
1440                     # up the expected return flags based on the warnings and
1441                     # disallowments.
1442                     if ($this_expected_len < $this_needed_to_discern_len) {
1443                         $expect_regular_warnings = 0;
1444                     }
1445                     elsif (   ($this_warning_flags & $this_utf8n_flag_to_warn)
1446                            || (  $this_disallow_flags
1447                                & $this_utf8n_flag_to_disallow))
1448                     {
1449                         push @expected_return_flags, $return_flag;
1450                     }
1451
1452                     # Finish setting up the expected warning.
1453                     if ($expect_regular_warnings) {
1454
1455                         # So far the array contains warnings generated by
1456                         # malformations.  Add the expected regular one.
1457                         unshift @expected_warnings, $this_cp_message_qr;
1458
1459                         # But it may need to be modified, because either of
1460                         # these malformations means we can't determine the
1461                         # expected code point.
1462                         if (   $short || $unexpected_noncont
1463                             || $dont_use_overlong_cp)
1464                         {
1465                             my $first_byte = substr($this_bytes, 0, 1);
1466                             $expected_warnings[0] = display_bytes(
1467                                     substr($this_bytes, 0, $this_expected_len));
1468                             $expected_warnings[0]
1469                                 = qr/[Aa]\Qny UTF-8 sequence that starts with\E
1470                                      \Q $expected_warnings[0]\E
1471                                      \Q $this_non_cp_trailing_text\E/x;
1472                         }
1473                     }
1474
1475                     # Is effectively disallowed if we've set up a malformation
1476                     # (unless malformations are allowed), even if the flag
1477                     # indicates it is allowed.  Fix up test name to indicate
1478                     # this as well
1479                     my $disallowed = 0;
1480                     if (   $this_disallow_flags & $this_utf8n_flag_to_disallow
1481                         && $this_expected_len >= $this_needed_to_discern_len)
1482                     {
1483                         $disallowed = 1;
1484                     }
1485                     if ($malformations_name) {
1486                         if ($malformed_allow_type == 0) {
1487                             $disallowed = 1;
1488                         }
1489                         elsif ($malformed_allow_type == 1) {
1490
1491                             # Even if allowed, the malformation returns the
1492                             # REPLACEMENT CHARACTER.
1493                             $expected_uv = 0xFFFD;
1494                             $this_uv_string = "0xFFFD"
1495                         }
1496                     }
1497
1498                     my $this_name = "utf8n_to_uvchr_error() $testname: ";
1499                     if (! $initially_malformed) {
1500                         $this_name .= ($disallowed)
1501                                        ? 'disallowed, '
1502                                        : 'allowed, ';
1503                     }
1504                     $this_name .= "$eval_warn";
1505                     $this_name .= ", " . ((  $this_warning_flags
1506                                             & $this_utf8n_flag_to_warn)
1507                                           ? 'with flag for raising warnings'
1508                                           : 'no flag for raising warnings');
1509                     $this_name .= $malformations_name;
1510
1511                     # Do the actual test using an eval
1512                     undef @warnings_gotten;
1513                     my $ret_ref;
1514                     my $this_flags
1515                         = $allow_flags|$this_warning_flags|$this_disallow_flags;
1516                     my $eval_text =      "$eval_warn; \$ret_ref"
1517                             . " = test_utf8n_to_uvchr_error("
1518                             . "'$this_bytes', $this_length, $this_flags)";
1519                     eval "$eval_text";
1520                     if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
1521                     {
1522                         diag "\$@='$@'; call was: "
1523                            . utf8n_display_call($eval_text);
1524                         next;
1525                     }
1526                     if ($disallowed) {
1527                         is($ret_ref->[0], 0, "    And returns 0")
1528                           or diag "Call was: " . utf8n_display_call($eval_text);
1529                     }
1530                     else {
1531                         is($ret_ref->[0], $expected_uv,
1532                                 "    And returns expected uv: "
1533                               . $this_uv_string)
1534                           or diag "Call was: " . utf8n_display_call($eval_text);
1535                     }
1536                     is($ret_ref->[1], $this_expected_len,
1537                                         "    And returns expected length:"
1538                                       . " $this_expected_len")
1539                       or diag "Call was: " . utf8n_display_call($eval_text);
1540
1541                     my $returned_flags = $ret_ref->[2];
1542
1543                     for (my $i = @expected_return_flags - 1; $i >= 0; $i--) {
1544                         if ($expected_return_flags[$i] & $returned_flags) {
1545                             if ($expected_return_flags[$i]
1546                                                 == $::UTF8_GOT_PERL_EXTENDED)
1547                             {
1548                                 pass("    Expected and got return flag for"
1549                                    . " PERL_EXTENDED");
1550                             }
1551                                    # The first entries in this are
1552                                    # malformations
1553                             elsif ($i > @malformation_names - 1)  {
1554                                 pass("    Expected and got return flag"
1555                                    . " for " . $controlling_warning_category);
1556                             }
1557                             else {
1558                                 pass("    Expected and got return flag for "
1559                                    . $malformation_names[$i]
1560                                    . " malformation");
1561                             }
1562                             $returned_flags &= ~$expected_return_flags[$i];
1563                             splice @expected_return_flags, $i, 1;
1564                         }
1565                     }
1566
1567                     is($returned_flags, 0,
1568                        "    Got no unexpected return flags")
1569                       or diag "The unexpected flags gotten were: "
1570                            . (flags_to_text($returned_flags,
1571                                             \@utf8n_flags_to_text)
1572                                 # We strip off any prefixes from the flag
1573                                 # names
1574                              =~ s/ \b [A-Z] _ //xgr);
1575                     is (scalar @expected_return_flags, 0,
1576                         "    Got all expected return flags")
1577                         or diag "The expected flags not gotten were: "
1578                            . (flags_to_text(eval join("|",
1579                                                         @expected_return_flags),
1580                                             \@utf8n_flags_to_text)
1581                                 # We strip off any prefixes from the flag
1582                                 # names
1583                              =~ s/ \b [A-Z] _ //xgr);
1584
1585                     do_warnings_test(@expected_warnings)
1586                       or diag "Call was: " . utf8n_display_call($eval_text);
1587                     undef @warnings_gotten;
1588
1589                     # Check CHECK_ONLY results when the input is
1590                     # disallowed.  Do this when actually disallowed,
1591                     # not just when the $this_disallow_flags is set
1592                     if ($disallowed) {
1593                         my $this_flags = $this_disallow_flags|$::UTF8_CHECK_ONLY;
1594                         my $eval_text = "use warnings; no warnings 'deprecated'; \$ret_ref ="
1595                                       . " test_utf8n_to_uvchr_error('"
1596                                       . "$this_bytes', $this_length,"
1597                                       . " $this_flags)";
1598                         eval $eval_text;
1599                         if (! ok ("$@ eq ''",
1600                             "    And eval succeeded with CHECK_ONLY"))
1601                         {
1602                             diag "\$@='$@'; Call was: "
1603                                . utf8n_display_call($eval_text);
1604                             next;
1605                         }
1606                         is($ret_ref->[0], 0, "    CHECK_ONLY: Returns 0")
1607                           or diag "Call was: " . utf8n_display_call($eval_text);
1608                         is($ret_ref->[1], -1,
1609                                        "    CHECK_ONLY: returns -1 for length")
1610                           or diag "Call was: " . utf8n_display_call($eval_text);
1611                         if (! is(scalar @warnings_gotten, 0,
1612                                       "    CHECK_ONLY: no warnings generated"))
1613                         {
1614                             diag "Call was: " . utf8n_display_call($eval_text);
1615                             output_warnings(@warnings_gotten);
1616                         }
1617                     }
1618
1619                     # Now repeat some of the above, but for
1620                     # uvchr_to_utf8_flags().  Since this comes from an
1621                     # existing code point, it hasn't overflowed, and isn't
1622                     # malformed.
1623                     next if @malformation_names;
1624
1625                     $this_warning_flags = ($use_warn_flag)
1626                                           ? $this_uvchr_flag_to_warn
1627                                           : 0;
1628                     $this_disallow_flags = ($do_disallow)
1629                                            ? $this_uvchr_flag_to_disallow
1630                                            : 0;
1631
1632                     $disallowed = $this_disallow_flags
1633                                 & $this_uvchr_flag_to_disallow;
1634                     $this_name .= ", " . ((  $this_warning_flags
1635                                            & $this_utf8n_flag_to_warn)
1636                                           ? 'with flag for raising warnings'
1637                                           : 'no flag for raising warnings');
1638
1639                     $this_name = "uvchr_to_utf8_flags() $testname: "
1640                                             . (($disallowed)
1641                                                 ? 'disallowed'
1642                                                 : 'allowed');
1643                     $this_name .= ", $eval_warn";
1644                     $this_name .= ", " . ((  $this_warning_flags
1645                                            & $this_uvchr_flag_to_warn)
1646                                         ? 'with warning flag'
1647                                         : 'no warning flag');
1648
1649                     undef @warnings_gotten;
1650                     my $ret;
1651                     $this_flags = $this_warning_flags|$this_disallow_flags;
1652                     $eval_text = "$eval_warn; \$ret ="
1653                             . " test_uvchr_to_utf8_flags("
1654                             . "$allowed_uv, $this_flags)";
1655                     eval "$eval_text";
1656                     if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
1657                     {
1658                         diag "\$@='$@'; call was: "
1659                            . uvchr_display_call($eval_text);
1660                         next;
1661                     }
1662                     if ($disallowed) {
1663                         is($ret, undef, "    And returns undef")
1664                           or diag "Call was: " . uvchr_display_call($eval_text);
1665                     }
1666                     else {
1667                         is($ret, $this_bytes, "    And returns expected string")
1668                           or diag "Call was: " . uvchr_display_call($eval_text);
1669                     }
1670
1671                     do_warnings_test(@expected_warnings)
1672                       or diag "Call was: " . uvchr_display_call($eval_text);
1673                 }
1674               }
1675             }
1676           }
1677         }
1678       }
1679     }
1680 }
1681
1682 done_testing;