This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
ext/XS-APItest/t/utf8_warn_base.pl: Fix for EBCDIC
[perl5.git] / ext / XS-APItest / t / utf8_warn_base.pl
1 #!perl -w
2
3 # This is a base file to be used by various .t's in its directory
4 # It tests various malformed UTF-8 sequences and some code points that are
5 # "problematic", and verifies that the correct warnings/flags etc are
6 # generated when using them.  For the code points, it also takes the UTF-8 and
7 # perturbs it to be malformed in various ways, and tests that this gets
8 # appropriately detected.
9
10 use strict;
11 use Test::More;
12
13 BEGIN {
14     use_ok('XS::APItest');
15     require 'charset_tools.pl';
16     require './t/utf8_setup.pl';
17 };
18
19 $|=1;
20
21 use XS::APItest;
22
23 my @warnings_gotten;
24
25 use warnings 'utf8';
26 local $SIG{__WARN__} = sub { my @copy = @_;
27                              push @warnings_gotten, map { chomp; $_ } @copy;
28                            };
29
30 my $highest_non_extended_utf8_cp = (isASCII) ? 0x7FFFFFFF : 0x3FFFFFFF;
31 my $native_lowest_continuation_chr = I8_to_native(chr $::lowest_continuation);
32
33 # C5 is chosen as it is valid for both ASCII and EBCDIC platforms
34 my $known_start_byte = I8_to_native("\xC5");
35
36 sub requires_extended_utf8($) {
37
38     # Returns a boolean as to whether or not the code point parameter fits
39     # into 31 bits, subject to the convention that a negative code point
40     # stands for one that overflows the word size, so won't fit in 31 bits.
41
42     return shift > $highest_non_extended_utf8_cp;
43 }
44
45 sub is_extended_utf8($) {
46
47     # Returns a boolean as to whether or not the input UTF-8 sequence uses
48     # Perl extended UTF-8.
49
50     my $byte = substr(shift, 0, 1);
51     return ord $byte >= 0xFE if isASCII;
52     return $byte == I8_to_native("\xFF");
53 }
54
55 sub overflow_discern_len($) {
56
57     # Returns how many bytes are needed to tell if a non-overlong UTF-8
58     # sequence is for a code point that won't fit in the platform's word size.
59     # Only the length of the sequence representing a single code point is
60     # needed.
61
62     if (isASCII) {
63         return ($::is64bit) ? 3 : 1;
64
65         # Below is needed for code points above IV_MAX
66         #return ($::is64bit) ? 3 : ((shift == $::max_bytes)
67         #                           ? 1
68         #                           : 2);
69     }
70
71     return ($::is64bit) ? 2 : 8;
72 }
73
74 sub overlong_discern_len($) {
75
76     # Returns how many bytes are needed to tell if the input UTF-8 sequence
77     # for a code point is overlong
78
79     my $string = shift;
80     my $length = length $string;
81     my $byte = ord native_to_I8(substr($string, 0, 1));
82     if (isASCII) {
83         return ($byte >= 0xFE)
84                 ? ((! $::is64bit)
85                     ? 1
86                     : ($byte == 0xFF) ? 7 : 2)
87                 : (($length == 2) ? 1 : 2);
88         # Below is needed for code points above IV_MAX
89         #return ($length == $::max_bytes)
90         #          # This is constrained to 1 on 32-bit machines, as it
91         #          # overflows there
92         #        ? (($::is64bit) ? 7 : 1)
93         #        : (($length == 2) ? 1 : 2);
94     }
95
96     return ($length == $::max_bytes) ? 8 : (($length <= 3) ? 1 : 2);
97 }
98
99 my @tests;
100 {
101     no warnings qw(portable overflow);
102     @tests = (
103         # $testname,
104         # $bytes,                  UTF-8 string
105         # $allowed_uv,             code point $bytes evaluates to; -1 if
106         #                          overflows
107         # $needed_to_discern_len   optional, how long an initial substring do
108         #                          we need to tell that the string must be for
109         #                          a code point in the category it falls in,
110         #                          like being a surrogate; 0 indicates we need
111         #                          the whole string.  Some categories have a
112         #                          default that is used if this is omitted.
113         [ "orphan continuation byte malformation",
114             I8_to_native("$::I8c"),
115             0xFFFD,
116             1,
117         ],
118         [ "overlong malformation, lowest 2-byte",
119             (isASCII) ? "\xc0\x80" : I8_to_native("\xc0\xa0"),
120             0,   # NUL
121         ],
122         [ "overlong malformation, highest 2-byte",
123             (isASCII) ? "\xc1\xbf" : I8_to_native("\xc4\xbf"),
124             (isASCII) ? 0x7F : 0xFF,
125         ],
126         [ "overlong malformation, lowest 3-byte",
127             (isASCII) ? "\xe0\x80\x80" : I8_to_native("\xe0\xa0\xa0"),
128             0,   # NUL
129         ],
130         [ "overlong malformation, highest 3-byte",
131             (isASCII) ? "\xe0\x9f\xbf" : I8_to_native("\xe0\xbf\xbf"),
132             (isASCII) ? 0x7FF : 0x3FF,
133         ],
134         [ "lowest surrogate",
135             (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
136             0xD800,
137         ],
138         [ "a middle surrogate",
139             (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
140             0xD90D,
141         ],
142         [ "highest surrogate",
143             (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
144             0xDFFF,
145         ],
146         [ "first of 32 consecutive non-character code points",
147             (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
148             0xFDD0,
149         ],
150         [ "a mid non-character code point of the 32 consecutive ones",
151             (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
152             0xFDE0,
153         ],
154         [ "final of 32 consecutive non-character code points",
155             (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
156             0xFDEF,
157         ],
158         [ "non-character code point U+FFFE",
159             (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
160             0xFFFE,
161         ],
162         [ "non-character code point U+FFFF",
163             (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
164             0xFFFF,
165         ],
166         [ "overlong malformation, lowest 4-byte",
167             (isASCII) ? "\xf0\x80\x80\x80" : I8_to_native("\xf0\xa0\xa0\xa0"),
168             0,   # NUL
169         ],
170         [ "overlong malformation, highest 4-byte",
171             (isASCII) ? "\xf0\x8F\xbf\xbf" : I8_to_native("\xf0\xaf\xbf\xbf"),
172             (isASCII) ? 0xFFFF : 0x3FFF,
173         ],
174         [ "non-character code point U+1FFFE",
175             (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
176             0x1FFFE,
177         ],
178         [ "non-character code point U+1FFFF",
179             (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
180             0x1FFFF,
181         ],
182         [ "non-character code point U+2FFFE",
183             (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
184             0x2FFFE,
185         ],
186         [ "non-character code point U+2FFFF",
187             (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
188             0x2FFFF,
189         ],
190         [ "non-character code point U+3FFFE",
191             (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
192             0x3FFFE,
193         ],
194         [ "non-character code point U+3FFFF",
195             (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
196             0x3FFFF,
197         ],
198         [ "non-character code point U+4FFFE",
199             (isASCII)
200             ?               "\xf1\x8f\xbf\xbe"
201             : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
202             0x4FFFE,
203         ],
204         [ "non-character code point U+4FFFF",
205             (isASCII)
206             ?               "\xf1\x8f\xbf\xbf"
207             : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
208             0x4FFFF,
209         ],
210         [ "non-character code point U+5FFFE",
211             (isASCII)
212             ?              "\xf1\x9f\xbf\xbe"
213             : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
214             0x5FFFE,
215         ],
216         [ "non-character code point U+5FFFF",
217             (isASCII)
218             ?              "\xf1\x9f\xbf\xbf"
219             : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
220             0x5FFFF,
221         ],
222         [ "non-character code point U+6FFFE",
223             (isASCII)
224             ?              "\xf1\xaf\xbf\xbe"
225             : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
226             0x6FFFE,
227         ],
228         [ "non-character code point U+6FFFF",
229             (isASCII)
230             ?              "\xf1\xaf\xbf\xbf"
231             : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
232             0x6FFFF,
233         ],
234         [ "non-character code point U+7FFFE",
235             (isASCII)
236             ?              "\xf1\xbf\xbf\xbe"
237             : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
238             0x7FFFE,
239         ],
240         [ "non-character code point U+7FFFF",
241             (isASCII)
242             ?              "\xf1\xbf\xbf\xbf"
243             : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
244             0x7FFFF,
245         ],
246         [ "non-character code point U+8FFFE",
247             (isASCII)
248             ?              "\xf2\x8f\xbf\xbe"
249             : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
250             0x8FFFE,
251         ],
252         [ "non-character code point U+8FFFF",
253             (isASCII)
254             ?              "\xf2\x8f\xbf\xbf"
255             : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
256             0x8FFFF,
257         ],
258         [ "non-character code point U+9FFFE",
259             (isASCII)
260             ?              "\xf2\x9f\xbf\xbe"
261             : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
262             0x9FFFE,
263         ],
264         [ "non-character code point U+9FFFF",
265             (isASCII)
266             ?              "\xf2\x9f\xbf\xbf"
267             : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
268             0x9FFFF,
269         ],
270         [ "non-character code point U+AFFFE",
271             (isASCII)
272             ?              "\xf2\xaf\xbf\xbe"
273             : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
274             0xAFFFE,
275         ],
276         [ "non-character code point U+AFFFF",
277             (isASCII)
278             ?              "\xf2\xaf\xbf\xbf"
279             : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
280             0xAFFFF,
281         ],
282         [ "non-character code point U+BFFFE",
283             (isASCII)
284             ?              "\xf2\xbf\xbf\xbe"
285             : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
286             0xBFFFE,
287         ],
288         [ "non-character code point U+BFFFF",
289             (isASCII)
290             ?              "\xf2\xbf\xbf\xbf"
291             : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
292             0xBFFFF,
293         ],
294         [ "non-character code point U+CFFFE",
295             (isASCII)
296             ?              "\xf3\x8f\xbf\xbe"
297             : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
298             0xCFFFE,
299         ],
300         [ "non-character code point U+CFFFF",
301             (isASCII)
302             ?              "\xf3\x8f\xbf\xbf"
303             : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
304             0xCFFFF,
305         ],
306         [ "non-character code point U+DFFFE",
307             (isASCII)
308             ?              "\xf3\x9f\xbf\xbe"
309             : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
310             0xDFFFE,
311         ],
312         [ "non-character code point U+DFFFF",
313             (isASCII)
314             ?              "\xf3\x9f\xbf\xbf"
315             : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
316             0xDFFFF,
317         ],
318         [ "non-character code point U+EFFFE",
319             (isASCII)
320             ?              "\xf3\xaf\xbf\xbe"
321             : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
322             0xEFFFE,
323         ],
324         [ "non-character code point U+EFFFF",
325             (isASCII)
326             ?              "\xf3\xaf\xbf\xbf"
327             : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
328             0xEFFFF,
329         ],
330         [ "non-character code point U+FFFFE",
331             (isASCII)
332             ?              "\xf3\xbf\xbf\xbe"
333             : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
334             0xFFFFE,
335         ],
336         [ "non-character code point U+FFFFF",
337             (isASCII)
338             ?              "\xf3\xbf\xbf\xbf"
339             : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
340             0xFFFFF,
341         ],
342         [ "non-character code point U+10FFFE",
343             (isASCII)
344             ?              "\xf4\x8f\xbf\xbe"
345             : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
346             0x10FFFE,
347         ],
348         [ "non-character code point U+10FFFF",
349             (isASCII)
350             ?              "\xf4\x8f\xbf\xbf"
351             : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
352             0x10FFFF,
353         ],
354         [ "first non_unicode",
355             (isASCII)
356             ?              "\xf4\x90\x80\x80"
357             : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
358             0x110000,
359             2,
360         ],
361         [ "non_unicode whose first byte tells that",
362             (isASCII)
363             ?              "\xf5\x80\x80\x80"
364             : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
365             (isASCII) ? 0x140000 : 0x200000,
366             1,
367         ],
368         [ "overlong malformation, lowest 5-byte",
369             (isASCII)
370             ?              "\xf8\x80\x80\x80\x80"
371             : I8_to_native("\xf8\xa0\xa0\xa0\xa0"),
372             0,   # NUL
373         ],
374         [ "overlong malformation, highest 5-byte",
375             (isASCII)
376             ?              "\xf8\x87\xbf\xbf\xbf"
377             : I8_to_native("\xf8\xa7\xbf\xbf\xbf"),
378             (isASCII) ? 0x1FFFFF : 0x3FFFF,
379         ],
380         [ "overlong malformation, lowest 6-byte",
381             (isASCII)
382             ?              "\xfc\x80\x80\x80\x80\x80"
383             : I8_to_native("\xfc\xa0\xa0\xa0\xa0\xa0"),
384             0,   # NUL
385         ],
386         [ "overlong malformation, highest 6-byte",
387             (isASCII)
388             ?              "\xfc\x83\xbf\xbf\xbf\xbf"
389             : I8_to_native("\xfc\xa3\xbf\xbf\xbf\xbf"),
390             (isASCII) ? 0x3FFFFFF : 0x3FFFFF,
391         ],
392         [ "overlong malformation, lowest 7-byte",
393             (isASCII)
394             ?              "\xfe\x80\x80\x80\x80\x80\x80"
395             : I8_to_native("\xfe\xa0\xa0\xa0\xa0\xa0\xa0"),
396             0,   # NUL
397         ],
398         [ "overlong malformation, highest 7-byte",
399             (isASCII)
400             ?              "\xfe\x81\xbf\xbf\xbf\xbf\xbf"
401             : I8_to_native("\xfe\xa1\xbf\xbf\xbf\xbf\xbf"),
402             (isASCII) ? 0x7FFFFFFF : 0x3FFFFFF,
403         ],
404         [ "highest 31 bit code point",
405             (isASCII)
406             ?  "\xfd\xbf\xbf\xbf\xbf\xbf"
407             : I8_to_native(
408                "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf"),
409             0x7FFFFFFF,
410             1,
411         ],
412         [ "lowest 32 bit code point",
413             (isASCII)
414             ?  "\xfe\x82\x80\x80\x80\x80\x80"
415             : I8_to_native(
416                 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
417             ($::is64bit) ? 0x80000000 : -1,   # Overflows on 32-bit systems
418             1,
419         ],
420         # Used when UV_MAX is allowed as a code point
421         #[ "highest 32 bit code point",
422         #    (isASCII)
423         #    ?  "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
424         #    : I8_to_native(
425         #       "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
426         #    0xFFFFFFFF,
427         #],
428         #[ "Lowest 33 bit code point",
429         #    (isASCII)
430         #    ?  "\xfe\x84\x80\x80\x80\x80\x80"
431         #    : I8_to_native(
432         #        "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
433         #    ($::is64bit) ? 0x100000000 : 0x0,   # Overflows on 32-bit systems
434         #],
435     );
436
437     if (! $::is64bit) {
438         if (isASCII) {
439             push @tests,
440                 [ "overlong malformation, but naively looks like overflow",
441                     "\xff\x80\x80\x80\x80\x80\x80\x81\xbf\xbf\xbf\xbf\xbf",
442                     0x7FFFFFFF,
443                 ],
444                 # Used when above IV_MAX are allowed.
445                 #[ "overlong malformation, but naively looks like overflow",
446                 #    "\xff\x80\x80\x80\x80\x80\x80\x83\xbf\xbf\xbf\xbf\xbf",
447                 #    0xFFFFFFFF,
448                 #],
449                 [ "overflow that old algorithm failed to detect",
450                     "\xfe\x86\x80\x80\x80\x80\x80",
451                     -1,
452                 ];
453         }
454     }
455
456     push @tests,
457         [ "overlong malformation, lowest max-byte",
458             (isASCII)
459              ?      "\xff\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
460              : I8_to_native(
461                     "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
462             0,   # NUL
463         ],
464         [ "overlong malformation, highest max-byte",
465             (isASCII)    # 2**36-1 on ASCII; 2**30-1 on EBCDIC
466              ?      "\xff\x80\x80\x80\x80\x80\x80\xbf\xbf\xbf\xbf\xbf\xbf"
467              : I8_to_native(
468                     "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xbf\xbf\xbf\xbf\xbf\xbf"),
469             (isASCII) ? (($::is64bit) ? 0xFFFFFFFFF : -1) : 0x3FFFFFFF,
470         ];
471
472     if (isASCII) {
473         push @tests,
474             [ "Lowest code point requiring 13 bytes to represent", # 2**36
475                 "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
476                 ($::is64bit) ? 0x1000000000 : -1,    # overflows on 32bit
477             ],
478     };
479
480     if ($::is64bit) {
481         push @tests,
482             [ "highest 63 bit code point",
483               (isASCII)
484               ? "\xff\x80\x87\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
485               : I8_to_native(
486                 "\xff\xa7\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"),
487               0x7FFFFFFFFFFFFFFF,
488               (isASCII) ? 1 : 2,
489             ],
490             [ "first 64 bit code point",
491               (isASCII)
492               ? "\xff\x80\x88\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
493               : I8_to_native(
494                 "\xff\xa8\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
495               -1,
496             ];
497             # Used when UV_MAX is allowed as a code point
498             #[ "highest 64 bit code point",
499             #  (isASCII)
500             #  ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
501             #  : I8_to_native(
502             #    "\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"),
503             #  0xFFFFFFFFFFFFFFFF,
504             #  (isASCII) ? 1 : 2,
505             #],
506             #[ "first 65 bit code point",
507             #  (isASCII)
508             #  ? "\xff\x80\x9f\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
509             #  : I8_to_native(
510             #    "\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
511             #  0,
512             #];
513         if (isASCII) {
514             push @tests,
515                 [ "overflow that old algorithm failed to detect",
516                     "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
517                     -1,
518                 ];
519         }
520         else {
521             push @tests,    # These could falsely show wrongly in a naive
522                             # implementation
523                 [ "requires at least 32 bits",
524                     I8_to_native(
525                     "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
526                     0x800000000,
527                       40000000
528                 ],
529                 [ "requires at least 32 bits",
530                     I8_to_native(
531                     "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
532                     0x10000000000,
533                 ],
534                 [ "requires at least 32 bits",
535                     I8_to_native(
536                     "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
537                     0x200000000000,
538                 ],
539                 [ "requires at least 32 bits",
540                     I8_to_native(
541                     "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
542                     0x4000000000000,
543                 ],
544                 [ "requires at least 32 bits",
545                     I8_to_native(
546                     "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
547                     0x80000000000000,
548                 ],
549                 [ "requires at least 32 bits",
550                     I8_to_native(
551                     "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
552                     0x1000000000000000,
553                 ];
554         }
555     }
556 }
557
558 sub flags_to_text($$)
559 {
560     my ($flags, $flags_to_text_ref) = @_;
561
562     # Returns a string containing a mnemonic representation of the bits that
563     # are set in the $flags.  These are assumed to be flag bits.  The return
564     # looks like "FOO|BAR|BAZ".  The second parameter is a reference to an
565     # array that gives the textual representation of all the possible flags.
566     # Element 0 is the text for the bit 0 flag; element 1 for bit 1; ....  If
567     # no bits at all are set the string "0" is returned;
568
569     my @flag_text;
570     my $shift = 0;
571
572     return "0" if $flags == 0;
573
574     while ($flags) {
575         #diag sprintf "%x", $flags;
576         if ($flags & 1) {
577             push @flag_text, $flags_to_text_ref->[$shift];
578         }
579         $shift++;
580         $flags >>= 1;
581     }
582
583     return join "|", @flag_text;
584 }
585
586 # Possible flag returns from utf8n_to_uvchr_error().  These should have G_,
587 # instead of A_, D_, but the prefixes will be used in a a later commit, so
588 # minimize churn by having them here.
589 my @utf8n_flags_to_text =  ( qw(
590         A_EMPTY
591         A_CONTINUATION
592         A_NON_CONTINUATION
593         A_SHORT
594         A_LONG
595         A_LONG_AND_ITS_VALUE
596         PLACEHOLDER
597         A_OVERFLOW
598         D_SURROGATE
599         W_SURROGATE
600         D_NONCHAR
601         W_NONCHAR
602         D_SUPER
603         W_SUPER
604         D_PERL_EXTENDED
605         W_PERL_EXTENDED
606         CHECK_ONLY
607         NO_CONFIDENCE_IN_CURLEN_
608     ) );
609
610 sub utf8n_display_call($)
611 {
612     # Converts an eval string that calls test_utf8n_to_uvchr into a more human
613     # readable form, and returns it.  Doesn't work if the byte string contains
614     # an apostrophe.  The return will look something like:
615     #   test_utf8n_to_uvchr_error('$bytes', $length, $flags)
616     #diag $_[0];
617
618     $_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x;
619     my $text1 = $1;     # Everything before the byte string
620     my $bytes = $2;
621     my $text2 = $3;     # Includes the length
622     my $flags = $4;
623
624     return $text1
625          . display_bytes($bytes)
626          . $text2
627          . flags_to_text($flags, \@utf8n_flags_to_text)
628          . ')';
629 }
630
631 my @uvchr_flags_to_text =  ( qw(
632         W_SURROGATE
633         W_NONCHAR
634         W_SUPER
635         W_PERL_EXTENDED
636         D_SURROGATE
637         D_NONCHAR
638         D_SUPER
639         D_PERL_EXTENDED
640 ) );
641
642 sub uvchr_display_call($)
643 {
644     # Converts an eval string that calls test_uvchr_to_utf8 into a more human
645     # readable form, and returns it.  The return will look something like:
646     #   test_uvchr_to_utf8n_flags($uv, $flags)
647     #diag $_[0];
648
649
650     $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x;
651     my $text = $1;
652     my $cp = sprintf "%X", $2;
653     my $flags = $3;
654
655     return "${text}0x$cp, " . flags_to_text($flags, \@uvchr_flags_to_text) . ')';
656 }
657
658 sub do_warnings_test(@)
659 {
660     my @expected_warnings = @_;
661
662     # Compares the input expected warnings array with @warnings_gotten,
663     # generating a pass for each found, removing it from @warnings_gotten.
664     # Any discrepancies generate test failures.  Returns TRUE if no
665     # discrepcancies; otherwise FALSE.
666
667     my $succeeded = 1;
668
669     if (@expected_warnings == 0) {
670         if (! is(@warnings_gotten, 0, "    Expected and got no warnings")) {
671             output_warnings(@warnings_gotten);
672             $succeeded = 0;
673         }
674         return $succeeded;
675     }
676
677     # Check that we got all the expected warnings,
678     # removing each one found
679   WARNING:
680     foreach my $expected (@expected_warnings) {
681         foreach (my $i = 0; $i < @warnings_gotten; $i++) {
682             if ($warnings_gotten[$i] =~ $expected) {
683                 pass("    Expected and got warning: "
684                     . " $warnings_gotten[$i]");
685                 splice @warnings_gotten, $i, 1;
686                 next WARNING;
687             }
688         }
689         fail("    Expected a warning that matches "
690             . $expected . " but didn't get it");
691         $succeeded = 0;
692     }
693
694     if (! is(@warnings_gotten, 0, "    Got no unexpected warnings")) {
695         output_warnings(@warnings_gotten);
696         $succeeded = 0;
697     }
698
699     return $succeeded;
700 }
701
702 my $min_cont = $::lowest_continuation;
703 my $continuation_shift = (isASCII) ? 6 : 5;
704 my $continuation_mask = (1 << $continuation_shift) - 1;
705
706 sub isUTF8_CHAR($$) {   # Uses first principals to determine if this I8 input
707                         # is legal.  (Doesn't work if overflows)
708     my ($native, $length) = @_;
709     my $i8 = native_to_I8($native);
710
711     # Uses first principals to calculate if $i8 is legal
712
713     return 0 if $length <= 0;
714
715     my $first = ord substr($i8, 0, 1);
716
717     # Invariant
718     return 1 if $length == 1 && $first < $min_cont;
719
720     return 0 if $first < 0xC0;  # Starts with continuation
721
722     # Calculate the number of leading 1 bits
723     my $utf8skip = 0;
724     my $bits = $first;
725     do {
726         $utf8skip++;
727         $bits = ($bits << 1) & 0xFF;
728     } while ($bits & 0x80);
729
730     return 0 if $utf8skip != $length;
731
732     # Accumulate the $code point.  The remaining bits in the start byte count
733     # towards it
734     my $cp = $bits >> $utf8skip;
735
736     for my $i (1 .. $length - 1) {
737         my $ord = ord substr($i8, $i, 1);
738
739         # Wrong if not a continuation
740         return 0 if $ord < $min_cont || $ord >= 0xC0;
741
742         $cp = ($cp << $continuation_shift)
743             | ($ord & $continuation_mask);
744     }
745
746     # If the calculated value can be expressed in fewer bytes than were passed
747     # in, is an illegal overlong.  XXX if 'chr' is not working properly, this
748     # may not be right
749     my $chr = uni_to_native(chr $cp);
750     utf8::upgrade($chr);
751
752     use bytes;
753     return 0 if length $chr < $length;
754
755     # Also, its possible on EBCDIC platforms that have more illegal start
756     # bytes than ASCII ones (like C3, C4) for something to have the same
757     # length but still be overlong.  We make sure the first byte isn't smaller
758     # than the first byte of the real representation.
759     return 0 if substr($native, 0, 1) lt substr($chr, 0, 1);
760
761     return 1;
762 }
763
764 sub start_mark($) {
765     my $len = shift;
766     return 0xFF if $len >  7;
767     return (0xFF & (0xFE << (7 - $len)));
768 }
769
770 sub start_mask($) {
771     my $len = shift;
772     return 0 if $len >  7;
773     return 0x1F >> ($len - 2);
774 }
775
776 # This test is split into this number of files.
777 my $num_test_files = $ENV{TEST_JOBS} || 1;
778 $num_test_files = 10 if $num_test_files > 10;
779
780 # We only really need to test utf8n_to_uvchr_msgs() once with this flag.
781 my $tested_CHECK_ONLY = 0;
782
783 my $test_count = -1;
784
785 # By setting this environment variable to this particular value, we test
786 # essentially all combinations of potential UTF-8, so that can get a
787 # comprehensive test of the decoding routine.  This test assumes the routine
788 # that does the translation from code point to UTF-8 is working.  An assert
789 # can be used in the routine to make sure that the dfa is working precisely
790 # correctly, and any flaws in it aren't being masked by the remainder of the
791 # function.
792 if ($::TEST_CHUNK == 0
793 && $ENV{PERL_DEBUG_FULL_TEST}
794 && $ENV{PERL_DEBUG_FULL_TEST} == 97)
795 {
796     # We construct UTF-8 (I8 on EBCDIC platforms converted later to native)
797
798     my $min_cont_mask = $min_cont | 0xF;
799     my @bytes = (   0,  # Placeholder to signify to use an empty string ""
800                  0x41,  # We assume that all the invariant characters are
801                         # properly in the same class, so this is an exemplar
802                         # character
803                 $min_cont .. 0xFF   # But test every non-invariant individually
804                 );
805     my $mark = $min_cont;
806     my $mask = (1 << $continuation_shift) - 1;
807     for my $byte1 (@bytes) {
808         for my $byte2 (@bytes) {
809             last if $byte2 && ! $byte1;      # Don't test empty preceding byte
810
811             last if $byte2 && $byte1 < 0xC0; # No need to test more than a
812                                              # single byte unless start byte
813                                              # indicates those.
814
815             for my $byte3 (@bytes) {
816                 last if $byte3 && ! $byte2;
817                 last if $byte3 && $byte1 < 0xE0;    # Only test 3 bytes for
818                                                     # 3-byte start byte
819
820                 # If the preceding byte is a start byte, it should fail, and
821                 # there is no need to test illegal bytes that follow.
822                 # Instead, limit ourselves to just a few legal bytes that
823                 # could follow.  This cuts down tremendously on the number of
824                 # tests executed.
825                 next if $byte2 >= 0xC0
826                      && $byte3 >= $min_cont
827                      && ($byte3 & $min_cont_mask) != $min_cont;
828
829                 for my $byte4 (@bytes) {
830                     last if $byte4 && ! $byte3;
831                     last if $byte4 && $byte1 < 0xF0;  # Only test 4 bytes for
832                                                       # 4 byte strings
833
834                     # Like for byte 3, we limit things that come after a
835                     # mispositioned start-byte to just a few things that
836                     # otherwise would be legal
837                     next if ($byte2 >= 0xC0 || $byte3 >= 0xC0)
838                           && $byte4 >= $min_cont
839                           && ($byte4 & $min_cont_mask) != $min_cont;
840
841                     for my $byte5 (@bytes) {
842                         last if $byte5 && ! $byte4;
843                         last if $byte5 && $byte1 < 0xF8;  # Only test 5 bytes for
844                                                           # 5 byte strings
845
846                         # Like for byte 4, we limit things that come after a
847                         # mispositioned start-byte to just a few things that
848                         # otherwise would be legal
849                         next if (   $byte2 >= 0xC0
850                                  || $byte3 >= 0xC0
851                                  || $byte4 >= 0xC0)
852                               && $byte4 >= $min_cont
853                               && ($byte4 & $min_cont_mask) != $min_cont;
854
855                         my $string = "";
856                         $string .= chr $byte1 if $byte1;
857                         $string .= chr $byte2 if $byte2;
858                         $string .= chr $byte3 if $byte3;
859                         $string .= chr $byte4 if $byte4;
860                         $string .= chr $byte5 if $byte5;
861
862                         my $length = length $string;
863                         next unless $length;
864                         last if $byte1 >= ((isASCII) ? 0xF6 : 0xFA);
865
866                         my $native = I8_to_native($string);
867                         my $is_valid = isUTF8_CHAR($native, $length);
868                         my $got_valid = test_isUTF8_CHAR($native, $length);
869                         my $got_strict
870                                     = test_isSTRICT_UTF8_CHAR($native, $length);
871                         my $got_C9
872                                  = test_isC9_STRICT_UTF8_CHAR($native, $length);
873                         my $ret = test_utf8n_to_uvchr_msgs($native, $length,
874                                             $::UTF8_WARN_ILLEGAL_INTERCHANGE);
875                         my $is_strict = $is_valid;
876                         my $is_C9 = $is_valid;
877
878                         if ($is_valid) {
879
880                             # Here, is legal UTF-8.  Verify that it returned
881                             # the correct code point, and if so, that it
882                             # correctly classifies the result.
883                             my $cp = $ret->[0];
884
885                             my $should_be_string;
886                             if ($length == 1) {
887                                 $should_be_string = native_to_I8(chr $cp);
888                             }
889                             else {
890
891                                 # Starting with the code point, use first
892                                 # principals to find the equivalent I8 string
893                                 my @bytes;
894                                 my $uv = ord native_to_uni(chr $cp);
895                                 for (my $i = $length - 1; $i > 0; $i--) {
896                                     $bytes[$i] = chr (($uv & $mask) | $mark);
897                                     $uv >>= $continuation_shift;
898                                 }
899                                 $bytes[0] = chr ($uv & start_mask($length)
900                                             | start_mark($length));
901                                 $should_be_string = join "", @bytes;
902                             }
903
904                             # If the original string and the inverse are the
905                             # same, it worked.
906                             my $test_name = "utf8n_to_uvchr_msgs("
907                                           . display_bytes($native)
908                                           . ") yields "
909                                           . sprintf ("0x%x", $cp)
910                                           . "; does its I8 eq original";
911                             if (is($should_be_string, $string, $test_name)) {
912                                 my $is_surrogate = $cp >= 0xD800
913                                                 && $cp <= 0xDFFF;
914                                 my $got_surrogate
915                                     = ($ret->[2] & $::UTF8_GOT_SURROGATE) != 0;
916                                 $is_strict = 0 if $is_surrogate;
917                                 $is_C9 = 0 if $is_surrogate;
918
919                                 my $is_super = $cp > 0x10FFFF;
920                                 my $got_super
921                                         = ($ret->[2] & $::UTF8_GOT_SUPER) != 0;
922                                 $is_strict = 0 if $is_super;
923                                 $is_C9 = 0 if $is_super;
924
925                                 my $is_nonchar = ! $is_super
926                                     && (   ($cp & 0xFFFE) == 0xFFFE
927                                         || ($cp >= 0xFDD0 && $cp <= 0xFDEF));
928                                 my $got_nonchar
929                                       = ($ret->[2] & $::UTF8_GOT_NONCHAR) != 0;
930                                 $is_strict = 0 if $is_nonchar;
931
932                                 is($got_surrogate, $is_surrogate,
933                                     "    And correctly flagged it as"
934                                   . ((! $is_surrogate) ? " not" : "")
935                                   . " being a surrogate");
936                                 is($got_super, $is_super,
937                                     "    And correctly flagged it as"
938                                   . ((! $is_super) ? " not" : "")
939                                   . " being above Unicode");
940                                 is($got_nonchar, $is_nonchar,
941                                     "    And correctly flagged it as"
942                                   . ((! $is_nonchar) ? " not" : "")
943                                   . " being a non-char");
944                             }
945
946                             # This is how we exit the loop normally if things
947                             # are working.  The fail-safe code above is used
948                             # when they aren't.
949                             goto done if $cp > 0x140001;
950                         }
951                         else {
952                             is($ret->[0], 0, "utf8n_to_uvchr_msgs("
953                                             . display_bytes($native)
954                                             . ") correctly returns error");
955                             if (! ($ret->[2] & ($::UTF8_GOT_SHORT
956                                                |$::UTF8_GOT_NON_CONTINUATION
957                                                |$::UTF8_GOT_LONG)))
958                             {
959                                 is($ret->[2] & ( $::UTF8_GOT_NONCHAR
960                                                 |$::UTF8_GOT_SURROGATE
961                                                 |$::UTF8_GOT_SUPER), 0,
962                                 "    And isn't a surrogate, non-char, nor"
963                                 . " above Unicode");
964                              }
965                         }
966
967                         is($got_valid == 0, $is_valid == 0,
968                            "    And isUTF8_CHAR() correctly returns "
969                          . (($got_valid == 0) ? "0" : "non-zero"));
970                         is($got_strict == 0, $is_strict == 0,
971                            "    And isSTRICT_UTF8_CHAR() correctly returns "
972                          . (($got_strict == 0) ? "0" : "non-zero"));
973                         is($got_C9 == 0, $is_C9 == 0,
974                            "    And isC9_UTF8_CHAR() correctly returns "
975                          . (($got_C9 == 0) ? "0" : "non-zero"));
976                     }
977                 }
978             }
979         }
980     }
981   done:
982 }
983
984 foreach my $test (@tests) {
985   $test_count++;
986   next if $test_count % $num_test_files != $::TEST_CHUNK;
987
988   my ($testname, $bytes, $allowed_uv, $needed_to_discern_len) = @$test;
989
990   my $length = length $bytes;
991   my $initially_overlong = $testname =~ /overlong/;
992   my $initially_orphan   = $testname =~ /orphan/;
993   my $will_overflow = $allowed_uv < 0;
994
995   my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv);
996   my $display_bytes = display_bytes($bytes);
997
998   my $controlling_warning_category;
999   my $utf8n_flag_to_warn;
1000   my $utf8n_flag_to_disallow;
1001   my $uvchr_flag_to_warn;
1002   my $uvchr_flag_to_disallow;
1003
1004   # We want to test that the independent flags are actually independent.
1005   # For example, that a surrogate doesn't trigger a non-character warning,
1006   # and conversely, turning off an above-Unicode flag doesn't suppress a
1007   # surrogate warning.  Earlier versions of this file used nested loops to
1008   # test all possible combinations.  But that creates lots of tests, making
1009   # this run too long.  What is now done instead is to use the complement of
1010   # the category we are testing to greatly reduce the combinatorial
1011   # explosion.  For example, if we have a surrogate and we aren't expecting
1012   # a warning about it, we set all the flags for non-surrogates to raise
1013   # warnings.  If one shows up, it indicates the flags aren't independent.
1014   my $utf8n_flag_to_warn_complement;
1015   my $utf8n_flag_to_disallow_complement;
1016   my $uvchr_flag_to_warn_complement;
1017   my $uvchr_flag_to_disallow_complement;
1018
1019   # Many of the code points being tested are middling in that if code point
1020   # edge cases work, these are very likely to as well.  Because this test
1021   # file takes a while to execute, we skip testing the edge effects of code
1022   # points deemed middling, while testing their basics and continuing to
1023   # fully test the non-middling code points.
1024   my $skip_most_tests = 0;
1025
1026   my $cp_message_qr;      # Pattern that matches the message raised when
1027                           # that message contains the problematic code
1028                           # point.  The message is the same (currently) both
1029                           # when going from/to utf8.
1030   my $non_cp_trailing_text;   # The suffix text when the message doesn't
1031                               # contain a code point.  (This is a result of
1032                               # some sort of malformation that means we
1033                               # can't get an exact code poin
1034   my $extended_cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
1035                       \Q requires a Perl extension, and so is not\E
1036                       \Q portable\E/x;
1037   my $extended_non_cp_trailing_text
1038                       = "is a Perl extension, and so is not portable";
1039
1040   # What bytes should have been used to specify a code point that has been
1041   # specified as an overlong.
1042   my $correct_bytes_for_overlong;
1043
1044   # Is this test malformed from the beginning?  If so, we know to generally
1045   # expect that the tests will show it isn't valid.
1046   my $initially_malformed = 0;
1047
1048   if ($initially_overlong || $initially_orphan) {
1049       $non_cp_trailing_text = "if you see this, there is an error";
1050       $cp_message_qr = qr/\Q$non_cp_trailing_text\E/;
1051       $initially_malformed = 1;
1052       $utf8n_flag_to_warn     = 0;
1053       $utf8n_flag_to_disallow = 0;
1054
1055       $utf8n_flag_to_warn_complement =     $::UTF8_WARN_SURROGATE;
1056       $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE;
1057       if (! $will_overflow && $allowed_uv <= 0x10FFFF) {
1058           $utf8n_flag_to_warn_complement     |= $::UTF8_WARN_SUPER;
1059           $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_SUPER;
1060           if (($allowed_uv & 0xFFFF) != 0xFFFF) {
1061               $utf8n_flag_to_warn_complement      |= $::UTF8_WARN_NONCHAR;
1062               $utf8n_flag_to_disallow_complement  |= $::UTF8_DISALLOW_NONCHAR;
1063           }
1064       }
1065       if (! is_extended_utf8($bytes)) {
1066           $utf8n_flag_to_warn_complement |= $::UTF8_WARN_PERL_EXTENDED;
1067           $utf8n_flag_to_disallow_complement  |= $::UTF8_DISALLOW_PERL_EXTENDED;
1068       }
1069
1070       $controlling_warning_category = 'utf8';
1071
1072       if ($initially_overlong) {
1073           if (! defined $needed_to_discern_len) {
1074               $needed_to_discern_len = overlong_discern_len($bytes);
1075           }
1076           $correct_bytes_for_overlong = display_bytes_no_quotes(chr $allowed_uv);
1077       }
1078   }
1079   elsif($will_overflow || $allowed_uv > 0x10FFFF) {
1080
1081       # Set the SUPER flags; later, we test for PERL_EXTENDED as well.
1082       $utf8n_flag_to_warn     = $::UTF8_WARN_SUPER;
1083       $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER;
1084       $uvchr_flag_to_warn     = $::UNICODE_WARN_SUPER;
1085       $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;;
1086
1087       # Below, we add the flags for non-perl_extended to the code points
1088       # that don't fit that category.  Special tests are done for this
1089       # category in the inner loop.
1090       $utf8n_flag_to_warn_complement     = $::UTF8_WARN_NONCHAR
1091                                           |$::UTF8_WARN_SURROGATE;
1092       $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
1093                                           |$::UTF8_DISALLOW_SURROGATE;
1094       $uvchr_flag_to_warn_complement     = $::UNICODE_WARN_NONCHAR
1095                                           |$::UNICODE_WARN_SURROGATE;
1096       $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
1097                                           |$::UNICODE_DISALLOW_SURROGATE;
1098       $controlling_warning_category = 'non_unicode';
1099
1100       if ($will_overflow) {  # This is realy a malformation
1101           $non_cp_trailing_text = "if you see this, there is an error";
1102           $cp_message_qr = qr/\Q$non_cp_trailing_text\E/;
1103           $initially_malformed = 1;
1104           if (! defined $needed_to_discern_len) {
1105               $needed_to_discern_len = overflow_discern_len($length);
1106           }
1107       }
1108       elsif (requires_extended_utf8($allowed_uv)) {
1109           $cp_message_qr = $extended_cp_message_qr;
1110           $non_cp_trailing_text = $extended_non_cp_trailing_text;
1111           $needed_to_discern_len = 1 unless defined $needed_to_discern_len;
1112       }
1113       else {
1114           $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
1115                               \Q may not be portable\E/x;
1116           $non_cp_trailing_text = "is for a non-Unicode code point, may not"
1117                               . " be portable";
1118           $utf8n_flag_to_warn_complement     |= $::UTF8_WARN_PERL_EXTENDED;
1119           $utf8n_flag_to_disallow_complement
1120                                           |= $::UTF8_DISALLOW_PERL_EXTENDED;
1121           $uvchr_flag_to_warn_complement |= $::UNICODE_WARN_PERL_EXTENDED;
1122           $uvchr_flag_to_disallow_complement
1123                                       |= $::UNICODE_DISALLOW_PERL_EXTENDED;
1124       }
1125   }
1126   elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
1127       $cp_message_qr = qr/UTF-16 surrogate U\+$uv_string/;
1128       $non_cp_trailing_text = "is for a surrogate";
1129       $needed_to_discern_len = 2 unless defined $needed_to_discern_len;
1130       $skip_most_tests = 1 if $allowed_uv > 0xD800 && $allowed_uv < 0xDFFF;
1131
1132       $utf8n_flag_to_warn     = $::UTF8_WARN_SURROGATE;
1133       $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SURROGATE;
1134       $uvchr_flag_to_warn     = $::UNICODE_WARN_SURROGATE;
1135       $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SURROGATE;;
1136
1137       $utf8n_flag_to_warn_complement     = $::UTF8_WARN_NONCHAR
1138                                           |$::UTF8_WARN_SUPER
1139                                           |$::UTF8_WARN_PERL_EXTENDED;
1140       $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
1141                                           |$::UTF8_DISALLOW_SUPER
1142                                           |$::UTF8_DISALLOW_PERL_EXTENDED;
1143       $uvchr_flag_to_warn_complement     = $::UNICODE_WARN_NONCHAR
1144                                           |$::UNICODE_WARN_SUPER
1145                                           |$::UNICODE_WARN_PERL_EXTENDED;
1146       $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
1147                                           |$::UNICODE_DISALLOW_SUPER
1148                                           |$::UNICODE_DISALLOW_PERL_EXTENDED;
1149       $controlling_warning_category = 'surrogate';
1150   }
1151   elsif (   ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
1152           || ($allowed_uv & 0xFFFE) == 0xFFFE)
1153   {
1154       $cp_message_qr = qr/\QUnicode non-character U+$uv_string\E
1155                           \Q is not recommended for open interchange\E/x;
1156       $non_cp_trailing_text = "if you see this, there is an error";
1157       $needed_to_discern_len = $length unless defined $needed_to_discern_len;
1158       if (   ($allowed_uv > 0xFDD0 && $allowed_uv < 0xFDEF)
1159           || ($allowed_uv > 0xFFFF && $allowed_uv < 0x10FFFE))
1160       {
1161           $skip_most_tests = 1;
1162       }
1163
1164       $utf8n_flag_to_warn     = $::UTF8_WARN_NONCHAR;
1165       $utf8n_flag_to_disallow = $::UTF8_DISALLOW_NONCHAR;
1166       $uvchr_flag_to_warn     = $::UNICODE_WARN_NONCHAR;
1167       $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_NONCHAR;;
1168
1169       $utf8n_flag_to_warn_complement     = $::UTF8_WARN_SURROGATE
1170                                           |$::UTF8_WARN_SUPER
1171                                           |$::UTF8_WARN_PERL_EXTENDED;
1172       $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE
1173                                           |$::UTF8_DISALLOW_SUPER
1174                                           |$::UTF8_DISALLOW_PERL_EXTENDED;
1175       $uvchr_flag_to_warn_complement     = $::UNICODE_WARN_SURROGATE
1176                                           |$::UNICODE_WARN_SUPER
1177                                           |$::UNICODE_WARN_PERL_EXTENDED;
1178       $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_SURROGATE
1179                                           |$::UNICODE_DISALLOW_SUPER
1180                                           |$::UNICODE_DISALLOW_PERL_EXTENDED;
1181
1182       $controlling_warning_category = 'nonchar';
1183   }
1184   else {
1185       die "Can't figure out what type of warning to test for $testname"
1186   }
1187
1188   die 'Didn\'t set $needed_to_discern_len for ' . $testname
1189                                       unless defined $needed_to_discern_len;
1190
1191   # We try various combinations of malformations that can occur
1192   foreach my $short (0, 1) {
1193     next if $skip_most_tests && $short;
1194     foreach my $unexpected_noncont (0, 1) {
1195       next if $skip_most_tests && $unexpected_noncont;
1196       foreach my $overlong (0, 1) {
1197         next if $overlong && $skip_most_tests;
1198         next if $initially_overlong && ! $overlong;
1199
1200         # If we're creating an overlong, it can't be longer than the
1201         # maximum length, so skip if we're already at that length.
1202         next if   (! $initially_overlong && $overlong)
1203                   &&  $length >= $::max_bytes;
1204
1205         my $this_cp_message_qr = $cp_message_qr;
1206         my $this_non_cp_trailing_text = $non_cp_trailing_text;
1207
1208         foreach my $malformed_allow_type (0..2) {
1209           # 0 don't allow this malformation; ignored if no malformation
1210           # 1 allow, with REPLACEMENT CHARACTER returned
1211           # 2 allow, with intended code point returned.  All malformations
1212           #   other than overlong can't determine the intended code point,
1213           #   so this isn't valid for them.
1214           next if     $malformed_allow_type == 2
1215                   && ($will_overflow || $short || $unexpected_noncont);
1216           next if $skip_most_tests && $malformed_allow_type;
1217
1218           # Here we are in the innermost loop for malformations.  So we
1219           # know which ones are in effect.  Can now change the input to be
1220           # appropriately malformed.  We also can set up certain other
1221           # things now, like whether we expect a return flag from this
1222           # malformation, and which flag.
1223
1224           my $this_bytes = $bytes;
1225           my $this_length = $length;
1226           my $this_expected_len = $length;
1227           my $this_needed_to_discern_len = $needed_to_discern_len;
1228
1229           my @malformation_names;
1230           my @expected_malformation_warnings;
1231           my @expected_malformation_return_flags;
1232
1233           # Contains the flags for any allowed malformations.  Currently no
1234           # combinations of on/off are tested for.  It's either all are
1235           # allowed, or none are.
1236           my $allow_flags = 0;
1237           my $overlong_is_in_perl_extended_utf8 = 0;
1238           my $dont_use_overlong_cp = 0;
1239
1240           if ($initially_orphan) {
1241               next if $overlong || $short || $unexpected_noncont;
1242           }
1243
1244           if ($overlong) {
1245               if (! $initially_overlong) {
1246                   my $new_expected_len;
1247
1248                   # To force this malformation, we convert the original start
1249                   # byte into a continuation byte with the same data bits as
1250                   # originally. ...
1251                   my $start_byte = substr($this_bytes, 0, 1);
1252                   my $converted_to_continuation_byte
1253                                           = start_byte_to_cont($start_byte);
1254
1255                   # ... Then we prepend it with a known overlong sequence.
1256                   # This should evaluate to the exact same code point as the
1257                   # original.  We try to avoid an overlong using Perl
1258                   # extended UTF-8.  The code points are the highest
1259                   # representable as overlongs on the respective platform
1260                   # without using extended UTF-8.
1261                   if (native_to_I8($start_byte) lt "\xFC") {
1262                       $start_byte = I8_to_native("\xFC");
1263                       $new_expected_len = 6;
1264                   }
1265                   elsif (! isASCII && native_to_I8($start_byte) lt "\xFE") {
1266
1267                       # FE is not extended UTF-8 on EBCDIC
1268                       $start_byte = I8_to_native("\xFE");
1269                       $new_expected_len = 7;
1270                   }
1271                   else {  # Must use extended UTF-8.  On ASCII platforms, we
1272                           # could express some overlongs here starting with
1273                           # \xFE, but there's no real reason to do so.
1274                       $overlong_is_in_perl_extended_utf8 = 1;
1275                       $start_byte = I8_to_native("\xFF");
1276                       $new_expected_len = $::max_bytes;
1277                       $this_cp_message_qr = $extended_cp_message_qr;
1278
1279                       # The warning that gets raised doesn't include the
1280                       # code point in the message if the code point can be
1281                       # expressed without using extended UTF-8, but the
1282                       # particular overlong sequence used is in extended
1283                       # UTF-8.  To do otherwise would be confusing to the
1284                       # user, as it would claim the code point requires
1285                       # extended, when it doesn't.
1286                       $dont_use_overlong_cp = 1
1287                                   unless requires_extended_utf8($allowed_uv);
1288                       $this_non_cp_trailing_text
1289                                             = $extended_non_cp_trailing_text;
1290                   }
1291
1292                   # Splice in the revise continuation byte, preceded by the
1293                   # start byte and the proper number of the lowest
1294                   # continuation bytes.
1295                   $this_bytes =   $start_byte
1296                               . ($native_lowest_continuation_chr
1297                                   x (  $new_expected_len
1298                                       - 1
1299                                       - length($this_bytes)))
1300                               .  $converted_to_continuation_byte
1301                               .  substr($this_bytes, 1);
1302                   $this_length = length($this_bytes);
1303                   $this_needed_to_discern_len =    $new_expected_len
1304                                               - (  $this_expected_len
1305                                               - $this_needed_to_discern_len);
1306                   $this_expected_len = $new_expected_len;
1307               }
1308           }
1309
1310           if ($short) {
1311
1312               # To force this malformation, just tell the test to not look
1313               # as far as it should into the input.
1314               $this_length--;
1315               $this_expected_len--;
1316
1317               $allow_flags |= $::UTF8_ALLOW_SHORT if $malformed_allow_type;
1318           }
1319
1320           if ($unexpected_noncont) {
1321
1322               # To force this malformation, change the final continuation
1323               # byte into a start byte.
1324               my $pos = ($short) ? -2 : -1;
1325               substr($this_bytes, $pos, 1) = $known_start_byte;
1326               $this_expected_len--;
1327           }
1328
1329           # The whole point of a test that is malformed from the beginning
1330           # is to test for that malformation.  If we've modified things so
1331           # much that we don't have enough information to detect that
1332           # malformation, there's no point in testing.
1333           next if    $initially_malformed
1334                   && $this_expected_len < $this_needed_to_discern_len;
1335
1336           # Here, we've transformed the input with all of the desired
1337           # non-overflow malformations.  We are now in a position to
1338           # construct any potential warnings for those malformations.  But
1339           # it's a pain to get the detailed messages exactly right, so for
1340           # now XXX, only do so for those that return an explicit code
1341           # point.
1342
1343           if ($initially_orphan) {
1344               push @malformation_names, "orphan continuation";
1345               push @expected_malformation_return_flags,
1346                                                   $::UTF8_GOT_CONTINUATION;
1347               $allow_flags |= $::UTF8_ALLOW_CONTINUATION
1348                                                   if $malformed_allow_type;
1349               push @expected_malformation_warnings, qr/unexpected continuation/;
1350           }
1351
1352           if ($overlong) {
1353               push @malformation_names, 'overlong';
1354               push @expected_malformation_return_flags, $::UTF8_GOT_LONG;
1355
1356               # If one of the other malformation types is also in effect, we
1357               # don't know what the intended code point was.
1358               if ($short || $unexpected_noncont || $will_overflow) {
1359                   push @expected_malformation_warnings, qr/overlong/;
1360               }
1361               else {
1362                   my $wrong_bytes = display_bytes_no_quotes(
1363                                         substr($this_bytes, 0, $this_length));
1364                   if (! defined $correct_bytes_for_overlong) {
1365                       $correct_bytes_for_overlong
1366                                           = display_bytes_no_quotes($bytes);
1367                   }
1368                   my $prefix = (   $allowed_uv > 0x10FFFF
1369                                 || ! isASCII && $allowed_uv < 256)
1370                                 ? "0x"
1371                                 : "U+";
1372                   push @expected_malformation_warnings,
1373                           qr/\QMalformed UTF-8 character: $wrong_bytes\E
1374                               \Q (overlong; instead use\E
1375                               \Q $correct_bytes_for_overlong to\E
1376                               \Q represent $prefix$uv_string)/x;
1377               }
1378
1379               if ($malformed_allow_type == 2) {
1380                   $allow_flags |= $::UTF8_ALLOW_LONG_AND_ITS_VALUE;
1381               }
1382               elsif ($malformed_allow_type) {
1383                   $allow_flags |= $::UTF8_ALLOW_LONG;
1384               }
1385           }
1386           if ($short) {
1387               push @malformation_names, 'short';
1388               push @expected_malformation_return_flags, $::UTF8_GOT_SHORT;
1389               push @expected_malformation_warnings, qr/too short/;
1390           }
1391           if ($unexpected_noncont) {
1392               push @malformation_names, 'unexpected non-continuation';
1393               push @expected_malformation_return_flags,
1394                               $::UTF8_GOT_NON_CONTINUATION;
1395               $allow_flags |= $::UTF8_ALLOW_NON_CONTINUATION
1396                                                   if $malformed_allow_type;
1397               push @expected_malformation_warnings,
1398                                       qr/unexpected non-continuation byte/;
1399           }
1400
1401           # The overflow malformation is done differently than other
1402           # malformations.  It comes from manually typed tests in the test
1403           # array.  We now make it be treated like one of the other
1404           # malformations.  But some has to be deferred until the inner loop
1405           my $overflow_msg_pattern;
1406           if ($will_overflow) {
1407               push @malformation_names, 'overflow';
1408
1409               $overflow_msg_pattern = display_bytes_no_quotes(
1410                                   substr($this_bytes, 0, $this_expected_len));
1411               $overflow_msg_pattern = qr/\QMalformed UTF-8 character:\E
1412                                           \Q $overflow_msg_pattern\E
1413                                           \Q (overflows)\E/x;
1414               push @expected_malformation_return_flags, $::UTF8_GOT_OVERFLOW;
1415               $allow_flags |= $::UTF8_ALLOW_OVERFLOW if $malformed_allow_type;
1416           }
1417
1418           # And we can create the malformation-related text for the the test
1419           # names we eventually will generate.
1420           my $malformations_name = "";
1421           if (@malformation_names) {
1422               $malformations_name .= "dis" unless $malformed_allow_type;
1423               $malformations_name .= "allowed ";
1424               $malformations_name .= "malformation";
1425               $malformations_name .= "s" if @malformation_names > 1;
1426               $malformations_name .= ": ";
1427               $malformations_name .=  join "/", @malformation_names;
1428               $malformations_name =  " ($malformations_name)";
1429           }
1430
1431           # Done setting up the malformation related stuff
1432
1433           {   # First test the isFOO calls
1434               use warnings; # XXX no warnings 'deprecated';   # Make sure these don't raise warnings
1435               undef @warnings_gotten;
1436
1437               my $ret = test_isUTF8_CHAR($this_bytes, $this_length);
1438               my $ret_flags
1439                       = test_isUTF8_CHAR_flags($this_bytes, $this_length, 0);
1440               if ($malformations_name) {
1441                   is($ret, 0, "For $testname$malformations_name: isUTF8_CHAR() returns 0");
1442                   is($ret_flags, 0, "    And isUTF8_CHAR_flags() returns 0");
1443               }
1444               else {
1445                   is($ret, $this_length, "For $testname: isUTF8_CHAR() returns"
1446                                         . " expected length: $this_length");
1447                   is($ret_flags, $this_length,
1448                       "    And isUTF8_CHAR_flags(...,0) returns expected"
1449                     . " length: $this_length");
1450               }
1451               is(scalar @warnings_gotten, 0,
1452                   "    And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags"
1453                 . " generated any warnings")
1454               or output_warnings(@warnings_gotten);
1455
1456               undef @warnings_gotten;
1457               $ret = test_isSTRICT_UTF8_CHAR($this_bytes, $this_length);
1458               if ($malformations_name) {
1459                   is($ret, 0, "    And isSTRICT_UTF8_CHAR() returns 0");
1460               }
1461               else {
1462                   my $expected_ret
1463                               = (   $testname =~ /surrogate|non-character/
1464                                   || $allowed_uv > 0x10FFFF)
1465                                 ? 0
1466                                 : $this_length;
1467                   is($ret, $expected_ret,
1468                       "    And isSTRICT_UTF8_CHAR() returns expected"
1469                     . " length: $expected_ret");
1470                   $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length,
1471                                       $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
1472                   is($ret, $expected_ret,
1473                       "    And isUTF8_CHAR_flags('"
1474                     . "DISALLOW_ILLEGAL_INTERCHANGE') acts like"
1475                     . " isSTRICT_UTF8_CHAR");
1476               }
1477               is(scalar @warnings_gotten, 0,
1478                       "    And neither isSTRICT_UTF8_CHAR() nor"
1479                     . " isUTF8_CHAR_flags generated any warnings")
1480               or output_warnings(@warnings_gotten);
1481
1482               undef @warnings_gotten;
1483               $ret = test_isC9_STRICT_UTF8_CHAR($this_bytes, $this_length);
1484               if ($malformations_name) {
1485                   is($ret, 0, "    And isC9_STRICT_UTF8_CHAR() returns 0");
1486               }
1487               else {
1488                   my $expected_ret = (   $testname =~ /surrogate/
1489                                       || $allowed_uv > 0x10FFFF)
1490                                       ? 0
1491                                       : $this_expected_len;
1492                   is($ret, $expected_ret, "    And isC9_STRICT_UTF8_CHAR()"
1493                                         . " returns expected length:"
1494                                         . " $expected_ret");
1495                   $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length,
1496                                   $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
1497                   is($ret, $expected_ret,
1498                       "    And isUTF8_CHAR_flags('"
1499                     . "DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like"
1500                     . " isC9_STRICT_UTF8_CHAR");
1501               }
1502               is(scalar @warnings_gotten, 0,
1503                       "    And neither isC9_STRICT_UTF8_CHAR() nor"
1504                     . " isUTF8_CHAR_flags generated any warnings")
1505               or output_warnings(@warnings_gotten);
1506
1507               foreach my $disallow_type (0..2) {
1508                   # 0 is don't disallow this type of code point
1509                   # 1 is do disallow
1510                   # 2 is do disallow, but only code points requiring
1511                   #   perl-extended-UTF8
1512
1513                   my $disallow_flags;
1514                   my $expected_ret;
1515
1516                   if ($malformations_name) {
1517
1518                       # Malformations are by default disallowed, so testing
1519                       # with $disallow_type equal to 0 is sufficicient.
1520                       next if $disallow_type;
1521
1522                       $disallow_flags = 0;
1523                       $expected_ret = 0;
1524                   }
1525                   elsif ($disallow_type == 1) {
1526                       $disallow_flags = $utf8n_flag_to_disallow;
1527                       $expected_ret = 0;
1528                   }
1529                   elsif ($disallow_type == 2) {
1530                       next if ! requires_extended_utf8($allowed_uv);
1531                       $disallow_flags = $::UTF8_DISALLOW_PERL_EXTENDED;
1532                       $expected_ret = 0;
1533                   }
1534                   else {  # type is 0
1535                       $disallow_flags = $utf8n_flag_to_disallow_complement;
1536                       $expected_ret = $this_length;
1537                   }
1538
1539                   $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length,
1540                                                 $disallow_flags);
1541                   is($ret, $expected_ret,
1542                             "    And isUTF8_CHAR_flags($display_bytes,"
1543                           . " $disallow_flags) returns $expected_ret")
1544                     or diag "The flags mean "
1545                           . flags_to_text($disallow_flags,
1546                                           \@utf8n_flags_to_text);
1547                   is(scalar @warnings_gotten, 0,
1548                           "    And isUTF8_CHAR_flags(...) generated"
1549                         . " no warnings")
1550                     or output_warnings(@warnings_gotten);
1551
1552                   # Test partial character handling, for each byte not a
1553                   # full character
1554                   my $did_test_partial = 0;
1555                   for (my $j = 1; $j < $this_length - 1; $j++) {
1556                       $did_test_partial = 1;
1557                       my $partial = substr($this_bytes, 0, $j);
1558                       my $ret_should_be;
1559                       my $comment;
1560                       if ($disallow_type || $malformations_name) {
1561                           $ret_should_be = 0;
1562                           $comment = "disallowed";
1563
1564                           # The number of bytes required to tell if a
1565                           # sequence has something wrong is the smallest of
1566                           # all the things wrong with it.  We start with the
1567                           # number for this type of code point, if that is
1568                           # disallowed; or the whole length if not.  The
1569                           # latter is what a couple of the malformations
1570                           # require.
1571                           my $needed_to_tell = ($disallow_type)
1572                                                 ? $this_needed_to_discern_len
1573                                                 : $this_expected_len;
1574
1575                           # Then we see if the malformations that are
1576                           # detectable early in the string are present.
1577                           if ($overlong) {
1578                               my $dl = overlong_discern_len($this_bytes);
1579                               $needed_to_tell = $dl if $dl < $needed_to_tell;
1580                           }
1581                           if ($will_overflow) {
1582                               my $dl = overflow_discern_len($length);
1583                               $needed_to_tell = $dl if $dl < $needed_to_tell;
1584                           }
1585
1586                           if ($j < $needed_to_tell) {
1587                               $ret_should_be = 1;
1588                               $comment .= ", but need $needed_to_tell"
1589                                         . " bytes to discern:";
1590                           }
1591                       }
1592                       else {
1593                           $ret_should_be = 1;
1594                           $comment = "allowed";
1595                       }
1596
1597                       undef @warnings_gotten;
1598
1599                       $ret = test_is_utf8_valid_partial_char_flags($partial,
1600                                                       $j, $disallow_flags);
1601                       is($ret, $ret_should_be,
1602                           "    And is_utf8_valid_partial_char_flags("
1603                           . display_bytes($partial)
1604                           . ", $disallow_flags), $comment: returns"
1605                           . " $ret_should_be")
1606                       or diag "The flags mean "
1607                       . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
1608                   }
1609
1610                   if ($did_test_partial) {
1611                       is(scalar @warnings_gotten, 0,
1612                           "    And is_utf8_valid_partial_char_flags()"
1613                           . " generated no warnings for any of the lengths")
1614                         or output_warnings(@warnings_gotten);
1615                   }
1616               }
1617           }
1618
1619           # Now test the to/from UTF-8 calls.  There are several orthogonal
1620           # variables involved.  We test most possible combinations
1621
1622           foreach my $do_disallow (0, 1) {
1623             if ($do_disallow) {
1624               next if $initially_overlong || $initially_orphan;
1625             }
1626             else {
1627               next if $skip_most_tests;
1628             }
1629
1630             # This tests four functions: utf8n_to_uvchr_error,
1631             # utf8n_to_uvchr_msgs, uvchr_to_utf8_flags, and
1632             # uvchr_to_utf8_msgs.  The first two are variants of each other,
1633             # and the final two also form a pair.  We use a loop 'which_func'
1634             # to determine which of each pair is being tested.  The main loop
1635             # tests either the first and third, or the 2nd and fourth.
1636             # which_func is sets whether we are expecting warnings or not in
1637             # certain places.  The _msgs() version of the functions expects
1638             # warnings even if lexical ones are turned off, so by making its
1639             # which_func == 1, we can say we want warnings; whereas the other
1640             # one with the value 0, doesn't get them.
1641             for my $which_func (0, 1) {
1642               my $utf8_func = ($which_func)
1643                           ? 'utf8n_to_uvchr_msgs'
1644                           : 'utf8n_to_uvchr_error';
1645
1646               # We classify the warnings into certain "interesting" types,
1647               # described later
1648               foreach my $warning_type (0..4) {
1649                 next if $skip_most_tests && $warning_type != 1;
1650                 foreach my $use_warn_flag (0, 1) {
1651                     if ($use_warn_flag) {
1652                         next if $initially_overlong || $initially_orphan;
1653
1654                         # Since foo_msgs() expects warnings even when lexical
1655                         # ones are turned off, we can skip testing it when
1656                         # they are turned on, with little likelihood of
1657                         # missing an error case.
1658                         next if $which_func;
1659                     }
1660                     else {
1661                         next if $skip_most_tests;
1662                     }
1663
1664                     # Finally, here is the inner loop
1665
1666                     my $this_utf8n_flag_to_warn = $utf8n_flag_to_warn;
1667                     my $this_utf8n_flag_to_disallow = $utf8n_flag_to_disallow;
1668                     my $this_uvchr_flag_to_warn = $uvchr_flag_to_warn;
1669                     my $this_uvchr_flag_to_disallow = $uvchr_flag_to_disallow;
1670
1671                     my $eval_warn;
1672                     my $expect_regular_warnings;
1673                     my $expect_warnings_for_malformed;
1674                     my $expect_warnings_for_overflow;
1675
1676                     if ($warning_type == 0) {
1677                         $eval_warn = "use warnings";
1678                         $expect_regular_warnings = $use_warn_flag;
1679
1680                         # We ordinarily expect overflow warnings here.  But it
1681                         # is somewhat more complicated, and the final
1682                         # determination is deferred to one place in the file
1683                         # where we handle overflow.
1684                         $expect_warnings_for_overflow = 1;
1685
1686                         # We would ordinarily expect malformed warnings in
1687                         # this case, but not if malformations are allowed.
1688                         $expect_warnings_for_malformed
1689                                                 = $malformed_allow_type == 0;
1690                     }
1691                     elsif ($warning_type == 1) {
1692                         $eval_warn = "no warnings";
1693                         $expect_regular_warnings = $which_func;
1694                         $expect_warnings_for_overflow = $which_func;
1695                         $expect_warnings_for_malformed = $which_func;
1696                     }
1697                     elsif ($warning_type == 2) {
1698                         $eval_warn = "no warnings; use warnings 'utf8'";
1699                         $expect_regular_warnings = $use_warn_flag;
1700                         $expect_warnings_for_overflow = 1;
1701                         $expect_warnings_for_malformed
1702                                                 = $malformed_allow_type == 0;
1703                     }
1704                     elsif ($warning_type == 3) {
1705                         $eval_warn = "no warnings; use warnings"
1706                                    . " '$controlling_warning_category'";
1707                         $expect_regular_warnings = $use_warn_flag;
1708                         $expect_warnings_for_overflow
1709                             = $controlling_warning_category eq 'non_unicode';
1710                         $expect_warnings_for_malformed = $which_func;
1711                     }
1712                     elsif ($warning_type == 4) {  # Like type 3, but uses the
1713                                                   # PERL_EXTENDED flags
1714                         # The complement flags were set up so that the
1715                         # PERL_EXTENDED flags have been tested that they don't
1716                         # trigger wrongly for too small code points.  And the
1717                         # flags have been set up so that those small code
1718                         # points are tested for being above Unicode.  What's
1719                         # left to test is that the large code points do
1720                         # trigger the PERL_EXTENDED flags.
1721                         next if ! requires_extended_utf8($allowed_uv);
1722                         next if $controlling_warning_category ne 'non_unicode';
1723                         $eval_warn = "no warnings; use warnings 'non_unicode'";
1724                         $expect_regular_warnings = 1;
1725                         $expect_warnings_for_overflow = 1;
1726                         $expect_warnings_for_malformed = 0;
1727                         $this_utf8n_flag_to_warn = $::UTF8_WARN_PERL_EXTENDED;
1728                         $this_utf8n_flag_to_disallow
1729                                              = $::UTF8_DISALLOW_PERL_EXTENDED;
1730                         $this_uvchr_flag_to_warn
1731                                               = $::UNICODE_WARN_PERL_EXTENDED;
1732                         $this_uvchr_flag_to_disallow
1733                                           = $::UNICODE_DISALLOW_PERL_EXTENDED;
1734                     }
1735                     else {
1736                        die "Unexpected warning type '$warning_type'";
1737                     }
1738
1739                     # We only need to test the case where all warnings are
1740                     # enabled (type 0) to see if turning off the warning flag
1741                     # causes things to not be output.  If those pass, then
1742                     # turning on some sub-category of warnings, or turning off
1743                     # warnings altogether are extremely likely to not output
1744                     # warnings either, given how the warnings subsystem is
1745                     # supposed to work, and this file assumes it does work.
1746                     next if $warning_type != 0 && ! $use_warn_flag;
1747
1748                     # The convention is that the 'got' flag is the same value
1749                     # as the disallow one.  If this were violated, the tests
1750                     # here should start failing.
1751                     my $return_flag = $this_utf8n_flag_to_disallow;
1752
1753                     # If we aren't expecting warnings/disallow for this, turn
1754                     # on all the other flags.  That makes sure that they all
1755                     # are independent of this flag, and so we don't need to
1756                     # test them individually.
1757                     my $this_warning_flags
1758                             = ($use_warn_flag)
1759                               ? $this_utf8n_flag_to_warn
1760                               : ($overlong_is_in_perl_extended_utf8
1761                                 ? ($utf8n_flag_to_warn_complement
1762                                     & ~$::UTF8_WARN_PERL_EXTENDED)
1763                                 :  $utf8n_flag_to_warn_complement);
1764                     my $this_disallow_flags
1765                             = ($do_disallow)
1766                               ? $this_utf8n_flag_to_disallow
1767                               : ($overlong_is_in_perl_extended_utf8
1768                                  ? ($utf8n_flag_to_disallow_complement
1769                                     & ~$::UTF8_DISALLOW_PERL_EXTENDED)
1770                                  :  $utf8n_flag_to_disallow_complement);
1771                     my $expected_uv = $allowed_uv;
1772                     my $this_uv_string = $uv_string;
1773
1774                     my @expected_return_flags
1775                                         = @expected_malformation_return_flags;
1776                     my @expected_warnings;
1777                     push @expected_warnings, @expected_malformation_warnings
1778                                             if $expect_warnings_for_malformed;
1779
1780                     # The overflow malformation is done differently than other
1781                     # malformations.  It comes from manually typed tests in
1782                     # the test array, but it also is above Unicode and uses
1783                     # Perl extended UTF-8, so affects some of the flags being
1784                     # tested.  We now make it be treated like one of the other
1785                     # generated malformations.
1786                     if ($will_overflow) {
1787
1788                         # An overflow is (way) above Unicode, and overrides
1789                         # everything else.
1790                         $expect_regular_warnings = 0;
1791
1792                         # Earlier, we tentatively calculated whether this
1793                         # should emit a message or not.  It's tentative
1794                         # because, even if we ordinarily would output it, we
1795                         # don't if malformations are allowed -- except an
1796                         # overflow is also a SUPER and PERL_EXTENDED, and if
1797                         # warnings for those are enabled, the overflow
1798                         # warning does get raised.
1799                         if (   $expect_warnings_for_overflow
1800                             && (    $malformed_allow_type == 0
1801                                 ||   (   $this_warning_flags
1802                                       & ($::UTF8_WARN_SUPER
1803                                         |$::UTF8_WARN_PERL_EXTENDED))))
1804                         {
1805                             push @expected_warnings, $overflow_msg_pattern;
1806                         }
1807                     }
1808
1809                     # It may be that the malformations have shortened the
1810                     # amount of input we look at so much that we can't tell
1811                     # what the category the code point was in.  Otherwise, set
1812                     # up the expected return flags based on the warnings and
1813                     # disallowments.
1814                     if ($this_expected_len < $this_needed_to_discern_len) {
1815                         $expect_regular_warnings = 0;
1816                     }
1817                     elsif (   ($this_warning_flags & $this_utf8n_flag_to_warn)
1818                            || (  $this_disallow_flags
1819                                & $this_utf8n_flag_to_disallow))
1820                     {
1821                         push @expected_return_flags, $return_flag;
1822                     }
1823
1824                     # Finish setting up the expected warning.
1825                     if ($expect_regular_warnings) {
1826
1827                         # So far the array contains warnings generated by
1828                         # malformations.  Add the expected regular one.
1829                         unshift @expected_warnings, $this_cp_message_qr;
1830
1831                         # But it may need to be modified, because either of
1832                         # these malformations means we can't determine the
1833                         # expected code point.
1834                         if (   $short || $unexpected_noncont
1835                             || $dont_use_overlong_cp)
1836                         {
1837                             my $first_byte = substr($this_bytes, 0, 1);
1838                             $expected_warnings[0] = display_bytes(
1839                                     substr($this_bytes, 0, $this_expected_len));
1840                             $expected_warnings[0]
1841                                 = qr/[Aa]\Qny UTF-8 sequence that starts with\E
1842                                      \Q $expected_warnings[0]\E
1843                                      \Q $this_non_cp_trailing_text\E/x;
1844                         }
1845                     }
1846
1847                     # Is effectively disallowed if we've set up a malformation
1848                     # (unless malformations are allowed), even if the flag
1849                     # indicates it is allowed.  Fix up test name to indicate
1850                     # this as well
1851                     my $disallowed = 0;
1852                     if (   $this_disallow_flags & $this_utf8n_flag_to_disallow
1853                         && $this_expected_len >= $this_needed_to_discern_len)
1854                     {
1855                         $disallowed = 1;
1856                     }
1857                     if ($malformations_name) {
1858                         if ($malformed_allow_type == 0) {
1859                             $disallowed = 1;
1860                         }
1861                         elsif ($malformed_allow_type == 1) {
1862
1863                             # Even if allowed, the malformation returns the
1864                             # REPLACEMENT CHARACTER.
1865                             $expected_uv = 0xFFFD;
1866                             $this_uv_string = "0xFFFD"
1867                         }
1868                     }
1869
1870                     my $this_name = "$utf8_func() $testname: ";
1871                     my @scratch_expected_return_flags = @expected_return_flags;
1872                     if (! $initially_malformed) {
1873                         $this_name .= ($disallowed)
1874                                        ? 'disallowed, '
1875                                        : 'allowed, ';
1876                     }
1877                     $this_name .= "$eval_warn";
1878                     $this_name .= ", " . ((  $this_warning_flags
1879                                             & $this_utf8n_flag_to_warn)
1880                                           ? 'with flag for raising warnings'
1881                                           : 'no flag for raising warnings');
1882                     $this_name .= $malformations_name;
1883
1884                     # Do the actual test using an eval
1885                     undef @warnings_gotten;
1886                     my $ret_ref;
1887                     my $this_flags
1888                         = $allow_flags|$this_warning_flags|$this_disallow_flags;
1889                     my $eval_text =      "$eval_warn; \$ret_ref"
1890                             . " = test_$utf8_func("
1891                             . "'$this_bytes', $this_length, $this_flags)";
1892                     eval "$eval_text";
1893                     if (! ok ($@ eq "", "$this_name: eval succeeded"))
1894                     {
1895                         diag "\$@='$@'; call was: "
1896                            . utf8n_display_call($eval_text);
1897                         next;
1898                     }
1899
1900                     if ($disallowed) {
1901                         is($ret_ref->[0], 0, "    And returns 0")
1902                           or diag "Call was: " . utf8n_display_call($eval_text);
1903                     }
1904                     else {
1905                         is($ret_ref->[0], $expected_uv,
1906                                 "    And returns expected uv: "
1907                               . $this_uv_string)
1908                           or diag "Call was: " . utf8n_display_call($eval_text);
1909                     }
1910                     is($ret_ref->[1], $this_expected_len,
1911                                         "    And returns expected length:"
1912                                       . " $this_expected_len")
1913                       or diag "Call was: " . utf8n_display_call($eval_text);
1914
1915                     my $returned_flags = $ret_ref->[2];
1916
1917                     for (my $i = @scratch_expected_return_flags - 1;
1918                          $i >= 0;
1919                          $i--)
1920                     {
1921                       if ($scratch_expected_return_flags[$i] & $returned_flags)
1922                       {
1923                           if ($scratch_expected_return_flags[$i]
1924                                               == $::UTF8_GOT_PERL_EXTENDED)
1925                           {
1926                               pass("    Expected and got return flag for"
1927                                   . " PERL_EXTENDED");
1928                           }
1929                                   # The first entries in this are
1930                                   # malformations
1931                           elsif ($i > @malformation_names - 1)  {
1932                               pass("    Expected and got return flag"
1933                                   . " for " . $controlling_warning_category);
1934                           }
1935                           else {
1936                               pass("    Expected and got return flag for "
1937                                   . $malformation_names[$i]
1938                                   . " malformation");
1939                           }
1940                           $returned_flags
1941                                       &= ~$scratch_expected_return_flags[$i];
1942                           splice @scratch_expected_return_flags, $i, 1;
1943                       }
1944                     }
1945
1946                     if (! is($returned_flags, 0,
1947                        "    Got no unexpected return flags"))
1948                     {
1949                         diag "The unexpected flags gotten were: "
1950                            . (flags_to_text($returned_flags,
1951                                             \@utf8n_flags_to_text)
1952                                 # We strip off any prefixes from the flag
1953                                 # names
1954                              =~ s/ \b [A-Z] _ //xgr);
1955                         diag "Call was: " . utf8n_display_call($eval_text);
1956                     }
1957
1958                     if (! is (scalar @scratch_expected_return_flags, 0,
1959                         "    Got all expected return flags"))
1960                     {
1961                         diag "The expected flags not gotten were: "
1962                            . (flags_to_text(eval join("|",
1963                                                 @scratch_expected_return_flags),
1964                                             \@utf8n_flags_to_text)
1965                                 # We strip off any prefixes from the flag
1966                                 # names
1967                              =~ s/ \b [A-Z] _ //xgr);
1968                         diag "Call was: " . utf8n_display_call($eval_text);
1969                     }
1970
1971                     if ($which_func) {
1972                         my @returned_warnings;
1973                         for my $element_ref (@{$ret_ref->[3]}) {
1974                             push @returned_warnings, $element_ref->{'text'};
1975                             my $text = $element_ref->{'text'};
1976                             my $flag = $element_ref->{'flag_bit'};
1977                             my $category = $element_ref->{'warning_category'};
1978
1979                             if (! ok(($flag & ($flag-1)) == 0,
1980                                       "flag for returned msg is a single bit"))
1981                             {
1982                               diag sprintf("flags are %x; msg=%s", $flag, $text);
1983                             }
1984                             else {
1985                               if (grep { $_ == $flag } @expected_return_flags) {
1986                                   pass("flag for returned msg is expected");
1987                               }
1988                               else {
1989                                   fail("flag ("
1990                                      . flags_to_text($flag, \@utf8n_flags_to_text)
1991                                      . ") for returned msg is expected");
1992                               }
1993                             }
1994
1995                             # In perl space, don't know the category numbers
1996                             isnt($category, 0,
1997                                           "returned category for msg isn't 0");
1998                         }
1999
2000                         ok(@warnings_gotten == 0, "$utf8_func raised no warnings;"
2001                               . " the next tests are for ones in the returned"
2002                               . " variable")
2003                             or diag join "\n", "The unexpected warnings were:",
2004                                                               @warnings_gotten;
2005                         @warnings_gotten = @returned_warnings;
2006                     }
2007
2008                     do_warnings_test(@expected_warnings)
2009                       or diag "Call was: " . utf8n_display_call($eval_text);
2010                     undef @warnings_gotten;
2011
2012                     # Check CHECK_ONLY results when the input is
2013                     # disallowed.  Do this when actually disallowed,
2014                     # not just when the $this_disallow_flags is set.  We only
2015                     # test once utf8n_to_uvchr_msgs() with this.
2016                     if (   $disallowed
2017                         && ($which_func == 0 || ! $tested_CHECK_ONLY))
2018                     {
2019                         $tested_CHECK_ONLY = 1;
2020                         my $this_flags = $this_disallow_flags|$::UTF8_CHECK_ONLY;
2021                         my $eval_text = "use warnings; \$ret_ref ="
2022                                       . " test_$utf8_func('"
2023                                       . "$this_bytes', $this_length,"
2024                                       . " $this_flags)";
2025                         eval $eval_text;
2026                         if (! ok ($@ eq "",
2027                             "    And eval succeeded with CHECK_ONLY"))
2028                         {
2029                             diag "\$@='$@'; Call was: "
2030                                . utf8n_display_call($eval_text);
2031                             next;
2032                         }
2033                         is($ret_ref->[0], 0, "    CHECK_ONLY: Returns 0")
2034                           or diag "Call was: " . utf8n_display_call($eval_text);
2035                         is($ret_ref->[1], -1,
2036                                        "    CHECK_ONLY: returns -1 for length")
2037                           or diag "Call was: " . utf8n_display_call($eval_text);
2038                         if (! is(scalar @warnings_gotten, 0,
2039                                       "    CHECK_ONLY: no warnings generated"))
2040                         {
2041                             diag "Call was: " . utf8n_display_call($eval_text);
2042                             output_warnings(@warnings_gotten);
2043                         }
2044                     }
2045
2046                     # Now repeat some of the above, but for
2047                     # uvchr_to_utf8_flags().  Since this comes from an
2048                     # existing code point, it hasn't overflowed, and isn't
2049                     # malformed.
2050                     next if @malformation_names;
2051
2052                     my $uvchr_func = ($which_func)
2053                                      ? 'uvchr_to_utf8_flags_msgs'
2054                                      : 'uvchr_to_utf8_flags';
2055
2056                     $this_warning_flags = ($use_warn_flag)
2057                                           ? $this_uvchr_flag_to_warn
2058                                           : 0;
2059                     $this_disallow_flags = ($do_disallow)
2060                                            ? $this_uvchr_flag_to_disallow
2061                                            : 0;
2062
2063                     $disallowed = $this_disallow_flags
2064                                 & $this_uvchr_flag_to_disallow;
2065                     $this_name .= ", " . ((  $this_warning_flags
2066                                            & $this_utf8n_flag_to_warn)
2067                                           ? 'with flag for raising warnings'
2068                                           : 'no flag for raising warnings');
2069
2070                     $this_name = "$uvchr_func() $testname: "
2071                                         . (($disallowed)
2072                                            ? 'disallowed'
2073                                            : 'allowed');
2074                     $this_name .= ", $eval_warn";
2075                     $this_name .= ", " . ((  $this_warning_flags
2076                                            & $this_uvchr_flag_to_warn)
2077                                         ? 'with warning flag'
2078                                         : 'no warning flag');
2079
2080                     undef @warnings_gotten;
2081                     my $ret;
2082                     $this_flags = $this_warning_flags|$this_disallow_flags;
2083                     $eval_text = "$eval_warn; \$ret ="
2084                             . " test_$uvchr_func("
2085                             . "$allowed_uv, $this_flags)";
2086                     eval "$eval_text";
2087                     if (! ok ($@ eq "", "$this_name: eval succeeded"))
2088                     {
2089                         diag "\$@='$@'; call was: "
2090                            . uvchr_display_call($eval_text);
2091                         next;
2092                     }
2093
2094                     if ($which_func) {
2095                         if (defined $ret->[1]) {
2096                             my @returned_warnings;
2097                             push @returned_warnings, $ret->[1]{'text'};
2098                             my $text = $ret->[1]{'text'};
2099                             my $flag = $ret->[1]{'flag_bit'};
2100                             my $category = $ret->[1]{'warning_category'};
2101
2102                             if (! ok(($flag & ($flag-1)) == 0,
2103                                         "flag for returned msg is a single bit"))
2104                             {
2105                                 diag sprintf("flags are %x; msg=%s", $flag, $text);
2106                             }
2107                             else {
2108                                 if ($flag & $this_uvchr_flag_to_disallow) {
2109                                     pass("flag for returned msg is expected");
2110                                 }
2111                                 else {
2112                                     fail("flag ("
2113                                         . flags_to_text($flag, \@utf8n_flags_to_text)
2114                                         . ") for returned msg is expected");
2115                                 }
2116                             }
2117
2118                             # In perl space, don't know the category numbers
2119                             isnt($category, 0,
2120                                             "returned category for msg isn't 0");
2121
2122                             ok(@warnings_gotten == 0, "$uvchr_func raised no warnings;"
2123                                 . " the next tests are for ones in the returned"
2124                                 . " variable")
2125                                 or diag join "\n", "The unexpected warnings were:",
2126                                                                 @warnings_gotten;
2127                             @warnings_gotten = @returned_warnings;
2128                         }
2129
2130                         $ret = $ret->[0];
2131                     }
2132
2133                     if ($disallowed) {
2134                         is($ret, undef, "    And returns undef")
2135                           or diag "Call was: " . uvchr_display_call($eval_text);
2136                     }
2137                     else {
2138                         is($ret, $this_bytes, "    And returns expected string")
2139                           or diag "Call was: " . uvchr_display_call($eval_text);
2140                     }
2141
2142                     do_warnings_test(@expected_warnings)
2143                       or diag "Call was: " . uvchr_display_call($eval_text);
2144                 }
2145               }
2146             }
2147           }
2148         }
2149       }
2150     }
2151   }
2152 }
2153
2154 done_testing;