This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
APItest:t/utf8_warn_base.pl: Move a variable outside sub()
[perl5.git] / ext / XS-APItest / t / utf8_warn_base.pl
CommitLineData
6aa905cf
KW
1#!perl -w
2
3# This is a base file to be used by various .t's in its directory
3f055917
KW
4# It tests various malformed UTF-8 sequences and some code points that are
5# "problematic", and verifies that the correct warnings/flags etc are
6# generated when using them. For the code points, it also takes the UTF-8 and
7# perturbs it to be malformed in various ways, and tests that this gets
8# appropriately detected.
6aa905cf
KW
9
10use strict;
11use Test::More;
12
13BEGIN {
14 use_ok('XS::APItest');
15 require 'charset_tools.pl';
16 require './t/utf8_setup.pl';
17};
18
19$|=1;
20
6aa905cf 21use XS::APItest;
6aa905cf 22
9cdc3054 23my @warnings_gotten;
6aa905cf
KW
24
25use warnings 'utf8';
8d6f1506
KW
26local $SIG{__WARN__} = sub { my @copy = @_;
27 push @warnings_gotten, map { chomp; $_ } @copy;
28 };
6d736463 29
57ff5f59 30my $highest_non_extended_utf8_cp = (isASCII) ? 0x7FFFFFFF : 0x3FFFFFFF;
9e35eec9
KW
31my $native_lowest_continuation_chr = I8_to_native(chr $::lowest_continuation);
32
08e73697
KW
33# C5 is chosen as it is valid for both ASCII and EBCDIC platforms
34my $known_start_byte = I8_to_native("\xC5");
35
6d736463
KW
36sub requires_extended_utf8($) {
37
38 # Returns a boolean as to whether or not the code point parameter fits
39 # into 31 bits, subject to the convention that a negative code point
40 # stands for one that overflows the word size, so won't fit in 31 bits.
41
57ff5f59 42 return shift > $highest_non_extended_utf8_cp;
6d736463
KW
43}
44
3f055917
KW
45sub is_extended_utf8($) {
46
47 # Returns a boolean as to whether or not the input UTF-8 sequence uses
48 # Perl extended UTF-8.
49
50 my $byte = substr(shift, 0, 1);
51 return ord $byte >= 0xFE if isASCII;
52 return $byte == I8_to_native("\xFF");
53}
54
404a1403
KW
55sub overflow_discern_len($) {
56
8e0327af
KW
57 # Returns how many bytes are needed to tell if a non-overlong UTF-8
58 # sequence is for a code point that won't fit in the platform's word size.
59 # Only the length of the sequence representing a single code point is
60 # needed.
404a1403
KW
61
62 if (isASCII) {
d22ec717
KW
63 return ($::is64bit) ? 3 : 1;
64
65 # Below is needed for code points above IV_MAX
66 #return ($::is64bit) ? 3 : ((shift == $::max_bytes)
67 # ? 1
68 # : 2);
404a1403
KW
69 }
70
71 return ($::is64bit) ? 2 : 8;
72}
73
8e0327af
KW
74sub overlong_discern_len($) {
75
76 # Returns how many bytes are needed to tell if the input UTF-8 sequence
77 # for a code point is overlong
78
79 my $string = shift;
80 my $length = length $string;
81 my $byte = ord native_to_I8(substr($string, 0, 1));
82 if (isASCII) {
d22ec717
KW
83 return ($byte >= 0xFE)
84 ? ((! $::is64bit)
85 ? 1
86 : ($byte == 0xFF) ? 7 : 2)
8e0327af 87 : (($length == 2) ? 1 : 2);
d22ec717
KW
88 # Below is needed for code points above IV_MAX
89 #return ($length == $::max_bytes)
90 # # This is constrained to 1 on 32-bit machines, as it
91 # # overflows there
92 # ? (($::is64bit) ? 7 : 1)
93 # : (($length == 2) ? 1 : 2);
8e0327af
KW
94 }
95
96 return ($length == $::max_bytes) ? 8 : (($length <= 3) ? 1 : 2);
97}
98
c7f9e4bb
KW
99my @tests;
100{
101 no warnings qw(portable overflow);
102 @tests = (
8f79178b
KW
103 # $testname,
104 # $bytes, UTF-8 string
105 # $allowed_uv, code point $bytes evaluates to; -1 if
106 # overflows
107 # $needed_to_discern_len optional, how long an initial substring do
108 # we need to tell that the string must be for
109 # a code point in the category it falls in,
110 # like being a surrogate; 0 indicates we need
111 # the whole string. Some categories have a
112 # default that is used if this is omitted.
1d21b5e7
KW
113 [ "orphan continuation byte malformation",
114 I8_to_native("$::I8c"),
115 0xFFFD,
116 1,
117 ],
3f055917
KW
118 [ "overlong malformation, lowest 2-byte",
119 (isASCII) ? "\xc0\x80" : I8_to_native("\xc0\xa0"),
120 0, # NUL
121 ],
122 [ "overlong malformation, highest 2-byte",
123 (isASCII) ? "\xc1\xbf" : I8_to_native("\xc4\xbf"),
d819dc50 124 (isASCII) ? 0x7F : 0xFF,
3f055917
KW
125 ],
126 [ "overlong malformation, lowest 3-byte",
127 (isASCII) ? "\xe0\x80\x80" : I8_to_native("\xe0\xa0\xa0"),
128 0, # NUL
129 ],
130 [ "overlong malformation, highest 3-byte",
131 (isASCII) ? "\xe0\x9f\xbf" : I8_to_native("\xe0\xbf\xbf"),
132 (isASCII) ? 0x7FF : 0x3FF,
133 ],
8f79178b
KW
134 [ "lowest surrogate",
135 (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
136 0xD800,
137 ],
138 [ "a middle surrogate",
139 (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
140 0xD90D,
141 ],
142 [ "highest surrogate",
143 (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
144 0xDFFF,
145 ],
8f79178b
KW
146 [ "first of 32 consecutive non-character code points",
147 (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
148 0xFDD0,
149 ],
150 [ "a mid non-character code point of the 32 consecutive ones",
151 (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
152 0xFDE0,
153 ],
154 [ "final of 32 consecutive non-character code points",
155 (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
156 0xFDEF,
157 ],
158 [ "non-character code point U+FFFE",
159 (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
160 0xFFFE,
161 ],
162 [ "non-character code point U+FFFF",
163 (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
164 0xFFFF,
165 ],
3f055917
KW
166 [ "overlong malformation, lowest 4-byte",
167 (isASCII) ? "\xf0\x80\x80\x80" : I8_to_native("\xf0\xa0\xa0\xa0"),
168 0, # NUL
169 ],
170 [ "overlong malformation, highest 4-byte",
171 (isASCII) ? "\xf0\x8F\xbf\xbf" : I8_to_native("\xf0\xaf\xbf\xbf"),
172 (isASCII) ? 0xFFFF : 0x3FFF,
173 ],
8f79178b
KW
174 [ "non-character code point U+1FFFE",
175 (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
176 0x1FFFE,
177 ],
178 [ "non-character code point U+1FFFF",
179 (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
180 0x1FFFF,
181 ],
182 [ "non-character code point U+2FFFE",
183 (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
184 0x2FFFE,
185 ],
186 [ "non-character code point U+2FFFF",
187 (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
188 0x2FFFF,
189 ],
190 [ "non-character code point U+3FFFE",
191 (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
192 0x3FFFE,
193 ],
194 [ "non-character code point U+3FFFF",
195 (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
196 0x3FFFF,
197 ],
198 [ "non-character code point U+4FFFE",
199 (isASCII)
200 ? "\xf1\x8f\xbf\xbe"
201 : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
202 0x4FFFE,
203 ],
204 [ "non-character code point U+4FFFF",
205 (isASCII)
206 ? "\xf1\x8f\xbf\xbf"
207 : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
208 0x4FFFF,
209 ],
210 [ "non-character code point U+5FFFE",
211 (isASCII)
212 ? "\xf1\x9f\xbf\xbe"
213 : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
214 0x5FFFE,
215 ],
216 [ "non-character code point U+5FFFF",
217 (isASCII)
218 ? "\xf1\x9f\xbf\xbf"
219 : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
220 0x5FFFF,
221 ],
222 [ "non-character code point U+6FFFE",
223 (isASCII)
224 ? "\xf1\xaf\xbf\xbe"
225 : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
226 0x6FFFE,
227 ],
228 [ "non-character code point U+6FFFF",
229 (isASCII)
230 ? "\xf1\xaf\xbf\xbf"
231 : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
232 0x6FFFF,
233 ],
234 [ "non-character code point U+7FFFE",
235 (isASCII)
236 ? "\xf1\xbf\xbf\xbe"
237 : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
238 0x7FFFE,
239 ],
240 [ "non-character code point U+7FFFF",
241 (isASCII)
242 ? "\xf1\xbf\xbf\xbf"
243 : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
244 0x7FFFF,
245 ],
246 [ "non-character code point U+8FFFE",
247 (isASCII)
248 ? "\xf2\x8f\xbf\xbe"
249 : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
250 0x8FFFE,
251 ],
252 [ "non-character code point U+8FFFF",
253 (isASCII)
254 ? "\xf2\x8f\xbf\xbf"
255 : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
256 0x8FFFF,
257 ],
258 [ "non-character code point U+9FFFE",
259 (isASCII)
260 ? "\xf2\x9f\xbf\xbe"
261 : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
262 0x9FFFE,
263 ],
264 [ "non-character code point U+9FFFF",
265 (isASCII)
266 ? "\xf2\x9f\xbf\xbf"
267 : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
268 0x9FFFF,
269 ],
270 [ "non-character code point U+AFFFE",
271 (isASCII)
272 ? "\xf2\xaf\xbf\xbe"
273 : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
274 0xAFFFE,
275 ],
276 [ "non-character code point U+AFFFF",
277 (isASCII)
278 ? "\xf2\xaf\xbf\xbf"
279 : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
280 0xAFFFF,
281 ],
282 [ "non-character code point U+BFFFE",
283 (isASCII)
284 ? "\xf2\xbf\xbf\xbe"
285 : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
286 0xBFFFE,
287 ],
288 [ "non-character code point U+BFFFF",
289 (isASCII)
290 ? "\xf2\xbf\xbf\xbf"
291 : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
292 0xBFFFF,
293 ],
294 [ "non-character code point U+CFFFE",
295 (isASCII)
296 ? "\xf3\x8f\xbf\xbe"
297 : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
298 0xCFFFE,
299 ],
300 [ "non-character code point U+CFFFF",
301 (isASCII)
302 ? "\xf3\x8f\xbf\xbf"
303 : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
304 0xCFFFF,
305 ],
306 [ "non-character code point U+DFFFE",
307 (isASCII)
308 ? "\xf3\x9f\xbf\xbe"
309 : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
310 0xDFFFE,
311 ],
312 [ "non-character code point U+DFFFF",
313 (isASCII)
314 ? "\xf3\x9f\xbf\xbf"
315 : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
316 0xDFFFF,
317 ],
318 [ "non-character code point U+EFFFE",
319 (isASCII)
320 ? "\xf3\xaf\xbf\xbe"
321 : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
322 0xEFFFE,
323 ],
324 [ "non-character code point U+EFFFF",
325 (isASCII)
326 ? "\xf3\xaf\xbf\xbf"
327 : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
328 0xEFFFF,
329 ],
330 [ "non-character code point U+FFFFE",
331 (isASCII)
332 ? "\xf3\xbf\xbf\xbe"
333 : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
334 0xFFFFE,
335 ],
336 [ "non-character code point U+FFFFF",
337 (isASCII)
338 ? "\xf3\xbf\xbf\xbf"
339 : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
340 0xFFFFF,
341 ],
342 [ "non-character code point U+10FFFE",
343 (isASCII)
344 ? "\xf4\x8f\xbf\xbe"
345 : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
346 0x10FFFE,
347 ],
348 [ "non-character code point U+10FFFF",
349 (isASCII)
350 ? "\xf4\x8f\xbf\xbf"
351 : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
352 0x10FFFF,
353 ],
7c9f4ec6
KW
354 [ "first non_unicode",
355 (isASCII)
356 ? "\xf4\x90\x80\x80"
357 : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
358 0x110000,
359 2,
360 ],
361 [ "non_unicode whose first byte tells that",
362 (isASCII)
363 ? "\xf5\x80\x80\x80"
364 : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
365 (isASCII) ? 0x140000 : 0x200000,
366 1,
367 ],
3f055917
KW
368 [ "overlong malformation, lowest 5-byte",
369 (isASCII)
370 ? "\xf8\x80\x80\x80\x80"
371 : I8_to_native("\xf8\xa0\xa0\xa0\xa0"),
372 0, # NUL
373 ],
374 [ "overlong malformation, highest 5-byte",
375 (isASCII)
376 ? "\xf8\x87\xbf\xbf\xbf"
377 : I8_to_native("\xf8\xa7\xbf\xbf\xbf"),
378 (isASCII) ? 0x1FFFFF : 0x3FFFF,
379 ],
380 [ "overlong malformation, lowest 6-byte",
381 (isASCII)
382 ? "\xfc\x80\x80\x80\x80\x80"
383 : I8_to_native("\xfc\xa0\xa0\xa0\xa0\xa0"),
384 0, # NUL
385 ],
386 [ "overlong malformation, highest 6-byte",
387 (isASCII)
388 ? "\xfc\x83\xbf\xbf\xbf\xbf"
389 : I8_to_native("\xfc\xa3\xbf\xbf\xbf\xbf"),
390 (isASCII) ? 0x3FFFFFF : 0x3FFFFF,
391 ],
392 [ "overlong malformation, lowest 7-byte",
393 (isASCII)
394 ? "\xfe\x80\x80\x80\x80\x80\x80"
395 : I8_to_native("\xfe\xa0\xa0\xa0\xa0\xa0\xa0"),
396 0, # NUL
397 ],
398 [ "overlong malformation, highest 7-byte",
399 (isASCII)
400 ? "\xfe\x81\xbf\xbf\xbf\xbf\xbf"
401 : I8_to_native("\xfe\xa1\xbf\xbf\xbf\xbf\xbf"),
402 (isASCII) ? 0x7FFFFFFF : 0x3FFFFFF,
403 ],
d22ec717 404 [ "highest 31 bit code point",
8f79178b 405 (isASCII)
d22ec717 406 ? "\xfd\xbf\xbf\xbf\xbf\xbf"
8f79178b 407 : I8_to_native(
d22ec717
KW
408 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf"),
409 0x7FFFFFFF,
410 1,
8f79178b 411 ],
d22ec717 412 [ "lowest 32 bit code point",
f4da64d4 413 (isASCII)
d22ec717 414 ? "\xfe\x82\x80\x80\x80\x80\x80"
f4da64d4 415 : I8_to_native(
d22ec717
KW
416 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
417 ($::is64bit) ? 0x80000000 : -1, # Overflows on 32-bit systems
418 1,
f4da64d4 419 ],
d22ec717
KW
420 # Used when UV_MAX is allowed as a code point
421 #[ "highest 32 bit code point",
422 # (isASCII)
423 # ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
424 # : I8_to_native(
425 # "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
426 # 0xFFFFFFFF,
427 #],
428 #[ "Lowest 33 bit code point",
429 # (isASCII)
430 # ? "\xfe\x84\x80\x80\x80\x80\x80"
431 # : I8_to_native(
432 # "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
433 # ($::is64bit) ? 0x100000000 : 0x0, # Overflows on 32-bit systems
434 #],
8f79178b
KW
435 );
436
437 if (! $::is64bit) {
438 if (isASCII) {
8f79178b 439 push @tests,
cf8a8202 440 [ "overlong malformation, but naively looks like overflow",
d22ec717
KW
441 "\xff\x80\x80\x80\x80\x80\x80\x81\xbf\xbf\xbf\xbf\xbf",
442 0x7FFFFFFF,
cf8a8202 443 ],
d22ec717
KW
444 # Used when above IV_MAX are allowed.
445 #[ "overlong malformation, but naively looks like overflow",
446 # "\xff\x80\x80\x80\x80\x80\x80\x83\xbf\xbf\xbf\xbf\xbf",
447 # 0xFFFFFFFF,
448 #],
c81d4d83
KW
449 [ "overflow that old algorithm failed to detect",
450 "\xfe\x86\x80\x80\x80\x80\x80",
451 -1,
8f79178b
KW
452 ];
453 }
454 }
c81d4d83 455
3f055917
KW
456 push @tests,
457 [ "overlong malformation, lowest max-byte",
458 (isASCII)
459 ? "\xff\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
460 : I8_to_native(
461 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
462 0, # NUL
463 ],
464 [ "overlong malformation, highest max-byte",
465 (isASCII) # 2**36-1 on ASCII; 2**30-1 on EBCDIC
466 ? "\xff\x80\x80\x80\x80\x80\x80\xbf\xbf\xbf\xbf\xbf\xbf"
467 : I8_to_native(
468 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xbf\xbf\xbf\xbf\xbf\xbf"),
469 (isASCII) ? (($::is64bit) ? 0xFFFFFFFFF : -1) : 0x3FFFFFFF,
470 ];
471
45d8ef8c
KW
472 if (isASCII) {
473 push @tests,
474 [ "Lowest code point requiring 13 bytes to represent", # 2**36
475 "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
476 ($::is64bit) ? 0x1000000000 : -1, # overflows on 32bit
477 ],
478 };
479
c81d4d83 480 if ($::is64bit) {
ba627a0b 481 push @tests,
d22ec717 482 [ "highest 63 bit code point",
ba627a0b 483 (isASCII)
d22ec717 484 ? "\xff\x80\x87\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
ba627a0b 485 : I8_to_native(
d22ec717
KW
486 "\xff\xa7\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"),
487 0x7FFFFFFFFFFFFFFF,
ba627a0b
KW
488 (isASCII) ? 1 : 2,
489 ],
d22ec717 490 [ "first 64 bit code point",
ba627a0b 491 (isASCII)
d22ec717 492 ? "\xff\x80\x88\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
ba627a0b 493 : I8_to_native(
d22ec717 494 "\xff\xa8\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
ba627a0b 495 -1,
ba627a0b 496 ];
d22ec717
KW
497 # Used when UV_MAX is allowed as a code point
498 #[ "highest 64 bit code point",
499 # (isASCII)
500 # ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
501 # : I8_to_native(
502 # "\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"),
503 # 0xFFFFFFFFFFFFFFFF,
504 # (isASCII) ? 1 : 2,
505 #],
506 #[ "first 65 bit code point",
507 # (isASCII)
508 # ? "\xff\x80\x9f\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
509 # : I8_to_native(
510 # "\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
511 # 0,
512 #];
c81d4d83
KW
513 if (isASCII) {
514 push @tests,
515 [ "overflow that old algorithm failed to detect",
516 "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
517 -1,
c81d4d83
KW
518 ];
519 }
520 else {
8f79178b
KW
521 push @tests, # These could falsely show wrongly in a naive
522 # implementation
523 [ "requires at least 32 bits",
524 I8_to_native(
6aa905cf 525 "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
8f79178b 526 0x800000000,
5064495b 527 40000000
8f79178b
KW
528 ],
529 [ "requires at least 32 bits",
530 I8_to_native(
6aa905cf 531 "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
8f79178b 532 0x10000000000,
8f79178b
KW
533 ],
534 [ "requires at least 32 bits",
535 I8_to_native(
6aa905cf 536 "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
8f79178b 537 0x200000000000,
8f79178b
KW
538 ],
539 [ "requires at least 32 bits",
540 I8_to_native(
6aa905cf 541 "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
8f79178b 542 0x4000000000000,
8f79178b
KW
543 ],
544 [ "requires at least 32 bits",
545 I8_to_native(
6aa905cf 546 "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
8f79178b 547 0x80000000000000,
8f79178b
KW
548 ],
549 [ "requires at least 32 bits",
550 I8_to_native(
6aa905cf 551 "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
8f79178b 552 0x1000000000000000,
8f79178b
KW
553 ];
554 }
6aa905cf 555 }
c7f9e4bb 556}
6aa905cf 557
5722c46d
KW
558sub flags_to_text($$)
559{
560 my ($flags, $flags_to_text_ref) = @_;
561
562 # Returns a string containing a mnemonic representation of the bits that
563 # are set in the $flags. These are assumed to be flag bits. The return
564 # looks like "FOO|BAR|BAZ". The second parameter is a reference to an
565 # array that gives the textual representation of all the possible flags.
566 # Element 0 is the text for the bit 0 flag; element 1 for bit 1; .... If
567 # no bits at all are set the string "0" is returned;
568
569 my @flag_text;
570 my $shift = 0;
571
572 return "0" if $flags == 0;
573
574 while ($flags) {
575 #diag sprintf "%x", $flags;
576 if ($flags & 1) {
577 push @flag_text, $flags_to_text_ref->[$shift];
578 }
579 $shift++;
580 $flags >>= 1;
581 }
582
583 return join "|", @flag_text;
584}
585
586# Possible flag returns from utf8n_to_uvchr_error(). These should have G_,
587# instead of A_, D_, but the prefixes will be used in a a later commit, so
588# minimize churn by having them here.
589my @utf8n_flags_to_text = ( qw(
590 A_EMPTY
591 A_CONTINUATION
592 A_NON_CONTINUATION
593 A_SHORT
594 A_LONG
595 A_LONG_AND_ITS_VALUE
596 PLACEHOLDER
597 A_OVERFLOW
598 D_SURROGATE
599 W_SURROGATE
600 D_NONCHAR
601 W_NONCHAR
602 D_SUPER
603 W_SUPER
d044b7a7
KW
604 D_PERL_EXTENDED
605 W_PERL_EXTENDED
5722c46d
KW
606 CHECK_ONLY
607 NO_CONFIDENCE_IN_CURLEN_
608 ) );
609
a8ee5133
KW
610sub utf8n_display_call($)
611{
612 # Converts an eval string that calls test_utf8n_to_uvchr into a more human
613 # readable form, and returns it. Doesn't work if the byte string contains
614 # an apostrophe. The return will look something like:
615 # test_utf8n_to_uvchr_error('$bytes', $length, $flags)
616 #diag $_[0];
617
618 $_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x;
619 my $text1 = $1; # Everything before the byte string
620 my $bytes = $2;
621 my $text2 = $3; # Includes the length
622 my $flags = $4;
623
624 return $text1
625 . display_bytes($bytes)
626 . $text2
627 . flags_to_text($flags, \@utf8n_flags_to_text)
628 . ')';
629}
630
b8e5e8ad
KW
631my @uvchr_flags_to_text = ( qw(
632 W_SURROGATE
633 W_NONCHAR
634 W_SUPER
635 W_PERL_EXTENDED
636 D_SURROGATE
637 D_NONCHAR
638 D_SUPER
639 D_PERL_EXTENDED
640) );
641
d884ea32
KW
642sub uvchr_display_call($)
643{
644 # Converts an eval string that calls test_uvchr_to_utf8 into a more human
645 # readable form, and returns it. The return will look something like:
646 # test_uvchr_to_utf8n_flags($uv, $flags)
647 #diag $_[0];
648
d884ea32
KW
649
650 $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x;
651 my $text = $1;
652 my $cp = sprintf "%X", $2;
653 my $flags = $3;
654
b8e5e8ad 655 return "${text}0x$cp, " . flags_to_text($flags, \@uvchr_flags_to_text) . ')';
d884ea32
KW
656}
657
69485e19
KW
658sub do_warnings_test(@)
659{
660 my @expected_warnings = @_;
661
662 # Compares the input expected warnings array with @warnings_gotten,
663 # generating a pass for each found, removing it from @warnings_gotten.
664 # Any discrepancies generate test failures. Returns TRUE if no
665 # discrepcancies; otherwise FALSE.
666
667 my $succeeded = 1;
668
669 if (@expected_warnings == 0) {
670 if (! is(@warnings_gotten, 0, " Expected and got no warnings")) {
671 output_warnings(@warnings_gotten);
672 $succeeded = 0;
673 }
674 return $succeeded;
675 }
676
677 # Check that we got all the expected warnings,
678 # removing each one found
679 WARNING:
680 foreach my $expected (@expected_warnings) {
681 foreach (my $i = 0; $i < @warnings_gotten; $i++) {
682 if ($warnings_gotten[$i] =~ $expected) {
683 pass(" Expected and got warning: "
684 . " $warnings_gotten[$i]");
685 splice @warnings_gotten, $i, 1;
686 next WARNING;
687 }
688 }
689 fail(" Expected a warning that matches "
690 . $expected . " but didn't get it");
691 $succeeded = 0;
692 }
693
694 if (! is(@warnings_gotten, 0, " Got no unexpected warnings")) {
695 output_warnings(@warnings_gotten);
696 $succeeded = 0;
697 }
698
699 return $succeeded;
700}
701
6aa905cf
KW
702# This test is split into this number of files.
703my $num_test_files = $ENV{TEST_JOBS} || 1;
704$num_test_files = 10 if $num_test_files > 10;
705
37657a5b
KW
706# We only really need to test utf8n_to_uvchr_msgs() once with this flag.
707my $tested_CHECK_ONLY = 0;
708
6aa905cf
KW
709my $test_count = -1;
710foreach my $test (@tests) {
e0803729
KW
711 $test_count++;
712 next if $test_count % $num_test_files != $::TEST_CHUNK;
713
714 my ($testname, $bytes, $allowed_uv, $needed_to_discern_len) = @$test;
715
716 my $length = length $bytes;
717 my $initially_overlong = $testname =~ /overlong/;
718 my $initially_orphan = $testname =~ /orphan/;
719 my $will_overflow = $allowed_uv < 0;
720
721 my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv);
722 my $display_bytes = display_bytes($bytes);
723
724 my $controlling_warning_category;
725 my $utf8n_flag_to_warn;
726 my $utf8n_flag_to_disallow;
727 my $uvchr_flag_to_warn;
728 my $uvchr_flag_to_disallow;
729
730 # We want to test that the independent flags are actually independent.
731 # For example, that a surrogate doesn't trigger a non-character warning,
732 # and conversely, turning off an above-Unicode flag doesn't suppress a
733 # surrogate warning. Earlier versions of this file used nested loops to
734 # test all possible combinations. But that creates lots of tests, making
735 # this run too long. What is now done instead is to use the complement of
736 # the category we are testing to greatly reduce the combinatorial
737 # explosion. For example, if we have a surrogate and we aren't expecting
738 # a warning about it, we set all the flags for non-surrogates to raise
739 # warnings. If one shows up, it indicates the flags aren't independent.
740 my $utf8n_flag_to_warn_complement;
741 my $utf8n_flag_to_disallow_complement;
742 my $uvchr_flag_to_warn_complement;
743 my $uvchr_flag_to_disallow_complement;
744
745 # Many of the code points being tested are middling in that if code point
746 # edge cases work, these are very likely to as well. Because this test
747 # file takes a while to execute, we skip testing the edge effects of code
748 # points deemed middling, while testing their basics and continuing to
749 # fully test the non-middling code points.
750 my $skip_most_tests = 0;
751
752 my $cp_message_qr; # Pattern that matches the message raised when
753 # that message contains the problematic code
754 # point. The message is the same (currently) both
755 # when going from/to utf8.
756 my $non_cp_trailing_text; # The suffix text when the message doesn't
757 # contain a code point. (This is a result of
758 # some sort of malformation that means we
759 # can't get an exact code poin
760 my $extended_cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
761 \Q requires a Perl extension, and so is not\E
762 \Q portable\E/x;
763 my $extended_non_cp_trailing_text
764 = "is a Perl extension, and so is not portable";
765
766 # What bytes should have been used to specify a code point that has been
767 # specified as an overlong.
768 my $correct_bytes_for_overlong;
769
770 # Is this test malformed from the beginning? If so, we know to generally
771 # expect that the tests will show it isn't valid.
772 my $initially_malformed = 0;
773
774 if ($initially_overlong || $initially_orphan) {
775 $non_cp_trailing_text = "if you see this, there is an error";
776 $cp_message_qr = qr/\Q$non_cp_trailing_text\E/;
777 $initially_malformed = 1;
778 $utf8n_flag_to_warn = 0;
779 $utf8n_flag_to_disallow = 0;
780
781 $utf8n_flag_to_warn_complement = $::UTF8_WARN_SURROGATE;
782 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE;
783 if (! $will_overflow && $allowed_uv <= 0x10FFFF) {
784 $utf8n_flag_to_warn_complement |= $::UTF8_WARN_SUPER;
785 $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_SUPER;
786 if (($allowed_uv & 0xFFFF) != 0xFFFF) {
787 $utf8n_flag_to_warn_complement |= $::UTF8_WARN_NONCHAR;
788 $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_NONCHAR;
789 }
790 }
791 if (! is_extended_utf8($bytes)) {
792 $utf8n_flag_to_warn_complement |= $::UTF8_WARN_PERL_EXTENDED;
793 $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_PERL_EXTENDED;
794 }
2282dfc4 795
e0803729 796 $controlling_warning_category = 'utf8';
2282dfc4 797
e0803729
KW
798 if ($initially_overlong) {
799 if (! defined $needed_to_discern_len) {
800 $needed_to_discern_len = overlong_discern_len($bytes);
801 }
802 $correct_bytes_for_overlong = display_bytes_no_quotes(chr $allowed_uv);
803 }
804 }
805 elsif($will_overflow || $allowed_uv > 0x10FFFF) {
806
807 # Set the SUPER flags; later, we test for PERL_EXTENDED as well.
808 $utf8n_flag_to_warn = $::UTF8_WARN_SUPER;
809 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER;
810 $uvchr_flag_to_warn = $::UNICODE_WARN_SUPER;
811 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;;
812
813 # Below, we add the flags for non-perl_extended to the code points
814 # that don't fit that category. Special tests are done for this
815 # category in the inner loop.
816 $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR
817 |$::UTF8_WARN_SURROGATE;
818 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
819 |$::UTF8_DISALLOW_SURROGATE;
820 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR
821 |$::UNICODE_WARN_SURROGATE;
822 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
823 |$::UNICODE_DISALLOW_SURROGATE;
824 $controlling_warning_category = 'non_unicode';
825
826 if ($will_overflow) { # This is realy a malformation
827 $non_cp_trailing_text = "if you see this, there is an error";
828 $cp_message_qr = qr/\Q$non_cp_trailing_text\E/;
829 $initially_malformed = 1;
830 if (! defined $needed_to_discern_len) {
831 $needed_to_discern_len = overflow_discern_len($length);
832 }
833 }
834 elsif (requires_extended_utf8($allowed_uv)) {
835 $cp_message_qr = $extended_cp_message_qr;
836 $non_cp_trailing_text = $extended_non_cp_trailing_text;
837 $needed_to_discern_len = 1 unless defined $needed_to_discern_len;
838 }
839 else {
840 $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
841 \Q may not be portable\E/x;
842 $non_cp_trailing_text = "is for a non-Unicode code point, may not"
843 . " be portable";
844 $utf8n_flag_to_warn_complement |= $::UTF8_WARN_PERL_EXTENDED;
845 $utf8n_flag_to_disallow_complement
846 |= $::UTF8_DISALLOW_PERL_EXTENDED;
847 $uvchr_flag_to_warn_complement |= $::UNICODE_WARN_PERL_EXTENDED;
848 $uvchr_flag_to_disallow_complement
849 |= $::UNICODE_DISALLOW_PERL_EXTENDED;
850 }
851 }
852 elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
853 $cp_message_qr = qr/UTF-16 surrogate U\+$uv_string/;
854 $non_cp_trailing_text = "is for a surrogate";
855 $needed_to_discern_len = 2 unless defined $needed_to_discern_len;
856 $skip_most_tests = 1 if $allowed_uv > 0xD800 && $allowed_uv < 0xDFFF;
857
858 $utf8n_flag_to_warn = $::UTF8_WARN_SURROGATE;
859 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SURROGATE;
860 $uvchr_flag_to_warn = $::UNICODE_WARN_SURROGATE;
861 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SURROGATE;;
862
863 $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR
864 |$::UTF8_WARN_SUPER
865 |$::UTF8_WARN_PERL_EXTENDED;
866 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
867 |$::UTF8_DISALLOW_SUPER
868 |$::UTF8_DISALLOW_PERL_EXTENDED;
869 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR
870 |$::UNICODE_WARN_SUPER
871 |$::UNICODE_WARN_PERL_EXTENDED;
872 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
873 |$::UNICODE_DISALLOW_SUPER
874 |$::UNICODE_DISALLOW_PERL_EXTENDED;
875 $controlling_warning_category = 'surrogate';
876 }
877 elsif ( ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
878 || ($allowed_uv & 0xFFFE) == 0xFFFE)
879 {
880 $cp_message_qr = qr/\QUnicode non-character U+$uv_string\E
881 \Q is not recommended for open interchange\E/x;
882 $non_cp_trailing_text = "if you see this, there is an error";
883 $needed_to_discern_len = $length unless defined $needed_to_discern_len;
884 if ( ($allowed_uv > 0xFDD0 && $allowed_uv < 0xFDEF)
885 || ($allowed_uv > 0xFFFF && $allowed_uv < 0x10FFFE))
886 {
887 $skip_most_tests = 1;
888 }
4816e15f 889
e0803729
KW
890 $utf8n_flag_to_warn = $::UTF8_WARN_NONCHAR;
891 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_NONCHAR;
892 $uvchr_flag_to_warn = $::UNICODE_WARN_NONCHAR;
893 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_NONCHAR;;
894
895 $utf8n_flag_to_warn_complement = $::UTF8_WARN_SURROGATE
896 |$::UTF8_WARN_SUPER
897 |$::UTF8_WARN_PERL_EXTENDED;
898 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE
899 |$::UTF8_DISALLOW_SUPER
900 |$::UTF8_DISALLOW_PERL_EXTENDED;
901 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_SURROGATE
902 |$::UNICODE_WARN_SUPER
903 |$::UNICODE_WARN_PERL_EXTENDED;
904 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_SURROGATE
905 |$::UNICODE_DISALLOW_SUPER
906 |$::UNICODE_DISALLOW_PERL_EXTENDED;
907
908 $controlling_warning_category = 'nonchar';
909 }
910 else {
911 die "Can't figure out what type of warning to test for $testname"
912 }
913
914 die 'Didn\'t set $needed_to_discern_len for ' . $testname
915 unless defined $needed_to_discern_len;
916
917 # We try various combinations of malformations that can occur
918 foreach my $short (0, 1) {
919 next if $skip_most_tests && $short;
920 foreach my $unexpected_noncont (0, 1) {
921 next if $skip_most_tests && $unexpected_noncont;
922 foreach my $overlong (0, 1) {
923 next if $overlong && $skip_most_tests;
924 next if $initially_overlong && ! $overlong;
925
926 # If we're creating an overlong, it can't be longer than the
927 # maximum length, so skip if we're already at that length.
928 next if (! $initially_overlong && $overlong)
929 && $length >= $::max_bytes;
930
931 my $this_cp_message_qr = $cp_message_qr;
932 my $this_non_cp_trailing_text = $non_cp_trailing_text;
933
934 foreach my $malformed_allow_type (0..2) {
935 # 0 don't allow this malformation; ignored if no malformation
936 # 1 allow, with REPLACEMENT CHARACTER returned
937 # 2 allow, with intended code point returned. All malformations
938 # other than overlong can't determine the intended code point,
939 # so this isn't valid for them.
940 next if $malformed_allow_type == 2
941 && ($will_overflow || $short || $unexpected_noncont);
942 next if $skip_most_tests && $malformed_allow_type;
943
944 # Here we are in the innermost loop for malformations. So we
945 # know which ones are in effect. Can now change the input to be
946 # appropriately malformed. We also can set up certain other
947 # things now, like whether we expect a return flag from this
948 # malformation, and which flag.
949
950 my $this_bytes = $bytes;
951 my $this_length = $length;
952 my $this_expected_len = $length;
953 my $this_needed_to_discern_len = $needed_to_discern_len;
954
955 my @malformation_names;
956 my @expected_malformation_warnings;
957 my @expected_malformation_return_flags;
958
959 # Contains the flags for any allowed malformations. Currently no
960 # combinations of on/off are tested for. It's either all are
961 # allowed, or none are.
962 my $allow_flags = 0;
963 my $overlong_is_in_perl_extended_utf8 = 0;
964 my $dont_use_overlong_cp = 0;
965
966 if ($initially_orphan) {
967 next if $overlong || $short || $unexpected_noncont;
968 }
717dd9f9 969
e0803729
KW
970 if ($overlong) {
971 if (! $initially_overlong) {
972 my $new_expected_len;
973
974 # To force this malformation, we convert the original start
975 # byte into a continuation byte with the same data bits as
976 # originally. ...
977 my $start_byte = substr($this_bytes, 0, 1);
978 my $converted_to_continuation_byte
979 = start_byte_to_cont($start_byte);
980
981 # ... Then we prepend it with a known overlong sequence.
982 # This should evaluate to the exact same code point as the
983 # original. We try to avoid an overlong using Perl
984 # extended UTF-8. The code points are the highest
985 # representable as overlongs on the respective platform
986 # without using extended UTF-8.
987 if (native_to_I8($start_byte) lt "\xFC") {
988 $start_byte = I8_to_native("\xFC");
989 $new_expected_len = 6;
990 }
991 elsif (! isASCII && native_to_I8($start_byte) lt "\xFE") {
992
993 # FE is not extended UTF-8 on EBCDIC
994 $start_byte = I8_to_native("\xFE");
995 $new_expected_len = 7;
996 }
997 else { # Must use extended UTF-8. On ASCII platforms, we
998 # could express some overlongs here starting with
999 # \xFE, but there's no real reason to do so.
1000 $overlong_is_in_perl_extended_utf8 = 1;
1001 $start_byte = I8_to_native("\xFF");
1002 $new_expected_len = $::max_bytes;
1003 $this_cp_message_qr = $extended_cp_message_qr;
1004
1005 # The warning that gets raised doesn't include the
1006 # code point in the message if the code point can be
1007 # expressed without using extended UTF-8, but the
1008 # particular overlong sequence used is in extended
1009 # UTF-8. To do otherwise would be confusing to the
1010 # user, as it would claim the code point requires
1011 # extended, when it doesn't.
1012 $dont_use_overlong_cp = 1
1013 unless requires_extended_utf8($allowed_uv);
1014 $this_non_cp_trailing_text
1015 = $extended_non_cp_trailing_text;
1016 }
1017
1018 # Splice in the revise continuation byte, preceded by the
1019 # start byte and the proper number of the lowest
1020 # continuation bytes.
1021 $this_bytes = $start_byte
1022 . ($native_lowest_continuation_chr
1023 x ( $new_expected_len
1024 - 1
1025 - length($this_bytes)))
1026 . $converted_to_continuation_byte
1027 . substr($this_bytes, 1);
1028 $this_length = length($this_bytes);
1029 $this_needed_to_discern_len = $new_expected_len
1030 - ( $this_expected_len
1031 - $this_needed_to_discern_len);
1032 $this_expected_len = $new_expected_len;
1033 }
1034 }
4816e15f 1035
e0803729 1036 if ($short) {
4816e15f 1037
e0803729
KW
1038 # To force this malformation, just tell the test to not look
1039 # as far as it should into the input.
1040 $this_length--;
1041 $this_expected_len--;
4816e15f 1042
e0803729
KW
1043 $allow_flags |= $::UTF8_ALLOW_SHORT if $malformed_allow_type;
1044 }
1d21b5e7 1045
e0803729 1046 if ($unexpected_noncont) {
4816e15f 1047
e0803729
KW
1048 # To force this malformation, change the final continuation
1049 # byte into a start byte.
1050 my $pos = ($short) ? -2 : -1;
1051 substr($this_bytes, $pos, 1) = $known_start_byte;
1052 $this_expected_len--;
1053 }
153bcbd6 1054
e0803729
KW
1055 # The whole point of a test that is malformed from the beginning
1056 # is to test for that malformation. If we've modified things so
1057 # much that we don't have enough information to detect that
1058 # malformation, there's no point in testing.
1059 next if $initially_malformed
1060 && $this_expected_len < $this_needed_to_discern_len;
1061
1062 # Here, we've transformed the input with all of the desired
1063 # non-overflow malformations. We are now in a position to
1064 # construct any potential warnings for those malformations. But
1065 # it's a pain to get the detailed messages exactly right, so for
1066 # now XXX, only do so for those that return an explicit code
1067 # point.
1068
1069 if ($initially_orphan) {
1070 push @malformation_names, "orphan continuation";
1071 push @expected_malformation_return_flags,
1072 $::UTF8_GOT_CONTINUATION;
1073 $allow_flags |= $::UTF8_ALLOW_CONTINUATION
1074 if $malformed_allow_type;
1075 push @expected_malformation_warnings, qr/unexpected continuation/;
1076 }
4816e15f 1077
e0803729
KW
1078 if ($overlong) {
1079 push @malformation_names, 'overlong';
1080 push @expected_malformation_return_flags, $::UTF8_GOT_LONG;
4816e15f 1081
e0803729
KW
1082 # If one of the other malformation types is also in effect, we
1083 # don't know what the intended code point was.
1084 if ($short || $unexpected_noncont || $will_overflow) {
1085 push @expected_malformation_warnings, qr/overlong/;
1086 }
1087 else {
1088 my $wrong_bytes = display_bytes_no_quotes(
1089 substr($this_bytes, 0, $this_length));
1090 if (! defined $correct_bytes_for_overlong) {
1091 $correct_bytes_for_overlong
1092 = display_bytes_no_quotes($bytes);
1093 }
1094 my $prefix = ( $allowed_uv > 0x10FFFF
1095 || ! isASCII && $allowed_uv < 256)
1096 ? "0x"
1097 : "U+";
1098 push @expected_malformation_warnings,
1099 qr/\QMalformed UTF-8 character: $wrong_bytes\E
1100 \Q (overlong; instead use\E
1101 \Q $correct_bytes_for_overlong to\E
1102 \Q represent $prefix$uv_string)/x;
1103 }
4816e15f 1104
e0803729
KW
1105 if ($malformed_allow_type == 2) {
1106 $allow_flags |= $::UTF8_ALLOW_LONG_AND_ITS_VALUE;
1107 }
1108 elsif ($malformed_allow_type) {
1109 $allow_flags |= $::UTF8_ALLOW_LONG;
1110 }
1111 }
1112 if ($short) {
1113 push @malformation_names, 'short';
1114 push @expected_malformation_return_flags, $::UTF8_GOT_SHORT;
1115 push @expected_malformation_warnings, qr/too short/;
1116 }
1117 if ($unexpected_noncont) {
1118 push @malformation_names, 'unexpected non-continuation';
1119 push @expected_malformation_return_flags,
1120 $::UTF8_GOT_NON_CONTINUATION;
1121 $allow_flags |= $::UTF8_ALLOW_NON_CONTINUATION
1122 if $malformed_allow_type;
1123 push @expected_malformation_warnings,
1124 qr/unexpected non-continuation byte/;
1125 }
4816e15f 1126
e0803729
KW
1127 # The overflow malformation is done differently than other
1128 # malformations. It comes from manually typed tests in the test
1129 # array. We now make it be treated like one of the other
1130 # malformations. But some has to be deferred until the inner loop
1131 my $overflow_msg_pattern;
1132 if ($will_overflow) {
1133 push @malformation_names, 'overflow';
1134
1135 $overflow_msg_pattern = display_bytes_no_quotes(
1136 substr($this_bytes, 0, $this_expected_len));
1137 $overflow_msg_pattern = qr/\QMalformed UTF-8 character:\E
1138 \Q $overflow_msg_pattern\E
1139 \Q (overflows)\E/x;
1140 push @expected_malformation_return_flags, $::UTF8_GOT_OVERFLOW;
1141 $allow_flags |= $::UTF8_ALLOW_OVERFLOW if $malformed_allow_type;
1142 }
8e0e76af 1143
e0803729
KW
1144 # And we can create the malformation-related text for the the test
1145 # names we eventually will generate.
1146 my $malformations_name = "";
1147 if (@malformation_names) {
1148 $malformations_name .= "dis" unless $malformed_allow_type;
1149 $malformations_name .= "allowed ";
1150 $malformations_name .= "malformation";
1151 $malformations_name .= "s" if @malformation_names > 1;
1152 $malformations_name .= ": ";
1153 $malformations_name .= join "/", @malformation_names;
1154 $malformations_name = " ($malformations_name)";
1155 }
8e0e76af 1156
e0803729 1157 # Done setting up the malformation related stuff
8e0e76af 1158
e0803729
KW
1159 { # First test the isFOO calls
1160 use warnings; # XXX no warnings 'deprecated'; # Make sure these don't raise warnings
1161 undef @warnings_gotten;
8e0e76af 1162
e0803729
KW
1163 my $ret = test_isUTF8_CHAR($this_bytes, $this_length);
1164 my $ret_flags
1165 = test_isUTF8_CHAR_flags($this_bytes, $this_length, 0);
1166 if ($malformations_name) {
1167 is($ret, 0, "For $testname$malformations_name: isUTF8_CHAR() returns 0");
1168 is($ret_flags, 0, " And isUTF8_CHAR_flags() returns 0");
1169 }
1170 else {
1171 is($ret, $this_length, "For $testname: isUTF8_CHAR() returns"
1172 . " expected length: $this_length");
1173 is($ret_flags, $this_length,
1174 " And isUTF8_CHAR_flags(...,0) returns expected"
1175 . " length: $this_length");
1176 }
1177 is(scalar @warnings_gotten, 0,
1178 " And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags"
1179 . " generated any warnings")
1180 or output_warnings(@warnings_gotten);
1181
1182 undef @warnings_gotten;
1183 $ret = test_isSTRICT_UTF8_CHAR($this_bytes, $this_length);
1184 if ($malformations_name) {
1185 is($ret, 0, " And isSTRICT_UTF8_CHAR() returns 0");
1186 }
1187 else {
1188 my $expected_ret
1189 = ( $testname =~ /surrogate|non-character/
1190 || $allowed_uv > 0x10FFFF)
1191 ? 0
1192 : $this_length;
1193 is($ret, $expected_ret,
1194 " And isSTRICT_UTF8_CHAR() returns expected"
1195 . " length: $expected_ret");
1196 $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length,
1197 $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
1198 is($ret, $expected_ret,
1199 " And isUTF8_CHAR_flags('"
1200 . "DISALLOW_ILLEGAL_INTERCHANGE') acts like"
1201 . " isSTRICT_UTF8_CHAR");
1202 }
1203 is(scalar @warnings_gotten, 0,
1204 " And neither isSTRICT_UTF8_CHAR() nor"
1205 . " isUTF8_CHAR_flags generated any warnings")
1206 or output_warnings(@warnings_gotten);
1207
1208 undef @warnings_gotten;
1209 $ret = test_isC9_STRICT_UTF8_CHAR($this_bytes, $this_length);
1210 if ($malformations_name) {
1211 is($ret, 0, " And isC9_STRICT_UTF8_CHAR() returns 0");
1212 }
1213 else {
1214 my $expected_ret = ( $testname =~ /surrogate/
1215 || $allowed_uv > 0x10FFFF)
1216 ? 0
1217 : $this_expected_len;
1218 is($ret, $expected_ret, " And isC9_STRICT_UTF8_CHAR()"
1219 . " returns expected length:"
1220 . " $expected_ret");
1221 $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length,
1222 $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
1223 is($ret, $expected_ret,
1224 " And isUTF8_CHAR_flags('"
1225 . "DISALLOW_ILLEGAL_C9_INTERCHANGE') acts like"
1226 . " isC9_STRICT_UTF8_CHAR");
1227 }
1228 is(scalar @warnings_gotten, 0,
1229 " And neither isC9_STRICT_UTF8_CHAR() nor"
1230 . " isUTF8_CHAR_flags generated any warnings")
1231 or output_warnings(@warnings_gotten);
1232
1233 foreach my $disallow_type (0..2) {
1234 # 0 is don't disallow this type of code point
1235 # 1 is do disallow
1236 # 2 is do disallow, but only code points requiring
1237 # perl-extended-UTF8
1238
1239 my $disallow_flags;
1240 my $expected_ret;
1241
1242 if ($malformations_name) {
1243
1244 # Malformations are by default disallowed, so testing
1245 # with $disallow_type equal to 0 is sufficicient.
1246 next if $disallow_type;
1247
1248 $disallow_flags = 0;
1249 $expected_ret = 0;
1250 }
1251 elsif ($disallow_type == 1) {
1252 $disallow_flags = $utf8n_flag_to_disallow;
1253 $expected_ret = 0;
1254 }
1255 elsif ($disallow_type == 2) {
1256 next if ! requires_extended_utf8($allowed_uv);
1257 $disallow_flags = $::UTF8_DISALLOW_PERL_EXTENDED;
1258 $expected_ret = 0;
1259 }
1260 else { # type is 0
1261 $disallow_flags = $utf8n_flag_to_disallow_complement;
1262 $expected_ret = $this_length;
1263 }
1264
1265 $ret = test_isUTF8_CHAR_flags($this_bytes, $this_length,
1266 $disallow_flags);
1267 is($ret, $expected_ret,
1268 " And isUTF8_CHAR_flags($display_bytes,"
1269 . " $disallow_flags) returns $expected_ret")
1270 or diag "The flags mean "
1271 . flags_to_text($disallow_flags,
1272 \@utf8n_flags_to_text);
1273 is(scalar @warnings_gotten, 0,
1274 " And isUTF8_CHAR_flags(...) generated"
1275 . " no warnings")
1276 or output_warnings(@warnings_gotten);
1277
1278 # Test partial character handling, for each byte not a
1279 # full character
1280 my $did_test_partial = 0;
1281 for (my $j = 1; $j < $this_length - 1; $j++) {
1282 $did_test_partial = 1;
1283 my $partial = substr($this_bytes, 0, $j);
1284 my $ret_should_be;
1285 my $comment;
1286 if ($disallow_type || $malformations_name) {
1287 $ret_should_be = 0;
1288 $comment = "disallowed";
1289
1290 # The number of bytes required to tell if a
1291 # sequence has something wrong is the smallest of
1292 # all the things wrong with it. We start with the
1293 # number for this type of code point, if that is
1294 # disallowed; or the whole length if not. The
1295 # latter is what a couple of the malformations
1296 # require.
1297 my $needed_to_tell = ($disallow_type)
1298 ? $this_needed_to_discern_len
1299 : $this_expected_len;
1300
1301 # Then we see if the malformations that are
1302 # detectable early in the string are present.
1303 if ($overlong) {
1304 my $dl = overlong_discern_len($this_bytes);
1305 $needed_to_tell = $dl if $dl < $needed_to_tell;
1306 }
1307 if ($will_overflow) {
1308 my $dl = overflow_discern_len($length);
1309 $needed_to_tell = $dl if $dl < $needed_to_tell;
1310 }
8e0e76af 1311
e0803729
KW
1312 if ($j < $needed_to_tell) {
1313 $ret_should_be = 1;
1314 $comment .= ", but need $needed_to_tell"
1315 . " bytes to discern:";
1316 }
1317 }
1318 else {
1319 $ret_should_be = 1;
1320 $comment = "allowed";
1321 }
8e0e76af 1322
e0803729 1323 undef @warnings_gotten;
8e0e76af 1324
e0803729
KW
1325 $ret = test_is_utf8_valid_partial_char_flags($partial,
1326 $j, $disallow_flags);
1327 is($ret, $ret_should_be,
1328 " And is_utf8_valid_partial_char_flags("
1329 . display_bytes($partial)
1330 . ", $disallow_flags), $comment: returns"
1331 . " $ret_should_be")
8e0e76af 1332 or diag "The flags mean "
e0803729
KW
1333 . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
1334 }
1335
1336 if ($did_test_partial) {
1337 is(scalar @warnings_gotten, 0,
1338 " And is_utf8_valid_partial_char_flags()"
1339 . " generated no warnings for any of the lengths")
1340 or output_warnings(@warnings_gotten);
1341 }
1342 }
1343 }
8e0e76af 1344
e0803729
KW
1345 # Now test the to/from UTF-8 calls. There are several orthogonal
1346 # variables involved. We test most possible combinations
8e0e76af 1347
e0803729
KW
1348 foreach my $do_disallow (0, 1) {
1349 if ($do_disallow) {
1350 next if $initially_overlong || $initially_orphan;
1351 }
1352 else {
1353 next if $skip_most_tests;
8e0e76af
KW
1354 }
1355
e0803729
KW
1356 # This tests three functions. utf8n_to_uvchr_error,
1357 # utf8n_to_uvchr_msgs, and uvchr_to_utf8_flags. But only the
1358 # first two are variants of each other. We use a loop
1359 # 'which_func' to determine which of these. uvchr_to_utf8_flags
1360 # is done separately at the end of each iteration, only when
1361 # which_func is 0. which_func is numeric in part so we don't
1362 # have to type in the function name and risk misspelling it
1363 # somewhere, and also it sets whether we are expecting warnings
1364 # or not in certain places. The _msgs() version of the function
1365 # expects warnings even if lexical ones are turned off, so by
1366 # making its which_func == 1, we can say we want warnings;
1367 # whereas the other one with the value 0, doesn't get them.
1368 for my $which_func (0, 1) {
1369 my $func = ($which_func)
1370 ? 'utf8n_to_uvchr_msgs'
1371 : 'utf8n_to_uvchr_error';
6aa905cf 1372
69485e19
KW
1373 # We classify the warnings into certain "interesting" types,
1374 # described later
1375 foreach my $warning_type (0..4) {
1376 next if $skip_most_tests && $warning_type != 1;
1377 foreach my $use_warn_flag (0, 1) {
3f055917 1378 if ($use_warn_flag) {
1d21b5e7 1379 next if $initially_overlong || $initially_orphan;
37657a5b
KW
1380
1381 # Since utf8n_to_uvchr_msgs() expects warnings even
1382 # when lexical ones are turned off, we can skip
1383 # testing it when they are turned on, with little
1384 # likelihood of missing an error case.
1385 next if $which_func;
3f055917
KW
1386 }
1387 else {
1388 next if $skip_most_tests;
1389 }
69485e19 1390
8f79178b
KW
1391 # Finally, here is the inner loop
1392
69485e19
KW
1393 my $this_utf8n_flag_to_warn = $utf8n_flag_to_warn;
1394 my $this_utf8n_flag_to_disallow = $utf8n_flag_to_disallow;
1395 my $this_uvchr_flag_to_warn = $uvchr_flag_to_warn;
1396 my $this_uvchr_flag_to_disallow = $uvchr_flag_to_disallow;
1397
1398 my $eval_warn;
1399 my $expect_regular_warnings;
1400 my $expect_warnings_for_malformed;
1401 my $expect_warnings_for_overflow;
1402
1403 if ($warning_type == 0) {
d22ec717 1404 $eval_warn = "use warnings";
69485e19 1405 $expect_regular_warnings = $use_warn_flag;
717dd9f9
KW
1406
1407 # We ordinarily expect overflow warnings here. But it
1408 # is somewhat more complicated, and the final
1409 # determination is deferred to one place in the filw
1410 # where we handle overflow.
69485e19 1411 $expect_warnings_for_overflow = 1;
717dd9f9
KW
1412
1413 # We would ordinarily expect malformed warnings in
1414 # this case, but not if malformations are allowed.
1415 $expect_warnings_for_malformed
1416 = $malformed_allow_type == 0;
6aa905cf 1417 }
69485e19
KW
1418 elsif ($warning_type == 1) {
1419 $eval_warn = "no warnings";
37657a5b
KW
1420 $expect_regular_warnings = $which_func;
1421 $expect_warnings_for_overflow = $which_func;
1422 $expect_warnings_for_malformed = $which_func;
69485e19
KW
1423 }
1424 elsif ($warning_type == 2) {
1425 $eval_warn = "no warnings; use warnings 'utf8'";
1426 $expect_regular_warnings = $use_warn_flag;
1427 $expect_warnings_for_overflow = 1;
717dd9f9
KW
1428 $expect_warnings_for_malformed
1429 = $malformed_allow_type == 0;
69485e19
KW
1430 }
1431 elsif ($warning_type == 3) {
1432 $eval_warn = "no warnings; use warnings"
e4e140b4
KW
1433 . " '$controlling_warning_category'";
1434 $expect_regular_warnings = $use_warn_flag;
69485e19 1435 $expect_warnings_for_overflow
e4e140b4 1436 = $controlling_warning_category eq 'non_unicode';
37657a5b 1437 $expect_warnings_for_malformed = $which_func;
69485e19
KW
1438 }
1439 elsif ($warning_type == 4) { # Like type 3, but uses the
d044b7a7 1440 # PERL_EXTENDED flags
69485e19 1441 # The complement flags were set up so that the
d044b7a7 1442 # PERL_EXTENDED flags have been tested that they don't
69485e19
KW
1443 # trigger wrongly for too small code points. And the
1444 # flags have been set up so that those small code
1445 # points are tested for being above Unicode. What's
1446 # left to test is that the large code points do
d044b7a7 1447 # trigger the PERL_EXTENDED flags.
6d736463 1448 next if ! requires_extended_utf8($allowed_uv);
e4e140b4 1449 next if $controlling_warning_category ne 'non_unicode';
69485e19
KW
1450 $eval_warn = "no warnings; use warnings 'non_unicode'";
1451 $expect_regular_warnings = 1;
1452 $expect_warnings_for_overflow = 1;
1453 $expect_warnings_for_malformed = 0;
d044b7a7 1454 $this_utf8n_flag_to_warn = $::UTF8_WARN_PERL_EXTENDED;
69485e19 1455 $this_utf8n_flag_to_disallow
d044b7a7
KW
1456 = $::UTF8_DISALLOW_PERL_EXTENDED;
1457 $this_uvchr_flag_to_warn
1458 = $::UNICODE_WARN_PERL_EXTENDED;
69485e19 1459 $this_uvchr_flag_to_disallow
d044b7a7 1460 = $::UNICODE_DISALLOW_PERL_EXTENDED;
601e92f1
KW
1461 }
1462 else {
69485e19 1463 die "Unexpected warning type '$warning_type'";
601e92f1
KW
1464 }
1465
69485e19
KW
1466 # We only need to test the case where all warnings are
1467 # enabled (type 0) to see if turning off the warning flag
1468 # causes things to not be output. If those pass, then
1469 # turning on some sub-category of warnings, or turning off
1470 # warnings altogether are extremely likely to not output
1471 # warnings either, given how the warnings subsystem is
1472 # supposed to work, and this file assumes it does work.
1473 next if $warning_type != 0 && ! $use_warn_flag;
1474
1475 # The convention is that the 'got' flag is the same value
1476 # as the disallow one. If this were violated, the tests
1477 # here should start failing.
1478 my $return_flag = $this_utf8n_flag_to_disallow;
1479
e4e140b4
KW
1480 # If we aren't expecting warnings/disallow for this, turn
1481 # on all the other flags. That makes sure that they all
1482 # are independent of this flag, and so we don't need to
1483 # test them individually.
57ff5f59
KW
1484 my $this_warning_flags
1485 = ($use_warn_flag)
1486 ? $this_utf8n_flag_to_warn
1487 : ($overlong_is_in_perl_extended_utf8
1488 ? ($utf8n_flag_to_warn_complement
1489 & ~$::UTF8_WARN_PERL_EXTENDED)
1490 : $utf8n_flag_to_warn_complement);
1491 my $this_disallow_flags
1492 = ($do_disallow)
1493 ? $this_utf8n_flag_to_disallow
1494 : ($overlong_is_in_perl_extended_utf8
1495 ? ($utf8n_flag_to_disallow_complement
1496 & ~$::UTF8_DISALLOW_PERL_EXTENDED)
1497 : $utf8n_flag_to_disallow_complement);
6aa905cf 1498 my $expected_uv = $allowed_uv;
717dd9f9 1499 my $this_uv_string = $uv_string;
6aa905cf 1500
4816e15f
KW
1501 my @expected_return_flags
1502 = @expected_malformation_return_flags;
69485e19 1503 my @expected_warnings;
4816e15f 1504 push @expected_warnings, @expected_malformation_warnings
69485e19
KW
1505 if $expect_warnings_for_malformed;
1506
69485e19
KW
1507 # The overflow malformation is done differently than other
1508 # malformations. It comes from manually typed tests in
1509 # the test array, but it also is above Unicode and uses
1510 # Perl extended UTF-8, so affects some of the flags being
1511 # tested. We now make it be treated like one of the other
1512 # generated malformations.
1513 if ($will_overflow) {
1514
1515 # An overflow is (way) above Unicode, and overrides
1516 # everything else.
1517 $expect_regular_warnings = 0;
1518
717dd9f9
KW
1519 # Earlier, we tentatively calculated whether this
1520 # should emit a message or not. It's tentative
1521 # because, even if we ordinarily would output it, we
1522 # don't if malformations are allowed -- except an
d044b7a7 1523 # overflow is also a SUPER and PERL_EXTENDED, and if
717dd9f9
KW
1524 # warnings for those are enabled, the overflow
1525 # warning does get raised.
1526 if ( $expect_warnings_for_overflow
1527 && ( $malformed_allow_type == 0
1528 || ( $this_warning_flags
1529 & ($::UTF8_WARN_SUPER
d044b7a7 1530 |$::UTF8_WARN_PERL_EXTENDED))))
717dd9f9 1531 {
4816e15f 1532 push @expected_warnings, $overflow_msg_pattern;
6aa905cf 1533 }
69485e19
KW
1534 }
1535
1536 # It may be that the malformations have shortened the
1537 # amount of input we look at so much that we can't tell
1538 # what the category the code point was in. Otherwise, set
1539 # up the expected return flags based on the warnings and
1540 # disallowments.
1541 if ($this_expected_len < $this_needed_to_discern_len) {
1542 $expect_regular_warnings = 0;
1543 }
1544 elsif ( ($this_warning_flags & $this_utf8n_flag_to_warn)
1545 || ( $this_disallow_flags
1546 & $this_utf8n_flag_to_disallow))
1547 {
1548 push @expected_return_flags, $return_flag;
1549 }
1550
1551 # Finish setting up the expected warning.
1552 if ($expect_regular_warnings) {
1553
1554 # So far the array contains warnings generated by
1555 # malformations. Add the expected regular one.
57ff5f59 1556 unshift @expected_warnings, $this_cp_message_qr;
69485e19
KW
1557
1558 # But it may need to be modified, because either of
1559 # these malformations means we can't determine the
1560 # expected code point.
57ff5f59
KW
1561 if ( $short || $unexpected_noncont
1562 || $dont_use_overlong_cp)
1563 {
69485e19
KW
1564 my $first_byte = substr($this_bytes, 0, 1);
1565 $expected_warnings[0] = display_bytes(
1566 substr($this_bytes, 0, $this_expected_len));
1567 $expected_warnings[0]
1568 = qr/[Aa]\Qny UTF-8 sequence that starts with\E
1569 \Q $expected_warnings[0]\E
57ff5f59 1570 \Q $this_non_cp_trailing_text\E/x;
69485e19
KW
1571 }
1572 }
6aa905cf 1573
717dd9f9
KW
1574 # Is effectively disallowed if we've set up a malformation
1575 # (unless malformations are allowed), even if the flag
1576 # indicates it is allowed. Fix up test name to indicate
1577 # this as well
1578 my $disallowed = 0;
1579 if ( $this_disallow_flags & $this_utf8n_flag_to_disallow
1580 && $this_expected_len >= $this_needed_to_discern_len)
1581 {
1582 $disallowed = 1;
1583 }
1584 if ($malformations_name) {
1585 if ($malformed_allow_type == 0) {
1586 $disallowed = 1;
1587 }
1588 elsif ($malformed_allow_type == 1) {
1589
1590 # Even if allowed, the malformation returns the
1591 # REPLACEMENT CHARACTER.
1592 $expected_uv = 0xFFFD;
1593 $this_uv_string = "0xFFFD"
1594 }
1595 }
1596
37657a5b
KW
1597 my $this_name = "$func() $testname: ";
1598 my @scratch_expected_return_flags = @expected_return_flags;
a93dd121
KW
1599 if (! $initially_malformed) {
1600 $this_name .= ($disallowed)
1601 ? 'disallowed, '
1602 : 'allowed, ';
1603 }
1604 $this_name .= "$eval_warn";
69485e19
KW
1605 $this_name .= ", " . (( $this_warning_flags
1606 & $this_utf8n_flag_to_warn)
1607 ? 'with flag for raising warnings'
1608 : 'no flag for raising warnings');
1609 $this_name .= $malformations_name;
db0f09e6 1610
8f79178b 1611 # Do the actual test using an eval
9cdc3054 1612 undef @warnings_gotten;
6aa905cf 1613 my $ret_ref;
717dd9f9
KW
1614 my $this_flags
1615 = $allow_flags|$this_warning_flags|$this_disallow_flags;
6aa905cf 1616 my $eval_text = "$eval_warn; \$ret_ref"
37657a5b 1617 . " = test_$func("
69485e19 1618 . "'$this_bytes', $this_length, $this_flags)";
6aa905cf 1619 eval "$eval_text";
1a35ea23 1620 if (! ok ($@ eq "", "$this_name: eval succeeded"))
6aa905cf 1621 {
a8ee5133
KW
1622 diag "\$@='$@'; call was: "
1623 . utf8n_display_call($eval_text);
6aa905cf
KW
1624 next;
1625 }
37657a5b 1626
6aa905cf 1627 if ($disallowed) {
d402d77f 1628 is($ret_ref->[0], 0, " And returns 0")
a8ee5133 1629 or diag "Call was: " . utf8n_display_call($eval_text);
6aa905cf
KW
1630 }
1631 else {
1632 is($ret_ref->[0], $expected_uv,
d402d77f 1633 " And returns expected uv: "
717dd9f9 1634 . $this_uv_string)
a8ee5133 1635 or diag "Call was: " . utf8n_display_call($eval_text);
6aa905cf
KW
1636 }
1637 is($ret_ref->[1], $this_expected_len,
d402d77f 1638 " And returns expected length:"
6aa905cf 1639 . " $this_expected_len")
a8ee5133 1640 or diag "Call was: " . utf8n_display_call($eval_text);
6aa905cf 1641
9cdc3054 1642 my $returned_flags = $ret_ref->[2];
6aa905cf 1643
37657a5b
KW
1644 for (my $i = @scratch_expected_return_flags - 1;
1645 $i >= 0;
1646 $i--)
1647 {
1648 if ($scratch_expected_return_flags[$i] & $returned_flags)
1649 {
1650 if ($scratch_expected_return_flags[$i]
1651 == $::UTF8_GOT_PERL_EXTENDED)
1652 {
1653 pass(" Expected and got return flag for"
1654 . " PERL_EXTENDED");
1655 }
1656 # The first entries in this are
1657 # malformations
1658 elsif ($i > @malformation_names - 1) {
1659 pass(" Expected and got return flag"
1660 . " for " . $controlling_warning_category);
1661 }
1662 else {
1663 pass(" Expected and got return flag for "
1664 . $malformation_names[$i]
1665 . " malformation");
1666 }
1667 $returned_flags
1668 &= ~$scratch_expected_return_flags[$i];
1669 splice @scratch_expected_return_flags, $i, 1;
1670 }
6aa905cf 1671 }
6aa905cf 1672
23038144
KW
1673 if (! is($returned_flags, 0,
1674 " Got no unexpected return flags"))
1675 {
1676 diag "The unexpected flags gotten were: "
5722c46d
KW
1677 . (flags_to_text($returned_flags,
1678 \@utf8n_flags_to_text)
69485e19
KW
1679 # We strip off any prefixes from the flag
1680 # names
1681 =~ s/ \b [A-Z] _ //xgr);
23038144
KW
1682 diag "Call was: " . utf8n_display_call($eval_text);
1683 }
1684
1685 if (! is (scalar @scratch_expected_return_flags, 0,
1686 " Got all expected return flags"))
1687 {
1688 diag "The expected flags not gotten were: "
69485e19 1689 . (flags_to_text(eval join("|",
37657a5b 1690 @scratch_expected_return_flags),
69485e19
KW
1691 \@utf8n_flags_to_text)
1692 # We strip off any prefixes from the flag
1693 # names
5722c46d 1694 =~ s/ \b [A-Z] _ //xgr);
23038144
KW
1695 diag "Call was: " . utf8n_display_call($eval_text);
1696 }
9cdc3054 1697
37657a5b
KW
1698 if ($which_func) {
1699 my @returned_warnings;
1700 for my $element_ref (@{$ret_ref->[3]}) {
1701 push @returned_warnings, $element_ref->{'text'};
1702 my $text = $element_ref->{'text'};
1703 my $flag = $element_ref->{'flag_bit'};
1704 my $category = $element_ref->{'warning_category'};
1705
1706 if (! ok(($flag & ($flag-1)) == 0,
1707 "flag for returned msg is a single bit"))
1708 {
1709 diag sprintf("flags are %x; msg=%s", $flag, $text);
1710 }
1711 else {
1712 if (grep { $_ == $flag } @expected_return_flags) {
1713 pass("flag for returned msg is expected");
1714 }
1715 else {
1716 fail("flag for returned msg is expected: "
1717 . flags_to_text($flag, \@utf8n_flags_to_text));
1718 }
1719 }
1720
1721 # In perl space, don't know the category numbers
1722 isnt($category, 0,
1723 "returned category for msg isn't 0");
1724 }
1725
1726 ok(@warnings_gotten == 0, "$func raised no warnings;"
1727 . " the next tests are for ones in the returned"
1728 . " variable")
1729 or diag join "\n", "The unexpected warnings were:",
1730 @warnings_gotten;
1731 @warnings_gotten = @returned_warnings;
1732 }
1733
69485e19
KW
1734 do_warnings_test(@expected_warnings)
1735 or diag "Call was: " . utf8n_display_call($eval_text);
1736 undef @warnings_gotten;
6aa905cf
KW
1737
1738 # Check CHECK_ONLY results when the input is
1739 # disallowed. Do this when actually disallowed,
37657a5b
KW
1740 # not just when the $this_disallow_flags is set. We only
1741 # test once utf8n_to_uvchr_msgs() with this.
1742 if ( $disallowed
1743 && ($which_func == 0 || ! $tested_CHECK_ONLY))
1744 {
1745 $tested_CHECK_ONLY = 1;
69485e19 1746 my $this_flags = $this_disallow_flags|$::UTF8_CHECK_ONLY;
d22ec717 1747 my $eval_text = "use warnings; \$ret_ref ="
37657a5b 1748 . " test_$func('"
69485e19
KW
1749 . "$this_bytes', $this_length,"
1750 . " $this_flags)";
1751 eval $eval_text;
1a35ea23 1752 if (! ok ($@ eq "",
a8ee5133
KW
1753 " And eval succeeded with CHECK_ONLY"))
1754 {
1755 diag "\$@='$@'; Call was: "
1756 . utf8n_display_call($eval_text);
1757 next;
1758 }
d402d77f 1759 is($ret_ref->[0], 0, " CHECK_ONLY: Returns 0")
a8ee5133 1760 or diag "Call was: " . utf8n_display_call($eval_text);
6aa905cf 1761 is($ret_ref->[1], -1,
d402d77f 1762 " CHECK_ONLY: returns -1 for length")
a8ee5133 1763 or diag "Call was: " . utf8n_display_call($eval_text);
9cdc3054 1764 if (! is(scalar @warnings_gotten, 0,
d402d77f 1765 " CHECK_ONLY: no warnings generated"))
6aa905cf 1766 {
a8ee5133 1767 diag "Call was: " . utf8n_display_call($eval_text);
9cdc3054 1768 output_warnings(@warnings_gotten);
6aa905cf
KW
1769 }
1770 }
1771
1772 # Now repeat some of the above, but for
1773 # uvchr_to_utf8_flags(). Since this comes from an
8f79178b
KW
1774 # existing code point, it hasn't overflowed, and isn't
1775 # malformed.
69485e19 1776 next if @malformation_names;
37657a5b 1777 next if $which_func;
69485e19
KW
1778
1779 $this_warning_flags = ($use_warn_flag)
1780 ? $this_uvchr_flag_to_warn
1781 : 0;
1782 $this_disallow_flags = ($do_disallow)
1783 ? $this_uvchr_flag_to_disallow
1784 : 0;
1785
1786 $disallowed = $this_disallow_flags
1787 & $this_uvchr_flag_to_disallow;
1788 $this_name .= ", " . (( $this_warning_flags
1789 & $this_utf8n_flag_to_warn)
1790 ? 'with flag for raising warnings'
1791 : 'no flag for raising warnings');
6aa905cf
KW
1792
1793 $this_name = "uvchr_to_utf8_flags() $testname: "
69485e19 1794 . (($disallowed)
6aa905cf 1795 ? 'disallowed'
6aa905cf
KW
1796 : 'allowed');
1797 $this_name .= ", $eval_warn";
69485e19
KW
1798 $this_name .= ", " . (( $this_warning_flags
1799 & $this_uvchr_flag_to_warn)
6aa905cf
KW
1800 ? 'with warning flag'
1801 : 'no warning flag');
1802
9cdc3054 1803 undef @warnings_gotten;
6aa905cf 1804 my $ret;
69485e19 1805 $this_flags = $this_warning_flags|$this_disallow_flags;
6aa905cf
KW
1806 $eval_text = "$eval_warn; \$ret ="
1807 . " test_uvchr_to_utf8_flags("
d884ea32 1808 . "$allowed_uv, $this_flags)";
6aa905cf 1809 eval "$eval_text";
1a35ea23 1810 if (! ok ($@ eq "", "$this_name: eval succeeded"))
6aa905cf 1811 {
d884ea32
KW
1812 diag "\$@='$@'; call was: "
1813 . uvchr_display_call($eval_text);
6aa905cf
KW
1814 next;
1815 }
1816 if ($disallowed) {
d402d77f 1817 is($ret, undef, " And returns undef")
d884ea32 1818 or diag "Call was: " . uvchr_display_call($eval_text);
6aa905cf
KW
1819 }
1820 else {
d402d77f 1821 is($ret, $this_bytes, " And returns expected string")
d884ea32 1822 or diag "Call was: " . uvchr_display_call($eval_text);
6aa905cf 1823 }
69485e19
KW
1824
1825 do_warnings_test(@expected_warnings)
1826 or diag "Call was: " . uvchr_display_call($eval_text);
6aa905cf
KW
1827 }
1828 }
717dd9f9 1829 }
6aa905cf
KW
1830 }
1831 }
8f79178b 1832 }
6aa905cf 1833 }
e0803729 1834 }
6aa905cf
KW
1835}
1836
1837done_testing;