This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
APItest/t/utf8_warn_base.pl: Use a default value
[perl5.git] / ext / XS-APItest / t / utf8_warn_base.pl
CommitLineData
6aa905cf
KW
1#!perl -w
2
3# This is a base file to be used by various .t's in its directory
8f79178b
KW
4# It tests various code points that are "problematic", and verifies that the
5# correct warnings/flags etc are generated when using them. It also takes the
6# UTF-8 for some of them and perturbs it to be malformed in various ways, and
7# tests that this gets appropriately detected.
6aa905cf
KW
8
9use strict;
10use Test::More;
11
12BEGIN {
13 use_ok('XS::APItest');
14 require 'charset_tools.pl';
15 require './t/utf8_setup.pl';
16};
17
18$|=1;
19
20no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
21 # machines, and that is tested elsewhere
6aa905cf 22use XS::APItest;
6aa905cf 23
9cdc3054 24my @warnings_gotten;
6aa905cf
KW
25
26use warnings 'utf8';
8d6f1506
KW
27local $SIG{__WARN__} = sub { my @copy = @_;
28 push @warnings_gotten, map { chomp; $_ } @copy;
29 };
6d736463 30
57ff5f59 31my $highest_non_extended_utf8_cp = (isASCII) ? 0x7FFFFFFF : 0x3FFFFFFF;
9e35eec9
KW
32my $native_lowest_continuation_chr = I8_to_native(chr $::lowest_continuation);
33
6d736463
KW
34sub requires_extended_utf8($) {
35
36 # Returns a boolean as to whether or not the code point parameter fits
37 # into 31 bits, subject to the convention that a negative code point
38 # stands for one that overflows the word size, so won't fit in 31 bits.
39
57ff5f59 40 return shift > $highest_non_extended_utf8_cp;
6d736463
KW
41}
42
404a1403
KW
43sub overflow_discern_len($) {
44
45 # Returns how many bytes are needed to tell if a UTF-8 sequence is for a
46 # code point that won't fit in the platform's word size. Only the length
47 # of the sequence representing a single code point is needed.
48
49 if (isASCII) {
50 return ($::is64bit) ? 3 : ((shift == $::max_bytes)
51 ? 1
52 : 2);
53 }
54
55 return ($::is64bit) ? 2 : 8;
56}
57
c7f9e4bb
KW
58my @tests;
59{
60 no warnings qw(portable overflow);
61 @tests = (
8f79178b
KW
62 # $testname,
63 # $bytes, UTF-8 string
64 # $allowed_uv, code point $bytes evaluates to; -1 if
65 # overflows
66 # $needed_to_discern_len optional, how long an initial substring do
67 # we need to tell that the string must be for
68 # a code point in the category it falls in,
69 # like being a surrogate; 0 indicates we need
70 # the whole string. Some categories have a
71 # default that is used if this is omitted.
72 [ "lowest surrogate",
73 (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
74 0xD800,
75 ],
76 [ "a middle surrogate",
77 (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
78 0xD90D,
79 ],
80 [ "highest surrogate",
81 (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
82 0xDFFF,
83 ],
8f79178b
KW
84 [ "first of 32 consecutive non-character code points",
85 (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
86 0xFDD0,
87 ],
88 [ "a mid non-character code point of the 32 consecutive ones",
89 (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
90 0xFDE0,
91 ],
92 [ "final of 32 consecutive non-character code points",
93 (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
94 0xFDEF,
95 ],
96 [ "non-character code point U+FFFE",
97 (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
98 0xFFFE,
99 ],
100 [ "non-character code point U+FFFF",
101 (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
102 0xFFFF,
103 ],
104 [ "non-character code point U+1FFFE",
105 (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
106 0x1FFFE,
107 ],
108 [ "non-character code point U+1FFFF",
109 (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
110 0x1FFFF,
111 ],
112 [ "non-character code point U+2FFFE",
113 (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
114 0x2FFFE,
115 ],
116 [ "non-character code point U+2FFFF",
117 (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
118 0x2FFFF,
119 ],
120 [ "non-character code point U+3FFFE",
121 (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
122 0x3FFFE,
123 ],
124 [ "non-character code point U+3FFFF",
125 (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
126 0x3FFFF,
127 ],
128 [ "non-character code point U+4FFFE",
129 (isASCII)
130 ? "\xf1\x8f\xbf\xbe"
131 : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
132 0x4FFFE,
133 ],
134 [ "non-character code point U+4FFFF",
135 (isASCII)
136 ? "\xf1\x8f\xbf\xbf"
137 : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
138 0x4FFFF,
139 ],
140 [ "non-character code point U+5FFFE",
141 (isASCII)
142 ? "\xf1\x9f\xbf\xbe"
143 : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
144 0x5FFFE,
145 ],
146 [ "non-character code point U+5FFFF",
147 (isASCII)
148 ? "\xf1\x9f\xbf\xbf"
149 : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
150 0x5FFFF,
151 ],
152 [ "non-character code point U+6FFFE",
153 (isASCII)
154 ? "\xf1\xaf\xbf\xbe"
155 : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
156 0x6FFFE,
157 ],
158 [ "non-character code point U+6FFFF",
159 (isASCII)
160 ? "\xf1\xaf\xbf\xbf"
161 : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
162 0x6FFFF,
163 ],
164 [ "non-character code point U+7FFFE",
165 (isASCII)
166 ? "\xf1\xbf\xbf\xbe"
167 : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
168 0x7FFFE,
169 ],
170 [ "non-character code point U+7FFFF",
171 (isASCII)
172 ? "\xf1\xbf\xbf\xbf"
173 : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
174 0x7FFFF,
175 ],
176 [ "non-character code point U+8FFFE",
177 (isASCII)
178 ? "\xf2\x8f\xbf\xbe"
179 : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
180 0x8FFFE,
181 ],
182 [ "non-character code point U+8FFFF",
183 (isASCII)
184 ? "\xf2\x8f\xbf\xbf"
185 : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
186 0x8FFFF,
187 ],
188 [ "non-character code point U+9FFFE",
189 (isASCII)
190 ? "\xf2\x9f\xbf\xbe"
191 : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
192 0x9FFFE,
193 ],
194 [ "non-character code point U+9FFFF",
195 (isASCII)
196 ? "\xf2\x9f\xbf\xbf"
197 : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
198 0x9FFFF,
199 ],
200 [ "non-character code point U+AFFFE",
201 (isASCII)
202 ? "\xf2\xaf\xbf\xbe"
203 : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
204 0xAFFFE,
205 ],
206 [ "non-character code point U+AFFFF",
207 (isASCII)
208 ? "\xf2\xaf\xbf\xbf"
209 : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
210 0xAFFFF,
211 ],
212 [ "non-character code point U+BFFFE",
213 (isASCII)
214 ? "\xf2\xbf\xbf\xbe"
215 : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
216 0xBFFFE,
217 ],
218 [ "non-character code point U+BFFFF",
219 (isASCII)
220 ? "\xf2\xbf\xbf\xbf"
221 : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
222 0xBFFFF,
223 ],
224 [ "non-character code point U+CFFFE",
225 (isASCII)
226 ? "\xf3\x8f\xbf\xbe"
227 : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
228 0xCFFFE,
229 ],
230 [ "non-character code point U+CFFFF",
231 (isASCII)
232 ? "\xf3\x8f\xbf\xbf"
233 : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
234 0xCFFFF,
235 ],
236 [ "non-character code point U+DFFFE",
237 (isASCII)
238 ? "\xf3\x9f\xbf\xbe"
239 : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
240 0xDFFFE,
241 ],
242 [ "non-character code point U+DFFFF",
243 (isASCII)
244 ? "\xf3\x9f\xbf\xbf"
245 : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
246 0xDFFFF,
247 ],
248 [ "non-character code point U+EFFFE",
249 (isASCII)
250 ? "\xf3\xaf\xbf\xbe"
251 : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
252 0xEFFFE,
253 ],
254 [ "non-character code point U+EFFFF",
255 (isASCII)
256 ? "\xf3\xaf\xbf\xbf"
257 : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
258 0xEFFFF,
259 ],
260 [ "non-character code point U+FFFFE",
261 (isASCII)
262 ? "\xf3\xbf\xbf\xbe"
263 : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
264 0xFFFFE,
265 ],
266 [ "non-character code point U+FFFFF",
267 (isASCII)
268 ? "\xf3\xbf\xbf\xbf"
269 : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
270 0xFFFFF,
271 ],
272 [ "non-character code point U+10FFFE",
273 (isASCII)
274 ? "\xf4\x8f\xbf\xbe"
275 : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
276 0x10FFFE,
277 ],
278 [ "non-character code point U+10FFFF",
279 (isASCII)
280 ? "\xf4\x8f\xbf\xbf"
281 : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
282 0x10FFFF,
283 ],
7c9f4ec6
KW
284 [ "first non_unicode",
285 (isASCII)
286 ? "\xf4\x90\x80\x80"
287 : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
288 0x110000,
289 2,
290 ],
291 [ "non_unicode whose first byte tells that",
292 (isASCII)
293 ? "\xf5\x80\x80\x80"
294 : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
295 (isASCII) ? 0x140000 : 0x200000,
296 1,
297 ],
48ceb60a 298 [ "lowest 32 bit code point",
8f79178b
KW
299 (isASCII)
300 ? "\xfe\x82\x80\x80\x80\x80\x80"
301 : I8_to_native(
302 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
8f79178b 303 0x80000000,
8f79178b
KW
304 ],
305 [ "highest 32 bit code point",
306 (isASCII)
307 ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
308 : I8_to_native(
309 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
310 0xFFFFFFFF,
8f79178b 311 ],
f4da64d4
KW
312 [ "Lowest 33 bit code point",
313 (isASCII)
314 ? "\xfe\x84\x80\x80\x80\x80\x80"
315 : I8_to_native(
316 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
317 ($::is64bit) ? 0x100000000 : -1, # Overflows on 32-bit systems
f4da64d4 318 ],
8f79178b
KW
319 );
320
321 if (! $::is64bit) {
322 if (isASCII) {
8f79178b 323 push @tests,
c81d4d83
KW
324 [ "overflow that old algorithm failed to detect",
325 "\xfe\x86\x80\x80\x80\x80\x80",
326 -1,
8f79178b
KW
327 ];
328 }
329 }
c81d4d83
KW
330
331 if ($::is64bit) {
ba627a0b
KW
332 push @tests,
333 [ "highest 64 bit code point",
334 (isASCII)
335 ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
336 : I8_to_native(
337 "\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"),
338 0xFFFFFFFFFFFFFFFF,
339 (isASCII) ? 1 : 2,
340 ],
341 [ "first 65 bit code point",
342 (isASCII)
343 ? "\xff\x80\x9f\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
344 : I8_to_native(
345 "\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
346 -1,
ba627a0b 347 ];
c81d4d83
KW
348 if (isASCII) {
349 push @tests,
f6104ed7
KW
350 [ "Lowest code point requiring 13 bytes to represent",
351 "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
352 0x1000000000,
f6104ed7 353 ],
c81d4d83
KW
354 [ "overflow that old algorithm failed to detect",
355 "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
356 -1,
c81d4d83
KW
357 ];
358 }
359 else {
8f79178b
KW
360 push @tests, # These could falsely show wrongly in a naive
361 # implementation
362 [ "requires at least 32 bits",
363 I8_to_native(
6aa905cf 364 "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
8f79178b 365 0x800000000,
8f79178b
KW
366 ],
367 [ "requires at least 32 bits",
368 I8_to_native(
6aa905cf 369 "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
8f79178b 370 0x10000000000,
8f79178b
KW
371 ],
372 [ "requires at least 32 bits",
373 I8_to_native(
6aa905cf 374 "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
8f79178b 375 0x200000000000,
8f79178b
KW
376 ],
377 [ "requires at least 32 bits",
378 I8_to_native(
6aa905cf 379 "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
8f79178b 380 0x4000000000000,
8f79178b
KW
381 ],
382 [ "requires at least 32 bits",
383 I8_to_native(
6aa905cf 384 "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
8f79178b 385 0x80000000000000,
8f79178b
KW
386 ],
387 [ "requires at least 32 bits",
388 I8_to_native(
6aa905cf 389 "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
8f79178b 390 0x1000000000000000,
8f79178b
KW
391 ];
392 }
6aa905cf 393 }
c7f9e4bb 394}
6aa905cf 395
5722c46d
KW
396sub flags_to_text($$)
397{
398 my ($flags, $flags_to_text_ref) = @_;
399
400 # Returns a string containing a mnemonic representation of the bits that
401 # are set in the $flags. These are assumed to be flag bits. The return
402 # looks like "FOO|BAR|BAZ". The second parameter is a reference to an
403 # array that gives the textual representation of all the possible flags.
404 # Element 0 is the text for the bit 0 flag; element 1 for bit 1; .... If
405 # no bits at all are set the string "0" is returned;
406
407 my @flag_text;
408 my $shift = 0;
409
410 return "0" if $flags == 0;
411
412 while ($flags) {
413 #diag sprintf "%x", $flags;
414 if ($flags & 1) {
415 push @flag_text, $flags_to_text_ref->[$shift];
416 }
417 $shift++;
418 $flags >>= 1;
419 }
420
421 return join "|", @flag_text;
422}
423
424# Possible flag returns from utf8n_to_uvchr_error(). These should have G_,
425# instead of A_, D_, but the prefixes will be used in a a later commit, so
426# minimize churn by having them here.
427my @utf8n_flags_to_text = ( qw(
428 A_EMPTY
429 A_CONTINUATION
430 A_NON_CONTINUATION
431 A_SHORT
432 A_LONG
433 A_LONG_AND_ITS_VALUE
434 PLACEHOLDER
435 A_OVERFLOW
436 D_SURROGATE
437 W_SURROGATE
438 D_NONCHAR
439 W_NONCHAR
440 D_SUPER
441 W_SUPER
d044b7a7
KW
442 D_PERL_EXTENDED
443 W_PERL_EXTENDED
5722c46d
KW
444 CHECK_ONLY
445 NO_CONFIDENCE_IN_CURLEN_
446 ) );
447
a8ee5133
KW
448sub utf8n_display_call($)
449{
450 # Converts an eval string that calls test_utf8n_to_uvchr into a more human
451 # readable form, and returns it. Doesn't work if the byte string contains
452 # an apostrophe. The return will look something like:
453 # test_utf8n_to_uvchr_error('$bytes', $length, $flags)
454 #diag $_[0];
455
456 $_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x;
457 my $text1 = $1; # Everything before the byte string
458 my $bytes = $2;
459 my $text2 = $3; # Includes the length
460 my $flags = $4;
461
462 return $text1
463 . display_bytes($bytes)
464 . $text2
465 . flags_to_text($flags, \@utf8n_flags_to_text)
466 . ')';
467}
468
d884ea32
KW
469sub uvchr_display_call($)
470{
471 # Converts an eval string that calls test_uvchr_to_utf8 into a more human
472 # readable form, and returns it. The return will look something like:
473 # test_uvchr_to_utf8n_flags($uv, $flags)
474 #diag $_[0];
475
476 my @flags_to_text = ( qw(
477 W_SURROGATE
478 W_NONCHAR
479 W_SUPER
d044b7a7 480 W_PERL_EXTENDED
d884ea32
KW
481 D_SURROGATE
482 D_NONCHAR
483 D_SUPER
d044b7a7 484 D_PERL_EXTENDED
d884ea32
KW
485 ) );
486
487 $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x;
488 my $text = $1;
489 my $cp = sprintf "%X", $2;
490 my $flags = $3;
491
492 return "${text}0x$cp, " . flags_to_text($flags, \@flags_to_text) . ')';
493}
494
69485e19
KW
495sub do_warnings_test(@)
496{
497 my @expected_warnings = @_;
498
499 # Compares the input expected warnings array with @warnings_gotten,
500 # generating a pass for each found, removing it from @warnings_gotten.
501 # Any discrepancies generate test failures. Returns TRUE if no
502 # discrepcancies; otherwise FALSE.
503
504 my $succeeded = 1;
505
506 if (@expected_warnings == 0) {
507 if (! is(@warnings_gotten, 0, " Expected and got no warnings")) {
508 output_warnings(@warnings_gotten);
509 $succeeded = 0;
510 }
511 return $succeeded;
512 }
513
514 # Check that we got all the expected warnings,
515 # removing each one found
516 WARNING:
517 foreach my $expected (@expected_warnings) {
518 foreach (my $i = 0; $i < @warnings_gotten; $i++) {
519 if ($warnings_gotten[$i] =~ $expected) {
520 pass(" Expected and got warning: "
521 . " $warnings_gotten[$i]");
522 splice @warnings_gotten, $i, 1;
523 next WARNING;
524 }
525 }
526 fail(" Expected a warning that matches "
527 . $expected . " but didn't get it");
528 $succeeded = 0;
529 }
530
531 if (! is(@warnings_gotten, 0, " Got no unexpected warnings")) {
532 output_warnings(@warnings_gotten);
533 $succeeded = 0;
534 }
535
536 return $succeeded;
537}
538
6aa905cf
KW
539# This test is split into this number of files.
540my $num_test_files = $ENV{TEST_JOBS} || 1;
541$num_test_files = 10 if $num_test_files > 10;
542
543my $test_count = -1;
544foreach my $test (@tests) {
545 $test_count++;
546 next if $test_count % $num_test_files != $::TEST_CHUNK;
547
d42d7565 548 my ($testname, $bytes, $allowed_uv, $needed_to_discern_len) = @$test;
6aa905cf
KW
549
550 my $length = length $bytes;
04f42bf6 551 my $will_overflow = $allowed_uv < 0;
6aa905cf 552
2c511c58 553 my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv);
c0e5ee4b 554 my $display_bytes = display_bytes($bytes);
2c511c58 555
d42d7565 556 my $controlling_warning_category;
af816908
KW
557 my $utf8n_flag_to_warn;
558 my $utf8n_flag_to_disallow;
559 my $uvchr_flag_to_warn;
560 my $uvchr_flag_to_disallow;
b7e1f4b2 561
e4e140b4
KW
562 # We want to test that the independent flags are actually independent.
563 # For example, that a surrogate doesn't trigger a non-character warning,
564 # and conversely, turning off an above-Unicode flag doesn't suppress a
565 # surrogate warning. Earlier versions of this file used nested loops to
566 # test all possible combinations. But that creates lots of tests, making
567 # this run too long. What is now done instead is to use the complement of
568 # the category we are testing to greatly reduce the combinatorial
569 # explosion. For example, if we have a surrogate and we aren't expecting
570 # a warning about it, we set all the flags for non-surrogates to raise
571 # warnings. If one shows up, it indicates the flags aren't independent.
572 my $utf8n_flag_to_warn_complement;
573 my $utf8n_flag_to_disallow_complement;
574 my $uvchr_flag_to_warn_complement;
575 my $uvchr_flag_to_disallow_complement;
576
bf422d6a
KW
577 # Many of the code points being tested are middling in that if code point
578 # edge cases work, these are very likely to as well. Because this test
579 # file takes a while to execute, we skip testing the edge effects of code
580 # points deemed middling, while testing their basics and continuing to
581 # fully test the non-middling code points.
582 my $skip_most_tests = 0;
583
601e92f1
KW
584 my $cp_message_qr; # Pattern that matches the message raised when
585 # that message contains the problematic code
586 # point. The message is the same (currently) both
587 # when going from/to utf8.
588 my $non_cp_trailing_text; # The suffix text when the message doesn't
589 # contain a code point. (This is a result of
590 # some sort of malformation that means we
591 # can't get an exact code poin
57ff5f59
KW
592 my $extended_cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
593 \Q requires a Perl extension, and so is not\E
594 \Q portable\E/x;
595 my $extended_non_cp_trailing_text
596 = "is a Perl extension, and so is not portable";
601e92f1 597
c0e5ee4b
KW
598 # Is this test malformed from the beginning? If so, we know to generally
599 # expect that the tests will show it isn't valid.
600 my $initially_malformed = 0;
601
af816908
KW
602 if ($will_overflow || $allowed_uv > 0x10FFFF) {
603
d044b7a7 604 # Set the SUPER flags; later, we test for PERL_EXTENDED as well.
af816908
KW
605 $utf8n_flag_to_warn = $::UTF8_WARN_SUPER;
606 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER;
607 $uvchr_flag_to_warn = $::UNICODE_WARN_SUPER;
608 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;;
609
d044b7a7
KW
610 # Below, we add the flags for non-perl_extended to the code points
611 # that don't fit that category. Special tests are done for this
612 # category in the inner loop.
e4e140b4
KW
613 $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR
614 |$::UTF8_WARN_SURROGATE;
615 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
616 |$::UTF8_DISALLOW_SURROGATE;
617 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR
618 |$::UNICODE_WARN_SURROGATE;
619 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
620 |$::UNICODE_DISALLOW_SURROGATE;
621 $controlling_warning_category = 'non_unicode';
622
8f79178b 623 if ($will_overflow) { # This is realy a malformation
69485e19 624 $non_cp_trailing_text = "if you see this, there is an error";
601e92f1 625 $cp_message_qr = qr/\Q$non_cp_trailing_text\E/;
c0e5ee4b 626 $initially_malformed = 1;
404a1403
KW
627 if (! defined $needed_to_discern_len) {
628 $needed_to_discern_len = overflow_discern_len($length);
629 }
af816908 630 }
6d736463 631 elsif (requires_extended_utf8($allowed_uv)) {
57ff5f59
KW
632 $cp_message_qr = $extended_cp_message_qr;
633 $non_cp_trailing_text = $extended_non_cp_trailing_text;
634 $needed_to_discern_len = 1 unless defined $needed_to_discern_len;
af816908 635 }
601e92f1
KW
636 else {
637 $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
638 \Q may not be portable\E/x;
639 $non_cp_trailing_text = "is for a non-Unicode code point, may not"
640 . " be portable";
d044b7a7
KW
641 $utf8n_flag_to_warn_complement |= $::UTF8_WARN_PERL_EXTENDED;
642 $utf8n_flag_to_disallow_complement
643 |= $::UTF8_DISALLOW_PERL_EXTENDED;
644 $uvchr_flag_to_warn_complement |= $::UNICODE_WARN_PERL_EXTENDED;
e4e140b4 645 $uvchr_flag_to_disallow_complement
d044b7a7 646 |= $::UNICODE_DISALLOW_PERL_EXTENDED;
af816908 647 }
3022ad00
KW
648 }
649 elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
601e92f1
KW
650 $cp_message_qr = qr/UTF-16 surrogate U\+$uv_string/;
651 $non_cp_trailing_text = "is for a surrogate";
67e45424 652 $needed_to_discern_len = 2 unless defined $needed_to_discern_len;
bf422d6a 653 $skip_most_tests = 1 if $allowed_uv > 0xD800 && $allowed_uv < 0xDFFF;
af816908
KW
654
655 $utf8n_flag_to_warn = $::UTF8_WARN_SURROGATE;
656 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SURROGATE;
657 $uvchr_flag_to_warn = $::UNICODE_WARN_SURROGATE;
658 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SURROGATE;;
e4e140b4
KW
659
660 $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR
661 |$::UTF8_WARN_SUPER
d044b7a7 662 |$::UTF8_WARN_PERL_EXTENDED;
e4e140b4
KW
663 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
664 |$::UTF8_DISALLOW_SUPER
d044b7a7 665 |$::UTF8_DISALLOW_PERL_EXTENDED;
e4e140b4
KW
666 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR
667 |$::UNICODE_WARN_SUPER
d044b7a7 668 |$::UNICODE_WARN_PERL_EXTENDED;
e4e140b4
KW
669 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
670 |$::UNICODE_DISALLOW_SUPER
d044b7a7 671 |$::UNICODE_DISALLOW_PERL_EXTENDED;
d42d7565 672 $controlling_warning_category = 'surrogate';
3022ad00
KW
673 }
674 elsif ( ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
675 || ($allowed_uv & 0xFFFE) == 0xFFFE)
676 {
601e92f1
KW
677 $cp_message_qr = qr/\QUnicode non-character U+$uv_string\E
678 \Q is not recommended for open interchange\E/x;
679 $non_cp_trailing_text = "if you see this, there is an error";
67e45424 680 $needed_to_discern_len = $length unless defined $needed_to_discern_len;
bf422d6a
KW
681 if ( ($allowed_uv > 0xFDD0 && $allowed_uv < 0xFDEF)
682 || ($allowed_uv > 0xFFFF && $allowed_uv < 0x10FFFE))
683 {
684 $skip_most_tests = 1;
685 }
af816908
KW
686
687 $utf8n_flag_to_warn = $::UTF8_WARN_NONCHAR;
688 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_NONCHAR;
689 $uvchr_flag_to_warn = $::UNICODE_WARN_NONCHAR;
690 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_NONCHAR;;
e4e140b4
KW
691
692 $utf8n_flag_to_warn_complement = $::UTF8_WARN_SURROGATE
693 |$::UTF8_WARN_SUPER
d044b7a7 694 |$::UTF8_WARN_PERL_EXTENDED;
e4e140b4
KW
695 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE
696 |$::UTF8_DISALLOW_SUPER
d044b7a7 697 |$::UTF8_DISALLOW_PERL_EXTENDED;
e4e140b4
KW
698 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_SURROGATE
699 |$::UNICODE_WARN_SUPER
d044b7a7 700 |$::UNICODE_WARN_PERL_EXTENDED;
e4e140b4
KW
701 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_SURROGATE
702 |$::UNICODE_DISALLOW_SUPER
d044b7a7 703 |$::UNICODE_DISALLOW_PERL_EXTENDED;
e4e140b4 704
d42d7565 705 $controlling_warning_category = 'nonchar';
3022ad00
KW
706 }
707 else {
708 die "Can't figure out what type of warning to test for $testname"
709 }
710
67e45424
KW
711 die 'Didn\'t set $needed_to_discern_len for ' . $testname
712 unless defined $needed_to_discern_len;
c0e5ee4b 713
8f79178b 714 { # First test the isFOO calls
c0e5ee4b 715 use warnings; no warnings 'deprecated'; # Make sure these don't raise warnings
9cdc3054 716 undef @warnings_gotten;
8f79178b 717
6aa905cf
KW
718 my $ret = test_isUTF8_CHAR($bytes, $length);
719 my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
c0e5ee4b 720 if ($initially_malformed) {
d402d77f
KW
721 is($ret, 0, "For $testname: isUTF8_CHAR() returns 0");
722 is($ret_flags, 0, " And isUTF8_CHAR_flags() returns 0");
6aa905cf
KW
723 }
724 else {
725 is($ret, $length,
d402d77f
KW
726 "For $testname: isUTF8_CHAR() returns expected length: $length");
727 is($ret_flags, $length, " And isUTF8_CHAR_flags(...,0)"
6aa905cf
KW
728 . " returns expected length: $length");
729 }
9cdc3054 730 is(scalar @warnings_gotten, 0,
d402d77f
KW
731 " And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags generated"
732 . " any warnings")
9cdc3054 733 or output_warnings(@warnings_gotten);
6aa905cf 734
9cdc3054 735 undef @warnings_gotten;
6aa905cf 736 $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
c0e5ee4b 737 if ($initially_malformed) {
d402d77f 738 is($ret, 0, " And isSTRICT_UTF8_CHAR() returns 0");
6aa905cf
KW
739 }
740 else {
741 my $expected_ret = ( $testname =~ /surrogate|non-character/
742 || $allowed_uv > 0x10FFFF)
743 ? 0
744 : $length;
d402d77f 745 is($ret, $expected_ret, " And isSTRICT_UTF8_CHAR() returns"
6aa905cf
KW
746 . " expected length: $expected_ret");
747 $ret = test_isUTF8_CHAR_flags($bytes, $length,
748 $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
749 is($ret, $expected_ret,
d402d77f
KW
750 " And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')"
751 . " acts like isSTRICT_UTF8_CHAR");
6aa905cf 752 }
9cdc3054 753 is(scalar @warnings_gotten, 0,
d402d77f
KW
754 " And neither isSTRICT_UTF8_CHAR() nor isUTF8_CHAR_flags"
755 . " generated any warnings")
9cdc3054 756 or output_warnings(@warnings_gotten);
6aa905cf 757
9cdc3054 758 undef @warnings_gotten;
6aa905cf 759 $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
c0e5ee4b 760 if ($initially_malformed) {
d402d77f 761 is($ret, 0, " And isC9_STRICT_UTF8_CHAR() returns 0");
6aa905cf
KW
762 }
763 else {
764 my $expected_ret = ( $testname =~ /surrogate/
765 || $allowed_uv > 0x10FFFF)
766 ? 0
767 : $length;
d402d77f 768 is($ret, $expected_ret, " And isC9_STRICT_UTF8_CHAR()"
6aa905cf
KW
769 ." returns expected length: $expected_ret");
770 $ret = test_isUTF8_CHAR_flags($bytes, $length,
8f79178b 771 $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
6aa905cf 772 is($ret, $expected_ret,
d402d77f
KW
773 " And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')"
774 . " acts like isC9_STRICT_UTF8_CHAR");
6aa905cf 775 }
9cdc3054 776 is(scalar @warnings_gotten, 0,
d402d77f
KW
777 " And neither isC9_STRICT_UTF8_CHAR() nor isUTF8_CHAR_flags"
778 . " generated any warnings")
9cdc3054 779 or output_warnings(@warnings_gotten);
6aa905cf 780
c0e5ee4b
KW
781 foreach my $disallow_type (0..2) {
782 # 0 is don't disallow this type of code point
783 # 1 is do disallow
d044b7a7
KW
784 # 2 is do disallow, but only code points requiring
785 # perl-extended-UTF8
c0e5ee4b
KW
786
787 my $disallow_flags;
788 my $expected_ret;
789
790 if ($initially_malformed) {
791
792 # Malformations are by default disallowed, so testing with
793 # $disallow_type equal to 0 is sufficicient.
794 next if $disallow_type;
795
796 $disallow_flags = 0;
797 $expected_ret = 0;
798 }
799 elsif ($disallow_type == 1) {
800 $disallow_flags = $utf8n_flag_to_disallow;
801 $expected_ret = 0;
802 }
803 elsif ($disallow_type == 2) {
6d736463 804 next if ! requires_extended_utf8($allowed_uv);
d044b7a7 805 $disallow_flags = $::UTF8_DISALLOW_PERL_EXTENDED;
c0e5ee4b
KW
806 $expected_ret = 0;
807 }
808 else { # type is 0
809 $disallow_flags = $utf8n_flag_to_disallow_complement;
810 $expected_ret = $length;
811 }
812
813 $ret = test_isUTF8_CHAR_flags($bytes, $length, $disallow_flags);
814 is($ret, $expected_ret, " And isUTF8_CHAR_flags("
815 . "$display_bytes, $disallow_flags) returns "
816 . $expected_ret)
817 or diag "The flags mean "
818 . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
6aa905cf 819
c0e5ee4b
KW
820 is(scalar @warnings_gotten, 0,
821 " And isUTF8_CHAR_flags(...) generated no warnings")
822 or output_warnings(@warnings_gotten);
6aa905cf 823
c0e5ee4b 824 # Test partial character handling, for each byte not a full character
4c210ae9 825 my $did_test_partial = 0;
c0e5ee4b 826 for (my $j = 1; $j < $length - 1; $j++) {
4c210ae9 827 $did_test_partial = 1;
6aa905cf
KW
828 my $partial = substr($bytes, 0, $j);
829 my $ret_should_be;
830 my $comment;
c0e5ee4b 831 if ($disallow_type || $initially_malformed) {
6aa905cf
KW
832 $ret_should_be = 0;
833 $comment = "disallowed";
834 if ($j < $needed_to_discern_len) {
835 $ret_should_be = 1;
c0e5ee4b
KW
836 $comment .= ", but need $needed_to_discern_len"
837 . " bytes to discern:";
6aa905cf
KW
838 }
839 }
840 else {
841 $ret_should_be = 1;
842 $comment = "allowed";
843 }
844
9cdc3054 845 undef @warnings_gotten;
6aa905cf
KW
846
847 $ret = test_is_utf8_valid_partial_char_flags($partial, $j,
c0e5ee4b 848 $disallow_flags);
6aa905cf 849 is($ret, $ret_should_be,
c0e5ee4b
KW
850 " And is_utf8_valid_partial_char_flags("
851 . display_bytes($partial)
852 . ", $disallow_flags), $comment: returns $ret_should_be")
853 or diag "The flags mean "
854 . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
4c210ae9
KW
855 }
856
857 if ($did_test_partial) {
9cdc3054 858 is(scalar @warnings_gotten, 0,
d402d77f 859 " And is_utf8_valid_partial_char_flags()"
4c210ae9
KW
860 . " generated no warnings for any of the lengths")
861 or output_warnings(@warnings_gotten);
6aa905cf
KW
862 }
863 }
864 }
865
8f79178b 866 # Now test the to/from UTF-8 calls
69485e19
KW
867 # This is more complicated than the malformations tested in other files in
868 # this directory, as there are several orthogonal variables involved. We
869 # test most possible combinations
8f79178b 870
8139d291
KW
871 # We try various combinations of malformations that can occur
872 foreach my $short (0, 1) {
873 next if $skip_most_tests && $short;
874 foreach my $unexpected_noncont (0, 1) {
875 next if $skip_most_tests && $unexpected_noncont;
876 foreach my $overlong (0, 1) {
877 next if $overlong && $skip_most_tests;
8f79178b 878
8139d291
KW
879 # If we're creating an overlong, it can't be longer than the
880 # maximum length, so skip if we're already at that length.
881 next if $overlong && $length >= $::max_bytes;
882
57ff5f59
KW
883 my $this_cp_message_qr = $cp_message_qr;
884 my $this_non_cp_trailing_text = $non_cp_trailing_text;
885
717dd9f9
KW
886 foreach my $malformed_allow_type (0..2) {
887 # 0 don't allow this malformation; ignored if no malformation
888 # 1 allow, with REPLACEMENT CHARACTER returned
889 # 2 allow, with intended code point returned. All malformations
890 # other than overlong can't determine the intended code point,
891 # so this isn't valid for them.
892 next if $malformed_allow_type == 2
893 && ($will_overflow || $short || $unexpected_noncont);
894 next if $skip_most_tests && $malformed_allow_type;
717dd9f9 895
4816e15f
KW
896 # Here we are in the innermost loop for malformations. So we
897 # know which ones are in effect. Can now change the input to be
898 # appropriately malformed. We also can set up certain other
899 # things now, like whether we expect a return flag from this
900 # malformation, and which flag.
901
902 my $this_bytes = $bytes;
903 my $this_length = $length;
904 my $this_expected_len = $length;
905 my $this_needed_to_discern_len = $needed_to_discern_len;
906
907 my @malformation_names;
908 my @expected_malformation_warnings;
909 my @expected_malformation_return_flags;
910
717dd9f9
KW
911 # Contains the flags for any allowed malformations. Currently no
912 # combinations of on/off are tested for. It's either all are
913 # allowed, or none are.
914 my $allow_flags = 0;
57ff5f59
KW
915 my $overlong_is_in_perl_extended_utf8 = 0;
916 my $dont_use_overlong_cp = 0;
717dd9f9 917
4816e15f 918 if ($overlong) {
9e35eec9 919 my $new_expected_len;
4816e15f
KW
920
921 # To force this malformation, we convert the original start
922 # byte into a continuation byte with the same data bits as
923 # originally. ...
9e35eec9
KW
924 my $start_byte = substr($this_bytes, 0, 1);
925 my $converted_to_continuation_byte
926 = start_byte_to_cont($start_byte);
4816e15f
KW
927
928 # ... Then we prepend it with a known overlong sequence. This
929 # should evaluate to the exact same code point as the
9e35eec9
KW
930 # original. We try to avoid an overlong using Perl extended
931 # UTF-8. The code points are the highest representable as
932 # overlongs on the respective platform without using extended
933 # UTF-8.
934 if (native_to_I8($start_byte) lt "\xFC") {
935 $start_byte = I8_to_native("\xFC");
936 $new_expected_len = 6;
937 }
938 elsif (! isASCII && native_to_I8($start_byte) lt "\xFE") {
939
940 # FE is not extended UTF-8 on EBCDIC
941 $start_byte = I8_to_native("\xFE");
942 $new_expected_len = 7;
943 }
944 else { # Must use extended UTF-8. On ASCII platforms, we
945 # could express some overlongs here starting with
946 # \xFE, but there's no real reason to do so.
57ff5f59 947 $overlong_is_in_perl_extended_utf8 = 1;
9e35eec9
KW
948 $start_byte = I8_to_native("\xFF");
949 $new_expected_len = $::max_bytes;
57ff5f59
KW
950 $this_cp_message_qr = $extended_cp_message_qr;
951
952 # The warning that gets raised doesn't include the code
953 # point in the message if the code point can be expressed
954 # without using extended UTF-8, but the particular
955 # overlong sequence used is in extended UTF-8. To do
956 # otherwise would be confusing to the user, as it would
957 # claim the code point requires extended, when it doesn't.
958 $dont_use_overlong_cp = 1
959 unless requires_extended_utf8($allowed_uv);
960 $this_non_cp_trailing_text = $extended_non_cp_trailing_text;
9e35eec9
KW
961 }
962
963 # Splice in the revise continuation byte, preceded by the
964 # start byte and the proper number of the lowest continuation
965 # bytes.
966 $this_bytes = $start_byte
967 . ($native_lowest_continuation_chr
968 x ( $new_expected_len - 1 - length($this_bytes)))
969 . $converted_to_continuation_byte
970 . substr($this_bytes, 1);
4816e15f 971 $this_length = length($this_bytes);
9e35eec9
KW
972 $this_needed_to_discern_len = $new_expected_len
973 - ( $this_expected_len
4816e15f 974 - $this_needed_to_discern_len);
9e35eec9 975 $this_expected_len = $new_expected_len;
4816e15f
KW
976 push @expected_malformation_return_flags, $::UTF8_GOT_LONG;
977 push @malformation_names, 'overlong';
717dd9f9
KW
978
979 if ($malformed_allow_type == 2) {
980 $allow_flags |= $::UTF8_ALLOW_LONG_AND_ITS_VALUE;
981 }
982 elsif ($malformed_allow_type) {
983 $allow_flags |= $::UTF8_ALLOW_LONG;
984 }
4816e15f
KW
985 }
986
987 if ($short) {
988 push @malformation_names, 'short';
989
990 # To force this malformation, just tell the test to not look
991 # as far as it should into the input.
992 $this_length--;
993 $this_expected_len--;
994 push @expected_malformation_return_flags, $::UTF8_GOT_SHORT;
717dd9f9
KW
995
996 $allow_flags |= $::UTF8_ALLOW_SHORT if $malformed_allow_type;
4816e15f
KW
997 }
998
999 if ($unexpected_noncont) {
1000 push @malformation_names, 'unexpected non-continuation';
1001
1002 # To force this malformation, change the final continuation
1003 # byte into a non continuation.
1004 my $pos = ($short) ? -2 : -1;
1005 substr($this_bytes, $pos, 1) = '?';
1006 $this_expected_len--;
1007 push @expected_malformation_return_flags,
1008 $::UTF8_GOT_NON_CONTINUATION;
717dd9f9
KW
1009 $allow_flags |= $::UTF8_ALLOW_NON_CONTINUATION
1010 if $malformed_allow_type;
4816e15f
KW
1011 }
1012
1013 # Here, we've transformed the input with all of the desired
1014 # non-overflow malformations. We are now in a position to
1015 # construct any potential warnings for those malformations. But
1016 # it's a pain to get the detailed messages exactly right, so for
1017 # now XXX, only do so for those that return an explicit code
1018 # point.
1019
1020 if ($overlong) {
1021
1022 # If one of the other malformation types is also in effect, we
1023 # don't know what the intended code point was.
1024 if ($short || $unexpected_noncont || $will_overflow) {
1025 push @expected_malformation_warnings, qr/overlong/;
1026 }
1027 else {
1028 my $wrong_bytes = display_bytes_no_quotes(
1029 substr($this_bytes, 0, $this_length));
1030 my $correct_bytes = display_bytes_no_quotes($bytes);
1031 my $prefix = ($allowed_uv > 0x10FFFF) ? "0x" : "U+";
1032 push @expected_malformation_warnings,
1033 qr/\QMalformed UTF-8 character: $wrong_bytes\E
1034 \Q (overlong; instead use\E
1035 \Q $correct_bytes to\E
1036 \Q represent $prefix$uv_string)/x;
1037 }
1038 }
1039 if ($short) {
1040 push @expected_malformation_warnings, qr/too short/;
1041 }
1042 if ($unexpected_noncont) {
1043 push @expected_malformation_warnings,
1044 qr/unexpected non-continuation byte/;
1045 }
1046
1047 # The overflow malformation is done differently than other
1048 # malformations. It comes from manually typed tests in the test
1049 # array. We now make it be treated like one of the other
1050 # malformations. But some has to be deferred until the inner loop
1051 my $overflow_msg_pattern;
1052 if ($will_overflow) {
1053 push @malformation_names, 'overflow';
1054
1055 $overflow_msg_pattern = display_bytes_no_quotes(
1056 substr($this_bytes, 0, $this_expected_len));
1057 $overflow_msg_pattern = qr/\QMalformed UTF-8 character:\E
1058 \Q $overflow_msg_pattern\E
1059 \Q (overflows)\E/x;
1060 push @expected_malformation_return_flags, $::UTF8_GOT_OVERFLOW;
717dd9f9 1061 $allow_flags |= $::UTF8_ALLOW_OVERFLOW if $malformed_allow_type;
4816e15f
KW
1062 }
1063
1064 # And we can create the malformation-related text for the the test
1065 # names we eventually will generate.
1066 my $malformations_name = "";
1067 if (@malformation_names) {
717dd9f9
KW
1068 $malformations_name .= "dis" unless $malformed_allow_type;
1069 $malformations_name .= "allowed ";
4816e15f
KW
1070 $malformations_name .= "malformation";
1071 $malformations_name .= "s" if @malformation_names > 1;
1072 $malformations_name .= ": ";
1073 $malformations_name .= join "/", @malformation_names;
1074 $malformations_name = " ($malformations_name)";
1075 }
1076
1077 # Done setting up the malformation related stuff
1078
8139d291
KW
1079 foreach my $do_disallow (0, 1) {
1080 next if $skip_most_tests && ! $do_disallow;
6aa905cf 1081
69485e19
KW
1082 # We classify the warnings into certain "interesting" types,
1083 # described later
1084 foreach my $warning_type (0..4) {
1085 next if $skip_most_tests && $warning_type != 1;
1086 foreach my $use_warn_flag (0, 1) {
1087 next if $skip_most_tests && ! $use_warn_flag;
1088
8f79178b
KW
1089 # Finally, here is the inner loop
1090
69485e19
KW
1091 my $this_utf8n_flag_to_warn = $utf8n_flag_to_warn;
1092 my $this_utf8n_flag_to_disallow = $utf8n_flag_to_disallow;
1093 my $this_uvchr_flag_to_warn = $uvchr_flag_to_warn;
1094 my $this_uvchr_flag_to_disallow = $uvchr_flag_to_disallow;
1095
1096 my $eval_warn;
1097 my $expect_regular_warnings;
1098 my $expect_warnings_for_malformed;
1099 my $expect_warnings_for_overflow;
1100
1101 if ($warning_type == 0) {
1102 $eval_warn = "use warnings; no warnings 'deprecated'";
1103 $expect_regular_warnings = $use_warn_flag;
717dd9f9
KW
1104
1105 # We ordinarily expect overflow warnings here. But it
1106 # is somewhat more complicated, and the final
1107 # determination is deferred to one place in the filw
1108 # where we handle overflow.
69485e19 1109 $expect_warnings_for_overflow = 1;
717dd9f9
KW
1110
1111 # We would ordinarily expect malformed warnings in
1112 # this case, but not if malformations are allowed.
1113 $expect_warnings_for_malformed
1114 = $malformed_allow_type == 0;
6aa905cf 1115 }
69485e19
KW
1116 elsif ($warning_type == 1) {
1117 $eval_warn = "no warnings";
1118 $expect_regular_warnings = 0;
1119 $expect_warnings_for_overflow = 0;
1120 $expect_warnings_for_malformed = 0;
1121 }
1122 elsif ($warning_type == 2) {
1123 $eval_warn = "no warnings; use warnings 'utf8'";
1124 $expect_regular_warnings = $use_warn_flag;
1125 $expect_warnings_for_overflow = 1;
717dd9f9
KW
1126 $expect_warnings_for_malformed
1127 = $malformed_allow_type == 0;
69485e19
KW
1128 }
1129 elsif ($warning_type == 3) {
1130 $eval_warn = "no warnings; use warnings"
e4e140b4
KW
1131 . " '$controlling_warning_category'";
1132 $expect_regular_warnings = $use_warn_flag;
69485e19 1133 $expect_warnings_for_overflow
e4e140b4 1134 = $controlling_warning_category eq 'non_unicode';
69485e19
KW
1135 $expect_warnings_for_malformed = 0;
1136 }
1137 elsif ($warning_type == 4) { # Like type 3, but uses the
d044b7a7 1138 # PERL_EXTENDED flags
69485e19 1139 # The complement flags were set up so that the
d044b7a7 1140 # PERL_EXTENDED flags have been tested that they don't
69485e19
KW
1141 # trigger wrongly for too small code points. And the
1142 # flags have been set up so that those small code
1143 # points are tested for being above Unicode. What's
1144 # left to test is that the large code points do
d044b7a7 1145 # trigger the PERL_EXTENDED flags.
6d736463 1146 next if ! requires_extended_utf8($allowed_uv);
e4e140b4 1147 next if $controlling_warning_category ne 'non_unicode';
69485e19
KW
1148 $eval_warn = "no warnings; use warnings 'non_unicode'";
1149 $expect_regular_warnings = 1;
1150 $expect_warnings_for_overflow = 1;
1151 $expect_warnings_for_malformed = 0;
d044b7a7 1152 $this_utf8n_flag_to_warn = $::UTF8_WARN_PERL_EXTENDED;
69485e19 1153 $this_utf8n_flag_to_disallow
d044b7a7
KW
1154 = $::UTF8_DISALLOW_PERL_EXTENDED;
1155 $this_uvchr_flag_to_warn
1156 = $::UNICODE_WARN_PERL_EXTENDED;
69485e19 1157 $this_uvchr_flag_to_disallow
d044b7a7 1158 = $::UNICODE_DISALLOW_PERL_EXTENDED;
601e92f1
KW
1159 }
1160 else {
69485e19 1161 die "Unexpected warning type '$warning_type'";
601e92f1
KW
1162 }
1163
69485e19
KW
1164 # We only need to test the case where all warnings are
1165 # enabled (type 0) to see if turning off the warning flag
1166 # causes things to not be output. If those pass, then
1167 # turning on some sub-category of warnings, or turning off
1168 # warnings altogether are extremely likely to not output
1169 # warnings either, given how the warnings subsystem is
1170 # supposed to work, and this file assumes it does work.
1171 next if $warning_type != 0 && ! $use_warn_flag;
1172
1173 # The convention is that the 'got' flag is the same value
1174 # as the disallow one. If this were violated, the tests
1175 # here should start failing.
1176 my $return_flag = $this_utf8n_flag_to_disallow;
1177
e4e140b4
KW
1178 # If we aren't expecting warnings/disallow for this, turn
1179 # on all the other flags. That makes sure that they all
1180 # are independent of this flag, and so we don't need to
1181 # test them individually.
57ff5f59
KW
1182 my $this_warning_flags
1183 = ($use_warn_flag)
1184 ? $this_utf8n_flag_to_warn
1185 : ($overlong_is_in_perl_extended_utf8
1186 ? ($utf8n_flag_to_warn_complement
1187 & ~$::UTF8_WARN_PERL_EXTENDED)
1188 : $utf8n_flag_to_warn_complement);
1189 my $this_disallow_flags
1190 = ($do_disallow)
1191 ? $this_utf8n_flag_to_disallow
1192 : ($overlong_is_in_perl_extended_utf8
1193 ? ($utf8n_flag_to_disallow_complement
1194 & ~$::UTF8_DISALLOW_PERL_EXTENDED)
1195 : $utf8n_flag_to_disallow_complement);
6aa905cf 1196 my $expected_uv = $allowed_uv;
717dd9f9 1197 my $this_uv_string = $uv_string;
6aa905cf 1198
4816e15f
KW
1199 my @expected_return_flags
1200 = @expected_malformation_return_flags;
69485e19 1201 my @expected_warnings;
4816e15f 1202 push @expected_warnings, @expected_malformation_warnings
69485e19
KW
1203 if $expect_warnings_for_malformed;
1204
69485e19
KW
1205 # The overflow malformation is done differently than other
1206 # malformations. It comes from manually typed tests in
1207 # the test array, but it also is above Unicode and uses
1208 # Perl extended UTF-8, so affects some of the flags being
1209 # tested. We now make it be treated like one of the other
1210 # generated malformations.
1211 if ($will_overflow) {
1212
1213 # An overflow is (way) above Unicode, and overrides
1214 # everything else.
1215 $expect_regular_warnings = 0;
1216
717dd9f9
KW
1217 # Earlier, we tentatively calculated whether this
1218 # should emit a message or not. It's tentative
1219 # because, even if we ordinarily would output it, we
1220 # don't if malformations are allowed -- except an
d044b7a7 1221 # overflow is also a SUPER and PERL_EXTENDED, and if
717dd9f9
KW
1222 # warnings for those are enabled, the overflow
1223 # warning does get raised.
1224 if ( $expect_warnings_for_overflow
1225 && ( $malformed_allow_type == 0
1226 || ( $this_warning_flags
1227 & ($::UTF8_WARN_SUPER
d044b7a7 1228 |$::UTF8_WARN_PERL_EXTENDED))))
717dd9f9 1229 {
4816e15f 1230 push @expected_warnings, $overflow_msg_pattern;
6aa905cf 1231 }
69485e19
KW
1232 }
1233
1234 # It may be that the malformations have shortened the
1235 # amount of input we look at so much that we can't tell
1236 # what the category the code point was in. Otherwise, set
1237 # up the expected return flags based on the warnings and
1238 # disallowments.
1239 if ($this_expected_len < $this_needed_to_discern_len) {
1240 $expect_regular_warnings = 0;
1241 }
1242 elsif ( ($this_warning_flags & $this_utf8n_flag_to_warn)
1243 || ( $this_disallow_flags
1244 & $this_utf8n_flag_to_disallow))
1245 {
1246 push @expected_return_flags, $return_flag;
1247 }
1248
1249 # Finish setting up the expected warning.
1250 if ($expect_regular_warnings) {
1251
1252 # So far the array contains warnings generated by
1253 # malformations. Add the expected regular one.
57ff5f59 1254 unshift @expected_warnings, $this_cp_message_qr;
69485e19
KW
1255
1256 # But it may need to be modified, because either of
1257 # these malformations means we can't determine the
1258 # expected code point.
57ff5f59
KW
1259 if ( $short || $unexpected_noncont
1260 || $dont_use_overlong_cp)
1261 {
69485e19
KW
1262 my $first_byte = substr($this_bytes, 0, 1);
1263 $expected_warnings[0] = display_bytes(
1264 substr($this_bytes, 0, $this_expected_len));
1265 $expected_warnings[0]
1266 = qr/[Aa]\Qny UTF-8 sequence that starts with\E
1267 \Q $expected_warnings[0]\E
57ff5f59 1268 \Q $this_non_cp_trailing_text\E/x;
69485e19
KW
1269 }
1270 }
6aa905cf 1271
717dd9f9
KW
1272 # Is effectively disallowed if we've set up a malformation
1273 # (unless malformations are allowed), even if the flag
1274 # indicates it is allowed. Fix up test name to indicate
1275 # this as well
1276 my $disallowed = 0;
1277 if ( $this_disallow_flags & $this_utf8n_flag_to_disallow
1278 && $this_expected_len >= $this_needed_to_discern_len)
1279 {
1280 $disallowed = 1;
1281 }
1282 if ($malformations_name) {
1283 if ($malformed_allow_type == 0) {
1284 $disallowed = 1;
1285 }
1286 elsif ($malformed_allow_type == 1) {
1287
1288 # Even if allowed, the malformation returns the
1289 # REPLACEMENT CHARACTER.
1290 $expected_uv = 0xFFFD;
1291 $this_uv_string = "0xFFFD"
1292 }
1293 }
1294
6aa905cf 1295 my $this_name = "utf8n_to_uvchr_error() $testname: "
69485e19
KW
1296 . (($disallowed)
1297 ? 'disallowed'
1298 : 'allowed');
6aa905cf 1299 $this_name .= ", $eval_warn";
69485e19
KW
1300 $this_name .= ", " . (( $this_warning_flags
1301 & $this_utf8n_flag_to_warn)
1302 ? 'with flag for raising warnings'
1303 : 'no flag for raising warnings');
1304 $this_name .= $malformations_name;
db0f09e6 1305
8f79178b 1306 # Do the actual test using an eval
9cdc3054 1307 undef @warnings_gotten;
6aa905cf 1308 my $ret_ref;
717dd9f9
KW
1309 my $this_flags
1310 = $allow_flags|$this_warning_flags|$this_disallow_flags;
6aa905cf
KW
1311 my $eval_text = "$eval_warn; \$ret_ref"
1312 . " = test_utf8n_to_uvchr_error("
69485e19 1313 . "'$this_bytes', $this_length, $this_flags)";
6aa905cf 1314 eval "$eval_text";
69485e19 1315 if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
6aa905cf 1316 {
a8ee5133
KW
1317 diag "\$@='$@'; call was: "
1318 . utf8n_display_call($eval_text);
6aa905cf
KW
1319 next;
1320 }
1321 if ($disallowed) {
d402d77f 1322 is($ret_ref->[0], 0, " And returns 0")
a8ee5133 1323 or diag "Call was: " . utf8n_display_call($eval_text);
6aa905cf
KW
1324 }
1325 else {
1326 is($ret_ref->[0], $expected_uv,
d402d77f 1327 " And returns expected uv: "
717dd9f9 1328 . $this_uv_string)
a8ee5133 1329 or diag "Call was: " . utf8n_display_call($eval_text);
6aa905cf
KW
1330 }
1331 is($ret_ref->[1], $this_expected_len,
d402d77f 1332 " And returns expected length:"
6aa905cf 1333 . " $this_expected_len")
a8ee5133 1334 or diag "Call was: " . utf8n_display_call($eval_text);
6aa905cf 1335
9cdc3054 1336 my $returned_flags = $ret_ref->[2];
6aa905cf 1337
9cdc3054 1338 for (my $i = @expected_return_flags - 1; $i >= 0; $i--) {
69485e19
KW
1339 if ($expected_return_flags[$i] & $returned_flags) {
1340 if ($expected_return_flags[$i]
d044b7a7 1341 == $::UTF8_GOT_PERL_EXTENDED)
69485e19
KW
1342 {
1343 pass(" Expected and got return flag for"
d044b7a7 1344 . " PERL_EXTENDED");
69485e19
KW
1345 }
1346 # The first entries in this are
1347 # malformations
1348 elsif ($i > @malformation_names - 1) {
1349 pass(" Expected and got return flag"
e4e140b4 1350 . " for " . $controlling_warning_category);
69485e19
KW
1351 }
1352 else {
1353 pass(" Expected and got return flag for "
1354 . $malformation_names[$i]
1355 . " malformation");
1356 }
9cdc3054 1357 $returned_flags &= ~$expected_return_flags[$i];
69485e19 1358 splice @expected_return_flags, $i, 1;
6aa905cf 1359 }
6aa905cf 1360 }
6aa905cf 1361
69485e19
KW
1362 is($returned_flags, 0,
1363 " Got no unexpected return flags")
1364 or diag "The unexpected flags gotten were: "
5722c46d
KW
1365 . (flags_to_text($returned_flags,
1366 \@utf8n_flags_to_text)
69485e19
KW
1367 # We strip off any prefixes from the flag
1368 # names
1369 =~ s/ \b [A-Z] _ //xgr);
1370 is (scalar @expected_return_flags, 0,
1371 " Got all expected return flags")
1372 or diag "The expected flags not gotten were: "
1373 . (flags_to_text(eval join("|",
1374 @expected_return_flags),
1375 \@utf8n_flags_to_text)
1376 # We strip off any prefixes from the flag
1377 # names
5722c46d 1378 =~ s/ \b [A-Z] _ //xgr);
9cdc3054 1379
69485e19
KW
1380 do_warnings_test(@expected_warnings)
1381 or diag "Call was: " . utf8n_display_call($eval_text);
1382 undef @warnings_gotten;
6aa905cf
KW
1383
1384 # Check CHECK_ONLY results when the input is
1385 # disallowed. Do this when actually disallowed,
69485e19 1386 # not just when the $this_disallow_flags is set
6aa905cf 1387 if ($disallowed) {
69485e19
KW
1388 my $this_flags = $this_disallow_flags|$::UTF8_CHECK_ONLY;
1389 my $eval_text = "use warnings; no warnings 'deprecated'; \$ret_ref ="
1390 . " test_utf8n_to_uvchr_error('"
1391 . "$this_bytes', $this_length,"
1392 . " $this_flags)";
1393 eval $eval_text;
a8ee5133
KW
1394 if (! ok ("$@ eq ''",
1395 " And eval succeeded with CHECK_ONLY"))
1396 {
1397 diag "\$@='$@'; Call was: "
1398 . utf8n_display_call($eval_text);
1399 next;
1400 }
d402d77f 1401 is($ret_ref->[0], 0, " CHECK_ONLY: Returns 0")
a8ee5133 1402 or diag "Call was: " . utf8n_display_call($eval_text);
6aa905cf 1403 is($ret_ref->[1], -1,
d402d77f 1404 " CHECK_ONLY: returns -1 for length")
a8ee5133 1405 or diag "Call was: " . utf8n_display_call($eval_text);
9cdc3054 1406 if (! is(scalar @warnings_gotten, 0,
d402d77f 1407 " CHECK_ONLY: no warnings generated"))
6aa905cf 1408 {
a8ee5133 1409 diag "Call was: " . utf8n_display_call($eval_text);
9cdc3054 1410 output_warnings(@warnings_gotten);
6aa905cf
KW
1411 }
1412 }
1413
1414 # Now repeat some of the above, but for
1415 # uvchr_to_utf8_flags(). Since this comes from an
8f79178b
KW
1416 # existing code point, it hasn't overflowed, and isn't
1417 # malformed.
69485e19
KW
1418 next if @malformation_names;
1419
1420 $this_warning_flags = ($use_warn_flag)
1421 ? $this_uvchr_flag_to_warn
1422 : 0;
1423 $this_disallow_flags = ($do_disallow)
1424 ? $this_uvchr_flag_to_disallow
1425 : 0;
1426
1427 $disallowed = $this_disallow_flags
1428 & $this_uvchr_flag_to_disallow;
1429 $this_name .= ", " . (( $this_warning_flags
1430 & $this_utf8n_flag_to_warn)
1431 ? 'with flag for raising warnings'
1432 : 'no flag for raising warnings');
6aa905cf
KW
1433
1434 $this_name = "uvchr_to_utf8_flags() $testname: "
69485e19 1435 . (($disallowed)
6aa905cf 1436 ? 'disallowed'
6aa905cf
KW
1437 : 'allowed');
1438 $this_name .= ", $eval_warn";
69485e19
KW
1439 $this_name .= ", " . (( $this_warning_flags
1440 & $this_uvchr_flag_to_warn)
6aa905cf
KW
1441 ? 'with warning flag'
1442 : 'no warning flag');
1443
9cdc3054 1444 undef @warnings_gotten;
6aa905cf 1445 my $ret;
69485e19 1446 $this_flags = $this_warning_flags|$this_disallow_flags;
6aa905cf
KW
1447 $eval_text = "$eval_warn; \$ret ="
1448 . " test_uvchr_to_utf8_flags("
d884ea32 1449 . "$allowed_uv, $this_flags)";
6aa905cf
KW
1450 eval "$eval_text";
1451 if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
1452 {
d884ea32
KW
1453 diag "\$@='$@'; call was: "
1454 . uvchr_display_call($eval_text);
6aa905cf
KW
1455 next;
1456 }
1457 if ($disallowed) {
d402d77f 1458 is($ret, undef, " And returns undef")
d884ea32 1459 or diag "Call was: " . uvchr_display_call($eval_text);
6aa905cf
KW
1460 }
1461 else {
d402d77f 1462 is($ret, $this_bytes, " And returns expected string")
d884ea32 1463 or diag "Call was: " . uvchr_display_call($eval_text);
6aa905cf 1464 }
69485e19
KW
1465
1466 do_warnings_test(@expected_warnings)
1467 or diag "Call was: " . uvchr_display_call($eval_text);
6aa905cf
KW
1468 }
1469 }
717dd9f9 1470 }
6aa905cf
KW
1471 }
1472 }
8f79178b 1473 }
6aa905cf
KW
1474 }
1475}
1476
1477done_testing;