This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
utf8.c: Fix bugs with overlongs combined with other malformations.
[perl5.git] / ext / XS-APItest / t / utf8_warn_base.pl
CommitLineData
6aa905cf
KW
1#!perl -w
2
3# This is a base file to be used by various .t's in its directory
8f79178b
KW
4# It tests various code points that are "problematic", and verifies that the
5# correct warnings/flags etc are generated when using them. It also takes the
6# UTF-8 for some of them and perturbs it to be malformed in various ways, and
7# tests that this gets appropriately detected.
6aa905cf
KW
8
9use strict;
10use Test::More;
11
12BEGIN {
13 use_ok('XS::APItest');
14 require 'charset_tools.pl';
15 require './t/utf8_setup.pl';
16};
17
18$|=1;
19
20no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
21 # machines, and that is tested elsewhere
22
23use XS::APItest;
24use Data::Dumper;
25
9cdc3054 26my @warnings_gotten;
6aa905cf
KW
27
28use warnings 'utf8';
8d6f1506
KW
29local $SIG{__WARN__} = sub { my @copy = @_;
30 push @warnings_gotten, map { chomp; $_ } @copy;
31 };
c7f9e4bb
KW
32my @tests;
33{
34 no warnings qw(portable overflow);
35 @tests = (
8f79178b
KW
36 # $testname,
37 # $bytes, UTF-8 string
38 # $allowed_uv, code point $bytes evaluates to; -1 if
39 # overflows
40 # $needed_to_discern_len optional, how long an initial substring do
41 # we need to tell that the string must be for
42 # a code point in the category it falls in,
43 # like being a surrogate; 0 indicates we need
44 # the whole string. Some categories have a
45 # default that is used if this is omitted.
46 [ "lowest surrogate",
47 (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
48 0xD800,
49 ],
50 [ "a middle surrogate",
51 (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
52 0xD90D,
53 ],
54 [ "highest surrogate",
55 (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
56 0xDFFF,
57 ],
8f79178b
KW
58 [ "first of 32 consecutive non-character code points",
59 (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
60 0xFDD0,
61 ],
62 [ "a mid non-character code point of the 32 consecutive ones",
63 (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
64 0xFDE0,
65 ],
66 [ "final of 32 consecutive non-character code points",
67 (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
68 0xFDEF,
69 ],
70 [ "non-character code point U+FFFE",
71 (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
72 0xFFFE,
73 ],
74 [ "non-character code point U+FFFF",
75 (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
76 0xFFFF,
77 ],
78 [ "non-character code point U+1FFFE",
79 (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
80 0x1FFFE,
81 ],
82 [ "non-character code point U+1FFFF",
83 (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
84 0x1FFFF,
85 ],
86 [ "non-character code point U+2FFFE",
87 (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
88 0x2FFFE,
89 ],
90 [ "non-character code point U+2FFFF",
91 (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
92 0x2FFFF,
93 ],
94 [ "non-character code point U+3FFFE",
95 (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
96 0x3FFFE,
97 ],
98 [ "non-character code point U+3FFFF",
99 (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
100 0x3FFFF,
101 ],
102 [ "non-character code point U+4FFFE",
103 (isASCII)
104 ? "\xf1\x8f\xbf\xbe"
105 : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
106 0x4FFFE,
107 ],
108 [ "non-character code point U+4FFFF",
109 (isASCII)
110 ? "\xf1\x8f\xbf\xbf"
111 : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
112 0x4FFFF,
113 ],
114 [ "non-character code point U+5FFFE",
115 (isASCII)
116 ? "\xf1\x9f\xbf\xbe"
117 : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
118 0x5FFFE,
119 ],
120 [ "non-character code point U+5FFFF",
121 (isASCII)
122 ? "\xf1\x9f\xbf\xbf"
123 : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
124 0x5FFFF,
125 ],
126 [ "non-character code point U+6FFFE",
127 (isASCII)
128 ? "\xf1\xaf\xbf\xbe"
129 : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
130 0x6FFFE,
131 ],
132 [ "non-character code point U+6FFFF",
133 (isASCII)
134 ? "\xf1\xaf\xbf\xbf"
135 : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
136 0x6FFFF,
137 ],
138 [ "non-character code point U+7FFFE",
139 (isASCII)
140 ? "\xf1\xbf\xbf\xbe"
141 : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
142 0x7FFFE,
143 ],
144 [ "non-character code point U+7FFFF",
145 (isASCII)
146 ? "\xf1\xbf\xbf\xbf"
147 : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
148 0x7FFFF,
149 ],
150 [ "non-character code point U+8FFFE",
151 (isASCII)
152 ? "\xf2\x8f\xbf\xbe"
153 : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
154 0x8FFFE,
155 ],
156 [ "non-character code point U+8FFFF",
157 (isASCII)
158 ? "\xf2\x8f\xbf\xbf"
159 : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
160 0x8FFFF,
161 ],
162 [ "non-character code point U+9FFFE",
163 (isASCII)
164 ? "\xf2\x9f\xbf\xbe"
165 : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
166 0x9FFFE,
167 ],
168 [ "non-character code point U+9FFFF",
169 (isASCII)
170 ? "\xf2\x9f\xbf\xbf"
171 : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
172 0x9FFFF,
173 ],
174 [ "non-character code point U+AFFFE",
175 (isASCII)
176 ? "\xf2\xaf\xbf\xbe"
177 : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
178 0xAFFFE,
179 ],
180 [ "non-character code point U+AFFFF",
181 (isASCII)
182 ? "\xf2\xaf\xbf\xbf"
183 : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
184 0xAFFFF,
185 ],
186 [ "non-character code point U+BFFFE",
187 (isASCII)
188 ? "\xf2\xbf\xbf\xbe"
189 : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
190 0xBFFFE,
191 ],
192 [ "non-character code point U+BFFFF",
193 (isASCII)
194 ? "\xf2\xbf\xbf\xbf"
195 : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
196 0xBFFFF,
197 ],
198 [ "non-character code point U+CFFFE",
199 (isASCII)
200 ? "\xf3\x8f\xbf\xbe"
201 : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
202 0xCFFFE,
203 ],
204 [ "non-character code point U+CFFFF",
205 (isASCII)
206 ? "\xf3\x8f\xbf\xbf"
207 : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
208 0xCFFFF,
209 ],
210 [ "non-character code point U+DFFFE",
211 (isASCII)
212 ? "\xf3\x9f\xbf\xbe"
213 : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
214 0xDFFFE,
215 ],
216 [ "non-character code point U+DFFFF",
217 (isASCII)
218 ? "\xf3\x9f\xbf\xbf"
219 : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
220 0xDFFFF,
221 ],
222 [ "non-character code point U+EFFFE",
223 (isASCII)
224 ? "\xf3\xaf\xbf\xbe"
225 : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
226 0xEFFFE,
227 ],
228 [ "non-character code point U+EFFFF",
229 (isASCII)
230 ? "\xf3\xaf\xbf\xbf"
231 : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
232 0xEFFFF,
233 ],
234 [ "non-character code point U+FFFFE",
235 (isASCII)
236 ? "\xf3\xbf\xbf\xbe"
237 : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
238 0xFFFFE,
239 ],
240 [ "non-character code point U+FFFFF",
241 (isASCII)
242 ? "\xf3\xbf\xbf\xbf"
243 : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
244 0xFFFFF,
245 ],
246 [ "non-character code point U+10FFFE",
247 (isASCII)
248 ? "\xf4\x8f\xbf\xbe"
249 : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
250 0x10FFFE,
251 ],
252 [ "non-character code point U+10FFFF",
253 (isASCII)
254 ? "\xf4\x8f\xbf\xbf"
255 : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
256 0x10FFFF,
257 ],
7c9f4ec6
KW
258 [ "first non_unicode",
259 (isASCII)
260 ? "\xf4\x90\x80\x80"
261 : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
262 0x110000,
263 2,
264 ],
265 [ "non_unicode whose first byte tells that",
266 (isASCII)
267 ? "\xf5\x80\x80\x80"
268 : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
269 (isASCII) ? 0x140000 : 0x200000,
270 1,
271 ],
48ceb60a 272 [ "lowest 32 bit code point",
8f79178b
KW
273 (isASCII)
274 ? "\xfe\x82\x80\x80\x80\x80\x80"
275 : I8_to_native(
276 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
8f79178b
KW
277 0x80000000,
278 (isASCII) ? 1 : 8,
279 ],
280 [ "highest 32 bit code point",
281 (isASCII)
282 ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
283 : I8_to_native(
284 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
285 0xFFFFFFFF,
286 (isASCII) ? 1 : 8,
287 ],
f4da64d4
KW
288 [ "Lowest 33 bit code point",
289 (isASCII)
290 ? "\xfe\x84\x80\x80\x80\x80\x80"
291 : I8_to_native(
292 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"),
293 ($::is64bit) ? 0x100000000 : -1, # Overflows on 32-bit systems
294 (isASCII && ! $::is64bit) ? 2 : 1,
295 ],
8f79178b
KW
296 );
297
298 if (! $::is64bit) {
299 if (isASCII) {
8f79178b 300 push @tests,
c81d4d83
KW
301 [ "overflow that old algorithm failed to detect",
302 "\xfe\x86\x80\x80\x80\x80\x80",
303 -1,
304 2,
8f79178b
KW
305 ];
306 }
307 }
c81d4d83
KW
308
309 if ($::is64bit) {
ba627a0b
KW
310 push @tests,
311 [ "highest 64 bit code point",
312 (isASCII)
313 ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
314 : I8_to_native(
315 "\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"),
316 0xFFFFFFFFFFFFFFFF,
317 (isASCII) ? 1 : 2,
318 ],
319 [ "first 65 bit code point",
320 (isASCII)
321 ? "\xff\x80\x9f\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80"
322 : I8_to_native(
323 "\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
324 -1,
325 (isASCII) ? 3 : 2,
326 ];
c81d4d83
KW
327 if (isASCII) {
328 push @tests,
f6104ed7
KW
329 [ "Lowest code point requiring 13 bytes to represent",
330 "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
331 0x1000000000,
332 1,
333 ],
c81d4d83
KW
334 [ "overflow that old algorithm failed to detect",
335 "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
336 -1,
337 3,
338 ];
339 }
340 else {
8f79178b
KW
341 push @tests, # These could falsely show wrongly in a naive
342 # implementation
343 [ "requires at least 32 bits",
344 I8_to_native(
6aa905cf 345 "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
8f79178b
KW
346 0x800000000,
347 7,
348 ],
349 [ "requires at least 32 bits",
350 I8_to_native(
6aa905cf 351 "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
8f79178b
KW
352 0x10000000000,
353 6,
354 ],
355 [ "requires at least 32 bits",
356 I8_to_native(
6aa905cf 357 "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
8f79178b
KW
358 0x200000000000,
359 5,
360 ],
361 [ "requires at least 32 bits",
362 I8_to_native(
6aa905cf 363 "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
8f79178b
KW
364 0x4000000000000,
365 4,
366 ],
367 [ "requires at least 32 bits",
368 I8_to_native(
6aa905cf 369 "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
8f79178b
KW
370 0x80000000000000,
371 3,
372 ],
373 [ "requires at least 32 bits",
374 I8_to_native(
6aa905cf 375 "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
8f79178b
KW
376 0x1000000000000000,
377 2,
378 ];
379 }
6aa905cf 380 }
c7f9e4bb 381}
6aa905cf 382
5722c46d
KW
383sub flags_to_text($$)
384{
385 my ($flags, $flags_to_text_ref) = @_;
386
387 # Returns a string containing a mnemonic representation of the bits that
388 # are set in the $flags. These are assumed to be flag bits. The return
389 # looks like "FOO|BAR|BAZ". The second parameter is a reference to an
390 # array that gives the textual representation of all the possible flags.
391 # Element 0 is the text for the bit 0 flag; element 1 for bit 1; .... If
392 # no bits at all are set the string "0" is returned;
393
394 my @flag_text;
395 my $shift = 0;
396
397 return "0" if $flags == 0;
398
399 while ($flags) {
400 #diag sprintf "%x", $flags;
401 if ($flags & 1) {
402 push @flag_text, $flags_to_text_ref->[$shift];
403 }
404 $shift++;
405 $flags >>= 1;
406 }
407
408 return join "|", @flag_text;
409}
410
411# Possible flag returns from utf8n_to_uvchr_error(). These should have G_,
412# instead of A_, D_, but the prefixes will be used in a a later commit, so
413# minimize churn by having them here.
414my @utf8n_flags_to_text = ( qw(
415 A_EMPTY
416 A_CONTINUATION
417 A_NON_CONTINUATION
418 A_SHORT
419 A_LONG
420 A_LONG_AND_ITS_VALUE
421 PLACEHOLDER
422 A_OVERFLOW
423 D_SURROGATE
424 W_SURROGATE
425 D_NONCHAR
426 W_NONCHAR
427 D_SUPER
428 W_SUPER
429 D_ABOVE_31_BIT
430 W_ABOVE_31_BIT
431 CHECK_ONLY
432 NO_CONFIDENCE_IN_CURLEN_
433 ) );
434
a8ee5133
KW
435sub utf8n_display_call($)
436{
437 # Converts an eval string that calls test_utf8n_to_uvchr into a more human
438 # readable form, and returns it. Doesn't work if the byte string contains
439 # an apostrophe. The return will look something like:
440 # test_utf8n_to_uvchr_error('$bytes', $length, $flags)
441 #diag $_[0];
442
443 $_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x;
444 my $text1 = $1; # Everything before the byte string
445 my $bytes = $2;
446 my $text2 = $3; # Includes the length
447 my $flags = $4;
448
449 return $text1
450 . display_bytes($bytes)
451 . $text2
452 . flags_to_text($flags, \@utf8n_flags_to_text)
453 . ')';
454}
455
d884ea32
KW
456sub uvchr_display_call($)
457{
458 # Converts an eval string that calls test_uvchr_to_utf8 into a more human
459 # readable form, and returns it. The return will look something like:
460 # test_uvchr_to_utf8n_flags($uv, $flags)
461 #diag $_[0];
462
463 my @flags_to_text = ( qw(
464 W_SURROGATE
465 W_NONCHAR
466 W_SUPER
467 W_ABOVE_31_BIT
468 D_SURROGATE
469 D_NONCHAR
470 D_SUPER
471 D_ABOVE_31_BIT
472 ) );
473
474 $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x;
475 my $text = $1;
476 my $cp = sprintf "%X", $2;
477 my $flags = $3;
478
479 return "${text}0x$cp, " . flags_to_text($flags, \@flags_to_text) . ')';
480}
481
69485e19
KW
482sub do_warnings_test(@)
483{
484 my @expected_warnings = @_;
485
486 # Compares the input expected warnings array with @warnings_gotten,
487 # generating a pass for each found, removing it from @warnings_gotten.
488 # Any discrepancies generate test failures. Returns TRUE if no
489 # discrepcancies; otherwise FALSE.
490
491 my $succeeded = 1;
492
493 if (@expected_warnings == 0) {
494 if (! is(@warnings_gotten, 0, " Expected and got no warnings")) {
495 output_warnings(@warnings_gotten);
496 $succeeded = 0;
497 }
498 return $succeeded;
499 }
500
501 # Check that we got all the expected warnings,
502 # removing each one found
503 WARNING:
504 foreach my $expected (@expected_warnings) {
505 foreach (my $i = 0; $i < @warnings_gotten; $i++) {
506 if ($warnings_gotten[$i] =~ $expected) {
507 pass(" Expected and got warning: "
508 . " $warnings_gotten[$i]");
509 splice @warnings_gotten, $i, 1;
510 next WARNING;
511 }
512 }
513 fail(" Expected a warning that matches "
514 . $expected . " but didn't get it");
515 $succeeded = 0;
516 }
517
518 if (! is(@warnings_gotten, 0, " Got no unexpected warnings")) {
519 output_warnings(@warnings_gotten);
520 $succeeded = 0;
521 }
522
523 return $succeeded;
524}
525
6aa905cf
KW
526# This test is split into this number of files.
527my $num_test_files = $ENV{TEST_JOBS} || 1;
528$num_test_files = 10 if $num_test_files > 10;
529
530my $test_count = -1;
531foreach my $test (@tests) {
532 $test_count++;
533 next if $test_count % $num_test_files != $::TEST_CHUNK;
534
d42d7565 535 my ($testname, $bytes, $allowed_uv, $needed_to_discern_len) = @$test;
6aa905cf
KW
536
537 my $length = length $bytes;
04f42bf6 538 my $will_overflow = $allowed_uv < 0;
6aa905cf 539
2c511c58 540 my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv);
c0e5ee4b 541 my $display_bytes = display_bytes($bytes);
2c511c58 542
d42d7565 543 my $controlling_warning_category;
af816908
KW
544 my $utf8n_flag_to_warn;
545 my $utf8n_flag_to_disallow;
546 my $uvchr_flag_to_warn;
547 my $uvchr_flag_to_disallow;
b7e1f4b2 548
e4e140b4
KW
549 # We want to test that the independent flags are actually independent.
550 # For example, that a surrogate doesn't trigger a non-character warning,
551 # and conversely, turning off an above-Unicode flag doesn't suppress a
552 # surrogate warning. Earlier versions of this file used nested loops to
553 # test all possible combinations. But that creates lots of tests, making
554 # this run too long. What is now done instead is to use the complement of
555 # the category we are testing to greatly reduce the combinatorial
556 # explosion. For example, if we have a surrogate and we aren't expecting
557 # a warning about it, we set all the flags for non-surrogates to raise
558 # warnings. If one shows up, it indicates the flags aren't independent.
559 my $utf8n_flag_to_warn_complement;
560 my $utf8n_flag_to_disallow_complement;
561 my $uvchr_flag_to_warn_complement;
562 my $uvchr_flag_to_disallow_complement;
563
bf422d6a
KW
564 # Many of the code points being tested are middling in that if code point
565 # edge cases work, these are very likely to as well. Because this test
566 # file takes a while to execute, we skip testing the edge effects of code
567 # points deemed middling, while testing their basics and continuing to
568 # fully test the non-middling code points.
569 my $skip_most_tests = 0;
570
601e92f1
KW
571 my $cp_message_qr; # Pattern that matches the message raised when
572 # that message contains the problematic code
573 # point. The message is the same (currently) both
574 # when going from/to utf8.
575 my $non_cp_trailing_text; # The suffix text when the message doesn't
576 # contain a code point. (This is a result of
577 # some sort of malformation that means we
578 # can't get an exact code poin
579
c0e5ee4b
KW
580 # Is this test malformed from the beginning? If so, we know to generally
581 # expect that the tests will show it isn't valid.
582 my $initially_malformed = 0;
583
af816908
KW
584 if ($will_overflow || $allowed_uv > 0x10FFFF) {
585
e4e140b4 586 # Set the SUPER flags; later, we test for ABOVE_31_BIT as well.
af816908
KW
587 $utf8n_flag_to_warn = $::UTF8_WARN_SUPER;
588 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER;
589 $uvchr_flag_to_warn = $::UNICODE_WARN_SUPER;
590 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;;
591
e4e140b4
KW
592 # Below, we add the flags for non-above-31 bit to the code points that
593 # don't fit that category. Special tests are done for this category
594 # in the inner loop.
595 $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR
596 |$::UTF8_WARN_SURROGATE;
597 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
598 |$::UTF8_DISALLOW_SURROGATE;
599 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR
600 |$::UNICODE_WARN_SURROGATE;
601 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
602 |$::UNICODE_DISALLOW_SURROGATE;
603 $controlling_warning_category = 'non_unicode';
604
8f79178b 605 if ($will_overflow) { # This is realy a malformation
69485e19 606 $non_cp_trailing_text = "if you see this, there is an error";
601e92f1 607 $cp_message_qr = qr/\Q$non_cp_trailing_text\E/;
c0e5ee4b 608 $initially_malformed = 1;
af816908
KW
609 }
610 elsif ($allowed_uv > 0x7FFFFFFF) {
601e92f1
KW
611 $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
612 \Q and not portable\E/x;
613 $non_cp_trailing_text = "is for a non-Unicode code point, and is not portable";
af816908 614 }
601e92f1
KW
615 else {
616 $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E
617 \Q may not be portable\E/x;
618 $non_cp_trailing_text = "is for a non-Unicode code point, may not"
619 . " be portable";
e4e140b4
KW
620 $utf8n_flag_to_warn_complement |= $::UTF8_WARN_ABOVE_31_BIT;
621 $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_ABOVE_31_BIT;
622 $uvchr_flag_to_warn_complement |= $::UNICODE_WARN_ABOVE_31_BIT;
623 $uvchr_flag_to_disallow_complement
624 |= $::UNICODE_DISALLOW_ABOVE_31_BIT;
af816908 625 }
3022ad00
KW
626 }
627 elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
601e92f1
KW
628 $cp_message_qr = qr/UTF-16 surrogate U\+$uv_string/;
629 $non_cp_trailing_text = "is for a surrogate";
67e45424 630 $needed_to_discern_len = 2 unless defined $needed_to_discern_len;
bf422d6a 631 $skip_most_tests = 1 if $allowed_uv > 0xD800 && $allowed_uv < 0xDFFF;
af816908
KW
632
633 $utf8n_flag_to_warn = $::UTF8_WARN_SURROGATE;
634 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SURROGATE;
635 $uvchr_flag_to_warn = $::UNICODE_WARN_SURROGATE;
636 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SURROGATE;;
e4e140b4
KW
637
638 $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR
639 |$::UTF8_WARN_SUPER
640 |$::UTF8_WARN_ABOVE_31_BIT;
641 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR
642 |$::UTF8_DISALLOW_SUPER
643 |$::UTF8_DISALLOW_ABOVE_31_BIT;
644 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR
645 |$::UNICODE_WARN_SUPER
646 |$::UNICODE_WARN_ABOVE_31_BIT;
647 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR
648 |$::UNICODE_DISALLOW_SUPER
649 |$::UNICODE_DISALLOW_ABOVE_31_BIT;
d42d7565 650 $controlling_warning_category = 'surrogate';
3022ad00
KW
651 }
652 elsif ( ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
653 || ($allowed_uv & 0xFFFE) == 0xFFFE)
654 {
601e92f1
KW
655 $cp_message_qr = qr/\QUnicode non-character U+$uv_string\E
656 \Q is not recommended for open interchange\E/x;
657 $non_cp_trailing_text = "if you see this, there is an error";
67e45424 658 $needed_to_discern_len = $length unless defined $needed_to_discern_len;
bf422d6a
KW
659 if ( ($allowed_uv > 0xFDD0 && $allowed_uv < 0xFDEF)
660 || ($allowed_uv > 0xFFFF && $allowed_uv < 0x10FFFE))
661 {
662 $skip_most_tests = 1;
663 }
af816908
KW
664
665 $utf8n_flag_to_warn = $::UTF8_WARN_NONCHAR;
666 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_NONCHAR;
667 $uvchr_flag_to_warn = $::UNICODE_WARN_NONCHAR;
668 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_NONCHAR;;
e4e140b4
KW
669
670 $utf8n_flag_to_warn_complement = $::UTF8_WARN_SURROGATE
671 |$::UTF8_WARN_SUPER
672 |$::UTF8_WARN_ABOVE_31_BIT;
673 $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE
674 |$::UTF8_DISALLOW_SUPER
675 |$::UTF8_DISALLOW_ABOVE_31_BIT;
676 $uvchr_flag_to_warn_complement = $::UNICODE_WARN_SURROGATE
677 |$::UNICODE_WARN_SUPER
678 |$::UNICODE_WARN_ABOVE_31_BIT;
679 $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_SURROGATE
680 |$::UNICODE_DISALLOW_SUPER
681 |$::UNICODE_DISALLOW_ABOVE_31_BIT;
682
d42d7565 683 $controlling_warning_category = 'nonchar';
3022ad00
KW
684 }
685 else {
686 die "Can't figure out what type of warning to test for $testname"
687 }
688
67e45424
KW
689 die 'Didn\'t set $needed_to_discern_len for ' . $testname
690 unless defined $needed_to_discern_len;
c0e5ee4b 691
8f79178b 692 { # First test the isFOO calls
c0e5ee4b 693 use warnings; no warnings 'deprecated'; # Make sure these don't raise warnings
9cdc3054 694 undef @warnings_gotten;
8f79178b 695
6aa905cf
KW
696 my $ret = test_isUTF8_CHAR($bytes, $length);
697 my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
c0e5ee4b 698 if ($initially_malformed) {
d402d77f
KW
699 is($ret, 0, "For $testname: isUTF8_CHAR() returns 0");
700 is($ret_flags, 0, " And isUTF8_CHAR_flags() returns 0");
6aa905cf
KW
701 }
702 else {
703 is($ret, $length,
d402d77f
KW
704 "For $testname: isUTF8_CHAR() returns expected length: $length");
705 is($ret_flags, $length, " And isUTF8_CHAR_flags(...,0)"
6aa905cf
KW
706 . " returns expected length: $length");
707 }
9cdc3054 708 is(scalar @warnings_gotten, 0,
d402d77f
KW
709 " And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags generated"
710 . " any warnings")
9cdc3054 711 or output_warnings(@warnings_gotten);
6aa905cf 712
9cdc3054 713 undef @warnings_gotten;
6aa905cf 714 $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
c0e5ee4b 715 if ($initially_malformed) {
d402d77f 716 is($ret, 0, " And isSTRICT_UTF8_CHAR() returns 0");
6aa905cf
KW
717 }
718 else {
719 my $expected_ret = ( $testname =~ /surrogate|non-character/
720 || $allowed_uv > 0x10FFFF)
721 ? 0
722 : $length;
d402d77f 723 is($ret, $expected_ret, " And isSTRICT_UTF8_CHAR() returns"
6aa905cf
KW
724 . " expected length: $expected_ret");
725 $ret = test_isUTF8_CHAR_flags($bytes, $length,
726 $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
727 is($ret, $expected_ret,
d402d77f
KW
728 " And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')"
729 . " acts like isSTRICT_UTF8_CHAR");
6aa905cf 730 }
9cdc3054 731 is(scalar @warnings_gotten, 0,
d402d77f
KW
732 " And neither isSTRICT_UTF8_CHAR() nor isUTF8_CHAR_flags"
733 . " generated any warnings")
9cdc3054 734 or output_warnings(@warnings_gotten);
6aa905cf 735
9cdc3054 736 undef @warnings_gotten;
6aa905cf 737 $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
c0e5ee4b 738 if ($initially_malformed) {
d402d77f 739 is($ret, 0, " And isC9_STRICT_UTF8_CHAR() returns 0");
6aa905cf
KW
740 }
741 else {
742 my $expected_ret = ( $testname =~ /surrogate/
743 || $allowed_uv > 0x10FFFF)
744 ? 0
745 : $length;
d402d77f 746 is($ret, $expected_ret, " And isC9_STRICT_UTF8_CHAR()"
6aa905cf
KW
747 ." returns expected length: $expected_ret");
748 $ret = test_isUTF8_CHAR_flags($bytes, $length,
8f79178b 749 $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
6aa905cf 750 is($ret, $expected_ret,
d402d77f
KW
751 " And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')"
752 . " acts like isC9_STRICT_UTF8_CHAR");
6aa905cf 753 }
9cdc3054 754 is(scalar @warnings_gotten, 0,
d402d77f
KW
755 " And neither isC9_STRICT_UTF8_CHAR() nor isUTF8_CHAR_flags"
756 . " generated any warnings")
9cdc3054 757 or output_warnings(@warnings_gotten);
6aa905cf 758
c0e5ee4b
KW
759 foreach my $disallow_type (0..2) {
760 # 0 is don't disallow this type of code point
761 # 1 is do disallow
762 # 2 is do disallow, but only for above 31 bit
763
764 my $disallow_flags;
765 my $expected_ret;
766
767 if ($initially_malformed) {
768
769 # Malformations are by default disallowed, so testing with
770 # $disallow_type equal to 0 is sufficicient.
771 next if $disallow_type;
772
773 $disallow_flags = 0;
774 $expected_ret = 0;
775 }
776 elsif ($disallow_type == 1) {
777 $disallow_flags = $utf8n_flag_to_disallow;
778 $expected_ret = 0;
779 }
780 elsif ($disallow_type == 2) {
781 next if ! $will_overflow && $allowed_uv < 0x80000000;
782 $disallow_flags = $::UTF8_DISALLOW_ABOVE_31_BIT;
783 $expected_ret = 0;
784 }
785 else { # type is 0
786 $disallow_flags = $utf8n_flag_to_disallow_complement;
787 $expected_ret = $length;
788 }
789
790 $ret = test_isUTF8_CHAR_flags($bytes, $length, $disallow_flags);
791 is($ret, $expected_ret, " And isUTF8_CHAR_flags("
792 . "$display_bytes, $disallow_flags) returns "
793 . $expected_ret)
794 or diag "The flags mean "
795 . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
6aa905cf 796
c0e5ee4b
KW
797 is(scalar @warnings_gotten, 0,
798 " And isUTF8_CHAR_flags(...) generated no warnings")
799 or output_warnings(@warnings_gotten);
6aa905cf 800
c0e5ee4b
KW
801 # Test partial character handling, for each byte not a full character
802 for (my $j = 1; $j < $length - 1; $j++) {
6aa905cf
KW
803 my $partial = substr($bytes, 0, $j);
804 my $ret_should_be;
805 my $comment;
c0e5ee4b 806 if ($disallow_type || $initially_malformed) {
6aa905cf
KW
807 $ret_should_be = 0;
808 $comment = "disallowed";
809 if ($j < $needed_to_discern_len) {
810 $ret_should_be = 1;
c0e5ee4b
KW
811 $comment .= ", but need $needed_to_discern_len"
812 . " bytes to discern:";
6aa905cf
KW
813 }
814 }
815 else {
816 $ret_should_be = 1;
817 $comment = "allowed";
818 }
819
9cdc3054 820 undef @warnings_gotten;
6aa905cf
KW
821
822 $ret = test_is_utf8_valid_partial_char_flags($partial, $j,
c0e5ee4b 823 $disallow_flags);
6aa905cf 824 is($ret, $ret_should_be,
c0e5ee4b
KW
825 " And is_utf8_valid_partial_char_flags("
826 . display_bytes($partial)
827 . ", $disallow_flags), $comment: returns $ret_should_be")
828 or diag "The flags mean "
829 . flags_to_text($disallow_flags, \@utf8n_flags_to_text);
9cdc3054 830 is(scalar @warnings_gotten, 0,
d402d77f 831 " And is_utf8_valid_partial_char_flags()"
6aa905cf 832 . " generated no warnings")
9cdc3054 833 or output_warnings(@warnings_gotten);
6aa905cf
KW
834 }
835 }
836 }
837
8f79178b 838 # Now test the to/from UTF-8 calls
69485e19
KW
839 # This is more complicated than the malformations tested in other files in
840 # this directory, as there are several orthogonal variables involved. We
841 # test most possible combinations
8f79178b 842
8139d291
KW
843 # We try various combinations of malformations that can occur
844 foreach my $short (0, 1) {
845 next if $skip_most_tests && $short;
846 foreach my $unexpected_noncont (0, 1) {
847 next if $skip_most_tests && $unexpected_noncont;
848 foreach my $overlong (0, 1) {
849 next if $overlong && $skip_most_tests;
8f79178b 850
8139d291
KW
851 # If we're creating an overlong, it can't be longer than the
852 # maximum length, so skip if we're already at that length.
853 next if $overlong && $length >= $::max_bytes;
854
717dd9f9
KW
855 foreach my $malformed_allow_type (0..2) {
856 # 0 don't allow this malformation; ignored if no malformation
857 # 1 allow, with REPLACEMENT CHARACTER returned
858 # 2 allow, with intended code point returned. All malformations
859 # other than overlong can't determine the intended code point,
860 # so this isn't valid for them.
861 next if $malformed_allow_type == 2
862 && ($will_overflow || $short || $unexpected_noncont);
863 next if $skip_most_tests && $malformed_allow_type;
717dd9f9 864
4816e15f
KW
865 # Here we are in the innermost loop for malformations. So we
866 # know which ones are in effect. Can now change the input to be
867 # appropriately malformed. We also can set up certain other
868 # things now, like whether we expect a return flag from this
869 # malformation, and which flag.
870
871 my $this_bytes = $bytes;
872 my $this_length = $length;
873 my $this_expected_len = $length;
874 my $this_needed_to_discern_len = $needed_to_discern_len;
875
876 my @malformation_names;
877 my @expected_malformation_warnings;
878 my @expected_malformation_return_flags;
879
717dd9f9
KW
880 # Contains the flags for any allowed malformations. Currently no
881 # combinations of on/off are tested for. It's either all are
882 # allowed, or none are.
883 my $allow_flags = 0;
884
4816e15f
KW
885 if ($overlong) {
886
887 # To force this malformation, we convert the original start
888 # byte into a continuation byte with the same data bits as
889 # originally. ...
890 substr($this_bytes, 0, 1)
891 = start_byte_to_cont(substr($this_bytes, 0, 1));
892
893 # ... Then we prepend it with a known overlong sequence. This
894 # should evaluate to the exact same code point as the
895 # original.
896 $this_bytes = I8_to_native("\xff")
897 . (I8_to_native(chr $::lowest_continuation)
898 x ( $::max_bytes - 1 - length($this_bytes)))
899 . $this_bytes;
900 $this_length = length($this_bytes);
901 $this_needed_to_discern_len = $::max_bytes
902 - ( $this_expected_len
903 - $this_needed_to_discern_len);
904 $this_expected_len = $::max_bytes;
905 push @expected_malformation_return_flags, $::UTF8_GOT_LONG;
906 push @malformation_names, 'overlong';
717dd9f9
KW
907
908 if ($malformed_allow_type == 2) {
909 $allow_flags |= $::UTF8_ALLOW_LONG_AND_ITS_VALUE;
910 }
911 elsif ($malformed_allow_type) {
912 $allow_flags |= $::UTF8_ALLOW_LONG;
913 }
4816e15f
KW
914 }
915
916 if ($short) {
917 push @malformation_names, 'short';
918
919 # To force this malformation, just tell the test to not look
920 # as far as it should into the input.
921 $this_length--;
922 $this_expected_len--;
923 push @expected_malformation_return_flags, $::UTF8_GOT_SHORT;
717dd9f9
KW
924
925 $allow_flags |= $::UTF8_ALLOW_SHORT if $malformed_allow_type;
4816e15f
KW
926 }
927
928 if ($unexpected_noncont) {
929 push @malformation_names, 'unexpected non-continuation';
930
931 # To force this malformation, change the final continuation
932 # byte into a non continuation.
933 my $pos = ($short) ? -2 : -1;
934 substr($this_bytes, $pos, 1) = '?';
935 $this_expected_len--;
936 push @expected_malformation_return_flags,
937 $::UTF8_GOT_NON_CONTINUATION;
717dd9f9
KW
938 $allow_flags |= $::UTF8_ALLOW_NON_CONTINUATION
939 if $malformed_allow_type;
4816e15f
KW
940 }
941
942 # Here, we've transformed the input with all of the desired
943 # non-overflow malformations. We are now in a position to
944 # construct any potential warnings for those malformations. But
945 # it's a pain to get the detailed messages exactly right, so for
946 # now XXX, only do so for those that return an explicit code
947 # point.
948
949 if ($overlong) {
950
951 # If one of the other malformation types is also in effect, we
952 # don't know what the intended code point was.
953 if ($short || $unexpected_noncont || $will_overflow) {
954 push @expected_malformation_warnings, qr/overlong/;
955 }
956 else {
957 my $wrong_bytes = display_bytes_no_quotes(
958 substr($this_bytes, 0, $this_length));
959 my $correct_bytes = display_bytes_no_quotes($bytes);
960 my $prefix = ($allowed_uv > 0x10FFFF) ? "0x" : "U+";
961 push @expected_malformation_warnings,
962 qr/\QMalformed UTF-8 character: $wrong_bytes\E
963 \Q (overlong; instead use\E
964 \Q $correct_bytes to\E
965 \Q represent $prefix$uv_string)/x;
966 }
967 }
968 if ($short) {
969 push @expected_malformation_warnings, qr/too short/;
970 }
971 if ($unexpected_noncont) {
972 push @expected_malformation_warnings,
973 qr/unexpected non-continuation byte/;
974 }
975
976 # The overflow malformation is done differently than other
977 # malformations. It comes from manually typed tests in the test
978 # array. We now make it be treated like one of the other
979 # malformations. But some has to be deferred until the inner loop
980 my $overflow_msg_pattern;
981 if ($will_overflow) {
982 push @malformation_names, 'overflow';
983
984 $overflow_msg_pattern = display_bytes_no_quotes(
985 substr($this_bytes, 0, $this_expected_len));
986 $overflow_msg_pattern = qr/\QMalformed UTF-8 character:\E
987 \Q $overflow_msg_pattern\E
988 \Q (overflows)\E/x;
989 push @expected_malformation_return_flags, $::UTF8_GOT_OVERFLOW;
717dd9f9 990 $allow_flags |= $::UTF8_ALLOW_OVERFLOW if $malformed_allow_type;
4816e15f
KW
991 }
992
993 # And we can create the malformation-related text for the the test
994 # names we eventually will generate.
995 my $malformations_name = "";
996 if (@malformation_names) {
717dd9f9
KW
997 $malformations_name .= "dis" unless $malformed_allow_type;
998 $malformations_name .= "allowed ";
4816e15f
KW
999 $malformations_name .= "malformation";
1000 $malformations_name .= "s" if @malformation_names > 1;
1001 $malformations_name .= ": ";
1002 $malformations_name .= join "/", @malformation_names;
1003 $malformations_name = " ($malformations_name)";
1004 }
1005
1006 # Done setting up the malformation related stuff
1007
8139d291
KW
1008 foreach my $do_disallow (0, 1) {
1009 next if $skip_most_tests && ! $do_disallow;
6aa905cf 1010
69485e19
KW
1011 # We classify the warnings into certain "interesting" types,
1012 # described later
1013 foreach my $warning_type (0..4) {
1014 next if $skip_most_tests && $warning_type != 1;
1015 foreach my $use_warn_flag (0, 1) {
1016 next if $skip_most_tests && ! $use_warn_flag;
1017
8f79178b
KW
1018 # Finally, here is the inner loop
1019
69485e19
KW
1020 my $this_utf8n_flag_to_warn = $utf8n_flag_to_warn;
1021 my $this_utf8n_flag_to_disallow = $utf8n_flag_to_disallow;
1022 my $this_uvchr_flag_to_warn = $uvchr_flag_to_warn;
1023 my $this_uvchr_flag_to_disallow = $uvchr_flag_to_disallow;
1024
1025 my $eval_warn;
1026 my $expect_regular_warnings;
1027 my $expect_warnings_for_malformed;
1028 my $expect_warnings_for_overflow;
1029
1030 if ($warning_type == 0) {
1031 $eval_warn = "use warnings; no warnings 'deprecated'";
1032 $expect_regular_warnings = $use_warn_flag;
717dd9f9
KW
1033
1034 # We ordinarily expect overflow warnings here. But it
1035 # is somewhat more complicated, and the final
1036 # determination is deferred to one place in the filw
1037 # where we handle overflow.
69485e19 1038 $expect_warnings_for_overflow = 1;
717dd9f9
KW
1039
1040 # We would ordinarily expect malformed warnings in
1041 # this case, but not if malformations are allowed.
1042 $expect_warnings_for_malformed
1043 = $malformed_allow_type == 0;
6aa905cf 1044 }
69485e19
KW
1045 elsif ($warning_type == 1) {
1046 $eval_warn = "no warnings";
1047 $expect_regular_warnings = 0;
1048 $expect_warnings_for_overflow = 0;
1049 $expect_warnings_for_malformed = 0;
1050 }
1051 elsif ($warning_type == 2) {
1052 $eval_warn = "no warnings; use warnings 'utf8'";
1053 $expect_regular_warnings = $use_warn_flag;
1054 $expect_warnings_for_overflow = 1;
717dd9f9
KW
1055 $expect_warnings_for_malformed
1056 = $malformed_allow_type == 0;
69485e19
KW
1057 }
1058 elsif ($warning_type == 3) {
1059 $eval_warn = "no warnings; use warnings"
e4e140b4
KW
1060 . " '$controlling_warning_category'";
1061 $expect_regular_warnings = $use_warn_flag;
69485e19 1062 $expect_warnings_for_overflow
e4e140b4 1063 = $controlling_warning_category eq 'non_unicode';
69485e19
KW
1064 $expect_warnings_for_malformed = 0;
1065 }
1066 elsif ($warning_type == 4) { # Like type 3, but uses the
1067 # above-31-bit flags
1068 # The complement flags were set up so that the
1069 # above-31-bit flags have been tested that they don't
1070 # trigger wrongly for too small code points. And the
1071 # flags have been set up so that those small code
1072 # points are tested for being above Unicode. What's
1073 # left to test is that the large code points do
1074 # trigger the above-31-bit flags.
1075 next if ! $will_overflow && $allowed_uv < 0x80000000;
e4e140b4 1076 next if $controlling_warning_category ne 'non_unicode';
69485e19
KW
1077 $eval_warn = "no warnings; use warnings 'non_unicode'";
1078 $expect_regular_warnings = 1;
1079 $expect_warnings_for_overflow = 1;
1080 $expect_warnings_for_malformed = 0;
1081 $this_utf8n_flag_to_warn = $::UTF8_WARN_ABOVE_31_BIT;
1082 $this_utf8n_flag_to_disallow
1083 = $::UTF8_DISALLOW_ABOVE_31_BIT;
1084 $this_uvchr_flag_to_warn = $::UNICODE_WARN_ABOVE_31_BIT;
1085 $this_uvchr_flag_to_disallow
1086 = $::UNICODE_DISALLOW_ABOVE_31_BIT;
601e92f1
KW
1087 }
1088 else {
69485e19 1089 die "Unexpected warning type '$warning_type'";
601e92f1
KW
1090 }
1091
69485e19
KW
1092 # We only need to test the case where all warnings are
1093 # enabled (type 0) to see if turning off the warning flag
1094 # causes things to not be output. If those pass, then
1095 # turning on some sub-category of warnings, or turning off
1096 # warnings altogether are extremely likely to not output
1097 # warnings either, given how the warnings subsystem is
1098 # supposed to work, and this file assumes it does work.
1099 next if $warning_type != 0 && ! $use_warn_flag;
1100
1101 # The convention is that the 'got' flag is the same value
1102 # as the disallow one. If this were violated, the tests
1103 # here should start failing.
1104 my $return_flag = $this_utf8n_flag_to_disallow;
1105
e4e140b4
KW
1106 # If we aren't expecting warnings/disallow for this, turn
1107 # on all the other flags. That makes sure that they all
1108 # are independent of this flag, and so we don't need to
1109 # test them individually.
1110 my $this_warning_flags = ($use_warn_flag)
1111 ? $this_utf8n_flag_to_warn
1112 : $utf8n_flag_to_warn_complement;
69485e19 1113 my $this_disallow_flags = ($do_disallow)
e4e140b4
KW
1114 ? $this_utf8n_flag_to_disallow
1115 : $utf8n_flag_to_disallow_complement;
6aa905cf 1116 my $expected_uv = $allowed_uv;
717dd9f9 1117 my $this_uv_string = $uv_string;
6aa905cf 1118
4816e15f
KW
1119 my @expected_return_flags
1120 = @expected_malformation_return_flags;
69485e19 1121 my @expected_warnings;
4816e15f 1122 push @expected_warnings, @expected_malformation_warnings
69485e19
KW
1123 if $expect_warnings_for_malformed;
1124
69485e19
KW
1125 # The overflow malformation is done differently than other
1126 # malformations. It comes from manually typed tests in
1127 # the test array, but it also is above Unicode and uses
1128 # Perl extended UTF-8, so affects some of the flags being
1129 # tested. We now make it be treated like one of the other
1130 # generated malformations.
1131 if ($will_overflow) {
1132
1133 # An overflow is (way) above Unicode, and overrides
1134 # everything else.
1135 $expect_regular_warnings = 0;
1136
717dd9f9
KW
1137 # Earlier, we tentatively calculated whether this
1138 # should emit a message or not. It's tentative
1139 # because, even if we ordinarily would output it, we
1140 # don't if malformations are allowed -- except an
1141 # overflow is also a SUPER and ABOVE_31_BIT, and if
1142 # warnings for those are enabled, the overflow
1143 # warning does get raised.
1144 if ( $expect_warnings_for_overflow
1145 && ( $malformed_allow_type == 0
1146 || ( $this_warning_flags
1147 & ($::UTF8_WARN_SUPER
1148 |$::UTF8_WARN_ABOVE_31_BIT))))
1149 {
4816e15f 1150 push @expected_warnings, $overflow_msg_pattern;
6aa905cf 1151 }
69485e19
KW
1152 }
1153
1154 # It may be that the malformations have shortened the
1155 # amount of input we look at so much that we can't tell
1156 # what the category the code point was in. Otherwise, set
1157 # up the expected return flags based on the warnings and
1158 # disallowments.
1159 if ($this_expected_len < $this_needed_to_discern_len) {
1160 $expect_regular_warnings = 0;
1161 }
1162 elsif ( ($this_warning_flags & $this_utf8n_flag_to_warn)
1163 || ( $this_disallow_flags
1164 & $this_utf8n_flag_to_disallow))
1165 {
1166 push @expected_return_flags, $return_flag;
1167 }
1168
1169 # Finish setting up the expected warning.
1170 if ($expect_regular_warnings) {
1171
1172 # So far the array contains warnings generated by
1173 # malformations. Add the expected regular one.
1174 unshift @expected_warnings, $cp_message_qr;
1175
1176 # But it may need to be modified, because either of
1177 # these malformations means we can't determine the
1178 # expected code point.
1179 if ($short || $unexpected_noncont) {
1180 my $first_byte = substr($this_bytes, 0, 1);
1181 $expected_warnings[0] = display_bytes(
1182 substr($this_bytes, 0, $this_expected_len));
1183 $expected_warnings[0]
1184 = qr/[Aa]\Qny UTF-8 sequence that starts with\E
1185 \Q $expected_warnings[0]\E
1186 \Q $non_cp_trailing_text\E/x;
1187 }
1188 }
6aa905cf 1189
717dd9f9
KW
1190 # Is effectively disallowed if we've set up a malformation
1191 # (unless malformations are allowed), even if the flag
1192 # indicates it is allowed. Fix up test name to indicate
1193 # this as well
1194 my $disallowed = 0;
1195 if ( $this_disallow_flags & $this_utf8n_flag_to_disallow
1196 && $this_expected_len >= $this_needed_to_discern_len)
1197 {
1198 $disallowed = 1;
1199 }
1200 if ($malformations_name) {
1201 if ($malformed_allow_type == 0) {
1202 $disallowed = 1;
1203 }
1204 elsif ($malformed_allow_type == 1) {
1205
1206 # Even if allowed, the malformation returns the
1207 # REPLACEMENT CHARACTER.
1208 $expected_uv = 0xFFFD;
1209 $this_uv_string = "0xFFFD"
1210 }
1211 }
1212
6aa905cf 1213 my $this_name = "utf8n_to_uvchr_error() $testname: "
69485e19
KW
1214 . (($disallowed)
1215 ? 'disallowed'
1216 : 'allowed');
6aa905cf 1217 $this_name .= ", $eval_warn";
69485e19
KW
1218 $this_name .= ", " . (( $this_warning_flags
1219 & $this_utf8n_flag_to_warn)
1220 ? 'with flag for raising warnings'
1221 : 'no flag for raising warnings');
1222 $this_name .= $malformations_name;
db0f09e6 1223
8f79178b 1224 # Do the actual test using an eval
9cdc3054 1225 undef @warnings_gotten;
6aa905cf 1226 my $ret_ref;
717dd9f9
KW
1227 my $this_flags
1228 = $allow_flags|$this_warning_flags|$this_disallow_flags;
6aa905cf
KW
1229 my $eval_text = "$eval_warn; \$ret_ref"
1230 . " = test_utf8n_to_uvchr_error("
69485e19 1231 . "'$this_bytes', $this_length, $this_flags)";
6aa905cf 1232 eval "$eval_text";
69485e19 1233 if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
6aa905cf 1234 {
a8ee5133
KW
1235 diag "\$@='$@'; call was: "
1236 . utf8n_display_call($eval_text);
6aa905cf
KW
1237 next;
1238 }
1239 if ($disallowed) {
d402d77f 1240 is($ret_ref->[0], 0, " And returns 0")
a8ee5133 1241 or diag "Call was: " . utf8n_display_call($eval_text);
6aa905cf
KW
1242 }
1243 else {
1244 is($ret_ref->[0], $expected_uv,
d402d77f 1245 " And returns expected uv: "
717dd9f9 1246 . $this_uv_string)
a8ee5133 1247 or diag "Call was: " . utf8n_display_call($eval_text);
6aa905cf
KW
1248 }
1249 is($ret_ref->[1], $this_expected_len,
d402d77f 1250 " And returns expected length:"
6aa905cf 1251 . " $this_expected_len")
a8ee5133 1252 or diag "Call was: " . utf8n_display_call($eval_text);
6aa905cf 1253
9cdc3054 1254 my $returned_flags = $ret_ref->[2];
6aa905cf 1255
9cdc3054 1256 for (my $i = @expected_return_flags - 1; $i >= 0; $i--) {
69485e19
KW
1257 if ($expected_return_flags[$i] & $returned_flags) {
1258 if ($expected_return_flags[$i]
1259 == $::UTF8_DISALLOW_ABOVE_31_BIT)
1260 {
1261 pass(" Expected and got return flag for"
1262 . " above_31_bit");
1263 }
1264 # The first entries in this are
1265 # malformations
1266 elsif ($i > @malformation_names - 1) {
1267 pass(" Expected and got return flag"
e4e140b4 1268 . " for " . $controlling_warning_category);
69485e19
KW
1269 }
1270 else {
1271 pass(" Expected and got return flag for "
1272 . $malformation_names[$i]
1273 . " malformation");
1274 }
9cdc3054 1275 $returned_flags &= ~$expected_return_flags[$i];
69485e19 1276 splice @expected_return_flags, $i, 1;
6aa905cf 1277 }
6aa905cf 1278 }
6aa905cf 1279
69485e19
KW
1280 is($returned_flags, 0,
1281 " Got no unexpected return flags")
1282 or diag "The unexpected flags gotten were: "
5722c46d
KW
1283 . (flags_to_text($returned_flags,
1284 \@utf8n_flags_to_text)
69485e19
KW
1285 # We strip off any prefixes from the flag
1286 # names
1287 =~ s/ \b [A-Z] _ //xgr);
1288 is (scalar @expected_return_flags, 0,
1289 " Got all expected return flags")
1290 or diag "The expected flags not gotten were: "
1291 . (flags_to_text(eval join("|",
1292 @expected_return_flags),
1293 \@utf8n_flags_to_text)
1294 # We strip off any prefixes from the flag
1295 # names
5722c46d 1296 =~ s/ \b [A-Z] _ //xgr);
9cdc3054 1297
69485e19
KW
1298 do_warnings_test(@expected_warnings)
1299 or diag "Call was: " . utf8n_display_call($eval_text);
1300 undef @warnings_gotten;
6aa905cf
KW
1301
1302 # Check CHECK_ONLY results when the input is
1303 # disallowed. Do this when actually disallowed,
69485e19 1304 # not just when the $this_disallow_flags is set
6aa905cf 1305 if ($disallowed) {
69485e19
KW
1306 my $this_flags = $this_disallow_flags|$::UTF8_CHECK_ONLY;
1307 my $eval_text = "use warnings; no warnings 'deprecated'; \$ret_ref ="
1308 . " test_utf8n_to_uvchr_error('"
1309 . "$this_bytes', $this_length,"
1310 . " $this_flags)";
1311 eval $eval_text;
a8ee5133
KW
1312 if (! ok ("$@ eq ''",
1313 " And eval succeeded with CHECK_ONLY"))
1314 {
1315 diag "\$@='$@'; Call was: "
1316 . utf8n_display_call($eval_text);
1317 next;
1318 }
d402d77f 1319 is($ret_ref->[0], 0, " CHECK_ONLY: Returns 0")
a8ee5133 1320 or diag "Call was: " . utf8n_display_call($eval_text);
6aa905cf 1321 is($ret_ref->[1], -1,
d402d77f 1322 " CHECK_ONLY: returns -1 for length")
a8ee5133 1323 or diag "Call was: " . utf8n_display_call($eval_text);
9cdc3054 1324 if (! is(scalar @warnings_gotten, 0,
d402d77f 1325 " CHECK_ONLY: no warnings generated"))
6aa905cf 1326 {
a8ee5133 1327 diag "Call was: " . utf8n_display_call($eval_text);
9cdc3054 1328 output_warnings(@warnings_gotten);
6aa905cf
KW
1329 }
1330 }
1331
1332 # Now repeat some of the above, but for
1333 # uvchr_to_utf8_flags(). Since this comes from an
8f79178b
KW
1334 # existing code point, it hasn't overflowed, and isn't
1335 # malformed.
69485e19
KW
1336 next if @malformation_names;
1337
1338 $this_warning_flags = ($use_warn_flag)
1339 ? $this_uvchr_flag_to_warn
1340 : 0;
1341 $this_disallow_flags = ($do_disallow)
1342 ? $this_uvchr_flag_to_disallow
1343 : 0;
1344
1345 $disallowed = $this_disallow_flags
1346 & $this_uvchr_flag_to_disallow;
1347 $this_name .= ", " . (( $this_warning_flags
1348 & $this_utf8n_flag_to_warn)
1349 ? 'with flag for raising warnings'
1350 : 'no flag for raising warnings');
6aa905cf
KW
1351
1352 $this_name = "uvchr_to_utf8_flags() $testname: "
69485e19 1353 . (($disallowed)
6aa905cf 1354 ? 'disallowed'
6aa905cf
KW
1355 : 'allowed');
1356 $this_name .= ", $eval_warn";
69485e19
KW
1357 $this_name .= ", " . (( $this_warning_flags
1358 & $this_uvchr_flag_to_warn)
6aa905cf
KW
1359 ? 'with warning flag'
1360 : 'no warning flag');
1361
9cdc3054 1362 undef @warnings_gotten;
6aa905cf 1363 my $ret;
69485e19 1364 $this_flags = $this_warning_flags|$this_disallow_flags;
6aa905cf
KW
1365 $eval_text = "$eval_warn; \$ret ="
1366 . " test_uvchr_to_utf8_flags("
d884ea32 1367 . "$allowed_uv, $this_flags)";
6aa905cf
KW
1368 eval "$eval_text";
1369 if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
1370 {
d884ea32
KW
1371 diag "\$@='$@'; call was: "
1372 . uvchr_display_call($eval_text);
6aa905cf
KW
1373 next;
1374 }
1375 if ($disallowed) {
d402d77f 1376 is($ret, undef, " And returns undef")
d884ea32 1377 or diag "Call was: " . uvchr_display_call($eval_text);
6aa905cf
KW
1378 }
1379 else {
d402d77f 1380 is($ret, $this_bytes, " And returns expected string")
d884ea32 1381 or diag "Call was: " . uvchr_display_call($eval_text);
6aa905cf 1382 }
69485e19
KW
1383
1384 do_warnings_test(@expected_warnings)
1385 or diag "Call was: " . uvchr_display_call($eval_text);
6aa905cf
KW
1386 }
1387 }
717dd9f9 1388 }
6aa905cf
KW
1389 }
1390 }
8f79178b 1391 }
6aa905cf
KW
1392 }
1393}
1394
1395done_testing;