This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
APItest/t/utf8_warn_base.pl: Simplify some calculations
[perl5.git] / ext / XS-APItest / t / utf8_warn_base.pl
CommitLineData
6aa905cf
KW
1#!perl -w
2
3# This is a base file to be used by various .t's in its directory
4
5use strict;
6use Test::More;
7
8BEGIN {
9 use_ok('XS::APItest');
10 require 'charset_tools.pl';
11 require './t/utf8_setup.pl';
12};
13
14$|=1;
15
16no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
17 # machines, and that is tested elsewhere
18
19use XS::APItest;
20use Data::Dumper;
21
9cdc3054 22my @warnings_gotten;
6aa905cf
KW
23
24use warnings 'utf8';
9cdc3054 25local $SIG{__WARN__} = sub { push @warnings_gotten, @_ };
6aa905cf
KW
26
27sub nonportable_regex ($) {
28
29 # Returns a pattern that matches the non-portable message raised either
30 # for the specific input code point, or the one generated when there
31 # is some malformation that precludes the message containing the specific
32 # code point
33
34 my $code_point = shift;
35
36 my $string = sprintf '(Code point 0x%X is not Unicode, and'
37 . '|Any UTF-8 sequence that starts with'
38 . ' "(\\\x[[:xdigit:]]{2})+" is for a'
39 . ' non-Unicode code point, and is) not portable',
40 $code_point;
41 return qr/$string/;
42}
43
44# Now test the cases where a legal code point is generated, but may or may not
45# be allowed/warned on.
46my @tests = (
af816908 47 # ($testname, $bytes, $controlling_warning_category,
b7e1f4b2 48 # $allowed_uv, $needed_to_discern_len )
6aa905cf
KW
49 [ "lowest surrogate",
50 (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
6aa905cf 51 'surrogate', 0xD800,
6aa905cf
KW
52 ],
53 [ "a middle surrogate",
54 (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
6aa905cf 55 'surrogate', 0xD90D,
6aa905cf
KW
56 ],
57 [ "highest surrogate",
58 (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
6aa905cf 59 'surrogate', 0xDFFF,
6aa905cf
KW
60 ],
61 [ "first non_unicode",
62 (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
6aa905cf 63 'non_unicode', 0x110000,
6aa905cf 64 2,
6aa905cf
KW
65 ],
66 [ "non_unicode whose first byte tells that",
67 (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
6aa905cf
KW
68 'non_unicode',
69 (isASCII) ? 0x140000 : 0x200000,
6aa905cf 70 1,
6aa905cf
KW
71 ],
72 [ "first of 32 consecutive non-character code points",
73 (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
6aa905cf 74 'nonchar', 0xFDD0,
6aa905cf
KW
75 ],
76 [ "a mid non-character code point of the 32 consecutive ones",
77 (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
6aa905cf 78 'nonchar', 0xFDE0,
6aa905cf
KW
79 ],
80 [ "final of 32 consecutive non-character code points",
81 (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
6aa905cf 82 'nonchar', 0xFDEF,
6aa905cf
KW
83 ],
84 [ "non-character code point U+FFFE",
85 (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
6aa905cf 86 'nonchar', 0xFFFE,
6aa905cf
KW
87 ],
88 [ "non-character code point U+FFFF",
89 (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
6aa905cf 90 'nonchar', 0xFFFF,
6aa905cf
KW
91 ],
92 [ "non-character code point U+1FFFE",
93 (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
6aa905cf 94 'nonchar', 0x1FFFE,
6aa905cf
KW
95 ],
96 [ "non-character code point U+1FFFF",
97 (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
6aa905cf 98 'nonchar', 0x1FFFF,
6aa905cf
KW
99 ],
100 [ "non-character code point U+2FFFE",
101 (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
6aa905cf 102 'nonchar', 0x2FFFE,
6aa905cf
KW
103 ],
104 [ "non-character code point U+2FFFF",
105 (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
6aa905cf 106 'nonchar', 0x2FFFF,
6aa905cf
KW
107 ],
108 [ "non-character code point U+3FFFE",
109 (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
6aa905cf 110 'nonchar', 0x3FFFE,
6aa905cf
KW
111 ],
112 [ "non-character code point U+3FFFF",
113 (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
6aa905cf 114 'nonchar', 0x3FFFF,
6aa905cf
KW
115 ],
116 [ "non-character code point U+4FFFE",
117 (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
6aa905cf 118 'nonchar', 0x4FFFE,
6aa905cf
KW
119 ],
120 [ "non-character code point U+4FFFF",
121 (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
6aa905cf 122 'nonchar', 0x4FFFF,
6aa905cf
KW
123 ],
124 [ "non-character code point U+5FFFE",
125 (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
6aa905cf 126 'nonchar', 0x5FFFE,
6aa905cf
KW
127 ],
128 [ "non-character code point U+5FFFF",
129 (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
6aa905cf 130 'nonchar', 0x5FFFF,
6aa905cf
KW
131 ],
132 [ "non-character code point U+6FFFE",
133 (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
6aa905cf 134 'nonchar', 0x6FFFE,
6aa905cf
KW
135 ],
136 [ "non-character code point U+6FFFF",
137 (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
6aa905cf 138 'nonchar', 0x6FFFF,
6aa905cf
KW
139 ],
140 [ "non-character code point U+7FFFE",
141 (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
6aa905cf 142 'nonchar', 0x7FFFE,
6aa905cf
KW
143 ],
144 [ "non-character code point U+7FFFF",
145 (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
6aa905cf 146 'nonchar', 0x7FFFF,
6aa905cf
KW
147 ],
148 [ "non-character code point U+8FFFE",
149 (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
6aa905cf 150 'nonchar', 0x8FFFE,
6aa905cf
KW
151 ],
152 [ "non-character code point U+8FFFF",
153 (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
6aa905cf 154 'nonchar', 0x8FFFF,
6aa905cf
KW
155 ],
156 [ "non-character code point U+9FFFE",
157 (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
6aa905cf 158 'nonchar', 0x9FFFE,
6aa905cf
KW
159 ],
160 [ "non-character code point U+9FFFF",
161 (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
6aa905cf 162 'nonchar', 0x9FFFF,
6aa905cf
KW
163 ],
164 [ "non-character code point U+AFFFE",
165 (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
6aa905cf 166 'nonchar', 0xAFFFE,
6aa905cf
KW
167 ],
168 [ "non-character code point U+AFFFF",
169 (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
6aa905cf 170 'nonchar', 0xAFFFF,
6aa905cf
KW
171 ],
172 [ "non-character code point U+BFFFE",
173 (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
6aa905cf 174 'nonchar', 0xBFFFE,
6aa905cf
KW
175 ],
176 [ "non-character code point U+BFFFF",
177 (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
6aa905cf 178 'nonchar', 0xBFFFF,
6aa905cf
KW
179 ],
180 [ "non-character code point U+CFFFE",
181 (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
6aa905cf 182 'nonchar', 0xCFFFE,
6aa905cf
KW
183 ],
184 [ "non-character code point U+CFFFF",
185 (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
6aa905cf 186 'nonchar', 0xCFFFF,
6aa905cf
KW
187 ],
188 [ "non-character code point U+DFFFE",
189 (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
6aa905cf 190 'nonchar', 0xDFFFE,
6aa905cf
KW
191 ],
192 [ "non-character code point U+DFFFF",
193 (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
6aa905cf 194 'nonchar', 0xDFFFF,
6aa905cf
KW
195 ],
196 [ "non-character code point U+EFFFE",
197 (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
6aa905cf 198 'nonchar', 0xEFFFE,
6aa905cf
KW
199 ],
200 [ "non-character code point U+EFFFF",
201 (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
6aa905cf 202 'nonchar', 0xEFFFF,
6aa905cf
KW
203 ],
204 [ "non-character code point U+FFFFE",
205 (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
6aa905cf 206 'nonchar', 0xFFFFE,
6aa905cf
KW
207 ],
208 [ "non-character code point U+FFFFF",
209 (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
6aa905cf 210 'nonchar', 0xFFFFF,
6aa905cf
KW
211 ],
212 [ "non-character code point U+10FFFE",
213 (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
6aa905cf 214 'nonchar', 0x10FFFE,
6aa905cf
KW
215 ],
216 [ "non-character code point U+10FFFF",
217 (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
6aa905cf 218 'nonchar', 0x10FFFF,
6aa905cf 219 ],
76513bdc
KW
220 [ "requires at least 32 bits",
221 (isASCII)
222 ? "\xfe\x82\x80\x80\x80\x80\x80"
223 : I8_to_native(
224 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
225 # This code point is chosen so that it is representable in a UV on
226 # 32-bit machines
76513bdc 227 'utf8', 0x80000000,
76513bdc 228 (isASCII) ? 1 : 8,
76513bdc
KW
229 ],
230 [ "highest 32 bit code point",
231 (isASCII)
232 ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
233 : I8_to_native(
234 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
76513bdc 235 'utf8', 0xFFFFFFFF,
76513bdc 236 (isASCII) ? 1 : 8,
76513bdc
KW
237 ],
238 [ "requires at least 32 bits, and use SUPER-type flags, instead of"
239 . " ABOVE_31_BIT",
240 (isASCII)
241 ? "\xfe\x82\x80\x80\x80\x80\x80"
242 : I8_to_native(
243 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
76513bdc 244 'utf8', 0x80000000,
76513bdc 245 1,
76513bdc
KW
246 ],
247 [ "overflow with warnings/disallow for more than 31 bits",
248 # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT
249 # with overflow. The overflow malformation is never allowed, so
250 # preventing it takes precedence if the ABOVE_31_BIT options would
251 # otherwise allow in an overflowing value. The ASCII code points (1
252 # for 32-bits; 1 for 64) were chosen because the old overflow
253 # detection algorithm did not catch them; this means this test also
254 # checks for that fix. The EBCDIC are arbitrary overflowing ones
255 # since we have no reports of failures with it.
256 (($::is64bit)
257 ? ((isASCII)
258 ? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
259 : I8_to_native(
260 "\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"))
261 : ((isASCII)
262 ? "\xfe\x86\x80\x80\x80\x80\x80"
263 : I8_to_native(
264 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))),
04f42bf6 265 'utf8', -1,
76513bdc 266 (isASCII || $::is64bit) ? 2 : 8,
76513bdc 267 ],
6aa905cf
KW
268);
269
270if (! $::is64bit) {
271 if (isASCII) {
272 no warnings qw{portable overflow};
273 push @tests,
274 [ "Lowest 33 bit code point: overflow",
275 "\xFE\x84\x80\x80\x80\x80\x80",
04f42bf6 276 'utf8', -1,
b3169593 277 1,
6aa905cf
KW
278 ];
279 }
280}
281else {
282 no warnings qw{portable overflow};
283 push @tests,
284 [ "More than 32 bits",
285 (isASCII)
286 ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
287 : I8_to_native(
288 "\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
6aa905cf 289 'utf8', 0x1000000000,
b3169593 290 (isASCII) ? 1 : 7,
6aa905cf
KW
291 ];
292 if (! isASCII) {
293 push @tests, # These could falsely show wrongly in a naive
294 # implementation
295 [ "requires at least 32 bits",
296 I8_to_native(
297 "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
6aa905cf 298 'utf8', 0x800000000,
b3169593 299 7,
6aa905cf
KW
300 ],
301 [ "requires at least 32 bits",
302 I8_to_native(
303 "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
6aa905cf 304 'utf8', 0x10000000000,
b3169593 305 6,
6aa905cf
KW
306 ],
307 [ "requires at least 32 bits",
308 I8_to_native(
309 "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
6aa905cf 310 'utf8', 0x200000000000,
b3169593 311 5,
6aa905cf
KW
312 ],
313 [ "requires at least 32 bits",
314 I8_to_native(
315 "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
6aa905cf 316 'utf8', 0x4000000000000,
b3169593 317 4,
6aa905cf
KW
318 ],
319 [ "requires at least 32 bits",
320 I8_to_native(
321 "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
6aa905cf 322 'utf8', 0x80000000000000,
b3169593 323 3,
6aa905cf
KW
324 ],
325 [ "requires at least 32 bits",
326 I8_to_native(
327 "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
6aa905cf 328 'utf8', 0x1000000000000000,
b3169593 329 2,
6aa905cf
KW
330 ];
331 }
332}
333
5722c46d
KW
334sub flags_to_text($$)
335{
336 my ($flags, $flags_to_text_ref) = @_;
337
338 # Returns a string containing a mnemonic representation of the bits that
339 # are set in the $flags. These are assumed to be flag bits. The return
340 # looks like "FOO|BAR|BAZ". The second parameter is a reference to an
341 # array that gives the textual representation of all the possible flags.
342 # Element 0 is the text for the bit 0 flag; element 1 for bit 1; .... If
343 # no bits at all are set the string "0" is returned;
344
345 my @flag_text;
346 my $shift = 0;
347
348 return "0" if $flags == 0;
349
350 while ($flags) {
351 #diag sprintf "%x", $flags;
352 if ($flags & 1) {
353 push @flag_text, $flags_to_text_ref->[$shift];
354 }
355 $shift++;
356 $flags >>= 1;
357 }
358
359 return join "|", @flag_text;
360}
361
362# Possible flag returns from utf8n_to_uvchr_error(). These should have G_,
363# instead of A_, D_, but the prefixes will be used in a a later commit, so
364# minimize churn by having them here.
365my @utf8n_flags_to_text = ( qw(
366 A_EMPTY
367 A_CONTINUATION
368 A_NON_CONTINUATION
369 A_SHORT
370 A_LONG
371 A_LONG_AND_ITS_VALUE
372 PLACEHOLDER
373 A_OVERFLOW
374 D_SURROGATE
375 W_SURROGATE
376 D_NONCHAR
377 W_NONCHAR
378 D_SUPER
379 W_SUPER
380 D_ABOVE_31_BIT
381 W_ABOVE_31_BIT
382 CHECK_ONLY
383 NO_CONFIDENCE_IN_CURLEN_
384 ) );
385
a8ee5133
KW
386sub utf8n_display_call($)
387{
388 # Converts an eval string that calls test_utf8n_to_uvchr into a more human
389 # readable form, and returns it. Doesn't work if the byte string contains
390 # an apostrophe. The return will look something like:
391 # test_utf8n_to_uvchr_error('$bytes', $length, $flags)
392 #diag $_[0];
393
394 $_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x;
395 my $text1 = $1; # Everything before the byte string
396 my $bytes = $2;
397 my $text2 = $3; # Includes the length
398 my $flags = $4;
399
400 return $text1
401 . display_bytes($bytes)
402 . $text2
403 . flags_to_text($flags, \@utf8n_flags_to_text)
404 . ')';
405}
406
d884ea32
KW
407sub uvchr_display_call($)
408{
409 # Converts an eval string that calls test_uvchr_to_utf8 into a more human
410 # readable form, and returns it. The return will look something like:
411 # test_uvchr_to_utf8n_flags($uv, $flags)
412 #diag $_[0];
413
414 my @flags_to_text = ( qw(
415 W_SURROGATE
416 W_NONCHAR
417 W_SUPER
418 W_ABOVE_31_BIT
419 D_SURROGATE
420 D_NONCHAR
421 D_SUPER
422 D_ABOVE_31_BIT
423 ) );
424
425 $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x;
426 my $text = $1;
427 my $cp = sprintf "%X", $2;
428 my $flags = $3;
429
430 return "${text}0x$cp, " . flags_to_text($flags, \@flags_to_text) . ')';
431}
432
6aa905cf
KW
433# This test is split into this number of files.
434my $num_test_files = $ENV{TEST_JOBS} || 1;
435$num_test_files = 10 if $num_test_files > 10;
436
437my $test_count = -1;
438foreach my $test (@tests) {
439 $test_count++;
440 next if $test_count % $num_test_files != $::TEST_CHUNK;
441
af816908 442 my ($testname, $bytes,
9cdc3054 443 $controlling_warning_category, $allowed_uv, $needed_to_discern_len
6aa905cf
KW
444 ) = @$test;
445
446 my $length = length $bytes;
04f42bf6 447 my $will_overflow = $allowed_uv < 0;
6aa905cf 448
2c511c58
KW
449 my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv);
450
af816908
KW
451 my $utf8n_flag_to_warn;
452 my $utf8n_flag_to_disallow;
453 my $uvchr_flag_to_warn;
454 my $uvchr_flag_to_disallow;
b7e1f4b2 455
3022ad00 456 my $message;
af816908
KW
457 if ($will_overflow || $allowed_uv > 0x10FFFF) {
458
459 $utf8n_flag_to_warn = $::UTF8_WARN_SUPER;
460 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER;
461 $uvchr_flag_to_warn = $::UNICODE_WARN_SUPER;
462 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;;
463
464 if ($will_overflow) {
465 $message = qr/overflows/;
466 }
467 elsif ($allowed_uv > 0x7FFFFFFF) {
468 $message = nonportable_regex($allowed_uv);
469 }
470 else {
471 $message = qr/(not Unicode|for a non-Unicode code point).* may not be portable/;
472 }
3022ad00
KW
473 }
474 elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
475 $message = qr/surrogate/;
67e45424 476 $needed_to_discern_len = 2 unless defined $needed_to_discern_len;
af816908
KW
477
478 $utf8n_flag_to_warn = $::UTF8_WARN_SURROGATE;
479 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SURROGATE;
480 $uvchr_flag_to_warn = $::UNICODE_WARN_SURROGATE;
481 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SURROGATE;;
3022ad00
KW
482 }
483 elsif ( ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
484 || ($allowed_uv & 0xFFFE) == 0xFFFE)
485 {
486 $message = qr/Unicode non-character.*is not recommended for open interchange/;
67e45424 487 $needed_to_discern_len = $length unless defined $needed_to_discern_len;
af816908
KW
488
489 $utf8n_flag_to_warn = $::UTF8_WARN_NONCHAR;
490 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_NONCHAR;
491 $uvchr_flag_to_warn = $::UNICODE_WARN_NONCHAR;
492 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_NONCHAR;;
3022ad00
KW
493 }
494 else {
495 die "Can't figure out what type of warning to test for $testname"
496 }
497
67e45424
KW
498 die 'Didn\'t set $needed_to_discern_len for ' . $testname
499 unless defined $needed_to_discern_len;
af816908
KW
500 my $disallow_flags = $utf8n_flag_to_disallow;
501 my $warn_flags = $disallow_flags << 1;
502
503 # The convention is that the got flag is the same value as the disallow
504 # one, and the warn flag is the next bit over. If this were violated, the
505 # tests here should start failing. We could do an eval under no strict to
506 # be sure.
507 my $expected_error_flags = $disallow_flags;
67e45424 508
6aa905cf
KW
509 {
510 use warnings;
9cdc3054 511 undef @warnings_gotten;
6aa905cf
KW
512 my $ret = test_isUTF8_CHAR($bytes, $length);
513 my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
514 if ($will_overflow) {
515 is($ret, 0, "isUTF8_CHAR() $testname: returns 0");
516 is($ret_flags, 0, "isUTF8_CHAR_flags() $testname: returns 0");
517 }
518 else {
519 is($ret, $length,
520 "isUTF8_CHAR() $testname: returns expected length: $length");
521 is($ret_flags, $length, "isUTF8_CHAR_flags(...,0) $testname:"
522 . " returns expected length: $length");
523 }
9cdc3054 524 is(scalar @warnings_gotten, 0,
6aa905cf
KW
525 "isUTF8_CHAR() and isUTF8_CHAR()_flags $testname: generated"
526 . " no warnings")
9cdc3054 527 or output_warnings(@warnings_gotten);
6aa905cf 528
9cdc3054 529 undef @warnings_gotten;
6aa905cf
KW
530 $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
531 if ($will_overflow) {
532 is($ret, 0, "isSTRICT_UTF8_CHAR() $testname: returns 0");
533 }
534 else {
535 my $expected_ret = ( $testname =~ /surrogate|non-character/
536 || $allowed_uv > 0x10FFFF)
537 ? 0
538 : $length;
539 is($ret, $expected_ret, "isSTRICT_UTF8_CHAR() $testname: returns"
540 . " expected length: $expected_ret");
541 $ret = test_isUTF8_CHAR_flags($bytes, $length,
542 $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
543 is($ret, $expected_ret,
544 "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')"
545 . " acts like isSTRICT_UTF8_CHAR");
546 }
9cdc3054 547 is(scalar @warnings_gotten, 0,
6aa905cf
KW
548 "isSTRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:"
549 . " generated no warnings")
9cdc3054 550 or output_warnings(@warnings_gotten);
6aa905cf 551
9cdc3054 552 undef @warnings_gotten;
6aa905cf
KW
553 $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
554 if ($will_overflow) {
555 is($ret, 0, "isC9_STRICT_UTF8_CHAR() $testname: returns 0");
556 }
557 else {
558 my $expected_ret = ( $testname =~ /surrogate/
559 || $allowed_uv > 0x10FFFF)
560 ? 0
561 : $length;
562 is($ret, $expected_ret, "isC9_STRICT_UTF8_CHAR() $testname:"
563 ." returns expected length: $expected_ret");
564 $ret = test_isUTF8_CHAR_flags($bytes, $length,
565 $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
566 is($ret, $expected_ret,
567 "isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')"
568 . " acts like isC9_STRICT_UTF8_CHAR");
569 }
9cdc3054 570 is(scalar @warnings_gotten, 0,
6aa905cf
KW
571 "isC9_STRICT_UTF8_CHAR() and isUTF8_CHAR_flags $testname:"
572 . " generated no warnings")
9cdc3054 573 or output_warnings(@warnings_gotten);
6aa905cf
KW
574
575 # Test partial character handling, for each byte not a full character
576 for my $j (1.. $length - 1) {
577
578 # Skip the test for the interaction between overflow and above-31
579 # bit. It is really testing other things than the partial
580 # character tests, for which other tests in this file are
581 # sufficient
04f42bf6 582 last if $will_overflow;
6aa905cf
KW
583
584 foreach my $disallow_flag (0, $disallow_flags) {
585 my $partial = substr($bytes, 0, $j);
586 my $ret_should_be;
587 my $comment;
588 if ($disallow_flag) {
589 $ret_should_be = 0;
590 $comment = "disallowed";
591 if ($j < $needed_to_discern_len) {
592 $ret_should_be = 1;
593 $comment .= ", but need $needed_to_discern_len bytes"
594 . " to discern:";
595 }
596 }
597 else {
598 $ret_should_be = 1;
599 $comment = "allowed";
600 }
601
9cdc3054 602 undef @warnings_gotten;
6aa905cf
KW
603
604 $ret = test_is_utf8_valid_partial_char_flags($partial, $j,
605 $disallow_flag);
606 is($ret, $ret_should_be,
607 "$testname: is_utf8_valid_partial_char_flags("
608 . display_bytes($partial)
609 . "), $comment: returns $ret_should_be");
9cdc3054 610 is(scalar @warnings_gotten, 0,
6aa905cf
KW
611 "$testname: is_utf8_valid_partial_char_flags()"
612 . " generated no warnings")
9cdc3054 613 or output_warnings(@warnings_gotten);
6aa905cf
KW
614 }
615 }
616 }
617
618 # This is more complicated than the malformations tested earlier, as there
619 # are several orthogonal variables involved. We test all the subclasses
620 # of utf8 warnings to verify they work with and without the utf8 class,
621 # and don't have effects on other sublass warnings
9cdc3054 622 foreach my $trial_warning_category ('utf8', 'surrogate', 'nonchar', 'non_unicode') {
6aa905cf
KW
623 foreach my $warn_flag (0, $warn_flags) {
624 foreach my $disallow_flag (0, $disallow_flags) {
625 foreach my $do_warning (0, 1) {
626
627 # We try each of the above with various combinations of
628 # malformations that can occur on the same input sequence.
629 foreach my $short ("", "short") {
630 foreach my $unexpected_noncont ("",
631 "unexpected non-continuation")
632 {
633 foreach my $overlong ("", "overlong") {
634
b3169593
KW
635 # If we're creating an overlong, it can't be longer than
636 # the maximum length, so skip if we're already at that
637 # length.
638 next if $overlong && $length >= $::max_bytes;
6aa905cf
KW
639
640 my @malformations;
9cdc3054 641 my @expected_return_flags;
6aa905cf
KW
642 push @malformations, $short if $short;
643 push @malformations, $unexpected_noncont
644 if $unexpected_noncont;
645 push @malformations, $overlong if $overlong;
646
647 # The overflow malformation test in the input
648 # array is coerced into being treated like one of
649 # the others.
650 if ($will_overflow) {
651 push @malformations, 'overflow';
9cdc3054 652 push @expected_return_flags, $::UTF8_GOT_OVERFLOW;
6aa905cf
KW
653 }
654
655 my $malformations_name = join "/", @malformations;
656 $malformations_name .= " malformation"
657 if $malformations_name;
658 $malformations_name .= "s" if @malformations > 1;
659 my $this_bytes = $bytes;
660 my $this_length = $length;
661 my $expected_uv = $allowed_uv;
b3169593 662 my $this_expected_len = $length;
6aa905cf
KW
663 my $this_needed_to_discern_len = $needed_to_discern_len;
664 if ($malformations_name) {
665 $expected_uv = 0;
666
667 # Coerce the input into the desired
668 # malformation
669 if ($malformations_name =~ /overlong/) {
670
671 # For an overlong, we convert the original
672 # start byte into a continuation byte with
673 # the same data bits as originally. ...
674 substr($this_bytes, 0, 1)
675 = start_byte_to_cont(substr($this_bytes,
676 0, 1));
677
678 # ... Then we prepend it with a known
679 # overlong sequence. This should evaluate
680 # to the exact same code point as the
681 # original.
682 $this_bytes
683 = I8_to_native("\xff")
dbb8d798 684 . (I8_to_native(chr $::lowest_continuation)
6aa905cf
KW
685 x ( $::max_bytes - 1 - length($this_bytes)))
686 . $this_bytes;
687 $this_length = length($this_bytes);
688 $this_needed_to_discern_len
689 = $::max_bytes - ($this_expected_len
690 - $this_needed_to_discern_len);
691 $this_expected_len = $::max_bytes;
9cdc3054 692 push @expected_return_flags, $::UTF8_GOT_LONG;
6aa905cf
KW
693 }
694 if ($malformations_name =~ /short/) {
695
696 # Just tell the test to not look far
697 # enough into the input.
698 $this_length--;
699 $this_expected_len--;
9cdc3054 700 push @expected_return_flags, $::UTF8_GOT_SHORT;
6aa905cf
KW
701 }
702 if ($malformations_name
703 =~ /non-continuation/)
704 {
705 # Change the final continuation byte into
706 # a non one.
707 my $pos = ($short) ? -2 : -1;
708 substr($this_bytes, $pos, 1) = '?';
709 $this_expected_len--;
9cdc3054 710 push @expected_return_flags,
6aa905cf
KW
711 $::UTF8_GOT_NON_CONTINUATION;
712 }
713 }
714
715 my $eval_warn = $do_warning
9cdc3054
KW
716 ? "use warnings '$trial_warning_category'"
717 : $trial_warning_category eq "utf8"
6aa905cf
KW
718 ? "no warnings 'utf8'"
719 : ( "use warnings 'utf8';"
9cdc3054 720 . " no warnings '$trial_warning_category'");
6aa905cf
KW
721
722 # Is effectively disallowed if we've set up a
723 # malformation, even if the flag indicates it is
724 # allowed. Fix up test name to indicate this as
725 # well
726 my $disallowed = $disallow_flag
727 || $malformations_name;
728 my $this_name = "utf8n_to_uvchr_error() $testname: "
729 . (($disallow_flag)
730 ? 'disallowed'
731 : $disallowed
732 ? $disallowed
733 : 'allowed');
734 $this_name .= ", $eval_warn";
735 $this_name .= ", " . (($warn_flag)
736 ? 'with warning flag'
737 : 'no warning flag');
738
9cdc3054 739 undef @warnings_gotten;
6aa905cf 740 my $ret_ref;
a8ee5133 741 my $this_flags = $warn_flag | $disallow_flag;
6aa905cf
KW
742 my $eval_text = "$eval_warn; \$ret_ref"
743 . " = test_utf8n_to_uvchr_error("
744 . "'$this_bytes',"
a8ee5133 745 . " $this_length, $this_flags)";
6aa905cf
KW
746 eval "$eval_text";
747 if (! ok ("$@ eq ''",
748 "$this_name: eval succeeded"))
749 {
a8ee5133
KW
750 diag "\$@='$@'; call was: "
751 . utf8n_display_call($eval_text);
6aa905cf
KW
752 next;
753 }
754 if ($disallowed) {
755 is($ret_ref->[0], 0, "$this_name: Returns 0")
a8ee5133 756 or diag "Call was: " . utf8n_display_call($eval_text);
6aa905cf
KW
757 }
758 else {
759 is($ret_ref->[0], $expected_uv,
760 "$this_name: Returns expected uv: "
2c511c58 761 . $uv_string)
a8ee5133 762 or diag "Call was: " . utf8n_display_call($eval_text);
6aa905cf
KW
763 }
764 is($ret_ref->[1], $this_expected_len,
765 "$this_name: Returns expected length:"
766 . " $this_expected_len")
a8ee5133 767 or diag "Call was: " . utf8n_display_call($eval_text);
6aa905cf 768
9cdc3054 769 my $returned_flags = $ret_ref->[2];
6aa905cf 770
9cdc3054
KW
771 for (my $i = @expected_return_flags - 1; $i >= 0; $i--) {
772 if (ok($expected_return_flags[$i] & $returned_flags,
6aa905cf
KW
773 "Expected and got error bit return"
774 . " for $malformations[$i] malformation"))
775 {
9cdc3054 776 $returned_flags &= ~$expected_return_flags[$i];
6aa905cf 777 }
9cdc3054 778 splice @expected_return_flags, $i, 1;
6aa905cf 779 }
9cdc3054 780 is(scalar @expected_return_flags, 0,
6aa905cf 781 "Got all the expected malformation errors")
9cdc3054 782 or diag Dumper \@expected_return_flags;
6aa905cf
KW
783
784 if ( $this_expected_len >= $this_needed_to_discern_len
785 && ($warn_flag || $disallow_flag))
786 {
9cdc3054 787 is($returned_flags, $expected_error_flags,
6aa905cf 788 "Got the correct error flag")
a8ee5133 789 or diag "Call was: " . utf8n_display_call($eval_text);
6aa905cf
KW
790 }
791 else {
5722c46d
KW
792 is($returned_flags, 0, "Got no other error flag")
793 or
794
795 # We strip off any prefixes from the flag names
796 diag "The unexpected flags were: "
797 . (flags_to_text($returned_flags,
798 \@utf8n_flags_to_text)
799 =~ s/ \b [A-Z] _ //xgr);
6aa905cf
KW
800 }
801
802 if (@malformations) {
9cdc3054 803 if (! $do_warning && $trial_warning_category eq 'utf8') {
6aa905cf
KW
804 goto no_warnings_expected;
805 }
806
807 # Check that each malformation generates a
808 # warning, removing that warning if found
809 MALFORMATION:
810 foreach my $malformation (@malformations) {
9cdc3054
KW
811 foreach (my $i = 0; $i < @warnings_gotten; $i++) {
812 if ($warnings_gotten[$i] =~ /$malformation/) {
6aa905cf
KW
813 pass("Expected and got"
814 . "'$malformation' warning");
9cdc3054 815 splice @warnings_gotten, $i, 1;
6aa905cf
KW
816 next MALFORMATION;
817 }
818 }
819 fail("Expected '$malformation' warning"
820 . " but didn't get it");
821
822 }
823 }
824
825 # Any overflow will override any super or above-31
826 # warnings.
827 goto no_warnings_expected
828 if $will_overflow || $this_expected_len
829 < $this_needed_to_discern_len;
830
831 if ( ! $do_warning
9cdc3054
KW
832 && ( $trial_warning_category eq 'utf8'
833 || $trial_warning_category eq $controlling_warning_category))
6aa905cf
KW
834 {
835 goto no_warnings_expected;
836 }
837 elsif ($warn_flag) {
9cdc3054 838 if (is(scalar @warnings_gotten, 1,
6aa905cf
KW
839 "$this_name: Got a single warning "))
840 {
9cdc3054 841 like($warnings_gotten[0], $message,
6aa905cf 842 "$this_name: Got expected warning")
a8ee5133
KW
843 or diag "Call was: "
844 . utf8n_display_call($eval_text);
6aa905cf
KW
845 }
846 else {
a8ee5133 847 diag "Call was: " . utf8n_display_call($eval_text);
9cdc3054
KW
848 if (scalar @warnings_gotten) {
849 output_warnings(@warnings_gotten);
6aa905cf
KW
850 }
851 }
852 }
853 else {
9cdc3054 854
6aa905cf 855 no_warnings_expected:
9cdc3054 856 unless (is(scalar @warnings_gotten, 0,
6aa905cf
KW
857 "$this_name: Got no warnings"))
858 {
a8ee5133 859 diag "Call was: " . utf8n_display_call($eval_text);
9cdc3054 860 output_warnings(@warnings_gotten);
6aa905cf
KW
861 }
862 }
863
864 # Check CHECK_ONLY results when the input is
865 # disallowed. Do this when actually disallowed,
866 # not just when the $disallow_flag is set
867 if ($disallowed) {
9cdc3054 868 undef @warnings_gotten;
a8ee5133
KW
869 $this_flags = $disallow_flag|$::UTF8_CHECK_ONLY;
870 $eval_text = "\$ret_ref = test_utf8n_to_uvchr_error("
871 . "'$this_bytes', $this_length, $this_flags)";
872 eval "$eval_text";
873 if (! ok ("$@ eq ''",
874 " And eval succeeded with CHECK_ONLY"))
875 {
876 diag "\$@='$@'; Call was: "
877 . utf8n_display_call($eval_text);
878 next;
879 }
6aa905cf
KW
880 is($ret_ref->[0], 0,
881 "$this_name, CHECK_ONLY: Returns 0")
a8ee5133 882 or diag "Call was: " . utf8n_display_call($eval_text);
6aa905cf
KW
883 is($ret_ref->[1], -1,
884 "$this_name: CHECK_ONLY: returns -1 for length")
a8ee5133 885 or diag "Call was: " . utf8n_display_call($eval_text);
9cdc3054 886 if (! is(scalar @warnings_gotten, 0,
6aa905cf
KW
887 "$this_name, CHECK_ONLY: no warnings"
888 . " generated"))
889 {
a8ee5133 890 diag "Call was: " . utf8n_display_call($eval_text);
9cdc3054 891 output_warnings(@warnings_gotten);
6aa905cf
KW
892 }
893 }
894
895 # Now repeat some of the above, but for
896 # uvchr_to_utf8_flags(). Since this comes from an
897 # existing code point, it hasn't overflowed, and
898 # isn't malformed.
899 next if @malformations;
900
901 # The warning and disallow flags passed in are for
902 # utf8n_to_uvchr_error(). Convert them for
903 # uvchr_to_utf8_flags().
904 my $uvchr_warn_flag = 0;
905 my $uvchr_disallow_flag = 0;
906 if ($warn_flag) {
af816908 907 $uvchr_warn_flag = $uvchr_flag_to_warn;
6aa905cf
KW
908 }
909 if ($disallow_flag) {
af816908 910 $uvchr_disallow_flag = $uvchr_flag_to_disallow;
6aa905cf
KW
911 }
912
913 $disallowed = $uvchr_disallow_flag;
914
915 $this_name = "uvchr_to_utf8_flags() $testname: "
916 . (($uvchr_disallow_flag)
917 ? 'disallowed'
918 : ($disallowed)
919 ? 'ABOVE_31_BIT allowed'
920 : 'allowed');
921 $this_name .= ", $eval_warn";
922 $this_name .= ", " . (($uvchr_warn_flag)
923 ? 'with warning flag'
924 : 'no warning flag');
925
9cdc3054 926 undef @warnings_gotten;
6aa905cf 927 my $ret;
a8ee5133 928 $this_flags = $uvchr_warn_flag | $uvchr_disallow_flag;
6aa905cf
KW
929 $eval_text = "$eval_warn; \$ret ="
930 . " test_uvchr_to_utf8_flags("
d884ea32 931 . "$allowed_uv, $this_flags)";
6aa905cf
KW
932 eval "$eval_text";
933 if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
934 {
d884ea32
KW
935 diag "\$@='$@'; call was: "
936 . uvchr_display_call($eval_text);
6aa905cf
KW
937 next;
938 }
939 if ($disallowed) {
940 is($ret, undef, "$this_name: Returns undef")
d884ea32 941 or diag "Call was: " . uvchr_display_call($eval_text);
6aa905cf
KW
942 }
943 else {
d884ea32
KW
944 is($ret, $this_bytes, "$this_name: Returns expected string")
945 or diag "Call was: " . uvchr_display_call($eval_text);
6aa905cf
KW
946 }
947 if (! $do_warning
9cdc3054 948 && ($trial_warning_category eq 'utf8' || $trial_warning_category eq $controlling_warning_category))
6aa905cf 949 {
9cdc3054 950 if (!is(scalar @warnings_gotten, 0,
6aa905cf
KW
951 "$this_name: No warnings generated"))
952 {
d884ea32 953 diag "Call was: " . uvchr_display_call($eval_text);
9cdc3054 954 output_warnings(@warnings_gotten);
6aa905cf
KW
955 }
956 }
957 elsif ( $uvchr_warn_flag
9cdc3054
KW
958 && ( $trial_warning_category eq 'utf8'
959 || $trial_warning_category eq $controlling_warning_category))
6aa905cf 960 {
9cdc3054 961 if (is(scalar @warnings_gotten, 1,
6aa905cf
KW
962 "$this_name: Got a single warning "))
963 {
9cdc3054 964 like($warnings_gotten[0], $message,
6aa905cf 965 "$this_name: Got expected warning")
d884ea32
KW
966 or diag "Call was: "
967 . uvchr_display_call($eval_text);
6aa905cf
KW
968 }
969 else {
d884ea32 970 diag "Call was: " . uvchr_display_call($eval_text);
9cdc3054
KW
971 output_warnings(@warnings_gotten)
972 if scalar @warnings_gotten;
6aa905cf
KW
973 }
974 }
975 }
976 }
977 }
978 }
979 }
980 }
981 }
982}
983
984done_testing;