This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
APItest/t/utf8_warn_base.pl: Skip most tests
[perl5.git] / ext / XS-APItest / t / utf8_warn_base.pl
CommitLineData
6aa905cf
KW
1#!perl -w
2
3# This is a base file to be used by various .t's in its directory
4
5use strict;
6use Test::More;
7
8BEGIN {
9 use_ok('XS::APItest');
10 require 'charset_tools.pl';
11 require './t/utf8_setup.pl';
12};
13
14$|=1;
15
16no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
17 # machines, and that is tested elsewhere
18
19use XS::APItest;
20use Data::Dumper;
21
9cdc3054 22my @warnings_gotten;
6aa905cf
KW
23
24use warnings 'utf8';
8d6f1506
KW
25local $SIG{__WARN__} = sub { my @copy = @_;
26 push @warnings_gotten, map { chomp; $_ } @copy;
27 };
6aa905cf
KW
28
29sub nonportable_regex ($) {
30
31 # Returns a pattern that matches the non-portable message raised either
32 # for the specific input code point, or the one generated when there
33 # is some malformation that precludes the message containing the specific
34 # code point
35
36 my $code_point = shift;
37
38 my $string = sprintf '(Code point 0x%X is not Unicode, and'
39 . '|Any UTF-8 sequence that starts with'
40 . ' "(\\\x[[:xdigit:]]{2})+" is for a'
41 . ' non-Unicode code point, and is) not portable',
42 $code_point;
43 return qr/$string/;
44}
45
46# Now test the cases where a legal code point is generated, but may or may not
47# be allowed/warned on.
48my @tests = (
af816908 49 # ($testname, $bytes, $controlling_warning_category,
b7e1f4b2 50 # $allowed_uv, $needed_to_discern_len )
6aa905cf
KW
51 [ "lowest surrogate",
52 (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
6aa905cf 53 'surrogate', 0xD800,
6aa905cf
KW
54 ],
55 [ "a middle surrogate",
56 (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
6aa905cf 57 'surrogate', 0xD90D,
6aa905cf
KW
58 ],
59 [ "highest surrogate",
60 (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
6aa905cf 61 'surrogate', 0xDFFF,
6aa905cf
KW
62 ],
63 [ "first non_unicode",
64 (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
6aa905cf 65 'non_unicode', 0x110000,
6aa905cf 66 2,
6aa905cf
KW
67 ],
68 [ "non_unicode whose first byte tells that",
69 (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
6aa905cf
KW
70 'non_unicode',
71 (isASCII) ? 0x140000 : 0x200000,
6aa905cf 72 1,
6aa905cf
KW
73 ],
74 [ "first of 32 consecutive non-character code points",
75 (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
6aa905cf 76 'nonchar', 0xFDD0,
6aa905cf
KW
77 ],
78 [ "a mid non-character code point of the 32 consecutive ones",
79 (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
6aa905cf 80 'nonchar', 0xFDE0,
6aa905cf
KW
81 ],
82 [ "final of 32 consecutive non-character code points",
83 (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
6aa905cf 84 'nonchar', 0xFDEF,
6aa905cf
KW
85 ],
86 [ "non-character code point U+FFFE",
87 (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
6aa905cf 88 'nonchar', 0xFFFE,
6aa905cf
KW
89 ],
90 [ "non-character code point U+FFFF",
91 (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
6aa905cf 92 'nonchar', 0xFFFF,
6aa905cf
KW
93 ],
94 [ "non-character code point U+1FFFE",
95 (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
6aa905cf 96 'nonchar', 0x1FFFE,
6aa905cf
KW
97 ],
98 [ "non-character code point U+1FFFF",
99 (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
6aa905cf 100 'nonchar', 0x1FFFF,
6aa905cf
KW
101 ],
102 [ "non-character code point U+2FFFE",
103 (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
6aa905cf 104 'nonchar', 0x2FFFE,
6aa905cf
KW
105 ],
106 [ "non-character code point U+2FFFF",
107 (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
6aa905cf 108 'nonchar', 0x2FFFF,
6aa905cf
KW
109 ],
110 [ "non-character code point U+3FFFE",
111 (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
6aa905cf 112 'nonchar', 0x3FFFE,
6aa905cf
KW
113 ],
114 [ "non-character code point U+3FFFF",
115 (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
6aa905cf 116 'nonchar', 0x3FFFF,
6aa905cf
KW
117 ],
118 [ "non-character code point U+4FFFE",
119 (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
6aa905cf 120 'nonchar', 0x4FFFE,
6aa905cf
KW
121 ],
122 [ "non-character code point U+4FFFF",
123 (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
6aa905cf 124 'nonchar', 0x4FFFF,
6aa905cf
KW
125 ],
126 [ "non-character code point U+5FFFE",
127 (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
6aa905cf 128 'nonchar', 0x5FFFE,
6aa905cf
KW
129 ],
130 [ "non-character code point U+5FFFF",
131 (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
6aa905cf 132 'nonchar', 0x5FFFF,
6aa905cf
KW
133 ],
134 [ "non-character code point U+6FFFE",
135 (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
6aa905cf 136 'nonchar', 0x6FFFE,
6aa905cf
KW
137 ],
138 [ "non-character code point U+6FFFF",
139 (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
6aa905cf 140 'nonchar', 0x6FFFF,
6aa905cf
KW
141 ],
142 [ "non-character code point U+7FFFE",
143 (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
6aa905cf 144 'nonchar', 0x7FFFE,
6aa905cf
KW
145 ],
146 [ "non-character code point U+7FFFF",
147 (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
6aa905cf 148 'nonchar', 0x7FFFF,
6aa905cf
KW
149 ],
150 [ "non-character code point U+8FFFE",
151 (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
6aa905cf 152 'nonchar', 0x8FFFE,
6aa905cf
KW
153 ],
154 [ "non-character code point U+8FFFF",
155 (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
6aa905cf 156 'nonchar', 0x8FFFF,
6aa905cf
KW
157 ],
158 [ "non-character code point U+9FFFE",
159 (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
6aa905cf 160 'nonchar', 0x9FFFE,
6aa905cf
KW
161 ],
162 [ "non-character code point U+9FFFF",
163 (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
6aa905cf 164 'nonchar', 0x9FFFF,
6aa905cf
KW
165 ],
166 [ "non-character code point U+AFFFE",
167 (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
6aa905cf 168 'nonchar', 0xAFFFE,
6aa905cf
KW
169 ],
170 [ "non-character code point U+AFFFF",
171 (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
6aa905cf 172 'nonchar', 0xAFFFF,
6aa905cf
KW
173 ],
174 [ "non-character code point U+BFFFE",
175 (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
6aa905cf 176 'nonchar', 0xBFFFE,
6aa905cf
KW
177 ],
178 [ "non-character code point U+BFFFF",
179 (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
6aa905cf 180 'nonchar', 0xBFFFF,
6aa905cf
KW
181 ],
182 [ "non-character code point U+CFFFE",
183 (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
6aa905cf 184 'nonchar', 0xCFFFE,
6aa905cf
KW
185 ],
186 [ "non-character code point U+CFFFF",
187 (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
6aa905cf 188 'nonchar', 0xCFFFF,
6aa905cf
KW
189 ],
190 [ "non-character code point U+DFFFE",
191 (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
6aa905cf 192 'nonchar', 0xDFFFE,
6aa905cf
KW
193 ],
194 [ "non-character code point U+DFFFF",
195 (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
6aa905cf 196 'nonchar', 0xDFFFF,
6aa905cf
KW
197 ],
198 [ "non-character code point U+EFFFE",
199 (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
6aa905cf 200 'nonchar', 0xEFFFE,
6aa905cf
KW
201 ],
202 [ "non-character code point U+EFFFF",
203 (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
6aa905cf 204 'nonchar', 0xEFFFF,
6aa905cf
KW
205 ],
206 [ "non-character code point U+FFFFE",
207 (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
6aa905cf 208 'nonchar', 0xFFFFE,
6aa905cf
KW
209 ],
210 [ "non-character code point U+FFFFF",
211 (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
6aa905cf 212 'nonchar', 0xFFFFF,
6aa905cf
KW
213 ],
214 [ "non-character code point U+10FFFE",
215 (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
6aa905cf 216 'nonchar', 0x10FFFE,
6aa905cf
KW
217 ],
218 [ "non-character code point U+10FFFF",
219 (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
6aa905cf 220 'nonchar', 0x10FFFF,
6aa905cf 221 ],
76513bdc
KW
222 [ "requires at least 32 bits",
223 (isASCII)
224 ? "\xfe\x82\x80\x80\x80\x80\x80"
225 : I8_to_native(
226 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
227 # This code point is chosen so that it is representable in a UV on
228 # 32-bit machines
76513bdc 229 'utf8', 0x80000000,
76513bdc 230 (isASCII) ? 1 : 8,
76513bdc
KW
231 ],
232 [ "highest 32 bit code point",
233 (isASCII)
234 ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf"
235 : I8_to_native(
236 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
76513bdc 237 'utf8', 0xFFFFFFFF,
76513bdc 238 (isASCII) ? 1 : 8,
76513bdc
KW
239 ],
240 [ "requires at least 32 bits, and use SUPER-type flags, instead of"
241 . " ABOVE_31_BIT",
242 (isASCII)
243 ? "\xfe\x82\x80\x80\x80\x80\x80"
244 : I8_to_native(
245 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
76513bdc 246 'utf8', 0x80000000,
76513bdc 247 1,
76513bdc
KW
248 ],
249 [ "overflow with warnings/disallow for more than 31 bits",
250 # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT
251 # with overflow. The overflow malformation is never allowed, so
252 # preventing it takes precedence if the ABOVE_31_BIT options would
253 # otherwise allow in an overflowing value. The ASCII code points (1
254 # for 32-bits; 1 for 64) were chosen because the old overflow
255 # detection algorithm did not catch them; this means this test also
256 # checks for that fix. The EBCDIC are arbitrary overflowing ones
257 # since we have no reports of failures with it.
258 (($::is64bit)
259 ? ((isASCII)
260 ? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
261 : I8_to_native(
262 "\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"))
263 : ((isASCII)
264 ? "\xfe\x86\x80\x80\x80\x80\x80"
265 : I8_to_native(
266 "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))),
04f42bf6 267 'utf8', -1,
76513bdc 268 (isASCII || $::is64bit) ? 2 : 8,
76513bdc 269 ],
6aa905cf
KW
270);
271
272if (! $::is64bit) {
273 if (isASCII) {
274 no warnings qw{portable overflow};
275 push @tests,
276 [ "Lowest 33 bit code point: overflow",
277 "\xFE\x84\x80\x80\x80\x80\x80",
04f42bf6 278 'utf8', -1,
b3169593 279 1,
6aa905cf
KW
280 ];
281 }
282}
283else {
284 no warnings qw{portable overflow};
285 push @tests,
286 [ "More than 32 bits",
287 (isASCII)
288 ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
289 : I8_to_native(
290 "\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
6aa905cf 291 'utf8', 0x1000000000,
b3169593 292 (isASCII) ? 1 : 7,
6aa905cf
KW
293 ];
294 if (! isASCII) {
295 push @tests, # These could falsely show wrongly in a naive
296 # implementation
297 [ "requires at least 32 bits",
298 I8_to_native(
299 "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
6aa905cf 300 'utf8', 0x800000000,
b3169593 301 7,
6aa905cf
KW
302 ],
303 [ "requires at least 32 bits",
304 I8_to_native(
305 "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
6aa905cf 306 'utf8', 0x10000000000,
b3169593 307 6,
6aa905cf
KW
308 ],
309 [ "requires at least 32 bits",
310 I8_to_native(
311 "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
6aa905cf 312 'utf8', 0x200000000000,
b3169593 313 5,
6aa905cf
KW
314 ],
315 [ "requires at least 32 bits",
316 I8_to_native(
317 "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
6aa905cf 318 'utf8', 0x4000000000000,
b3169593 319 4,
6aa905cf
KW
320 ],
321 [ "requires at least 32 bits",
322 I8_to_native(
323 "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
6aa905cf 324 'utf8', 0x80000000000000,
b3169593 325 3,
6aa905cf
KW
326 ],
327 [ "requires at least 32 bits",
328 I8_to_native(
329 "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
6aa905cf 330 'utf8', 0x1000000000000000,
b3169593 331 2,
6aa905cf
KW
332 ];
333 }
334}
335
5722c46d
KW
336sub flags_to_text($$)
337{
338 my ($flags, $flags_to_text_ref) = @_;
339
340 # Returns a string containing a mnemonic representation of the bits that
341 # are set in the $flags. These are assumed to be flag bits. The return
342 # looks like "FOO|BAR|BAZ". The second parameter is a reference to an
343 # array that gives the textual representation of all the possible flags.
344 # Element 0 is the text for the bit 0 flag; element 1 for bit 1; .... If
345 # no bits at all are set the string "0" is returned;
346
347 my @flag_text;
348 my $shift = 0;
349
350 return "0" if $flags == 0;
351
352 while ($flags) {
353 #diag sprintf "%x", $flags;
354 if ($flags & 1) {
355 push @flag_text, $flags_to_text_ref->[$shift];
356 }
357 $shift++;
358 $flags >>= 1;
359 }
360
361 return join "|", @flag_text;
362}
363
364# Possible flag returns from utf8n_to_uvchr_error(). These should have G_,
365# instead of A_, D_, but the prefixes will be used in a a later commit, so
366# minimize churn by having them here.
367my @utf8n_flags_to_text = ( qw(
368 A_EMPTY
369 A_CONTINUATION
370 A_NON_CONTINUATION
371 A_SHORT
372 A_LONG
373 A_LONG_AND_ITS_VALUE
374 PLACEHOLDER
375 A_OVERFLOW
376 D_SURROGATE
377 W_SURROGATE
378 D_NONCHAR
379 W_NONCHAR
380 D_SUPER
381 W_SUPER
382 D_ABOVE_31_BIT
383 W_ABOVE_31_BIT
384 CHECK_ONLY
385 NO_CONFIDENCE_IN_CURLEN_
386 ) );
387
a8ee5133
KW
388sub utf8n_display_call($)
389{
390 # Converts an eval string that calls test_utf8n_to_uvchr into a more human
391 # readable form, and returns it. Doesn't work if the byte string contains
392 # an apostrophe. The return will look something like:
393 # test_utf8n_to_uvchr_error('$bytes', $length, $flags)
394 #diag $_[0];
395
396 $_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x;
397 my $text1 = $1; # Everything before the byte string
398 my $bytes = $2;
399 my $text2 = $3; # Includes the length
400 my $flags = $4;
401
402 return $text1
403 . display_bytes($bytes)
404 . $text2
405 . flags_to_text($flags, \@utf8n_flags_to_text)
406 . ')';
407}
408
d884ea32
KW
409sub uvchr_display_call($)
410{
411 # Converts an eval string that calls test_uvchr_to_utf8 into a more human
412 # readable form, and returns it. The return will look something like:
413 # test_uvchr_to_utf8n_flags($uv, $flags)
414 #diag $_[0];
415
416 my @flags_to_text = ( qw(
417 W_SURROGATE
418 W_NONCHAR
419 W_SUPER
420 W_ABOVE_31_BIT
421 D_SURROGATE
422 D_NONCHAR
423 D_SUPER
424 D_ABOVE_31_BIT
425 ) );
426
427 $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x;
428 my $text = $1;
429 my $cp = sprintf "%X", $2;
430 my $flags = $3;
431
432 return "${text}0x$cp, " . flags_to_text($flags, \@flags_to_text) . ')';
433}
434
6aa905cf
KW
435# This test is split into this number of files.
436my $num_test_files = $ENV{TEST_JOBS} || 1;
437$num_test_files = 10 if $num_test_files > 10;
438
439my $test_count = -1;
440foreach my $test (@tests) {
441 $test_count++;
442 next if $test_count % $num_test_files != $::TEST_CHUNK;
443
af816908 444 my ($testname, $bytes,
9cdc3054 445 $controlling_warning_category, $allowed_uv, $needed_to_discern_len
6aa905cf
KW
446 ) = @$test;
447
448 my $length = length $bytes;
04f42bf6 449 my $will_overflow = $allowed_uv < 0;
6aa905cf 450
2c511c58
KW
451 my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv);
452
af816908
KW
453 my $utf8n_flag_to_warn;
454 my $utf8n_flag_to_disallow;
455 my $uvchr_flag_to_warn;
456 my $uvchr_flag_to_disallow;
b7e1f4b2 457
bf422d6a
KW
458 # Many of the code points being tested are middling in that if code point
459 # edge cases work, these are very likely to as well. Because this test
460 # file takes a while to execute, we skip testing the edge effects of code
461 # points deemed middling, while testing their basics and continuing to
462 # fully test the non-middling code points.
463 my $skip_most_tests = 0;
464
3022ad00 465 my $message;
af816908
KW
466 if ($will_overflow || $allowed_uv > 0x10FFFF) {
467
468 $utf8n_flag_to_warn = $::UTF8_WARN_SUPER;
469 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER;
470 $uvchr_flag_to_warn = $::UNICODE_WARN_SUPER;
471 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;;
472
473 if ($will_overflow) {
474 $message = qr/overflows/;
475 }
476 elsif ($allowed_uv > 0x7FFFFFFF) {
477 $message = nonportable_regex($allowed_uv);
478 }
479 else {
480 $message = qr/(not Unicode|for a non-Unicode code point).* may not be portable/;
481 }
3022ad00
KW
482 }
483 elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) {
484 $message = qr/surrogate/;
67e45424 485 $needed_to_discern_len = 2 unless defined $needed_to_discern_len;
bf422d6a 486 $skip_most_tests = 1 if $allowed_uv > 0xD800 && $allowed_uv < 0xDFFF;
af816908
KW
487
488 $utf8n_flag_to_warn = $::UTF8_WARN_SURROGATE;
489 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SURROGATE;
490 $uvchr_flag_to_warn = $::UNICODE_WARN_SURROGATE;
491 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SURROGATE;;
3022ad00
KW
492 }
493 elsif ( ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF)
494 || ($allowed_uv & 0xFFFE) == 0xFFFE)
495 {
496 $message = qr/Unicode non-character.*is not recommended for open interchange/;
67e45424 497 $needed_to_discern_len = $length unless defined $needed_to_discern_len;
bf422d6a
KW
498 if ( ($allowed_uv > 0xFDD0 && $allowed_uv < 0xFDEF)
499 || ($allowed_uv > 0xFFFF && $allowed_uv < 0x10FFFE))
500 {
501 $skip_most_tests = 1;
502 }
af816908
KW
503
504 $utf8n_flag_to_warn = $::UTF8_WARN_NONCHAR;
505 $utf8n_flag_to_disallow = $::UTF8_DISALLOW_NONCHAR;
506 $uvchr_flag_to_warn = $::UNICODE_WARN_NONCHAR;
507 $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_NONCHAR;;
3022ad00
KW
508 }
509 else {
510 die "Can't figure out what type of warning to test for $testname"
511 }
512
67e45424
KW
513 die 'Didn\'t set $needed_to_discern_len for ' . $testname
514 unless defined $needed_to_discern_len;
af816908
KW
515 my $disallow_flags = $utf8n_flag_to_disallow;
516 my $warn_flags = $disallow_flags << 1;
517
518 # The convention is that the got flag is the same value as the disallow
519 # one, and the warn flag is the next bit over. If this were violated, the
520 # tests here should start failing. We could do an eval under no strict to
521 # be sure.
522 my $expected_error_flags = $disallow_flags;
67e45424 523
6aa905cf
KW
524 {
525 use warnings;
9cdc3054 526 undef @warnings_gotten;
6aa905cf
KW
527 my $ret = test_isUTF8_CHAR($bytes, $length);
528 my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0);
529 if ($will_overflow) {
d402d77f
KW
530 is($ret, 0, "For $testname: isUTF8_CHAR() returns 0");
531 is($ret_flags, 0, " And isUTF8_CHAR_flags() returns 0");
6aa905cf
KW
532 }
533 else {
534 is($ret, $length,
d402d77f
KW
535 "For $testname: isUTF8_CHAR() returns expected length: $length");
536 is($ret_flags, $length, " And isUTF8_CHAR_flags(...,0)"
6aa905cf
KW
537 . " returns expected length: $length");
538 }
9cdc3054 539 is(scalar @warnings_gotten, 0,
d402d77f
KW
540 " And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags generated"
541 . " any warnings")
9cdc3054 542 or output_warnings(@warnings_gotten);
6aa905cf 543
9cdc3054 544 undef @warnings_gotten;
6aa905cf
KW
545 $ret = test_isSTRICT_UTF8_CHAR($bytes, $length);
546 if ($will_overflow) {
d402d77f 547 is($ret, 0, " And isSTRICT_UTF8_CHAR() returns 0");
6aa905cf
KW
548 }
549 else {
550 my $expected_ret = ( $testname =~ /surrogate|non-character/
551 || $allowed_uv > 0x10FFFF)
552 ? 0
553 : $length;
d402d77f 554 is($ret, $expected_ret, " And isSTRICT_UTF8_CHAR() returns"
6aa905cf
KW
555 . " expected length: $expected_ret");
556 $ret = test_isUTF8_CHAR_flags($bytes, $length,
557 $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE);
558 is($ret, $expected_ret,
d402d77f
KW
559 " And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')"
560 . " acts like isSTRICT_UTF8_CHAR");
6aa905cf 561 }
9cdc3054 562 is(scalar @warnings_gotten, 0,
d402d77f
KW
563 " And neither isSTRICT_UTF8_CHAR() nor isUTF8_CHAR_flags"
564 . " generated any warnings")
9cdc3054 565 or output_warnings(@warnings_gotten);
6aa905cf 566
9cdc3054 567 undef @warnings_gotten;
6aa905cf
KW
568 $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length);
569 if ($will_overflow) {
d402d77f 570 is($ret, 0, " And isC9_STRICT_UTF8_CHAR() returns 0");
6aa905cf
KW
571 }
572 else {
573 my $expected_ret = ( $testname =~ /surrogate/
574 || $allowed_uv > 0x10FFFF)
575 ? 0
576 : $length;
d402d77f 577 is($ret, $expected_ret, " And isC9_STRICT_UTF8_CHAR()"
6aa905cf
KW
578 ." returns expected length: $expected_ret");
579 $ret = test_isUTF8_CHAR_flags($bytes, $length,
580 $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE);
581 is($ret, $expected_ret,
d402d77f
KW
582 " And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')"
583 . " acts like isC9_STRICT_UTF8_CHAR");
6aa905cf 584 }
9cdc3054 585 is(scalar @warnings_gotten, 0,
d402d77f
KW
586 " And neither isC9_STRICT_UTF8_CHAR() nor isUTF8_CHAR_flags"
587 . " generated any warnings")
9cdc3054 588 or output_warnings(@warnings_gotten);
6aa905cf
KW
589
590 # Test partial character handling, for each byte not a full character
591 for my $j (1.. $length - 1) {
592
593 # Skip the test for the interaction between overflow and above-31
594 # bit. It is really testing other things than the partial
595 # character tests, for which other tests in this file are
596 # sufficient
04f42bf6 597 last if $will_overflow;
6aa905cf
KW
598
599 foreach my $disallow_flag (0, $disallow_flags) {
600 my $partial = substr($bytes, 0, $j);
601 my $ret_should_be;
602 my $comment;
603 if ($disallow_flag) {
604 $ret_should_be = 0;
605 $comment = "disallowed";
606 if ($j < $needed_to_discern_len) {
607 $ret_should_be = 1;
608 $comment .= ", but need $needed_to_discern_len bytes"
609 . " to discern:";
610 }
611 }
612 else {
613 $ret_should_be = 1;
614 $comment = "allowed";
615 }
616
9cdc3054 617 undef @warnings_gotten;
6aa905cf
KW
618
619 $ret = test_is_utf8_valid_partial_char_flags($partial, $j,
620 $disallow_flag);
621 is($ret, $ret_should_be,
d402d77f
KW
622 " And is_utf8_valid_partial_char_flags("
623 . display_bytes($partial)
624 . "), $comment: returns $ret_should_be");
9cdc3054 625 is(scalar @warnings_gotten, 0,
d402d77f 626 " And is_utf8_valid_partial_char_flags()"
6aa905cf 627 . " generated no warnings")
9cdc3054 628 or output_warnings(@warnings_gotten);
6aa905cf
KW
629 }
630 }
631 }
632
633 # This is more complicated than the malformations tested earlier, as there
634 # are several orthogonal variables involved. We test all the subclasses
635 # of utf8 warnings to verify they work with and without the utf8 class,
636 # and don't have effects on other sublass warnings
9cdc3054 637 foreach my $trial_warning_category ('utf8', 'surrogate', 'nonchar', 'non_unicode') {
bf422d6a 638 next if $skip_most_tests && $trial_warning_category ne $controlling_warning_category;
6aa905cf 639 foreach my $warn_flag (0, $warn_flags) {
bf422d6a 640 next if $skip_most_tests && ! $warn_flag;
6aa905cf 641 foreach my $disallow_flag (0, $disallow_flags) {
bf422d6a 642 next if $skip_most_tests && ! $disallow_flag;
6aa905cf 643 foreach my $do_warning (0, 1) {
bf422d6a 644 next if $skip_most_tests && ! $do_warning;
6aa905cf
KW
645
646 # We try each of the above with various combinations of
647 # malformations that can occur on the same input sequence.
648 foreach my $short ("", "short") {
bf422d6a 649 next if $skip_most_tests && $short;
6aa905cf
KW
650 foreach my $unexpected_noncont ("",
651 "unexpected non-continuation")
652 {
bf422d6a 653 next if $skip_most_tests && $unexpected_noncont;
6aa905cf 654 foreach my $overlong ("", "overlong") {
bf422d6a 655 next if $overlong && $skip_most_tests;
6aa905cf 656
b3169593
KW
657 # If we're creating an overlong, it can't be longer than
658 # the maximum length, so skip if we're already at that
659 # length.
660 next if $overlong && $length >= $::max_bytes;
6aa905cf
KW
661
662 my @malformations;
9cdc3054 663 my @expected_return_flags;
6aa905cf
KW
664 push @malformations, $short if $short;
665 push @malformations, $unexpected_noncont
666 if $unexpected_noncont;
667 push @malformations, $overlong if $overlong;
668
669 # The overflow malformation test in the input
670 # array is coerced into being treated like one of
671 # the others.
672 if ($will_overflow) {
673 push @malformations, 'overflow';
9cdc3054 674 push @expected_return_flags, $::UTF8_GOT_OVERFLOW;
6aa905cf
KW
675 }
676
677 my $malformations_name = join "/", @malformations;
678 $malformations_name .= " malformation"
679 if $malformations_name;
680 $malformations_name .= "s" if @malformations > 1;
681 my $this_bytes = $bytes;
682 my $this_length = $length;
683 my $expected_uv = $allowed_uv;
b3169593 684 my $this_expected_len = $length;
6aa905cf
KW
685 my $this_needed_to_discern_len = $needed_to_discern_len;
686 if ($malformations_name) {
687 $expected_uv = 0;
688
689 # Coerce the input into the desired
690 # malformation
691 if ($malformations_name =~ /overlong/) {
692
693 # For an overlong, we convert the original
694 # start byte into a continuation byte with
695 # the same data bits as originally. ...
696 substr($this_bytes, 0, 1)
697 = start_byte_to_cont(substr($this_bytes,
698 0, 1));
699
700 # ... Then we prepend it with a known
701 # overlong sequence. This should evaluate
702 # to the exact same code point as the
703 # original.
704 $this_bytes
705 = I8_to_native("\xff")
dbb8d798 706 . (I8_to_native(chr $::lowest_continuation)
6aa905cf
KW
707 x ( $::max_bytes - 1 - length($this_bytes)))
708 . $this_bytes;
709 $this_length = length($this_bytes);
710 $this_needed_to_discern_len
711 = $::max_bytes - ($this_expected_len
712 - $this_needed_to_discern_len);
713 $this_expected_len = $::max_bytes;
9cdc3054 714 push @expected_return_flags, $::UTF8_GOT_LONG;
6aa905cf
KW
715 }
716 if ($malformations_name =~ /short/) {
717
718 # Just tell the test to not look far
719 # enough into the input.
720 $this_length--;
721 $this_expected_len--;
9cdc3054 722 push @expected_return_flags, $::UTF8_GOT_SHORT;
6aa905cf
KW
723 }
724 if ($malformations_name
725 =~ /non-continuation/)
726 {
727 # Change the final continuation byte into
728 # a non one.
729 my $pos = ($short) ? -2 : -1;
730 substr($this_bytes, $pos, 1) = '?';
731 $this_expected_len--;
9cdc3054 732 push @expected_return_flags,
6aa905cf
KW
733 $::UTF8_GOT_NON_CONTINUATION;
734 }
735 }
736
737 my $eval_warn = $do_warning
9cdc3054
KW
738 ? "use warnings '$trial_warning_category'"
739 : $trial_warning_category eq "utf8"
6aa905cf
KW
740 ? "no warnings 'utf8'"
741 : ( "use warnings 'utf8';"
9cdc3054 742 . " no warnings '$trial_warning_category'");
6aa905cf
KW
743
744 # Is effectively disallowed if we've set up a
745 # malformation, even if the flag indicates it is
746 # allowed. Fix up test name to indicate this as
747 # well
748 my $disallowed = $disallow_flag
749 || $malformations_name;
750 my $this_name = "utf8n_to_uvchr_error() $testname: "
751 . (($disallow_flag)
752 ? 'disallowed'
753 : $disallowed
754 ? $disallowed
755 : 'allowed');
756 $this_name .= ", $eval_warn";
757 $this_name .= ", " . (($warn_flag)
758 ? 'with warning flag'
759 : 'no warning flag');
760
9cdc3054 761 undef @warnings_gotten;
6aa905cf 762 my $ret_ref;
a8ee5133 763 my $this_flags = $warn_flag | $disallow_flag;
6aa905cf
KW
764 my $eval_text = "$eval_warn; \$ret_ref"
765 . " = test_utf8n_to_uvchr_error("
766 . "'$this_bytes',"
a8ee5133 767 . " $this_length, $this_flags)";
6aa905cf
KW
768 eval "$eval_text";
769 if (! ok ("$@ eq ''",
770 "$this_name: eval succeeded"))
771 {
a8ee5133
KW
772 diag "\$@='$@'; call was: "
773 . utf8n_display_call($eval_text);
6aa905cf
KW
774 next;
775 }
776 if ($disallowed) {
d402d77f 777 is($ret_ref->[0], 0, " And returns 0")
a8ee5133 778 or diag "Call was: " . utf8n_display_call($eval_text);
6aa905cf
KW
779 }
780 else {
781 is($ret_ref->[0], $expected_uv,
d402d77f
KW
782 " And returns expected uv: "
783 . $uv_string)
a8ee5133 784 or diag "Call was: " . utf8n_display_call($eval_text);
6aa905cf
KW
785 }
786 is($ret_ref->[1], $this_expected_len,
d402d77f 787 " And returns expected length:"
6aa905cf 788 . " $this_expected_len")
a8ee5133 789 or diag "Call was: " . utf8n_display_call($eval_text);
6aa905cf 790
9cdc3054 791 my $returned_flags = $ret_ref->[2];
6aa905cf 792
9cdc3054
KW
793 for (my $i = @expected_return_flags - 1; $i >= 0; $i--) {
794 if (ok($expected_return_flags[$i] & $returned_flags,
d402d77f 795 " Expected and got return flag"
6aa905cf
KW
796 . " for $malformations[$i] malformation"))
797 {
9cdc3054 798 $returned_flags &= ~$expected_return_flags[$i];
6aa905cf 799 }
9cdc3054 800 splice @expected_return_flags, $i, 1;
6aa905cf 801 }
9cdc3054 802 is(scalar @expected_return_flags, 0,
d402d77f 803 " Got all the expected malformation errors")
9cdc3054 804 or diag Dumper \@expected_return_flags;
6aa905cf
KW
805
806 if ( $this_expected_len >= $this_needed_to_discern_len
807 && ($warn_flag || $disallow_flag))
808 {
9cdc3054 809 is($returned_flags, $expected_error_flags,
d402d77f 810 " Got the correct error flag")
a8ee5133 811 or diag "Call was: " . utf8n_display_call($eval_text);
6aa905cf
KW
812 }
813 else {
d402d77f 814 is($returned_flags, 0, " Got no other error flag")
5722c46d
KW
815 or
816
817 # We strip off any prefixes from the flag names
818 diag "The unexpected flags were: "
819 . (flags_to_text($returned_flags,
820 \@utf8n_flags_to_text)
821 =~ s/ \b [A-Z] _ //xgr);
6aa905cf
KW
822 }
823
824 if (@malformations) {
9cdc3054 825 if (! $do_warning && $trial_warning_category eq 'utf8') {
6aa905cf
KW
826 goto no_warnings_expected;
827 }
828
829 # Check that each malformation generates a
830 # warning, removing that warning if found
831 MALFORMATION:
832 foreach my $malformation (@malformations) {
9cdc3054
KW
833 foreach (my $i = 0; $i < @warnings_gotten; $i++) {
834 if ($warnings_gotten[$i] =~ /$malformation/) {
d402d77f 835 pass(" Expected and got"
6aa905cf 836 . "'$malformation' warning");
9cdc3054 837 splice @warnings_gotten, $i, 1;
6aa905cf
KW
838 next MALFORMATION;
839 }
840 }
d402d77f
KW
841 fail(" Expected '$malformation' warning"
842 . " but didn't get it");
6aa905cf
KW
843
844 }
845 }
846
847 # Any overflow will override any super or above-31
848 # warnings.
849 goto no_warnings_expected
850 if $will_overflow || $this_expected_len
851 < $this_needed_to_discern_len;
852
853 if ( ! $do_warning
9cdc3054
KW
854 && ( $trial_warning_category eq 'utf8'
855 || $trial_warning_category eq $controlling_warning_category))
6aa905cf
KW
856 {
857 goto no_warnings_expected;
858 }
859 elsif ($warn_flag) {
9cdc3054 860 if (is(scalar @warnings_gotten, 1,
d402d77f 861 " Got a single warning "))
6aa905cf 862 {
9cdc3054 863 like($warnings_gotten[0], $message,
d402d77f 864 " Got expected warning")
a8ee5133 865 or diag "Call was: "
d402d77f 866 . utf8n_display_call($eval_text);
6aa905cf
KW
867 }
868 else {
a8ee5133 869 diag "Call was: " . utf8n_display_call($eval_text);
9cdc3054
KW
870 if (scalar @warnings_gotten) {
871 output_warnings(@warnings_gotten);
6aa905cf
KW
872 }
873 }
874 }
875 else {
9cdc3054 876
6aa905cf 877 no_warnings_expected:
9cdc3054 878 unless (is(scalar @warnings_gotten, 0,
d402d77f 879 " Got no warnings"))
6aa905cf 880 {
a8ee5133 881 diag "Call was: " . utf8n_display_call($eval_text);
9cdc3054 882 output_warnings(@warnings_gotten);
6aa905cf
KW
883 }
884 }
885
886 # Check CHECK_ONLY results when the input is
887 # disallowed. Do this when actually disallowed,
888 # not just when the $disallow_flag is set
889 if ($disallowed) {
9cdc3054 890 undef @warnings_gotten;
a8ee5133
KW
891 $this_flags = $disallow_flag|$::UTF8_CHECK_ONLY;
892 $eval_text = "\$ret_ref = test_utf8n_to_uvchr_error("
893 . "'$this_bytes', $this_length, $this_flags)";
894 eval "$eval_text";
895 if (! ok ("$@ eq ''",
896 " And eval succeeded with CHECK_ONLY"))
897 {
898 diag "\$@='$@'; Call was: "
899 . utf8n_display_call($eval_text);
900 next;
901 }
d402d77f 902 is($ret_ref->[0], 0, " CHECK_ONLY: Returns 0")
a8ee5133 903 or diag "Call was: " . utf8n_display_call($eval_text);
6aa905cf 904 is($ret_ref->[1], -1,
d402d77f 905 " CHECK_ONLY: returns -1 for length")
a8ee5133 906 or diag "Call was: " . utf8n_display_call($eval_text);
9cdc3054 907 if (! is(scalar @warnings_gotten, 0,
d402d77f 908 " CHECK_ONLY: no warnings generated"))
6aa905cf 909 {
a8ee5133 910 diag "Call was: " . utf8n_display_call($eval_text);
9cdc3054 911 output_warnings(@warnings_gotten);
6aa905cf
KW
912 }
913 }
914
915 # Now repeat some of the above, but for
916 # uvchr_to_utf8_flags(). Since this comes from an
917 # existing code point, it hasn't overflowed, and
918 # isn't malformed.
919 next if @malformations;
920
921 # The warning and disallow flags passed in are for
922 # utf8n_to_uvchr_error(). Convert them for
923 # uvchr_to_utf8_flags().
924 my $uvchr_warn_flag = 0;
925 my $uvchr_disallow_flag = 0;
926 if ($warn_flag) {
af816908 927 $uvchr_warn_flag = $uvchr_flag_to_warn;
6aa905cf
KW
928 }
929 if ($disallow_flag) {
af816908 930 $uvchr_disallow_flag = $uvchr_flag_to_disallow;
6aa905cf
KW
931 }
932
933 $disallowed = $uvchr_disallow_flag;
934
935 $this_name = "uvchr_to_utf8_flags() $testname: "
936 . (($uvchr_disallow_flag)
937 ? 'disallowed'
938 : ($disallowed)
939 ? 'ABOVE_31_BIT allowed'
940 : 'allowed');
941 $this_name .= ", $eval_warn";
942 $this_name .= ", " . (($uvchr_warn_flag)
943 ? 'with warning flag'
944 : 'no warning flag');
945
9cdc3054 946 undef @warnings_gotten;
6aa905cf 947 my $ret;
a8ee5133 948 $this_flags = $uvchr_warn_flag | $uvchr_disallow_flag;
6aa905cf
KW
949 $eval_text = "$eval_warn; \$ret ="
950 . " test_uvchr_to_utf8_flags("
d884ea32 951 . "$allowed_uv, $this_flags)";
6aa905cf
KW
952 eval "$eval_text";
953 if (! ok ("$@ eq ''", "$this_name: eval succeeded"))
954 {
d884ea32
KW
955 diag "\$@='$@'; call was: "
956 . uvchr_display_call($eval_text);
6aa905cf
KW
957 next;
958 }
959 if ($disallowed) {
d402d77f 960 is($ret, undef, " And returns undef")
d884ea32 961 or diag "Call was: " . uvchr_display_call($eval_text);
6aa905cf
KW
962 }
963 else {
d402d77f 964 is($ret, $this_bytes, " And returns expected string")
d884ea32 965 or diag "Call was: " . uvchr_display_call($eval_text);
6aa905cf
KW
966 }
967 if (! $do_warning
9cdc3054 968 && ($trial_warning_category eq 'utf8' || $trial_warning_category eq $controlling_warning_category))
6aa905cf 969 {
9cdc3054 970 if (!is(scalar @warnings_gotten, 0,
d402d77f 971 " No warnings generated"))
6aa905cf 972 {
d884ea32 973 diag "Call was: " . uvchr_display_call($eval_text);
9cdc3054 974 output_warnings(@warnings_gotten);
6aa905cf
KW
975 }
976 }
977 elsif ( $uvchr_warn_flag
9cdc3054
KW
978 && ( $trial_warning_category eq 'utf8'
979 || $trial_warning_category eq $controlling_warning_category))
6aa905cf 980 {
9cdc3054 981 if (is(scalar @warnings_gotten, 1,
d402d77f 982 " Got a single warning "))
6aa905cf 983 {
9cdc3054 984 like($warnings_gotten[0], $message,
d402d77f 985 " Got expected warning")
d884ea32
KW
986 or diag "Call was: "
987 . uvchr_display_call($eval_text);
6aa905cf
KW
988 }
989 else {
d884ea32 990 diag "Call was: " . uvchr_display_call($eval_text);
9cdc3054
KW
991 output_warnings(@warnings_gotten)
992 if scalar @warnings_gotten;
6aa905cf
KW
993 }
994 }
995 }
996 }
997 }
998 }
999 }
1000 }
1001 }
1002}
1003
1004done_testing;