This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
regcomp.c: White-space only
[perl5.git] / ext / XS-APItest / t / utf8.t
CommitLineData
fed3ba5d
NC
1#!perl -w
2
3use strict;
4use Test::More;
5
6use XS::APItest;
7
4deba822
KW
8my $pound_sign = chr utf8::unicode_to_native(163);
9
fed3ba5d
NC
10foreach ([0, '', '', 'empty'],
11 [0, 'N', 'N', '1 char'],
12 [1, 'NN', 'N', '1 char substring'],
13 [-2, 'Perl', 'Rules', 'different'],
4deba822
KW
14 [0, $pound_sign, $pound_sign, 'pound sign'],
15 [1, $pound_sign . 10, $pound_sign . 1, '10 pounds is more than 1 pound'],
16 [1, $pound_sign . $pound_sign, $pound_sign, '2 pound signs are more than 1'],
fed3ba5d
NC
17 [-2, ' $!', " \x{1F42B}!", 'Camels are worth more than 1 dollar'],
18 [-1, '!', "!\x{1F42A}", 'Initial substrings match'],
19 ) {
20 my ($expect, $left, $right, $desc) = @$_;
21 my $copy = $right;
22 utf8::encode($copy);
23 is(bytes_cmp_utf8($left, $copy), $expect, $desc);
24 next if $right =~ tr/\0-\377//c;
25 utf8::encode($left);
26 is(bytes_cmp_utf8($right, $left), -$expect, "$desc reversed");
27}
28
6e3d6c02
KW
29my $isASCII = (ord("A") == 65);
30if ($isASCII) { # EBCDIC is too hard to test for malformations
4deba822 31
bd70aaaf 32# Test uft8n_to_uvchr(). These provide essentially complete code coverage.
eb83ed87
KW
33
34# Copied from utf8.h
35my $UTF8_ALLOW_EMPTY = 0x0001;
36my $UTF8_ALLOW_CONTINUATION = 0x0002;
37my $UTF8_ALLOW_NON_CONTINUATION = 0x0004;
38my $UTF8_ALLOW_SHORT = 0x0008;
39my $UTF8_ALLOW_LONG = 0x0010;
40my $UTF8_DISALLOW_SURROGATE = 0x0020;
41my $UTF8_WARN_SURROGATE = 0x0040;
42my $UTF8_DISALLOW_NONCHAR = 0x0080;
43my $UTF8_WARN_NONCHAR = 0x0100;
44my $UTF8_DISALLOW_SUPER = 0x0200;
45my $UTF8_WARN_SUPER = 0x0400;
46my $UTF8_DISALLOW_FE_FF = 0x0800;
47my $UTF8_WARN_FE_FF = 0x1000;
48my $UTF8_CHECK_ONLY = 0x2000;
49
50my $REPLACEMENT = 0xFFFD;
51
52my @warnings;
53
54use warnings 'utf8';
55local $SIG{__WARN__} = sub { push @warnings, @_ };
56
57# First test the malformations. All these raise category utf8 warnings.
58foreach my $test (
59 [ "zero length string malformation", "", 0,
60 $UTF8_ALLOW_EMPTY, 0, 0,
61 qr/empty string/
62 ],
63 [ "orphan continuation byte malformation", "\x80a", 2,
64 $UTF8_ALLOW_CONTINUATION, $REPLACEMENT, 1,
65 qr/unexpected continuation byte/
66 ],
67 [ "premature next character malformation (immediate)", "\xc2a", 2,
68 $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 1,
69 qr/unexpected non-continuation byte.*immediately after start byte/
70 ],
71 [ "premature next character malformation (non-immediate)", "\xf0\x80a", 3,
72 $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 2,
73 qr/unexpected non-continuation byte .* 2 bytes after start byte/
74 ],
75 [ "too short malformation", "\xf0\x80a", 2,
76 # Having the 'a' after this, but saying there are only 2 bytes also
77 # tests that we pay attention to the passed in length
78 $UTF8_ALLOW_SHORT, $REPLACEMENT, 2,
79 qr/2 bytes, need 4/
80 ],
81 [ "overlong malformation", "\xc1\xaf", 2,
82 $UTF8_ALLOW_LONG, ord('o'), 2,
83 qr/2 bytes, need 1/
84 ],
85 [ "overflow malformation", "\xff\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf", 13,
86 0, # There is no way to allow this malformation
87 $REPLACEMENT, 13,
88 qr/overflow/
89 ],
90) {
91 my ($testname, $bytes, $length, $allow_flags, $allowed_uv, $expected_len, $message ) = @$test;
92
93 next if ! ok(length($bytes) >= $length, "$testname: Make sure won't read beyond buffer: " . length($bytes) . " >= $length");
94
95 # Test what happens when this malformation is not allowed
96 undef @warnings;
bd70aaaf 97 my $ret_ref = test_utf8n_to_uvchr($bytes, $length, 0);
eb83ed87
KW
98 is($ret_ref->[0], 0, "$testname: disallowed: Returns 0");
99 is($ret_ref->[1], $expected_len, "$testname: disallowed: Returns expected length");
100 if (is(scalar @warnings, 1, "$testname: disallowed: Got a single warning ")) {
101 like($warnings[0], $message, "$testname: disallowed: Got expected warning");
102 }
103 else {
104 if (scalar @warnings) {
105 note "The warnings were: " . join(", ", @warnings);
106 }
107 }
108
109 { # Next test when disallowed, and warnings are off.
110 undef @warnings;
111 no warnings 'utf8';
bd70aaaf 112 my $ret_ref = test_utf8n_to_uvchr($bytes, $length, 0);
eb83ed87
KW
113 is($ret_ref->[0], 0, "$testname: disallowed: no warnings 'utf8': Returns 0");
114 is($ret_ref->[1], $expected_len, "$testname: disallowed: no warnings 'utf8': Returns expected length");
115 if (!is(scalar @warnings, 0, "$testname: disallowed: no warnings 'utf8': no warnings generated")) {
116 note "The warnings were: " . join(", ", @warnings);
117 }
118 }
119
120 # Test with CHECK_ONLY
121 undef @warnings;
bd70aaaf 122 $ret_ref = test_utf8n_to_uvchr($bytes, $length, $UTF8_CHECK_ONLY);
eb83ed87
KW
123 is($ret_ref->[0], 0, "$testname: CHECK_ONLY: Returns 0");
124 is($ret_ref->[1], -1, "$testname: CHECK_ONLY: returns expected length");
125 if (! is(scalar @warnings, 0, "$testname: CHECK_ONLY: no warnings generated")) {
126 note "The warnings were: " . join(", ", @warnings);
127 }
128
129 next if $allow_flags == 0; # Skip if can't allow this malformation
130
131 # Test when the malformation is allowed
132 undef @warnings;
bd70aaaf 133 $ret_ref = test_utf8n_to_uvchr($bytes, $length, $allow_flags);
eb83ed87
KW
134 is($ret_ref->[0], $allowed_uv, "$testname: allowed: Returns expected uv");
135 is($ret_ref->[1], $expected_len, "$testname: allowed: Returns expected length");
136 if (!is(scalar @warnings, 0, "$testname: allowed: no warnings generated"))
137 {
138 note "The warnings were: " . join(", ", @warnings);
139 }
140}
141
142my $FF_ret;
143
144use Unicode::UCD;
145my $has_quad = ($Unicode::UCD::MAX_CP > 0xFFFF_FFFF);
146if ($has_quad) {
147 no warnings qw{portable overflow};
148 $FF_ret = 0x1000000000;
149}
150else { # The above overflows unless a quad platform
151 $FF_ret = 0;
152}
153
154# Now test the cases where a legal code point is generated, but may or may not
155# be allowed/warned on.
2f8f112e 156my @tests = (
eb83ed87
KW
157 [ "surrogate", "\xed\xa4\x8d",
158 $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE, 'surrogate', 0xD90D, 3,
159 qr/surrogate/
160 ],
161 [ "non_unicode", "\xf4\x90\x80\x80",
162 $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER, 'non_unicode', 0x110000, 4,
163 qr/not Unicode/
164 ],
165 [ "non-character code point", "\xEF\xB7\x90",
166 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR, 'nonchar', 0xFDD0, 3,
ba707cdc 167 qr/Unicode non-character.*is not recommended for open interchange/
eb83ed87
KW
168 ],
169 [ "begins with FE", "\xfe\x82\x80\x80\x80\x80\x80",
170
171 # This code point is chosen so that it is representable in a UV on
2f8f112e 172 # 32-bit machines
eb83ed87 173 $UTF8_WARN_FE_FF, $UTF8_DISALLOW_FE_FF, 'utf8', 0x80000000, 7,
ea5ced44 174 qr/Code point 0x80000000 is not Unicode, and not portable/
eb83ed87 175 ],
2f8f112e
KW
176 [ "overflow with FE/FF",
177 # This tests the interaction of WARN_FE_FF/DISALLOW_FE_FF with
178 # overflow. The overflow malformation is never allowed, so preventing
179 # it takes precedence if the FE_FF options would otherwise allow in an
180 # overflowing value. These two code points (1 for 32-bits; 1 for 64)
181 # were chosen because the old overflow detection algorithm did not
182 # catch them; this means this test also checks for that fix.
183 ($has_quad)
184 ? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
185 : "\xfe\x86\x80\x80\x80\x80\x80",
ea5ced44
KW
186
187 # We include both warning categories to make sure the FE_FF one has
188 # precedence
189 "$UTF8_WARN_FE_FF|$UTF8_WARN_SUPER", "$UTF8_DISALLOW_FE_FF", 'utf8', 0,
2f8f112e 190 ($has_quad) ? 13 : 7,
ea5ced44 191 qr/overflow at byte .*, after start byte 0xf/
eb83ed87 192 ],
2f8f112e
KW
193);
194
195if ($has_quad) { # All FF's will overflow on 32 bit
196 push @tests,
197 [ "begins with FF", "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
198 $UTF8_WARN_FE_FF, $UTF8_DISALLOW_FE_FF, 'utf8', $FF_ret, 13,
ea5ced44 199 qr/Code point 0x.* is not Unicode, and not portable/
2f8f112e
KW
200 ];
201}
202
203foreach my $test (@tests) {
eb83ed87
KW
204 my ($testname, $bytes, $warn_flags, $disallow_flags, $category, $allowed_uv, $expected_len, $message ) = @$test;
205
206 my $length = length $bytes;
2f8f112e 207 my $will_overflow = $testname =~ /overflow/;
eb83ed87
KW
208
209 # This is more complicated than the malformations tested earlier, as there
210 # are several orthogonal variables involved. We test all the subclasses
211 # of utf8 warnings to verify they work with and without the utf8 class,
212 # and don't have effects on other sublass warnings
54f4afef 213 foreach my $warning ('utf8', 'surrogate', 'nonchar', 'non_unicode') {
eb83ed87
KW
214 foreach my $warn_flag (0, $warn_flags) {
215 foreach my $disallow_flag (0, $disallow_flags) {
54f4afef 216 foreach my $do_warning (0, 1) {
eb83ed87 217
13d7a909
KW
218 my $eval_warn = $do_warning
219 ? "use warnings '$warning'"
220 : $warning eq "utf8"
54f4afef
KW
221 ? "no warnings 'utf8'"
222 : "use warnings 'utf8'; no warnings '$warning'";
2f8f112e 223
13d7a909
KW
224 # is effectively disallowed if will overflow, even if the
225 # flag indicates it is allowed, fix up test name to
226 # indicate this as well
227 my $disallowed = $disallow_flag || $will_overflow;
2f8f112e 228
13d7a909
KW
229 my $this_name = "$testname: " . (($disallow_flag)
230 ? 'disallowed'
231 : ($disallowed)
232 ? 'FE_FF allowed'
233 : 'allowed');
234 $this_name .= ", $eval_warn";
235 $this_name .= ", " . (($warn_flag)
236 ? 'with warning flag'
237 : 'no warning flag');
eb83ed87 238
13d7a909
KW
239 undef @warnings;
240 my $ret_ref;
241 #note __LINE__ . ": $eval_warn; \$ret_ref = test_utf8n_to_uvchr('$bytes', $length, $warn_flag|$disallow_flag)";
242 my $eval_text = "$eval_warn; \$ret_ref = test_utf8n_to_uvchr('$bytes', $length, $warn_flag|$disallow_flag)";
243 eval "$eval_text";
244 if (! ok ("$@ eq ''", "$this_name: eval succeeded")) {
245 note "\$!='$!'; eval'd=\"$eval_text\"";
246 next;
54f4afef 247 }
13d7a909
KW
248 if ($disallowed) {
249 is($ret_ref->[0], 0, "$this_name: Returns 0");
2f8f112e
KW
250 }
251 else {
13d7a909
KW
252 is($ret_ref->[0], $allowed_uv,
253 "$this_name: Returns expected uv");
254 }
255 is($ret_ref->[1], $expected_len,
256 "$this_name: Returns expected length");
257
258 if (! $do_warning
259 && ($warning eq 'utf8' || $warning eq $category))
260 {
261 if (!is(scalar @warnings, 0,
262 "$this_name: No warnings generated"))
263 {
2f8f112e
KW
264 note "The warnings were: " . join(", ", @warnings);
265 }
266 }
13d7a909
KW
267 elsif ($will_overflow
268 && ! $disallow_flag
269 && $warning eq 'utf8')
270 {
271
272 # Will get the overflow message instead of the expected
273 # message under these circumstances, as they would
274 # otherwise accept an overflowed value, which the code
275 # should not allow, so falls back to overflow.
276 if (is(scalar @warnings, 1,
277 "$this_name: Got a single warning "))
278 {
279 like($warnings[0], qr/overflow/,
280 "$this_name: Got overflow warning");
281 }
282 else {
283 if (scalar @warnings) {
284 note "The warnings were: "
285 . join(", ", @warnings);
286 }
287 }
eb83ed87 288 }
13d7a909
KW
289 elsif ($warn_flag
290 && ($warning eq 'utf8' || $warning eq $category))
291 {
292 if (is(scalar @warnings, 1,
293 "$this_name: Got a single warning "))
294 {
295 like($warnings[0], $message,
296 "$this_name: Got expected warning");
297 }
298 else {
299 if (scalar @warnings) {
300 note "The warnings were: "
301 . join(", ", @warnings);
302 }
eb83ed87
KW
303 }
304 }
eb83ed87 305
13d7a909
KW
306 # Check CHECK_ONLY results when the input is disallowed. Do
307 # this when actually disallowed, not just when the
308 # $disallow_flag is set
309 if ($disallowed) {
310 undef @warnings;
311 $ret_ref = test_utf8n_to_uvchr($bytes, $length,
312 $disallow_flag|$UTF8_CHECK_ONLY);
313 is($ret_ref->[0], 0, "$this_name, CHECK_ONLY: Returns 0");
314 is($ret_ref->[1], -1,
315 "$this_name: CHECK_ONLY: returns expected length");
316 if (! is(scalar @warnings, 0,
317 "$this_name, CHECK_ONLY: no warnings generated"))
318 {
319 note "The warnings were: " . join(", ", @warnings);
320 }
eb83ed87
KW
321 }
322 }
323 }
324 }
325 }
326}
4deba822 327}
eb83ed87 328
6e3d6c02
KW
329
330# The numbers in this array are chosen because they are "interesting" on
331# either ASCII or EBCDIC platforms. 0-255 require special handling on EBCDIC;
332# others are the boundaries where the number of bytes required to represent
333# them increase.
334my @code_points = (0 .. 256,
335 0x400 - 1, 0x400,
336 0x800 - 1, 0x800,
337 0x4000 - 1, 0x4000,
338 0x8000 - 1, 0x8000,
339 0xD000 - 1, 0xD000, # First code point considered
340 # problematic on ASCII.
341 0x10000 - 1, 0x1000,
342 0x200000 - 1, 0x20000,
343 0x40000 - 1, 0x40000,
344 0x400000 - 1, 0x400000,
345 0x4000000 - 1, 0x4000000,
346 0x80000000 - 1 # Highest legal on EBCDIC machines
347 );
348for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
349 @code_points)
350{
351 my $hex_u = sprintf("0x%02X", $u);
352 my $n = utf8::unicode_to_native($u);
353 my $hex_n = sprintf("0x%02X", $n);
354
355 my $offskip_should_be = (ord ("A") == 65)
356 ? ( $u < 0x80 ? 1 :
357 $u < 0x800 ? 2 :
358 $u < 0x10000 ? 3 :
359 $u < 0x200000 ? 4 :
360 $u < 0x4000000 ? 5 :
361 $u < 0x80000000 ? 6 : 7 # 13 for 64 bit words
362 )
363 : ($u < 0xA0 ? 1 :
364 $u < 0x400 ? 2 :
365 $u < 0x4000 ? 3 :
366 $u < 0x40000 ? 4 :
367 $u < 0x400000 ? 5 :
368 $u < 0x4000000 ? 6 : 7
369 );
370
371 # If this test fails, subsequent ones are meaningless.
372 next unless is(test_OFFUNISKIP($u), $offskip_should_be,
373 "Verify OFFUNISKIP($hex_u) is $offskip_should_be");
374 my $invariant = $offskip_should_be == 1;
375 my $display_invariant = $invariant || 0;
376 is(test_OFFUNI_IS_INVARIANT($u), $invariant,
377 "Verify OFFUNI_IS_INVARIANT($hex_u) is $display_invariant");
378
379 my $uvchr_skip_should_be = $offskip_should_be;
380 next unless is(test_UVCHR_SKIP($n), $uvchr_skip_should_be,
381 "Verify UVCHR_SKIP($hex_n) is $uvchr_skip_should_be");
382 is(test_UVCHR_IS_INVARIANT($n), $offskip_should_be == 1,
383 "Verify UVCHR_IS_INVARIANT($hex_n) is $display_invariant");
384
385 my $n_chr = chr $n;
386 utf8::upgrade $n_chr;
387
388 is(test_UTF8_SKIP($n_chr), $uvchr_skip_should_be,
389 "Verify UTF8_SKIP(chr $hex_n) is $uvchr_skip_should_be");
390
391 use bytes;
392 for (my $j = 0; $j < length $n_chr; $j++) {
393 my $b = substr($n_chr, $j, 1);
394 my $hex_b = sprintf("\"\\x%02x\"", ord $b);
395
396 my $byte_invariant = $j == 0 && $uvchr_skip_should_be == 1;
397 my $display_byte_invariant = $byte_invariant || 0;
398 next unless is(test_UTF8_IS_INVARIANT($b), $byte_invariant,
399 " Verify UTF8_IS_INVARIANT($hex_b) for byte $j "
400 . "is $display_byte_invariant");
401
402 my $is_start = $j == 0 && $uvchr_skip_should_be > 1;
403 my $display_is_start = $is_start || 0;
404 next unless is(test_UTF8_IS_START($b), $is_start,
405 " Verify UTF8_IS_START($hex_b) is $display_is_start");
406
407 my $is_continuation = $j != 0 && $uvchr_skip_should_be > 1;
408 my $display_is_continuation = $is_continuation || 0;
409 next unless is(test_UTF8_IS_CONTINUATION($b), $is_continuation,
410 " Verify UTF8_IS_CONTINUATION($hex_b) is "
411 . "$display_is_continuation");
412
413 my $is_continued = $uvchr_skip_should_be > 1;
414 my $display_is_continued = $is_continued || 0;
415 next unless is(test_UTF8_IS_CONTINUED($b), $is_continued,
416 " Verify UTF8_IS_CONTINUED($hex_b) is "
417 . "$display_is_continued");
418
419 my $is_downgradeable_start = $n < 256
420 && $uvchr_skip_should_be > 1
421 && $j == 0;
422 my $display_is_downgradeable_start = $is_downgradeable_start || 0;
423 next unless is(test_UTF8_IS_DOWNGRADEABLE_START($b),
424 $is_downgradeable_start,
425 " Verify UTF8_IS_DOWNGRADEABLE_START($hex_b) is "
426 . "$display_is_downgradeable_start");
427
428 my $is_above_latin1 = $n > 255 && $j == 0;
429 my $display_is_above_latin1 = $is_above_latin1 || 0;
430 next unless is(test_UTF8_IS_ABOVE_LATIN1($b),
431 $is_above_latin1,
432 " Verify UTF8_IS_ABOVE_LATIN1($hex_b) is "
433 . "$display_is_above_latin1");
434
435 my $is_possibly_problematic = $j == 0
436 && $n >= (($isASCII)
437 ? 0xD000
438 : 0x8000);
439 my $display_is_possibly_problematic = $is_possibly_problematic || 0;
440 next unless is(test_isUTF8_POSSIBLY_PROBLEMATIC($b),
441 $is_possibly_problematic,
442 " Verify isUTF8_POSSIBLY_PROBLEMATIC($hex_b) is "
443 . "$display_is_above_latin1");
444 }
445}
446
fed3ba5d 447done_testing;