This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
toke.c: Remove soon-to-be invalid t assumption
[perl5.git] / ext / XS-APItest / t / utf8.t
CommitLineData
fed3ba5d
NC
1#!perl -w
2
3use strict;
4use Test::More;
7dfd8446 5$|=1;
fed3ba5d
NC
6
7use XS::APItest;
8
4deba822
KW
9my $pound_sign = chr utf8::unicode_to_native(163);
10
7dfd8446
KW
11sub isASCII { ord "A" == 65 }
12
13sub display_bytes {
14 my $string = shift;
15 return '"'
16 . join("", map { sprintf("\\x%02x", ord $_) } split "", $string)
17 . '"';
18}
19
20# This test file can't use byte_utf8a_to_utf8n() from t/charset_tools.pl
21# because that uses the same functions we are testing here. So UTF-EBCDIC
22# strings are hard-coded as I8 strings in this file instead, and we use array
23# lookup to translate into the appropriate code page.
24
25my @i8_to_native = ( # Only code page 1047 so far.
26# _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F
270x00,0x01,0x02,0x03,0x37,0x2D,0x2E,0x2F,0x16,0x05,0x15,0x0B,0x0C,0x0D,0x0E,0x0F,
280x10,0x11,0x12,0x13,0x3C,0x3D,0x32,0x26,0x18,0x19,0x3F,0x27,0x1C,0x1D,0x1E,0x1F,
290x40,0x5A,0x7F,0x7B,0x5B,0x6C,0x50,0x7D,0x4D,0x5D,0x5C,0x4E,0x6B,0x60,0x4B,0x61,
300xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0x7A,0x5E,0x4C,0x7E,0x6E,0x6F,
310x7C,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,
320xD7,0xD8,0xD9,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xAD,0xE0,0xBD,0x5F,0x6D,
330x79,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x91,0x92,0x93,0x94,0x95,0x96,
340x97,0x98,0x99,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,0xA8,0xA9,0xC0,0x4F,0xD0,0xA1,0x07,
350x20,0x21,0x22,0x23,0x24,0x25,0x06,0x17,0x28,0x29,0x2A,0x2B,0x2C,0x09,0x0A,0x1B,
360x30,0x31,0x1A,0x33,0x34,0x35,0x36,0x08,0x38,0x39,0x3A,0x3B,0x04,0x14,0x3E,0xFF,
370x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x51,0x52,0x53,0x54,0x55,0x56,
380x57,0x58,0x59,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x70,0x71,0x72,0x73,
390x74,0x75,0x76,0x77,0x78,0x80,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,0x90,0x9A,0x9B,0x9C,
400x9D,0x9E,0x9F,0xA0,0xAA,0xAB,0xAC,0xAE,0xAF,0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,
410xB7,0xB8,0xB9,0xBA,0xBB,0xBC,0xBE,0xBF,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,0xDA,0xDB,
420xDC,0xDD,0xDE,0xDF,0xE1,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,0xFA,0xFB,0xFC,0xFD,0xFE,
43);
44
45*I8_to_native = (isASCII)
46 ? sub { return shift }
47 : sub { return join "", map { chr $i8_to_native[ord $_] }
48 split "", shift };
49
50my $is64bit = length sprintf("%x", ~0) > 8;
51
52
53# Test utf8n_to_uvchr(). These provide essentially complete code coverage.
54# Copied from utf8.h
55my $UTF8_ALLOW_EMPTY = 0x0001;
56my $UTF8_ALLOW_CONTINUATION = 0x0002;
57my $UTF8_ALLOW_NON_CONTINUATION = 0x0004;
58my $UTF8_ALLOW_SHORT = 0x0008;
59my $UTF8_ALLOW_LONG = 0x0010;
60my $UTF8_DISALLOW_SURROGATE = 0x0020;
61my $UTF8_WARN_SURROGATE = 0x0040;
62my $UTF8_DISALLOW_NONCHAR = 0x0080;
63my $UTF8_WARN_NONCHAR = 0x0100;
64my $UTF8_DISALLOW_SUPER = 0x0200;
65my $UTF8_WARN_SUPER = 0x0400;
1d1c12d9
KW
66my $UTF8_DISALLOW_ABOVE_31_BIT = 0x0800;
67my $UTF8_WARN_ABOVE_31_BIT = 0x1000;
7dfd8446
KW
68my $UTF8_CHECK_ONLY = 0x2000;
69
046d01eb
KW
70# Test uvchr_to_utf8().
71my $UNICODE_WARN_SURROGATE = 0x0001;
72my $UNICODE_WARN_NONCHAR = 0x0002;
73my $UNICODE_WARN_SUPER = 0x0004;
74my $UNICODE_WARN_ABOVE_31_BIT = 0x0008;
75my $UNICODE_DISALLOW_SURROGATE = 0x0010;
76my $UNICODE_DISALLOW_NONCHAR = 0x0020;
77my $UNICODE_DISALLOW_SUPER = 0x0040;
78my $UNICODE_DISALLOW_ABOVE_31_BIT = 0x0080;
79
80my $look_for_everything_utf8n_to
81 = $UTF8_DISALLOW_SURROGATE
7dfd8446
KW
82 | $UTF8_WARN_SURROGATE
83 | $UTF8_DISALLOW_NONCHAR
84 | $UTF8_WARN_NONCHAR
85 | $UTF8_DISALLOW_SUPER
86 | $UTF8_WARN_SUPER
1d1c12d9
KW
87 | $UTF8_DISALLOW_ABOVE_31_BIT
88 | $UTF8_WARN_ABOVE_31_BIT;
046d01eb
KW
89my $look_for_everything_uvchr_to
90 = $UNICODE_DISALLOW_SURROGATE
91 | $UNICODE_WARN_SURROGATE
92 | $UNICODE_DISALLOW_NONCHAR
93 | $UNICODE_WARN_NONCHAR
94 | $UNICODE_DISALLOW_SUPER
95 | $UNICODE_WARN_SUPER
96 | $UNICODE_DISALLOW_ABOVE_31_BIT
97 | $UNICODE_WARN_ABOVE_31_BIT;
7dfd8446 98
fed3ba5d
NC
99foreach ([0, '', '', 'empty'],
100 [0, 'N', 'N', '1 char'],
101 [1, 'NN', 'N', '1 char substring'],
102 [-2, 'Perl', 'Rules', 'different'],
4deba822
KW
103 [0, $pound_sign, $pound_sign, 'pound sign'],
104 [1, $pound_sign . 10, $pound_sign . 1, '10 pounds is more than 1 pound'],
105 [1, $pound_sign . $pound_sign, $pound_sign, '2 pound signs are more than 1'],
fed3ba5d
NC
106 [-2, ' $!', " \x{1F42B}!", 'Camels are worth more than 1 dollar'],
107 [-1, '!', "!\x{1F42A}", 'Initial substrings match'],
108 ) {
109 my ($expect, $left, $right, $desc) = @$_;
110 my $copy = $right;
111 utf8::encode($copy);
112 is(bytes_cmp_utf8($left, $copy), $expect, $desc);
113 next if $right =~ tr/\0-\377//c;
114 utf8::encode($left);
115 is(bytes_cmp_utf8($right, $left), -$expect, "$desc reversed");
116}
117
7dfd8446
KW
118# The keys to this hash are Unicode code points, their values are the native
119# UTF-8 representations of them. The code points are chosen because they are
120# "interesting" on either or both ASCII and EBCDIC platforms. First we add
121# boundaries where the number of bytes required to represent them increase, or
122# are adjacent to problematic code points, so we want to make sure they aren't
123# considered problematic.
124my %code_points = (
125 0x0100 => (isASCII) ? "\xc4\x80" : I8_to_native("\xc8\xa0"),
126 0x0400 - 1 => (isASCII) ? "\xcf\xbf" : I8_to_native("\xdf\xbf"),
127 0x0400 => (isASCII) ? "\xd0\x80" : I8_to_native("\xe1\xa0\xa0"),
128 0x0800 - 1 => (isASCII) ? "\xdf\xbf" : I8_to_native("\xe1\xbf\xbf"),
129 0x0800 => (isASCII) ? "\xe0\xa0\x80" : I8_to_native("\xe2\xa0\xa0"),
130 0x4000 - 1 => (isASCII) ? "\xe3\xbf\xbf" : I8_to_native("\xef\xbf\xbf"),
131 0x4000 => (isASCII) ? "\xe4\x80\x80" : I8_to_native("\xf0\xb0\xa0\xa0"),
132 0x8000 - 1 => (isASCII) ? "\xe7\xbf\xbf" : I8_to_native("\xf0\xbf\xbf\xbf"),
133
134 # First code point that the implementation of isUTF8_POSSIBLY_PROBLEMATIC,
135 # as of this writing, considers potentially problematic on EBCDIC
136 0x8000 => (isASCII) ? "\xe8\x80\x80" : I8_to_native("\xf1\xa0\xa0\xa0"),
137
138 0xD000 - 1 => (isASCII) ? "\xec\xbf\xbf" : I8_to_native("\xf1\xb3\xbf\xbf"),
139
140 # First code point that the implementation of isUTF8_POSSIBLY_PROBLEMATIC,
141 # as of this writing, considers potentially problematic on ASCII
142 0xD000 => (isASCII) ? "\xed\x80\x80" : I8_to_native("\xf1\xb4\xa0\xa0"),
143
144 # Bracket the surrogates
145 0xD7FF => (isASCII) ? "\xed\x9f\xbf" : I8_to_native("\xf1\xb5\xbf\xbf"),
146 0xE000 => (isASCII) ? "\xee\x80\x80" : I8_to_native("\xf1\xb8\xa0\xa0"),
147
148 # Bracket the 32 contiguous non characters
149 0xFDCF => (isASCII) ? "\xef\xb7\x8f" : I8_to_native("\xf1\xbf\xae\xaf"),
150 0xFDF0 => (isASCII) ? "\xef\xb7\xb0" : I8_to_native("\xf1\xbf\xaf\xb0"),
151
152 # Mostly bracket non-characters, but some are transitions to longer
153 # strings
154 0xFFFD => (isASCII) ? "\xef\xbf\xbd" : I8_to_native("\xf1\xbf\xbf\xbd"),
155 0x10000 - 1 => (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
156 0x10000 => (isASCII) ? "\xf0\x90\x80\x80" : I8_to_native("\xf2\xa0\xa0\xa0"),
157 0x1FFFD => (isASCII) ? "\xf0\x9f\xbf\xbd" : I8_to_native("\xf3\xbf\xbf\xbd"),
158 0x20000 => (isASCII) ? "\xf0\xa0\x80\x80" : I8_to_native("\xf4\xa0\xa0\xa0"),
159 0x2FFFD => (isASCII) ? "\xf0\xaf\xbf\xbd" : I8_to_native("\xf5\xbf\xbf\xbd"),
160 0x30000 => (isASCII) ? "\xf0\xb0\x80\x80" : I8_to_native("\xf6\xa0\xa0\xa0"),
161 0x3FFFD => (isASCII) ? "\xf0\xbf\xbf\xbd" : I8_to_native("\xf7\xbf\xbf\xbd"),
162 0x40000 - 1 => (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
163 0x40000 => (isASCII) ? "\xf1\x80\x80\x80" : I8_to_native("\xf8\xa8\xa0\xa0\xa0"),
164 0x4FFFD => (isASCII) ? "\xf1\x8f\xbf\xbd" : I8_to_native("\xf8\xa9\xbf\xbf\xbd"),
165 0x50000 => (isASCII) ? "\xf1\x90\x80\x80" : I8_to_native("\xf8\xaa\xa0\xa0\xa0"),
166 0x5FFFD => (isASCII) ? "\xf1\x9f\xbf\xbd" : I8_to_native("\xf8\xab\xbf\xbf\xbd"),
167 0x60000 => (isASCII) ? "\xf1\xa0\x80\x80" : I8_to_native("\xf8\xac\xa0\xa0\xa0"),
168 0x6FFFD => (isASCII) ? "\xf1\xaf\xbf\xbd" : I8_to_native("\xf8\xad\xbf\xbf\xbd"),
169 0x70000 => (isASCII) ? "\xf1\xb0\x80\x80" : I8_to_native("\xf8\xae\xa0\xa0\xa0"),
170 0x7FFFD => (isASCII) ? "\xf1\xbf\xbf\xbd" : I8_to_native("\xf8\xaf\xbf\xbf\xbd"),
171 0x80000 => (isASCII) ? "\xf2\x80\x80\x80" : I8_to_native("\xf8\xb0\xa0\xa0\xa0"),
172 0x8FFFD => (isASCII) ? "\xf2\x8f\xbf\xbd" : I8_to_native("\xf8\xb1\xbf\xbf\xbd"),
173 0x90000 => (isASCII) ? "\xf2\x90\x80\x80" : I8_to_native("\xf8\xb2\xa0\xa0\xa0"),
174 0x9FFFD => (isASCII) ? "\xf2\x9f\xbf\xbd" : I8_to_native("\xf8\xb3\xbf\xbf\xbd"),
175 0xA0000 => (isASCII) ? "\xf2\xa0\x80\x80" : I8_to_native("\xf8\xb4\xa0\xa0\xa0"),
176 0xAFFFD => (isASCII) ? "\xf2\xaf\xbf\xbd" : I8_to_native("\xf8\xb5\xbf\xbf\xbd"),
177 0xB0000 => (isASCII) ? "\xf2\xb0\x80\x80" : I8_to_native("\xf8\xb6\xa0\xa0\xa0"),
178 0xBFFFD => (isASCII) ? "\xf2\xbf\xbf\xbd" : I8_to_native("\xf8\xb7\xbf\xbf\xbd"),
179 0xC0000 => (isASCII) ? "\xf3\x80\x80\x80" : I8_to_native("\xf8\xb8\xa0\xa0\xa0"),
180 0xCFFFD => (isASCII) ? "\xf3\x8f\xbf\xbd" : I8_to_native("\xf8\xb9\xbf\xbf\xbd"),
181 0xD0000 => (isASCII) ? "\xf3\x90\x80\x80" : I8_to_native("\xf8\xba\xa0\xa0\xa0"),
182 0xDFFFD => (isASCII) ? "\xf3\x9f\xbf\xbd" : I8_to_native("\xf8\xbb\xbf\xbf\xbd"),
183 0xE0000 => (isASCII) ? "\xf3\xa0\x80\x80" : I8_to_native("\xf8\xbc\xa0\xa0\xa0"),
184 0xEFFFD => (isASCII) ? "\xf3\xaf\xbf\xbd" : I8_to_native("\xf8\xbd\xbf\xbf\xbd"),
185 0xF0000 => (isASCII) ? "\xf3\xb0\x80\x80" : I8_to_native("\xf8\xbe\xa0\xa0\xa0"),
186 0xFFFFD => (isASCII) ? "\xf3\xbf\xbf\xbd" : I8_to_native("\xf8\xbf\xbf\xbf\xbd"),
187 0x100000 => (isASCII) ? "\xf4\x80\x80\x80" : I8_to_native("\xf9\xa0\xa0\xa0\xa0"),
188 0x10FFFD => (isASCII) ? "\xf4\x8f\xbf\xbd" : I8_to_native("\xf9\xa1\xbf\xbf\xbd"),
189 0x110000 => (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
190
191 # Things that would be noncharacters if they were in Unicode, and might be
192 # mistaken, if the C code is bad, to be nonchars
193 0x11FFFE => (isASCII) ? "\xf4\x9f\xbf\xbe" : I8_to_native("\xf9\xa3\xbf\xbf\xbe"),
194 0x11FFFF => (isASCII) ? "\xf4\x9f\xbf\xbf" : I8_to_native("\xf9\xa3\xbf\xbf\xbf"),
195 0x20FFFE => (isASCII) ? "\xf8\x88\x8f\xbf\xbe" : I8_to_native("\xfa\xa1\xbf\xbf\xbe"),
196 0x20FFFF => (isASCII) ? "\xf8\x88\x8f\xbf\xbf" : I8_to_native("\xfa\xa1\xbf\xbf\xbf"),
197
198 0x200000 - 1 => (isASCII) ? "\xf7\xbf\xbf\xbf" : I8_to_native("\xf9\xbf\xbf\xbf\xbf"),
199 0x200000 => (isASCII) ? "\xf8\x88\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
200 0x400000 - 1 => (isASCII) ? "\xf8\x8f\xbf\xbf\xbf" : I8_to_native("\xfb\xbf\xbf\xbf\xbf"),
201 0x400000 => (isASCII) ? "\xf8\x90\x80\x80\x80" : I8_to_native("\xfc\xa4\xa0\xa0\xa0\xa0"),
202 0x4000000 - 1 => (isASCII) ? "\xfb\xbf\xbf\xbf\xbf" : I8_to_native("\xfd\xbf\xbf\xbf\xbf\xbf"),
203 0x4000000 => (isASCII) ? "\xfc\x84\x80\x80\x80\x80" : I8_to_native("\xfe\xa2\xa0\xa0\xa0\xa0\xa0"),
204 0x4000000 - 1 => (isASCII) ? "\xfb\xbf\xbf\xbf\xbf" : I8_to_native("\xfd\xbf\xbf\xbf\xbf\xbf"),
205 0x4000000 => (isASCII) ? "\xfc\x84\x80\x80\x80\x80" : I8_to_native("\xfe\xa2\xa0\xa0\xa0\xa0\xa0"),
206 0x40000000 - 1 => (isASCII) ? "\xfc\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xfe\xbf\xbf\xbf\xbf\xbf\xbf"),
207 0x40000000 => (isASCII) ? "\xfd\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0"),
208 0x80000000 - 1 => (isASCII) ? "\xfd\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf"),
209);
4deba822 210
7dfd8446
KW
211if (isASCII) {
212 $code_points{0x80000000} = "\xfe\x82\x80\x80\x80\x80\x80";
213 $code_points{0xFFFFFFFF} = "\xfe\x83\xbf\xbf\xbf\xbf\xbf";
214}
eb83ed87 215
7dfd8446
KW
216if ($is64bit) {
217 no warnings qw(overflow portable);
218 $code_points{0x100000000} = (isASCII) ? "\xfe\x84\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0");
219 $code_points{0x1000000000 - 1} = (isASCII) ? "\xfe\xbf\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf\xbf");
220 $code_points{0x1000000000} = (isASCII) ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0");
221 $code_points{0xFFFFFFFFFFFFFFFF} = (isASCII) ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf");
222}
eb83ed87 223
7dfd8446
KW
224# Now add in entries for each of code points 0-255, which require special
225# handling on EBCDIC. Remember the keys are Unicode values, and the values
226# are the native UTF-8. For invariants, the bytes are just the native chr.
227
228my $cp = 0;
229while ($cp < ((isASCII) ? 128 : 160)) { # This is from the definition of
230 # invariant
231 $code_points{$cp} = chr utf8::unicode_to_native($cp);
232 $cp++;
233}
234
235# Done with the invariants. Now do the variants. All in this range are 2
236# byte. Again, we can't use the internal functions to generate UTF-8, as
237# those are what we are trying to test. In the loop, we know what range the
238# continuation bytes can be in, and what the lowest start byte can be. So we
239# cycle through them.
240
241my $first_continuation = (isASCII) ? 0x80 : 0xA0;
242my $final_continuation = 0xBF;
243my $start = (isASCII) ? 0xC2 : 0xC5;
244
245my $continuation = $first_continuation - 1;
246
247while ($cp < 255) {
248 if (++$continuation > $final_continuation) {
249
250 # Wrap to the next start byte when we reach the final continuation
251 # byte possible
252 $continuation = $first_continuation;
253 $start++;
254 }
255 $code_points{$cp} = I8_to_native(chr($start) . chr($continuation));
256
257 $cp++;
258}
eb83ed87
KW
259
260my @warnings;
261
262use warnings 'utf8';
263local $SIG{__WARN__} = sub { push @warnings, @_ };
264
7dfd8446
KW
265# This set of tests looks for basic sanity, and lastly tests the bottom level
266# decode routine for the given code point. If the earlier tests for that code
267# point fail, that one probably will too. Malformations are tested in later
268# segments of code.
269for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
270 keys %code_points)
271{
272 my $hex_u = sprintf("0x%02X", $u);
273 my $n = utf8::unicode_to_native($u);
274 my $hex_n = sprintf("0x%02X", $n);
275 my $bytes = $code_points{$u};
276
277 my $offskip_should_be;
278 {
279 no warnings qw(overflow portable);
280 $offskip_should_be = (isASCII)
281 ? ( $u < 0x80 ? 1 :
282 $u < 0x800 ? 2 :
283 $u < 0x10000 ? 3 :
284 $u < 0x200000 ? 4 :
285 $u < 0x4000000 ? 5 :
286 $u < 0x80000000 ? 6 : (($is64bit)
287 ? ($u < 0x1000000000 ? 7 : 13)
288 : 7)
289 )
290 : ($u < 0xA0 ? 1 :
291 $u < 0x400 ? 2 :
292 $u < 0x4000 ? 3 :
293 $u < 0x40000 ? 4 :
294 $u < 0x400000 ? 5 :
295 $u < 0x4000000 ? 6 : 7
296 );
297 }
298
299 # If this test fails, subsequent ones are meaningless.
300 next unless is(test_OFFUNISKIP($u), $offskip_should_be,
301 "Verify OFFUNISKIP($hex_u) is $offskip_should_be");
302 my $invariant = $offskip_should_be == 1;
303 my $display_invariant = $invariant || 0;
304 is(test_OFFUNI_IS_INVARIANT($u), $invariant,
305 "Verify OFFUNI_IS_INVARIANT($hex_u) is $display_invariant");
306
307 my $uvchr_skip_should_be = $offskip_should_be;
308 next unless is(test_UVCHR_SKIP($n), $uvchr_skip_should_be,
309 "Verify UVCHR_SKIP($hex_n) is $uvchr_skip_should_be");
310 is(test_UVCHR_IS_INVARIANT($n), $offskip_should_be == 1,
311 "Verify UVCHR_IS_INVARIANT($hex_n) is $display_invariant");
312
313 my $n_chr = chr $n;
314 utf8::upgrade $n_chr;
315
316 is(test_UTF8_SKIP($n_chr), $uvchr_skip_should_be,
317 "Verify UTF8_SKIP(chr $hex_n) is $uvchr_skip_should_be");
318
319 use bytes;
320 for (my $j = 0; $j < length $n_chr; $j++) {
321 my $b = substr($n_chr, $j, 1);
322 my $hex_b = sprintf("\"\\x%02x\"", ord $b);
323
324 my $byte_invariant = $j == 0 && $uvchr_skip_should_be == 1;
325 my $display_byte_invariant = $byte_invariant || 0;
326 next unless is(test_UTF8_IS_INVARIANT($b), $byte_invariant,
327 " Verify UTF8_IS_INVARIANT($hex_b) for byte $j "
328 . "is $display_byte_invariant");
329
330 my $is_start = $j == 0 && $uvchr_skip_should_be > 1;
331 my $display_is_start = $is_start || 0;
332 next unless is(test_UTF8_IS_START($b), $is_start,
333 " Verify UTF8_IS_START($hex_b) is $display_is_start");
334
335 my $is_continuation = $j != 0 && $uvchr_skip_should_be > 1;
336 my $display_is_continuation = $is_continuation || 0;
337 next unless is(test_UTF8_IS_CONTINUATION($b), $is_continuation,
338 " Verify UTF8_IS_CONTINUATION($hex_b) is "
339 . "$display_is_continuation");
340
341 my $is_continued = $uvchr_skip_should_be > 1;
342 my $display_is_continued = $is_continued || 0;
343 next unless is(test_UTF8_IS_CONTINUED($b), $is_continued,
344 " Verify UTF8_IS_CONTINUED($hex_b) is "
345 . "$display_is_continued");
346
347 my $is_downgradeable_start = $n < 256
348 && $uvchr_skip_should_be > 1
349 && $j == 0;
350 my $display_is_downgradeable_start = $is_downgradeable_start || 0;
351 next unless is(test_UTF8_IS_DOWNGRADEABLE_START($b),
352 $is_downgradeable_start,
353 " Verify UTF8_IS_DOWNGRADEABLE_START($hex_b) is "
354 . "$display_is_downgradeable_start");
355
356 my $is_above_latin1 = $n > 255 && $j == 0;
357 my $display_is_above_latin1 = $is_above_latin1 || 0;
358 next unless is(test_UTF8_IS_ABOVE_LATIN1($b),
359 $is_above_latin1,
360 " Verify UTF8_IS_ABOVE_LATIN1($hex_b) is "
361 . "$display_is_above_latin1");
362
363 my $is_possibly_problematic = $j == 0
364 && $n >= ((isASCII)
365 ? 0xD000
366 : 0x8000);
367 my $display_is_possibly_problematic = $is_possibly_problematic || 0;
368 next unless is(test_isUTF8_POSSIBLY_PROBLEMATIC($b),
369 $is_possibly_problematic,
370 " Verify isUTF8_POSSIBLY_PROBLEMATIC($hex_b) is "
371 . "$display_is_above_latin1");
372 }
373
374 # We are not trying to look for warnings, etc, so if they should occur, it
375 # is an error. But some of the code points here do cause warnings, so we
376 # check here and turn off the ones that apply to such code points. A
377 # later section of the code tests for these kinds of things.
046d01eb 378 my $this_utf8_flags = $look_for_everything_utf8n_to;
7dfd8446
KW
379 my $len = length $bytes;
380 if ($n > 2 ** 31 - 1) {
1d1c12d9
KW
381 $this_utf8_flags &=
382 ~($UTF8_DISALLOW_ABOVE_31_BIT|$UTF8_WARN_ABOVE_31_BIT);
7dfd8446
KW
383 }
384 if ($n > 0x10FFFF) {
385 $this_utf8_flags &= ~($UTF8_DISALLOW_SUPER|$UTF8_WARN_SUPER);
386 }
387 elsif (($n & 0xFFFE) == 0xFFFE) {
388 $this_utf8_flags &= ~($UTF8_DISALLOW_NONCHAR|$UTF8_WARN_NONCHAR);
389 }
390
391 undef @warnings;
392
393 my $display_flags = sprintf "0x%x", $this_utf8_flags;
394 my $ret_ref = test_utf8n_to_uvchr($bytes, $len, $this_utf8_flags);
395 my $display_bytes = display_bytes($bytes);
396 is($ret_ref->[0], $n, "Verify utf8n_to_uvchr($display_bytes, $display_flags) returns $hex_n");
397 is($ret_ref->[1], $len, "Verify utf8n_to_uvchr() for $hex_n returns expected length");
398
399 unless (is(scalar @warnings, 0,
400 "Verify utf8n_to_uvchr() for $hex_n generated no warnings"))
401 {
402 diag "The warnings were: " . join(", ", @warnings);
403 }
046d01eb
KW
404
405 # Similarly for uvchr_to_utf8
406 my $this_uvchr_flags = $look_for_everything_uvchr_to;
407 if ($n > 2 ** 31 - 1) {
408 $this_uvchr_flags &=
409 ~($UNICODE_DISALLOW_ABOVE_31_BIT|$UNICODE_WARN_ABOVE_31_BIT);
410 }
411 if ($n > 0x10FFFF) {
412 $this_uvchr_flags &= ~($UNICODE_DISALLOW_SUPER|$UNICODE_WARN_SUPER);
413 }
414 elsif (($n & 0xFFFE) == 0xFFFE) {
415 $this_uvchr_flags &= ~($UNICODE_DISALLOW_NONCHAR|$UNICODE_WARN_NONCHAR);
416 }
417 $display_flags = sprintf "0x%x", $this_uvchr_flags;
418
419 undef @warnings;
420
421 my $ret = test_uvchr_to_utf8_flags($n, $this_uvchr_flags);
422 ok(defined $ret, "Verify uvchr_to_utf8_flags($hex_n, $display_flags) returned success");
423 is($ret, $bytes, "Verify uvchr_to_utf8_flags($hex_n, $display_flags) returns correct bytes");
424
425 unless (is(scalar @warnings, 0,
426 "Verify uvchr_to_utf8_flags($hex_n, $display_flags) for $hex_n generated no warnings"))
427 {
428 diag "The warnings were: " . join(", ", @warnings);
429 }
7dfd8446
KW
430}
431
432my $REPLACEMENT = 0xFFFD;
433
434# Now test the malformations. All these raise category utf8 warnings.
435my $c = (isASCII) ? "\x80" : "\xa0"; # A continuation byte
436my @malformations = (
eb83ed87
KW
437 [ "zero length string malformation", "", 0,
438 $UTF8_ALLOW_EMPTY, 0, 0,
439 qr/empty string/
440 ],
7dfd8446
KW
441 [ "orphan continuation byte malformation", I8_to_native("${c}a"),
442 2,
eb83ed87
KW
443 $UTF8_ALLOW_CONTINUATION, $REPLACEMENT, 1,
444 qr/unexpected continuation byte/
445 ],
7dfd8446
KW
446 [ "premature next character malformation (immediate)",
447 (isASCII) ? "\xc2a" : I8_to_native("\xc5") ."a",
448 2,
eb83ed87
KW
449 $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 1,
450 qr/unexpected non-continuation byte.*immediately after start byte/
451 ],
7dfd8446
KW
452 [ "premature next character malformation (non-immediate)",
453 I8_to_native("\xf0${c}a"),
454 3,
eb83ed87
KW
455 $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 2,
456 qr/unexpected non-continuation byte .* 2 bytes after start byte/
457 ],
7dfd8446 458 [ "too short malformation", I8_to_native("\xf0${c}a"), 2,
eb83ed87
KW
459 # Having the 'a' after this, but saying there are only 2 bytes also
460 # tests that we pay attention to the passed in length
461 $UTF8_ALLOW_SHORT, $REPLACEMENT, 2,
462 qr/2 bytes, need 4/
463 ],
7dfd8446
KW
464 [ "overlong malformation", I8_to_native("\xc0$c"), 2,
465 $UTF8_ALLOW_LONG,
466 0, # NUL
467 2,
eb83ed87 468 qr/2 bytes, need 1/
7dfd8446
KW
469 ]
470);
471
472if (isASCII) {
473 push @malformations,
474 [ "overflow malformation",
475 # These are the smallest overflowing on 64 byte machines:
476 # 2**64
477 "\xff\x80\x90\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0",
478 13,
eb83ed87 479 0, # There is no way to allow this malformation
7dfd8446
KW
480 $REPLACEMENT,
481 13,
eb83ed87 482 qr/overflow/
7dfd8446
KW
483 ];
484}
485
486foreach my $test (@malformations) {
eb83ed87
KW
487 my ($testname, $bytes, $length, $allow_flags, $allowed_uv, $expected_len, $message ) = @$test;
488
489 next if ! ok(length($bytes) >= $length, "$testname: Make sure won't read beyond buffer: " . length($bytes) . " >= $length");
490
491 # Test what happens when this malformation is not allowed
492 undef @warnings;
bd70aaaf 493 my $ret_ref = test_utf8n_to_uvchr($bytes, $length, 0);
eb83ed87
KW
494 is($ret_ref->[0], 0, "$testname: disallowed: Returns 0");
495 is($ret_ref->[1], $expected_len, "$testname: disallowed: Returns expected length");
496 if (is(scalar @warnings, 1, "$testname: disallowed: Got a single warning ")) {
497 like($warnings[0], $message, "$testname: disallowed: Got expected warning");
498 }
499 else {
500 if (scalar @warnings) {
7dfd8446 501 diag "The warnings were: " . join(", ", @warnings);
eb83ed87
KW
502 }
503 }
504
505 { # Next test when disallowed, and warnings are off.
506 undef @warnings;
507 no warnings 'utf8';
bd70aaaf 508 my $ret_ref = test_utf8n_to_uvchr($bytes, $length, 0);
eb83ed87
KW
509 is($ret_ref->[0], 0, "$testname: disallowed: no warnings 'utf8': Returns 0");
510 is($ret_ref->[1], $expected_len, "$testname: disallowed: no warnings 'utf8': Returns expected length");
511 if (!is(scalar @warnings, 0, "$testname: disallowed: no warnings 'utf8': no warnings generated")) {
7dfd8446 512 diag "The warnings were: " . join(", ", @warnings);
eb83ed87
KW
513 }
514 }
515
516 # Test with CHECK_ONLY
517 undef @warnings;
bd70aaaf 518 $ret_ref = test_utf8n_to_uvchr($bytes, $length, $UTF8_CHECK_ONLY);
eb83ed87
KW
519 is($ret_ref->[0], 0, "$testname: CHECK_ONLY: Returns 0");
520 is($ret_ref->[1], -1, "$testname: CHECK_ONLY: returns expected length");
521 if (! is(scalar @warnings, 0, "$testname: CHECK_ONLY: no warnings generated")) {
7dfd8446 522 diag "The warnings were: " . join(", ", @warnings);
eb83ed87
KW
523 }
524
525 next if $allow_flags == 0; # Skip if can't allow this malformation
526
527 # Test when the malformation is allowed
528 undef @warnings;
bd70aaaf 529 $ret_ref = test_utf8n_to_uvchr($bytes, $length, $allow_flags);
eb83ed87
KW
530 is($ret_ref->[0], $allowed_uv, "$testname: allowed: Returns expected uv");
531 is($ret_ref->[1], $expected_len, "$testname: allowed: Returns expected length");
532 if (!is(scalar @warnings, 0, "$testname: allowed: no warnings generated"))
533 {
7dfd8446 534 diag "The warnings were: " . join(", ", @warnings);
eb83ed87
KW
535 }
536}
537
538my $FF_ret;
539
7dfd8446 540if ($is64bit) {
eb83ed87
KW
541 no warnings qw{portable overflow};
542 $FF_ret = 0x1000000000;
543}
544else { # The above overflows unless a quad platform
545 $FF_ret = 0;
546}
547
548# Now test the cases where a legal code point is generated, but may or may not
549# be allowed/warned on.
2f8f112e 550my @tests = (
7dfd8446
KW
551 [ "lowest surrogate",
552 (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
553 $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
554 'surrogate', 0xD800,
555 (isASCII) ? 3 : 4,
556 qr/surrogate/
557 ],
558 [ "a middle surrogate",
559 (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
560 $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
561 'surrogate', 0xD90D,
562 (isASCII) ? 3 : 4,
eb83ed87
KW
563 qr/surrogate/
564 ],
7dfd8446
KW
565 [ "highest surrogate",
566 (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
567 $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
568 'surrogate', 0xDFFF,
569 (isASCII) ? 3 : 4,
570 qr/surrogate/
571 ],
572 [ "first non_unicode",
573 (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
574 $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
575 'non_unicode', 0x110000,
576 (isASCII) ? 4 : 5,
577 qr/not Unicode.* may not be portable/
578 ],
579 [ "first of 32 consecutive non-character code points",
580 (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
581 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
582 'nonchar', 0xFDD0,
583 (isASCII) ? 3 : 4,
584 qr/Unicode non-character.*is not recommended for open interchange/
585 ],
586 [ "a mid non-character code point of the 32 consecutive ones",
587 (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
588 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
589 'nonchar', 0xFDE0,
590 (isASCII) ? 3 : 4,
591 qr/Unicode non-character.*is not recommended for open interchange/
592 ],
593 [ "final of 32 consecutive non-character code points",
594 (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
595 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
596 'nonchar', 0xFDEF,
597 (isASCII) ? 3 : 4,
598 qr/Unicode non-character.*is not recommended for open interchange/
599 ],
600 [ "non-character code point U+FFFE",
601 (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
602 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
603 'nonchar', 0xFFFE,
604 (isASCII) ? 3 : 4,
605 qr/Unicode non-character.*is not recommended for open interchange/
606 ],
607 [ "non-character code point U+FFFF",
608 (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
609 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
610 'nonchar', 0xFFFF,
611 (isASCII) ? 3 : 4,
612 qr/Unicode non-character.*is not recommended for open interchange/
613 ],
614 [ "non-character code point U+1FFFE",
615 (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
616 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
617 'nonchar', 0x1FFFE, 4,
618 qr/Unicode non-character.*is not recommended for open interchange/
619 ],
620 [ "non-character code point U+1FFFF",
621 (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
622 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
623 'nonchar', 0x1FFFF, 4,
624 qr/Unicode non-character.*is not recommended for open interchange/
625 ],
626 [ "non-character code point U+2FFFE",
627 (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
628 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
629 'nonchar', 0x2FFFE, 4,
630 qr/Unicode non-character.*is not recommended for open interchange/
631 ],
632 [ "non-character code point U+2FFFF",
633 (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
634 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
635 'nonchar', 0x2FFFF, 4,
636 qr/Unicode non-character.*is not recommended for open interchange/
637 ],
638 [ "non-character code point U+3FFFE",
639 (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
640 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
641 'nonchar', 0x3FFFE, 4,
642 qr/Unicode non-character.*is not recommended for open interchange/
643 ],
644 [ "non-character code point U+3FFFF",
645 (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
646 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
647 'nonchar', 0x3FFFF, 4,
648 qr/Unicode non-character.*is not recommended for open interchange/
649 ],
650 [ "non-character code point U+4FFFE",
651 (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
652 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
653 'nonchar', 0x4FFFE,
654 (isASCII) ? 4 : 5,
655 qr/Unicode non-character.*is not recommended for open interchange/
656 ],
657 [ "non-character code point U+4FFFF",
658 (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
659 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
660 'nonchar', 0x4FFFF,
661 (isASCII) ? 4 : 5,
662 qr/Unicode non-character.*is not recommended for open interchange/
663 ],
664 [ "non-character code point U+5FFFE",
665 (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
666 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
667 'nonchar', 0x5FFFE,
668 (isASCII) ? 4 : 5,
669 qr/Unicode non-character.*is not recommended for open interchange/
670 ],
671 [ "non-character code point U+5FFFF",
672 (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
673 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
674 'nonchar', 0x5FFFF,
675 (isASCII) ? 4 : 5,
676 qr/Unicode non-character.*is not recommended for open interchange/
677 ],
678 [ "non-character code point U+6FFFE",
679 (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
680 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
681 'nonchar', 0x6FFFE,
682 (isASCII) ? 4 : 5,
683 qr/Unicode non-character.*is not recommended for open interchange/
684 ],
685 [ "non-character code point U+6FFFF",
686 (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
687 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
688 'nonchar', 0x6FFFF,
689 (isASCII) ? 4 : 5,
690 qr/Unicode non-character.*is not recommended for open interchange/
691 ],
692 [ "non-character code point U+7FFFE",
693 (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
694 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
695 'nonchar', 0x7FFFE,
696 (isASCII) ? 4 : 5,
697 qr/Unicode non-character.*is not recommended for open interchange/
698 ],
699 [ "non-character code point U+7FFFF",
700 (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
701 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
702 'nonchar', 0x7FFFF,
703 (isASCII) ? 4 : 5,
704 qr/Unicode non-character.*is not recommended for open interchange/
705 ],
706 [ "non-character code point U+8FFFE",
707 (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
708 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
709 'nonchar', 0x8FFFE,
710 (isASCII) ? 4 : 5,
711 qr/Unicode non-character.*is not recommended for open interchange/
712 ],
713 [ "non-character code point U+8FFFF",
714 (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
715 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
716 'nonchar', 0x8FFFF,
717 (isASCII) ? 4 : 5,
718 qr/Unicode non-character.*is not recommended for open interchange/
719 ],
720 [ "non-character code point U+9FFFE",
721 (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
722 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
723 'nonchar', 0x9FFFE,
724 (isASCII) ? 4 : 5,
725 qr/Unicode non-character.*is not recommended for open interchange/
726 ],
727 [ "non-character code point U+9FFFF",
728 (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
729 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
730 'nonchar', 0x9FFFF,
731 (isASCII) ? 4 : 5,
732 qr/Unicode non-character.*is not recommended for open interchange/
733 ],
734 [ "non-character code point U+AFFFE",
735 (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
736 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
737 'nonchar', 0xAFFFE,
738 (isASCII) ? 4 : 5,
739 qr/Unicode non-character.*is not recommended for open interchange/
740 ],
741 [ "non-character code point U+AFFFF",
742 (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
743 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
744 'nonchar', 0xAFFFF,
745 (isASCII) ? 4 : 5,
746 qr/Unicode non-character.*is not recommended for open interchange/
747 ],
748 [ "non-character code point U+BFFFE",
749 (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
750 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
751 'nonchar', 0xBFFFE,
752 (isASCII) ? 4 : 5,
753 qr/Unicode non-character.*is not recommended for open interchange/
754 ],
755 [ "non-character code point U+BFFFF",
756 (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
757 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
758 'nonchar', 0xBFFFF,
759 (isASCII) ? 4 : 5,
760 qr/Unicode non-character.*is not recommended for open interchange/
761 ],
762 [ "non-character code point U+CFFFE",
763 (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
764 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
765 'nonchar', 0xCFFFE,
766 (isASCII) ? 4 : 5,
767 qr/Unicode non-character.*is not recommended for open interchange/
768 ],
769 [ "non-character code point U+CFFFF",
770 (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
771 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
772 'nonchar', 0xCFFFF,
773 (isASCII) ? 4 : 5,
774 qr/Unicode non-character.*is not recommended for open interchange/
775 ],
776 [ "non-character code point U+DFFFE",
777 (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
778 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
779 'nonchar', 0xDFFFE,
780 (isASCII) ? 4 : 5,
781 qr/Unicode non-character.*is not recommended for open interchange/
782 ],
783 [ "non-character code point U+DFFFF",
784 (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
785 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
786 'nonchar', 0xDFFFF,
787 (isASCII) ? 4 : 5,
788 qr/Unicode non-character.*is not recommended for open interchange/
789 ],
790 [ "non-character code point U+EFFFE",
791 (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
792 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
793 'nonchar', 0xEFFFE,
794 (isASCII) ? 4 : 5,
795 qr/Unicode non-character.*is not recommended for open interchange/
796 ],
797 [ "non-character code point U+EFFFF",
798 (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
799 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
800 'nonchar', 0xEFFFF,
801 (isASCII) ? 4 : 5,
802 qr/Unicode non-character.*is not recommended for open interchange/
803 ],
804 [ "non-character code point U+FFFFE",
805 (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
806 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
807 'nonchar', 0xFFFFE,
808 (isASCII) ? 4 : 5,
809 qr/Unicode non-character.*is not recommended for open interchange/
810 ],
811 [ "non-character code point U+FFFFF",
812 (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
813 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
814 'nonchar', 0xFFFFF,
815 (isASCII) ? 4 : 5,
816 qr/Unicode non-character.*is not recommended for open interchange/
817 ],
818 [ "non-character code point U+10FFFE",
819 (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
820 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
821 'nonchar', 0x10FFFE,
822 (isASCII) ? 4 : 5,
823 qr/Unicode non-character.*is not recommended for open interchange/
eb83ed87 824 ],
7dfd8446
KW
825 [ "non-character code point U+10FFFF",
826 (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
827 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
828 'nonchar', 0x10FFFF,
829 (isASCII) ? 4 : 5,
ba707cdc 830 qr/Unicode non-character.*is not recommended for open interchange/
eb83ed87 831 ],
7dfd8446
KW
832);
833
834
835if (isASCII) {
836 push @tests,
1d1c12d9 837 [ "requires at least 32 bits",
7dfd8446 838 "\xfe\x82\x80\x80\x80\x80\x80",
eb83ed87
KW
839
840 # This code point is chosen so that it is representable in a UV on
2f8f112e 841 # 32-bit machines
1d1c12d9 842 $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
7dfd8446
KW
843 'utf8', 0x80000000, 7,
844 qr/Code point 0x80000000 is not Unicode, and not portable/
845 ],
1d1c12d9 846 [ "requires at least 32 bits, and use SUPER-type flags, instead of ABOVE_31_BIT",
7dfd8446
KW
847 "\xfe\x82\x80\x80\x80\x80\x80",
848 $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
849 'utf8', 0x80000000, 7,
ea5ced44 850 qr/Code point 0x80000000 is not Unicode, and not portable/
eb83ed87 851 ],
1d1c12d9
KW
852 [ "overflow with warnings/disallow for more than 31 bits",
853 # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT
854 # with overflow. The overflow malformation is never allowed, so
855 # preventing it takes precedence if the ABOVE_31_BIT options would
856 # otherwise allow in an overflowing value. These two code points (1
857 # for 32-bits; 1 for 64) were chosen because the old overflow
858 # detection algorithm did not catch them; this means this test also
859 # checks for that fix.
7dfd8446 860 ($is64bit)
2f8f112e
KW
861 ? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
862 : "\xfe\x86\x80\x80\x80\x80\x80",
ea5ced44 863
1d1c12d9
KW
864 # We include both warning categories to make sure the ABOVE_31_BIT one
865 # has precedence
866 "$UTF8_WARN_ABOVE_31_BIT|$UTF8_WARN_SUPER",
867 "$UTF8_DISALLOW_ABOVE_31_BIT",
7dfd8446
KW
868 'utf8', 0,
869 ($is64bit) ? 13 : 7,
ea5ced44 870 qr/overflow at byte .*, after start byte 0xf/
eb83ed87 871 ],
7dfd8446
KW
872 ;
873}
2f8f112e 874
7dfd8446 875if ($is64bit) { # All FF's will overflow on 32 bit
2f8f112e 876 push @tests,
1d1c12d9
KW
877 [ "More than 32 bits", "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80",
878 $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
7dfd8446 879 'utf8', $FF_ret, 13,
ea5ced44 880 qr/Code point 0x.* is not Unicode, and not portable/
2f8f112e
KW
881 ];
882}
883
884foreach my $test (@tests) {
eb83ed87
KW
885 my ($testname, $bytes, $warn_flags, $disallow_flags, $category, $allowed_uv, $expected_len, $message ) = @$test;
886
887 my $length = length $bytes;
2f8f112e 888 my $will_overflow = $testname =~ /overflow/;
eb83ed87
KW
889
890 # This is more complicated than the malformations tested earlier, as there
891 # are several orthogonal variables involved. We test all the subclasses
892 # of utf8 warnings to verify they work with and without the utf8 class,
893 # and don't have effects on other sublass warnings
54f4afef 894 foreach my $warning ('utf8', 'surrogate', 'nonchar', 'non_unicode') {
eb83ed87
KW
895 foreach my $warn_flag (0, $warn_flags) {
896 foreach my $disallow_flag (0, $disallow_flags) {
54f4afef 897 foreach my $do_warning (0, 1) {
eb83ed87 898
13d7a909
KW
899 my $eval_warn = $do_warning
900 ? "use warnings '$warning'"
901 : $warning eq "utf8"
7dfd8446
KW
902 ? "no warnings 'utf8'"
903 : "use warnings 'utf8'; no warnings '$warning'";
2f8f112e 904
13d7a909
KW
905 # is effectively disallowed if will overflow, even if the
906 # flag indicates it is allowed, fix up test name to
907 # indicate this as well
908 my $disallowed = $disallow_flag || $will_overflow;
2f8f112e 909
046d01eb 910 my $this_name = "utf8n_to_uvchr() $testname: " . (($disallow_flag)
13d7a909
KW
911 ? 'disallowed'
912 : ($disallowed)
1d1c12d9 913 ? 'ABOVE_31_BIT allowed'
13d7a909
KW
914 : 'allowed');
915 $this_name .= ", $eval_warn";
916 $this_name .= ", " . (($warn_flag)
917 ? 'with warning flag'
918 : 'no warning flag');
eb83ed87 919
13d7a909
KW
920 undef @warnings;
921 my $ret_ref;
7dfd8446
KW
922 my $display_bytes = display_bytes($bytes);
923 my $call = "Call was: $eval_warn; \$ret_ref = test_utf8n_to_uvchr('$display_bytes', $length, $warn_flag|$disallow_flag)";
924 my $eval_text = "$eval_warn; \$ret_ref = test_utf8n_to_uvchr('$bytes', $length, $warn_flag|$disallow_flag)";
13d7a909
KW
925 eval "$eval_text";
926 if (! ok ("$@ eq ''", "$this_name: eval succeeded")) {
7dfd8446 927 diag "\$!='$!'; eval'd=\"$call\"";
13d7a909 928 next;
54f4afef 929 }
13d7a909 930 if ($disallowed) {
7dfd8446
KW
931 unless (is($ret_ref->[0], 0, "$this_name: Returns 0"))
932 {
933 diag $call;
934 }
2f8f112e
KW
935 }
936 else {
7dfd8446
KW
937 unless (is($ret_ref->[0], $allowed_uv,
938 "$this_name: Returns expected uv"))
939 {
940 diag $call;
941 }
942 }
943 unless (is($ret_ref->[1], $expected_len,
944 "$this_name: Returns expected length"))
945 {
946 diag $call;
13d7a909 947 }
13d7a909
KW
948
949 if (! $do_warning
950 && ($warning eq 'utf8' || $warning eq $category))
951 {
952 if (!is(scalar @warnings, 0,
953 "$this_name: No warnings generated"))
954 {
7dfd8446
KW
955 diag $call;
956 diag "The warnings were: " . join(", ", @warnings);
2f8f112e
KW
957 }
958 }
13d7a909
KW
959 elsif ($will_overflow
960 && ! $disallow_flag
961 && $warning eq 'utf8')
962 {
963
964 # Will get the overflow message instead of the expected
965 # message under these circumstances, as they would
966 # otherwise accept an overflowed value, which the code
967 # should not allow, so falls back to overflow.
968 if (is(scalar @warnings, 1,
969 "$this_name: Got a single warning "))
970 {
7dfd8446
KW
971 unless (like($warnings[0], qr/overflow/,
972 "$this_name: Got overflow warning"))
973 {
974 diag $call;
975 }
13d7a909
KW
976 }
977 else {
7dfd8446 978 diag $call;
13d7a909 979 if (scalar @warnings) {
7dfd8446 980 diag "The warnings were: "
13d7a909
KW
981 . join(", ", @warnings);
982 }
983 }
eb83ed87 984 }
13d7a909
KW
985 elsif ($warn_flag
986 && ($warning eq 'utf8' || $warning eq $category))
987 {
988 if (is(scalar @warnings, 1,
989 "$this_name: Got a single warning "))
990 {
7dfd8446
KW
991 unless (like($warnings[0], $message,
992 "$this_name: Got expected warning"))
993 {
994 diag $call;
995 }
13d7a909
KW
996 }
997 else {
7dfd8446 998 diag $call;
13d7a909 999 if (scalar @warnings) {
7dfd8446 1000 diag "The warnings were: "
13d7a909
KW
1001 . join(", ", @warnings);
1002 }
eb83ed87
KW
1003 }
1004 }
eb83ed87 1005
13d7a909
KW
1006 # Check CHECK_ONLY results when the input is disallowed. Do
1007 # this when actually disallowed, not just when the
1008 # $disallow_flag is set
1009 if ($disallowed) {
1010 undef @warnings;
1011 $ret_ref = test_utf8n_to_uvchr($bytes, $length,
1012 $disallow_flag|$UTF8_CHECK_ONLY);
7dfd8446
KW
1013 unless (is($ret_ref->[0], 0, "$this_name, CHECK_ONLY: Returns 0")) {
1014 diag $call;
1015 }
1016 unless (is($ret_ref->[1], -1,
1017 "$this_name: CHECK_ONLY: returns expected length"))
1018 {
1019 diag $call;
1020 }
13d7a909
KW
1021 if (! is(scalar @warnings, 0,
1022 "$this_name, CHECK_ONLY: no warnings generated"))
1023 {
7dfd8446
KW
1024 diag $call;
1025 diag "The warnings were: " . join(", ", @warnings);
13d7a909 1026 }
eb83ed87 1027 }
046d01eb
KW
1028
1029 # Now repeat some of the above, but for
1030 # uvchr_to_utf8_flags(). Since this comes from an
1031 # existing code point, it hasn't overflowed.
1032 next if $will_overflow;
1033
1034 # The warning and disallow flags passed in are for
1035 # utf8n_to_uvchr(). Convert them for
1036 # uvchr_to_utf8_flags().
1037 my $uvchr_warn_flag = 0;
1038 my $uvchr_disallow_flag = 0;
1039 if ($warn_flag) {
1040 if ($warn_flag == $UTF8_WARN_SURROGATE) {
1041 $uvchr_warn_flag = $UNICODE_WARN_SURROGATE
1042 }
1043 elsif ($warn_flag == $UTF8_WARN_NONCHAR) {
1044 $uvchr_warn_flag = $UNICODE_WARN_NONCHAR
1045 }
1046 elsif ($warn_flag == $UTF8_WARN_SUPER) {
1047 $uvchr_warn_flag = $UNICODE_WARN_SUPER
1048 }
1d1c12d9 1049 elsif ($warn_flag == $UTF8_WARN_ABOVE_31_BIT) {
046d01eb
KW
1050 $uvchr_warn_flag = $UNICODE_WARN_ABOVE_31_BIT;
1051 }
1052 else {
1053 fail(sprintf "Unexpected warn flag: %x",
1054 $warn_flag);
1055 next;
1056 }
1057 }
1058 if ($disallow_flag) {
1059 if ($disallow_flag == $UTF8_DISALLOW_SURROGATE) {
1060 $uvchr_disallow_flag = $UNICODE_DISALLOW_SURROGATE
1061 }
1062 elsif ($disallow_flag == $UTF8_DISALLOW_NONCHAR) {
1063 $uvchr_disallow_flag = $UNICODE_DISALLOW_NONCHAR
1064 }
1065 elsif ($disallow_flag == $UTF8_DISALLOW_SUPER) {
1066 $uvchr_disallow_flag = $UNICODE_DISALLOW_SUPER
1067 }
1d1c12d9 1068 elsif ($disallow_flag == $UTF8_DISALLOW_ABOVE_31_BIT) {
046d01eb
KW
1069 $uvchr_disallow_flag =
1070 $UNICODE_DISALLOW_ABOVE_31_BIT;
1071 }
1072 else {
1073 fail(sprintf "Unexpected disallow flag: %x",
1074 $disallow_flag);
1075 next;
1076 }
1077 }
1078
1079 $disallowed = $uvchr_disallow_flag;
1080
1081 $this_name = "uvchr_to_utf8_flags() $testname: "
1082 . (($uvchr_disallow_flag)
1083 ? 'disallowed'
1084 : ($disallowed)
1085 ? 'ABOVE_31_BIT allowed'
1086 : 'allowed');
1087 $this_name .= ", $eval_warn";
1088 $this_name .= ", " . (($uvchr_warn_flag)
1089 ? 'with warning flag'
1090 : 'no warning flag');
1091
1092 undef @warnings;
1093 my $ret;
1094 my $warn_flag = sprintf "0x%x", $uvchr_warn_flag;
1095 my $disallow_flag = sprintf "0x%x", $uvchr_disallow_flag;
1096 $call = sprintf "call was: $eval_warn; \$ret = test_uvchr_to_utf8_flags(0x%x, $warn_flag|$disallow_flag)", $allowed_uv;
1097 $eval_text = "$eval_warn; \$ret = test_uvchr_to_utf8_flags($allowed_uv, $warn_flag|$disallow_flag)";
1098 eval "$eval_text";
1099 if (! ok ("$@ eq ''", "$this_name: eval succeeded")) {
1100 diag "\$!='$!'; eval'd=\"$eval_text\"";
1101 next;
1102 }
1103 if ($disallowed) {
1104 unless (is($ret, undef, "$this_name: Returns undef")) {
1105 diag $call;
1106 }
1107 }
1108 else {
1109 unless (is($ret, $bytes, "$this_name: Returns expected string")) {
1110 diag $call;
1111 }
1112 }
1113 if (! $do_warning
1114 && ($warning eq 'utf8' || $warning eq $category))
1115 {
1116 if (!is(scalar @warnings, 0,
1117 "$this_name: No warnings generated"))
1118 {
1119 diag $call;
1120 diag "The warnings were: " . join(", ", @warnings);
1121 }
1122 }
1123 elsif ($uvchr_warn_flag
1124 && ($warning eq 'utf8' || $warning eq $category))
1125 {
1126 if (is(scalar @warnings, 1,
1127 "$this_name: Got a single warning "))
1128 {
046d01eb
KW
1129 unless (like($warnings[0], $message,
1130 "$this_name: Got expected warning"))
1131 {
1132 diag $call;
1133 }
046d01eb
KW
1134 }
1135 else {
1136 diag $call;
1137 if (scalar @warnings) {
1138 diag "The warnings were: "
1139 . join(", ", @warnings);
1140 }
1141 }
1142 }
eb83ed87
KW
1143 }
1144 }
1145 }
1146 }
1147}
6e3d6c02 1148
fed3ba5d 1149done_testing;