This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
lib/warnings/utf8: Reinstate warning test
[perl5.git] / ext / XS-APItest / t / utf8.t
CommitLineData
fed3ba5d
NC
1#!perl -w
2
3use strict;
4use Test::More;
7dfd8446 5$|=1;
fed3ba5d 6
760c7c2f
KW
7no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit
8 # machines, and that is tested elsewhere
9
fed3ba5d
NC
10use XS::APItest;
11
4deba822
KW
12my $pound_sign = chr utf8::unicode_to_native(163);
13
7dfd8446
KW
14sub isASCII { ord "A" == 65 }
15
16sub display_bytes {
17 my $string = shift;
18 return '"'
19 . join("", map { sprintf("\\x%02x", ord $_) } split "", $string)
20 . '"';
21}
22
23# This test file can't use byte_utf8a_to_utf8n() from t/charset_tools.pl
24# because that uses the same functions we are testing here. So UTF-EBCDIC
25# strings are hard-coded as I8 strings in this file instead, and we use array
26# lookup to translate into the appropriate code page.
27
28my @i8_to_native = ( # Only code page 1047 so far.
29# _0 _1 _2 _3 _4 _5 _6 _7 _8 _9 _A _B _C _D _E _F
300x00,0x01,0x02,0x03,0x37,0x2D,0x2E,0x2F,0x16,0x05,0x15,0x0B,0x0C,0x0D,0x0E,0x0F,
310x10,0x11,0x12,0x13,0x3C,0x3D,0x32,0x26,0x18,0x19,0x3F,0x27,0x1C,0x1D,0x1E,0x1F,
320x40,0x5A,0x7F,0x7B,0x5B,0x6C,0x50,0x7D,0x4D,0x5D,0x5C,0x4E,0x6B,0x60,0x4B,0x61,
330xF0,0xF1,0xF2,0xF3,0xF4,0xF5,0xF6,0xF7,0xF8,0xF9,0x7A,0x5E,0x4C,0x7E,0x6E,0x6F,
340x7C,0xC1,0xC2,0xC3,0xC4,0xC5,0xC6,0xC7,0xC8,0xC9,0xD1,0xD2,0xD3,0xD4,0xD5,0xD6,
350xD7,0xD8,0xD9,0xE2,0xE3,0xE4,0xE5,0xE6,0xE7,0xE8,0xE9,0xAD,0xE0,0xBD,0x5F,0x6D,
360x79,0x81,0x82,0x83,0x84,0x85,0x86,0x87,0x88,0x89,0x91,0x92,0x93,0x94,0x95,0x96,
370x97,0x98,0x99,0xA2,0xA3,0xA4,0xA5,0xA6,0xA7,0xA8,0xA9,0xC0,0x4F,0xD0,0xA1,0x07,
380x20,0x21,0x22,0x23,0x24,0x25,0x06,0x17,0x28,0x29,0x2A,0x2B,0x2C,0x09,0x0A,0x1B,
390x30,0x31,0x1A,0x33,0x34,0x35,0x36,0x08,0x38,0x39,0x3A,0x3B,0x04,0x14,0x3E,0xFF,
400x41,0x42,0x43,0x44,0x45,0x46,0x47,0x48,0x49,0x4A,0x51,0x52,0x53,0x54,0x55,0x56,
410x57,0x58,0x59,0x62,0x63,0x64,0x65,0x66,0x67,0x68,0x69,0x6A,0x70,0x71,0x72,0x73,
420x74,0x75,0x76,0x77,0x78,0x80,0x8A,0x8B,0x8C,0x8D,0x8E,0x8F,0x90,0x9A,0x9B,0x9C,
430x9D,0x9E,0x9F,0xA0,0xAA,0xAB,0xAC,0xAE,0xAF,0xB0,0xB1,0xB2,0xB3,0xB4,0xB5,0xB6,
440xB7,0xB8,0xB9,0xBA,0xBB,0xBC,0xBE,0xBF,0xCA,0xCB,0xCC,0xCD,0xCE,0xCF,0xDA,0xDB,
450xDC,0xDD,0xDE,0xDF,0xE1,0xEA,0xEB,0xEC,0xED,0xEE,0xEF,0xFA,0xFB,0xFC,0xFD,0xFE,
46);
47
48*I8_to_native = (isASCII)
49 ? sub { return shift }
50 : sub { return join "", map { chr $i8_to_native[ord $_] }
51 split "", shift };
52
53my $is64bit = length sprintf("%x", ~0) > 8;
54
55
56# Test utf8n_to_uvchr(). These provide essentially complete code coverage.
57# Copied from utf8.h
58my $UTF8_ALLOW_EMPTY = 0x0001;
59my $UTF8_ALLOW_CONTINUATION = 0x0002;
60my $UTF8_ALLOW_NON_CONTINUATION = 0x0004;
61my $UTF8_ALLOW_SHORT = 0x0008;
62my $UTF8_ALLOW_LONG = 0x0010;
63my $UTF8_DISALLOW_SURROGATE = 0x0020;
64my $UTF8_WARN_SURROGATE = 0x0040;
65my $UTF8_DISALLOW_NONCHAR = 0x0080;
66my $UTF8_WARN_NONCHAR = 0x0100;
67my $UTF8_DISALLOW_SUPER = 0x0200;
68my $UTF8_WARN_SUPER = 0x0400;
1d1c12d9
KW
69my $UTF8_DISALLOW_ABOVE_31_BIT = 0x0800;
70my $UTF8_WARN_ABOVE_31_BIT = 0x1000;
7dfd8446
KW
71my $UTF8_CHECK_ONLY = 0x2000;
72
046d01eb
KW
73# Test uvchr_to_utf8().
74my $UNICODE_WARN_SURROGATE = 0x0001;
75my $UNICODE_WARN_NONCHAR = 0x0002;
76my $UNICODE_WARN_SUPER = 0x0004;
77my $UNICODE_WARN_ABOVE_31_BIT = 0x0008;
78my $UNICODE_DISALLOW_SURROGATE = 0x0010;
79my $UNICODE_DISALLOW_NONCHAR = 0x0020;
80my $UNICODE_DISALLOW_SUPER = 0x0040;
81my $UNICODE_DISALLOW_ABOVE_31_BIT = 0x0080;
82
83my $look_for_everything_utf8n_to
84 = $UTF8_DISALLOW_SURROGATE
7dfd8446
KW
85 | $UTF8_WARN_SURROGATE
86 | $UTF8_DISALLOW_NONCHAR
87 | $UTF8_WARN_NONCHAR
88 | $UTF8_DISALLOW_SUPER
89 | $UTF8_WARN_SUPER
1d1c12d9
KW
90 | $UTF8_DISALLOW_ABOVE_31_BIT
91 | $UTF8_WARN_ABOVE_31_BIT;
046d01eb
KW
92my $look_for_everything_uvchr_to
93 = $UNICODE_DISALLOW_SURROGATE
94 | $UNICODE_WARN_SURROGATE
95 | $UNICODE_DISALLOW_NONCHAR
96 | $UNICODE_WARN_NONCHAR
97 | $UNICODE_DISALLOW_SUPER
98 | $UNICODE_WARN_SUPER
99 | $UNICODE_DISALLOW_ABOVE_31_BIT
100 | $UNICODE_WARN_ABOVE_31_BIT;
7dfd8446 101
fed3ba5d
NC
102foreach ([0, '', '', 'empty'],
103 [0, 'N', 'N', '1 char'],
104 [1, 'NN', 'N', '1 char substring'],
105 [-2, 'Perl', 'Rules', 'different'],
4deba822
KW
106 [0, $pound_sign, $pound_sign, 'pound sign'],
107 [1, $pound_sign . 10, $pound_sign . 1, '10 pounds is more than 1 pound'],
108 [1, $pound_sign . $pound_sign, $pound_sign, '2 pound signs are more than 1'],
fed3ba5d
NC
109 [-2, ' $!', " \x{1F42B}!", 'Camels are worth more than 1 dollar'],
110 [-1, '!', "!\x{1F42A}", 'Initial substrings match'],
111 ) {
112 my ($expect, $left, $right, $desc) = @$_;
113 my $copy = $right;
114 utf8::encode($copy);
115 is(bytes_cmp_utf8($left, $copy), $expect, $desc);
116 next if $right =~ tr/\0-\377//c;
117 utf8::encode($left);
118 is(bytes_cmp_utf8($right, $left), -$expect, "$desc reversed");
119}
120
7dfd8446
KW
121# The keys to this hash are Unicode code points, their values are the native
122# UTF-8 representations of them. The code points are chosen because they are
123# "interesting" on either or both ASCII and EBCDIC platforms. First we add
124# boundaries where the number of bytes required to represent them increase, or
125# are adjacent to problematic code points, so we want to make sure they aren't
126# considered problematic.
127my %code_points = (
128 0x0100 => (isASCII) ? "\xc4\x80" : I8_to_native("\xc8\xa0"),
129 0x0400 - 1 => (isASCII) ? "\xcf\xbf" : I8_to_native("\xdf\xbf"),
130 0x0400 => (isASCII) ? "\xd0\x80" : I8_to_native("\xe1\xa0\xa0"),
131 0x0800 - 1 => (isASCII) ? "\xdf\xbf" : I8_to_native("\xe1\xbf\xbf"),
132 0x0800 => (isASCII) ? "\xe0\xa0\x80" : I8_to_native("\xe2\xa0\xa0"),
133 0x4000 - 1 => (isASCII) ? "\xe3\xbf\xbf" : I8_to_native("\xef\xbf\xbf"),
134 0x4000 => (isASCII) ? "\xe4\x80\x80" : I8_to_native("\xf0\xb0\xa0\xa0"),
135 0x8000 - 1 => (isASCII) ? "\xe7\xbf\xbf" : I8_to_native("\xf0\xbf\xbf\xbf"),
136
137 # First code point that the implementation of isUTF8_POSSIBLY_PROBLEMATIC,
138 # as of this writing, considers potentially problematic on EBCDIC
139 0x8000 => (isASCII) ? "\xe8\x80\x80" : I8_to_native("\xf1\xa0\xa0\xa0"),
140
141 0xD000 - 1 => (isASCII) ? "\xec\xbf\xbf" : I8_to_native("\xf1\xb3\xbf\xbf"),
142
143 # First code point that the implementation of isUTF8_POSSIBLY_PROBLEMATIC,
144 # as of this writing, considers potentially problematic on ASCII
145 0xD000 => (isASCII) ? "\xed\x80\x80" : I8_to_native("\xf1\xb4\xa0\xa0"),
146
147 # Bracket the surrogates
148 0xD7FF => (isASCII) ? "\xed\x9f\xbf" : I8_to_native("\xf1\xb5\xbf\xbf"),
149 0xE000 => (isASCII) ? "\xee\x80\x80" : I8_to_native("\xf1\xb8\xa0\xa0"),
150
151 # Bracket the 32 contiguous non characters
152 0xFDCF => (isASCII) ? "\xef\xb7\x8f" : I8_to_native("\xf1\xbf\xae\xaf"),
153 0xFDF0 => (isASCII) ? "\xef\xb7\xb0" : I8_to_native("\xf1\xbf\xaf\xb0"),
154
155 # Mostly bracket non-characters, but some are transitions to longer
156 # strings
157 0xFFFD => (isASCII) ? "\xef\xbf\xbd" : I8_to_native("\xf1\xbf\xbf\xbd"),
158 0x10000 - 1 => (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
159 0x10000 => (isASCII) ? "\xf0\x90\x80\x80" : I8_to_native("\xf2\xa0\xa0\xa0"),
160 0x1FFFD => (isASCII) ? "\xf0\x9f\xbf\xbd" : I8_to_native("\xf3\xbf\xbf\xbd"),
161 0x20000 => (isASCII) ? "\xf0\xa0\x80\x80" : I8_to_native("\xf4\xa0\xa0\xa0"),
162 0x2FFFD => (isASCII) ? "\xf0\xaf\xbf\xbd" : I8_to_native("\xf5\xbf\xbf\xbd"),
163 0x30000 => (isASCII) ? "\xf0\xb0\x80\x80" : I8_to_native("\xf6\xa0\xa0\xa0"),
164 0x3FFFD => (isASCII) ? "\xf0\xbf\xbf\xbd" : I8_to_native("\xf7\xbf\xbf\xbd"),
165 0x40000 - 1 => (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
166 0x40000 => (isASCII) ? "\xf1\x80\x80\x80" : I8_to_native("\xf8\xa8\xa0\xa0\xa0"),
167 0x4FFFD => (isASCII) ? "\xf1\x8f\xbf\xbd" : I8_to_native("\xf8\xa9\xbf\xbf\xbd"),
168 0x50000 => (isASCII) ? "\xf1\x90\x80\x80" : I8_to_native("\xf8\xaa\xa0\xa0\xa0"),
169 0x5FFFD => (isASCII) ? "\xf1\x9f\xbf\xbd" : I8_to_native("\xf8\xab\xbf\xbf\xbd"),
170 0x60000 => (isASCII) ? "\xf1\xa0\x80\x80" : I8_to_native("\xf8\xac\xa0\xa0\xa0"),
171 0x6FFFD => (isASCII) ? "\xf1\xaf\xbf\xbd" : I8_to_native("\xf8\xad\xbf\xbf\xbd"),
172 0x70000 => (isASCII) ? "\xf1\xb0\x80\x80" : I8_to_native("\xf8\xae\xa0\xa0\xa0"),
173 0x7FFFD => (isASCII) ? "\xf1\xbf\xbf\xbd" : I8_to_native("\xf8\xaf\xbf\xbf\xbd"),
174 0x80000 => (isASCII) ? "\xf2\x80\x80\x80" : I8_to_native("\xf8\xb0\xa0\xa0\xa0"),
175 0x8FFFD => (isASCII) ? "\xf2\x8f\xbf\xbd" : I8_to_native("\xf8\xb1\xbf\xbf\xbd"),
176 0x90000 => (isASCII) ? "\xf2\x90\x80\x80" : I8_to_native("\xf8\xb2\xa0\xa0\xa0"),
177 0x9FFFD => (isASCII) ? "\xf2\x9f\xbf\xbd" : I8_to_native("\xf8\xb3\xbf\xbf\xbd"),
178 0xA0000 => (isASCII) ? "\xf2\xa0\x80\x80" : I8_to_native("\xf8\xb4\xa0\xa0\xa0"),
179 0xAFFFD => (isASCII) ? "\xf2\xaf\xbf\xbd" : I8_to_native("\xf8\xb5\xbf\xbf\xbd"),
180 0xB0000 => (isASCII) ? "\xf2\xb0\x80\x80" : I8_to_native("\xf8\xb6\xa0\xa0\xa0"),
181 0xBFFFD => (isASCII) ? "\xf2\xbf\xbf\xbd" : I8_to_native("\xf8\xb7\xbf\xbf\xbd"),
182 0xC0000 => (isASCII) ? "\xf3\x80\x80\x80" : I8_to_native("\xf8\xb8\xa0\xa0\xa0"),
183 0xCFFFD => (isASCII) ? "\xf3\x8f\xbf\xbd" : I8_to_native("\xf8\xb9\xbf\xbf\xbd"),
184 0xD0000 => (isASCII) ? "\xf3\x90\x80\x80" : I8_to_native("\xf8\xba\xa0\xa0\xa0"),
185 0xDFFFD => (isASCII) ? "\xf3\x9f\xbf\xbd" : I8_to_native("\xf8\xbb\xbf\xbf\xbd"),
186 0xE0000 => (isASCII) ? "\xf3\xa0\x80\x80" : I8_to_native("\xf8\xbc\xa0\xa0\xa0"),
187 0xEFFFD => (isASCII) ? "\xf3\xaf\xbf\xbd" : I8_to_native("\xf8\xbd\xbf\xbf\xbd"),
188 0xF0000 => (isASCII) ? "\xf3\xb0\x80\x80" : I8_to_native("\xf8\xbe\xa0\xa0\xa0"),
189 0xFFFFD => (isASCII) ? "\xf3\xbf\xbf\xbd" : I8_to_native("\xf8\xbf\xbf\xbf\xbd"),
190 0x100000 => (isASCII) ? "\xf4\x80\x80\x80" : I8_to_native("\xf9\xa0\xa0\xa0\xa0"),
191 0x10FFFD => (isASCII) ? "\xf4\x8f\xbf\xbd" : I8_to_native("\xf9\xa1\xbf\xbf\xbd"),
192 0x110000 => (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
193
194 # Things that would be noncharacters if they were in Unicode, and might be
195 # mistaken, if the C code is bad, to be nonchars
196 0x11FFFE => (isASCII) ? "\xf4\x9f\xbf\xbe" : I8_to_native("\xf9\xa3\xbf\xbf\xbe"),
197 0x11FFFF => (isASCII) ? "\xf4\x9f\xbf\xbf" : I8_to_native("\xf9\xa3\xbf\xbf\xbf"),
198 0x20FFFE => (isASCII) ? "\xf8\x88\x8f\xbf\xbe" : I8_to_native("\xfa\xa1\xbf\xbf\xbe"),
199 0x20FFFF => (isASCII) ? "\xf8\x88\x8f\xbf\xbf" : I8_to_native("\xfa\xa1\xbf\xbf\xbf"),
200
201 0x200000 - 1 => (isASCII) ? "\xf7\xbf\xbf\xbf" : I8_to_native("\xf9\xbf\xbf\xbf\xbf"),
202 0x200000 => (isASCII) ? "\xf8\x88\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"),
203 0x400000 - 1 => (isASCII) ? "\xf8\x8f\xbf\xbf\xbf" : I8_to_native("\xfb\xbf\xbf\xbf\xbf"),
204 0x400000 => (isASCII) ? "\xf8\x90\x80\x80\x80" : I8_to_native("\xfc\xa4\xa0\xa0\xa0\xa0"),
205 0x4000000 - 1 => (isASCII) ? "\xfb\xbf\xbf\xbf\xbf" : I8_to_native("\xfd\xbf\xbf\xbf\xbf\xbf"),
206 0x4000000 => (isASCII) ? "\xfc\x84\x80\x80\x80\x80" : I8_to_native("\xfe\xa2\xa0\xa0\xa0\xa0\xa0"),
207 0x4000000 - 1 => (isASCII) ? "\xfb\xbf\xbf\xbf\xbf" : I8_to_native("\xfd\xbf\xbf\xbf\xbf\xbf"),
208 0x4000000 => (isASCII) ? "\xfc\x84\x80\x80\x80\x80" : I8_to_native("\xfe\xa2\xa0\xa0\xa0\xa0\xa0"),
209 0x40000000 - 1 => (isASCII) ? "\xfc\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xfe\xbf\xbf\xbf\xbf\xbf\xbf"),
210 0x40000000 => (isASCII) ? "\xfd\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0"),
211 0x80000000 - 1 => (isASCII) ? "\xfd\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf"),
c0236afe
KW
212 0x80000000 => (isASCII) ? "\xfe\x82\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
213 0xFFFFFFFF => (isASCII) ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"),
7dfd8446 214);
4deba822 215
7dfd8446
KW
216if ($is64bit) {
217 no warnings qw(overflow portable);
218 $code_points{0x100000000} = (isASCII) ? "\xfe\x84\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0");
219 $code_points{0x1000000000 - 1} = (isASCII) ? "\xfe\xbf\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa1\xbf\xbf\xbf\xbf\xbf\xbf\xbf");
220 $code_points{0x1000000000} = (isASCII) ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80" : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0");
221 $code_points{0xFFFFFFFFFFFFFFFF} = (isASCII) ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf" : I8_to_native("\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf");
222}
eb83ed87 223
7dfd8446
KW
224# Now add in entries for each of code points 0-255, which require special
225# handling on EBCDIC. Remember the keys are Unicode values, and the values
226# are the native UTF-8. For invariants, the bytes are just the native chr.
227
228my $cp = 0;
229while ($cp < ((isASCII) ? 128 : 160)) { # This is from the definition of
230 # invariant
231 $code_points{$cp} = chr utf8::unicode_to_native($cp);
232 $cp++;
233}
234
235# Done with the invariants. Now do the variants. All in this range are 2
236# byte. Again, we can't use the internal functions to generate UTF-8, as
237# those are what we are trying to test. In the loop, we know what range the
238# continuation bytes can be in, and what the lowest start byte can be. So we
239# cycle through them.
240
241my $first_continuation = (isASCII) ? 0x80 : 0xA0;
242my $final_continuation = 0xBF;
243my $start = (isASCII) ? 0xC2 : 0xC5;
244
245my $continuation = $first_continuation - 1;
246
247while ($cp < 255) {
248 if (++$continuation > $final_continuation) {
249
250 # Wrap to the next start byte when we reach the final continuation
251 # byte possible
252 $continuation = $first_continuation;
253 $start++;
254 }
255 $code_points{$cp} = I8_to_native(chr($start) . chr($continuation));
256
257 $cp++;
258}
eb83ed87
KW
259
260my @warnings;
261
262use warnings 'utf8';
263local $SIG{__WARN__} = sub { push @warnings, @_ };
264
7dfd8446
KW
265# This set of tests looks for basic sanity, and lastly tests the bottom level
266# decode routine for the given code point. If the earlier tests for that code
267# point fail, that one probably will too. Malformations are tested in later
268# segments of code.
269for my $u (sort { utf8::unicode_to_native($a) <=> utf8::unicode_to_native($b) }
270 keys %code_points)
271{
272 my $hex_u = sprintf("0x%02X", $u);
273 my $n = utf8::unicode_to_native($u);
274 my $hex_n = sprintf("0x%02X", $n);
275 my $bytes = $code_points{$u};
276
277 my $offskip_should_be;
278 {
279 no warnings qw(overflow portable);
280 $offskip_should_be = (isASCII)
281 ? ( $u < 0x80 ? 1 :
282 $u < 0x800 ? 2 :
283 $u < 0x10000 ? 3 :
284 $u < 0x200000 ? 4 :
285 $u < 0x4000000 ? 5 :
286 $u < 0x80000000 ? 6 : (($is64bit)
287 ? ($u < 0x1000000000 ? 7 : 13)
288 : 7)
289 )
290 : ($u < 0xA0 ? 1 :
291 $u < 0x400 ? 2 :
292 $u < 0x4000 ? 3 :
293 $u < 0x40000 ? 4 :
294 $u < 0x400000 ? 5 :
c0236afe
KW
295 $u < 0x4000000 ? 6 :
296 $u < 0x40000000 ? 7 : 14 );
7dfd8446
KW
297 }
298
299 # If this test fails, subsequent ones are meaningless.
300 next unless is(test_OFFUNISKIP($u), $offskip_should_be,
301 "Verify OFFUNISKIP($hex_u) is $offskip_should_be");
302 my $invariant = $offskip_should_be == 1;
303 my $display_invariant = $invariant || 0;
304 is(test_OFFUNI_IS_INVARIANT($u), $invariant,
305 "Verify OFFUNI_IS_INVARIANT($hex_u) is $display_invariant");
306
307 my $uvchr_skip_should_be = $offskip_should_be;
308 next unless is(test_UVCHR_SKIP($n), $uvchr_skip_should_be,
309 "Verify UVCHR_SKIP($hex_n) is $uvchr_skip_should_be");
310 is(test_UVCHR_IS_INVARIANT($n), $offskip_should_be == 1,
311 "Verify UVCHR_IS_INVARIANT($hex_n) is $display_invariant");
312
313 my $n_chr = chr $n;
314 utf8::upgrade $n_chr;
315
316 is(test_UTF8_SKIP($n_chr), $uvchr_skip_should_be,
317 "Verify UTF8_SKIP(chr $hex_n) is $uvchr_skip_should_be");
318
319 use bytes;
320 for (my $j = 0; $j < length $n_chr; $j++) {
321 my $b = substr($n_chr, $j, 1);
322 my $hex_b = sprintf("\"\\x%02x\"", ord $b);
323
324 my $byte_invariant = $j == 0 && $uvchr_skip_should_be == 1;
325 my $display_byte_invariant = $byte_invariant || 0;
326 next unless is(test_UTF8_IS_INVARIANT($b), $byte_invariant,
327 " Verify UTF8_IS_INVARIANT($hex_b) for byte $j "
328 . "is $display_byte_invariant");
329
330 my $is_start = $j == 0 && $uvchr_skip_should_be > 1;
331 my $display_is_start = $is_start || 0;
332 next unless is(test_UTF8_IS_START($b), $is_start,
333 " Verify UTF8_IS_START($hex_b) is $display_is_start");
334
335 my $is_continuation = $j != 0 && $uvchr_skip_should_be > 1;
336 my $display_is_continuation = $is_continuation || 0;
337 next unless is(test_UTF8_IS_CONTINUATION($b), $is_continuation,
338 " Verify UTF8_IS_CONTINUATION($hex_b) is "
339 . "$display_is_continuation");
340
341 my $is_continued = $uvchr_skip_should_be > 1;
342 my $display_is_continued = $is_continued || 0;
343 next unless is(test_UTF8_IS_CONTINUED($b), $is_continued,
344 " Verify UTF8_IS_CONTINUED($hex_b) is "
345 . "$display_is_continued");
346
347 my $is_downgradeable_start = $n < 256
348 && $uvchr_skip_should_be > 1
349 && $j == 0;
350 my $display_is_downgradeable_start = $is_downgradeable_start || 0;
351 next unless is(test_UTF8_IS_DOWNGRADEABLE_START($b),
352 $is_downgradeable_start,
353 " Verify UTF8_IS_DOWNGRADEABLE_START($hex_b) is "
354 . "$display_is_downgradeable_start");
355
356 my $is_above_latin1 = $n > 255 && $j == 0;
357 my $display_is_above_latin1 = $is_above_latin1 || 0;
358 next unless is(test_UTF8_IS_ABOVE_LATIN1($b),
359 $is_above_latin1,
360 " Verify UTF8_IS_ABOVE_LATIN1($hex_b) is "
361 . "$display_is_above_latin1");
362
363 my $is_possibly_problematic = $j == 0
364 && $n >= ((isASCII)
365 ? 0xD000
366 : 0x8000);
367 my $display_is_possibly_problematic = $is_possibly_problematic || 0;
368 next unless is(test_isUTF8_POSSIBLY_PROBLEMATIC($b),
369 $is_possibly_problematic,
370 " Verify isUTF8_POSSIBLY_PROBLEMATIC($hex_b) is "
371 . "$display_is_above_latin1");
372 }
373
374 # We are not trying to look for warnings, etc, so if they should occur, it
375 # is an error. But some of the code points here do cause warnings, so we
376 # check here and turn off the ones that apply to such code points. A
377 # later section of the code tests for these kinds of things.
046d01eb 378 my $this_utf8_flags = $look_for_everything_utf8n_to;
7dfd8446
KW
379 my $len = length $bytes;
380 if ($n > 2 ** 31 - 1) {
1d1c12d9
KW
381 $this_utf8_flags &=
382 ~($UTF8_DISALLOW_ABOVE_31_BIT|$UTF8_WARN_ABOVE_31_BIT);
7dfd8446
KW
383 }
384 if ($n > 0x10FFFF) {
385 $this_utf8_flags &= ~($UTF8_DISALLOW_SUPER|$UTF8_WARN_SUPER);
386 }
387 elsif (($n & 0xFFFE) == 0xFFFE) {
388 $this_utf8_flags &= ~($UTF8_DISALLOW_NONCHAR|$UTF8_WARN_NONCHAR);
389 }
390
391 undef @warnings;
392
393 my $display_flags = sprintf "0x%x", $this_utf8_flags;
394 my $ret_ref = test_utf8n_to_uvchr($bytes, $len, $this_utf8_flags);
395 my $display_bytes = display_bytes($bytes);
396 is($ret_ref->[0], $n, "Verify utf8n_to_uvchr($display_bytes, $display_flags) returns $hex_n");
397 is($ret_ref->[1], $len, "Verify utf8n_to_uvchr() for $hex_n returns expected length");
398
399 unless (is(scalar @warnings, 0,
400 "Verify utf8n_to_uvchr() for $hex_n generated no warnings"))
401 {
402 diag "The warnings were: " . join(", ", @warnings);
403 }
046d01eb 404
75ffa578
KW
405 undef @warnings;
406
407 $ret_ref = test_valid_utf8_to_uvchr($bytes);
408 is($ret_ref->[0], $n, "Verify valid_utf8_to_uvchr($display_bytes) returns $hex_n");
409 is($ret_ref->[1], $len, "Verify valid_utf8_to_uvchr() for $hex_n returns expected length");
410
411 unless (is(scalar @warnings, 0,
412 "Verify valid_utf8_to_uvchr() for $hex_n generated no warnings"))
413 {
414 diag "The warnings were: " . join(", ", @warnings);
415 }
416
046d01eb
KW
417 # Similarly for uvchr_to_utf8
418 my $this_uvchr_flags = $look_for_everything_uvchr_to;
419 if ($n > 2 ** 31 - 1) {
420 $this_uvchr_flags &=
421 ~($UNICODE_DISALLOW_ABOVE_31_BIT|$UNICODE_WARN_ABOVE_31_BIT);
422 }
423 if ($n > 0x10FFFF) {
424 $this_uvchr_flags &= ~($UNICODE_DISALLOW_SUPER|$UNICODE_WARN_SUPER);
425 }
426 elsif (($n & 0xFFFE) == 0xFFFE) {
427 $this_uvchr_flags &= ~($UNICODE_DISALLOW_NONCHAR|$UNICODE_WARN_NONCHAR);
428 }
429 $display_flags = sprintf "0x%x", $this_uvchr_flags;
430
431 undef @warnings;
432
433 my $ret = test_uvchr_to_utf8_flags($n, $this_uvchr_flags);
434 ok(defined $ret, "Verify uvchr_to_utf8_flags($hex_n, $display_flags) returned success");
435 is($ret, $bytes, "Verify uvchr_to_utf8_flags($hex_n, $display_flags) returns correct bytes");
436
437 unless (is(scalar @warnings, 0,
438 "Verify uvchr_to_utf8_flags($hex_n, $display_flags) for $hex_n generated no warnings"))
439 {
440 diag "The warnings were: " . join(", ", @warnings);
441 }
7dfd8446
KW
442}
443
444my $REPLACEMENT = 0xFFFD;
445
446# Now test the malformations. All these raise category utf8 warnings.
447my $c = (isASCII) ? "\x80" : "\xa0"; # A continuation byte
448my @malformations = (
eb83ed87
KW
449 [ "zero length string malformation", "", 0,
450 $UTF8_ALLOW_EMPTY, 0, 0,
451 qr/empty string/
452 ],
7dfd8446
KW
453 [ "orphan continuation byte malformation", I8_to_native("${c}a"),
454 2,
eb83ed87
KW
455 $UTF8_ALLOW_CONTINUATION, $REPLACEMENT, 1,
456 qr/unexpected continuation byte/
457 ],
7dfd8446
KW
458 [ "premature next character malformation (immediate)",
459 (isASCII) ? "\xc2a" : I8_to_native("\xc5") ."a",
460 2,
eb83ed87
KW
461 $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 1,
462 qr/unexpected non-continuation byte.*immediately after start byte/
463 ],
7dfd8446
KW
464 [ "premature next character malformation (non-immediate)",
465 I8_to_native("\xf0${c}a"),
466 3,
eb83ed87
KW
467 $UTF8_ALLOW_NON_CONTINUATION, $REPLACEMENT, 2,
468 qr/unexpected non-continuation byte .* 2 bytes after start byte/
469 ],
7dfd8446 470 [ "too short malformation", I8_to_native("\xf0${c}a"), 2,
eb83ed87
KW
471 # Having the 'a' after this, but saying there are only 2 bytes also
472 # tests that we pay attention to the passed in length
473 $UTF8_ALLOW_SHORT, $REPLACEMENT, 2,
474 qr/2 bytes, need 4/
475 ],
7dfd8446
KW
476 [ "overlong malformation", I8_to_native("\xc0$c"), 2,
477 $UTF8_ALLOW_LONG,
478 0, # NUL
479 2,
eb83ed87 480 qr/2 bytes, need 1/
c0236afe 481 ],
7dfd8446
KW
482 [ "overflow malformation",
483 # These are the smallest overflowing on 64 byte machines:
484 # 2**64
c0236afe
KW
485 (isASCII) ? "\xff\x80\x90\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"
486 : I8_to_native("\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
487 (isASCII) ? 13 : 14,
eb83ed87 488 0, # There is no way to allow this malformation
7dfd8446 489 $REPLACEMENT,
c0236afe 490 (isASCII) ? 13 : 14,
eb83ed87 491 qr/overflow/
c0236afe
KW
492 ],
493);
7dfd8446
KW
494
495foreach my $test (@malformations) {
eb83ed87
KW
496 my ($testname, $bytes, $length, $allow_flags, $allowed_uv, $expected_len, $message ) = @$test;
497
498 next if ! ok(length($bytes) >= $length, "$testname: Make sure won't read beyond buffer: " . length($bytes) . " >= $length");
499
500 # Test what happens when this malformation is not allowed
501 undef @warnings;
bd70aaaf 502 my $ret_ref = test_utf8n_to_uvchr($bytes, $length, 0);
eb83ed87
KW
503 is($ret_ref->[0], 0, "$testname: disallowed: Returns 0");
504 is($ret_ref->[1], $expected_len, "$testname: disallowed: Returns expected length");
505 if (is(scalar @warnings, 1, "$testname: disallowed: Got a single warning ")) {
506 like($warnings[0], $message, "$testname: disallowed: Got expected warning");
507 }
508 else {
509 if (scalar @warnings) {
7dfd8446 510 diag "The warnings were: " . join(", ", @warnings);
eb83ed87
KW
511 }
512 }
513
514 { # Next test when disallowed, and warnings are off.
515 undef @warnings;
516 no warnings 'utf8';
bd70aaaf 517 my $ret_ref = test_utf8n_to_uvchr($bytes, $length, 0);
eb83ed87
KW
518 is($ret_ref->[0], 0, "$testname: disallowed: no warnings 'utf8': Returns 0");
519 is($ret_ref->[1], $expected_len, "$testname: disallowed: no warnings 'utf8': Returns expected length");
520 if (!is(scalar @warnings, 0, "$testname: disallowed: no warnings 'utf8': no warnings generated")) {
7dfd8446 521 diag "The warnings were: " . join(", ", @warnings);
eb83ed87
KW
522 }
523 }
524
525 # Test with CHECK_ONLY
526 undef @warnings;
bd70aaaf 527 $ret_ref = test_utf8n_to_uvchr($bytes, $length, $UTF8_CHECK_ONLY);
eb83ed87
KW
528 is($ret_ref->[0], 0, "$testname: CHECK_ONLY: Returns 0");
529 is($ret_ref->[1], -1, "$testname: CHECK_ONLY: returns expected length");
530 if (! is(scalar @warnings, 0, "$testname: CHECK_ONLY: no warnings generated")) {
7dfd8446 531 diag "The warnings were: " . join(", ", @warnings);
eb83ed87
KW
532 }
533
534 next if $allow_flags == 0; # Skip if can't allow this malformation
535
536 # Test when the malformation is allowed
537 undef @warnings;
bd70aaaf 538 $ret_ref = test_utf8n_to_uvchr($bytes, $length, $allow_flags);
eb83ed87
KW
539 is($ret_ref->[0], $allowed_uv, "$testname: allowed: Returns expected uv");
540 is($ret_ref->[1], $expected_len, "$testname: allowed: Returns expected length");
541 if (!is(scalar @warnings, 0, "$testname: allowed: no warnings generated"))
542 {
7dfd8446 543 diag "The warnings were: " . join(", ", @warnings);
eb83ed87
KW
544 }
545}
546
eb83ed87
KW
547# Now test the cases where a legal code point is generated, but may or may not
548# be allowed/warned on.
2f8f112e 549my @tests = (
7dfd8446
KW
550 [ "lowest surrogate",
551 (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"),
552 $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
553 'surrogate', 0xD800,
554 (isASCII) ? 3 : 4,
555 qr/surrogate/
556 ],
557 [ "a middle surrogate",
558 (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"),
559 $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
560 'surrogate', 0xD90D,
561 (isASCII) ? 3 : 4,
eb83ed87
KW
562 qr/surrogate/
563 ],
7dfd8446
KW
564 [ "highest surrogate",
565 (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"),
566 $UTF8_WARN_SURROGATE, $UTF8_DISALLOW_SURROGATE,
567 'surrogate', 0xDFFF,
568 (isASCII) ? 3 : 4,
569 qr/surrogate/
570 ],
571 [ "first non_unicode",
572 (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"),
573 $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
574 'non_unicode', 0x110000,
575 (isASCII) ? 4 : 5,
576 qr/not Unicode.* may not be portable/
577 ],
578 [ "first of 32 consecutive non-character code points",
579 (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"),
580 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
581 'nonchar', 0xFDD0,
582 (isASCII) ? 3 : 4,
583 qr/Unicode non-character.*is not recommended for open interchange/
584 ],
585 [ "a mid non-character code point of the 32 consecutive ones",
586 (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"),
587 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
588 'nonchar', 0xFDE0,
589 (isASCII) ? 3 : 4,
590 qr/Unicode non-character.*is not recommended for open interchange/
591 ],
592 [ "final of 32 consecutive non-character code points",
593 (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"),
594 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
595 'nonchar', 0xFDEF,
596 (isASCII) ? 3 : 4,
597 qr/Unicode non-character.*is not recommended for open interchange/
598 ],
599 [ "non-character code point U+FFFE",
600 (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"),
601 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
602 'nonchar', 0xFFFE,
603 (isASCII) ? 3 : 4,
604 qr/Unicode non-character.*is not recommended for open interchange/
605 ],
606 [ "non-character code point U+FFFF",
607 (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"),
608 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
609 'nonchar', 0xFFFF,
610 (isASCII) ? 3 : 4,
611 qr/Unicode non-character.*is not recommended for open interchange/
612 ],
613 [ "non-character code point U+1FFFE",
614 (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"),
615 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
616 'nonchar', 0x1FFFE, 4,
617 qr/Unicode non-character.*is not recommended for open interchange/
618 ],
619 [ "non-character code point U+1FFFF",
620 (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"),
621 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
622 'nonchar', 0x1FFFF, 4,
623 qr/Unicode non-character.*is not recommended for open interchange/
624 ],
625 [ "non-character code point U+2FFFE",
626 (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"),
627 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
628 'nonchar', 0x2FFFE, 4,
629 qr/Unicode non-character.*is not recommended for open interchange/
630 ],
631 [ "non-character code point U+2FFFF",
632 (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"),
633 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
634 'nonchar', 0x2FFFF, 4,
635 qr/Unicode non-character.*is not recommended for open interchange/
636 ],
637 [ "non-character code point U+3FFFE",
638 (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"),
639 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
640 'nonchar', 0x3FFFE, 4,
641 qr/Unicode non-character.*is not recommended for open interchange/
642 ],
643 [ "non-character code point U+3FFFF",
644 (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"),
645 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
646 'nonchar', 0x3FFFF, 4,
647 qr/Unicode non-character.*is not recommended for open interchange/
648 ],
649 [ "non-character code point U+4FFFE",
650 (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"),
651 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
652 'nonchar', 0x4FFFE,
653 (isASCII) ? 4 : 5,
654 qr/Unicode non-character.*is not recommended for open interchange/
655 ],
656 [ "non-character code point U+4FFFF",
657 (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"),
658 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
659 'nonchar', 0x4FFFF,
660 (isASCII) ? 4 : 5,
661 qr/Unicode non-character.*is not recommended for open interchange/
662 ],
663 [ "non-character code point U+5FFFE",
664 (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"),
665 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
666 'nonchar', 0x5FFFE,
667 (isASCII) ? 4 : 5,
668 qr/Unicode non-character.*is not recommended for open interchange/
669 ],
670 [ "non-character code point U+5FFFF",
671 (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"),
672 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
673 'nonchar', 0x5FFFF,
674 (isASCII) ? 4 : 5,
675 qr/Unicode non-character.*is not recommended for open interchange/
676 ],
677 [ "non-character code point U+6FFFE",
678 (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"),
679 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
680 'nonchar', 0x6FFFE,
681 (isASCII) ? 4 : 5,
682 qr/Unicode non-character.*is not recommended for open interchange/
683 ],
684 [ "non-character code point U+6FFFF",
685 (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"),
686 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
687 'nonchar', 0x6FFFF,
688 (isASCII) ? 4 : 5,
689 qr/Unicode non-character.*is not recommended for open interchange/
690 ],
691 [ "non-character code point U+7FFFE",
692 (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"),
693 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
694 'nonchar', 0x7FFFE,
695 (isASCII) ? 4 : 5,
696 qr/Unicode non-character.*is not recommended for open interchange/
697 ],
698 [ "non-character code point U+7FFFF",
699 (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"),
700 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
701 'nonchar', 0x7FFFF,
702 (isASCII) ? 4 : 5,
703 qr/Unicode non-character.*is not recommended for open interchange/
704 ],
705 [ "non-character code point U+8FFFE",
706 (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"),
707 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
708 'nonchar', 0x8FFFE,
709 (isASCII) ? 4 : 5,
710 qr/Unicode non-character.*is not recommended for open interchange/
711 ],
712 [ "non-character code point U+8FFFF",
713 (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"),
714 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
715 'nonchar', 0x8FFFF,
716 (isASCII) ? 4 : 5,
717 qr/Unicode non-character.*is not recommended for open interchange/
718 ],
719 [ "non-character code point U+9FFFE",
720 (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"),
721 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
722 'nonchar', 0x9FFFE,
723 (isASCII) ? 4 : 5,
724 qr/Unicode non-character.*is not recommended for open interchange/
725 ],
726 [ "non-character code point U+9FFFF",
727 (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"),
728 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
729 'nonchar', 0x9FFFF,
730 (isASCII) ? 4 : 5,
731 qr/Unicode non-character.*is not recommended for open interchange/
732 ],
733 [ "non-character code point U+AFFFE",
734 (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"),
735 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
736 'nonchar', 0xAFFFE,
737 (isASCII) ? 4 : 5,
738 qr/Unicode non-character.*is not recommended for open interchange/
739 ],
740 [ "non-character code point U+AFFFF",
741 (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"),
742 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
743 'nonchar', 0xAFFFF,
744 (isASCII) ? 4 : 5,
745 qr/Unicode non-character.*is not recommended for open interchange/
746 ],
747 [ "non-character code point U+BFFFE",
748 (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"),
749 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
750 'nonchar', 0xBFFFE,
751 (isASCII) ? 4 : 5,
752 qr/Unicode non-character.*is not recommended for open interchange/
753 ],
754 [ "non-character code point U+BFFFF",
755 (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"),
756 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
757 'nonchar', 0xBFFFF,
758 (isASCII) ? 4 : 5,
759 qr/Unicode non-character.*is not recommended for open interchange/
760 ],
761 [ "non-character code point U+CFFFE",
762 (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"),
763 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
764 'nonchar', 0xCFFFE,
765 (isASCII) ? 4 : 5,
766 qr/Unicode non-character.*is not recommended for open interchange/
767 ],
768 [ "non-character code point U+CFFFF",
769 (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"),
770 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
771 'nonchar', 0xCFFFF,
772 (isASCII) ? 4 : 5,
773 qr/Unicode non-character.*is not recommended for open interchange/
774 ],
775 [ "non-character code point U+DFFFE",
776 (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"),
777 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
778 'nonchar', 0xDFFFE,
779 (isASCII) ? 4 : 5,
780 qr/Unicode non-character.*is not recommended for open interchange/
781 ],
782 [ "non-character code point U+DFFFF",
783 (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"),
784 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
785 'nonchar', 0xDFFFF,
786 (isASCII) ? 4 : 5,
787 qr/Unicode non-character.*is not recommended for open interchange/
788 ],
789 [ "non-character code point U+EFFFE",
790 (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"),
791 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
792 'nonchar', 0xEFFFE,
793 (isASCII) ? 4 : 5,
794 qr/Unicode non-character.*is not recommended for open interchange/
795 ],
796 [ "non-character code point U+EFFFF",
797 (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"),
798 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
799 'nonchar', 0xEFFFF,
800 (isASCII) ? 4 : 5,
801 qr/Unicode non-character.*is not recommended for open interchange/
802 ],
803 [ "non-character code point U+FFFFE",
804 (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"),
805 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
806 'nonchar', 0xFFFFE,
807 (isASCII) ? 4 : 5,
808 qr/Unicode non-character.*is not recommended for open interchange/
809 ],
810 [ "non-character code point U+FFFFF",
811 (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"),
812 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
813 'nonchar', 0xFFFFF,
814 (isASCII) ? 4 : 5,
815 qr/Unicode non-character.*is not recommended for open interchange/
816 ],
817 [ "non-character code point U+10FFFE",
818 (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"),
819 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
820 'nonchar', 0x10FFFE,
821 (isASCII) ? 4 : 5,
822 qr/Unicode non-character.*is not recommended for open interchange/
eb83ed87 823 ],
7dfd8446
KW
824 [ "non-character code point U+10FFFF",
825 (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"),
826 $UTF8_WARN_NONCHAR, $UTF8_DISALLOW_NONCHAR,
827 'nonchar', 0x10FFFF,
828 (isASCII) ? 4 : 5,
ba707cdc 829 qr/Unicode non-character.*is not recommended for open interchange/
eb83ed87 830 ],
1d1c12d9 831 [ "requires at least 32 bits",
c0236afe
KW
832 (isASCII)
833 ? "\xfe\x82\x80\x80\x80\x80\x80"
834 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
eb83ed87 835 # This code point is chosen so that it is representable in a UV on
2f8f112e 836 # 32-bit machines
1d1c12d9 837 $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
c0236afe 838 'utf8', 0x80000000, (isASCII) ? 7 :14,
7dfd8446
KW
839 qr/Code point 0x80000000 is not Unicode, and not portable/
840 ],
1d1c12d9 841 [ "requires at least 32 bits, and use SUPER-type flags, instead of ABOVE_31_BIT",
c0236afe
KW
842 (isASCII)
843 ? "\xfe\x82\x80\x80\x80\x80\x80"
844 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"),
7dfd8446 845 $UTF8_WARN_SUPER, $UTF8_DISALLOW_SUPER,
c0236afe 846 'utf8', 0x80000000, (isASCII) ? 7 :14,
ea5ced44 847 qr/Code point 0x80000000 is not Unicode, and not portable/
eb83ed87 848 ],
1d1c12d9
KW
849 [ "overflow with warnings/disallow for more than 31 bits",
850 # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT
851 # with overflow. The overflow malformation is never allowed, so
852 # preventing it takes precedence if the ABOVE_31_BIT options would
c0236afe 853 # otherwise allow in an overflowing value. The ASCII code points (1
1d1c12d9
KW
854 # for 32-bits; 1 for 64) were chosen because the old overflow
855 # detection algorithm did not catch them; this means this test also
c0236afe
KW
856 # checks for that fix. The EBCDIC are arbitrary overflowing ones
857 # since we have no reports of failures with it.
858 (($is64bit)
859 ? ((isASCII)
860 ? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"
861 : I8_to_native("\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"))
862 : ((isASCII)
863 ? "\xfe\x86\x80\x80\x80\x80\x80"
864 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))),
ea5ced44 865
1d1c12d9
KW
866 # We include both warning categories to make sure the ABOVE_31_BIT one
867 # has precedence
868 "$UTF8_WARN_ABOVE_31_BIT|$UTF8_WARN_SUPER",
869 "$UTF8_DISALLOW_ABOVE_31_BIT",
7dfd8446 870 'utf8', 0,
c0236afe 871 (! isASCII) ? 14 : ($is64bit) ? 13 : 7,
ea5ced44 872 qr/overflow at byte .*, after start byte 0xf/
eb83ed87 873 ],
c0236afe 874);
2f8f112e 875
c0236afe
KW
876if ($is64bit) {
877 no warnings qw{portable overflow};
2f8f112e 878 push @tests,
c0236afe
KW
879 [ "More than 32 bits",
880 (isASCII)
881 ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80"
882 : I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
1d1c12d9 883 $UTF8_WARN_ABOVE_31_BIT, $UTF8_DISALLOW_ABOVE_31_BIT,
c0236afe 884 'utf8', 0x1000000000, (isASCII) ? 13 : 14,
ea5ced44 885 qr/Code point 0x.* is not Unicode, and not portable/
2f8f112e 886 ];
83dc0f42
KW
887 if (! isASCII) {
888 push @tests, # These could falsely show wrongly in a naive implementation
889 [ "requires at least 32 bits",
890 I8_to_native("\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
891 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
892 'utf8', 0x800000000, 14,
893 qr/Code point 0x800000000 is not Unicode, and not portable/
894 ],
895 [ "requires at least 32 bits",
896 I8_to_native("\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
897 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
898 'utf8', 0x10000000000, 14,
899 qr/Code point 0x10000000000 is not Unicode, and not portable/
900 ],
901 [ "requires at least 32 bits",
902 I8_to_native("\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
903 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
904 'utf8', 0x200000000000, 14,
905 qr/Code point 0x200000000000 is not Unicode, and not portable/
906 ],
907 [ "requires at least 32 bits",
908 I8_to_native("\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
909 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
910 'utf8', 0x4000000000000, 14,
911 qr/Code point 0x4000000000000 is not Unicode, and not portable/
912 ],
913 [ "requires at least 32 bits",
914 I8_to_native("\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
915 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
916 'utf8', 0x80000000000000, 14,
917 qr/Code point 0x80000000000000 is not Unicode, and not portable/
918 ],
919 [ "requires at least 32 bits",
920 I8_to_native("\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"),
921 $UTF8_WARN_ABOVE_31_BIT,$UTF8_DISALLOW_ABOVE_31_BIT,
922 'utf8', 0x1000000000000000, 14,
923 qr/Code point 0x1000000000000000 is not Unicode, and not portable/
924 ];
925 }
2f8f112e
KW
926}
927
928foreach my $test (@tests) {
eb83ed87
KW
929 my ($testname, $bytes, $warn_flags, $disallow_flags, $category, $allowed_uv, $expected_len, $message ) = @$test;
930
931 my $length = length $bytes;
2f8f112e 932 my $will_overflow = $testname =~ /overflow/;
eb83ed87
KW
933
934 # This is more complicated than the malformations tested earlier, as there
935 # are several orthogonal variables involved. We test all the subclasses
936 # of utf8 warnings to verify they work with and without the utf8 class,
937 # and don't have effects on other sublass warnings
54f4afef 938 foreach my $warning ('utf8', 'surrogate', 'nonchar', 'non_unicode') {
eb83ed87
KW
939 foreach my $warn_flag (0, $warn_flags) {
940 foreach my $disallow_flag (0, $disallow_flags) {
54f4afef 941 foreach my $do_warning (0, 1) {
eb83ed87 942
13d7a909
KW
943 my $eval_warn = $do_warning
944 ? "use warnings '$warning'"
945 : $warning eq "utf8"
7dfd8446
KW
946 ? "no warnings 'utf8'"
947 : "use warnings 'utf8'; no warnings '$warning'";
2f8f112e 948
13d7a909
KW
949 # is effectively disallowed if will overflow, even if the
950 # flag indicates it is allowed, fix up test name to
951 # indicate this as well
952 my $disallowed = $disallow_flag || $will_overflow;
2f8f112e 953
046d01eb 954 my $this_name = "utf8n_to_uvchr() $testname: " . (($disallow_flag)
13d7a909
KW
955 ? 'disallowed'
956 : ($disallowed)
1d1c12d9 957 ? 'ABOVE_31_BIT allowed'
13d7a909
KW
958 : 'allowed');
959 $this_name .= ", $eval_warn";
960 $this_name .= ", " . (($warn_flag)
961 ? 'with warning flag'
962 : 'no warning flag');
eb83ed87 963
13d7a909
KW
964 undef @warnings;
965 my $ret_ref;
7dfd8446
KW
966 my $display_bytes = display_bytes($bytes);
967 my $call = "Call was: $eval_warn; \$ret_ref = test_utf8n_to_uvchr('$display_bytes', $length, $warn_flag|$disallow_flag)";
968 my $eval_text = "$eval_warn; \$ret_ref = test_utf8n_to_uvchr('$bytes', $length, $warn_flag|$disallow_flag)";
13d7a909
KW
969 eval "$eval_text";
970 if (! ok ("$@ eq ''", "$this_name: eval succeeded")) {
7dfd8446 971 diag "\$!='$!'; eval'd=\"$call\"";
13d7a909 972 next;
54f4afef 973 }
13d7a909 974 if ($disallowed) {
7dfd8446
KW
975 unless (is($ret_ref->[0], 0, "$this_name: Returns 0"))
976 {
977 diag $call;
978 }
2f8f112e
KW
979 }
980 else {
7dfd8446
KW
981 unless (is($ret_ref->[0], $allowed_uv,
982 "$this_name: Returns expected uv"))
983 {
984 diag $call;
985 }
986 }
987 unless (is($ret_ref->[1], $expected_len,
988 "$this_name: Returns expected length"))
989 {
990 diag $call;
13d7a909 991 }
13d7a909
KW
992
993 if (! $do_warning
994 && ($warning eq 'utf8' || $warning eq $category))
995 {
996 if (!is(scalar @warnings, 0,
997 "$this_name: No warnings generated"))
998 {
7dfd8446
KW
999 diag $call;
1000 diag "The warnings were: " . join(", ", @warnings);
2f8f112e
KW
1001 }
1002 }
13d7a909
KW
1003 elsif ($will_overflow
1004 && ! $disallow_flag
1005 && $warning eq 'utf8')
1006 {
1007
1008 # Will get the overflow message instead of the expected
1009 # message under these circumstances, as they would
1010 # otherwise accept an overflowed value, which the code
1011 # should not allow, so falls back to overflow.
1012 if (is(scalar @warnings, 1,
1013 "$this_name: Got a single warning "))
1014 {
7dfd8446
KW
1015 unless (like($warnings[0], qr/overflow/,
1016 "$this_name: Got overflow warning"))
1017 {
1018 diag $call;
1019 }
13d7a909
KW
1020 }
1021 else {
7dfd8446 1022 diag $call;
13d7a909 1023 if (scalar @warnings) {
7dfd8446 1024 diag "The warnings were: "
13d7a909
KW
1025 . join(", ", @warnings);
1026 }
1027 }
eb83ed87 1028 }
13d7a909
KW
1029 elsif ($warn_flag
1030 && ($warning eq 'utf8' || $warning eq $category))
1031 {
1032 if (is(scalar @warnings, 1,
1033 "$this_name: Got a single warning "))
1034 {
7dfd8446
KW
1035 unless (like($warnings[0], $message,
1036 "$this_name: Got expected warning"))
1037 {
1038 diag $call;
1039 }
13d7a909
KW
1040 }
1041 else {
7dfd8446 1042 diag $call;
13d7a909 1043 if (scalar @warnings) {
7dfd8446 1044 diag "The warnings were: "
13d7a909
KW
1045 . join(", ", @warnings);
1046 }
eb83ed87
KW
1047 }
1048 }
eb83ed87 1049
13d7a909
KW
1050 # Check CHECK_ONLY results when the input is disallowed. Do
1051 # this when actually disallowed, not just when the
1052 # $disallow_flag is set
1053 if ($disallowed) {
1054 undef @warnings;
1055 $ret_ref = test_utf8n_to_uvchr($bytes, $length,
1056 $disallow_flag|$UTF8_CHECK_ONLY);
7dfd8446
KW
1057 unless (is($ret_ref->[0], 0, "$this_name, CHECK_ONLY: Returns 0")) {
1058 diag $call;
1059 }
1060 unless (is($ret_ref->[1], -1,
1061 "$this_name: CHECK_ONLY: returns expected length"))
1062 {
1063 diag $call;
1064 }
13d7a909
KW
1065 if (! is(scalar @warnings, 0,
1066 "$this_name, CHECK_ONLY: no warnings generated"))
1067 {
7dfd8446
KW
1068 diag $call;
1069 diag "The warnings were: " . join(", ", @warnings);
13d7a909 1070 }
eb83ed87 1071 }
046d01eb
KW
1072
1073 # Now repeat some of the above, but for
1074 # uvchr_to_utf8_flags(). Since this comes from an
1075 # existing code point, it hasn't overflowed.
1076 next if $will_overflow;
1077
1078 # The warning and disallow flags passed in are for
1079 # utf8n_to_uvchr(). Convert them for
1080 # uvchr_to_utf8_flags().
1081 my $uvchr_warn_flag = 0;
1082 my $uvchr_disallow_flag = 0;
1083 if ($warn_flag) {
1084 if ($warn_flag == $UTF8_WARN_SURROGATE) {
1085 $uvchr_warn_flag = $UNICODE_WARN_SURROGATE
1086 }
1087 elsif ($warn_flag == $UTF8_WARN_NONCHAR) {
1088 $uvchr_warn_flag = $UNICODE_WARN_NONCHAR
1089 }
1090 elsif ($warn_flag == $UTF8_WARN_SUPER) {
1091 $uvchr_warn_flag = $UNICODE_WARN_SUPER
1092 }
1d1c12d9 1093 elsif ($warn_flag == $UTF8_WARN_ABOVE_31_BIT) {
046d01eb
KW
1094 $uvchr_warn_flag = $UNICODE_WARN_ABOVE_31_BIT;
1095 }
1096 else {
1097 fail(sprintf "Unexpected warn flag: %x",
1098 $warn_flag);
1099 next;
1100 }
1101 }
1102 if ($disallow_flag) {
1103 if ($disallow_flag == $UTF8_DISALLOW_SURROGATE) {
1104 $uvchr_disallow_flag = $UNICODE_DISALLOW_SURROGATE
1105 }
1106 elsif ($disallow_flag == $UTF8_DISALLOW_NONCHAR) {
1107 $uvchr_disallow_flag = $UNICODE_DISALLOW_NONCHAR
1108 }
1109 elsif ($disallow_flag == $UTF8_DISALLOW_SUPER) {
1110 $uvchr_disallow_flag = $UNICODE_DISALLOW_SUPER
1111 }
1d1c12d9 1112 elsif ($disallow_flag == $UTF8_DISALLOW_ABOVE_31_BIT) {
046d01eb
KW
1113 $uvchr_disallow_flag =
1114 $UNICODE_DISALLOW_ABOVE_31_BIT;
1115 }
1116 else {
1117 fail(sprintf "Unexpected disallow flag: %x",
1118 $disallow_flag);
1119 next;
1120 }
1121 }
1122
1123 $disallowed = $uvchr_disallow_flag;
1124
1125 $this_name = "uvchr_to_utf8_flags() $testname: "
1126 . (($uvchr_disallow_flag)
1127 ? 'disallowed'
1128 : ($disallowed)
1129 ? 'ABOVE_31_BIT allowed'
1130 : 'allowed');
1131 $this_name .= ", $eval_warn";
1132 $this_name .= ", " . (($uvchr_warn_flag)
1133 ? 'with warning flag'
1134 : 'no warning flag');
1135
1136 undef @warnings;
1137 my $ret;
1138 my $warn_flag = sprintf "0x%x", $uvchr_warn_flag;
1139 my $disallow_flag = sprintf "0x%x", $uvchr_disallow_flag;
1140 $call = sprintf "call was: $eval_warn; \$ret = test_uvchr_to_utf8_flags(0x%x, $warn_flag|$disallow_flag)", $allowed_uv;
1141 $eval_text = "$eval_warn; \$ret = test_uvchr_to_utf8_flags($allowed_uv, $warn_flag|$disallow_flag)";
1142 eval "$eval_text";
1143 if (! ok ("$@ eq ''", "$this_name: eval succeeded")) {
1144 diag "\$!='$!'; eval'd=\"$eval_text\"";
1145 next;
1146 }
1147 if ($disallowed) {
1148 unless (is($ret, undef, "$this_name: Returns undef")) {
1149 diag $call;
1150 }
1151 }
1152 else {
1153 unless (is($ret, $bytes, "$this_name: Returns expected string")) {
1154 diag $call;
1155 }
1156 }
1157 if (! $do_warning
1158 && ($warning eq 'utf8' || $warning eq $category))
1159 {
1160 if (!is(scalar @warnings, 0,
1161 "$this_name: No warnings generated"))
1162 {
1163 diag $call;
1164 diag "The warnings were: " . join(", ", @warnings);
1165 }
1166 }
1167 elsif ($uvchr_warn_flag
1168 && ($warning eq 'utf8' || $warning eq $category))
1169 {
1170 if (is(scalar @warnings, 1,
1171 "$this_name: Got a single warning "))
1172 {
046d01eb
KW
1173 unless (like($warnings[0], $message,
1174 "$this_name: Got expected warning"))
1175 {
1176 diag $call;
1177 }
046d01eb
KW
1178 }
1179 else {
1180 diag $call;
1181 if (scalar @warnings) {
1182 diag "The warnings were: "
1183 . join(", ", @warnings);
1184 }
1185 }
1186 }
eb83ed87
KW
1187 }
1188 }
1189 }
1190 }
1191}
6e3d6c02 1192
fed3ba5d 1193done_testing;