Commit | Line | Data |
---|---|---|
6aa905cf KW |
1 | #!perl -w |
2 | ||
3 | # This is a base file to be used by various .t's in its directory | |
4 | ||
5 | use strict; | |
6 | use Test::More; | |
7 | ||
8 | BEGIN { | |
9 | use_ok('XS::APItest'); | |
10 | require 'charset_tools.pl'; | |
11 | require './t/utf8_setup.pl'; | |
12 | }; | |
13 | ||
14 | $|=1; | |
15 | ||
16 | no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit | |
17 | # machines, and that is tested elsewhere | |
18 | ||
19 | use XS::APItest; | |
20 | use Data::Dumper; | |
21 | ||
9cdc3054 | 22 | my @warnings_gotten; |
6aa905cf KW |
23 | |
24 | use warnings 'utf8'; | |
8d6f1506 KW |
25 | local $SIG{__WARN__} = sub { my @copy = @_; |
26 | push @warnings_gotten, map { chomp; $_ } @copy; | |
27 | }; | |
6aa905cf | 28 | |
6aa905cf KW |
29 | |
30 | # Now test the cases where a legal code point is generated, but may or may not | |
31 | # be allowed/warned on. | |
32 | my @tests = ( | |
d42d7565 | 33 | # ($testname, $bytes, $allowed_uv, $needed_to_discern_len ) |
6aa905cf KW |
34 | [ "lowest surrogate", |
35 | (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"), | |
d42d7565 | 36 | 0xD800, |
6aa905cf KW |
37 | ], |
38 | [ "a middle surrogate", | |
39 | (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"), | |
d42d7565 | 40 | 0xD90D, |
6aa905cf KW |
41 | ], |
42 | [ "highest surrogate", | |
43 | (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"), | |
d42d7565 | 44 | 0xDFFF, |
6aa905cf KW |
45 | ], |
46 | [ "first non_unicode", | |
47 | (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"), | |
d42d7565 | 48 | 0x110000, |
6aa905cf | 49 | 2, |
6aa905cf KW |
50 | ], |
51 | [ "non_unicode whose first byte tells that", | |
52 | (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"), | |
6aa905cf | 53 | (isASCII) ? 0x140000 : 0x200000, |
6aa905cf | 54 | 1, |
6aa905cf KW |
55 | ], |
56 | [ "first of 32 consecutive non-character code points", | |
57 | (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"), | |
d42d7565 | 58 | 0xFDD0, |
6aa905cf KW |
59 | ], |
60 | [ "a mid non-character code point of the 32 consecutive ones", | |
61 | (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"), | |
d42d7565 | 62 | 0xFDE0, |
6aa905cf KW |
63 | ], |
64 | [ "final of 32 consecutive non-character code points", | |
65 | (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"), | |
d42d7565 | 66 | 0xFDEF, |
6aa905cf KW |
67 | ], |
68 | [ "non-character code point U+FFFE", | |
69 | (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"), | |
d42d7565 | 70 | 0xFFFE, |
6aa905cf KW |
71 | ], |
72 | [ "non-character code point U+FFFF", | |
73 | (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"), | |
d42d7565 | 74 | 0xFFFF, |
6aa905cf KW |
75 | ], |
76 | [ "non-character code point U+1FFFE", | |
77 | (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"), | |
d42d7565 | 78 | 0x1FFFE, |
6aa905cf KW |
79 | ], |
80 | [ "non-character code point U+1FFFF", | |
81 | (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"), | |
d42d7565 | 82 | 0x1FFFF, |
6aa905cf KW |
83 | ], |
84 | [ "non-character code point U+2FFFE", | |
85 | (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"), | |
d42d7565 | 86 | 0x2FFFE, |
6aa905cf KW |
87 | ], |
88 | [ "non-character code point U+2FFFF", | |
89 | (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"), | |
d42d7565 | 90 | 0x2FFFF, |
6aa905cf KW |
91 | ], |
92 | [ "non-character code point U+3FFFE", | |
93 | (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"), | |
d42d7565 | 94 | 0x3FFFE, |
6aa905cf KW |
95 | ], |
96 | [ "non-character code point U+3FFFF", | |
97 | (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"), | |
d42d7565 | 98 | 0x3FFFF, |
6aa905cf KW |
99 | ], |
100 | [ "non-character code point U+4FFFE", | |
101 | (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"), | |
d42d7565 | 102 | 0x4FFFE, |
6aa905cf KW |
103 | ], |
104 | [ "non-character code point U+4FFFF", | |
105 | (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"), | |
d42d7565 | 106 | 0x4FFFF, |
6aa905cf KW |
107 | ], |
108 | [ "non-character code point U+5FFFE", | |
109 | (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"), | |
d42d7565 | 110 | 0x5FFFE, |
6aa905cf KW |
111 | ], |
112 | [ "non-character code point U+5FFFF", | |
113 | (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"), | |
d42d7565 | 114 | 0x5FFFF, |
6aa905cf KW |
115 | ], |
116 | [ "non-character code point U+6FFFE", | |
117 | (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"), | |
d42d7565 | 118 | 0x6FFFE, |
6aa905cf KW |
119 | ], |
120 | [ "non-character code point U+6FFFF", | |
121 | (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"), | |
d42d7565 | 122 | 0x6FFFF, |
6aa905cf KW |
123 | ], |
124 | [ "non-character code point U+7FFFE", | |
125 | (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"), | |
d42d7565 | 126 | 0x7FFFE, |
6aa905cf KW |
127 | ], |
128 | [ "non-character code point U+7FFFF", | |
129 | (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"), | |
d42d7565 | 130 | 0x7FFFF, |
6aa905cf KW |
131 | ], |
132 | [ "non-character code point U+8FFFE", | |
133 | (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"), | |
d42d7565 | 134 | 0x8FFFE, |
6aa905cf KW |
135 | ], |
136 | [ "non-character code point U+8FFFF", | |
137 | (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"), | |
d42d7565 | 138 | 0x8FFFF, |
6aa905cf KW |
139 | ], |
140 | [ "non-character code point U+9FFFE", | |
141 | (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"), | |
d42d7565 | 142 | 0x9FFFE, |
6aa905cf KW |
143 | ], |
144 | [ "non-character code point U+9FFFF", | |
145 | (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"), | |
d42d7565 | 146 | 0x9FFFF, |
6aa905cf KW |
147 | ], |
148 | [ "non-character code point U+AFFFE", | |
149 | (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"), | |
d42d7565 | 150 | 0xAFFFE, |
6aa905cf KW |
151 | ], |
152 | [ "non-character code point U+AFFFF", | |
153 | (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"), | |
d42d7565 | 154 | 0xAFFFF, |
6aa905cf KW |
155 | ], |
156 | [ "non-character code point U+BFFFE", | |
157 | (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"), | |
d42d7565 | 158 | 0xBFFFE, |
6aa905cf KW |
159 | ], |
160 | [ "non-character code point U+BFFFF", | |
161 | (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"), | |
d42d7565 | 162 | 0xBFFFF, |
6aa905cf KW |
163 | ], |
164 | [ "non-character code point U+CFFFE", | |
165 | (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"), | |
d42d7565 | 166 | 0xCFFFE, |
6aa905cf KW |
167 | ], |
168 | [ "non-character code point U+CFFFF", | |
169 | (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"), | |
d42d7565 | 170 | 0xCFFFF, |
6aa905cf KW |
171 | ], |
172 | [ "non-character code point U+DFFFE", | |
173 | (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"), | |
d42d7565 | 174 | 0xDFFFE, |
6aa905cf KW |
175 | ], |
176 | [ "non-character code point U+DFFFF", | |
177 | (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"), | |
d42d7565 | 178 | 0xDFFFF, |
6aa905cf KW |
179 | ], |
180 | [ "non-character code point U+EFFFE", | |
181 | (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"), | |
d42d7565 | 182 | 0xEFFFE, |
6aa905cf KW |
183 | ], |
184 | [ "non-character code point U+EFFFF", | |
185 | (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"), | |
d42d7565 | 186 | 0xEFFFF, |
6aa905cf KW |
187 | ], |
188 | [ "non-character code point U+FFFFE", | |
189 | (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"), | |
d42d7565 | 190 | 0xFFFFE, |
6aa905cf KW |
191 | ], |
192 | [ "non-character code point U+FFFFF", | |
193 | (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"), | |
d42d7565 | 194 | 0xFFFFF, |
6aa905cf KW |
195 | ], |
196 | [ "non-character code point U+10FFFE", | |
197 | (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"), | |
d42d7565 | 198 | 0x10FFFE, |
6aa905cf KW |
199 | ], |
200 | [ "non-character code point U+10FFFF", | |
201 | (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"), | |
d42d7565 | 202 | 0x10FFFF, |
6aa905cf | 203 | ], |
76513bdc KW |
204 | [ "requires at least 32 bits", |
205 | (isASCII) | |
206 | ? "\xfe\x82\x80\x80\x80\x80\x80" | |
207 | : I8_to_native( | |
208 | "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"), | |
209 | # This code point is chosen so that it is representable in a UV on | |
210 | # 32-bit machines | |
d42d7565 | 211 | 0x80000000, |
76513bdc | 212 | (isASCII) ? 1 : 8, |
76513bdc KW |
213 | ], |
214 | [ "highest 32 bit code point", | |
215 | (isASCII) | |
216 | ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf" | |
217 | : I8_to_native( | |
218 | "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"), | |
d42d7565 | 219 | 0xFFFFFFFF, |
76513bdc | 220 | (isASCII) ? 1 : 8, |
76513bdc KW |
221 | ], |
222 | [ "requires at least 32 bits, and use SUPER-type flags, instead of" | |
223 | . " ABOVE_31_BIT", | |
224 | (isASCII) | |
225 | ? "\xfe\x82\x80\x80\x80\x80\x80" | |
226 | : I8_to_native( | |
227 | "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"), | |
d42d7565 | 228 | 0x80000000, |
76513bdc | 229 | 1, |
76513bdc KW |
230 | ], |
231 | [ "overflow with warnings/disallow for more than 31 bits", | |
232 | # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT | |
233 | # with overflow. The overflow malformation is never allowed, so | |
234 | # preventing it takes precedence if the ABOVE_31_BIT options would | |
235 | # otherwise allow in an overflowing value. The ASCII code points (1 | |
236 | # for 32-bits; 1 for 64) were chosen because the old overflow | |
237 | # detection algorithm did not catch them; this means this test also | |
238 | # checks for that fix. The EBCDIC are arbitrary overflowing ones | |
239 | # since we have no reports of failures with it. | |
240 | (($::is64bit) | |
241 | ? ((isASCII) | |
242 | ? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf" | |
243 | : I8_to_native( | |
244 | "\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0")) | |
245 | : ((isASCII) | |
246 | ? "\xfe\x86\x80\x80\x80\x80\x80" | |
247 | : I8_to_native( | |
248 | "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))), | |
d42d7565 | 249 | -1, |
76513bdc | 250 | (isASCII || $::is64bit) ? 2 : 8, |
76513bdc | 251 | ], |
6aa905cf KW |
252 | ); |
253 | ||
254 | if (! $::is64bit) { | |
255 | if (isASCII) { | |
256 | no warnings qw{portable overflow}; | |
257 | push @tests, | |
258 | [ "Lowest 33 bit code point: overflow", | |
259 | "\xFE\x84\x80\x80\x80\x80\x80", | |
d42d7565 | 260 | -1, |
b3169593 | 261 | 1, |
6aa905cf KW |
262 | ]; |
263 | } | |
264 | } | |
265 | else { | |
266 | no warnings qw{portable overflow}; | |
267 | push @tests, | |
268 | [ "More than 32 bits", | |
269 | (isASCII) | |
270 | ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80" | |
271 | : I8_to_native( | |
272 | "\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), | |
d42d7565 | 273 | 0x1000000000, |
b3169593 | 274 | (isASCII) ? 1 : 7, |
6aa905cf KW |
275 | ]; |
276 | if (! isASCII) { | |
277 | push @tests, # These could falsely show wrongly in a naive | |
278 | # implementation | |
279 | [ "requires at least 32 bits", | |
280 | I8_to_native( | |
281 | "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), | |
d42d7565 | 282 | 0x800000000, |
b3169593 | 283 | 7, |
6aa905cf KW |
284 | ], |
285 | [ "requires at least 32 bits", | |
286 | I8_to_native( | |
287 | "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), | |
d42d7565 | 288 | 0x10000000000, |
b3169593 | 289 | 6, |
6aa905cf KW |
290 | ], |
291 | [ "requires at least 32 bits", | |
292 | I8_to_native( | |
293 | "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), | |
d42d7565 | 294 | 0x200000000000, |
b3169593 | 295 | 5, |
6aa905cf KW |
296 | ], |
297 | [ "requires at least 32 bits", | |
298 | I8_to_native( | |
299 | "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), | |
d42d7565 | 300 | 0x4000000000000, |
b3169593 | 301 | 4, |
6aa905cf KW |
302 | ], |
303 | [ "requires at least 32 bits", | |
304 | I8_to_native( | |
305 | "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), | |
d42d7565 | 306 | 0x80000000000000, |
b3169593 | 307 | 3, |
6aa905cf KW |
308 | ], |
309 | [ "requires at least 32 bits", | |
310 | I8_to_native( | |
311 | "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), | |
d42d7565 | 312 | 0x1000000000000000, |
b3169593 | 313 | 2, |
6aa905cf KW |
314 | ]; |
315 | } | |
316 | } | |
317 | ||
5722c46d KW |
318 | sub flags_to_text($$) |
319 | { | |
320 | my ($flags, $flags_to_text_ref) = @_; | |
321 | ||
322 | # Returns a string containing a mnemonic representation of the bits that | |
323 | # are set in the $flags. These are assumed to be flag bits. The return | |
324 | # looks like "FOO|BAR|BAZ". The second parameter is a reference to an | |
325 | # array that gives the textual representation of all the possible flags. | |
326 | # Element 0 is the text for the bit 0 flag; element 1 for bit 1; .... If | |
327 | # no bits at all are set the string "0" is returned; | |
328 | ||
329 | my @flag_text; | |
330 | my $shift = 0; | |
331 | ||
332 | return "0" if $flags == 0; | |
333 | ||
334 | while ($flags) { | |
335 | #diag sprintf "%x", $flags; | |
336 | if ($flags & 1) { | |
337 | push @flag_text, $flags_to_text_ref->[$shift]; | |
338 | } | |
339 | $shift++; | |
340 | $flags >>= 1; | |
341 | } | |
342 | ||
343 | return join "|", @flag_text; | |
344 | } | |
345 | ||
346 | # Possible flag returns from utf8n_to_uvchr_error(). These should have G_, | |
347 | # instead of A_, D_, but the prefixes will be used in a a later commit, so | |
348 | # minimize churn by having them here. | |
349 | my @utf8n_flags_to_text = ( qw( | |
350 | A_EMPTY | |
351 | A_CONTINUATION | |
352 | A_NON_CONTINUATION | |
353 | A_SHORT | |
354 | A_LONG | |
355 | A_LONG_AND_ITS_VALUE | |
356 | PLACEHOLDER | |
357 | A_OVERFLOW | |
358 | D_SURROGATE | |
359 | W_SURROGATE | |
360 | D_NONCHAR | |
361 | W_NONCHAR | |
362 | D_SUPER | |
363 | W_SUPER | |
364 | D_ABOVE_31_BIT | |
365 | W_ABOVE_31_BIT | |
366 | CHECK_ONLY | |
367 | NO_CONFIDENCE_IN_CURLEN_ | |
368 | ) ); | |
369 | ||
a8ee5133 KW |
370 | sub utf8n_display_call($) |
371 | { | |
372 | # Converts an eval string that calls test_utf8n_to_uvchr into a more human | |
373 | # readable form, and returns it. Doesn't work if the byte string contains | |
374 | # an apostrophe. The return will look something like: | |
375 | # test_utf8n_to_uvchr_error('$bytes', $length, $flags) | |
376 | #diag $_[0]; | |
377 | ||
378 | $_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x; | |
379 | my $text1 = $1; # Everything before the byte string | |
380 | my $bytes = $2; | |
381 | my $text2 = $3; # Includes the length | |
382 | my $flags = $4; | |
383 | ||
384 | return $text1 | |
385 | . display_bytes($bytes) | |
386 | . $text2 | |
387 | . flags_to_text($flags, \@utf8n_flags_to_text) | |
388 | . ')'; | |
389 | } | |
390 | ||
d884ea32 KW |
391 | sub uvchr_display_call($) |
392 | { | |
393 | # Converts an eval string that calls test_uvchr_to_utf8 into a more human | |
394 | # readable form, and returns it. The return will look something like: | |
395 | # test_uvchr_to_utf8n_flags($uv, $flags) | |
396 | #diag $_[0]; | |
397 | ||
398 | my @flags_to_text = ( qw( | |
399 | W_SURROGATE | |
400 | W_NONCHAR | |
401 | W_SUPER | |
402 | W_ABOVE_31_BIT | |
403 | D_SURROGATE | |
404 | D_NONCHAR | |
405 | D_SUPER | |
406 | D_ABOVE_31_BIT | |
407 | ) ); | |
408 | ||
409 | $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x; | |
410 | my $text = $1; | |
411 | my $cp = sprintf "%X", $2; | |
412 | my $flags = $3; | |
413 | ||
414 | return "${text}0x$cp, " . flags_to_text($flags, \@flags_to_text) . ')'; | |
415 | } | |
416 | ||
69485e19 KW |
417 | sub do_warnings_test(@) |
418 | { | |
419 | my @expected_warnings = @_; | |
420 | ||
421 | # Compares the input expected warnings array with @warnings_gotten, | |
422 | # generating a pass for each found, removing it from @warnings_gotten. | |
423 | # Any discrepancies generate test failures. Returns TRUE if no | |
424 | # discrepcancies; otherwise FALSE. | |
425 | ||
426 | my $succeeded = 1; | |
427 | ||
428 | if (@expected_warnings == 0) { | |
429 | if (! is(@warnings_gotten, 0, " Expected and got no warnings")) { | |
430 | output_warnings(@warnings_gotten); | |
431 | $succeeded = 0; | |
432 | } | |
433 | return $succeeded; | |
434 | } | |
435 | ||
436 | # Check that we got all the expected warnings, | |
437 | # removing each one found | |
438 | WARNING: | |
439 | foreach my $expected (@expected_warnings) { | |
440 | foreach (my $i = 0; $i < @warnings_gotten; $i++) { | |
441 | if ($warnings_gotten[$i] =~ $expected) { | |
442 | pass(" Expected and got warning: " | |
443 | . " $warnings_gotten[$i]"); | |
444 | splice @warnings_gotten, $i, 1; | |
445 | next WARNING; | |
446 | } | |
447 | } | |
448 | fail(" Expected a warning that matches " | |
449 | . $expected . " but didn't get it"); | |
450 | $succeeded = 0; | |
451 | } | |
452 | ||
453 | if (! is(@warnings_gotten, 0, " Got no unexpected warnings")) { | |
454 | output_warnings(@warnings_gotten); | |
455 | $succeeded = 0; | |
456 | } | |
457 | ||
458 | return $succeeded; | |
459 | } | |
460 | ||
6aa905cf KW |
461 | # This test is split into this number of files. |
462 | my $num_test_files = $ENV{TEST_JOBS} || 1; | |
463 | $num_test_files = 10 if $num_test_files > 10; | |
464 | ||
465 | my $test_count = -1; | |
466 | foreach my $test (@tests) { | |
467 | $test_count++; | |
468 | next if $test_count % $num_test_files != $::TEST_CHUNK; | |
469 | ||
d42d7565 | 470 | my ($testname, $bytes, $allowed_uv, $needed_to_discern_len) = @$test; |
6aa905cf KW |
471 | |
472 | my $length = length $bytes; | |
04f42bf6 | 473 | my $will_overflow = $allowed_uv < 0; |
6aa905cf | 474 | |
2c511c58 KW |
475 | my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv); |
476 | ||
d42d7565 | 477 | my $controlling_warning_category; |
af816908 KW |
478 | my $utf8n_flag_to_warn; |
479 | my $utf8n_flag_to_disallow; | |
480 | my $uvchr_flag_to_warn; | |
481 | my $uvchr_flag_to_disallow; | |
b7e1f4b2 | 482 | |
e4e140b4 KW |
483 | # We want to test that the independent flags are actually independent. |
484 | # For example, that a surrogate doesn't trigger a non-character warning, | |
485 | # and conversely, turning off an above-Unicode flag doesn't suppress a | |
486 | # surrogate warning. Earlier versions of this file used nested loops to | |
487 | # test all possible combinations. But that creates lots of tests, making | |
488 | # this run too long. What is now done instead is to use the complement of | |
489 | # the category we are testing to greatly reduce the combinatorial | |
490 | # explosion. For example, if we have a surrogate and we aren't expecting | |
491 | # a warning about it, we set all the flags for non-surrogates to raise | |
492 | # warnings. If one shows up, it indicates the flags aren't independent. | |
493 | my $utf8n_flag_to_warn_complement; | |
494 | my $utf8n_flag_to_disallow_complement; | |
495 | my $uvchr_flag_to_warn_complement; | |
496 | my $uvchr_flag_to_disallow_complement; | |
497 | ||
bf422d6a KW |
498 | # Many of the code points being tested are middling in that if code point |
499 | # edge cases work, these are very likely to as well. Because this test | |
500 | # file takes a while to execute, we skip testing the edge effects of code | |
501 | # points deemed middling, while testing their basics and continuing to | |
502 | # fully test the non-middling code points. | |
503 | my $skip_most_tests = 0; | |
504 | ||
601e92f1 KW |
505 | my $cp_message_qr; # Pattern that matches the message raised when |
506 | # that message contains the problematic code | |
507 | # point. The message is the same (currently) both | |
508 | # when going from/to utf8. | |
509 | my $non_cp_trailing_text; # The suffix text when the message doesn't | |
510 | # contain a code point. (This is a result of | |
511 | # some sort of malformation that means we | |
512 | # can't get an exact code poin | |
513 | ||
af816908 KW |
514 | if ($will_overflow || $allowed_uv > 0x10FFFF) { |
515 | ||
e4e140b4 | 516 | # Set the SUPER flags; later, we test for ABOVE_31_BIT as well. |
af816908 KW |
517 | $utf8n_flag_to_warn = $::UTF8_WARN_SUPER; |
518 | $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER; | |
519 | $uvchr_flag_to_warn = $::UNICODE_WARN_SUPER; | |
520 | $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;; | |
521 | ||
e4e140b4 KW |
522 | # Below, we add the flags for non-above-31 bit to the code points that |
523 | # don't fit that category. Special tests are done for this category | |
524 | # in the inner loop. | |
525 | $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR | |
526 | |$::UTF8_WARN_SURROGATE; | |
527 | $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR | |
528 | |$::UTF8_DISALLOW_SURROGATE; | |
529 | $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR | |
530 | |$::UNICODE_WARN_SURROGATE; | |
531 | $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR | |
532 | |$::UNICODE_DISALLOW_SURROGATE; | |
533 | $controlling_warning_category = 'non_unicode'; | |
534 | ||
af816908 | 535 | if ($will_overflow) { |
69485e19 | 536 | $non_cp_trailing_text = "if you see this, there is an error"; |
601e92f1 | 537 | $cp_message_qr = qr/\Q$non_cp_trailing_text\E/; |
af816908 KW |
538 | } |
539 | elsif ($allowed_uv > 0x7FFFFFFF) { | |
601e92f1 KW |
540 | $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E |
541 | \Q and not portable\E/x; | |
542 | $non_cp_trailing_text = "is for a non-Unicode code point, and is not portable"; | |
af816908 | 543 | } |
601e92f1 KW |
544 | else { |
545 | $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E | |
546 | \Q may not be portable\E/x; | |
547 | $non_cp_trailing_text = "is for a non-Unicode code point, may not" | |
548 | . " be portable"; | |
e4e140b4 KW |
549 | $utf8n_flag_to_warn_complement |= $::UTF8_WARN_ABOVE_31_BIT; |
550 | $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_ABOVE_31_BIT; | |
551 | $uvchr_flag_to_warn_complement |= $::UNICODE_WARN_ABOVE_31_BIT; | |
552 | $uvchr_flag_to_disallow_complement | |
553 | |= $::UNICODE_DISALLOW_ABOVE_31_BIT; | |
af816908 | 554 | } |
3022ad00 KW |
555 | } |
556 | elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) { | |
601e92f1 KW |
557 | $cp_message_qr = qr/UTF-16 surrogate U\+$uv_string/; |
558 | $non_cp_trailing_text = "is for a surrogate"; | |
67e45424 | 559 | $needed_to_discern_len = 2 unless defined $needed_to_discern_len; |
bf422d6a | 560 | $skip_most_tests = 1 if $allowed_uv > 0xD800 && $allowed_uv < 0xDFFF; |
af816908 KW |
561 | |
562 | $utf8n_flag_to_warn = $::UTF8_WARN_SURROGATE; | |
563 | $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SURROGATE; | |
564 | $uvchr_flag_to_warn = $::UNICODE_WARN_SURROGATE; | |
565 | $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SURROGATE;; | |
e4e140b4 KW |
566 | |
567 | $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR | |
568 | |$::UTF8_WARN_SUPER | |
569 | |$::UTF8_WARN_ABOVE_31_BIT; | |
570 | $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR | |
571 | |$::UTF8_DISALLOW_SUPER | |
572 | |$::UTF8_DISALLOW_ABOVE_31_BIT; | |
573 | $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR | |
574 | |$::UNICODE_WARN_SUPER | |
575 | |$::UNICODE_WARN_ABOVE_31_BIT; | |
576 | $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR | |
577 | |$::UNICODE_DISALLOW_SUPER | |
578 | |$::UNICODE_DISALLOW_ABOVE_31_BIT; | |
d42d7565 | 579 | $controlling_warning_category = 'surrogate'; |
3022ad00 KW |
580 | } |
581 | elsif ( ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF) | |
582 | || ($allowed_uv & 0xFFFE) == 0xFFFE) | |
583 | { | |
601e92f1 KW |
584 | $cp_message_qr = qr/\QUnicode non-character U+$uv_string\E |
585 | \Q is not recommended for open interchange\E/x; | |
586 | $non_cp_trailing_text = "if you see this, there is an error"; | |
67e45424 | 587 | $needed_to_discern_len = $length unless defined $needed_to_discern_len; |
bf422d6a KW |
588 | if ( ($allowed_uv > 0xFDD0 && $allowed_uv < 0xFDEF) |
589 | || ($allowed_uv > 0xFFFF && $allowed_uv < 0x10FFFE)) | |
590 | { | |
591 | $skip_most_tests = 1; | |
592 | } | |
af816908 KW |
593 | |
594 | $utf8n_flag_to_warn = $::UTF8_WARN_NONCHAR; | |
595 | $utf8n_flag_to_disallow = $::UTF8_DISALLOW_NONCHAR; | |
596 | $uvchr_flag_to_warn = $::UNICODE_WARN_NONCHAR; | |
597 | $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_NONCHAR;; | |
e4e140b4 KW |
598 | |
599 | $utf8n_flag_to_warn_complement = $::UTF8_WARN_SURROGATE | |
600 | |$::UTF8_WARN_SUPER | |
601 | |$::UTF8_WARN_ABOVE_31_BIT; | |
602 | $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE | |
603 | |$::UTF8_DISALLOW_SUPER | |
604 | |$::UTF8_DISALLOW_ABOVE_31_BIT; | |
605 | $uvchr_flag_to_warn_complement = $::UNICODE_WARN_SURROGATE | |
606 | |$::UNICODE_WARN_SUPER | |
607 | |$::UNICODE_WARN_ABOVE_31_BIT; | |
608 | $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_SURROGATE | |
609 | |$::UNICODE_DISALLOW_SUPER | |
610 | |$::UNICODE_DISALLOW_ABOVE_31_BIT; | |
611 | ||
d42d7565 | 612 | $controlling_warning_category = 'nonchar'; |
3022ad00 KW |
613 | } |
614 | else { | |
615 | die "Can't figure out what type of warning to test for $testname" | |
616 | } | |
617 | ||
67e45424 KW |
618 | die 'Didn\'t set $needed_to_discern_len for ' . $testname |
619 | unless defined $needed_to_discern_len; | |
6aa905cf KW |
620 | { |
621 | use warnings; | |
9cdc3054 | 622 | undef @warnings_gotten; |
6aa905cf KW |
623 | my $ret = test_isUTF8_CHAR($bytes, $length); |
624 | my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0); | |
625 | if ($will_overflow) { | |
d402d77f KW |
626 | is($ret, 0, "For $testname: isUTF8_CHAR() returns 0"); |
627 | is($ret_flags, 0, " And isUTF8_CHAR_flags() returns 0"); | |
6aa905cf KW |
628 | } |
629 | else { | |
630 | is($ret, $length, | |
d402d77f KW |
631 | "For $testname: isUTF8_CHAR() returns expected length: $length"); |
632 | is($ret_flags, $length, " And isUTF8_CHAR_flags(...,0)" | |
6aa905cf KW |
633 | . " returns expected length: $length"); |
634 | } | |
9cdc3054 | 635 | is(scalar @warnings_gotten, 0, |
d402d77f KW |
636 | " And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags generated" |
637 | . " any warnings") | |
9cdc3054 | 638 | or output_warnings(@warnings_gotten); |
6aa905cf | 639 | |
9cdc3054 | 640 | undef @warnings_gotten; |
6aa905cf KW |
641 | $ret = test_isSTRICT_UTF8_CHAR($bytes, $length); |
642 | if ($will_overflow) { | |
d402d77f | 643 | is($ret, 0, " And isSTRICT_UTF8_CHAR() returns 0"); |
6aa905cf KW |
644 | } |
645 | else { | |
646 | my $expected_ret = ( $testname =~ /surrogate|non-character/ | |
647 | || $allowed_uv > 0x10FFFF) | |
648 | ? 0 | |
649 | : $length; | |
d402d77f | 650 | is($ret, $expected_ret, " And isSTRICT_UTF8_CHAR() returns" |
6aa905cf KW |
651 | . " expected length: $expected_ret"); |
652 | $ret = test_isUTF8_CHAR_flags($bytes, $length, | |
653 | $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE); | |
654 | is($ret, $expected_ret, | |
d402d77f KW |
655 | " And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')" |
656 | . " acts like isSTRICT_UTF8_CHAR"); | |
6aa905cf | 657 | } |
9cdc3054 | 658 | is(scalar @warnings_gotten, 0, |
d402d77f KW |
659 | " And neither isSTRICT_UTF8_CHAR() nor isUTF8_CHAR_flags" |
660 | . " generated any warnings") | |
9cdc3054 | 661 | or output_warnings(@warnings_gotten); |
6aa905cf | 662 | |
9cdc3054 | 663 | undef @warnings_gotten; |
6aa905cf KW |
664 | $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length); |
665 | if ($will_overflow) { | |
d402d77f | 666 | is($ret, 0, " And isC9_STRICT_UTF8_CHAR() returns 0"); |
6aa905cf KW |
667 | } |
668 | else { | |
669 | my $expected_ret = ( $testname =~ /surrogate/ | |
670 | || $allowed_uv > 0x10FFFF) | |
671 | ? 0 | |
672 | : $length; | |
d402d77f | 673 | is($ret, $expected_ret, " And isC9_STRICT_UTF8_CHAR()" |
6aa905cf KW |
674 | ." returns expected length: $expected_ret"); |
675 | $ret = test_isUTF8_CHAR_flags($bytes, $length, | |
676 | $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE); | |
677 | is($ret, $expected_ret, | |
d402d77f KW |
678 | " And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')" |
679 | . " acts like isC9_STRICT_UTF8_CHAR"); | |
6aa905cf | 680 | } |
9cdc3054 | 681 | is(scalar @warnings_gotten, 0, |
d402d77f KW |
682 | " And neither isC9_STRICT_UTF8_CHAR() nor isUTF8_CHAR_flags" |
683 | . " generated any warnings") | |
9cdc3054 | 684 | or output_warnings(@warnings_gotten); |
6aa905cf KW |
685 | |
686 | # Test partial character handling, for each byte not a full character | |
687 | for my $j (1.. $length - 1) { | |
688 | ||
689 | # Skip the test for the interaction between overflow and above-31 | |
690 | # bit. It is really testing other things than the partial | |
691 | # character tests, for which other tests in this file are | |
692 | # sufficient | |
04f42bf6 | 693 | last if $will_overflow; |
6aa905cf | 694 | |
69485e19 | 695 | foreach my $disallow_flag (0, $utf8n_flag_to_disallow) { |
6aa905cf KW |
696 | my $partial = substr($bytes, 0, $j); |
697 | my $ret_should_be; | |
698 | my $comment; | |
699 | if ($disallow_flag) { | |
700 | $ret_should_be = 0; | |
701 | $comment = "disallowed"; | |
702 | if ($j < $needed_to_discern_len) { | |
703 | $ret_should_be = 1; | |
704 | $comment .= ", but need $needed_to_discern_len bytes" | |
705 | . " to discern:"; | |
706 | } | |
707 | } | |
708 | else { | |
709 | $ret_should_be = 1; | |
710 | $comment = "allowed"; | |
711 | } | |
712 | ||
9cdc3054 | 713 | undef @warnings_gotten; |
6aa905cf KW |
714 | |
715 | $ret = test_is_utf8_valid_partial_char_flags($partial, $j, | |
716 | $disallow_flag); | |
717 | is($ret, $ret_should_be, | |
d402d77f KW |
718 | " And is_utf8_valid_partial_char_flags(" |
719 | . display_bytes($partial) | |
720 | . "), $comment: returns $ret_should_be"); | |
9cdc3054 | 721 | is(scalar @warnings_gotten, 0, |
d402d77f | 722 | " And is_utf8_valid_partial_char_flags()" |
6aa905cf | 723 | . " generated no warnings") |
9cdc3054 | 724 | or output_warnings(@warnings_gotten); |
6aa905cf KW |
725 | } |
726 | } | |
727 | } | |
728 | ||
69485e19 KW |
729 | # This is more complicated than the malformations tested in other files in |
730 | # this directory, as there are several orthogonal variables involved. We | |
731 | # test most possible combinations | |
69485e19 KW |
732 | foreach my $do_disallow (0, 1) { |
733 | next if $skip_most_tests && ! $do_disallow; | |
6aa905cf | 734 | |
e4e140b4 | 735 | # We try various combinations of malformations that can occur |
6aa905cf | 736 | foreach my $short ("", "short") { |
bf422d6a | 737 | next if $skip_most_tests && $short; |
6aa905cf KW |
738 | foreach my $unexpected_noncont ("", |
739 | "unexpected non-continuation") | |
740 | { | |
bf422d6a | 741 | next if $skip_most_tests && $unexpected_noncont; |
6aa905cf | 742 | foreach my $overlong ("", "overlong") { |
bf422d6a | 743 | next if $overlong && $skip_most_tests; |
6aa905cf | 744 | |
b3169593 KW |
745 | # If we're creating an overlong, it can't be longer than |
746 | # the maximum length, so skip if we're already at that | |
747 | # length. | |
748 | next if $overlong && $length >= $::max_bytes; | |
6aa905cf | 749 | |
69485e19 KW |
750 | # We classify the warnings into certain "interesting" types, |
751 | # described later | |
752 | foreach my $warning_type (0..4) { | |
753 | next if $skip_most_tests && $warning_type != 1; | |
754 | foreach my $use_warn_flag (0, 1) { | |
755 | next if $skip_most_tests && ! $use_warn_flag; | |
756 | ||
757 | my $this_utf8n_flag_to_warn = $utf8n_flag_to_warn; | |
758 | my $this_utf8n_flag_to_disallow = $utf8n_flag_to_disallow; | |
759 | my $this_uvchr_flag_to_warn = $uvchr_flag_to_warn; | |
760 | my $this_uvchr_flag_to_disallow = $uvchr_flag_to_disallow; | |
761 | ||
762 | my $eval_warn; | |
763 | my $expect_regular_warnings; | |
764 | my $expect_warnings_for_malformed; | |
765 | my $expect_warnings_for_overflow; | |
766 | ||
767 | if ($warning_type == 0) { | |
768 | $eval_warn = "use warnings; no warnings 'deprecated'"; | |
769 | $expect_regular_warnings = $use_warn_flag; | |
770 | $expect_warnings_for_overflow = 1; | |
771 | $expect_warnings_for_malformed = 1; | |
6aa905cf | 772 | } |
69485e19 KW |
773 | elsif ($warning_type == 1) { |
774 | $eval_warn = "no warnings"; | |
775 | $expect_regular_warnings = 0; | |
776 | $expect_warnings_for_overflow = 0; | |
777 | $expect_warnings_for_malformed = 0; | |
778 | } | |
779 | elsif ($warning_type == 2) { | |
780 | $eval_warn = "no warnings; use warnings 'utf8'"; | |
781 | $expect_regular_warnings = $use_warn_flag; | |
782 | $expect_warnings_for_overflow = 1; | |
783 | $expect_warnings_for_malformed = 1; | |
784 | } | |
785 | elsif ($warning_type == 3) { | |
786 | $eval_warn = "no warnings; use warnings" | |
e4e140b4 KW |
787 | . " '$controlling_warning_category'"; |
788 | $expect_regular_warnings = $use_warn_flag; | |
69485e19 | 789 | $expect_warnings_for_overflow |
e4e140b4 | 790 | = $controlling_warning_category eq 'non_unicode'; |
69485e19 KW |
791 | $expect_warnings_for_malformed = 0; |
792 | } | |
793 | elsif ($warning_type == 4) { # Like type 3, but uses the | |
794 | # above-31-bit flags | |
795 | # The complement flags were set up so that the | |
796 | # above-31-bit flags have been tested that they don't | |
797 | # trigger wrongly for too small code points. And the | |
798 | # flags have been set up so that those small code | |
799 | # points are tested for being above Unicode. What's | |
800 | # left to test is that the large code points do | |
801 | # trigger the above-31-bit flags. | |
802 | next if ! $will_overflow && $allowed_uv < 0x80000000; | |
e4e140b4 | 803 | next if $controlling_warning_category ne 'non_unicode'; |
69485e19 KW |
804 | $eval_warn = "no warnings; use warnings 'non_unicode'"; |
805 | $expect_regular_warnings = 1; | |
806 | $expect_warnings_for_overflow = 1; | |
807 | $expect_warnings_for_malformed = 0; | |
808 | $this_utf8n_flag_to_warn = $::UTF8_WARN_ABOVE_31_BIT; | |
809 | $this_utf8n_flag_to_disallow | |
810 | = $::UTF8_DISALLOW_ABOVE_31_BIT; | |
811 | $this_uvchr_flag_to_warn = $::UNICODE_WARN_ABOVE_31_BIT; | |
812 | $this_uvchr_flag_to_disallow | |
813 | = $::UNICODE_DISALLOW_ABOVE_31_BIT; | |
601e92f1 KW |
814 | } |
815 | else { | |
69485e19 | 816 | die "Unexpected warning type '$warning_type'"; |
601e92f1 KW |
817 | } |
818 | ||
69485e19 KW |
819 | # We only need to test the case where all warnings are |
820 | # enabled (type 0) to see if turning off the warning flag | |
821 | # causes things to not be output. If those pass, then | |
822 | # turning on some sub-category of warnings, or turning off | |
823 | # warnings altogether are extremely likely to not output | |
824 | # warnings either, given how the warnings subsystem is | |
825 | # supposed to work, and this file assumes it does work. | |
826 | next if $warning_type != 0 && ! $use_warn_flag; | |
827 | ||
828 | # The convention is that the 'got' flag is the same value | |
829 | # as the disallow one. If this were violated, the tests | |
830 | # here should start failing. | |
831 | my $return_flag = $this_utf8n_flag_to_disallow; | |
832 | ||
e4e140b4 KW |
833 | # If we aren't expecting warnings/disallow for this, turn |
834 | # on all the other flags. That makes sure that they all | |
835 | # are independent of this flag, and so we don't need to | |
836 | # test them individually. | |
837 | my $this_warning_flags = ($use_warn_flag) | |
838 | ? $this_utf8n_flag_to_warn | |
839 | : $utf8n_flag_to_warn_complement; | |
69485e19 | 840 | my $this_disallow_flags = ($do_disallow) |
e4e140b4 KW |
841 | ? $this_utf8n_flag_to_disallow |
842 | : $utf8n_flag_to_disallow_complement; | |
6aa905cf KW |
843 | my $this_bytes = $bytes; |
844 | my $this_length = $length; | |
845 | my $expected_uv = $allowed_uv; | |
b3169593 | 846 | my $this_expected_len = $length; |
6aa905cf | 847 | my $this_needed_to_discern_len = $needed_to_discern_len; |
6aa905cf | 848 | |
69485e19 KW |
849 | my @malformation_names; |
850 | my @expected_warnings; | |
851 | my @expected_return_flags; | |
6aa905cf | 852 | |
69485e19 KW |
853 | # Now go through the possible malformations wanted, and |
854 | # change the input accordingly. We also can set up | |
855 | # certain other things now, like whether we expect a | |
856 | # return flag from this malformation and which flag. | |
857 | if ($overlong) { | |
6aa905cf KW |
858 | # For an overlong, we convert the original |
859 | # start byte into a continuation byte with | |
860 | # the same data bits as originally. ... | |
861 | substr($this_bytes, 0, 1) | |
862 | = start_byte_to_cont(substr($this_bytes, | |
863 | 0, 1)); | |
864 | ||
865 | # ... Then we prepend it with a known | |
866 | # overlong sequence. This should evaluate | |
867 | # to the exact same code point as the | |
868 | # original. | |
869 | $this_bytes | |
870 | = I8_to_native("\xff") | |
dbb8d798 | 871 | . (I8_to_native(chr $::lowest_continuation) |
6aa905cf KW |
872 | x ( $::max_bytes - 1 - length($this_bytes))) |
873 | . $this_bytes; | |
874 | $this_length = length($this_bytes); | |
875 | $this_needed_to_discern_len | |
876 | = $::max_bytes - ($this_expected_len | |
877 | - $this_needed_to_discern_len); | |
878 | $this_expected_len = $::max_bytes; | |
9cdc3054 | 879 | push @expected_return_flags, $::UTF8_GOT_LONG; |
69485e19 KW |
880 | push @malformation_names, $overlong; |
881 | if ($expect_warnings_for_malformed) { | |
882 | if ( ! $short | |
883 | && ! $unexpected_noncont | |
884 | && ! $will_overflow) | |
885 | { | |
886 | my $overlong_bytes | |
887 | = display_bytes_no_quotes($this_bytes); | |
888 | my $correct_bytes | |
889 | = display_bytes_no_quotes($bytes); | |
890 | push @expected_warnings, | |
891 | qr/\QMalformed UTF-8 character:\E | |
892 | \Q $overlong_bytes (overlong;\E | |
893 | \Q instead use $correct_bytes to\E | |
894 | \Q represent U+$uv_string)/x; | |
895 | } | |
896 | else { | |
897 | push @expected_warnings, qr/overlong/; | |
898 | } | |
6aa905cf | 899 | } |
69485e19 | 900 | } |
6aa905cf | 901 | |
69485e19 KW |
902 | if ($short) { |
903 | push @malformation_names, $short; | |
904 | push @expected_warnings, qr/short/ | |
905 | if $expect_warnings_for_malformed; | |
906 | ||
907 | # To force this malformation, just tell the test to | |
908 | # not look as far as it should into the input. | |
909 | $this_length--; | |
910 | $this_expected_len--; | |
911 | push @expected_return_flags, $::UTF8_GOT_SHORT; | |
912 | } | |
913 | ||
914 | if ($unexpected_noncont) { | |
915 | push @malformation_names, $unexpected_noncont; | |
916 | push @expected_warnings, qr/$unexpected_noncont/ | |
917 | if $expect_warnings_for_malformed; | |
918 | ||
919 | # To force this malformation, change the final | |
920 | # continuation byte into a non continuation. | |
921 | my $pos = ($short) ? -2 : -1; | |
922 | substr($this_bytes, $pos, 1) = '?'; | |
923 | $this_expected_len--; | |
924 | push @expected_return_flags, | |
925 | $::UTF8_GOT_NON_CONTINUATION; | |
926 | } | |
927 | ||
928 | # The overflow malformation is done differently than other | |
929 | # malformations. It comes from manually typed tests in | |
930 | # the test array, but it also is above Unicode and uses | |
931 | # Perl extended UTF-8, so affects some of the flags being | |
932 | # tested. We now make it be treated like one of the other | |
933 | # generated malformations. | |
934 | if ($will_overflow) { | |
935 | ||
936 | # An overflow is (way) above Unicode, and overrides | |
937 | # everything else. | |
938 | $expect_regular_warnings = 0; | |
939 | ||
940 | push @malformation_names, 'overflow'; | |
941 | if ($expect_warnings_for_overflow) { | |
942 | my $qr = display_bytes_no_quotes( | |
943 | substr($this_bytes, 0, $this_expected_len)); | |
944 | $qr = qr/\QMalformed UTF-8 character: \E | |
945 | \Q$qr (overflows)\E/x; | |
946 | push @expected_warnings, $qr; | |
6aa905cf | 947 | } |
69485e19 | 948 | push @expected_return_flags, $::UTF8_GOT_OVERFLOW; |
6aa905cf KW |
949 | } |
950 | ||
69485e19 KW |
951 | # Here, we've set things up based on the malformations. |
952 | # Now generate the text for them for the test name. | |
953 | my $malformations_name = ""; | |
954 | if (@malformation_names) { | |
955 | $malformations_name .= "malformation"; | |
956 | $malformations_name .= "s" if @malformation_names > 1; | |
957 | $malformations_name .= ": "; | |
958 | $malformations_name .= join "/", @malformation_names; | |
959 | $malformations_name = " ($malformations_name)"; | |
960 | } | |
961 | ||
962 | # It may be that the malformations have shortened the | |
963 | # amount of input we look at so much that we can't tell | |
964 | # what the category the code point was in. Otherwise, set | |
965 | # up the expected return flags based on the warnings and | |
966 | # disallowments. | |
967 | if ($this_expected_len < $this_needed_to_discern_len) { | |
968 | $expect_regular_warnings = 0; | |
969 | } | |
970 | elsif ( ($this_warning_flags & $this_utf8n_flag_to_warn) | |
971 | || ( $this_disallow_flags | |
972 | & $this_utf8n_flag_to_disallow)) | |
973 | { | |
974 | push @expected_return_flags, $return_flag; | |
975 | } | |
976 | ||
977 | # Finish setting up the expected warning. | |
978 | if ($expect_regular_warnings) { | |
979 | ||
980 | # So far the array contains warnings generated by | |
981 | # malformations. Add the expected regular one. | |
982 | unshift @expected_warnings, $cp_message_qr; | |
983 | ||
984 | # But it may need to be modified, because either of | |
985 | # these malformations means we can't determine the | |
986 | # expected code point. | |
987 | if ($short || $unexpected_noncont) { | |
988 | my $first_byte = substr($this_bytes, 0, 1); | |
989 | $expected_warnings[0] = display_bytes( | |
990 | substr($this_bytes, 0, $this_expected_len)); | |
991 | $expected_warnings[0] | |
992 | = qr/[Aa]\Qny UTF-8 sequence that starts with\E | |
993 | \Q $expected_warnings[0]\E | |
994 | \Q $non_cp_trailing_text\E/x; | |
995 | } | |
996 | } | |
6aa905cf KW |
997 | |
998 | # Is effectively disallowed if we've set up a | |
999 | # malformation, even if the flag indicates it is | |
1000 | # allowed. Fix up test name to indicate this as | |
1001 | # well | |
69485e19 KW |
1002 | my $disallowed = ( $this_disallow_flags |
1003 | & $this_utf8n_flag_to_disallow) | |
1004 | || $malformations_name; | |
6aa905cf | 1005 | my $this_name = "utf8n_to_uvchr_error() $testname: " |
69485e19 KW |
1006 | . (($disallowed) |
1007 | ? 'disallowed' | |
1008 | : 'allowed'); | |
6aa905cf | 1009 | $this_name .= ", $eval_warn"; |
69485e19 KW |
1010 | $this_name .= ", " . (( $this_warning_flags |
1011 | & $this_utf8n_flag_to_warn) | |
1012 | ? 'with flag for raising warnings' | |
1013 | : 'no flag for raising warnings'); | |
1014 | $this_name .= $malformations_name; | |
db0f09e6 | 1015 | |
9cdc3054 | 1016 | undef @warnings_gotten; |
6aa905cf | 1017 | my $ret_ref; |
69485e19 | 1018 | my $this_flags = $this_warning_flags|$this_disallow_flags; |
6aa905cf KW |
1019 | my $eval_text = "$eval_warn; \$ret_ref" |
1020 | . " = test_utf8n_to_uvchr_error(" | |
69485e19 | 1021 | . "'$this_bytes', $this_length, $this_flags)"; |
6aa905cf | 1022 | eval "$eval_text"; |
69485e19 | 1023 | if (! ok ("$@ eq ''", "$this_name: eval succeeded")) |
6aa905cf | 1024 | { |
a8ee5133 KW |
1025 | diag "\$@='$@'; call was: " |
1026 | . utf8n_display_call($eval_text); | |
6aa905cf KW |
1027 | next; |
1028 | } | |
1029 | if ($disallowed) { | |
d402d77f | 1030 | is($ret_ref->[0], 0, " And returns 0") |
a8ee5133 | 1031 | or diag "Call was: " . utf8n_display_call($eval_text); |
6aa905cf KW |
1032 | } |
1033 | else { | |
1034 | is($ret_ref->[0], $expected_uv, | |
d402d77f KW |
1035 | " And returns expected uv: " |
1036 | . $uv_string) | |
a8ee5133 | 1037 | or diag "Call was: " . utf8n_display_call($eval_text); |
6aa905cf KW |
1038 | } |
1039 | is($ret_ref->[1], $this_expected_len, | |
d402d77f | 1040 | " And returns expected length:" |
6aa905cf | 1041 | . " $this_expected_len") |
a8ee5133 | 1042 | or diag "Call was: " . utf8n_display_call($eval_text); |
6aa905cf | 1043 | |
9cdc3054 | 1044 | my $returned_flags = $ret_ref->[2]; |
6aa905cf | 1045 | |
9cdc3054 | 1046 | for (my $i = @expected_return_flags - 1; $i >= 0; $i--) { |
69485e19 KW |
1047 | if ($expected_return_flags[$i] & $returned_flags) { |
1048 | if ($expected_return_flags[$i] | |
1049 | == $::UTF8_DISALLOW_ABOVE_31_BIT) | |
1050 | { | |
1051 | pass(" Expected and got return flag for" | |
1052 | . " above_31_bit"); | |
1053 | } | |
1054 | # The first entries in this are | |
1055 | # malformations | |
1056 | elsif ($i > @malformation_names - 1) { | |
1057 | pass(" Expected and got return flag" | |
e4e140b4 | 1058 | . " for " . $controlling_warning_category); |
69485e19 KW |
1059 | } |
1060 | else { | |
1061 | pass(" Expected and got return flag for " | |
1062 | . $malformation_names[$i] | |
1063 | . " malformation"); | |
1064 | } | |
9cdc3054 | 1065 | $returned_flags &= ~$expected_return_flags[$i]; |
69485e19 | 1066 | splice @expected_return_flags, $i, 1; |
6aa905cf | 1067 | } |
6aa905cf | 1068 | } |
6aa905cf | 1069 | |
69485e19 KW |
1070 | is($returned_flags, 0, |
1071 | " Got no unexpected return flags") | |
1072 | or diag "The unexpected flags gotten were: " | |
5722c46d KW |
1073 | . (flags_to_text($returned_flags, |
1074 | \@utf8n_flags_to_text) | |
69485e19 KW |
1075 | # We strip off any prefixes from the flag |
1076 | # names | |
1077 | =~ s/ \b [A-Z] _ //xgr); | |
1078 | is (scalar @expected_return_flags, 0, | |
1079 | " Got all expected return flags") | |
1080 | or diag "The expected flags not gotten were: " | |
1081 | . (flags_to_text(eval join("|", | |
1082 | @expected_return_flags), | |
1083 | \@utf8n_flags_to_text) | |
1084 | # We strip off any prefixes from the flag | |
1085 | # names | |
5722c46d | 1086 | =~ s/ \b [A-Z] _ //xgr); |
9cdc3054 | 1087 | |
69485e19 KW |
1088 | do_warnings_test(@expected_warnings) |
1089 | or diag "Call was: " . utf8n_display_call($eval_text); | |
1090 | undef @warnings_gotten; | |
6aa905cf KW |
1091 | |
1092 | # Check CHECK_ONLY results when the input is | |
1093 | # disallowed. Do this when actually disallowed, | |
69485e19 | 1094 | # not just when the $this_disallow_flags is set |
6aa905cf | 1095 | if ($disallowed) { |
69485e19 KW |
1096 | my $this_flags = $this_disallow_flags|$::UTF8_CHECK_ONLY; |
1097 | my $eval_text = "use warnings; no warnings 'deprecated'; \$ret_ref =" | |
1098 | . " test_utf8n_to_uvchr_error('" | |
1099 | . "$this_bytes', $this_length," | |
1100 | . " $this_flags)"; | |
1101 | eval $eval_text; | |
a8ee5133 KW |
1102 | if (! ok ("$@ eq ''", |
1103 | " And eval succeeded with CHECK_ONLY")) | |
1104 | { | |
1105 | diag "\$@='$@'; Call was: " | |
1106 | . utf8n_display_call($eval_text); | |
1107 | next; | |
1108 | } | |
d402d77f | 1109 | is($ret_ref->[0], 0, " CHECK_ONLY: Returns 0") |
a8ee5133 | 1110 | or diag "Call was: " . utf8n_display_call($eval_text); |
6aa905cf | 1111 | is($ret_ref->[1], -1, |
d402d77f | 1112 | " CHECK_ONLY: returns -1 for length") |
a8ee5133 | 1113 | or diag "Call was: " . utf8n_display_call($eval_text); |
9cdc3054 | 1114 | if (! is(scalar @warnings_gotten, 0, |
d402d77f | 1115 | " CHECK_ONLY: no warnings generated")) |
6aa905cf | 1116 | { |
a8ee5133 | 1117 | diag "Call was: " . utf8n_display_call($eval_text); |
9cdc3054 | 1118 | output_warnings(@warnings_gotten); |
6aa905cf KW |
1119 | } |
1120 | } | |
1121 | ||
1122 | # Now repeat some of the above, but for | |
1123 | # uvchr_to_utf8_flags(). Since this comes from an | |
1124 | # existing code point, it hasn't overflowed, and | |
1125 | # isn't malformed. | |
69485e19 KW |
1126 | next if @malformation_names; |
1127 | ||
1128 | $this_warning_flags = ($use_warn_flag) | |
1129 | ? $this_uvchr_flag_to_warn | |
1130 | : 0; | |
1131 | $this_disallow_flags = ($do_disallow) | |
1132 | ? $this_uvchr_flag_to_disallow | |
1133 | : 0; | |
1134 | ||
1135 | $disallowed = $this_disallow_flags | |
1136 | & $this_uvchr_flag_to_disallow; | |
1137 | $this_name .= ", " . (( $this_warning_flags | |
1138 | & $this_utf8n_flag_to_warn) | |
1139 | ? 'with flag for raising warnings' | |
1140 | : 'no flag for raising warnings'); | |
6aa905cf KW |
1141 | |
1142 | $this_name = "uvchr_to_utf8_flags() $testname: " | |
69485e19 | 1143 | . (($disallowed) |
6aa905cf | 1144 | ? 'disallowed' |
6aa905cf KW |
1145 | : 'allowed'); |
1146 | $this_name .= ", $eval_warn"; | |
69485e19 KW |
1147 | $this_name .= ", " . (( $this_warning_flags |
1148 | & $this_uvchr_flag_to_warn) | |
6aa905cf KW |
1149 | ? 'with warning flag' |
1150 | : 'no warning flag'); | |
1151 | ||
9cdc3054 | 1152 | undef @warnings_gotten; |
6aa905cf | 1153 | my $ret; |
69485e19 | 1154 | $this_flags = $this_warning_flags|$this_disallow_flags; |
6aa905cf KW |
1155 | $eval_text = "$eval_warn; \$ret =" |
1156 | . " test_uvchr_to_utf8_flags(" | |
d884ea32 | 1157 | . "$allowed_uv, $this_flags)"; |
6aa905cf KW |
1158 | eval "$eval_text"; |
1159 | if (! ok ("$@ eq ''", "$this_name: eval succeeded")) | |
1160 | { | |
d884ea32 KW |
1161 | diag "\$@='$@'; call was: " |
1162 | . uvchr_display_call($eval_text); | |
6aa905cf KW |
1163 | next; |
1164 | } | |
1165 | if ($disallowed) { | |
d402d77f | 1166 | is($ret, undef, " And returns undef") |
d884ea32 | 1167 | or diag "Call was: " . uvchr_display_call($eval_text); |
6aa905cf KW |
1168 | } |
1169 | else { | |
d402d77f | 1170 | is($ret, $this_bytes, " And returns expected string") |
d884ea32 | 1171 | or diag "Call was: " . uvchr_display_call($eval_text); |
6aa905cf | 1172 | } |
69485e19 KW |
1173 | |
1174 | do_warnings_test(@expected_warnings) | |
1175 | or diag "Call was: " . uvchr_display_call($eval_text); | |
6aa905cf KW |
1176 | } |
1177 | } | |
1178 | } | |
1179 | } | |
1180 | } | |
6aa905cf KW |
1181 | } |
1182 | } | |
1183 | ||
1184 | done_testing; |