Commit | Line | Data |
---|---|---|
6aa905cf KW |
1 | #!perl -w |
2 | ||
3 | # This is a base file to be used by various .t's in its directory | |
8f79178b KW |
4 | # It tests various code points that are "problematic", and verifies that the |
5 | # correct warnings/flags etc are generated when using them. It also takes the | |
6 | # UTF-8 for some of them and perturbs it to be malformed in various ways, and | |
7 | # tests that this gets appropriately detected. | |
6aa905cf KW |
8 | |
9 | use strict; | |
10 | use Test::More; | |
11 | ||
12 | BEGIN { | |
13 | use_ok('XS::APItest'); | |
14 | require 'charset_tools.pl'; | |
15 | require './t/utf8_setup.pl'; | |
16 | }; | |
17 | ||
18 | $|=1; | |
19 | ||
20 | no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit | |
21 | # machines, and that is tested elsewhere | |
22 | ||
23 | use XS::APItest; | |
24 | use Data::Dumper; | |
25 | ||
9cdc3054 | 26 | my @warnings_gotten; |
6aa905cf KW |
27 | |
28 | use warnings 'utf8'; | |
8d6f1506 KW |
29 | local $SIG{__WARN__} = sub { my @copy = @_; |
30 | push @warnings_gotten, map { chomp; $_ } @copy; | |
31 | }; | |
c7f9e4bb KW |
32 | my @tests; |
33 | { | |
34 | no warnings qw(portable overflow); | |
35 | @tests = ( | |
8f79178b KW |
36 | # $testname, |
37 | # $bytes, UTF-8 string | |
38 | # $allowed_uv, code point $bytes evaluates to; -1 if | |
39 | # overflows | |
40 | # $needed_to_discern_len optional, how long an initial substring do | |
41 | # we need to tell that the string must be for | |
42 | # a code point in the category it falls in, | |
43 | # like being a surrogate; 0 indicates we need | |
44 | # the whole string. Some categories have a | |
45 | # default that is used if this is omitted. | |
46 | [ "lowest surrogate", | |
47 | (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"), | |
48 | 0xD800, | |
49 | ], | |
50 | [ "a middle surrogate", | |
51 | (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"), | |
52 | 0xD90D, | |
53 | ], | |
54 | [ "highest surrogate", | |
55 | (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"), | |
56 | 0xDFFF, | |
57 | ], | |
8f79178b KW |
58 | [ "first of 32 consecutive non-character code points", |
59 | (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"), | |
60 | 0xFDD0, | |
61 | ], | |
62 | [ "a mid non-character code point of the 32 consecutive ones", | |
63 | (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"), | |
64 | 0xFDE0, | |
65 | ], | |
66 | [ "final of 32 consecutive non-character code points", | |
67 | (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"), | |
68 | 0xFDEF, | |
69 | ], | |
70 | [ "non-character code point U+FFFE", | |
71 | (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"), | |
72 | 0xFFFE, | |
73 | ], | |
74 | [ "non-character code point U+FFFF", | |
75 | (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"), | |
76 | 0xFFFF, | |
77 | ], | |
78 | [ "non-character code point U+1FFFE", | |
79 | (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"), | |
80 | 0x1FFFE, | |
81 | ], | |
82 | [ "non-character code point U+1FFFF", | |
83 | (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"), | |
84 | 0x1FFFF, | |
85 | ], | |
86 | [ "non-character code point U+2FFFE", | |
87 | (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"), | |
88 | 0x2FFFE, | |
89 | ], | |
90 | [ "non-character code point U+2FFFF", | |
91 | (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"), | |
92 | 0x2FFFF, | |
93 | ], | |
94 | [ "non-character code point U+3FFFE", | |
95 | (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"), | |
96 | 0x3FFFE, | |
97 | ], | |
98 | [ "non-character code point U+3FFFF", | |
99 | (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"), | |
100 | 0x3FFFF, | |
101 | ], | |
102 | [ "non-character code point U+4FFFE", | |
103 | (isASCII) | |
104 | ? "\xf1\x8f\xbf\xbe" | |
105 | : I8_to_native("\xf8\xa9\xbf\xbf\xbe"), | |
106 | 0x4FFFE, | |
107 | ], | |
108 | [ "non-character code point U+4FFFF", | |
109 | (isASCII) | |
110 | ? "\xf1\x8f\xbf\xbf" | |
111 | : I8_to_native("\xf8\xa9\xbf\xbf\xbf"), | |
112 | 0x4FFFF, | |
113 | ], | |
114 | [ "non-character code point U+5FFFE", | |
115 | (isASCII) | |
116 | ? "\xf1\x9f\xbf\xbe" | |
117 | : I8_to_native("\xf8\xab\xbf\xbf\xbe"), | |
118 | 0x5FFFE, | |
119 | ], | |
120 | [ "non-character code point U+5FFFF", | |
121 | (isASCII) | |
122 | ? "\xf1\x9f\xbf\xbf" | |
123 | : I8_to_native("\xf8\xab\xbf\xbf\xbf"), | |
124 | 0x5FFFF, | |
125 | ], | |
126 | [ "non-character code point U+6FFFE", | |
127 | (isASCII) | |
128 | ? "\xf1\xaf\xbf\xbe" | |
129 | : I8_to_native("\xf8\xad\xbf\xbf\xbe"), | |
130 | 0x6FFFE, | |
131 | ], | |
132 | [ "non-character code point U+6FFFF", | |
133 | (isASCII) | |
134 | ? "\xf1\xaf\xbf\xbf" | |
135 | : I8_to_native("\xf8\xad\xbf\xbf\xbf"), | |
136 | 0x6FFFF, | |
137 | ], | |
138 | [ "non-character code point U+7FFFE", | |
139 | (isASCII) | |
140 | ? "\xf1\xbf\xbf\xbe" | |
141 | : I8_to_native("\xf8\xaf\xbf\xbf\xbe"), | |
142 | 0x7FFFE, | |
143 | ], | |
144 | [ "non-character code point U+7FFFF", | |
145 | (isASCII) | |
146 | ? "\xf1\xbf\xbf\xbf" | |
147 | : I8_to_native("\xf8\xaf\xbf\xbf\xbf"), | |
148 | 0x7FFFF, | |
149 | ], | |
150 | [ "non-character code point U+8FFFE", | |
151 | (isASCII) | |
152 | ? "\xf2\x8f\xbf\xbe" | |
153 | : I8_to_native("\xf8\xb1\xbf\xbf\xbe"), | |
154 | 0x8FFFE, | |
155 | ], | |
156 | [ "non-character code point U+8FFFF", | |
157 | (isASCII) | |
158 | ? "\xf2\x8f\xbf\xbf" | |
159 | : I8_to_native("\xf8\xb1\xbf\xbf\xbf"), | |
160 | 0x8FFFF, | |
161 | ], | |
162 | [ "non-character code point U+9FFFE", | |
163 | (isASCII) | |
164 | ? "\xf2\x9f\xbf\xbe" | |
165 | : I8_to_native("\xf8\xb3\xbf\xbf\xbe"), | |
166 | 0x9FFFE, | |
167 | ], | |
168 | [ "non-character code point U+9FFFF", | |
169 | (isASCII) | |
170 | ? "\xf2\x9f\xbf\xbf" | |
171 | : I8_to_native("\xf8\xb3\xbf\xbf\xbf"), | |
172 | 0x9FFFF, | |
173 | ], | |
174 | [ "non-character code point U+AFFFE", | |
175 | (isASCII) | |
176 | ? "\xf2\xaf\xbf\xbe" | |
177 | : I8_to_native("\xf8\xb5\xbf\xbf\xbe"), | |
178 | 0xAFFFE, | |
179 | ], | |
180 | [ "non-character code point U+AFFFF", | |
181 | (isASCII) | |
182 | ? "\xf2\xaf\xbf\xbf" | |
183 | : I8_to_native("\xf8\xb5\xbf\xbf\xbf"), | |
184 | 0xAFFFF, | |
185 | ], | |
186 | [ "non-character code point U+BFFFE", | |
187 | (isASCII) | |
188 | ? "\xf2\xbf\xbf\xbe" | |
189 | : I8_to_native("\xf8\xb7\xbf\xbf\xbe"), | |
190 | 0xBFFFE, | |
191 | ], | |
192 | [ "non-character code point U+BFFFF", | |
193 | (isASCII) | |
194 | ? "\xf2\xbf\xbf\xbf" | |
195 | : I8_to_native("\xf8\xb7\xbf\xbf\xbf"), | |
196 | 0xBFFFF, | |
197 | ], | |
198 | [ "non-character code point U+CFFFE", | |
199 | (isASCII) | |
200 | ? "\xf3\x8f\xbf\xbe" | |
201 | : I8_to_native("\xf8\xb9\xbf\xbf\xbe"), | |
202 | 0xCFFFE, | |
203 | ], | |
204 | [ "non-character code point U+CFFFF", | |
205 | (isASCII) | |
206 | ? "\xf3\x8f\xbf\xbf" | |
207 | : I8_to_native("\xf8\xb9\xbf\xbf\xbf"), | |
208 | 0xCFFFF, | |
209 | ], | |
210 | [ "non-character code point U+DFFFE", | |
211 | (isASCII) | |
212 | ? "\xf3\x9f\xbf\xbe" | |
213 | : I8_to_native("\xf8\xbb\xbf\xbf\xbe"), | |
214 | 0xDFFFE, | |
215 | ], | |
216 | [ "non-character code point U+DFFFF", | |
217 | (isASCII) | |
218 | ? "\xf3\x9f\xbf\xbf" | |
219 | : I8_to_native("\xf8\xbb\xbf\xbf\xbf"), | |
220 | 0xDFFFF, | |
221 | ], | |
222 | [ "non-character code point U+EFFFE", | |
223 | (isASCII) | |
224 | ? "\xf3\xaf\xbf\xbe" | |
225 | : I8_to_native("\xf8\xbd\xbf\xbf\xbe"), | |
226 | 0xEFFFE, | |
227 | ], | |
228 | [ "non-character code point U+EFFFF", | |
229 | (isASCII) | |
230 | ? "\xf3\xaf\xbf\xbf" | |
231 | : I8_to_native("\xf8\xbd\xbf\xbf\xbf"), | |
232 | 0xEFFFF, | |
233 | ], | |
234 | [ "non-character code point U+FFFFE", | |
235 | (isASCII) | |
236 | ? "\xf3\xbf\xbf\xbe" | |
237 | : I8_to_native("\xf8\xbf\xbf\xbf\xbe"), | |
238 | 0xFFFFE, | |
239 | ], | |
240 | [ "non-character code point U+FFFFF", | |
241 | (isASCII) | |
242 | ? "\xf3\xbf\xbf\xbf" | |
243 | : I8_to_native("\xf8\xbf\xbf\xbf\xbf"), | |
244 | 0xFFFFF, | |
245 | ], | |
246 | [ "non-character code point U+10FFFE", | |
247 | (isASCII) | |
248 | ? "\xf4\x8f\xbf\xbe" | |
249 | : I8_to_native("\xf9\xa1\xbf\xbf\xbe"), | |
250 | 0x10FFFE, | |
251 | ], | |
252 | [ "non-character code point U+10FFFF", | |
253 | (isASCII) | |
254 | ? "\xf4\x8f\xbf\xbf" | |
255 | : I8_to_native("\xf9\xa1\xbf\xbf\xbf"), | |
256 | 0x10FFFF, | |
257 | ], | |
7c9f4ec6 KW |
258 | [ "first non_unicode", |
259 | (isASCII) | |
260 | ? "\xf4\x90\x80\x80" | |
261 | : I8_to_native("\xf9\xa2\xa0\xa0\xa0"), | |
262 | 0x110000, | |
263 | 2, | |
264 | ], | |
265 | [ "non_unicode whose first byte tells that", | |
266 | (isASCII) | |
267 | ? "\xf5\x80\x80\x80" | |
268 | : I8_to_native("\xfa\xa0\xa0\xa0\xa0"), | |
269 | (isASCII) ? 0x140000 : 0x200000, | |
270 | 1, | |
271 | ], | |
48ceb60a | 272 | [ "lowest 32 bit code point", |
8f79178b KW |
273 | (isASCII) |
274 | ? "\xfe\x82\x80\x80\x80\x80\x80" | |
275 | : I8_to_native( | |
276 | "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"), | |
8f79178b KW |
277 | 0x80000000, |
278 | (isASCII) ? 1 : 8, | |
279 | ], | |
280 | [ "highest 32 bit code point", | |
281 | (isASCII) | |
282 | ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf" | |
283 | : I8_to_native( | |
284 | "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"), | |
285 | 0xFFFFFFFF, | |
286 | (isASCII) ? 1 : 8, | |
287 | ], | |
f4da64d4 KW |
288 | [ "Lowest 33 bit code point", |
289 | (isASCII) | |
290 | ? "\xfe\x84\x80\x80\x80\x80\x80" | |
291 | : I8_to_native( | |
292 | "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"), | |
293 | ($::is64bit) ? 0x100000000 : -1, # Overflows on 32-bit systems | |
294 | (isASCII && ! $::is64bit) ? 2 : 1, | |
295 | ], | |
8f79178b KW |
296 | ); |
297 | ||
298 | if (! $::is64bit) { | |
299 | if (isASCII) { | |
8f79178b | 300 | push @tests, |
c81d4d83 KW |
301 | [ "overflow that old algorithm failed to detect", |
302 | "\xfe\x86\x80\x80\x80\x80\x80", | |
303 | -1, | |
304 | 2, | |
8f79178b KW |
305 | ]; |
306 | } | |
307 | } | |
c81d4d83 KW |
308 | |
309 | if ($::is64bit) { | |
ba627a0b KW |
310 | push @tests, |
311 | [ "highest 64 bit code point", | |
312 | (isASCII) | |
313 | ? "\xff\x80\x8f\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf" | |
314 | : I8_to_native( | |
315 | "\xff\xaf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf"), | |
316 | 0xFFFFFFFFFFFFFFFF, | |
317 | (isASCII) ? 1 : 2, | |
318 | ], | |
319 | [ "first 65 bit code point", | |
320 | (isASCII) | |
321 | ? "\xff\x80\x9f\x80\x80\x80\x80\x80\x80\x80\x80\x80\x80" | |
322 | : I8_to_native( | |
323 | "\xff\xb0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), | |
324 | -1, | |
325 | (isASCII) ? 3 : 2, | |
326 | ]; | |
c81d4d83 KW |
327 | if (isASCII) { |
328 | push @tests, | |
f6104ed7 KW |
329 | [ "Lowest code point requiring 13 bytes to represent", |
330 | "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80", | |
331 | 0x1000000000, | |
332 | 1, | |
333 | ], | |
c81d4d83 KW |
334 | [ "overflow that old algorithm failed to detect", |
335 | "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf", | |
336 | -1, | |
337 | 3, | |
338 | ]; | |
339 | } | |
340 | else { | |
8f79178b KW |
341 | push @tests, # These could falsely show wrongly in a naive |
342 | # implementation | |
343 | [ "requires at least 32 bits", | |
344 | I8_to_native( | |
6aa905cf | 345 | "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), |
8f79178b KW |
346 | 0x800000000, |
347 | 7, | |
348 | ], | |
349 | [ "requires at least 32 bits", | |
350 | I8_to_native( | |
6aa905cf | 351 | "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), |
8f79178b KW |
352 | 0x10000000000, |
353 | 6, | |
354 | ], | |
355 | [ "requires at least 32 bits", | |
356 | I8_to_native( | |
6aa905cf | 357 | "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), |
8f79178b KW |
358 | 0x200000000000, |
359 | 5, | |
360 | ], | |
361 | [ "requires at least 32 bits", | |
362 | I8_to_native( | |
6aa905cf | 363 | "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), |
8f79178b KW |
364 | 0x4000000000000, |
365 | 4, | |
366 | ], | |
367 | [ "requires at least 32 bits", | |
368 | I8_to_native( | |
6aa905cf | 369 | "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), |
8f79178b KW |
370 | 0x80000000000000, |
371 | 3, | |
372 | ], | |
373 | [ "requires at least 32 bits", | |
374 | I8_to_native( | |
6aa905cf | 375 | "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), |
8f79178b KW |
376 | 0x1000000000000000, |
377 | 2, | |
378 | ]; | |
379 | } | |
6aa905cf | 380 | } |
c7f9e4bb | 381 | } |
6aa905cf | 382 | |
5722c46d KW |
383 | sub flags_to_text($$) |
384 | { | |
385 | my ($flags, $flags_to_text_ref) = @_; | |
386 | ||
387 | # Returns a string containing a mnemonic representation of the bits that | |
388 | # are set in the $flags. These are assumed to be flag bits. The return | |
389 | # looks like "FOO|BAR|BAZ". The second parameter is a reference to an | |
390 | # array that gives the textual representation of all the possible flags. | |
391 | # Element 0 is the text for the bit 0 flag; element 1 for bit 1; .... If | |
392 | # no bits at all are set the string "0" is returned; | |
393 | ||
394 | my @flag_text; | |
395 | my $shift = 0; | |
396 | ||
397 | return "0" if $flags == 0; | |
398 | ||
399 | while ($flags) { | |
400 | #diag sprintf "%x", $flags; | |
401 | if ($flags & 1) { | |
402 | push @flag_text, $flags_to_text_ref->[$shift]; | |
403 | } | |
404 | $shift++; | |
405 | $flags >>= 1; | |
406 | } | |
407 | ||
408 | return join "|", @flag_text; | |
409 | } | |
410 | ||
411 | # Possible flag returns from utf8n_to_uvchr_error(). These should have G_, | |
412 | # instead of A_, D_, but the prefixes will be used in a a later commit, so | |
413 | # minimize churn by having them here. | |
414 | my @utf8n_flags_to_text = ( qw( | |
415 | A_EMPTY | |
416 | A_CONTINUATION | |
417 | A_NON_CONTINUATION | |
418 | A_SHORT | |
419 | A_LONG | |
420 | A_LONG_AND_ITS_VALUE | |
421 | PLACEHOLDER | |
422 | A_OVERFLOW | |
423 | D_SURROGATE | |
424 | W_SURROGATE | |
425 | D_NONCHAR | |
426 | W_NONCHAR | |
427 | D_SUPER | |
428 | W_SUPER | |
429 | D_ABOVE_31_BIT | |
430 | W_ABOVE_31_BIT | |
431 | CHECK_ONLY | |
432 | NO_CONFIDENCE_IN_CURLEN_ | |
433 | ) ); | |
434 | ||
a8ee5133 KW |
435 | sub utf8n_display_call($) |
436 | { | |
437 | # Converts an eval string that calls test_utf8n_to_uvchr into a more human | |
438 | # readable form, and returns it. Doesn't work if the byte string contains | |
439 | # an apostrophe. The return will look something like: | |
440 | # test_utf8n_to_uvchr_error('$bytes', $length, $flags) | |
441 | #diag $_[0]; | |
442 | ||
443 | $_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x; | |
444 | my $text1 = $1; # Everything before the byte string | |
445 | my $bytes = $2; | |
446 | my $text2 = $3; # Includes the length | |
447 | my $flags = $4; | |
448 | ||
449 | return $text1 | |
450 | . display_bytes($bytes) | |
451 | . $text2 | |
452 | . flags_to_text($flags, \@utf8n_flags_to_text) | |
453 | . ')'; | |
454 | } | |
455 | ||
d884ea32 KW |
456 | sub uvchr_display_call($) |
457 | { | |
458 | # Converts an eval string that calls test_uvchr_to_utf8 into a more human | |
459 | # readable form, and returns it. The return will look something like: | |
460 | # test_uvchr_to_utf8n_flags($uv, $flags) | |
461 | #diag $_[0]; | |
462 | ||
463 | my @flags_to_text = ( qw( | |
464 | W_SURROGATE | |
465 | W_NONCHAR | |
466 | W_SUPER | |
467 | W_ABOVE_31_BIT | |
468 | D_SURROGATE | |
469 | D_NONCHAR | |
470 | D_SUPER | |
471 | D_ABOVE_31_BIT | |
472 | ) ); | |
473 | ||
474 | $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x; | |
475 | my $text = $1; | |
476 | my $cp = sprintf "%X", $2; | |
477 | my $flags = $3; | |
478 | ||
479 | return "${text}0x$cp, " . flags_to_text($flags, \@flags_to_text) . ')'; | |
480 | } | |
481 | ||
69485e19 KW |
482 | sub do_warnings_test(@) |
483 | { | |
484 | my @expected_warnings = @_; | |
485 | ||
486 | # Compares the input expected warnings array with @warnings_gotten, | |
487 | # generating a pass for each found, removing it from @warnings_gotten. | |
488 | # Any discrepancies generate test failures. Returns TRUE if no | |
489 | # discrepcancies; otherwise FALSE. | |
490 | ||
491 | my $succeeded = 1; | |
492 | ||
493 | if (@expected_warnings == 0) { | |
494 | if (! is(@warnings_gotten, 0, " Expected and got no warnings")) { | |
495 | output_warnings(@warnings_gotten); | |
496 | $succeeded = 0; | |
497 | } | |
498 | return $succeeded; | |
499 | } | |
500 | ||
501 | # Check that we got all the expected warnings, | |
502 | # removing each one found | |
503 | WARNING: | |
504 | foreach my $expected (@expected_warnings) { | |
505 | foreach (my $i = 0; $i < @warnings_gotten; $i++) { | |
506 | if ($warnings_gotten[$i] =~ $expected) { | |
507 | pass(" Expected and got warning: " | |
508 | . " $warnings_gotten[$i]"); | |
509 | splice @warnings_gotten, $i, 1; | |
510 | next WARNING; | |
511 | } | |
512 | } | |
513 | fail(" Expected a warning that matches " | |
514 | . $expected . " but didn't get it"); | |
515 | $succeeded = 0; | |
516 | } | |
517 | ||
518 | if (! is(@warnings_gotten, 0, " Got no unexpected warnings")) { | |
519 | output_warnings(@warnings_gotten); | |
520 | $succeeded = 0; | |
521 | } | |
522 | ||
523 | return $succeeded; | |
524 | } | |
525 | ||
6aa905cf KW |
526 | # This test is split into this number of files. |
527 | my $num_test_files = $ENV{TEST_JOBS} || 1; | |
528 | $num_test_files = 10 if $num_test_files > 10; | |
529 | ||
530 | my $test_count = -1; | |
531 | foreach my $test (@tests) { | |
532 | $test_count++; | |
533 | next if $test_count % $num_test_files != $::TEST_CHUNK; | |
534 | ||
d42d7565 | 535 | my ($testname, $bytes, $allowed_uv, $needed_to_discern_len) = @$test; |
6aa905cf KW |
536 | |
537 | my $length = length $bytes; | |
04f42bf6 | 538 | my $will_overflow = $allowed_uv < 0; |
6aa905cf | 539 | |
2c511c58 | 540 | my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv); |
c0e5ee4b | 541 | my $display_bytes = display_bytes($bytes); |
2c511c58 | 542 | |
d42d7565 | 543 | my $controlling_warning_category; |
af816908 KW |
544 | my $utf8n_flag_to_warn; |
545 | my $utf8n_flag_to_disallow; | |
546 | my $uvchr_flag_to_warn; | |
547 | my $uvchr_flag_to_disallow; | |
b7e1f4b2 | 548 | |
e4e140b4 KW |
549 | # We want to test that the independent flags are actually independent. |
550 | # For example, that a surrogate doesn't trigger a non-character warning, | |
551 | # and conversely, turning off an above-Unicode flag doesn't suppress a | |
552 | # surrogate warning. Earlier versions of this file used nested loops to | |
553 | # test all possible combinations. But that creates lots of tests, making | |
554 | # this run too long. What is now done instead is to use the complement of | |
555 | # the category we are testing to greatly reduce the combinatorial | |
556 | # explosion. For example, if we have a surrogate and we aren't expecting | |
557 | # a warning about it, we set all the flags for non-surrogates to raise | |
558 | # warnings. If one shows up, it indicates the flags aren't independent. | |
559 | my $utf8n_flag_to_warn_complement; | |
560 | my $utf8n_flag_to_disallow_complement; | |
561 | my $uvchr_flag_to_warn_complement; | |
562 | my $uvchr_flag_to_disallow_complement; | |
563 | ||
bf422d6a KW |
564 | # Many of the code points being tested are middling in that if code point |
565 | # edge cases work, these are very likely to as well. Because this test | |
566 | # file takes a while to execute, we skip testing the edge effects of code | |
567 | # points deemed middling, while testing their basics and continuing to | |
568 | # fully test the non-middling code points. | |
569 | my $skip_most_tests = 0; | |
570 | ||
601e92f1 KW |
571 | my $cp_message_qr; # Pattern that matches the message raised when |
572 | # that message contains the problematic code | |
573 | # point. The message is the same (currently) both | |
574 | # when going from/to utf8. | |
575 | my $non_cp_trailing_text; # The suffix text when the message doesn't | |
576 | # contain a code point. (This is a result of | |
577 | # some sort of malformation that means we | |
578 | # can't get an exact code poin | |
579 | ||
c0e5ee4b KW |
580 | # Is this test malformed from the beginning? If so, we know to generally |
581 | # expect that the tests will show it isn't valid. | |
582 | my $initially_malformed = 0; | |
583 | ||
af816908 KW |
584 | if ($will_overflow || $allowed_uv > 0x10FFFF) { |
585 | ||
e4e140b4 | 586 | # Set the SUPER flags; later, we test for ABOVE_31_BIT as well. |
af816908 KW |
587 | $utf8n_flag_to_warn = $::UTF8_WARN_SUPER; |
588 | $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER; | |
589 | $uvchr_flag_to_warn = $::UNICODE_WARN_SUPER; | |
590 | $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;; | |
591 | ||
e4e140b4 KW |
592 | # Below, we add the flags for non-above-31 bit to the code points that |
593 | # don't fit that category. Special tests are done for this category | |
594 | # in the inner loop. | |
595 | $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR | |
596 | |$::UTF8_WARN_SURROGATE; | |
597 | $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR | |
598 | |$::UTF8_DISALLOW_SURROGATE; | |
599 | $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR | |
600 | |$::UNICODE_WARN_SURROGATE; | |
601 | $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR | |
602 | |$::UNICODE_DISALLOW_SURROGATE; | |
603 | $controlling_warning_category = 'non_unicode'; | |
604 | ||
8f79178b | 605 | if ($will_overflow) { # This is realy a malformation |
69485e19 | 606 | $non_cp_trailing_text = "if you see this, there is an error"; |
601e92f1 | 607 | $cp_message_qr = qr/\Q$non_cp_trailing_text\E/; |
c0e5ee4b | 608 | $initially_malformed = 1; |
af816908 KW |
609 | } |
610 | elsif ($allowed_uv > 0x7FFFFFFF) { | |
601e92f1 KW |
611 | $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E |
612 | \Q and not portable\E/x; | |
613 | $non_cp_trailing_text = "is for a non-Unicode code point, and is not portable"; | |
af816908 | 614 | } |
601e92f1 KW |
615 | else { |
616 | $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E | |
617 | \Q may not be portable\E/x; | |
618 | $non_cp_trailing_text = "is for a non-Unicode code point, may not" | |
619 | . " be portable"; | |
e4e140b4 KW |
620 | $utf8n_flag_to_warn_complement |= $::UTF8_WARN_ABOVE_31_BIT; |
621 | $utf8n_flag_to_disallow_complement |= $::UTF8_DISALLOW_ABOVE_31_BIT; | |
622 | $uvchr_flag_to_warn_complement |= $::UNICODE_WARN_ABOVE_31_BIT; | |
623 | $uvchr_flag_to_disallow_complement | |
624 | |= $::UNICODE_DISALLOW_ABOVE_31_BIT; | |
af816908 | 625 | } |
3022ad00 KW |
626 | } |
627 | elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) { | |
601e92f1 KW |
628 | $cp_message_qr = qr/UTF-16 surrogate U\+$uv_string/; |
629 | $non_cp_trailing_text = "is for a surrogate"; | |
67e45424 | 630 | $needed_to_discern_len = 2 unless defined $needed_to_discern_len; |
bf422d6a | 631 | $skip_most_tests = 1 if $allowed_uv > 0xD800 && $allowed_uv < 0xDFFF; |
af816908 KW |
632 | |
633 | $utf8n_flag_to_warn = $::UTF8_WARN_SURROGATE; | |
634 | $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SURROGATE; | |
635 | $uvchr_flag_to_warn = $::UNICODE_WARN_SURROGATE; | |
636 | $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SURROGATE;; | |
e4e140b4 KW |
637 | |
638 | $utf8n_flag_to_warn_complement = $::UTF8_WARN_NONCHAR | |
639 | |$::UTF8_WARN_SUPER | |
640 | |$::UTF8_WARN_ABOVE_31_BIT; | |
641 | $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_NONCHAR | |
642 | |$::UTF8_DISALLOW_SUPER | |
643 | |$::UTF8_DISALLOW_ABOVE_31_BIT; | |
644 | $uvchr_flag_to_warn_complement = $::UNICODE_WARN_NONCHAR | |
645 | |$::UNICODE_WARN_SUPER | |
646 | |$::UNICODE_WARN_ABOVE_31_BIT; | |
647 | $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_NONCHAR | |
648 | |$::UNICODE_DISALLOW_SUPER | |
649 | |$::UNICODE_DISALLOW_ABOVE_31_BIT; | |
d42d7565 | 650 | $controlling_warning_category = 'surrogate'; |
3022ad00 KW |
651 | } |
652 | elsif ( ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF) | |
653 | || ($allowed_uv & 0xFFFE) == 0xFFFE) | |
654 | { | |
601e92f1 KW |
655 | $cp_message_qr = qr/\QUnicode non-character U+$uv_string\E |
656 | \Q is not recommended for open interchange\E/x; | |
657 | $non_cp_trailing_text = "if you see this, there is an error"; | |
67e45424 | 658 | $needed_to_discern_len = $length unless defined $needed_to_discern_len; |
bf422d6a KW |
659 | if ( ($allowed_uv > 0xFDD0 && $allowed_uv < 0xFDEF) |
660 | || ($allowed_uv > 0xFFFF && $allowed_uv < 0x10FFFE)) | |
661 | { | |
662 | $skip_most_tests = 1; | |
663 | } | |
af816908 KW |
664 | |
665 | $utf8n_flag_to_warn = $::UTF8_WARN_NONCHAR; | |
666 | $utf8n_flag_to_disallow = $::UTF8_DISALLOW_NONCHAR; | |
667 | $uvchr_flag_to_warn = $::UNICODE_WARN_NONCHAR; | |
668 | $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_NONCHAR;; | |
e4e140b4 KW |
669 | |
670 | $utf8n_flag_to_warn_complement = $::UTF8_WARN_SURROGATE | |
671 | |$::UTF8_WARN_SUPER | |
672 | |$::UTF8_WARN_ABOVE_31_BIT; | |
673 | $utf8n_flag_to_disallow_complement = $::UTF8_DISALLOW_SURROGATE | |
674 | |$::UTF8_DISALLOW_SUPER | |
675 | |$::UTF8_DISALLOW_ABOVE_31_BIT; | |
676 | $uvchr_flag_to_warn_complement = $::UNICODE_WARN_SURROGATE | |
677 | |$::UNICODE_WARN_SUPER | |
678 | |$::UNICODE_WARN_ABOVE_31_BIT; | |
679 | $uvchr_flag_to_disallow_complement = $::UNICODE_DISALLOW_SURROGATE | |
680 | |$::UNICODE_DISALLOW_SUPER | |
681 | |$::UNICODE_DISALLOW_ABOVE_31_BIT; | |
682 | ||
d42d7565 | 683 | $controlling_warning_category = 'nonchar'; |
3022ad00 KW |
684 | } |
685 | else { | |
686 | die "Can't figure out what type of warning to test for $testname" | |
687 | } | |
688 | ||
67e45424 KW |
689 | die 'Didn\'t set $needed_to_discern_len for ' . $testname |
690 | unless defined $needed_to_discern_len; | |
c0e5ee4b | 691 | |
8f79178b | 692 | { # First test the isFOO calls |
c0e5ee4b | 693 | use warnings; no warnings 'deprecated'; # Make sure these don't raise warnings |
9cdc3054 | 694 | undef @warnings_gotten; |
8f79178b | 695 | |
6aa905cf KW |
696 | my $ret = test_isUTF8_CHAR($bytes, $length); |
697 | my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0); | |
c0e5ee4b | 698 | if ($initially_malformed) { |
d402d77f KW |
699 | is($ret, 0, "For $testname: isUTF8_CHAR() returns 0"); |
700 | is($ret_flags, 0, " And isUTF8_CHAR_flags() returns 0"); | |
6aa905cf KW |
701 | } |
702 | else { | |
703 | is($ret, $length, | |
d402d77f KW |
704 | "For $testname: isUTF8_CHAR() returns expected length: $length"); |
705 | is($ret_flags, $length, " And isUTF8_CHAR_flags(...,0)" | |
6aa905cf KW |
706 | . " returns expected length: $length"); |
707 | } | |
9cdc3054 | 708 | is(scalar @warnings_gotten, 0, |
d402d77f KW |
709 | " And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags generated" |
710 | . " any warnings") | |
9cdc3054 | 711 | or output_warnings(@warnings_gotten); |
6aa905cf | 712 | |
9cdc3054 | 713 | undef @warnings_gotten; |
6aa905cf | 714 | $ret = test_isSTRICT_UTF8_CHAR($bytes, $length); |
c0e5ee4b | 715 | if ($initially_malformed) { |
d402d77f | 716 | is($ret, 0, " And isSTRICT_UTF8_CHAR() returns 0"); |
6aa905cf KW |
717 | } |
718 | else { | |
719 | my $expected_ret = ( $testname =~ /surrogate|non-character/ | |
720 | || $allowed_uv > 0x10FFFF) | |
721 | ? 0 | |
722 | : $length; | |
d402d77f | 723 | is($ret, $expected_ret, " And isSTRICT_UTF8_CHAR() returns" |
6aa905cf KW |
724 | . " expected length: $expected_ret"); |
725 | $ret = test_isUTF8_CHAR_flags($bytes, $length, | |
726 | $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE); | |
727 | is($ret, $expected_ret, | |
d402d77f KW |
728 | " And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')" |
729 | . " acts like isSTRICT_UTF8_CHAR"); | |
6aa905cf | 730 | } |
9cdc3054 | 731 | is(scalar @warnings_gotten, 0, |
d402d77f KW |
732 | " And neither isSTRICT_UTF8_CHAR() nor isUTF8_CHAR_flags" |
733 | . " generated any warnings") | |
9cdc3054 | 734 | or output_warnings(@warnings_gotten); |
6aa905cf | 735 | |
9cdc3054 | 736 | undef @warnings_gotten; |
6aa905cf | 737 | $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length); |
c0e5ee4b | 738 | if ($initially_malformed) { |
d402d77f | 739 | is($ret, 0, " And isC9_STRICT_UTF8_CHAR() returns 0"); |
6aa905cf KW |
740 | } |
741 | else { | |
742 | my $expected_ret = ( $testname =~ /surrogate/ | |
743 | || $allowed_uv > 0x10FFFF) | |
744 | ? 0 | |
745 | : $length; | |
d402d77f | 746 | is($ret, $expected_ret, " And isC9_STRICT_UTF8_CHAR()" |
6aa905cf KW |
747 | ." returns expected length: $expected_ret"); |
748 | $ret = test_isUTF8_CHAR_flags($bytes, $length, | |
8f79178b | 749 | $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE); |
6aa905cf | 750 | is($ret, $expected_ret, |
d402d77f KW |
751 | " And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')" |
752 | . " acts like isC9_STRICT_UTF8_CHAR"); | |
6aa905cf | 753 | } |
9cdc3054 | 754 | is(scalar @warnings_gotten, 0, |
d402d77f KW |
755 | " And neither isC9_STRICT_UTF8_CHAR() nor isUTF8_CHAR_flags" |
756 | . " generated any warnings") | |
9cdc3054 | 757 | or output_warnings(@warnings_gotten); |
6aa905cf | 758 | |
c0e5ee4b KW |
759 | foreach my $disallow_type (0..2) { |
760 | # 0 is don't disallow this type of code point | |
761 | # 1 is do disallow | |
762 | # 2 is do disallow, but only for above 31 bit | |
763 | ||
764 | my $disallow_flags; | |
765 | my $expected_ret; | |
766 | ||
767 | if ($initially_malformed) { | |
768 | ||
769 | # Malformations are by default disallowed, so testing with | |
770 | # $disallow_type equal to 0 is sufficicient. | |
771 | next if $disallow_type; | |
772 | ||
773 | $disallow_flags = 0; | |
774 | $expected_ret = 0; | |
775 | } | |
776 | elsif ($disallow_type == 1) { | |
777 | $disallow_flags = $utf8n_flag_to_disallow; | |
778 | $expected_ret = 0; | |
779 | } | |
780 | elsif ($disallow_type == 2) { | |
781 | next if ! $will_overflow && $allowed_uv < 0x80000000; | |
782 | $disallow_flags = $::UTF8_DISALLOW_ABOVE_31_BIT; | |
783 | $expected_ret = 0; | |
784 | } | |
785 | else { # type is 0 | |
786 | $disallow_flags = $utf8n_flag_to_disallow_complement; | |
787 | $expected_ret = $length; | |
788 | } | |
789 | ||
790 | $ret = test_isUTF8_CHAR_flags($bytes, $length, $disallow_flags); | |
791 | is($ret, $expected_ret, " And isUTF8_CHAR_flags(" | |
792 | . "$display_bytes, $disallow_flags) returns " | |
793 | . $expected_ret) | |
794 | or diag "The flags mean " | |
795 | . flags_to_text($disallow_flags, \@utf8n_flags_to_text); | |
6aa905cf | 796 | |
c0e5ee4b KW |
797 | is(scalar @warnings_gotten, 0, |
798 | " And isUTF8_CHAR_flags(...) generated no warnings") | |
799 | or output_warnings(@warnings_gotten); | |
6aa905cf | 800 | |
c0e5ee4b KW |
801 | # Test partial character handling, for each byte not a full character |
802 | for (my $j = 1; $j < $length - 1; $j++) { | |
6aa905cf KW |
803 | my $partial = substr($bytes, 0, $j); |
804 | my $ret_should_be; | |
805 | my $comment; | |
c0e5ee4b | 806 | if ($disallow_type || $initially_malformed) { |
6aa905cf KW |
807 | $ret_should_be = 0; |
808 | $comment = "disallowed"; | |
809 | if ($j < $needed_to_discern_len) { | |
810 | $ret_should_be = 1; | |
c0e5ee4b KW |
811 | $comment .= ", but need $needed_to_discern_len" |
812 | . " bytes to discern:"; | |
6aa905cf KW |
813 | } |
814 | } | |
815 | else { | |
816 | $ret_should_be = 1; | |
817 | $comment = "allowed"; | |
818 | } | |
819 | ||
9cdc3054 | 820 | undef @warnings_gotten; |
6aa905cf KW |
821 | |
822 | $ret = test_is_utf8_valid_partial_char_flags($partial, $j, | |
c0e5ee4b | 823 | $disallow_flags); |
6aa905cf | 824 | is($ret, $ret_should_be, |
c0e5ee4b KW |
825 | " And is_utf8_valid_partial_char_flags(" |
826 | . display_bytes($partial) | |
827 | . ", $disallow_flags), $comment: returns $ret_should_be") | |
828 | or diag "The flags mean " | |
829 | . flags_to_text($disallow_flags, \@utf8n_flags_to_text); | |
9cdc3054 | 830 | is(scalar @warnings_gotten, 0, |
d402d77f | 831 | " And is_utf8_valid_partial_char_flags()" |
6aa905cf | 832 | . " generated no warnings") |
9cdc3054 | 833 | or output_warnings(@warnings_gotten); |
6aa905cf KW |
834 | } |
835 | } | |
836 | } | |
837 | ||
8f79178b | 838 | # Now test the to/from UTF-8 calls |
69485e19 KW |
839 | # This is more complicated than the malformations tested in other files in |
840 | # this directory, as there are several orthogonal variables involved. We | |
841 | # test most possible combinations | |
8f79178b | 842 | |
8139d291 KW |
843 | # We try various combinations of malformations that can occur |
844 | foreach my $short (0, 1) { | |
845 | next if $skip_most_tests && $short; | |
846 | foreach my $unexpected_noncont (0, 1) { | |
847 | next if $skip_most_tests && $unexpected_noncont; | |
848 | foreach my $overlong (0, 1) { | |
849 | next if $overlong && $skip_most_tests; | |
8f79178b | 850 | |
8139d291 KW |
851 | # If we're creating an overlong, it can't be longer than the |
852 | # maximum length, so skip if we're already at that length. | |
853 | next if $overlong && $length >= $::max_bytes; | |
854 | ||
4816e15f KW |
855 | # Here we are in the innermost loop for malformations. So we |
856 | # know which ones are in effect. Can now change the input to be | |
857 | # appropriately malformed. We also can set up certain other | |
858 | # things now, like whether we expect a return flag from this | |
859 | # malformation, and which flag. | |
860 | ||
861 | my $this_bytes = $bytes; | |
862 | my $this_length = $length; | |
863 | my $this_expected_len = $length; | |
864 | my $this_needed_to_discern_len = $needed_to_discern_len; | |
865 | ||
866 | my @malformation_names; | |
867 | my @expected_malformation_warnings; | |
868 | my @expected_malformation_return_flags; | |
869 | ||
870 | if ($overlong) { | |
871 | ||
872 | # To force this malformation, we convert the original start | |
873 | # byte into a continuation byte with the same data bits as | |
874 | # originally. ... | |
875 | substr($this_bytes, 0, 1) | |
876 | = start_byte_to_cont(substr($this_bytes, 0, 1)); | |
877 | ||
878 | # ... Then we prepend it with a known overlong sequence. This | |
879 | # should evaluate to the exact same code point as the | |
880 | # original. | |
881 | $this_bytes = I8_to_native("\xff") | |
882 | . (I8_to_native(chr $::lowest_continuation) | |
883 | x ( $::max_bytes - 1 - length($this_bytes))) | |
884 | . $this_bytes; | |
885 | $this_length = length($this_bytes); | |
886 | $this_needed_to_discern_len = $::max_bytes | |
887 | - ( $this_expected_len | |
888 | - $this_needed_to_discern_len); | |
889 | $this_expected_len = $::max_bytes; | |
890 | push @expected_malformation_return_flags, $::UTF8_GOT_LONG; | |
891 | push @malformation_names, 'overlong'; | |
892 | } | |
893 | ||
894 | if ($short) { | |
895 | push @malformation_names, 'short'; | |
896 | ||
897 | # To force this malformation, just tell the test to not look | |
898 | # as far as it should into the input. | |
899 | $this_length--; | |
900 | $this_expected_len--; | |
901 | push @expected_malformation_return_flags, $::UTF8_GOT_SHORT; | |
902 | } | |
903 | ||
904 | if ($unexpected_noncont) { | |
905 | push @malformation_names, 'unexpected non-continuation'; | |
906 | ||
907 | # To force this malformation, change the final continuation | |
908 | # byte into a non continuation. | |
909 | my $pos = ($short) ? -2 : -1; | |
910 | substr($this_bytes, $pos, 1) = '?'; | |
911 | $this_expected_len--; | |
912 | push @expected_malformation_return_flags, | |
913 | $::UTF8_GOT_NON_CONTINUATION; | |
914 | } | |
915 | ||
916 | # Here, we've transformed the input with all of the desired | |
917 | # non-overflow malformations. We are now in a position to | |
918 | # construct any potential warnings for those malformations. But | |
919 | # it's a pain to get the detailed messages exactly right, so for | |
920 | # now XXX, only do so for those that return an explicit code | |
921 | # point. | |
922 | ||
923 | if ($overlong) { | |
924 | ||
925 | # If one of the other malformation types is also in effect, we | |
926 | # don't know what the intended code point was. | |
927 | if ($short || $unexpected_noncont || $will_overflow) { | |
928 | push @expected_malformation_warnings, qr/overlong/; | |
929 | } | |
930 | else { | |
931 | my $wrong_bytes = display_bytes_no_quotes( | |
932 | substr($this_bytes, 0, $this_length)); | |
933 | my $correct_bytes = display_bytes_no_quotes($bytes); | |
934 | my $prefix = ($allowed_uv > 0x10FFFF) ? "0x" : "U+"; | |
935 | push @expected_malformation_warnings, | |
936 | qr/\QMalformed UTF-8 character: $wrong_bytes\E | |
937 | \Q (overlong; instead use\E | |
938 | \Q $correct_bytes to\E | |
939 | \Q represent $prefix$uv_string)/x; | |
940 | } | |
941 | } | |
942 | if ($short) { | |
943 | push @expected_malformation_warnings, qr/too short/; | |
944 | } | |
945 | if ($unexpected_noncont) { | |
946 | push @expected_malformation_warnings, | |
947 | qr/unexpected non-continuation byte/; | |
948 | } | |
949 | ||
950 | # The overflow malformation is done differently than other | |
951 | # malformations. It comes from manually typed tests in the test | |
952 | # array. We now make it be treated like one of the other | |
953 | # malformations. But some has to be deferred until the inner loop | |
954 | my $overflow_msg_pattern; | |
955 | if ($will_overflow) { | |
956 | push @malformation_names, 'overflow'; | |
957 | ||
958 | $overflow_msg_pattern = display_bytes_no_quotes( | |
959 | substr($this_bytes, 0, $this_expected_len)); | |
960 | $overflow_msg_pattern = qr/\QMalformed UTF-8 character:\E | |
961 | \Q $overflow_msg_pattern\E | |
962 | \Q (overflows)\E/x; | |
963 | push @expected_malformation_return_flags, $::UTF8_GOT_OVERFLOW; | |
964 | } | |
965 | ||
966 | # And we can create the malformation-related text for the the test | |
967 | # names we eventually will generate. | |
968 | my $malformations_name = ""; | |
969 | if (@malformation_names) { | |
970 | $malformations_name .= "malformation"; | |
971 | $malformations_name .= "s" if @malformation_names > 1; | |
972 | $malformations_name .= ": "; | |
973 | $malformations_name .= join "/", @malformation_names; | |
974 | $malformations_name = " ($malformations_name)"; | |
975 | } | |
976 | ||
977 | # Done setting up the malformation related stuff | |
978 | ||
8139d291 KW |
979 | foreach my $do_disallow (0, 1) { |
980 | next if $skip_most_tests && ! $do_disallow; | |
6aa905cf | 981 | |
69485e19 KW |
982 | # We classify the warnings into certain "interesting" types, |
983 | # described later | |
984 | foreach my $warning_type (0..4) { | |
985 | next if $skip_most_tests && $warning_type != 1; | |
986 | foreach my $use_warn_flag (0, 1) { | |
987 | next if $skip_most_tests && ! $use_warn_flag; | |
988 | ||
8f79178b KW |
989 | # Finally, here is the inner loop |
990 | ||
69485e19 KW |
991 | my $this_utf8n_flag_to_warn = $utf8n_flag_to_warn; |
992 | my $this_utf8n_flag_to_disallow = $utf8n_flag_to_disallow; | |
993 | my $this_uvchr_flag_to_warn = $uvchr_flag_to_warn; | |
994 | my $this_uvchr_flag_to_disallow = $uvchr_flag_to_disallow; | |
995 | ||
996 | my $eval_warn; | |
997 | my $expect_regular_warnings; | |
998 | my $expect_warnings_for_malformed; | |
999 | my $expect_warnings_for_overflow; | |
1000 | ||
1001 | if ($warning_type == 0) { | |
1002 | $eval_warn = "use warnings; no warnings 'deprecated'"; | |
1003 | $expect_regular_warnings = $use_warn_flag; | |
1004 | $expect_warnings_for_overflow = 1; | |
1005 | $expect_warnings_for_malformed = 1; | |
6aa905cf | 1006 | } |
69485e19 KW |
1007 | elsif ($warning_type == 1) { |
1008 | $eval_warn = "no warnings"; | |
1009 | $expect_regular_warnings = 0; | |
1010 | $expect_warnings_for_overflow = 0; | |
1011 | $expect_warnings_for_malformed = 0; | |
1012 | } | |
1013 | elsif ($warning_type == 2) { | |
1014 | $eval_warn = "no warnings; use warnings 'utf8'"; | |
1015 | $expect_regular_warnings = $use_warn_flag; | |
1016 | $expect_warnings_for_overflow = 1; | |
1017 | $expect_warnings_for_malformed = 1; | |
1018 | } | |
1019 | elsif ($warning_type == 3) { | |
1020 | $eval_warn = "no warnings; use warnings" | |
e4e140b4 KW |
1021 | . " '$controlling_warning_category'"; |
1022 | $expect_regular_warnings = $use_warn_flag; | |
69485e19 | 1023 | $expect_warnings_for_overflow |
e4e140b4 | 1024 | = $controlling_warning_category eq 'non_unicode'; |
69485e19 KW |
1025 | $expect_warnings_for_malformed = 0; |
1026 | } | |
1027 | elsif ($warning_type == 4) { # Like type 3, but uses the | |
1028 | # above-31-bit flags | |
1029 | # The complement flags were set up so that the | |
1030 | # above-31-bit flags have been tested that they don't | |
1031 | # trigger wrongly for too small code points. And the | |
1032 | # flags have been set up so that those small code | |
1033 | # points are tested for being above Unicode. What's | |
1034 | # left to test is that the large code points do | |
1035 | # trigger the above-31-bit flags. | |
1036 | next if ! $will_overflow && $allowed_uv < 0x80000000; | |
e4e140b4 | 1037 | next if $controlling_warning_category ne 'non_unicode'; |
69485e19 KW |
1038 | $eval_warn = "no warnings; use warnings 'non_unicode'"; |
1039 | $expect_regular_warnings = 1; | |
1040 | $expect_warnings_for_overflow = 1; | |
1041 | $expect_warnings_for_malformed = 0; | |
1042 | $this_utf8n_flag_to_warn = $::UTF8_WARN_ABOVE_31_BIT; | |
1043 | $this_utf8n_flag_to_disallow | |
1044 | = $::UTF8_DISALLOW_ABOVE_31_BIT; | |
1045 | $this_uvchr_flag_to_warn = $::UNICODE_WARN_ABOVE_31_BIT; | |
1046 | $this_uvchr_flag_to_disallow | |
1047 | = $::UNICODE_DISALLOW_ABOVE_31_BIT; | |
601e92f1 KW |
1048 | } |
1049 | else { | |
69485e19 | 1050 | die "Unexpected warning type '$warning_type'"; |
601e92f1 KW |
1051 | } |
1052 | ||
69485e19 KW |
1053 | # We only need to test the case where all warnings are |
1054 | # enabled (type 0) to see if turning off the warning flag | |
1055 | # causes things to not be output. If those pass, then | |
1056 | # turning on some sub-category of warnings, or turning off | |
1057 | # warnings altogether are extremely likely to not output | |
1058 | # warnings either, given how the warnings subsystem is | |
1059 | # supposed to work, and this file assumes it does work. | |
1060 | next if $warning_type != 0 && ! $use_warn_flag; | |
1061 | ||
1062 | # The convention is that the 'got' flag is the same value | |
1063 | # as the disallow one. If this were violated, the tests | |
1064 | # here should start failing. | |
1065 | my $return_flag = $this_utf8n_flag_to_disallow; | |
1066 | ||
e4e140b4 KW |
1067 | # If we aren't expecting warnings/disallow for this, turn |
1068 | # on all the other flags. That makes sure that they all | |
1069 | # are independent of this flag, and so we don't need to | |
1070 | # test them individually. | |
1071 | my $this_warning_flags = ($use_warn_flag) | |
1072 | ? $this_utf8n_flag_to_warn | |
1073 | : $utf8n_flag_to_warn_complement; | |
69485e19 | 1074 | my $this_disallow_flags = ($do_disallow) |
e4e140b4 KW |
1075 | ? $this_utf8n_flag_to_disallow |
1076 | : $utf8n_flag_to_disallow_complement; | |
6aa905cf | 1077 | my $expected_uv = $allowed_uv; |
6aa905cf | 1078 | |
4816e15f KW |
1079 | my @expected_return_flags |
1080 | = @expected_malformation_return_flags; | |
69485e19 | 1081 | my @expected_warnings; |
4816e15f | 1082 | push @expected_warnings, @expected_malformation_warnings |
69485e19 KW |
1083 | if $expect_warnings_for_malformed; |
1084 | ||
69485e19 KW |
1085 | # The overflow malformation is done differently than other |
1086 | # malformations. It comes from manually typed tests in | |
1087 | # the test array, but it also is above Unicode and uses | |
1088 | # Perl extended UTF-8, so affects some of the flags being | |
1089 | # tested. We now make it be treated like one of the other | |
1090 | # generated malformations. | |
1091 | if ($will_overflow) { | |
1092 | ||
1093 | # An overflow is (way) above Unicode, and overrides | |
1094 | # everything else. | |
1095 | $expect_regular_warnings = 0; | |
1096 | ||
69485e19 | 1097 | if ($expect_warnings_for_overflow) { |
4816e15f | 1098 | push @expected_warnings, $overflow_msg_pattern; |
6aa905cf | 1099 | } |
69485e19 KW |
1100 | } |
1101 | ||
1102 | # It may be that the malformations have shortened the | |
1103 | # amount of input we look at so much that we can't tell | |
1104 | # what the category the code point was in. Otherwise, set | |
1105 | # up the expected return flags based on the warnings and | |
1106 | # disallowments. | |
1107 | if ($this_expected_len < $this_needed_to_discern_len) { | |
1108 | $expect_regular_warnings = 0; | |
1109 | } | |
1110 | elsif ( ($this_warning_flags & $this_utf8n_flag_to_warn) | |
1111 | || ( $this_disallow_flags | |
1112 | & $this_utf8n_flag_to_disallow)) | |
1113 | { | |
1114 | push @expected_return_flags, $return_flag; | |
1115 | } | |
1116 | ||
1117 | # Finish setting up the expected warning. | |
1118 | if ($expect_regular_warnings) { | |
1119 | ||
1120 | # So far the array contains warnings generated by | |
1121 | # malformations. Add the expected regular one. | |
1122 | unshift @expected_warnings, $cp_message_qr; | |
1123 | ||
1124 | # But it may need to be modified, because either of | |
1125 | # these malformations means we can't determine the | |
1126 | # expected code point. | |
1127 | if ($short || $unexpected_noncont) { | |
1128 | my $first_byte = substr($this_bytes, 0, 1); | |
1129 | $expected_warnings[0] = display_bytes( | |
1130 | substr($this_bytes, 0, $this_expected_len)); | |
1131 | $expected_warnings[0] | |
1132 | = qr/[Aa]\Qny UTF-8 sequence that starts with\E | |
1133 | \Q $expected_warnings[0]\E | |
1134 | \Q $non_cp_trailing_text\E/x; | |
1135 | } | |
1136 | } | |
6aa905cf KW |
1137 | |
1138 | # Is effectively disallowed if we've set up a | |
1139 | # malformation, even if the flag indicates it is | |
1140 | # allowed. Fix up test name to indicate this as | |
1141 | # well | |
69485e19 KW |
1142 | my $disallowed = ( $this_disallow_flags |
1143 | & $this_utf8n_flag_to_disallow) | |
1144 | || $malformations_name; | |
6aa905cf | 1145 | my $this_name = "utf8n_to_uvchr_error() $testname: " |
69485e19 KW |
1146 | . (($disallowed) |
1147 | ? 'disallowed' | |
1148 | : 'allowed'); | |
6aa905cf | 1149 | $this_name .= ", $eval_warn"; |
69485e19 KW |
1150 | $this_name .= ", " . (( $this_warning_flags |
1151 | & $this_utf8n_flag_to_warn) | |
1152 | ? 'with flag for raising warnings' | |
1153 | : 'no flag for raising warnings'); | |
1154 | $this_name .= $malformations_name; | |
db0f09e6 | 1155 | |
8f79178b | 1156 | # Do the actual test using an eval |
9cdc3054 | 1157 | undef @warnings_gotten; |
6aa905cf | 1158 | my $ret_ref; |
69485e19 | 1159 | my $this_flags = $this_warning_flags|$this_disallow_flags; |
6aa905cf KW |
1160 | my $eval_text = "$eval_warn; \$ret_ref" |
1161 | . " = test_utf8n_to_uvchr_error(" | |
69485e19 | 1162 | . "'$this_bytes', $this_length, $this_flags)"; |
6aa905cf | 1163 | eval "$eval_text"; |
69485e19 | 1164 | if (! ok ("$@ eq ''", "$this_name: eval succeeded")) |
6aa905cf | 1165 | { |
a8ee5133 KW |
1166 | diag "\$@='$@'; call was: " |
1167 | . utf8n_display_call($eval_text); | |
6aa905cf KW |
1168 | next; |
1169 | } | |
1170 | if ($disallowed) { | |
d402d77f | 1171 | is($ret_ref->[0], 0, " And returns 0") |
a8ee5133 | 1172 | or diag "Call was: " . utf8n_display_call($eval_text); |
6aa905cf KW |
1173 | } |
1174 | else { | |
1175 | is($ret_ref->[0], $expected_uv, | |
d402d77f KW |
1176 | " And returns expected uv: " |
1177 | . $uv_string) | |
a8ee5133 | 1178 | or diag "Call was: " . utf8n_display_call($eval_text); |
6aa905cf KW |
1179 | } |
1180 | is($ret_ref->[1], $this_expected_len, | |
d402d77f | 1181 | " And returns expected length:" |
6aa905cf | 1182 | . " $this_expected_len") |
a8ee5133 | 1183 | or diag "Call was: " . utf8n_display_call($eval_text); |
6aa905cf | 1184 | |
9cdc3054 | 1185 | my $returned_flags = $ret_ref->[2]; |
6aa905cf | 1186 | |
9cdc3054 | 1187 | for (my $i = @expected_return_flags - 1; $i >= 0; $i--) { |
69485e19 KW |
1188 | if ($expected_return_flags[$i] & $returned_flags) { |
1189 | if ($expected_return_flags[$i] | |
1190 | == $::UTF8_DISALLOW_ABOVE_31_BIT) | |
1191 | { | |
1192 | pass(" Expected and got return flag for" | |
1193 | . " above_31_bit"); | |
1194 | } | |
1195 | # The first entries in this are | |
1196 | # malformations | |
1197 | elsif ($i > @malformation_names - 1) { | |
1198 | pass(" Expected and got return flag" | |
e4e140b4 | 1199 | . " for " . $controlling_warning_category); |
69485e19 KW |
1200 | } |
1201 | else { | |
1202 | pass(" Expected and got return flag for " | |
1203 | . $malformation_names[$i] | |
1204 | . " malformation"); | |
1205 | } | |
9cdc3054 | 1206 | $returned_flags &= ~$expected_return_flags[$i]; |
69485e19 | 1207 | splice @expected_return_flags, $i, 1; |
6aa905cf | 1208 | } |
6aa905cf | 1209 | } |
6aa905cf | 1210 | |
69485e19 KW |
1211 | is($returned_flags, 0, |
1212 | " Got no unexpected return flags") | |
1213 | or diag "The unexpected flags gotten were: " | |
5722c46d KW |
1214 | . (flags_to_text($returned_flags, |
1215 | \@utf8n_flags_to_text) | |
69485e19 KW |
1216 | # We strip off any prefixes from the flag |
1217 | # names | |
1218 | =~ s/ \b [A-Z] _ //xgr); | |
1219 | is (scalar @expected_return_flags, 0, | |
1220 | " Got all expected return flags") | |
1221 | or diag "The expected flags not gotten were: " | |
1222 | . (flags_to_text(eval join("|", | |
1223 | @expected_return_flags), | |
1224 | \@utf8n_flags_to_text) | |
1225 | # We strip off any prefixes from the flag | |
1226 | # names | |
5722c46d | 1227 | =~ s/ \b [A-Z] _ //xgr); |
9cdc3054 | 1228 | |
69485e19 KW |
1229 | do_warnings_test(@expected_warnings) |
1230 | or diag "Call was: " . utf8n_display_call($eval_text); | |
1231 | undef @warnings_gotten; | |
6aa905cf KW |
1232 | |
1233 | # Check CHECK_ONLY results when the input is | |
1234 | # disallowed. Do this when actually disallowed, | |
69485e19 | 1235 | # not just when the $this_disallow_flags is set |
6aa905cf | 1236 | if ($disallowed) { |
69485e19 KW |
1237 | my $this_flags = $this_disallow_flags|$::UTF8_CHECK_ONLY; |
1238 | my $eval_text = "use warnings; no warnings 'deprecated'; \$ret_ref =" | |
1239 | . " test_utf8n_to_uvchr_error('" | |
1240 | . "$this_bytes', $this_length," | |
1241 | . " $this_flags)"; | |
1242 | eval $eval_text; | |
a8ee5133 KW |
1243 | if (! ok ("$@ eq ''", |
1244 | " And eval succeeded with CHECK_ONLY")) | |
1245 | { | |
1246 | diag "\$@='$@'; Call was: " | |
1247 | . utf8n_display_call($eval_text); | |
1248 | next; | |
1249 | } | |
d402d77f | 1250 | is($ret_ref->[0], 0, " CHECK_ONLY: Returns 0") |
a8ee5133 | 1251 | or diag "Call was: " . utf8n_display_call($eval_text); |
6aa905cf | 1252 | is($ret_ref->[1], -1, |
d402d77f | 1253 | " CHECK_ONLY: returns -1 for length") |
a8ee5133 | 1254 | or diag "Call was: " . utf8n_display_call($eval_text); |
9cdc3054 | 1255 | if (! is(scalar @warnings_gotten, 0, |
d402d77f | 1256 | " CHECK_ONLY: no warnings generated")) |
6aa905cf | 1257 | { |
a8ee5133 | 1258 | diag "Call was: " . utf8n_display_call($eval_text); |
9cdc3054 | 1259 | output_warnings(@warnings_gotten); |
6aa905cf KW |
1260 | } |
1261 | } | |
1262 | ||
1263 | # Now repeat some of the above, but for | |
1264 | # uvchr_to_utf8_flags(). Since this comes from an | |
8f79178b KW |
1265 | # existing code point, it hasn't overflowed, and isn't |
1266 | # malformed. | |
69485e19 KW |
1267 | next if @malformation_names; |
1268 | ||
1269 | $this_warning_flags = ($use_warn_flag) | |
1270 | ? $this_uvchr_flag_to_warn | |
1271 | : 0; | |
1272 | $this_disallow_flags = ($do_disallow) | |
1273 | ? $this_uvchr_flag_to_disallow | |
1274 | : 0; | |
1275 | ||
1276 | $disallowed = $this_disallow_flags | |
1277 | & $this_uvchr_flag_to_disallow; | |
1278 | $this_name .= ", " . (( $this_warning_flags | |
1279 | & $this_utf8n_flag_to_warn) | |
1280 | ? 'with flag for raising warnings' | |
1281 | : 'no flag for raising warnings'); | |
6aa905cf KW |
1282 | |
1283 | $this_name = "uvchr_to_utf8_flags() $testname: " | |
69485e19 | 1284 | . (($disallowed) |
6aa905cf | 1285 | ? 'disallowed' |
6aa905cf KW |
1286 | : 'allowed'); |
1287 | $this_name .= ", $eval_warn"; | |
69485e19 KW |
1288 | $this_name .= ", " . (( $this_warning_flags |
1289 | & $this_uvchr_flag_to_warn) | |
6aa905cf KW |
1290 | ? 'with warning flag' |
1291 | : 'no warning flag'); | |
1292 | ||
9cdc3054 | 1293 | undef @warnings_gotten; |
6aa905cf | 1294 | my $ret; |
69485e19 | 1295 | $this_flags = $this_warning_flags|$this_disallow_flags; |
6aa905cf KW |
1296 | $eval_text = "$eval_warn; \$ret =" |
1297 | . " test_uvchr_to_utf8_flags(" | |
d884ea32 | 1298 | . "$allowed_uv, $this_flags)"; |
6aa905cf KW |
1299 | eval "$eval_text"; |
1300 | if (! ok ("$@ eq ''", "$this_name: eval succeeded")) | |
1301 | { | |
d884ea32 KW |
1302 | diag "\$@='$@'; call was: " |
1303 | . uvchr_display_call($eval_text); | |
6aa905cf KW |
1304 | next; |
1305 | } | |
1306 | if ($disallowed) { | |
d402d77f | 1307 | is($ret, undef, " And returns undef") |
d884ea32 | 1308 | or diag "Call was: " . uvchr_display_call($eval_text); |
6aa905cf KW |
1309 | } |
1310 | else { | |
d402d77f | 1311 | is($ret, $this_bytes, " And returns expected string") |
d884ea32 | 1312 | or diag "Call was: " . uvchr_display_call($eval_text); |
6aa905cf | 1313 | } |
69485e19 KW |
1314 | |
1315 | do_warnings_test(@expected_warnings) | |
1316 | or diag "Call was: " . uvchr_display_call($eval_text); | |
6aa905cf KW |
1317 | } |
1318 | } | |
6aa905cf KW |
1319 | } |
1320 | } | |
8f79178b | 1321 | } |
6aa905cf KW |
1322 | } |
1323 | } | |
1324 | ||
1325 | done_testing; |