Commit | Line | Data |
---|---|---|
6aa905cf KW |
1 | #!perl -w |
2 | ||
3 | # This is a base file to be used by various .t's in its directory | |
4 | ||
5 | use strict; | |
6 | use Test::More; | |
7 | ||
8 | BEGIN { | |
9 | use_ok('XS::APItest'); | |
10 | require 'charset_tools.pl'; | |
11 | require './t/utf8_setup.pl'; | |
12 | }; | |
13 | ||
14 | $|=1; | |
15 | ||
16 | no warnings 'deprecated'; # Some of the below are above IV_MAX on 32 bit | |
17 | # machines, and that is tested elsewhere | |
18 | ||
19 | use XS::APItest; | |
20 | use Data::Dumper; | |
21 | ||
9cdc3054 | 22 | my @warnings_gotten; |
6aa905cf KW |
23 | |
24 | use warnings 'utf8'; | |
8d6f1506 KW |
25 | local $SIG{__WARN__} = sub { my @copy = @_; |
26 | push @warnings_gotten, map { chomp; $_ } @copy; | |
27 | }; | |
6aa905cf | 28 | |
6aa905cf KW |
29 | |
30 | # Now test the cases where a legal code point is generated, but may or may not | |
31 | # be allowed/warned on. | |
32 | my @tests = ( | |
af816908 | 33 | # ($testname, $bytes, $controlling_warning_category, |
b7e1f4b2 | 34 | # $allowed_uv, $needed_to_discern_len ) |
6aa905cf KW |
35 | [ "lowest surrogate", |
36 | (isASCII) ? "\xed\xa0\x80" : I8_to_native("\xf1\xb6\xa0\xa0"), | |
6aa905cf | 37 | 'surrogate', 0xD800, |
6aa905cf KW |
38 | ], |
39 | [ "a middle surrogate", | |
40 | (isASCII) ? "\xed\xa4\x8d" : I8_to_native("\xf1\xb6\xa8\xad"), | |
6aa905cf | 41 | 'surrogate', 0xD90D, |
6aa905cf KW |
42 | ], |
43 | [ "highest surrogate", | |
44 | (isASCII) ? "\xed\xbf\xbf" : I8_to_native("\xf1\xb7\xbf\xbf"), | |
6aa905cf | 45 | 'surrogate', 0xDFFF, |
6aa905cf KW |
46 | ], |
47 | [ "first non_unicode", | |
48 | (isASCII) ? "\xf4\x90\x80\x80" : I8_to_native("\xf9\xa2\xa0\xa0\xa0"), | |
6aa905cf | 49 | 'non_unicode', 0x110000, |
6aa905cf | 50 | 2, |
6aa905cf KW |
51 | ], |
52 | [ "non_unicode whose first byte tells that", | |
53 | (isASCII) ? "\xf5\x80\x80\x80" : I8_to_native("\xfa\xa0\xa0\xa0\xa0"), | |
6aa905cf KW |
54 | 'non_unicode', |
55 | (isASCII) ? 0x140000 : 0x200000, | |
6aa905cf | 56 | 1, |
6aa905cf KW |
57 | ], |
58 | [ "first of 32 consecutive non-character code points", | |
59 | (isASCII) ? "\xef\xb7\x90" : I8_to_native("\xf1\xbf\xae\xb0"), | |
6aa905cf | 60 | 'nonchar', 0xFDD0, |
6aa905cf KW |
61 | ], |
62 | [ "a mid non-character code point of the 32 consecutive ones", | |
63 | (isASCII) ? "\xef\xb7\xa0" : I8_to_native("\xf1\xbf\xaf\xa0"), | |
6aa905cf | 64 | 'nonchar', 0xFDE0, |
6aa905cf KW |
65 | ], |
66 | [ "final of 32 consecutive non-character code points", | |
67 | (isASCII) ? "\xef\xb7\xaf" : I8_to_native("\xf1\xbf\xaf\xaf"), | |
6aa905cf | 68 | 'nonchar', 0xFDEF, |
6aa905cf KW |
69 | ], |
70 | [ "non-character code point U+FFFE", | |
71 | (isASCII) ? "\xef\xbf\xbe" : I8_to_native("\xf1\xbf\xbf\xbe"), | |
6aa905cf | 72 | 'nonchar', 0xFFFE, |
6aa905cf KW |
73 | ], |
74 | [ "non-character code point U+FFFF", | |
75 | (isASCII) ? "\xef\xbf\xbf" : I8_to_native("\xf1\xbf\xbf\xbf"), | |
6aa905cf | 76 | 'nonchar', 0xFFFF, |
6aa905cf KW |
77 | ], |
78 | [ "non-character code point U+1FFFE", | |
79 | (isASCII) ? "\xf0\x9f\xbf\xbe" : I8_to_native("\xf3\xbf\xbf\xbe"), | |
6aa905cf | 80 | 'nonchar', 0x1FFFE, |
6aa905cf KW |
81 | ], |
82 | [ "non-character code point U+1FFFF", | |
83 | (isASCII) ? "\xf0\x9f\xbf\xbf" : I8_to_native("\xf3\xbf\xbf\xbf"), | |
6aa905cf | 84 | 'nonchar', 0x1FFFF, |
6aa905cf KW |
85 | ], |
86 | [ "non-character code point U+2FFFE", | |
87 | (isASCII) ? "\xf0\xaf\xbf\xbe" : I8_to_native("\xf5\xbf\xbf\xbe"), | |
6aa905cf | 88 | 'nonchar', 0x2FFFE, |
6aa905cf KW |
89 | ], |
90 | [ "non-character code point U+2FFFF", | |
91 | (isASCII) ? "\xf0\xaf\xbf\xbf" : I8_to_native("\xf5\xbf\xbf\xbf"), | |
6aa905cf | 92 | 'nonchar', 0x2FFFF, |
6aa905cf KW |
93 | ], |
94 | [ "non-character code point U+3FFFE", | |
95 | (isASCII) ? "\xf0\xbf\xbf\xbe" : I8_to_native("\xf7\xbf\xbf\xbe"), | |
6aa905cf | 96 | 'nonchar', 0x3FFFE, |
6aa905cf KW |
97 | ], |
98 | [ "non-character code point U+3FFFF", | |
99 | (isASCII) ? "\xf0\xbf\xbf\xbf" : I8_to_native("\xf7\xbf\xbf\xbf"), | |
6aa905cf | 100 | 'nonchar', 0x3FFFF, |
6aa905cf KW |
101 | ], |
102 | [ "non-character code point U+4FFFE", | |
103 | (isASCII) ? "\xf1\x8f\xbf\xbe" : I8_to_native("\xf8\xa9\xbf\xbf\xbe"), | |
6aa905cf | 104 | 'nonchar', 0x4FFFE, |
6aa905cf KW |
105 | ], |
106 | [ "non-character code point U+4FFFF", | |
107 | (isASCII) ? "\xf1\x8f\xbf\xbf" : I8_to_native("\xf8\xa9\xbf\xbf\xbf"), | |
6aa905cf | 108 | 'nonchar', 0x4FFFF, |
6aa905cf KW |
109 | ], |
110 | [ "non-character code point U+5FFFE", | |
111 | (isASCII) ? "\xf1\x9f\xbf\xbe" : I8_to_native("\xf8\xab\xbf\xbf\xbe"), | |
6aa905cf | 112 | 'nonchar', 0x5FFFE, |
6aa905cf KW |
113 | ], |
114 | [ "non-character code point U+5FFFF", | |
115 | (isASCII) ? "\xf1\x9f\xbf\xbf" : I8_to_native("\xf8\xab\xbf\xbf\xbf"), | |
6aa905cf | 116 | 'nonchar', 0x5FFFF, |
6aa905cf KW |
117 | ], |
118 | [ "non-character code point U+6FFFE", | |
119 | (isASCII) ? "\xf1\xaf\xbf\xbe" : I8_to_native("\xf8\xad\xbf\xbf\xbe"), | |
6aa905cf | 120 | 'nonchar', 0x6FFFE, |
6aa905cf KW |
121 | ], |
122 | [ "non-character code point U+6FFFF", | |
123 | (isASCII) ? "\xf1\xaf\xbf\xbf" : I8_to_native("\xf8\xad\xbf\xbf\xbf"), | |
6aa905cf | 124 | 'nonchar', 0x6FFFF, |
6aa905cf KW |
125 | ], |
126 | [ "non-character code point U+7FFFE", | |
127 | (isASCII) ? "\xf1\xbf\xbf\xbe" : I8_to_native("\xf8\xaf\xbf\xbf\xbe"), | |
6aa905cf | 128 | 'nonchar', 0x7FFFE, |
6aa905cf KW |
129 | ], |
130 | [ "non-character code point U+7FFFF", | |
131 | (isASCII) ? "\xf1\xbf\xbf\xbf" : I8_to_native("\xf8\xaf\xbf\xbf\xbf"), | |
6aa905cf | 132 | 'nonchar', 0x7FFFF, |
6aa905cf KW |
133 | ], |
134 | [ "non-character code point U+8FFFE", | |
135 | (isASCII) ? "\xf2\x8f\xbf\xbe" : I8_to_native("\xf8\xb1\xbf\xbf\xbe"), | |
6aa905cf | 136 | 'nonchar', 0x8FFFE, |
6aa905cf KW |
137 | ], |
138 | [ "non-character code point U+8FFFF", | |
139 | (isASCII) ? "\xf2\x8f\xbf\xbf" : I8_to_native("\xf8\xb1\xbf\xbf\xbf"), | |
6aa905cf | 140 | 'nonchar', 0x8FFFF, |
6aa905cf KW |
141 | ], |
142 | [ "non-character code point U+9FFFE", | |
143 | (isASCII) ? "\xf2\x9f\xbf\xbe" : I8_to_native("\xf8\xb3\xbf\xbf\xbe"), | |
6aa905cf | 144 | 'nonchar', 0x9FFFE, |
6aa905cf KW |
145 | ], |
146 | [ "non-character code point U+9FFFF", | |
147 | (isASCII) ? "\xf2\x9f\xbf\xbf" : I8_to_native("\xf8\xb3\xbf\xbf\xbf"), | |
6aa905cf | 148 | 'nonchar', 0x9FFFF, |
6aa905cf KW |
149 | ], |
150 | [ "non-character code point U+AFFFE", | |
151 | (isASCII) ? "\xf2\xaf\xbf\xbe" : I8_to_native("\xf8\xb5\xbf\xbf\xbe"), | |
6aa905cf | 152 | 'nonchar', 0xAFFFE, |
6aa905cf KW |
153 | ], |
154 | [ "non-character code point U+AFFFF", | |
155 | (isASCII) ? "\xf2\xaf\xbf\xbf" : I8_to_native("\xf8\xb5\xbf\xbf\xbf"), | |
6aa905cf | 156 | 'nonchar', 0xAFFFF, |
6aa905cf KW |
157 | ], |
158 | [ "non-character code point U+BFFFE", | |
159 | (isASCII) ? "\xf2\xbf\xbf\xbe" : I8_to_native("\xf8\xb7\xbf\xbf\xbe"), | |
6aa905cf | 160 | 'nonchar', 0xBFFFE, |
6aa905cf KW |
161 | ], |
162 | [ "non-character code point U+BFFFF", | |
163 | (isASCII) ? "\xf2\xbf\xbf\xbf" : I8_to_native("\xf8\xb7\xbf\xbf\xbf"), | |
6aa905cf | 164 | 'nonchar', 0xBFFFF, |
6aa905cf KW |
165 | ], |
166 | [ "non-character code point U+CFFFE", | |
167 | (isASCII) ? "\xf3\x8f\xbf\xbe" : I8_to_native("\xf8\xb9\xbf\xbf\xbe"), | |
6aa905cf | 168 | 'nonchar', 0xCFFFE, |
6aa905cf KW |
169 | ], |
170 | [ "non-character code point U+CFFFF", | |
171 | (isASCII) ? "\xf3\x8f\xbf\xbf" : I8_to_native("\xf8\xb9\xbf\xbf\xbf"), | |
6aa905cf | 172 | 'nonchar', 0xCFFFF, |
6aa905cf KW |
173 | ], |
174 | [ "non-character code point U+DFFFE", | |
175 | (isASCII) ? "\xf3\x9f\xbf\xbe" : I8_to_native("\xf8\xbb\xbf\xbf\xbe"), | |
6aa905cf | 176 | 'nonchar', 0xDFFFE, |
6aa905cf KW |
177 | ], |
178 | [ "non-character code point U+DFFFF", | |
179 | (isASCII) ? "\xf3\x9f\xbf\xbf" : I8_to_native("\xf8\xbb\xbf\xbf\xbf"), | |
6aa905cf | 180 | 'nonchar', 0xDFFFF, |
6aa905cf KW |
181 | ], |
182 | [ "non-character code point U+EFFFE", | |
183 | (isASCII) ? "\xf3\xaf\xbf\xbe" : I8_to_native("\xf8\xbd\xbf\xbf\xbe"), | |
6aa905cf | 184 | 'nonchar', 0xEFFFE, |
6aa905cf KW |
185 | ], |
186 | [ "non-character code point U+EFFFF", | |
187 | (isASCII) ? "\xf3\xaf\xbf\xbf" : I8_to_native("\xf8\xbd\xbf\xbf\xbf"), | |
6aa905cf | 188 | 'nonchar', 0xEFFFF, |
6aa905cf KW |
189 | ], |
190 | [ "non-character code point U+FFFFE", | |
191 | (isASCII) ? "\xf3\xbf\xbf\xbe" : I8_to_native("\xf8\xbf\xbf\xbf\xbe"), | |
6aa905cf | 192 | 'nonchar', 0xFFFFE, |
6aa905cf KW |
193 | ], |
194 | [ "non-character code point U+FFFFF", | |
195 | (isASCII) ? "\xf3\xbf\xbf\xbf" : I8_to_native("\xf8\xbf\xbf\xbf\xbf"), | |
6aa905cf | 196 | 'nonchar', 0xFFFFF, |
6aa905cf KW |
197 | ], |
198 | [ "non-character code point U+10FFFE", | |
199 | (isASCII) ? "\xf4\x8f\xbf\xbe" : I8_to_native("\xf9\xa1\xbf\xbf\xbe"), | |
6aa905cf | 200 | 'nonchar', 0x10FFFE, |
6aa905cf KW |
201 | ], |
202 | [ "non-character code point U+10FFFF", | |
203 | (isASCII) ? "\xf4\x8f\xbf\xbf" : I8_to_native("\xf9\xa1\xbf\xbf\xbf"), | |
6aa905cf | 204 | 'nonchar', 0x10FFFF, |
6aa905cf | 205 | ], |
76513bdc KW |
206 | [ "requires at least 32 bits", |
207 | (isASCII) | |
208 | ? "\xfe\x82\x80\x80\x80\x80\x80" | |
209 | : I8_to_native( | |
210 | "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"), | |
211 | # This code point is chosen so that it is representable in a UV on | |
212 | # 32-bit machines | |
69485e19 | 213 | 'non_unicode', 0x80000000, |
76513bdc | 214 | (isASCII) ? 1 : 8, |
76513bdc KW |
215 | ], |
216 | [ "highest 32 bit code point", | |
217 | (isASCII) | |
218 | ? "\xfe\x83\xbf\xbf\xbf\xbf\xbf" | |
219 | : I8_to_native( | |
220 | "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa3\xbf\xbf\xbf\xbf\xbf\xbf"), | |
69485e19 | 221 | 'non_unicode', 0xFFFFFFFF, |
76513bdc | 222 | (isASCII) ? 1 : 8, |
76513bdc KW |
223 | ], |
224 | [ "requires at least 32 bits, and use SUPER-type flags, instead of" | |
225 | . " ABOVE_31_BIT", | |
226 | (isASCII) | |
227 | ? "\xfe\x82\x80\x80\x80\x80\x80" | |
228 | : I8_to_native( | |
229 | "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0"), | |
69485e19 | 230 | 'non_unicode', 0x80000000, |
76513bdc | 231 | 1, |
76513bdc KW |
232 | ], |
233 | [ "overflow with warnings/disallow for more than 31 bits", | |
234 | # This tests the interaction of WARN_ABOVE_31_BIT/DISALLOW_ABOVE_31_BIT | |
235 | # with overflow. The overflow malformation is never allowed, so | |
236 | # preventing it takes precedence if the ABOVE_31_BIT options would | |
237 | # otherwise allow in an overflowing value. The ASCII code points (1 | |
238 | # for 32-bits; 1 for 64) were chosen because the old overflow | |
239 | # detection algorithm did not catch them; this means this test also | |
240 | # checks for that fix. The EBCDIC are arbitrary overflowing ones | |
241 | # since we have no reports of failures with it. | |
242 | (($::is64bit) | |
243 | ? ((isASCII) | |
244 | ? "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf" | |
245 | : I8_to_native( | |
246 | "\xff\xB0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0")) | |
247 | : ((isASCII) | |
248 | ? "\xfe\x86\x80\x80\x80\x80\x80" | |
249 | : I8_to_native( | |
250 | "\xff\xa0\xa0\xa0\xa0\xa0\xa0\xa4\xa0\xa0\xa0\xa0\xa0\xa0"))), | |
69485e19 | 251 | 'non_unicode', -1, |
76513bdc | 252 | (isASCII || $::is64bit) ? 2 : 8, |
76513bdc | 253 | ], |
6aa905cf KW |
254 | ); |
255 | ||
256 | if (! $::is64bit) { | |
257 | if (isASCII) { | |
258 | no warnings qw{portable overflow}; | |
259 | push @tests, | |
260 | [ "Lowest 33 bit code point: overflow", | |
261 | "\xFE\x84\x80\x80\x80\x80\x80", | |
69485e19 | 262 | 'non_unicode', -1, |
b3169593 | 263 | 1, |
6aa905cf KW |
264 | ]; |
265 | } | |
266 | } | |
267 | else { | |
268 | no warnings qw{portable overflow}; | |
269 | push @tests, | |
270 | [ "More than 32 bits", | |
271 | (isASCII) | |
272 | ? "\xff\x80\x80\x80\x80\x80\x81\x80\x80\x80\x80\x80\x80" | |
273 | : I8_to_native( | |
274 | "\xff\xa0\xa0\xa0\xa0\xa0\xa2\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), | |
69485e19 | 275 | 'non_unicode', 0x1000000000, |
b3169593 | 276 | (isASCII) ? 1 : 7, |
6aa905cf KW |
277 | ]; |
278 | if (! isASCII) { | |
279 | push @tests, # These could falsely show wrongly in a naive | |
280 | # implementation | |
281 | [ "requires at least 32 bits", | |
282 | I8_to_native( | |
283 | "\xff\xa0\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), | |
69485e19 | 284 | 'non_unicode', 0x800000000, |
b3169593 | 285 | 7, |
6aa905cf KW |
286 | ], |
287 | [ "requires at least 32 bits", | |
288 | I8_to_native( | |
289 | "\xff\xa0\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), | |
69485e19 | 290 | 'non_unicode', 0x10000000000, |
b3169593 | 291 | 6, |
6aa905cf KW |
292 | ], |
293 | [ "requires at least 32 bits", | |
294 | I8_to_native( | |
295 | "\xff\xa0\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), | |
69485e19 | 296 | 'non_unicode', 0x200000000000, |
b3169593 | 297 | 5, |
6aa905cf KW |
298 | ], |
299 | [ "requires at least 32 bits", | |
300 | I8_to_native( | |
301 | "\xff\xa0\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), | |
69485e19 | 302 | 'non_unicode', 0x4000000000000, |
b3169593 | 303 | 4, |
6aa905cf KW |
304 | ], |
305 | [ "requires at least 32 bits", | |
306 | I8_to_native( | |
307 | "\xff\xa0\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), | |
69485e19 | 308 | 'non_unicode', 0x80000000000000, |
b3169593 | 309 | 3, |
6aa905cf KW |
310 | ], |
311 | [ "requires at least 32 bits", | |
312 | I8_to_native( | |
313 | "\xff\xa1\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0\xa0"), | |
69485e19 | 314 | 'non_unicode', 0x1000000000000000, |
b3169593 | 315 | 2, |
6aa905cf KW |
316 | ]; |
317 | } | |
318 | } | |
319 | ||
5722c46d KW |
320 | sub flags_to_text($$) |
321 | { | |
322 | my ($flags, $flags_to_text_ref) = @_; | |
323 | ||
324 | # Returns a string containing a mnemonic representation of the bits that | |
325 | # are set in the $flags. These are assumed to be flag bits. The return | |
326 | # looks like "FOO|BAR|BAZ". The second parameter is a reference to an | |
327 | # array that gives the textual representation of all the possible flags. | |
328 | # Element 0 is the text for the bit 0 flag; element 1 for bit 1; .... If | |
329 | # no bits at all are set the string "0" is returned; | |
330 | ||
331 | my @flag_text; | |
332 | my $shift = 0; | |
333 | ||
334 | return "0" if $flags == 0; | |
335 | ||
336 | while ($flags) { | |
337 | #diag sprintf "%x", $flags; | |
338 | if ($flags & 1) { | |
339 | push @flag_text, $flags_to_text_ref->[$shift]; | |
340 | } | |
341 | $shift++; | |
342 | $flags >>= 1; | |
343 | } | |
344 | ||
345 | return join "|", @flag_text; | |
346 | } | |
347 | ||
348 | # Possible flag returns from utf8n_to_uvchr_error(). These should have G_, | |
349 | # instead of A_, D_, but the prefixes will be used in a a later commit, so | |
350 | # minimize churn by having them here. | |
351 | my @utf8n_flags_to_text = ( qw( | |
352 | A_EMPTY | |
353 | A_CONTINUATION | |
354 | A_NON_CONTINUATION | |
355 | A_SHORT | |
356 | A_LONG | |
357 | A_LONG_AND_ITS_VALUE | |
358 | PLACEHOLDER | |
359 | A_OVERFLOW | |
360 | D_SURROGATE | |
361 | W_SURROGATE | |
362 | D_NONCHAR | |
363 | W_NONCHAR | |
364 | D_SUPER | |
365 | W_SUPER | |
366 | D_ABOVE_31_BIT | |
367 | W_ABOVE_31_BIT | |
368 | CHECK_ONLY | |
369 | NO_CONFIDENCE_IN_CURLEN_ | |
370 | ) ); | |
371 | ||
a8ee5133 KW |
372 | sub utf8n_display_call($) |
373 | { | |
374 | # Converts an eval string that calls test_utf8n_to_uvchr into a more human | |
375 | # readable form, and returns it. Doesn't work if the byte string contains | |
376 | # an apostrophe. The return will look something like: | |
377 | # test_utf8n_to_uvchr_error('$bytes', $length, $flags) | |
378 | #diag $_[0]; | |
379 | ||
380 | $_[0] =~ / ^ ( [^(]* \( ) ' ( [^']*? ) ' ( .+ , \D* ) ( \d+ ) \) $ /x; | |
381 | my $text1 = $1; # Everything before the byte string | |
382 | my $bytes = $2; | |
383 | my $text2 = $3; # Includes the length | |
384 | my $flags = $4; | |
385 | ||
386 | return $text1 | |
387 | . display_bytes($bytes) | |
388 | . $text2 | |
389 | . flags_to_text($flags, \@utf8n_flags_to_text) | |
390 | . ')'; | |
391 | } | |
392 | ||
d884ea32 KW |
393 | sub uvchr_display_call($) |
394 | { | |
395 | # Converts an eval string that calls test_uvchr_to_utf8 into a more human | |
396 | # readable form, and returns it. The return will look something like: | |
397 | # test_uvchr_to_utf8n_flags($uv, $flags) | |
398 | #diag $_[0]; | |
399 | ||
400 | my @flags_to_text = ( qw( | |
401 | W_SURROGATE | |
402 | W_NONCHAR | |
403 | W_SUPER | |
404 | W_ABOVE_31_BIT | |
405 | D_SURROGATE | |
406 | D_NONCHAR | |
407 | D_SUPER | |
408 | D_ABOVE_31_BIT | |
409 | ) ); | |
410 | ||
411 | $_[0] =~ / ^ ( [^(]* \( ) ( \d+ ) , \s* ( \d+ ) \) $ /x; | |
412 | my $text = $1; | |
413 | my $cp = sprintf "%X", $2; | |
414 | my $flags = $3; | |
415 | ||
416 | return "${text}0x$cp, " . flags_to_text($flags, \@flags_to_text) . ')'; | |
417 | } | |
418 | ||
69485e19 KW |
419 | sub do_warnings_test(@) |
420 | { | |
421 | my @expected_warnings = @_; | |
422 | ||
423 | # Compares the input expected warnings array with @warnings_gotten, | |
424 | # generating a pass for each found, removing it from @warnings_gotten. | |
425 | # Any discrepancies generate test failures. Returns TRUE if no | |
426 | # discrepcancies; otherwise FALSE. | |
427 | ||
428 | my $succeeded = 1; | |
429 | ||
430 | if (@expected_warnings == 0) { | |
431 | if (! is(@warnings_gotten, 0, " Expected and got no warnings")) { | |
432 | output_warnings(@warnings_gotten); | |
433 | $succeeded = 0; | |
434 | } | |
435 | return $succeeded; | |
436 | } | |
437 | ||
438 | # Check that we got all the expected warnings, | |
439 | # removing each one found | |
440 | WARNING: | |
441 | foreach my $expected (@expected_warnings) { | |
442 | foreach (my $i = 0; $i < @warnings_gotten; $i++) { | |
443 | if ($warnings_gotten[$i] =~ $expected) { | |
444 | pass(" Expected and got warning: " | |
445 | . " $warnings_gotten[$i]"); | |
446 | splice @warnings_gotten, $i, 1; | |
447 | next WARNING; | |
448 | } | |
449 | } | |
450 | fail(" Expected a warning that matches " | |
451 | . $expected . " but didn't get it"); | |
452 | $succeeded = 0; | |
453 | } | |
454 | ||
455 | if (! is(@warnings_gotten, 0, " Got no unexpected warnings")) { | |
456 | output_warnings(@warnings_gotten); | |
457 | $succeeded = 0; | |
458 | } | |
459 | ||
460 | return $succeeded; | |
461 | } | |
462 | ||
6aa905cf KW |
463 | # This test is split into this number of files. |
464 | my $num_test_files = $ENV{TEST_JOBS} || 1; | |
465 | $num_test_files = 10 if $num_test_files > 10; | |
466 | ||
467 | my $test_count = -1; | |
468 | foreach my $test (@tests) { | |
469 | $test_count++; | |
470 | next if $test_count % $num_test_files != $::TEST_CHUNK; | |
471 | ||
af816908 | 472 | my ($testname, $bytes, |
9cdc3054 | 473 | $controlling_warning_category, $allowed_uv, $needed_to_discern_len |
6aa905cf KW |
474 | ) = @$test; |
475 | ||
476 | my $length = length $bytes; | |
04f42bf6 | 477 | my $will_overflow = $allowed_uv < 0; |
6aa905cf | 478 | |
2c511c58 KW |
479 | my $uv_string = sprintf(($allowed_uv < 0x100) ? "%02X" : "%04X", $allowed_uv); |
480 | ||
af816908 KW |
481 | my $utf8n_flag_to_warn; |
482 | my $utf8n_flag_to_disallow; | |
483 | my $uvchr_flag_to_warn; | |
484 | my $uvchr_flag_to_disallow; | |
b7e1f4b2 | 485 | |
bf422d6a KW |
486 | # Many of the code points being tested are middling in that if code point |
487 | # edge cases work, these are very likely to as well. Because this test | |
488 | # file takes a while to execute, we skip testing the edge effects of code | |
489 | # points deemed middling, while testing their basics and continuing to | |
490 | # fully test the non-middling code points. | |
491 | my $skip_most_tests = 0; | |
492 | ||
601e92f1 KW |
493 | my $cp_message_qr; # Pattern that matches the message raised when |
494 | # that message contains the problematic code | |
495 | # point. The message is the same (currently) both | |
496 | # when going from/to utf8. | |
497 | my $non_cp_trailing_text; # The suffix text when the message doesn't | |
498 | # contain a code point. (This is a result of | |
499 | # some sort of malformation that means we | |
500 | # can't get an exact code poin | |
501 | ||
af816908 KW |
502 | if ($will_overflow || $allowed_uv > 0x10FFFF) { |
503 | ||
504 | $utf8n_flag_to_warn = $::UTF8_WARN_SUPER; | |
505 | $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SUPER; | |
506 | $uvchr_flag_to_warn = $::UNICODE_WARN_SUPER; | |
507 | $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SUPER;; | |
508 | ||
509 | if ($will_overflow) { | |
69485e19 | 510 | $non_cp_trailing_text = "if you see this, there is an error"; |
601e92f1 | 511 | $cp_message_qr = qr/\Q$non_cp_trailing_text\E/; |
af816908 KW |
512 | } |
513 | elsif ($allowed_uv > 0x7FFFFFFF) { | |
601e92f1 KW |
514 | $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E |
515 | \Q and not portable\E/x; | |
516 | $non_cp_trailing_text = "is for a non-Unicode code point, and is not portable"; | |
af816908 | 517 | } |
601e92f1 KW |
518 | else { |
519 | $cp_message_qr = qr/\QCode point 0x$uv_string is not Unicode,\E | |
520 | \Q may not be portable\E/x; | |
521 | $non_cp_trailing_text = "is for a non-Unicode code point, may not" | |
522 | . " be portable"; | |
af816908 | 523 | } |
3022ad00 KW |
524 | } |
525 | elsif ($allowed_uv >= 0xD800 && $allowed_uv <= 0xDFFF) { | |
601e92f1 KW |
526 | $cp_message_qr = qr/UTF-16 surrogate U\+$uv_string/; |
527 | $non_cp_trailing_text = "is for a surrogate"; | |
67e45424 | 528 | $needed_to_discern_len = 2 unless defined $needed_to_discern_len; |
bf422d6a | 529 | $skip_most_tests = 1 if $allowed_uv > 0xD800 && $allowed_uv < 0xDFFF; |
af816908 KW |
530 | |
531 | $utf8n_flag_to_warn = $::UTF8_WARN_SURROGATE; | |
532 | $utf8n_flag_to_disallow = $::UTF8_DISALLOW_SURROGATE; | |
533 | $uvchr_flag_to_warn = $::UNICODE_WARN_SURROGATE; | |
534 | $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_SURROGATE;; | |
3022ad00 KW |
535 | } |
536 | elsif ( ($allowed_uv >= 0xFDD0 && $allowed_uv <= 0xFDEF) | |
537 | || ($allowed_uv & 0xFFFE) == 0xFFFE) | |
538 | { | |
601e92f1 KW |
539 | $cp_message_qr = qr/\QUnicode non-character U+$uv_string\E |
540 | \Q is not recommended for open interchange\E/x; | |
541 | $non_cp_trailing_text = "if you see this, there is an error"; | |
67e45424 | 542 | $needed_to_discern_len = $length unless defined $needed_to_discern_len; |
bf422d6a KW |
543 | if ( ($allowed_uv > 0xFDD0 && $allowed_uv < 0xFDEF) |
544 | || ($allowed_uv > 0xFFFF && $allowed_uv < 0x10FFFE)) | |
545 | { | |
546 | $skip_most_tests = 1; | |
547 | } | |
af816908 KW |
548 | |
549 | $utf8n_flag_to_warn = $::UTF8_WARN_NONCHAR; | |
550 | $utf8n_flag_to_disallow = $::UTF8_DISALLOW_NONCHAR; | |
551 | $uvchr_flag_to_warn = $::UNICODE_WARN_NONCHAR; | |
552 | $uvchr_flag_to_disallow = $::UNICODE_DISALLOW_NONCHAR;; | |
3022ad00 KW |
553 | } |
554 | else { | |
555 | die "Can't figure out what type of warning to test for $testname" | |
556 | } | |
557 | ||
67e45424 KW |
558 | die 'Didn\'t set $needed_to_discern_len for ' . $testname |
559 | unless defined $needed_to_discern_len; | |
6aa905cf KW |
560 | { |
561 | use warnings; | |
9cdc3054 | 562 | undef @warnings_gotten; |
6aa905cf KW |
563 | my $ret = test_isUTF8_CHAR($bytes, $length); |
564 | my $ret_flags = test_isUTF8_CHAR_flags($bytes, $length, 0); | |
565 | if ($will_overflow) { | |
d402d77f KW |
566 | is($ret, 0, "For $testname: isUTF8_CHAR() returns 0"); |
567 | is($ret_flags, 0, " And isUTF8_CHAR_flags() returns 0"); | |
6aa905cf KW |
568 | } |
569 | else { | |
570 | is($ret, $length, | |
d402d77f KW |
571 | "For $testname: isUTF8_CHAR() returns expected length: $length"); |
572 | is($ret_flags, $length, " And isUTF8_CHAR_flags(...,0)" | |
6aa905cf KW |
573 | . " returns expected length: $length"); |
574 | } | |
9cdc3054 | 575 | is(scalar @warnings_gotten, 0, |
d402d77f KW |
576 | " And neither isUTF8_CHAR() nor isUTF8_CHAR()_flags generated" |
577 | . " any warnings") | |
9cdc3054 | 578 | or output_warnings(@warnings_gotten); |
6aa905cf | 579 | |
9cdc3054 | 580 | undef @warnings_gotten; |
6aa905cf KW |
581 | $ret = test_isSTRICT_UTF8_CHAR($bytes, $length); |
582 | if ($will_overflow) { | |
d402d77f | 583 | is($ret, 0, " And isSTRICT_UTF8_CHAR() returns 0"); |
6aa905cf KW |
584 | } |
585 | else { | |
586 | my $expected_ret = ( $testname =~ /surrogate|non-character/ | |
587 | || $allowed_uv > 0x10FFFF) | |
588 | ? 0 | |
589 | : $length; | |
d402d77f | 590 | is($ret, $expected_ret, " And isSTRICT_UTF8_CHAR() returns" |
6aa905cf KW |
591 | . " expected length: $expected_ret"); |
592 | $ret = test_isUTF8_CHAR_flags($bytes, $length, | |
593 | $::UTF8_DISALLOW_ILLEGAL_INTERCHANGE); | |
594 | is($ret, $expected_ret, | |
d402d77f KW |
595 | " And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_INTERCHANGE')" |
596 | . " acts like isSTRICT_UTF8_CHAR"); | |
6aa905cf | 597 | } |
9cdc3054 | 598 | is(scalar @warnings_gotten, 0, |
d402d77f KW |
599 | " And neither isSTRICT_UTF8_CHAR() nor isUTF8_CHAR_flags" |
600 | . " generated any warnings") | |
9cdc3054 | 601 | or output_warnings(@warnings_gotten); |
6aa905cf | 602 | |
9cdc3054 | 603 | undef @warnings_gotten; |
6aa905cf KW |
604 | $ret = test_isC9_STRICT_UTF8_CHAR($bytes, $length); |
605 | if ($will_overflow) { | |
d402d77f | 606 | is($ret, 0, " And isC9_STRICT_UTF8_CHAR() returns 0"); |
6aa905cf KW |
607 | } |
608 | else { | |
609 | my $expected_ret = ( $testname =~ /surrogate/ | |
610 | || $allowed_uv > 0x10FFFF) | |
611 | ? 0 | |
612 | : $length; | |
d402d77f | 613 | is($ret, $expected_ret, " And isC9_STRICT_UTF8_CHAR()" |
6aa905cf KW |
614 | ." returns expected length: $expected_ret"); |
615 | $ret = test_isUTF8_CHAR_flags($bytes, $length, | |
616 | $::UTF8_DISALLOW_ILLEGAL_C9_INTERCHANGE); | |
617 | is($ret, $expected_ret, | |
d402d77f KW |
618 | " And isUTF8_CHAR_flags('DISALLOW_ILLEGAL_C9_INTERCHANGE')" |
619 | . " acts like isC9_STRICT_UTF8_CHAR"); | |
6aa905cf | 620 | } |
9cdc3054 | 621 | is(scalar @warnings_gotten, 0, |
d402d77f KW |
622 | " And neither isC9_STRICT_UTF8_CHAR() nor isUTF8_CHAR_flags" |
623 | . " generated any warnings") | |
9cdc3054 | 624 | or output_warnings(@warnings_gotten); |
6aa905cf KW |
625 | |
626 | # Test partial character handling, for each byte not a full character | |
627 | for my $j (1.. $length - 1) { | |
628 | ||
629 | # Skip the test for the interaction between overflow and above-31 | |
630 | # bit. It is really testing other things than the partial | |
631 | # character tests, for which other tests in this file are | |
632 | # sufficient | |
04f42bf6 | 633 | last if $will_overflow; |
6aa905cf | 634 | |
69485e19 | 635 | foreach my $disallow_flag (0, $utf8n_flag_to_disallow) { |
6aa905cf KW |
636 | my $partial = substr($bytes, 0, $j); |
637 | my $ret_should_be; | |
638 | my $comment; | |
639 | if ($disallow_flag) { | |
640 | $ret_should_be = 0; | |
641 | $comment = "disallowed"; | |
642 | if ($j < $needed_to_discern_len) { | |
643 | $ret_should_be = 1; | |
644 | $comment .= ", but need $needed_to_discern_len bytes" | |
645 | . " to discern:"; | |
646 | } | |
647 | } | |
648 | else { | |
649 | $ret_should_be = 1; | |
650 | $comment = "allowed"; | |
651 | } | |
652 | ||
9cdc3054 | 653 | undef @warnings_gotten; |
6aa905cf KW |
654 | |
655 | $ret = test_is_utf8_valid_partial_char_flags($partial, $j, | |
656 | $disallow_flag); | |
657 | is($ret, $ret_should_be, | |
d402d77f KW |
658 | " And is_utf8_valid_partial_char_flags(" |
659 | . display_bytes($partial) | |
660 | . "), $comment: returns $ret_should_be"); | |
9cdc3054 | 661 | is(scalar @warnings_gotten, 0, |
d402d77f | 662 | " And is_utf8_valid_partial_char_flags()" |
6aa905cf | 663 | . " generated no warnings") |
9cdc3054 | 664 | or output_warnings(@warnings_gotten); |
6aa905cf KW |
665 | } |
666 | } | |
667 | } | |
668 | ||
69485e19 KW |
669 | # This is more complicated than the malformations tested in other files in |
670 | # this directory, as there are several orthogonal variables involved. We | |
671 | # test most possible combinations | |
672 | foreach my $trial_warning_category ('surrogate', 'nonchar', 'non_unicode') { | |
bf422d6a | 673 | next if $skip_most_tests && $trial_warning_category ne $controlling_warning_category; |
69485e19 KW |
674 | foreach my $do_disallow (0, 1) { |
675 | next if $skip_most_tests && ! $do_disallow; | |
6aa905cf KW |
676 | |
677 | # We try each of the above with various combinations of | |
678 | # malformations that can occur on the same input sequence. | |
679 | foreach my $short ("", "short") { | |
bf422d6a | 680 | next if $skip_most_tests && $short; |
6aa905cf KW |
681 | foreach my $unexpected_noncont ("", |
682 | "unexpected non-continuation") | |
683 | { | |
bf422d6a | 684 | next if $skip_most_tests && $unexpected_noncont; |
6aa905cf | 685 | foreach my $overlong ("", "overlong") { |
bf422d6a | 686 | next if $overlong && $skip_most_tests; |
6aa905cf | 687 | |
b3169593 KW |
688 | # If we're creating an overlong, it can't be longer than |
689 | # the maximum length, so skip if we're already at that | |
690 | # length. | |
691 | next if $overlong && $length >= $::max_bytes; | |
6aa905cf | 692 | |
69485e19 KW |
693 | # We classify the warnings into certain "interesting" types, |
694 | # described later | |
695 | foreach my $warning_type (0..4) { | |
696 | next if $skip_most_tests && $warning_type != 1; | |
697 | foreach my $use_warn_flag (0, 1) { | |
698 | next if $skip_most_tests && ! $use_warn_flag; | |
699 | ||
700 | my $this_utf8n_flag_to_warn = $utf8n_flag_to_warn; | |
701 | my $this_utf8n_flag_to_disallow = $utf8n_flag_to_disallow; | |
702 | my $this_uvchr_flag_to_warn = $uvchr_flag_to_warn; | |
703 | my $this_uvchr_flag_to_disallow = $uvchr_flag_to_disallow; | |
704 | ||
705 | my $eval_warn; | |
706 | my $expect_regular_warnings; | |
707 | my $expect_warnings_for_malformed; | |
708 | my $expect_warnings_for_overflow; | |
709 | ||
710 | if ($warning_type == 0) { | |
711 | $eval_warn = "use warnings; no warnings 'deprecated'"; | |
712 | $expect_regular_warnings = $use_warn_flag; | |
713 | $expect_warnings_for_overflow = 1; | |
714 | $expect_warnings_for_malformed = 1; | |
6aa905cf | 715 | } |
69485e19 KW |
716 | elsif ($warning_type == 1) { |
717 | $eval_warn = "no warnings"; | |
718 | $expect_regular_warnings = 0; | |
719 | $expect_warnings_for_overflow = 0; | |
720 | $expect_warnings_for_malformed = 0; | |
721 | } | |
722 | elsif ($warning_type == 2) { | |
723 | $eval_warn = "no warnings; use warnings 'utf8'"; | |
724 | $expect_regular_warnings = $use_warn_flag; | |
725 | $expect_warnings_for_overflow = 1; | |
726 | $expect_warnings_for_malformed = 1; | |
727 | } | |
728 | elsif ($warning_type == 3) { | |
729 | $eval_warn = "no warnings; use warnings" | |
730 | . " '$trial_warning_category'"; | |
731 | $expect_regular_warnings | |
732 | = ( $trial_warning_category eq $controlling_warning_category | |
733 | && $use_warn_flag); | |
734 | $expect_warnings_for_overflow | |
735 | = $trial_warning_category eq 'non_unicode'; | |
736 | $expect_warnings_for_malformed = 0; | |
737 | } | |
738 | elsif ($warning_type == 4) { # Like type 3, but uses the | |
739 | # above-31-bit flags | |
740 | # The complement flags were set up so that the | |
741 | # above-31-bit flags have been tested that they don't | |
742 | # trigger wrongly for too small code points. And the | |
743 | # flags have been set up so that those small code | |
744 | # points are tested for being above Unicode. What's | |
745 | # left to test is that the large code points do | |
746 | # trigger the above-31-bit flags. | |
747 | next if ! $will_overflow && $allowed_uv < 0x80000000; | |
748 | next if $trial_warning_category ne 'non_unicode'; | |
749 | $eval_warn = "no warnings; use warnings 'non_unicode'"; | |
750 | $expect_regular_warnings = 1; | |
751 | $expect_warnings_for_overflow = 1; | |
752 | $expect_warnings_for_malformed = 0; | |
753 | $this_utf8n_flag_to_warn = $::UTF8_WARN_ABOVE_31_BIT; | |
754 | $this_utf8n_flag_to_disallow | |
755 | = $::UTF8_DISALLOW_ABOVE_31_BIT; | |
756 | $this_uvchr_flag_to_warn = $::UNICODE_WARN_ABOVE_31_BIT; | |
757 | $this_uvchr_flag_to_disallow | |
758 | = $::UNICODE_DISALLOW_ABOVE_31_BIT; | |
601e92f1 KW |
759 | } |
760 | else { | |
69485e19 | 761 | die "Unexpected warning type '$warning_type'"; |
601e92f1 KW |
762 | } |
763 | ||
69485e19 KW |
764 | # We only need to test the case where all warnings are |
765 | # enabled (type 0) to see if turning off the warning flag | |
766 | # causes things to not be output. If those pass, then | |
767 | # turning on some sub-category of warnings, or turning off | |
768 | # warnings altogether are extremely likely to not output | |
769 | # warnings either, given how the warnings subsystem is | |
770 | # supposed to work, and this file assumes it does work. | |
771 | next if $warning_type != 0 && ! $use_warn_flag; | |
772 | ||
773 | # The convention is that the 'got' flag is the same value | |
774 | # as the disallow one. If this were violated, the tests | |
775 | # here should start failing. | |
776 | my $return_flag = $this_utf8n_flag_to_disallow; | |
777 | ||
778 | my $this_warning_flags = ($use_warn_flag) | |
779 | ? $this_utf8n_flag_to_warn | |
780 | : 0; | |
781 | my $this_disallow_flags = ($do_disallow) | |
782 | ? $this_utf8n_flag_to_disallow | |
783 | : 0; | |
6aa905cf KW |
784 | my $this_bytes = $bytes; |
785 | my $this_length = $length; | |
786 | my $expected_uv = $allowed_uv; | |
b3169593 | 787 | my $this_expected_len = $length; |
6aa905cf | 788 | my $this_needed_to_discern_len = $needed_to_discern_len; |
6aa905cf | 789 | |
69485e19 KW |
790 | my @malformation_names; |
791 | my @expected_warnings; | |
792 | my @expected_return_flags; | |
6aa905cf | 793 | |
69485e19 KW |
794 | # Now go through the possible malformations wanted, and |
795 | # change the input accordingly. We also can set up | |
796 | # certain other things now, like whether we expect a | |
797 | # return flag from this malformation and which flag. | |
798 | if ($overlong) { | |
6aa905cf KW |
799 | # For an overlong, we convert the original |
800 | # start byte into a continuation byte with | |
801 | # the same data bits as originally. ... | |
802 | substr($this_bytes, 0, 1) | |
803 | = start_byte_to_cont(substr($this_bytes, | |
804 | 0, 1)); | |
805 | ||
806 | # ... Then we prepend it with a known | |
807 | # overlong sequence. This should evaluate | |
808 | # to the exact same code point as the | |
809 | # original. | |
810 | $this_bytes | |
811 | = I8_to_native("\xff") | |
dbb8d798 | 812 | . (I8_to_native(chr $::lowest_continuation) |
6aa905cf KW |
813 | x ( $::max_bytes - 1 - length($this_bytes))) |
814 | . $this_bytes; | |
815 | $this_length = length($this_bytes); | |
816 | $this_needed_to_discern_len | |
817 | = $::max_bytes - ($this_expected_len | |
818 | - $this_needed_to_discern_len); | |
819 | $this_expected_len = $::max_bytes; | |
9cdc3054 | 820 | push @expected_return_flags, $::UTF8_GOT_LONG; |
69485e19 KW |
821 | push @malformation_names, $overlong; |
822 | if ($expect_warnings_for_malformed) { | |
823 | if ( ! $short | |
824 | && ! $unexpected_noncont | |
825 | && ! $will_overflow) | |
826 | { | |
827 | my $overlong_bytes | |
828 | = display_bytes_no_quotes($this_bytes); | |
829 | my $correct_bytes | |
830 | = display_bytes_no_quotes($bytes); | |
831 | push @expected_warnings, | |
832 | qr/\QMalformed UTF-8 character:\E | |
833 | \Q $overlong_bytes (overlong;\E | |
834 | \Q instead use $correct_bytes to\E | |
835 | \Q represent U+$uv_string)/x; | |
836 | } | |
837 | else { | |
838 | push @expected_warnings, qr/overlong/; | |
839 | } | |
6aa905cf | 840 | } |
69485e19 | 841 | } |
6aa905cf | 842 | |
69485e19 KW |
843 | if ($short) { |
844 | push @malformation_names, $short; | |
845 | push @expected_warnings, qr/short/ | |
846 | if $expect_warnings_for_malformed; | |
847 | ||
848 | # To force this malformation, just tell the test to | |
849 | # not look as far as it should into the input. | |
850 | $this_length--; | |
851 | $this_expected_len--; | |
852 | push @expected_return_flags, $::UTF8_GOT_SHORT; | |
853 | } | |
854 | ||
855 | if ($unexpected_noncont) { | |
856 | push @malformation_names, $unexpected_noncont; | |
857 | push @expected_warnings, qr/$unexpected_noncont/ | |
858 | if $expect_warnings_for_malformed; | |
859 | ||
860 | # To force this malformation, change the final | |
861 | # continuation byte into a non continuation. | |
862 | my $pos = ($short) ? -2 : -1; | |
863 | substr($this_bytes, $pos, 1) = '?'; | |
864 | $this_expected_len--; | |
865 | push @expected_return_flags, | |
866 | $::UTF8_GOT_NON_CONTINUATION; | |
867 | } | |
868 | ||
869 | # The overflow malformation is done differently than other | |
870 | # malformations. It comes from manually typed tests in | |
871 | # the test array, but it also is above Unicode and uses | |
872 | # Perl extended UTF-8, so affects some of the flags being | |
873 | # tested. We now make it be treated like one of the other | |
874 | # generated malformations. | |
875 | if ($will_overflow) { | |
876 | ||
877 | # An overflow is (way) above Unicode, and overrides | |
878 | # everything else. | |
879 | $expect_regular_warnings = 0; | |
880 | ||
881 | push @malformation_names, 'overflow'; | |
882 | if ($expect_warnings_for_overflow) { | |
883 | my $qr = display_bytes_no_quotes( | |
884 | substr($this_bytes, 0, $this_expected_len)); | |
885 | $qr = qr/\QMalformed UTF-8 character: \E | |
886 | \Q$qr (overflows)\E/x; | |
887 | push @expected_warnings, $qr; | |
6aa905cf | 888 | } |
69485e19 | 889 | push @expected_return_flags, $::UTF8_GOT_OVERFLOW; |
6aa905cf KW |
890 | } |
891 | ||
69485e19 KW |
892 | # Here, we've set things up based on the malformations. |
893 | # Now generate the text for them for the test name. | |
894 | my $malformations_name = ""; | |
895 | if (@malformation_names) { | |
896 | $malformations_name .= "malformation"; | |
897 | $malformations_name .= "s" if @malformation_names > 1; | |
898 | $malformations_name .= ": "; | |
899 | $malformations_name .= join "/", @malformation_names; | |
900 | $malformations_name = " ($malformations_name)"; | |
901 | } | |
902 | ||
903 | # It may be that the malformations have shortened the | |
904 | # amount of input we look at so much that we can't tell | |
905 | # what the category the code point was in. Otherwise, set | |
906 | # up the expected return flags based on the warnings and | |
907 | # disallowments. | |
908 | if ($this_expected_len < $this_needed_to_discern_len) { | |
909 | $expect_regular_warnings = 0; | |
910 | } | |
911 | elsif ( ($this_warning_flags & $this_utf8n_flag_to_warn) | |
912 | || ( $this_disallow_flags | |
913 | & $this_utf8n_flag_to_disallow)) | |
914 | { | |
915 | push @expected_return_flags, $return_flag; | |
916 | } | |
917 | ||
918 | # Finish setting up the expected warning. | |
919 | if ($expect_regular_warnings) { | |
920 | ||
921 | # So far the array contains warnings generated by | |
922 | # malformations. Add the expected regular one. | |
923 | unshift @expected_warnings, $cp_message_qr; | |
924 | ||
925 | # But it may need to be modified, because either of | |
926 | # these malformations means we can't determine the | |
927 | # expected code point. | |
928 | if ($short || $unexpected_noncont) { | |
929 | my $first_byte = substr($this_bytes, 0, 1); | |
930 | $expected_warnings[0] = display_bytes( | |
931 | substr($this_bytes, 0, $this_expected_len)); | |
932 | $expected_warnings[0] | |
933 | = qr/[Aa]\Qny UTF-8 sequence that starts with\E | |
934 | \Q $expected_warnings[0]\E | |
935 | \Q $non_cp_trailing_text\E/x; | |
936 | } | |
937 | } | |
6aa905cf KW |
938 | |
939 | # Is effectively disallowed if we've set up a | |
940 | # malformation, even if the flag indicates it is | |
941 | # allowed. Fix up test name to indicate this as | |
942 | # well | |
69485e19 KW |
943 | my $disallowed = ( $this_disallow_flags |
944 | & $this_utf8n_flag_to_disallow) | |
945 | || $malformations_name; | |
6aa905cf | 946 | my $this_name = "utf8n_to_uvchr_error() $testname: " |
69485e19 KW |
947 | . (($disallowed) |
948 | ? 'disallowed' | |
949 | : 'allowed'); | |
6aa905cf | 950 | $this_name .= ", $eval_warn"; |
69485e19 KW |
951 | $this_name .= ", " . (( $this_warning_flags |
952 | & $this_utf8n_flag_to_warn) | |
953 | ? 'with flag for raising warnings' | |
954 | : 'no flag for raising warnings'); | |
955 | $this_name .= $malformations_name; | |
db0f09e6 | 956 | |
9cdc3054 | 957 | undef @warnings_gotten; |
6aa905cf | 958 | my $ret_ref; |
69485e19 | 959 | my $this_flags = $this_warning_flags|$this_disallow_flags; |
6aa905cf KW |
960 | my $eval_text = "$eval_warn; \$ret_ref" |
961 | . " = test_utf8n_to_uvchr_error(" | |
69485e19 | 962 | . "'$this_bytes', $this_length, $this_flags)"; |
6aa905cf | 963 | eval "$eval_text"; |
69485e19 | 964 | if (! ok ("$@ eq ''", "$this_name: eval succeeded")) |
6aa905cf | 965 | { |
a8ee5133 KW |
966 | diag "\$@='$@'; call was: " |
967 | . utf8n_display_call($eval_text); | |
6aa905cf KW |
968 | next; |
969 | } | |
970 | if ($disallowed) { | |
d402d77f | 971 | is($ret_ref->[0], 0, " And returns 0") |
a8ee5133 | 972 | or diag "Call was: " . utf8n_display_call($eval_text); |
6aa905cf KW |
973 | } |
974 | else { | |
975 | is($ret_ref->[0], $expected_uv, | |
d402d77f KW |
976 | " And returns expected uv: " |
977 | . $uv_string) | |
a8ee5133 | 978 | or diag "Call was: " . utf8n_display_call($eval_text); |
6aa905cf KW |
979 | } |
980 | is($ret_ref->[1], $this_expected_len, | |
d402d77f | 981 | " And returns expected length:" |
6aa905cf | 982 | . " $this_expected_len") |
a8ee5133 | 983 | or diag "Call was: " . utf8n_display_call($eval_text); |
6aa905cf | 984 | |
9cdc3054 | 985 | my $returned_flags = $ret_ref->[2]; |
6aa905cf | 986 | |
9cdc3054 | 987 | for (my $i = @expected_return_flags - 1; $i >= 0; $i--) { |
69485e19 KW |
988 | if ($expected_return_flags[$i] & $returned_flags) { |
989 | if ($expected_return_flags[$i] | |
990 | == $::UTF8_DISALLOW_ABOVE_31_BIT) | |
991 | { | |
992 | pass(" Expected and got return flag for" | |
993 | . " above_31_bit"); | |
994 | } | |
995 | # The first entries in this are | |
996 | # malformations | |
997 | elsif ($i > @malformation_names - 1) { | |
998 | pass(" Expected and got return flag" | |
999 | . " for " . $trial_warning_category); | |
1000 | } | |
1001 | else { | |
1002 | pass(" Expected and got return flag for " | |
1003 | . $malformation_names[$i] | |
1004 | . " malformation"); | |
1005 | } | |
9cdc3054 | 1006 | $returned_flags &= ~$expected_return_flags[$i]; |
69485e19 | 1007 | splice @expected_return_flags, $i, 1; |
6aa905cf | 1008 | } |
6aa905cf | 1009 | } |
6aa905cf | 1010 | |
69485e19 KW |
1011 | is($returned_flags, 0, |
1012 | " Got no unexpected return flags") | |
1013 | or diag "The unexpected flags gotten were: " | |
5722c46d KW |
1014 | . (flags_to_text($returned_flags, |
1015 | \@utf8n_flags_to_text) | |
69485e19 KW |
1016 | # We strip off any prefixes from the flag |
1017 | # names | |
1018 | =~ s/ \b [A-Z] _ //xgr); | |
1019 | is (scalar @expected_return_flags, 0, | |
1020 | " Got all expected return flags") | |
1021 | or diag "The expected flags not gotten were: " | |
1022 | . (flags_to_text(eval join("|", | |
1023 | @expected_return_flags), | |
1024 | \@utf8n_flags_to_text) | |
1025 | # We strip off any prefixes from the flag | |
1026 | # names | |
5722c46d | 1027 | =~ s/ \b [A-Z] _ //xgr); |
9cdc3054 | 1028 | |
69485e19 KW |
1029 | do_warnings_test(@expected_warnings) |
1030 | or diag "Call was: " . utf8n_display_call($eval_text); | |
1031 | undef @warnings_gotten; | |
6aa905cf KW |
1032 | |
1033 | # Check CHECK_ONLY results when the input is | |
1034 | # disallowed. Do this when actually disallowed, | |
69485e19 | 1035 | # not just when the $this_disallow_flags is set |
6aa905cf | 1036 | if ($disallowed) { |
69485e19 KW |
1037 | my $this_flags = $this_disallow_flags|$::UTF8_CHECK_ONLY; |
1038 | my $eval_text = "use warnings; no warnings 'deprecated'; \$ret_ref =" | |
1039 | . " test_utf8n_to_uvchr_error('" | |
1040 | . "$this_bytes', $this_length," | |
1041 | . " $this_flags)"; | |
1042 | eval $eval_text; | |
a8ee5133 KW |
1043 | if (! ok ("$@ eq ''", |
1044 | " And eval succeeded with CHECK_ONLY")) | |
1045 | { | |
1046 | diag "\$@='$@'; Call was: " | |
1047 | . utf8n_display_call($eval_text); | |
1048 | next; | |
1049 | } | |
d402d77f | 1050 | is($ret_ref->[0], 0, " CHECK_ONLY: Returns 0") |
a8ee5133 | 1051 | or diag "Call was: " . utf8n_display_call($eval_text); |
6aa905cf | 1052 | is($ret_ref->[1], -1, |
d402d77f | 1053 | " CHECK_ONLY: returns -1 for length") |
a8ee5133 | 1054 | or diag "Call was: " . utf8n_display_call($eval_text); |
9cdc3054 | 1055 | if (! is(scalar @warnings_gotten, 0, |
d402d77f | 1056 | " CHECK_ONLY: no warnings generated")) |
6aa905cf | 1057 | { |
a8ee5133 | 1058 | diag "Call was: " . utf8n_display_call($eval_text); |
9cdc3054 | 1059 | output_warnings(@warnings_gotten); |
6aa905cf KW |
1060 | } |
1061 | } | |
1062 | ||
1063 | # Now repeat some of the above, but for | |
1064 | # uvchr_to_utf8_flags(). Since this comes from an | |
1065 | # existing code point, it hasn't overflowed, and | |
1066 | # isn't malformed. | |
69485e19 KW |
1067 | next if @malformation_names; |
1068 | ||
1069 | $this_warning_flags = ($use_warn_flag) | |
1070 | ? $this_uvchr_flag_to_warn | |
1071 | : 0; | |
1072 | $this_disallow_flags = ($do_disallow) | |
1073 | ? $this_uvchr_flag_to_disallow | |
1074 | : 0; | |
1075 | ||
1076 | $disallowed = $this_disallow_flags | |
1077 | & $this_uvchr_flag_to_disallow; | |
1078 | $this_name .= ", " . (( $this_warning_flags | |
1079 | & $this_utf8n_flag_to_warn) | |
1080 | ? 'with flag for raising warnings' | |
1081 | : 'no flag for raising warnings'); | |
6aa905cf KW |
1082 | |
1083 | $this_name = "uvchr_to_utf8_flags() $testname: " | |
69485e19 | 1084 | . (($disallowed) |
6aa905cf | 1085 | ? 'disallowed' |
6aa905cf KW |
1086 | : 'allowed'); |
1087 | $this_name .= ", $eval_warn"; | |
69485e19 KW |
1088 | $this_name .= ", " . (( $this_warning_flags |
1089 | & $this_uvchr_flag_to_warn) | |
6aa905cf KW |
1090 | ? 'with warning flag' |
1091 | : 'no warning flag'); | |
1092 | ||
9cdc3054 | 1093 | undef @warnings_gotten; |
6aa905cf | 1094 | my $ret; |
69485e19 | 1095 | $this_flags = $this_warning_flags|$this_disallow_flags; |
6aa905cf KW |
1096 | $eval_text = "$eval_warn; \$ret =" |
1097 | . " test_uvchr_to_utf8_flags(" | |
d884ea32 | 1098 | . "$allowed_uv, $this_flags)"; |
6aa905cf KW |
1099 | eval "$eval_text"; |
1100 | if (! ok ("$@ eq ''", "$this_name: eval succeeded")) | |
1101 | { | |
d884ea32 KW |
1102 | diag "\$@='$@'; call was: " |
1103 | . uvchr_display_call($eval_text); | |
6aa905cf KW |
1104 | next; |
1105 | } | |
1106 | if ($disallowed) { | |
d402d77f | 1107 | is($ret, undef, " And returns undef") |
d884ea32 | 1108 | or diag "Call was: " . uvchr_display_call($eval_text); |
6aa905cf KW |
1109 | } |
1110 | else { | |
d402d77f | 1111 | is($ret, $this_bytes, " And returns expected string") |
d884ea32 | 1112 | or diag "Call was: " . uvchr_display_call($eval_text); |
6aa905cf | 1113 | } |
69485e19 KW |
1114 | |
1115 | do_warnings_test(@expected_warnings) | |
1116 | or diag "Call was: " . uvchr_display_call($eval_text); | |
6aa905cf KW |
1117 | } |
1118 | } | |
1119 | } | |
1120 | } | |
1121 | } | |
1122 | } | |
1123 | } | |
1124 | } | |
1125 | ||
1126 | done_testing; |