Commit | Line | Data |
---|---|---|
adfe19db MHM |
1 | ################################################################################ |
2 | ## | |
b2049988 | 3 | ## Version 3.x, Copyright (C) 2004-2013, Marcus Holland-Moritz. |
adfe19db MHM |
4 | ## Version 2.x, Copyright (C) 2001, Paul Marquess. |
5 | ## Version 1.x, Copyright (C) 1999, Kenneth Albanowski. | |
6 | ## | |
7 | ## This program is free software; you can redistribute it and/or | |
8 | ## modify it under the same terms as Perl itself. | |
9 | ## | |
10 | ################################################################################ | |
11 | ||
12 | =provides | |
13 | ||
adfe19db | 14 | __UNDEFINED__ |
e28192fd | 15 | my_strnlen |
679ad62d | 16 | SvUOK |
aadf4f9e N |
17 | utf8_to_uvchr_buf |
18 | ||
19 | =dontwarn | |
20 | ||
21 | _ppport_utf8_to_uvchr_buf_callee | |
22 | _ppport_MIN | |
adfe19db MHM |
23 | |
24 | =implementation | |
25 | ||
aadf4f9e N |
26 | #define _ppport_MIN(a,b) (((a) <= (b)) ? (a) : (b)) |
27 | ||
0d0f8426 MHM |
28 | __UNDEFINED__ sv_setuv(sv, uv) \ |
29 | STMT_START { \ | |
30 | UV TeMpUv = uv; \ | |
31 | if (TeMpUv <= IV_MAX) \ | |
32 | sv_setiv(sv, TeMpUv); \ | |
33 | else \ | |
34 | sv_setnv(sv, (double)TeMpUv); \ | |
35 | } STMT_END | |
36 | ||
37 | __UNDEFINED__ newSVuv(uv) ((uv) <= IV_MAX ? newSViv((IV)uv) : newSVnv((NV)uv)) | |
adfe19db MHM |
38 | |
39 | __UNDEFINED__ sv_2uv(sv) ((PL_Sv = (sv)), (UV) (SvNOK(PL_Sv) ? SvNV(PL_Sv) : sv_2nv(PL_Sv))) | |
40 | __UNDEFINED__ SvUVX(sv) ((UV)SvIVX(sv)) | |
41 | __UNDEFINED__ SvUVXx(sv) SvUVX(sv) | |
42 | __UNDEFINED__ SvUV(sv) (SvIOK(sv) ? SvUVX(sv) : sv_2uv(sv)) | |
43 | __UNDEFINED__ SvUVx(sv) ((PL_Sv = (sv)), SvUV(PL_Sv)) | |
44 | ||
45 | /* Hint: sv_uv | |
46 | * Always use the SvUVx() macro instead of sv_uv(). | |
47 | */ | |
48 | __UNDEFINED__ sv_uv(sv) SvUVx(sv) | |
49 | ||
679ad62d MHM |
50 | #if !defined(SvUOK) && defined(SvIOK_UV) |
51 | # define SvUOK(sv) SvIOK_UV(sv) | |
52 | #endif | |
53 | ||
adfe19db MHM |
54 | __UNDEFINED__ XST_mUV(i,v) (ST(i) = sv_2mortal(newSVuv(v)) ) |
55 | __UNDEFINED__ XSRETURN_UV(v) STMT_START { XST_mUV(0,v); XSRETURN(1); } STMT_END | |
56 | ||
96ad942f MHM |
57 | __UNDEFINED__ PUSHu(u) STMT_START { sv_setuv(TARG, (UV)(u)); PUSHTARG; } STMT_END |
58 | __UNDEFINED__ XPUSHu(u) STMT_START { sv_setuv(TARG, (UV)(u)); XPUSHTARG; } STMT_END | |
59 | ||
f2e3e4ce | 60 | #if defined UTF8SKIP |
94a2c4f7 KW |
61 | |
62 | /* Don't use official version because it uses MIN, which may not be available */ | |
63 | #undef UTF8_SAFE_SKIP | |
64 | ||
aadf4f9e N |
65 | __UNDEFINED__ UTF8_SAFE_SKIP(s, e) ( \ |
66 | ((((e) - (s)) <= 0) \ | |
f2e3e4ce | 67 | ? 0 \ |
aadf4f9e | 68 | : _ppport_MIN(((e) - (s)), UTF8SKIP(s)))) |
f2e3e4ce KW |
69 | #endif |
70 | ||
e28192fd KW |
71 | #if !defined(my_strnlen) |
72 | #if { NEED my_strnlen } | |
73 | ||
74 | STRLEN | |
75 | my_strnlen(const char *str, Size_t maxlen) | |
76 | { | |
77 | const char *p = str; | |
78 | ||
79 | while(maxlen-- && *p) | |
80 | p++; | |
81 | ||
82 | return p - str; | |
83 | } | |
84 | ||
85 | #endif | |
86 | #endif | |
4c28bdc5 | 87 | |
97d83eb2 | 88 | #if { VERSION < 5.31.3 } |
aadf4f9e N |
89 | /* Versions prior to this accepted things that are now considered |
90 | * malformations, and didn't return -1 on error with warnings enabled | |
91 | * */ | |
92 | # undef utf8_to_uvchr_buf | |
93 | #endif | |
4c28bdc5 | 94 | |
aadf4f9e N |
95 | /* This implementation brings modern, generally more restricted standards to |
96 | * utf8_to_uvchr_buf. Some of these are security related, and clearly must | |
97 | * be done. But its arguable that the others need not, and hence should not. | |
98 | * The reason they're here is that a module that intends to play with the | |
99 | * latest perls shoud be able to work the same in all releases. An example is | |
100 | * that perl no longer accepts any UV for a code point, but limits them to | |
101 | * IV_MAX or below. This is for future internal use of the larger code points. | |
102 | * If it turns out that some of these changes are breaking code that isn't | |
103 | * intended to work with modern perls, the tighter restrictions could be | |
104 | * relaxed. khw thinks this is unlikely, but has been wrong in the past. */ | |
105 | ||
9aa6a863 N |
106 | #if { VERSION < 5.10.0 } |
107 | # define D_PPP_CU8 U8 | |
108 | #else | |
109 | # define D_PPP_CU8 const U8 | |
110 | #endif | |
111 | ||
aadf4f9e N |
112 | #ifndef utf8_to_uvchr_buf |
113 | /* Choose which underlying implementation to use. At least one must be | |
114 | * present or the perl is too early to handle this function */ | |
115 | # if defined(utf8n_to_uvchr) || defined(utf8_to_uv) | |
116 | # if defined(utf8n_to_uvchr) /* This is the preferred implementation */ | |
117 | # define _ppport_utf8_to_uvchr_buf_callee utf8n_to_uvchr | |
9aa6a863 | 118 | # elif { VERSION >= 5.6.1 } |
aadf4f9e N |
119 | # define _ppport_utf8_to_uvchr_buf_callee utf8_to_uv |
120 | # endif | |
121 | ||
122 | # endif | |
123 | ||
124 | #ifdef _ppport_utf8_to_uvchr_buf_callee | |
125 | # if { NEED utf8_to_uvchr_buf } | |
4c28bdc5 | 126 | |
aadf4f9e | 127 | UV |
9aa6a863 | 128 | utf8_to_uvchr_buf(pTHX_ D_PPP_CU8 *s, const U8 *send, STRLEN *retlen) |
aadf4f9e N |
129 | { |
130 | UV ret; | |
131 | STRLEN curlen; | |
132 | bool overflows = 0; | |
133 | const U8 *cur_s = s; | |
134 | const bool do_warnings = ckWARN_d(WARN_UTF8); | |
135 | ||
136 | if (send > s) { | |
137 | curlen = send - s; | |
138 | } | |
139 | else { | |
140 | assert(0); /* Modern perls die under this circumstance */ | |
141 | curlen = 0; | |
142 | if (! do_warnings) { /* Handle empty here if no warnings needed */ | |
143 | if (retlen) *retlen = 0; | |
144 | return UNICODE_REPLACEMENT; | |
145 | } | |
146 | } | |
147 | ||
148 | /* The modern version allows anything that evaluates to a legal UV, but not | |
149 | * overlongs nor an empty input */ | |
150 | ret = _ppport_utf8_to_uvchr_buf_callee( | |
151 | s, curlen, retlen, (UTF8_ALLOW_ANYUV | |
152 | & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY))); | |
153 | ||
154 | /* But actually, modern versions restrict the UV to being no more than what | |
155 | * an IV can hold */ | |
156 | if (ret > PERL_INT_MAX) { | |
157 | overflows = 1; | |
158 | } | |
159 | ||
160 | # if { VERSION < 5.26.0 } | |
161 | # ifndef EBCDIC | |
162 | ||
163 | /* There are bugs in versions earlier than this on non-EBCDIC platforms | |
164 | * in which it did not detect all instances of overflow, which could be | |
165 | * a security hole. Also, earlier versions did not allow the overflow | |
166 | * malformation under any circumstances, and modern ones do. So we | |
167 | * need to check here. */ | |
168 | ||
169 | else if (curlen > 0 && *s >= 0xFE) { | |
170 | ||
171 | /* If the main routine detected overflow, great; it returned 0. But if the | |
172 | * input's first byte indicates it could overflow, we need to verify. | |
173 | * First, on a 32-bit machine the first byte being at least \xFE | |
174 | * automatically is overflow */ | |
175 | if (sizeof(ret) < 8) { | |
176 | overflows = 1; | |
177 | } | |
178 | else { | |
179 | const U8 highest[] = /* 2*63-1 */ | |
180 | "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"; | |
181 | const U8 *cur_h = highest; | |
182 | ||
183 | for (cur_s = s; cur_s < send; cur_s++, cur_h++) { | |
184 | if (UNLIKELY(*cur_s == *cur_h)) { | |
185 | continue; | |
186 | } | |
187 | ||
188 | /* If this byte is larger than the corresponding highest UTF-8 | |
189 | * byte, the sequence overflows; otherwise the byte is less than | |
190 | * (as we handled the equality case above), and so the sequence | |
191 | * doesn't overflow */ | |
192 | overflows = *cur_s > *cur_h; | |
193 | break; | |
194 | ||
195 | } | |
196 | ||
197 | /* Here, either we set the bool and broke out of the loop, or got | |
198 | * to the end and all bytes are the same which indicates it doesn't | |
199 | * overflow. */ | |
200 | } | |
201 | } | |
202 | ||
203 | # endif | |
204 | # endif /* < 5.26 */ | |
205 | ||
206 | if (UNLIKELY(overflows)) { | |
207 | if (! do_warnings) { | |
208 | if (retlen) { | |
209 | *retlen = _ppport_MIN(*retlen, UTF8SKIP(s)); | |
210 | *retlen = _ppport_MIN(*retlen, curlen); | |
211 | } | |
212 | return UNICODE_REPLACEMENT; | |
213 | } | |
214 | else { | |
215 | ||
216 | /* On versions that correctly detect overflow, but forbid it | |
217 | * always, 0 will be returned, but also a warning will have been | |
218 | * raised. Don't repeat it */ | |
219 | if (ret != 0) { | |
220 | /* We use the error message in use from 5.8-5.14 */ | |
221 | Perl_warner(aTHX_ packWARN(WARN_UTF8), | |
222 | "Malformed UTF-8 character (overflow at 0x%" UVxf | |
223 | ", byte 0x%02x, after start byte 0x%02x)", | |
224 | ret, *cur_s, *s); | |
225 | } | |
226 | if (retlen) { | |
227 | *retlen = (STRLEN) -1; | |
228 | } | |
229 | return 0; | |
230 | } | |
231 | } | |
232 | ||
233 | /* If failed and warnings are off, to emulate the behavior of the real | |
234 | * utf8_to_uvchr(), try again, allowing anything. (Note a return of 0 is | |
235 | * ok if the input was '\0') */ | |
236 | if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) { | |
237 | ||
238 | /* If curlen is 0, we already handled the case where warnings are | |
239 | * disabled, so this 'if' will be true, and we won't look at the | |
240 | * contents of 's' */ | |
241 | if (do_warnings) { | |
242 | *retlen = (STRLEN) -1; | |
243 | } | |
244 | else { | |
245 | ret = _ppport_utf8_to_uvchr_buf_callee( | |
246 | s, curlen, retlen, UTF8_ALLOW_ANY); | |
247 | /* Override with the REPLACEMENT character, as that is what the | |
248 | * modern version of this function returns */ | |
249 | ret = UNICODE_REPLACEMENT; | |
250 | ||
251 | # if { VERSION < 5.16.0 } | |
252 | ||
253 | /* Versions earlier than this don't necessarily return the proper | |
254 | * length. It should not extend past the end of string, nor past | |
255 | * what the first byte indicates the length is, nor past the | |
256 | * continuation characters */ | |
257 | if (retlen && *retlen >= 0) { | |
f1590d76 | 258 | unsigned int i = 1; |
aadf4f9e N |
259 | *retlen = _ppport_MIN(*retlen, curlen); |
260 | *retlen = _ppport_MIN(*retlen, UTF8SKIP(s)); | |
aadf4f9e N |
261 | do { |
262 | if (s[i] < 0x80 || s[i] > 0xBF) { | |
263 | *retlen = i; | |
264 | break; | |
265 | } | |
266 | } while (++i < *retlen); | |
267 | } | |
268 | ||
269 | # endif | |
270 | ||
271 | } | |
272 | } | |
273 | ||
274 | return ret; | |
275 | } | |
e28192fd | 276 | |
aadf4f9e N |
277 | # endif |
278 | #endif | |
279 | #endif | |
39d7245c | 280 | |
aadf4f9e N |
281 | #if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf) |
282 | #undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses | |
283 | to read past a NUL, making it much less likely to read | |
284 | off the end of the buffer. A NUL indicates the start | |
285 | of the next character anyway. If the input isn't | |
286 | NUL-terminated, the function remains unsafe, as it | |
287 | always has been. */ | |
39d7245c KW |
288 | |
289 | __UNDEFINED__ utf8_to_uvchr(s, lp) \ | |
290 | ((*(s) == '\0') \ | |
291 | ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */ \ | |
292 | : utf8_to_uvchr_buf(s, (s) + my_strnlen((char *) (s), UTF8SKIP(s)), (lp))) | |
293 | ||
aadf4f9e N |
294 | #endif |
295 | ||
e28192fd KW |
296 | =xsinit |
297 | ||
298 | #define NEED_my_strnlen | |
aadf4f9e | 299 | #define NEED_utf8_to_uvchr_buf |
e28192fd | 300 | |
adfe19db MHM |
301 | =xsubs |
302 | ||
303 | SV * | |
304 | sv_setuv(uv) | |
b2049988 MHM |
305 | UV uv |
306 | CODE: | |
307 | RETVAL = newSViv(1); | |
308 | sv_setuv(RETVAL, uv); | |
309 | OUTPUT: | |
310 | RETVAL | |
adfe19db MHM |
311 | |
312 | SV * | |
313 | newSVuv(uv) | |
b2049988 MHM |
314 | UV uv |
315 | CODE: | |
316 | RETVAL = newSVuv(uv); | |
317 | OUTPUT: | |
318 | RETVAL | |
adfe19db MHM |
319 | |
320 | UV | |
321 | sv_2uv(sv) | |
b2049988 MHM |
322 | SV *sv |
323 | CODE: | |
324 | RETVAL = sv_2uv(sv); | |
325 | OUTPUT: | |
326 | RETVAL | |
adfe19db MHM |
327 | |
328 | UV | |
329 | SvUVx(sv) | |
b2049988 MHM |
330 | SV *sv |
331 | CODE: | |
332 | sv--; | |
333 | RETVAL = SvUVx(++sv); | |
334 | OUTPUT: | |
335 | RETVAL | |
adfe19db MHM |
336 | |
337 | void | |
338 | XSRETURN_UV() | |
b2049988 MHM |
339 | PPCODE: |
340 | XSRETURN_UV(42); | |
adfe19db | 341 | |
96ad942f MHM |
342 | void |
343 | PUSHu() | |
b2049988 MHM |
344 | PREINIT: |
345 | dTARG; | |
346 | PPCODE: | |
347 | TARG = sv_newmortal(); | |
348 | EXTEND(SP, 1); | |
349 | PUSHu(42); | |
350 | XSRETURN(1); | |
96ad942f MHM |
351 | |
352 | void | |
353 | XPUSHu() | |
b2049988 MHM |
354 | PREINIT: |
355 | dTARG; | |
356 | PPCODE: | |
357 | TARG = sv_newmortal(); | |
358 | XPUSHu(43); | |
359 | XSRETURN(1); | |
96ad942f | 360 | |
f2e3e4ce KW |
361 | STRLEN |
362 | UTF8_SAFE_SKIP(s, adjustment) | |
363 | unsigned char * s | |
364 | int adjustment | |
365 | CODE: | |
366 | /* Instead of passing in an 'e' ptr, use the real end, adjusted */ | |
9aa6a863 | 367 | #if defined(UTF8_SAFE_SKIP) && defined(UTF8SKIP) |
f2e3e4ce | 368 | RETVAL = UTF8_SAFE_SKIP(s, s + UTF8SKIP(s) + adjustment); |
9aa6a863 N |
369 | #else |
370 | RETVAL = 0; | |
371 | #endif | |
f2e3e4ce KW |
372 | OUTPUT: |
373 | RETVAL | |
374 | ||
e28192fd KW |
375 | STRLEN |
376 | my_strnlen(s, max) | |
377 | char * s | |
378 | STRLEN max | |
379 | CODE: | |
380 | RETVAL= my_strnlen(s, max); | |
381 | OUTPUT: | |
382 | RETVAL | |
383 | ||
4c28bdc5 | 384 | AV * |
aadf4f9e | 385 | utf8_to_uvchr_buf(s, adjustment) |
4c28bdc5 | 386 | unsigned char *s |
aadf4f9e | 387 | int adjustment |
4c28bdc5 KW |
388 | PREINIT: |
389 | AV *av; | |
390 | STRLEN len; | |
391 | CODE: | |
392 | av = newAV(); | |
9aa6a863 | 393 | #ifdef utf8_to_uvchr_buf |
aadf4f9e N |
394 | av_push(av, newSVuv(utf8_to_uvchr_buf(s, |
395 | s + UTF8SKIP(s) + adjustment, | |
396 | &len))); | |
9aa6a863 N |
397 | #else |
398 | av_push(av, newSVuv(0)); | |
399 | len = (STRLEN) -1; | |
400 | #endif | |
721db7b0 N |
401 | if (len == (STRLEN) -1) { |
402 | av_push(av, newSViv(-1)); | |
403 | } | |
404 | else { | |
405 | av_push(av, newSVuv(len)); | |
406 | } | |
4c28bdc5 KW |
407 | RETVAL = av; |
408 | OUTPUT: | |
409 | RETVAL | |
410 | ||
39d7245c KW |
411 | AV * |
412 | utf8_to_uvchr(s) | |
413 | unsigned char *s | |
414 | PREINIT: | |
415 | AV *av; | |
416 | STRLEN len; | |
417 | CODE: | |
418 | av = newAV(); | |
9aa6a863 | 419 | #ifdef utf8_to_uvchr |
39d7245c | 420 | av_push(av, newSVuv(utf8_to_uvchr(s, &len))); |
9aa6a863 N |
421 | #else |
422 | av_push(av, newSVuv(0)); | |
423 | len = (STRLEN) -1; | |
424 | #endif | |
721db7b0 N |
425 | if (len == (STRLEN) -1) { |
426 | av_push(av, newSViv(-1)); | |
427 | } | |
428 | else { | |
429 | av_push(av, newSVuv(len)); | |
430 | } | |
39d7245c KW |
431 | RETVAL = av; |
432 | OUTPUT: | |
433 | RETVAL | |
434 | ||
5f93efa0 | 435 | =tests plan => 62 |
adfe19db MHM |
436 | |
437 | ok(&Devel::PPPort::sv_setuv(42), 42); | |
438 | ok(&Devel::PPPort::newSVuv(123), 123); | |
439 | ok(&Devel::PPPort::sv_2uv("4711"), 4711); | |
440 | ok(&Devel::PPPort::sv_2uv("1735928559"), 1735928559); | |
441 | ok(&Devel::PPPort::SvUVx("1735928559"), 1735928559); | |
442 | ok(&Devel::PPPort::SvUVx(1735928559), 1735928559); | |
443 | ok(&Devel::PPPort::SvUVx(0xdeadbeef), 0xdeadbeef); | |
444 | ok(&Devel::PPPort::XSRETURN_UV(), 42); | |
96ad942f MHM |
445 | ok(&Devel::PPPort::PUSHu(), 42); |
446 | ok(&Devel::PPPort::XPUSHu(), 43); | |
f2e3e4ce KW |
447 | ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1); |
448 | ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0); | |
e28192fd | 449 | ok(&Devel::PPPort::my_strnlen("abc\0def", 7), 3); |
aadf4f9e | 450 | |
89d8c7cd KW |
451 | # skip tests on 5.6.0 and earlier |
452 | if ("$]" le '5.006') { | |
5f93efa0 | 453 | skip 'skip: broken utf8 support', 0 for 1..49; |
89d8c7cd KW |
454 | exit; |
455 | } | |
456 | ||
aadf4f9e | 457 | my $ret = &Devel::PPPort::utf8_to_uvchr("A"); |
4c28bdc5 KW |
458 | ok($ret->[0], ord("A")); |
459 | ok($ret->[1], 1); | |
aadf4f9e N |
460 | |
461 | $ret = &Devel::PPPort::utf8_to_uvchr("\0"); | |
4c28bdc5 KW |
462 | ok($ret->[0], 0); |
463 | ok($ret->[1], 1); | |
aadf4f9e N |
464 | |
465 | $ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0); | |
39d7245c KW |
466 | ok($ret->[0], ord("A")); |
467 | ok($ret->[1], 1); | |
aadf4f9e N |
468 | |
469 | $ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0); | |
39d7245c KW |
470 | ok($ret->[0], 0); |
471 | ok($ret->[1], 1); | |
aadf4f9e N |
472 | |
473 | if (ord("A") != 65) { # tests not valid for EBCDIC | |
5f93efa0 | 474 | ok(1, 1) for 1 .. (2 + 4 + (7 * 5)); |
aadf4f9e N |
475 | } |
476 | else { | |
477 | $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0); | |
478 | ok($ret->[0], 0x100); | |
479 | ok($ret->[1], 2); | |
480 | ||
481 | my @warnings; | |
482 | local $SIG{__WARN__} = sub { push @warnings, @_; }; | |
483 | ||
484 | { | |
485 | use warnings 'utf8'; | |
486 | $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80"); | |
487 | ok($ret->[0], 0); | |
488 | ok($ret->[1], -1); | |
489 | ||
490 | no warnings; | |
491 | $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80"); | |
492 | ok($ret->[0], 0xFFFD); | |
493 | ok($ret->[1], 1); | |
494 | } | |
495 | ||
496 | my @buf_tests = ( | |
497 | { | |
498 | input => "A", | |
499 | adjustment => -1, | |
500 | warning => qr/empty/, | |
501 | no_warnings_returned_length => 0, | |
502 | }, | |
503 | { | |
504 | input => "\xc4\xc5", | |
505 | adjustment => 0, | |
506 | warning => qr/non-continuation/, | |
507 | no_warnings_returned_length => 1, | |
508 | }, | |
509 | { | |
510 | input => "\xc4\x80", | |
511 | adjustment => -1, | |
512 | warning => qr/short|1 byte, need 2/, | |
513 | no_warnings_returned_length => 1, | |
514 | }, | |
515 | { | |
516 | input => "\xc0\x81", | |
517 | adjustment => 0, | |
518 | warning => qr/overlong|2 bytes, need 1/, | |
519 | no_warnings_returned_length => 2, | |
520 | }, | |
5f93efa0 KW |
521 | { |
522 | input => "\xe0\x80\x81", | |
523 | adjustment => 0, | |
524 | warning => qr/overlong|3 bytes, need 1/, | |
525 | no_warnings_returned_length => 3, | |
526 | }, | |
527 | { | |
528 | input => "\xf0\x80\x80\x81", | |
529 | adjustment => 0, | |
530 | warning => qr/overlong|4 bytes, need 1/, | |
531 | no_warnings_returned_length => 4, | |
532 | }, | |
aadf4f9e N |
533 | { # Old algorithm supposedly failed to detect this |
534 | input => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf", | |
535 | adjustment => 0, | |
9aa6a863 | 536 | warning => ("$]" le 5.008006) ? qr/Malformed UTF-8 character/ : qr/overflow/, |
aadf4f9e N |
537 | no_warnings_returned_length => 13, |
538 | }, | |
539 | ); | |
540 | ||
541 | # An empty input is an assertion failure on debugging builds. It is | |
542 | # deliberately the first test. | |
543 | require Config; import Config; | |
544 | use vars '%Config'; | |
545 | if ($Config{ccflags} =~ /-DDEBUGGING/) { | |
546 | shift @buf_tests; | |
547 | ok(1, 1) for 1..5; | |
548 | } | |
549 | ||
550 | for my $test (@buf_tests) { | |
551 | my $input = $test->{'input'}; | |
552 | my $adjustment = $test->{'adjustment'}; | |
553 | my $display = 'utf8_to_uvchr_buf("'; | |
554 | for (my $i = 0; $i < length($input) + $adjustment; $i++) { | |
555 | $display .= sprintf "\\x%02x", ord substr($input, $i, 1); | |
556 | } | |
557 | ||
558 | $display .= '")'; | |
559 | my $warning = $test->{'warning'}; | |
560 | ||
561 | undef @warnings; | |
562 | use warnings 'utf8'; | |
563 | $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment); | |
564 | ok($ret->[0], 0, "returned value $display; warnings enabled"); | |
565 | ok($ret->[1], -1, "returned length $display; warnings enabled"); | |
566 | my $all_warnings = join "; ", @warnings; | |
567 | my $contains = grep { $_ =~ $warning } $all_warnings; | |
17c135f5 KW |
568 | ok($contains, 1, $display |
569 | . "; Got: '$all_warnings', which should contain '$warning'"); | |
aadf4f9e N |
570 | |
571 | undef @warnings; | |
572 | no warnings 'utf8'; | |
573 | $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment); | |
574 | ok($ret->[0], 0xFFFD, "returned value $display; warnings disabled"); | |
575 | ok($ret->[1], $test->{'no_warnings_returned_length'}, | |
576 | "returned length $display; warnings disabled"); | |
577 | } | |
578 | } |