Commit | Line | Data |
---|---|---|
adfe19db MHM |
1 | ################################################################################ |
2 | ## | |
b2049988 | 3 | ## Version 3.x, Copyright (C) 2004-2013, Marcus Holland-Moritz. |
adfe19db MHM |
4 | ## Version 2.x, Copyright (C) 2001, Paul Marquess. |
5 | ## Version 1.x, Copyright (C) 1999, Kenneth Albanowski. | |
6 | ## | |
7 | ## This program is free software; you can redistribute it and/or | |
8 | ## modify it under the same terms as Perl itself. | |
9 | ## | |
10 | ################################################################################ | |
11 | ||
12 | =provides | |
13 | ||
adfe19db | 14 | __UNDEFINED__ |
e28192fd | 15 | my_strnlen |
679ad62d | 16 | SvUOK |
aadf4f9e N |
17 | utf8_to_uvchr_buf |
18 | ||
19 | =dontwarn | |
20 | ||
21 | _ppport_utf8_to_uvchr_buf_callee | |
22 | _ppport_MIN | |
adfe19db MHM |
23 | |
24 | =implementation | |
25 | ||
aadf4f9e N |
26 | #define _ppport_MIN(a,b) (((a) <= (b)) ? (a) : (b)) |
27 | ||
0d0f8426 MHM |
28 | __UNDEFINED__ sv_setuv(sv, uv) \ |
29 | STMT_START { \ | |
30 | UV TeMpUv = uv; \ | |
31 | if (TeMpUv <= IV_MAX) \ | |
32 | sv_setiv(sv, TeMpUv); \ | |
33 | else \ | |
34 | sv_setnv(sv, (double)TeMpUv); \ | |
35 | } STMT_END | |
36 | ||
37 | __UNDEFINED__ newSVuv(uv) ((uv) <= IV_MAX ? newSViv((IV)uv) : newSVnv((NV)uv)) | |
adfe19db MHM |
38 | |
39 | __UNDEFINED__ sv_2uv(sv) ((PL_Sv = (sv)), (UV) (SvNOK(PL_Sv) ? SvNV(PL_Sv) : sv_2nv(PL_Sv))) | |
40 | __UNDEFINED__ SvUVX(sv) ((UV)SvIVX(sv)) | |
41 | __UNDEFINED__ SvUVXx(sv) SvUVX(sv) | |
42 | __UNDEFINED__ SvUV(sv) (SvIOK(sv) ? SvUVX(sv) : sv_2uv(sv)) | |
43 | __UNDEFINED__ SvUVx(sv) ((PL_Sv = (sv)), SvUV(PL_Sv)) | |
44 | ||
45 | /* Hint: sv_uv | |
46 | * Always use the SvUVx() macro instead of sv_uv(). | |
47 | */ | |
48 | __UNDEFINED__ sv_uv(sv) SvUVx(sv) | |
49 | ||
679ad62d MHM |
50 | #if !defined(SvUOK) && defined(SvIOK_UV) |
51 | # define SvUOK(sv) SvIOK_UV(sv) | |
52 | #endif | |
53 | ||
adfe19db MHM |
54 | __UNDEFINED__ XST_mUV(i,v) (ST(i) = sv_2mortal(newSVuv(v)) ) |
55 | __UNDEFINED__ XSRETURN_UV(v) STMT_START { XST_mUV(0,v); XSRETURN(1); } STMT_END | |
56 | ||
96ad942f MHM |
57 | __UNDEFINED__ PUSHu(u) STMT_START { sv_setuv(TARG, (UV)(u)); PUSHTARG; } STMT_END |
58 | __UNDEFINED__ XPUSHu(u) STMT_START { sv_setuv(TARG, (UV)(u)); XPUSHTARG; } STMT_END | |
59 | ||
f2e3e4ce | 60 | #if defined UTF8SKIP |
94a2c4f7 KW |
61 | |
62 | /* Don't use official version because it uses MIN, which may not be available */ | |
63 | #undef UTF8_SAFE_SKIP | |
64 | ||
aadf4f9e N |
65 | __UNDEFINED__ UTF8_SAFE_SKIP(s, e) ( \ |
66 | ((((e) - (s)) <= 0) \ | |
f2e3e4ce | 67 | ? 0 \ |
aadf4f9e | 68 | : _ppport_MIN(((e) - (s)), UTF8SKIP(s)))) |
f2e3e4ce KW |
69 | #endif |
70 | ||
e28192fd KW |
71 | #if !defined(my_strnlen) |
72 | #if { NEED my_strnlen } | |
73 | ||
74 | STRLEN | |
75 | my_strnlen(const char *str, Size_t maxlen) | |
76 | { | |
77 | const char *p = str; | |
78 | ||
79 | while(maxlen-- && *p) | |
80 | p++; | |
81 | ||
82 | return p - str; | |
83 | } | |
84 | ||
85 | #endif | |
86 | #endif | |
4c28bdc5 | 87 | |
97d83eb2 | 88 | #if { VERSION < 5.31.3 } |
aadf4f9e N |
89 | /* Versions prior to this accepted things that are now considered |
90 | * malformations, and didn't return -1 on error with warnings enabled | |
91 | * */ | |
92 | # undef utf8_to_uvchr_buf | |
93 | #endif | |
4c28bdc5 | 94 | |
aadf4f9e N |
95 | /* This implementation brings modern, generally more restricted standards to |
96 | * utf8_to_uvchr_buf. Some of these are security related, and clearly must | |
97 | * be done. But its arguable that the others need not, and hence should not. | |
98 | * The reason they're here is that a module that intends to play with the | |
6858ac42 | 99 | * latest perls should be able to work the same in all releases. An example is |
aadf4f9e N |
100 | * that perl no longer accepts any UV for a code point, but limits them to |
101 | * IV_MAX or below. This is for future internal use of the larger code points. | |
102 | * If it turns out that some of these changes are breaking code that isn't | |
103 | * intended to work with modern perls, the tighter restrictions could be | |
104 | * relaxed. khw thinks this is unlikely, but has been wrong in the past. */ | |
105 | ||
6858ac42 KW |
106 | /* 5.6.0 is the first release with UTF-8, and we don't implement this function |
107 | * there due to its likely lack of still being in use, and the underlying | |
108 | * implementation is very different from later ones, without the later | |
109 | * safeguards, so would require extra work to deal with */ | |
110 | #if { VERSION < 5.10.0 } /* Was non-const before this */ | |
9aa6a863 N |
111 | # define D_PPP_CU8 U8 |
112 | #else | |
113 | # define D_PPP_CU8 const U8 | |
114 | #endif | |
115 | ||
aadf4f9e N |
116 | #ifndef utf8_to_uvchr_buf |
117 | /* Choose which underlying implementation to use. At least one must be | |
118 | * present or the perl is too early to handle this function */ | |
119 | # if defined(utf8n_to_uvchr) || defined(utf8_to_uv) | |
120 | # if defined(utf8n_to_uvchr) /* This is the preferred implementation */ | |
121 | # define _ppport_utf8_to_uvchr_buf_callee utf8n_to_uvchr | |
9aa6a863 | 122 | # elif { VERSION >= 5.6.1 } |
aadf4f9e N |
123 | # define _ppport_utf8_to_uvchr_buf_callee utf8_to_uv |
124 | # endif | |
125 | ||
126 | # endif | |
127 | ||
128 | #ifdef _ppport_utf8_to_uvchr_buf_callee | |
129 | # if { NEED utf8_to_uvchr_buf } | |
4c28bdc5 | 130 | |
aadf4f9e | 131 | UV |
9aa6a863 | 132 | utf8_to_uvchr_buf(pTHX_ D_PPP_CU8 *s, const U8 *send, STRLEN *retlen) |
aadf4f9e N |
133 | { |
134 | UV ret; | |
135 | STRLEN curlen; | |
136 | bool overflows = 0; | |
137 | const U8 *cur_s = s; | |
138 | const bool do_warnings = ckWARN_d(WARN_UTF8); | |
139 | ||
140 | if (send > s) { | |
141 | curlen = send - s; | |
142 | } | |
143 | else { | |
144 | assert(0); /* Modern perls die under this circumstance */ | |
145 | curlen = 0; | |
146 | if (! do_warnings) { /* Handle empty here if no warnings needed */ | |
147 | if (retlen) *retlen = 0; | |
148 | return UNICODE_REPLACEMENT; | |
149 | } | |
150 | } | |
151 | ||
152 | /* The modern version allows anything that evaluates to a legal UV, but not | |
153 | * overlongs nor an empty input */ | |
154 | ret = _ppport_utf8_to_uvchr_buf_callee( | |
155 | s, curlen, retlen, (UTF8_ALLOW_ANYUV | |
156 | & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY))); | |
157 | ||
158 | /* But actually, modern versions restrict the UV to being no more than what | |
159 | * an IV can hold */ | |
fde93382 | 160 | if (UNLIKELY(ret > IV_MAX)) { |
aadf4f9e N |
161 | overflows = 1; |
162 | } | |
163 | ||
8ec545d9 | 164 | # if { VERSION < 5.26.0 } && ! defined(EBCDIC) |
aadf4f9e | 165 | |
6858ac42 KW |
166 | /* Perl did not properly detect overflow for much of its history on |
167 | * non-EBCDIC platforms, often returning an overlong value which may or may | |
168 | * not have been tolerated in the call. Also, earlier versions, when they | |
169 | * did detect overflow, may have disallowed it completely. Modern ones can | |
170 | * replace it with the REPLACEMENT CHARACTER, depending on calling | |
171 | * parameters. Therefore detect it ourselves in releases it was | |
172 | * problematic in. */ | |
aadf4f9e | 173 | |
fde93382 | 174 | else if (curlen > 0 && UNLIKELY(*s >= 0xFE)) { |
aadf4f9e | 175 | |
6858ac42 KW |
176 | /* First, on a 32-bit machine the first byte being at least \xFE |
177 | * automatically is overflow, as it indicates something requiring more | |
178 | * than 31 bits */ | |
aadf4f9e N |
179 | if (sizeof(ret) < 8) { |
180 | overflows = 1; | |
181 | } | |
182 | else { | |
183 | const U8 highest[] = /* 2*63-1 */ | |
184 | "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"; | |
185 | const U8 *cur_h = highest; | |
186 | ||
187 | for (cur_s = s; cur_s < send; cur_s++, cur_h++) { | |
188 | if (UNLIKELY(*cur_s == *cur_h)) { | |
189 | continue; | |
190 | } | |
191 | ||
192 | /* If this byte is larger than the corresponding highest UTF-8 | |
6858ac42 KW |
193 | * byte, the sequence overflows; otherwise the byte is less |
194 | * than (as we handled the equality case above), and so the | |
195 | * sequence doesn't overflow */ | |
aadf4f9e N |
196 | overflows = *cur_s > *cur_h; |
197 | break; | |
198 | ||
199 | } | |
200 | ||
201 | /* Here, either we set the bool and broke out of the loop, or got | |
202 | * to the end and all bytes are the same which indicates it doesn't | |
203 | * overflow. */ | |
204 | } | |
205 | } | |
206 | ||
aadf4f9e N |
207 | # endif /* < 5.26 */ |
208 | ||
209 | if (UNLIKELY(overflows)) { | |
210 | if (! do_warnings) { | |
211 | if (retlen) { | |
212 | *retlen = _ppport_MIN(*retlen, UTF8SKIP(s)); | |
213 | *retlen = _ppport_MIN(*retlen, curlen); | |
214 | } | |
215 | return UNICODE_REPLACEMENT; | |
216 | } | |
217 | else { | |
218 | ||
219 | /* On versions that correctly detect overflow, but forbid it | |
220 | * always, 0 will be returned, but also a warning will have been | |
221 | * raised. Don't repeat it */ | |
222 | if (ret != 0) { | |
6858ac42 | 223 | /* We use the error message in use from 5.8-5.26 */ |
aadf4f9e N |
224 | Perl_warner(aTHX_ packWARN(WARN_UTF8), |
225 | "Malformed UTF-8 character (overflow at 0x%" UVxf | |
226 | ", byte 0x%02x, after start byte 0x%02x)", | |
227 | ret, *cur_s, *s); | |
228 | } | |
229 | if (retlen) { | |
230 | *retlen = (STRLEN) -1; | |
231 | } | |
232 | return 0; | |
233 | } | |
234 | } | |
235 | ||
6858ac42 KW |
236 | /* Here, did not overflow, but if it failed for some other reason, and |
237 | * warnings are off, to emulate the behavior of the real utf8_to_uvchr(), | |
238 | * try again, allowing anything. (Note a return of 0 is ok if the input | |
239 | * was '\0') */ | |
aadf4f9e N |
240 | if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) { |
241 | ||
242 | /* If curlen is 0, we already handled the case where warnings are | |
6858ac42 KW |
243 | * disabled, so this 'if' will be true, and so later on, we know that |
244 | * 's' is dereferencible */ | |
aadf4f9e N |
245 | if (do_warnings) { |
246 | *retlen = (STRLEN) -1; | |
247 | } | |
248 | else { | |
249 | ret = _ppport_utf8_to_uvchr_buf_callee( | |
250 | s, curlen, retlen, UTF8_ALLOW_ANY); | |
251 | /* Override with the REPLACEMENT character, as that is what the | |
252 | * modern version of this function returns */ | |
253 | ret = UNICODE_REPLACEMENT; | |
254 | ||
255 | # if { VERSION < 5.16.0 } | |
256 | ||
257 | /* Versions earlier than this don't necessarily return the proper | |
258 | * length. It should not extend past the end of string, nor past | |
259 | * what the first byte indicates the length is, nor past the | |
260 | * continuation characters */ | |
261 | if (retlen && *retlen >= 0) { | |
f1590d76 | 262 | unsigned int i = 1; |
aadf4f9e N |
263 | *retlen = _ppport_MIN(*retlen, curlen); |
264 | *retlen = _ppport_MIN(*retlen, UTF8SKIP(s)); | |
aadf4f9e N |
265 | do { |
266 | if (s[i] < 0x80 || s[i] > 0xBF) { | |
267 | *retlen = i; | |
268 | break; | |
269 | } | |
270 | } while (++i < *retlen); | |
271 | } | |
272 | ||
273 | # endif | |
274 | ||
275 | } | |
276 | } | |
277 | ||
278 | return ret; | |
279 | } | |
e28192fd | 280 | |
aadf4f9e N |
281 | # endif |
282 | #endif | |
283 | #endif | |
39d7245c | 284 | |
aadf4f9e N |
285 | #if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf) |
286 | #undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses | |
287 | to read past a NUL, making it much less likely to read | |
288 | off the end of the buffer. A NUL indicates the start | |
289 | of the next character anyway. If the input isn't | |
290 | NUL-terminated, the function remains unsafe, as it | |
291 | always has been. */ | |
39d7245c KW |
292 | |
293 | __UNDEFINED__ utf8_to_uvchr(s, lp) \ | |
294 | ((*(s) == '\0') \ | |
295 | ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */ \ | |
296 | : utf8_to_uvchr_buf(s, (s) + my_strnlen((char *) (s), UTF8SKIP(s)), (lp))) | |
297 | ||
aadf4f9e N |
298 | #endif |
299 | ||
e28192fd KW |
300 | =xsinit |
301 | ||
302 | #define NEED_my_strnlen | |
aadf4f9e | 303 | #define NEED_utf8_to_uvchr_buf |
e28192fd | 304 | |
adfe19db MHM |
305 | =xsubs |
306 | ||
307 | SV * | |
308 | sv_setuv(uv) | |
b2049988 MHM |
309 | UV uv |
310 | CODE: | |
311 | RETVAL = newSViv(1); | |
312 | sv_setuv(RETVAL, uv); | |
313 | OUTPUT: | |
314 | RETVAL | |
adfe19db MHM |
315 | |
316 | SV * | |
317 | newSVuv(uv) | |
b2049988 MHM |
318 | UV uv |
319 | CODE: | |
320 | RETVAL = newSVuv(uv); | |
321 | OUTPUT: | |
322 | RETVAL | |
adfe19db MHM |
323 | |
324 | UV | |
325 | sv_2uv(sv) | |
b2049988 MHM |
326 | SV *sv |
327 | CODE: | |
328 | RETVAL = sv_2uv(sv); | |
329 | OUTPUT: | |
330 | RETVAL | |
adfe19db MHM |
331 | |
332 | UV | |
333 | SvUVx(sv) | |
b2049988 MHM |
334 | SV *sv |
335 | CODE: | |
336 | sv--; | |
337 | RETVAL = SvUVx(++sv); | |
338 | OUTPUT: | |
339 | RETVAL | |
adfe19db MHM |
340 | |
341 | void | |
342 | XSRETURN_UV() | |
b2049988 MHM |
343 | PPCODE: |
344 | XSRETURN_UV(42); | |
adfe19db | 345 | |
96ad942f MHM |
346 | void |
347 | PUSHu() | |
b2049988 MHM |
348 | PREINIT: |
349 | dTARG; | |
350 | PPCODE: | |
351 | TARG = sv_newmortal(); | |
352 | EXTEND(SP, 1); | |
353 | PUSHu(42); | |
354 | XSRETURN(1); | |
96ad942f MHM |
355 | |
356 | void | |
357 | XPUSHu() | |
b2049988 MHM |
358 | PREINIT: |
359 | dTARG; | |
360 | PPCODE: | |
361 | TARG = sv_newmortal(); | |
362 | XPUSHu(43); | |
363 | XSRETURN(1); | |
96ad942f | 364 | |
f2e3e4ce KW |
365 | STRLEN |
366 | UTF8_SAFE_SKIP(s, adjustment) | |
367 | unsigned char * s | |
368 | int adjustment | |
369 | CODE: | |
370 | /* Instead of passing in an 'e' ptr, use the real end, adjusted */ | |
9aa6a863 | 371 | #if defined(UTF8_SAFE_SKIP) && defined(UTF8SKIP) |
f2e3e4ce | 372 | RETVAL = UTF8_SAFE_SKIP(s, s + UTF8SKIP(s) + adjustment); |
9aa6a863 N |
373 | #else |
374 | RETVAL = 0; | |
375 | #endif | |
f2e3e4ce KW |
376 | OUTPUT: |
377 | RETVAL | |
378 | ||
e28192fd KW |
379 | STRLEN |
380 | my_strnlen(s, max) | |
381 | char * s | |
382 | STRLEN max | |
383 | CODE: | |
384 | RETVAL= my_strnlen(s, max); | |
385 | OUTPUT: | |
386 | RETVAL | |
387 | ||
4c28bdc5 | 388 | AV * |
aadf4f9e | 389 | utf8_to_uvchr_buf(s, adjustment) |
4c28bdc5 | 390 | unsigned char *s |
aadf4f9e | 391 | int adjustment |
4c28bdc5 KW |
392 | PREINIT: |
393 | AV *av; | |
394 | STRLEN len; | |
395 | CODE: | |
396 | av = newAV(); | |
9aa6a863 | 397 | #ifdef utf8_to_uvchr_buf |
aadf4f9e N |
398 | av_push(av, newSVuv(utf8_to_uvchr_buf(s, |
399 | s + UTF8SKIP(s) + adjustment, | |
400 | &len))); | |
9aa6a863 N |
401 | #else |
402 | av_push(av, newSVuv(0)); | |
403 | len = (STRLEN) -1; | |
404 | #endif | |
721db7b0 N |
405 | if (len == (STRLEN) -1) { |
406 | av_push(av, newSViv(-1)); | |
407 | } | |
408 | else { | |
409 | av_push(av, newSVuv(len)); | |
410 | } | |
4c28bdc5 KW |
411 | RETVAL = av; |
412 | OUTPUT: | |
413 | RETVAL | |
414 | ||
39d7245c KW |
415 | AV * |
416 | utf8_to_uvchr(s) | |
417 | unsigned char *s | |
418 | PREINIT: | |
419 | AV *av; | |
420 | STRLEN len; | |
421 | CODE: | |
422 | av = newAV(); | |
9aa6a863 | 423 | #ifdef utf8_to_uvchr |
39d7245c | 424 | av_push(av, newSVuv(utf8_to_uvchr(s, &len))); |
9aa6a863 N |
425 | #else |
426 | av_push(av, newSVuv(0)); | |
427 | len = (STRLEN) -1; | |
428 | #endif | |
721db7b0 N |
429 | if (len == (STRLEN) -1) { |
430 | av_push(av, newSViv(-1)); | |
431 | } | |
432 | else { | |
433 | av_push(av, newSVuv(len)); | |
434 | } | |
39d7245c KW |
435 | RETVAL = av; |
436 | OUTPUT: | |
437 | RETVAL | |
438 | ||
5f93efa0 | 439 | =tests plan => 62 |
adfe19db MHM |
440 | |
441 | ok(&Devel::PPPort::sv_setuv(42), 42); | |
442 | ok(&Devel::PPPort::newSVuv(123), 123); | |
443 | ok(&Devel::PPPort::sv_2uv("4711"), 4711); | |
444 | ok(&Devel::PPPort::sv_2uv("1735928559"), 1735928559); | |
445 | ok(&Devel::PPPort::SvUVx("1735928559"), 1735928559); | |
446 | ok(&Devel::PPPort::SvUVx(1735928559), 1735928559); | |
447 | ok(&Devel::PPPort::SvUVx(0xdeadbeef), 0xdeadbeef); | |
448 | ok(&Devel::PPPort::XSRETURN_UV(), 42); | |
96ad942f MHM |
449 | ok(&Devel::PPPort::PUSHu(), 42); |
450 | ok(&Devel::PPPort::XPUSHu(), 43); | |
f2e3e4ce KW |
451 | ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1); |
452 | ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0); | |
e28192fd | 453 | ok(&Devel::PPPort::my_strnlen("abc\0def", 7), 3); |
aadf4f9e | 454 | |
89d8c7cd KW |
455 | # skip tests on 5.6.0 and earlier |
456 | if ("$]" le '5.006') { | |
5f93efa0 | 457 | skip 'skip: broken utf8 support', 0 for 1..49; |
89d8c7cd KW |
458 | exit; |
459 | } | |
460 | ||
aadf4f9e | 461 | my $ret = &Devel::PPPort::utf8_to_uvchr("A"); |
4c28bdc5 KW |
462 | ok($ret->[0], ord("A")); |
463 | ok($ret->[1], 1); | |
aadf4f9e N |
464 | |
465 | $ret = &Devel::PPPort::utf8_to_uvchr("\0"); | |
4c28bdc5 KW |
466 | ok($ret->[0], 0); |
467 | ok($ret->[1], 1); | |
aadf4f9e N |
468 | |
469 | $ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0); | |
39d7245c KW |
470 | ok($ret->[0], ord("A")); |
471 | ok($ret->[1], 1); | |
aadf4f9e N |
472 | |
473 | $ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0); | |
39d7245c KW |
474 | ok($ret->[0], 0); |
475 | ok($ret->[1], 1); | |
aadf4f9e N |
476 | |
477 | if (ord("A") != 65) { # tests not valid for EBCDIC | |
5f93efa0 | 478 | ok(1, 1) for 1 .. (2 + 4 + (7 * 5)); |
aadf4f9e N |
479 | } |
480 | else { | |
481 | $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0); | |
482 | ok($ret->[0], 0x100); | |
483 | ok($ret->[1], 2); | |
484 | ||
485 | my @warnings; | |
486 | local $SIG{__WARN__} = sub { push @warnings, @_; }; | |
487 | ||
488 | { | |
489 | use warnings 'utf8'; | |
490 | $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80"); | |
491 | ok($ret->[0], 0); | |
492 | ok($ret->[1], -1); | |
493 | ||
494 | no warnings; | |
495 | $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80"); | |
496 | ok($ret->[0], 0xFFFD); | |
497 | ok($ret->[1], 1); | |
498 | } | |
499 | ||
500 | my @buf_tests = ( | |
501 | { | |
502 | input => "A", | |
503 | adjustment => -1, | |
504 | warning => qr/empty/, | |
505 | no_warnings_returned_length => 0, | |
506 | }, | |
507 | { | |
508 | input => "\xc4\xc5", | |
509 | adjustment => 0, | |
510 | warning => qr/non-continuation/, | |
511 | no_warnings_returned_length => 1, | |
512 | }, | |
513 | { | |
514 | input => "\xc4\x80", | |
515 | adjustment => -1, | |
516 | warning => qr/short|1 byte, need 2/, | |
517 | no_warnings_returned_length => 1, | |
518 | }, | |
519 | { | |
520 | input => "\xc0\x81", | |
521 | adjustment => 0, | |
522 | warning => qr/overlong|2 bytes, need 1/, | |
523 | no_warnings_returned_length => 2, | |
524 | }, | |
5f93efa0 KW |
525 | { |
526 | input => "\xe0\x80\x81", | |
527 | adjustment => 0, | |
528 | warning => qr/overlong|3 bytes, need 1/, | |
529 | no_warnings_returned_length => 3, | |
530 | }, | |
531 | { | |
532 | input => "\xf0\x80\x80\x81", | |
533 | adjustment => 0, | |
534 | warning => qr/overlong|4 bytes, need 1/, | |
535 | no_warnings_returned_length => 4, | |
536 | }, | |
6858ac42 | 537 | { # Old algorithm failed to detect this |
aadf4f9e N |
538 | input => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf", |
539 | adjustment => 0, | |
9aa6a863 | 540 | warning => ("$]" le 5.008006) ? qr/Malformed UTF-8 character/ : qr/overflow/, |
aadf4f9e N |
541 | no_warnings_returned_length => 13, |
542 | }, | |
543 | ); | |
544 | ||
545 | # An empty input is an assertion failure on debugging builds. It is | |
546 | # deliberately the first test. | |
547 | require Config; import Config; | |
548 | use vars '%Config'; | |
549 | if ($Config{ccflags} =~ /-DDEBUGGING/) { | |
550 | shift @buf_tests; | |
551 | ok(1, 1) for 1..5; | |
552 | } | |
553 | ||
554 | for my $test (@buf_tests) { | |
555 | my $input = $test->{'input'}; | |
556 | my $adjustment = $test->{'adjustment'}; | |
557 | my $display = 'utf8_to_uvchr_buf("'; | |
558 | for (my $i = 0; $i < length($input) + $adjustment; $i++) { | |
559 | $display .= sprintf "\\x%02x", ord substr($input, $i, 1); | |
560 | } | |
561 | ||
562 | $display .= '")'; | |
563 | my $warning = $test->{'warning'}; | |
564 | ||
565 | undef @warnings; | |
566 | use warnings 'utf8'; | |
567 | $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment); | |
568 | ok($ret->[0], 0, "returned value $display; warnings enabled"); | |
569 | ok($ret->[1], -1, "returned length $display; warnings enabled"); | |
570 | my $all_warnings = join "; ", @warnings; | |
571 | my $contains = grep { $_ =~ $warning } $all_warnings; | |
17c135f5 KW |
572 | ok($contains, 1, $display |
573 | . "; Got: '$all_warnings', which should contain '$warning'"); | |
aadf4f9e N |
574 | |
575 | undef @warnings; | |
576 | no warnings 'utf8'; | |
577 | $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment); | |
578 | ok($ret->[0], 0xFFFD, "returned value $display; warnings disabled"); | |
579 | ok($ret->[1], $test->{'no_warnings_returned_length'}, | |
580 | "returned length $display; warnings disabled"); | |
581 | } | |
582 | } |