Commit | Line | Data |
---|---|---|
adfe19db MHM |
1 | ################################################################################ |
2 | ## | |
b2049988 | 3 | ## Version 3.x, Copyright (C) 2004-2013, Marcus Holland-Moritz. |
adfe19db MHM |
4 | ## Version 2.x, Copyright (C) 2001, Paul Marquess. |
5 | ## Version 1.x, Copyright (C) 1999, Kenneth Albanowski. | |
6 | ## | |
7 | ## This program is free software; you can redistribute it and/or | |
8 | ## modify it under the same terms as Perl itself. | |
9 | ## | |
10 | ################################################################################ | |
11 | ||
12 | =provides | |
13 | ||
adfe19db | 14 | __UNDEFINED__ |
e28192fd | 15 | my_strnlen |
679ad62d | 16 | SvUOK |
aadf4f9e N |
17 | utf8_to_uvchr_buf |
18 | ||
19 | =dontwarn | |
20 | ||
21 | _ppport_utf8_to_uvchr_buf_callee | |
22 | _ppport_MIN | |
adfe19db MHM |
23 | |
24 | =implementation | |
25 | ||
aadf4f9e N |
26 | #define _ppport_MIN(a,b) (((a) <= (b)) ? (a) : (b)) |
27 | ||
0d0f8426 MHM |
28 | __UNDEFINED__ sv_setuv(sv, uv) \ |
29 | STMT_START { \ | |
30 | UV TeMpUv = uv; \ | |
31 | if (TeMpUv <= IV_MAX) \ | |
32 | sv_setiv(sv, TeMpUv); \ | |
33 | else \ | |
34 | sv_setnv(sv, (double)TeMpUv); \ | |
35 | } STMT_END | |
36 | ||
37 | __UNDEFINED__ newSVuv(uv) ((uv) <= IV_MAX ? newSViv((IV)uv) : newSVnv((NV)uv)) | |
adfe19db MHM |
38 | |
39 | __UNDEFINED__ sv_2uv(sv) ((PL_Sv = (sv)), (UV) (SvNOK(PL_Sv) ? SvNV(PL_Sv) : sv_2nv(PL_Sv))) | |
40 | __UNDEFINED__ SvUVX(sv) ((UV)SvIVX(sv)) | |
41 | __UNDEFINED__ SvUVXx(sv) SvUVX(sv) | |
42 | __UNDEFINED__ SvUV(sv) (SvIOK(sv) ? SvUVX(sv) : sv_2uv(sv)) | |
43 | __UNDEFINED__ SvUVx(sv) ((PL_Sv = (sv)), SvUV(PL_Sv)) | |
44 | ||
45 | /* Hint: sv_uv | |
46 | * Always use the SvUVx() macro instead of sv_uv(). | |
47 | */ | |
48 | __UNDEFINED__ sv_uv(sv) SvUVx(sv) | |
49 | ||
679ad62d MHM |
50 | #if !defined(SvUOK) && defined(SvIOK_UV) |
51 | # define SvUOK(sv) SvIOK_UV(sv) | |
52 | #endif | |
53 | ||
adfe19db MHM |
54 | __UNDEFINED__ XST_mUV(i,v) (ST(i) = sv_2mortal(newSVuv(v)) ) |
55 | __UNDEFINED__ XSRETURN_UV(v) STMT_START { XST_mUV(0,v); XSRETURN(1); } STMT_END | |
56 | ||
96ad942f MHM |
57 | __UNDEFINED__ PUSHu(u) STMT_START { sv_setuv(TARG, (UV)(u)); PUSHTARG; } STMT_END |
58 | __UNDEFINED__ XPUSHu(u) STMT_START { sv_setuv(TARG, (UV)(u)); XPUSHTARG; } STMT_END | |
59 | ||
f2e3e4ce | 60 | #if defined UTF8SKIP |
94a2c4f7 KW |
61 | |
62 | /* Don't use official version because it uses MIN, which may not be available */ | |
63 | #undef UTF8_SAFE_SKIP | |
64 | ||
aadf4f9e N |
65 | __UNDEFINED__ UTF8_SAFE_SKIP(s, e) ( \ |
66 | ((((e) - (s)) <= 0) \ | |
f2e3e4ce | 67 | ? 0 \ |
aadf4f9e | 68 | : _ppport_MIN(((e) - (s)), UTF8SKIP(s)))) |
f2e3e4ce KW |
69 | #endif |
70 | ||
e28192fd KW |
71 | #if !defined(my_strnlen) |
72 | #if { NEED my_strnlen } | |
73 | ||
74 | STRLEN | |
75 | my_strnlen(const char *str, Size_t maxlen) | |
76 | { | |
77 | const char *p = str; | |
78 | ||
79 | while(maxlen-- && *p) | |
80 | p++; | |
81 | ||
82 | return p - str; | |
83 | } | |
84 | ||
85 | #endif | |
86 | #endif | |
4c28bdc5 | 87 | |
97d83eb2 | 88 | #if { VERSION < 5.31.3 } |
aadf4f9e N |
89 | /* Versions prior to this accepted things that are now considered |
90 | * malformations, and didn't return -1 on error with warnings enabled | |
91 | * */ | |
92 | # undef utf8_to_uvchr_buf | |
93 | #endif | |
4c28bdc5 | 94 | |
aadf4f9e N |
95 | /* This implementation brings modern, generally more restricted standards to |
96 | * utf8_to_uvchr_buf. Some of these are security related, and clearly must | |
97 | * be done. But its arguable that the others need not, and hence should not. | |
98 | * The reason they're here is that a module that intends to play with the | |
6858ac42 | 99 | * latest perls should be able to work the same in all releases. An example is |
aadf4f9e N |
100 | * that perl no longer accepts any UV for a code point, but limits them to |
101 | * IV_MAX or below. This is for future internal use of the larger code points. | |
102 | * If it turns out that some of these changes are breaking code that isn't | |
103 | * intended to work with modern perls, the tighter restrictions could be | |
104 | * relaxed. khw thinks this is unlikely, but has been wrong in the past. */ | |
105 | ||
6858ac42 KW |
106 | /* 5.6.0 is the first release with UTF-8, and we don't implement this function |
107 | * there due to its likely lack of still being in use, and the underlying | |
108 | * implementation is very different from later ones, without the later | |
109 | * safeguards, so would require extra work to deal with */ | |
e4a76720 | 110 | #if { VERSION >= 5.6.1 } && ! defined(utf8_to_uvchr_buf) |
de222e8b KW |
111 | # if { VERSION < 5.10.0 } /* Was non-const before this */ |
112 | # define D_PPP_CU8 U8 | |
113 | # else | |
114 | # define D_PPP_CU8 const U8 | |
115 | # endif | |
9aa6a863 | 116 | |
aadf4f9e N |
117 | /* Choose which underlying implementation to use. At least one must be |
118 | * present or the perl is too early to handle this function */ | |
119 | # if defined(utf8n_to_uvchr) || defined(utf8_to_uv) | |
120 | # if defined(utf8n_to_uvchr) /* This is the preferred implementation */ | |
121 | # define _ppport_utf8_to_uvchr_buf_callee utf8n_to_uvchr | |
e4a76720 | 122 | # else /* Must be at least 5.6.1 from #if above */ |
aadf4f9e N |
123 | # define _ppport_utf8_to_uvchr_buf_callee utf8_to_uv |
124 | # endif | |
aadf4f9e N |
125 | # endif |
126 | ||
aadf4f9e | 127 | # if { NEED utf8_to_uvchr_buf } |
4c28bdc5 | 128 | |
aadf4f9e | 129 | UV |
9aa6a863 | 130 | utf8_to_uvchr_buf(pTHX_ D_PPP_CU8 *s, const U8 *send, STRLEN *retlen) |
aadf4f9e N |
131 | { |
132 | UV ret; | |
133 | STRLEN curlen; | |
134 | bool overflows = 0; | |
135 | const U8 *cur_s = s; | |
136 | const bool do_warnings = ckWARN_d(WARN_UTF8); | |
05fc829b | 137 | STRLEN overflow_length = 0; |
aadf4f9e N |
138 | |
139 | if (send > s) { | |
140 | curlen = send - s; | |
141 | } | |
142 | else { | |
143 | assert(0); /* Modern perls die under this circumstance */ | |
144 | curlen = 0; | |
145 | if (! do_warnings) { /* Handle empty here if no warnings needed */ | |
146 | if (retlen) *retlen = 0; | |
147 | return UNICODE_REPLACEMENT; | |
148 | } | |
149 | } | |
150 | ||
8ec545d9 | 151 | # if { VERSION < 5.26.0 } && ! defined(EBCDIC) |
aadf4f9e | 152 | |
6858ac42 KW |
153 | /* Perl did not properly detect overflow for much of its history on |
154 | * non-EBCDIC platforms, often returning an overlong value which may or may | |
155 | * not have been tolerated in the call. Also, earlier versions, when they | |
156 | * did detect overflow, may have disallowed it completely. Modern ones can | |
157 | * replace it with the REPLACEMENT CHARACTER, depending on calling | |
158 | * parameters. Therefore detect it ourselves in releases it was | |
159 | * problematic in. */ | |
aadf4f9e | 160 | |
05fc829b | 161 | if (curlen > 0 && UNLIKELY(*s >= 0xFE)) { |
aadf4f9e | 162 | |
6858ac42 KW |
163 | /* First, on a 32-bit machine the first byte being at least \xFE |
164 | * automatically is overflow, as it indicates something requiring more | |
165 | * than 31 bits */ | |
aadf4f9e N |
166 | if (sizeof(ret) < 8) { |
167 | overflows = 1; | |
05fc829b | 168 | overflow_length = 7; |
aadf4f9e N |
169 | } |
170 | else { | |
171 | const U8 highest[] = /* 2*63-1 */ | |
172 | "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"; | |
173 | const U8 *cur_h = highest; | |
174 | ||
175 | for (cur_s = s; cur_s < send; cur_s++, cur_h++) { | |
176 | if (UNLIKELY(*cur_s == *cur_h)) { | |
177 | continue; | |
178 | } | |
179 | ||
180 | /* If this byte is larger than the corresponding highest UTF-8 | |
6858ac42 KW |
181 | * byte, the sequence overflows; otherwise the byte is less |
182 | * than (as we handled the equality case above), and so the | |
183 | * sequence doesn't overflow */ | |
aadf4f9e N |
184 | overflows = *cur_s > *cur_h; |
185 | break; | |
186 | ||
187 | } | |
188 | ||
189 | /* Here, either we set the bool and broke out of the loop, or got | |
190 | * to the end and all bytes are the same which indicates it doesn't | |
05fc829b KW |
191 | * overflow. If it did overflow, it would be this number of bytes |
192 | * */ | |
193 | overflow_length = 13; | |
194 | } | |
195 | } | |
196 | ||
197 | if (UNLIKELY(overflows)) { | |
198 | ret = 0; | |
199 | ||
200 | if (! do_warnings && retlen) { | |
201 | *retlen = overflow_length; | |
aadf4f9e N |
202 | } |
203 | } | |
05fc829b | 204 | else |
aadf4f9e | 205 | |
aadf4f9e N |
206 | # endif /* < 5.26 */ |
207 | ||
05fc829b KW |
208 | /* Here, we are either in a release that properly detects overflow, or |
209 | * we have checked for overflow and the next statement is executing as | |
210 | * part of the above conditional where we know we don't have overflow. | |
211 | * | |
212 | * The modern versions allow anything that evaluates to a legal UV, but | |
213 | * not overlongs nor an empty input */ | |
214 | ret = _ppport_utf8_to_uvchr_buf_callee( | |
215 | s, curlen, retlen, (UTF8_ALLOW_ANYUV | |
216 | & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY))); | |
217 | ||
218 | # if { VERSION >= 5.26.0 } && { VERSION < 5.28.0 } | |
219 | ||
220 | /* But actually, more modern versions restrict the UV to being no more than | |
221 | * what * an IV can hold, so it could, so it could still have gotten it | |
222 | * wrong about overflowing. */ | |
223 | if (UNLIKELY(ret > IV_MAX)) { | |
224 | overflows = 1; | |
225 | } | |
226 | ||
227 | # endif | |
228 | ||
aadf4f9e N |
229 | if (UNLIKELY(overflows)) { |
230 | if (! do_warnings) { | |
231 | if (retlen) { | |
232 | *retlen = _ppport_MIN(*retlen, UTF8SKIP(s)); | |
233 | *retlen = _ppport_MIN(*retlen, curlen); | |
234 | } | |
235 | return UNICODE_REPLACEMENT; | |
236 | } | |
237 | else { | |
238 | ||
6858ac42 | 239 | /* We use the error message in use from 5.8-5.26 */ |
de222e8b KW |
240 | Perl_warner(aTHX_ packWARN(WARN_UTF8), |
241 | "Malformed UTF-8 character (overflow at 0x%" UVxf | |
242 | ", byte 0x%02x, after start byte 0x%02x)", | |
243 | ret, *cur_s, *s); | |
aadf4f9e N |
244 | if (retlen) { |
245 | *retlen = (STRLEN) -1; | |
246 | } | |
247 | return 0; | |
248 | } | |
249 | } | |
250 | ||
6858ac42 KW |
251 | /* Here, did not overflow, but if it failed for some other reason, and |
252 | * warnings are off, to emulate the behavior of the real utf8_to_uvchr(), | |
253 | * try again, allowing anything. (Note a return of 0 is ok if the input | |
254 | * was '\0') */ | |
aadf4f9e N |
255 | if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) { |
256 | ||
257 | /* If curlen is 0, we already handled the case where warnings are | |
6858ac42 KW |
258 | * disabled, so this 'if' will be true, and so later on, we know that |
259 | * 's' is dereferencible */ | |
aadf4f9e N |
260 | if (do_warnings) { |
261 | *retlen = (STRLEN) -1; | |
262 | } | |
263 | else { | |
264 | ret = _ppport_utf8_to_uvchr_buf_callee( | |
265 | s, curlen, retlen, UTF8_ALLOW_ANY); | |
266 | /* Override with the REPLACEMENT character, as that is what the | |
267 | * modern version of this function returns */ | |
268 | ret = UNICODE_REPLACEMENT; | |
269 | ||
de222e8b | 270 | # if { VERSION < 5.16.0 } |
aadf4f9e N |
271 | |
272 | /* Versions earlier than this don't necessarily return the proper | |
273 | * length. It should not extend past the end of string, nor past | |
274 | * what the first byte indicates the length is, nor past the | |
275 | * continuation characters */ | |
276 | if (retlen && *retlen >= 0) { | |
f1590d76 | 277 | unsigned int i = 1; |
aadf4f9e N |
278 | *retlen = _ppport_MIN(*retlen, curlen); |
279 | *retlen = _ppport_MIN(*retlen, UTF8SKIP(s)); | |
aadf4f9e N |
280 | do { |
281 | if (s[i] < 0x80 || s[i] > 0xBF) { | |
282 | *retlen = i; | |
283 | break; | |
284 | } | |
285 | } while (++i < *retlen); | |
286 | } | |
287 | ||
de222e8b | 288 | # endif |
aadf4f9e N |
289 | |
290 | } | |
291 | } | |
292 | ||
293 | return ret; | |
294 | } | |
e28192fd | 295 | |
aadf4f9e N |
296 | # endif |
297 | #endif | |
39d7245c | 298 | |
aadf4f9e N |
299 | #if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf) |
300 | #undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses | |
301 | to read past a NUL, making it much less likely to read | |
302 | off the end of the buffer. A NUL indicates the start | |
303 | of the next character anyway. If the input isn't | |
304 | NUL-terminated, the function remains unsafe, as it | |
305 | always has been. */ | |
39d7245c KW |
306 | |
307 | __UNDEFINED__ utf8_to_uvchr(s, lp) \ | |
308 | ((*(s) == '\0') \ | |
309 | ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */ \ | |
310 | : utf8_to_uvchr_buf(s, (s) + my_strnlen((char *) (s), UTF8SKIP(s)), (lp))) | |
311 | ||
aadf4f9e N |
312 | #endif |
313 | ||
e28192fd KW |
314 | =xsinit |
315 | ||
316 | #define NEED_my_strnlen | |
aadf4f9e | 317 | #define NEED_utf8_to_uvchr_buf |
e28192fd | 318 | |
adfe19db MHM |
319 | =xsubs |
320 | ||
321 | SV * | |
322 | sv_setuv(uv) | |
b2049988 MHM |
323 | UV uv |
324 | CODE: | |
325 | RETVAL = newSViv(1); | |
326 | sv_setuv(RETVAL, uv); | |
327 | OUTPUT: | |
328 | RETVAL | |
adfe19db MHM |
329 | |
330 | SV * | |
331 | newSVuv(uv) | |
b2049988 MHM |
332 | UV uv |
333 | CODE: | |
334 | RETVAL = newSVuv(uv); | |
335 | OUTPUT: | |
336 | RETVAL | |
adfe19db MHM |
337 | |
338 | UV | |
339 | sv_2uv(sv) | |
b2049988 MHM |
340 | SV *sv |
341 | CODE: | |
342 | RETVAL = sv_2uv(sv); | |
343 | OUTPUT: | |
344 | RETVAL | |
adfe19db MHM |
345 | |
346 | UV | |
347 | SvUVx(sv) | |
b2049988 MHM |
348 | SV *sv |
349 | CODE: | |
350 | sv--; | |
351 | RETVAL = SvUVx(++sv); | |
352 | OUTPUT: | |
353 | RETVAL | |
adfe19db MHM |
354 | |
355 | void | |
356 | XSRETURN_UV() | |
b2049988 MHM |
357 | PPCODE: |
358 | XSRETURN_UV(42); | |
adfe19db | 359 | |
96ad942f MHM |
360 | void |
361 | PUSHu() | |
b2049988 MHM |
362 | PREINIT: |
363 | dTARG; | |
364 | PPCODE: | |
365 | TARG = sv_newmortal(); | |
366 | EXTEND(SP, 1); | |
367 | PUSHu(42); | |
368 | XSRETURN(1); | |
96ad942f MHM |
369 | |
370 | void | |
371 | XPUSHu() | |
b2049988 MHM |
372 | PREINIT: |
373 | dTARG; | |
374 | PPCODE: | |
375 | TARG = sv_newmortal(); | |
376 | XPUSHu(43); | |
377 | XSRETURN(1); | |
96ad942f | 378 | |
f2e3e4ce KW |
379 | STRLEN |
380 | UTF8_SAFE_SKIP(s, adjustment) | |
381 | unsigned char * s | |
382 | int adjustment | |
383 | CODE: | |
384 | /* Instead of passing in an 'e' ptr, use the real end, adjusted */ | |
9aa6a863 | 385 | #if defined(UTF8_SAFE_SKIP) && defined(UTF8SKIP) |
f2e3e4ce | 386 | RETVAL = UTF8_SAFE_SKIP(s, s + UTF8SKIP(s) + adjustment); |
9aa6a863 N |
387 | #else |
388 | RETVAL = 0; | |
389 | #endif | |
f2e3e4ce KW |
390 | OUTPUT: |
391 | RETVAL | |
392 | ||
e28192fd KW |
393 | STRLEN |
394 | my_strnlen(s, max) | |
395 | char * s | |
396 | STRLEN max | |
397 | CODE: | |
398 | RETVAL= my_strnlen(s, max); | |
399 | OUTPUT: | |
400 | RETVAL | |
401 | ||
4c28bdc5 | 402 | AV * |
aadf4f9e | 403 | utf8_to_uvchr_buf(s, adjustment) |
4c28bdc5 | 404 | unsigned char *s |
aadf4f9e | 405 | int adjustment |
4c28bdc5 KW |
406 | PREINIT: |
407 | AV *av; | |
408 | STRLEN len; | |
409 | CODE: | |
410 | av = newAV(); | |
9aa6a863 | 411 | #ifdef utf8_to_uvchr_buf |
aadf4f9e N |
412 | av_push(av, newSVuv(utf8_to_uvchr_buf(s, |
413 | s + UTF8SKIP(s) + adjustment, | |
414 | &len))); | |
9aa6a863 N |
415 | #else |
416 | av_push(av, newSVuv(0)); | |
417 | len = (STRLEN) -1; | |
418 | #endif | |
721db7b0 N |
419 | if (len == (STRLEN) -1) { |
420 | av_push(av, newSViv(-1)); | |
421 | } | |
422 | else { | |
423 | av_push(av, newSVuv(len)); | |
424 | } | |
4c28bdc5 KW |
425 | RETVAL = av; |
426 | OUTPUT: | |
427 | RETVAL | |
428 | ||
39d7245c KW |
429 | AV * |
430 | utf8_to_uvchr(s) | |
431 | unsigned char *s | |
432 | PREINIT: | |
433 | AV *av; | |
434 | STRLEN len; | |
435 | CODE: | |
436 | av = newAV(); | |
9aa6a863 | 437 | #ifdef utf8_to_uvchr |
39d7245c | 438 | av_push(av, newSVuv(utf8_to_uvchr(s, &len))); |
9aa6a863 N |
439 | #else |
440 | av_push(av, newSVuv(0)); | |
441 | len = (STRLEN) -1; | |
442 | #endif | |
721db7b0 N |
443 | if (len == (STRLEN) -1) { |
444 | av_push(av, newSViv(-1)); | |
445 | } | |
446 | else { | |
447 | av_push(av, newSVuv(len)); | |
448 | } | |
39d7245c KW |
449 | RETVAL = av; |
450 | OUTPUT: | |
451 | RETVAL | |
452 | ||
5f93efa0 | 453 | =tests plan => 62 |
adfe19db MHM |
454 | |
455 | ok(&Devel::PPPort::sv_setuv(42), 42); | |
456 | ok(&Devel::PPPort::newSVuv(123), 123); | |
457 | ok(&Devel::PPPort::sv_2uv("4711"), 4711); | |
458 | ok(&Devel::PPPort::sv_2uv("1735928559"), 1735928559); | |
459 | ok(&Devel::PPPort::SvUVx("1735928559"), 1735928559); | |
460 | ok(&Devel::PPPort::SvUVx(1735928559), 1735928559); | |
461 | ok(&Devel::PPPort::SvUVx(0xdeadbeef), 0xdeadbeef); | |
462 | ok(&Devel::PPPort::XSRETURN_UV(), 42); | |
96ad942f MHM |
463 | ok(&Devel::PPPort::PUSHu(), 42); |
464 | ok(&Devel::PPPort::XPUSHu(), 43); | |
f2e3e4ce KW |
465 | ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1); |
466 | ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0); | |
e28192fd | 467 | ok(&Devel::PPPort::my_strnlen("abc\0def", 7), 3); |
aadf4f9e | 468 | |
89d8c7cd KW |
469 | # skip tests on 5.6.0 and earlier |
470 | if ("$]" le '5.006') { | |
5f93efa0 | 471 | skip 'skip: broken utf8 support', 0 for 1..49; |
89d8c7cd KW |
472 | exit; |
473 | } | |
474 | ||
aadf4f9e | 475 | my $ret = &Devel::PPPort::utf8_to_uvchr("A"); |
4c28bdc5 KW |
476 | ok($ret->[0], ord("A")); |
477 | ok($ret->[1], 1); | |
aadf4f9e N |
478 | |
479 | $ret = &Devel::PPPort::utf8_to_uvchr("\0"); | |
4c28bdc5 KW |
480 | ok($ret->[0], 0); |
481 | ok($ret->[1], 1); | |
aadf4f9e N |
482 | |
483 | $ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0); | |
39d7245c KW |
484 | ok($ret->[0], ord("A")); |
485 | ok($ret->[1], 1); | |
aadf4f9e N |
486 | |
487 | $ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0); | |
39d7245c KW |
488 | ok($ret->[0], 0); |
489 | ok($ret->[1], 1); | |
aadf4f9e N |
490 | |
491 | if (ord("A") != 65) { # tests not valid for EBCDIC | |
5f93efa0 | 492 | ok(1, 1) for 1 .. (2 + 4 + (7 * 5)); |
aadf4f9e N |
493 | } |
494 | else { | |
495 | $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0); | |
496 | ok($ret->[0], 0x100); | |
497 | ok($ret->[1], 2); | |
498 | ||
499 | my @warnings; | |
500 | local $SIG{__WARN__} = sub { push @warnings, @_; }; | |
501 | ||
502 | { | |
503 | use warnings 'utf8'; | |
504 | $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80"); | |
505 | ok($ret->[0], 0); | |
506 | ok($ret->[1], -1); | |
507 | ||
508 | no warnings; | |
509 | $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80"); | |
510 | ok($ret->[0], 0xFFFD); | |
511 | ok($ret->[1], 1); | |
512 | } | |
513 | ||
514 | my @buf_tests = ( | |
515 | { | |
516 | input => "A", | |
517 | adjustment => -1, | |
518 | warning => qr/empty/, | |
519 | no_warnings_returned_length => 0, | |
520 | }, | |
521 | { | |
522 | input => "\xc4\xc5", | |
523 | adjustment => 0, | |
524 | warning => qr/non-continuation/, | |
525 | no_warnings_returned_length => 1, | |
526 | }, | |
527 | { | |
528 | input => "\xc4\x80", | |
529 | adjustment => -1, | |
530 | warning => qr/short|1 byte, need 2/, | |
531 | no_warnings_returned_length => 1, | |
532 | }, | |
533 | { | |
534 | input => "\xc0\x81", | |
535 | adjustment => 0, | |
536 | warning => qr/overlong|2 bytes, need 1/, | |
537 | no_warnings_returned_length => 2, | |
538 | }, | |
5f93efa0 KW |
539 | { |
540 | input => "\xe0\x80\x81", | |
541 | adjustment => 0, | |
542 | warning => qr/overlong|3 bytes, need 1/, | |
543 | no_warnings_returned_length => 3, | |
544 | }, | |
545 | { | |
546 | input => "\xf0\x80\x80\x81", | |
547 | adjustment => 0, | |
548 | warning => qr/overlong|4 bytes, need 1/, | |
549 | no_warnings_returned_length => 4, | |
550 | }, | |
6858ac42 | 551 | { # Old algorithm failed to detect this |
aadf4f9e N |
552 | input => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf", |
553 | adjustment => 0, | |
05fc829b | 554 | warning => qr/overflow/, |
aadf4f9e N |
555 | no_warnings_returned_length => 13, |
556 | }, | |
557 | ); | |
558 | ||
559 | # An empty input is an assertion failure on debugging builds. It is | |
560 | # deliberately the first test. | |
561 | require Config; import Config; | |
562 | use vars '%Config'; | |
563 | if ($Config{ccflags} =~ /-DDEBUGGING/) { | |
564 | shift @buf_tests; | |
565 | ok(1, 1) for 1..5; | |
566 | } | |
567 | ||
568 | for my $test (@buf_tests) { | |
569 | my $input = $test->{'input'}; | |
570 | my $adjustment = $test->{'adjustment'}; | |
571 | my $display = 'utf8_to_uvchr_buf("'; | |
572 | for (my $i = 0; $i < length($input) + $adjustment; $i++) { | |
573 | $display .= sprintf "\\x%02x", ord substr($input, $i, 1); | |
574 | } | |
575 | ||
576 | $display .= '")'; | |
577 | my $warning = $test->{'warning'}; | |
578 | ||
579 | undef @warnings; | |
580 | use warnings 'utf8'; | |
581 | $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment); | |
582 | ok($ret->[0], 0, "returned value $display; warnings enabled"); | |
583 | ok($ret->[1], -1, "returned length $display; warnings enabled"); | |
584 | my $all_warnings = join "; ", @warnings; | |
585 | my $contains = grep { $_ =~ $warning } $all_warnings; | |
17c135f5 KW |
586 | ok($contains, 1, $display |
587 | . "; Got: '$all_warnings', which should contain '$warning'"); | |
aadf4f9e N |
588 | |
589 | undef @warnings; | |
590 | no warnings 'utf8'; | |
591 | $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment); | |
592 | ok($ret->[0], 0xFFFD, "returned value $display; warnings disabled"); | |
593 | ok($ret->[1], $test->{'no_warnings_returned_length'}, | |
594 | "returned length $display; warnings disabled"); | |
595 | } | |
596 | } |