Commit | Line | Data |
---|---|---|
adfe19db MHM |
1 | ################################################################################ |
2 | ## | |
b2049988 | 3 | ## Version 3.x, Copyright (C) 2004-2013, Marcus Holland-Moritz. |
adfe19db MHM |
4 | ## Version 2.x, Copyright (C) 2001, Paul Marquess. |
5 | ## Version 1.x, Copyright (C) 1999, Kenneth Albanowski. | |
6 | ## | |
7 | ## This program is free software; you can redistribute it and/or | |
8 | ## modify it under the same terms as Perl itself. | |
9 | ## | |
10 | ################################################################################ | |
11 | ||
12 | =provides | |
13 | ||
adfe19db | 14 | __UNDEFINED__ |
e28192fd | 15 | my_strnlen |
679ad62d | 16 | SvUOK |
aadf4f9e N |
17 | utf8_to_uvchr_buf |
18 | ||
adfe19db MHM |
19 | =implementation |
20 | ||
bbe8b705 | 21 | #define D_PPP_MIN(a,b) (((a) <= (b)) ? (a) : (b)) |
aadf4f9e | 22 | |
0d0f8426 MHM |
23 | __UNDEFINED__ sv_setuv(sv, uv) \ |
24 | STMT_START { \ | |
25 | UV TeMpUv = uv; \ | |
26 | if (TeMpUv <= IV_MAX) \ | |
27 | sv_setiv(sv, TeMpUv); \ | |
28 | else \ | |
29 | sv_setnv(sv, (double)TeMpUv); \ | |
30 | } STMT_END | |
31 | ||
32 | __UNDEFINED__ newSVuv(uv) ((uv) <= IV_MAX ? newSViv((IV)uv) : newSVnv((NV)uv)) | |
adfe19db MHM |
33 | |
34 | __UNDEFINED__ sv_2uv(sv) ((PL_Sv = (sv)), (UV) (SvNOK(PL_Sv) ? SvNV(PL_Sv) : sv_2nv(PL_Sv))) | |
35 | __UNDEFINED__ SvUVX(sv) ((UV)SvIVX(sv)) | |
36 | __UNDEFINED__ SvUVXx(sv) SvUVX(sv) | |
37 | __UNDEFINED__ SvUV(sv) (SvIOK(sv) ? SvUVX(sv) : sv_2uv(sv)) | |
38 | __UNDEFINED__ SvUVx(sv) ((PL_Sv = (sv)), SvUV(PL_Sv)) | |
39 | ||
40 | /* Hint: sv_uv | |
41 | * Always use the SvUVx() macro instead of sv_uv(). | |
42 | */ | |
43 | __UNDEFINED__ sv_uv(sv) SvUVx(sv) | |
44 | ||
679ad62d MHM |
45 | #if !defined(SvUOK) && defined(SvIOK_UV) |
46 | # define SvUOK(sv) SvIOK_UV(sv) | |
47 | #endif | |
48 | ||
adfe19db MHM |
49 | __UNDEFINED__ XST_mUV(i,v) (ST(i) = sv_2mortal(newSVuv(v)) ) |
50 | __UNDEFINED__ XSRETURN_UV(v) STMT_START { XST_mUV(0,v); XSRETURN(1); } STMT_END | |
51 | ||
96ad942f MHM |
52 | __UNDEFINED__ PUSHu(u) STMT_START { sv_setuv(TARG, (UV)(u)); PUSHTARG; } STMT_END |
53 | __UNDEFINED__ XPUSHu(u) STMT_START { sv_setuv(TARG, (UV)(u)); XPUSHTARG; } STMT_END | |
54 | ||
f2e3e4ce | 55 | #if defined UTF8SKIP |
94a2c4f7 KW |
56 | |
57 | /* Don't use official version because it uses MIN, which may not be available */ | |
58 | #undef UTF8_SAFE_SKIP | |
59 | ||
aadf4f9e N |
60 | __UNDEFINED__ UTF8_SAFE_SKIP(s, e) ( \ |
61 | ((((e) - (s)) <= 0) \ | |
f2e3e4ce | 62 | ? 0 \ |
bbe8b705 | 63 | : D_PPP_MIN(((e) - (s)), UTF8SKIP(s)))) |
f2e3e4ce KW |
64 | #endif |
65 | ||
e28192fd KW |
66 | #if !defined(my_strnlen) |
67 | #if { NEED my_strnlen } | |
68 | ||
69 | STRLEN | |
70 | my_strnlen(const char *str, Size_t maxlen) | |
71 | { | |
72 | const char *p = str; | |
73 | ||
74 | while(maxlen-- && *p) | |
75 | p++; | |
76 | ||
77 | return p - str; | |
78 | } | |
79 | ||
80 | #endif | |
81 | #endif | |
4c28bdc5 | 82 | |
97d83eb2 | 83 | #if { VERSION < 5.31.3 } |
aadf4f9e N |
84 | /* Versions prior to this accepted things that are now considered |
85 | * malformations, and didn't return -1 on error with warnings enabled | |
86 | * */ | |
87 | # undef utf8_to_uvchr_buf | |
88 | #endif | |
4c28bdc5 | 89 | |
aadf4f9e N |
90 | /* This implementation brings modern, generally more restricted standards to |
91 | * utf8_to_uvchr_buf. Some of these are security related, and clearly must | |
92 | * be done. But its arguable that the others need not, and hence should not. | |
93 | * The reason they're here is that a module that intends to play with the | |
6858ac42 | 94 | * latest perls should be able to work the same in all releases. An example is |
aadf4f9e N |
95 | * that perl no longer accepts any UV for a code point, but limits them to |
96 | * IV_MAX or below. This is for future internal use of the larger code points. | |
97 | * If it turns out that some of these changes are breaking code that isn't | |
98 | * intended to work with modern perls, the tighter restrictions could be | |
99 | * relaxed. khw thinks this is unlikely, but has been wrong in the past. */ | |
100 | ||
6858ac42 KW |
101 | /* 5.6.0 is the first release with UTF-8, and we don't implement this function |
102 | * there due to its likely lack of still being in use, and the underlying | |
103 | * implementation is very different from later ones, without the later | |
104 | * safeguards, so would require extra work to deal with */ | |
e4a76720 | 105 | #if { VERSION >= 5.6.1 } && ! defined(utf8_to_uvchr_buf) |
de222e8b KW |
106 | # if { VERSION < 5.10.0 } /* Was non-const before this */ |
107 | # define D_PPP_CU8 U8 | |
108 | # else | |
109 | # define D_PPP_CU8 const U8 | |
110 | # endif | |
9aa6a863 | 111 | |
aadf4f9e N |
112 | /* Choose which underlying implementation to use. At least one must be |
113 | * present or the perl is too early to handle this function */ | |
114 | # if defined(utf8n_to_uvchr) || defined(utf8_to_uv) | |
115 | # if defined(utf8n_to_uvchr) /* This is the preferred implementation */ | |
bbe8b705 | 116 | # define D_PPP_utf8_to_uvchr_buf_callee utf8n_to_uvchr |
e4a76720 | 117 | # else /* Must be at least 5.6.1 from #if above */ |
bbe8b705 | 118 | # define D_PPP_utf8_to_uvchr_buf_callee utf8_to_uv |
aadf4f9e | 119 | # endif |
aadf4f9e N |
120 | # endif |
121 | ||
aadf4f9e | 122 | # if { NEED utf8_to_uvchr_buf } |
4c28bdc5 | 123 | |
aadf4f9e | 124 | UV |
9aa6a863 | 125 | utf8_to_uvchr_buf(pTHX_ D_PPP_CU8 *s, const U8 *send, STRLEN *retlen) |
aadf4f9e N |
126 | { |
127 | UV ret; | |
128 | STRLEN curlen; | |
129 | bool overflows = 0; | |
130 | const U8 *cur_s = s; | |
131 | const bool do_warnings = ckWARN_d(WARN_UTF8); | |
05fc829b | 132 | STRLEN overflow_length = 0; |
aadf4f9e N |
133 | |
134 | if (send > s) { | |
135 | curlen = send - s; | |
136 | } | |
137 | else { | |
138 | assert(0); /* Modern perls die under this circumstance */ | |
139 | curlen = 0; | |
140 | if (! do_warnings) { /* Handle empty here if no warnings needed */ | |
141 | if (retlen) *retlen = 0; | |
142 | return UNICODE_REPLACEMENT; | |
143 | } | |
144 | } | |
145 | ||
8ec545d9 | 146 | # if { VERSION < 5.26.0 } && ! defined(EBCDIC) |
aadf4f9e | 147 | |
6858ac42 KW |
148 | /* Perl did not properly detect overflow for much of its history on |
149 | * non-EBCDIC platforms, often returning an overlong value which may or may | |
150 | * not have been tolerated in the call. Also, earlier versions, when they | |
151 | * did detect overflow, may have disallowed it completely. Modern ones can | |
152 | * replace it with the REPLACEMENT CHARACTER, depending on calling | |
153 | * parameters. Therefore detect it ourselves in releases it was | |
154 | * problematic in. */ | |
aadf4f9e | 155 | |
05fc829b | 156 | if (curlen > 0 && UNLIKELY(*s >= 0xFE)) { |
aadf4f9e | 157 | |
6858ac42 KW |
158 | /* First, on a 32-bit machine the first byte being at least \xFE |
159 | * automatically is overflow, as it indicates something requiring more | |
160 | * than 31 bits */ | |
aadf4f9e N |
161 | if (sizeof(ret) < 8) { |
162 | overflows = 1; | |
05fc829b | 163 | overflow_length = 7; |
aadf4f9e N |
164 | } |
165 | else { | |
166 | const U8 highest[] = /* 2*63-1 */ | |
167 | "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"; | |
168 | const U8 *cur_h = highest; | |
169 | ||
170 | for (cur_s = s; cur_s < send; cur_s++, cur_h++) { | |
171 | if (UNLIKELY(*cur_s == *cur_h)) { | |
172 | continue; | |
173 | } | |
174 | ||
175 | /* If this byte is larger than the corresponding highest UTF-8 | |
6858ac42 KW |
176 | * byte, the sequence overflows; otherwise the byte is less |
177 | * than (as we handled the equality case above), and so the | |
178 | * sequence doesn't overflow */ | |
aadf4f9e N |
179 | overflows = *cur_s > *cur_h; |
180 | break; | |
181 | ||
182 | } | |
183 | ||
184 | /* Here, either we set the bool and broke out of the loop, or got | |
185 | * to the end and all bytes are the same which indicates it doesn't | |
05fc829b KW |
186 | * overflow. If it did overflow, it would be this number of bytes |
187 | * */ | |
188 | overflow_length = 13; | |
189 | } | |
190 | } | |
191 | ||
192 | if (UNLIKELY(overflows)) { | |
193 | ret = 0; | |
194 | ||
195 | if (! do_warnings && retlen) { | |
196 | *retlen = overflow_length; | |
aadf4f9e N |
197 | } |
198 | } | |
05fc829b | 199 | else |
aadf4f9e | 200 | |
aadf4f9e N |
201 | # endif /* < 5.26 */ |
202 | ||
05fc829b KW |
203 | /* Here, we are either in a release that properly detects overflow, or |
204 | * we have checked for overflow and the next statement is executing as | |
205 | * part of the above conditional where we know we don't have overflow. | |
206 | * | |
207 | * The modern versions allow anything that evaluates to a legal UV, but | |
208 | * not overlongs nor an empty input */ | |
bbe8b705 | 209 | ret = D_PPP_utf8_to_uvchr_buf_callee( |
05fc829b KW |
210 | s, curlen, retlen, (UTF8_ALLOW_ANYUV |
211 | & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY))); | |
212 | ||
213 | # if { VERSION >= 5.26.0 } && { VERSION < 5.28.0 } | |
214 | ||
215 | /* But actually, more modern versions restrict the UV to being no more than | |
216 | * what * an IV can hold, so it could, so it could still have gotten it | |
217 | * wrong about overflowing. */ | |
218 | if (UNLIKELY(ret > IV_MAX)) { | |
219 | overflows = 1; | |
220 | } | |
221 | ||
222 | # endif | |
223 | ||
aadf4f9e N |
224 | if (UNLIKELY(overflows)) { |
225 | if (! do_warnings) { | |
226 | if (retlen) { | |
bbe8b705 KW |
227 | *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s)); |
228 | *retlen = D_PPP_MIN(*retlen, curlen); | |
aadf4f9e N |
229 | } |
230 | return UNICODE_REPLACEMENT; | |
231 | } | |
232 | else { | |
233 | ||
6858ac42 | 234 | /* We use the error message in use from 5.8-5.26 */ |
de222e8b KW |
235 | Perl_warner(aTHX_ packWARN(WARN_UTF8), |
236 | "Malformed UTF-8 character (overflow at 0x%" UVxf | |
237 | ", byte 0x%02x, after start byte 0x%02x)", | |
238 | ret, *cur_s, *s); | |
aadf4f9e N |
239 | if (retlen) { |
240 | *retlen = (STRLEN) -1; | |
241 | } | |
242 | return 0; | |
243 | } | |
244 | } | |
245 | ||
6858ac42 KW |
246 | /* Here, did not overflow, but if it failed for some other reason, and |
247 | * warnings are off, to emulate the behavior of the real utf8_to_uvchr(), | |
248 | * try again, allowing anything. (Note a return of 0 is ok if the input | |
249 | * was '\0') */ | |
aadf4f9e N |
250 | if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) { |
251 | ||
252 | /* If curlen is 0, we already handled the case where warnings are | |
6858ac42 KW |
253 | * disabled, so this 'if' will be true, and so later on, we know that |
254 | * 's' is dereferencible */ | |
aadf4f9e N |
255 | if (do_warnings) { |
256 | *retlen = (STRLEN) -1; | |
257 | } | |
258 | else { | |
bbe8b705 | 259 | ret = D_PPP_utf8_to_uvchr_buf_callee( |
aadf4f9e N |
260 | s, curlen, retlen, UTF8_ALLOW_ANY); |
261 | /* Override with the REPLACEMENT character, as that is what the | |
262 | * modern version of this function returns */ | |
263 | ret = UNICODE_REPLACEMENT; | |
264 | ||
de222e8b | 265 | # if { VERSION < 5.16.0 } |
aadf4f9e N |
266 | |
267 | /* Versions earlier than this don't necessarily return the proper | |
268 | * length. It should not extend past the end of string, nor past | |
269 | * what the first byte indicates the length is, nor past the | |
270 | * continuation characters */ | |
271 | if (retlen && *retlen >= 0) { | |
f1590d76 | 272 | unsigned int i = 1; |
bbe8b705 KW |
273 | |
274 | *retlen = D_PPP_MIN(*retlen, curlen); | |
275 | *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s)); | |
aadf4f9e N |
276 | do { |
277 | if (s[i] < 0x80 || s[i] > 0xBF) { | |
278 | *retlen = i; | |
279 | break; | |
280 | } | |
281 | } while (++i < *retlen); | |
282 | } | |
283 | ||
de222e8b | 284 | # endif |
aadf4f9e N |
285 | |
286 | } | |
287 | } | |
288 | ||
289 | return ret; | |
290 | } | |
e28192fd | 291 | |
aadf4f9e N |
292 | # endif |
293 | #endif | |
39d7245c | 294 | |
aadf4f9e N |
295 | #if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf) |
296 | #undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses | |
297 | to read past a NUL, making it much less likely to read | |
298 | off the end of the buffer. A NUL indicates the start | |
299 | of the next character anyway. If the input isn't | |
300 | NUL-terminated, the function remains unsafe, as it | |
301 | always has been. */ | |
39d7245c KW |
302 | |
303 | __UNDEFINED__ utf8_to_uvchr(s, lp) \ | |
304 | ((*(s) == '\0') \ | |
305 | ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */ \ | |
306 | : utf8_to_uvchr_buf(s, (s) + my_strnlen((char *) (s), UTF8SKIP(s)), (lp))) | |
307 | ||
aadf4f9e N |
308 | #endif |
309 | ||
e28192fd KW |
310 | =xsinit |
311 | ||
312 | #define NEED_my_strnlen | |
aadf4f9e | 313 | #define NEED_utf8_to_uvchr_buf |
e28192fd | 314 | |
adfe19db MHM |
315 | =xsubs |
316 | ||
317 | SV * | |
318 | sv_setuv(uv) | |
b2049988 MHM |
319 | UV uv |
320 | CODE: | |
321 | RETVAL = newSViv(1); | |
322 | sv_setuv(RETVAL, uv); | |
323 | OUTPUT: | |
324 | RETVAL | |
adfe19db MHM |
325 | |
326 | SV * | |
327 | newSVuv(uv) | |
b2049988 MHM |
328 | UV uv |
329 | CODE: | |
330 | RETVAL = newSVuv(uv); | |
331 | OUTPUT: | |
332 | RETVAL | |
adfe19db MHM |
333 | |
334 | UV | |
335 | sv_2uv(sv) | |
b2049988 MHM |
336 | SV *sv |
337 | CODE: | |
338 | RETVAL = sv_2uv(sv); | |
339 | OUTPUT: | |
340 | RETVAL | |
adfe19db MHM |
341 | |
342 | UV | |
343 | SvUVx(sv) | |
b2049988 MHM |
344 | SV *sv |
345 | CODE: | |
346 | sv--; | |
347 | RETVAL = SvUVx(++sv); | |
348 | OUTPUT: | |
349 | RETVAL | |
adfe19db MHM |
350 | |
351 | void | |
352 | XSRETURN_UV() | |
b2049988 MHM |
353 | PPCODE: |
354 | XSRETURN_UV(42); | |
adfe19db | 355 | |
96ad942f MHM |
356 | void |
357 | PUSHu() | |
b2049988 MHM |
358 | PREINIT: |
359 | dTARG; | |
360 | PPCODE: | |
361 | TARG = sv_newmortal(); | |
362 | EXTEND(SP, 1); | |
363 | PUSHu(42); | |
364 | XSRETURN(1); | |
96ad942f MHM |
365 | |
366 | void | |
367 | XPUSHu() | |
b2049988 MHM |
368 | PREINIT: |
369 | dTARG; | |
370 | PPCODE: | |
371 | TARG = sv_newmortal(); | |
372 | XPUSHu(43); | |
373 | XSRETURN(1); | |
96ad942f | 374 | |
f2e3e4ce KW |
375 | STRLEN |
376 | UTF8_SAFE_SKIP(s, adjustment) | |
377 | unsigned char * s | |
378 | int adjustment | |
379 | CODE: | |
380 | /* Instead of passing in an 'e' ptr, use the real end, adjusted */ | |
9aa6a863 | 381 | #if defined(UTF8_SAFE_SKIP) && defined(UTF8SKIP) |
f2e3e4ce | 382 | RETVAL = UTF8_SAFE_SKIP(s, s + UTF8SKIP(s) + adjustment); |
9aa6a863 N |
383 | #else |
384 | RETVAL = 0; | |
385 | #endif | |
f2e3e4ce KW |
386 | OUTPUT: |
387 | RETVAL | |
388 | ||
e28192fd KW |
389 | STRLEN |
390 | my_strnlen(s, max) | |
391 | char * s | |
392 | STRLEN max | |
393 | CODE: | |
394 | RETVAL= my_strnlen(s, max); | |
395 | OUTPUT: | |
396 | RETVAL | |
397 | ||
4c28bdc5 | 398 | AV * |
aadf4f9e | 399 | utf8_to_uvchr_buf(s, adjustment) |
4c28bdc5 | 400 | unsigned char *s |
aadf4f9e | 401 | int adjustment |
4c28bdc5 KW |
402 | PREINIT: |
403 | AV *av; | |
404 | STRLEN len; | |
405 | CODE: | |
406 | av = newAV(); | |
9aa6a863 | 407 | #ifdef utf8_to_uvchr_buf |
aadf4f9e N |
408 | av_push(av, newSVuv(utf8_to_uvchr_buf(s, |
409 | s + UTF8SKIP(s) + adjustment, | |
410 | &len))); | |
9aa6a863 N |
411 | #else |
412 | av_push(av, newSVuv(0)); | |
413 | len = (STRLEN) -1; | |
414 | #endif | |
721db7b0 N |
415 | if (len == (STRLEN) -1) { |
416 | av_push(av, newSViv(-1)); | |
417 | } | |
418 | else { | |
419 | av_push(av, newSVuv(len)); | |
420 | } | |
4c28bdc5 KW |
421 | RETVAL = av; |
422 | OUTPUT: | |
423 | RETVAL | |
424 | ||
39d7245c KW |
425 | AV * |
426 | utf8_to_uvchr(s) | |
427 | unsigned char *s | |
428 | PREINIT: | |
429 | AV *av; | |
430 | STRLEN len; | |
431 | CODE: | |
432 | av = newAV(); | |
9aa6a863 | 433 | #ifdef utf8_to_uvchr |
39d7245c | 434 | av_push(av, newSVuv(utf8_to_uvchr(s, &len))); |
9aa6a863 N |
435 | #else |
436 | av_push(av, newSVuv(0)); | |
437 | len = (STRLEN) -1; | |
438 | #endif | |
721db7b0 N |
439 | if (len == (STRLEN) -1) { |
440 | av_push(av, newSViv(-1)); | |
441 | } | |
442 | else { | |
443 | av_push(av, newSVuv(len)); | |
444 | } | |
39d7245c KW |
445 | RETVAL = av; |
446 | OUTPUT: | |
447 | RETVAL | |
448 | ||
5f93efa0 | 449 | =tests plan => 62 |
adfe19db MHM |
450 | |
451 | ok(&Devel::PPPort::sv_setuv(42), 42); | |
452 | ok(&Devel::PPPort::newSVuv(123), 123); | |
453 | ok(&Devel::PPPort::sv_2uv("4711"), 4711); | |
454 | ok(&Devel::PPPort::sv_2uv("1735928559"), 1735928559); | |
455 | ok(&Devel::PPPort::SvUVx("1735928559"), 1735928559); | |
456 | ok(&Devel::PPPort::SvUVx(1735928559), 1735928559); | |
457 | ok(&Devel::PPPort::SvUVx(0xdeadbeef), 0xdeadbeef); | |
458 | ok(&Devel::PPPort::XSRETURN_UV(), 42); | |
96ad942f MHM |
459 | ok(&Devel::PPPort::PUSHu(), 42); |
460 | ok(&Devel::PPPort::XPUSHu(), 43); | |
f2e3e4ce KW |
461 | ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1); |
462 | ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0); | |
e28192fd | 463 | ok(&Devel::PPPort::my_strnlen("abc\0def", 7), 3); |
aadf4f9e | 464 | |
89d8c7cd KW |
465 | # skip tests on 5.6.0 and earlier |
466 | if ("$]" le '5.006') { | |
5f93efa0 | 467 | skip 'skip: broken utf8 support', 0 for 1..49; |
89d8c7cd KW |
468 | exit; |
469 | } | |
470 | ||
aadf4f9e | 471 | my $ret = &Devel::PPPort::utf8_to_uvchr("A"); |
4c28bdc5 KW |
472 | ok($ret->[0], ord("A")); |
473 | ok($ret->[1], 1); | |
aadf4f9e N |
474 | |
475 | $ret = &Devel::PPPort::utf8_to_uvchr("\0"); | |
4c28bdc5 KW |
476 | ok($ret->[0], 0); |
477 | ok($ret->[1], 1); | |
aadf4f9e N |
478 | |
479 | $ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0); | |
39d7245c KW |
480 | ok($ret->[0], ord("A")); |
481 | ok($ret->[1], 1); | |
aadf4f9e N |
482 | |
483 | $ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0); | |
39d7245c KW |
484 | ok($ret->[0], 0); |
485 | ok($ret->[1], 1); | |
aadf4f9e N |
486 | |
487 | if (ord("A") != 65) { # tests not valid for EBCDIC | |
5f93efa0 | 488 | ok(1, 1) for 1 .. (2 + 4 + (7 * 5)); |
aadf4f9e N |
489 | } |
490 | else { | |
491 | $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0); | |
492 | ok($ret->[0], 0x100); | |
493 | ok($ret->[1], 2); | |
494 | ||
495 | my @warnings; | |
496 | local $SIG{__WARN__} = sub { push @warnings, @_; }; | |
497 | ||
498 | { | |
499 | use warnings 'utf8'; | |
500 | $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80"); | |
501 | ok($ret->[0], 0); | |
502 | ok($ret->[1], -1); | |
503 | ||
504 | no warnings; | |
505 | $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80"); | |
506 | ok($ret->[0], 0xFFFD); | |
507 | ok($ret->[1], 1); | |
508 | } | |
509 | ||
510 | my @buf_tests = ( | |
511 | { | |
512 | input => "A", | |
513 | adjustment => -1, | |
514 | warning => qr/empty/, | |
515 | no_warnings_returned_length => 0, | |
516 | }, | |
517 | { | |
518 | input => "\xc4\xc5", | |
519 | adjustment => 0, | |
520 | warning => qr/non-continuation/, | |
521 | no_warnings_returned_length => 1, | |
522 | }, | |
523 | { | |
524 | input => "\xc4\x80", | |
525 | adjustment => -1, | |
526 | warning => qr/short|1 byte, need 2/, | |
527 | no_warnings_returned_length => 1, | |
528 | }, | |
529 | { | |
530 | input => "\xc0\x81", | |
531 | adjustment => 0, | |
532 | warning => qr/overlong|2 bytes, need 1/, | |
533 | no_warnings_returned_length => 2, | |
534 | }, | |
5f93efa0 KW |
535 | { |
536 | input => "\xe0\x80\x81", | |
537 | adjustment => 0, | |
538 | warning => qr/overlong|3 bytes, need 1/, | |
539 | no_warnings_returned_length => 3, | |
540 | }, | |
541 | { | |
542 | input => "\xf0\x80\x80\x81", | |
543 | adjustment => 0, | |
544 | warning => qr/overlong|4 bytes, need 1/, | |
545 | no_warnings_returned_length => 4, | |
546 | }, | |
6858ac42 | 547 | { # Old algorithm failed to detect this |
aadf4f9e N |
548 | input => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf", |
549 | adjustment => 0, | |
05fc829b | 550 | warning => qr/overflow/, |
aadf4f9e N |
551 | no_warnings_returned_length => 13, |
552 | }, | |
553 | ); | |
554 | ||
555 | # An empty input is an assertion failure on debugging builds. It is | |
556 | # deliberately the first test. | |
557 | require Config; import Config; | |
558 | use vars '%Config'; | |
559 | if ($Config{ccflags} =~ /-DDEBUGGING/) { | |
560 | shift @buf_tests; | |
561 | ok(1, 1) for 1..5; | |
562 | } | |
563 | ||
564 | for my $test (@buf_tests) { | |
565 | my $input = $test->{'input'}; | |
566 | my $adjustment = $test->{'adjustment'}; | |
567 | my $display = 'utf8_to_uvchr_buf("'; | |
568 | for (my $i = 0; $i < length($input) + $adjustment; $i++) { | |
569 | $display .= sprintf "\\x%02x", ord substr($input, $i, 1); | |
570 | } | |
571 | ||
572 | $display .= '")'; | |
573 | my $warning = $test->{'warning'}; | |
574 | ||
575 | undef @warnings; | |
576 | use warnings 'utf8'; | |
577 | $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment); | |
578 | ok($ret->[0], 0, "returned value $display; warnings enabled"); | |
579 | ok($ret->[1], -1, "returned length $display; warnings enabled"); | |
580 | my $all_warnings = join "; ", @warnings; | |
581 | my $contains = grep { $_ =~ $warning } $all_warnings; | |
17c135f5 KW |
582 | ok($contains, 1, $display |
583 | . "; Got: '$all_warnings', which should contain '$warning'"); | |
aadf4f9e N |
584 | |
585 | undef @warnings; | |
586 | no warnings 'utf8'; | |
587 | $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment); | |
588 | ok($ret->[0], 0xFFFD, "returned value $display; warnings disabled"); | |
589 | ok($ret->[1], $test->{'no_warnings_returned_length'}, | |
590 | "returned length $display; warnings disabled"); | |
591 | } | |
592 | } |