Commit | Line | Data |
---|---|---|
7899b636 KW |
1 | =provides |
2 | ||
3 | __UNDEFINED__ | |
4 | utf8_to_uvchr_buf | |
0570adb7 P |
5 | sv_len_utf8 |
6 | sv_len_utf8_nomg | |
7899b636 KW |
7 | |
8 | =implementation | |
9 | ||
10 | #define D_PPP_MIN(a,b) (((a) <= (b)) ? (a) : (b)) | |
11 | ||
12 | __UNDEFINED__ UNICODE_REPLACEMENT 0xFFFD | |
13 | ||
14 | __UNDEFINED__ UTF8_MAXBYTES UTF8_MAXLEN | |
15 | ||
16 | __UNDEFINED__ UTF8_ALLOW_ANYUV 0 | |
17 | __UNDEFINED__ UTF8_ALLOW_EMPTY 0x0001 | |
18 | __UNDEFINED__ UTF8_ALLOW_CONTINUATION 0x0002 | |
19 | __UNDEFINED__ UTF8_ALLOW_NON_CONTINUATION 0x0004 | |
20 | __UNDEFINED__ UTF8_ALLOW_SHORT 0x0008 | |
21 | __UNDEFINED__ UTF8_ALLOW_LONG 0x0010 | |
22 | __UNDEFINED__ UTF8_ALLOW_OVERFLOW 0x0080 | |
23 | __UNDEFINED__ UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION \ | |
24 | |UTF8_ALLOW_NON_CONTINUATION \ | |
25 | |UTF8_ALLOW_SHORT \ | |
26 | |UTF8_ALLOW_LONG \ | |
27 | |UTF8_ALLOW_OVERFLOW) | |
28 | ||
29 | #if defined UTF8SKIP | |
30 | ||
31 | /* Don't use official version because it uses MIN, which may not be available */ | |
32 | #undef UTF8_SAFE_SKIP | |
33 | ||
34 | __UNDEFINED__ UTF8_SAFE_SKIP(s, e) ( \ | |
35 | ((((e) - (s)) <= 0) \ | |
36 | ? 0 \ | |
37 | : D_PPP_MIN(((e) - (s)), UTF8SKIP(s)))) | |
38 | #endif | |
39 | ||
d2b44e2b KW |
40 | #ifdef is_ascii_string |
41 | __UNDEFINED__ is_invariant_string(s,l) is_ascii_string(s,l) | |
42 | __UNDEFINED__ is_utf8_invariant_string(s,l) is_ascii_string(s,l) | |
43 | ||
44 | /* Hint: is_utf8_invariant_string | |
45 | Please use this instead of is_ascii_string or is_invariant_string | |
46 | */ | |
47 | #endif | |
48 | ||
2ff9e5e8 KW |
49 | #ifdef ibcmp_utf8 |
50 | __UNDEFINED__ foldEQ_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2) \ | |
51 | cBOOL(! ibcmp_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2)) | |
52 | #endif | |
53 | ||
7899b636 KW |
54 | #if defined(is_utf8_string) && defined(UTF8SKIP) |
55 | __UNDEFINED__ isUTF8_CHAR(s0, e) ( \ | |
56 | (e) <= (s0) || ! is_utf8_string(s0, D_PPP_MIN(UTF8SKIP(s0), (e) - (s0))) \ | |
57 | ? 0 \ | |
58 | : UTF8SKIP(s0)) | |
59 | #endif | |
60 | ||
8fc7db65 KW |
61 | #if 'A' == 65 |
62 | __UNDEFINED__ BOM_UTF8 "\xEF\xBB\xBF" | |
63 | __UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xEF\xBF\xBD" | |
64 | #elif '^' == 95 | |
65 | __UNDEFINED__ BOM_UTF8 "\xDD\x73\x66\x73" | |
66 | __UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xDD\x73\x73\x71" | |
67 | #elif '^' == 176 | |
68 | __UNDEFINED__ BOM_UTF8 "\xDD\x72\x65\x72" | |
69 | __UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xDD\x72\x72\x70" | |
70 | #else | |
71 | # error Unknown character set | |
72 | #endif | |
73 | ||
96656f64 | 74 | #if { VERSION < 5.31.4 } |
7899b636 KW |
75 | /* Versions prior to this accepted things that are now considered |
76 | * malformations, and didn't return -1 on error with warnings enabled | |
77 | * */ | |
78 | # undef utf8_to_uvchr_buf | |
79 | #endif | |
80 | ||
81 | /* This implementation brings modern, generally more restricted standards to | |
82 | * utf8_to_uvchr_buf. Some of these are security related, and clearly must | |
83 | * be done. But its arguable that the others need not, and hence should not. | |
84 | * The reason they're here is that a module that intends to play with the | |
85 | * latest perls should be able to work the same in all releases. An example is | |
86 | * that perl no longer accepts any UV for a code point, but limits them to | |
87 | * IV_MAX or below. This is for future internal use of the larger code points. | |
88 | * If it turns out that some of these changes are breaking code that isn't | |
89 | * intended to work with modern perls, the tighter restrictions could be | |
90 | * relaxed. khw thinks this is unlikely, but has been wrong in the past. */ | |
91 | ||
92 | /* 5.6.0 is the first release with UTF-8, and we don't implement this function | |
93 | * there due to its likely lack of still being in use, and the underlying | |
94 | * implementation is very different from later ones, without the later | |
95 | * safeguards, so would require extra work to deal with */ | |
96 | #if { VERSION >= 5.6.1 } && ! defined(utf8_to_uvchr_buf) | |
97 | /* Choose which underlying implementation to use. At least one must be | |
98 | * present or the perl is too early to handle this function */ | |
99 | # if defined(utf8n_to_uvchr) || defined(utf8_to_uv) | |
100 | # if defined(utf8n_to_uvchr) /* This is the preferred implementation */ | |
101 | # define D_PPP_utf8_to_uvchr_buf_callee utf8n_to_uvchr | |
102 | # else /* Must be at least 5.6.1 from #if above */ | |
103 | # define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags) utf8_to_uv((U8 *)(s), (curlen), (retlen), (flags)) | |
104 | # endif | |
105 | # endif | |
106 | ||
107 | # if { NEED utf8_to_uvchr_buf } | |
108 | ||
109 | UV | |
110 | utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen) | |
111 | { | |
112 | UV ret; | |
113 | STRLEN curlen; | |
114 | bool overflows = 0; | |
115 | const U8 *cur_s = s; | |
116 | const bool do_warnings = ckWARN_d(WARN_UTF8); | |
117 | # if { VERSION < 5.26.0 } && ! defined(EBCDIC) | |
118 | STRLEN overflow_length = 0; | |
119 | # endif | |
120 | ||
121 | if (send > s) { | |
122 | curlen = send - s; | |
123 | } | |
124 | else { | |
125 | assert(0); /* Modern perls die under this circumstance */ | |
126 | curlen = 0; | |
127 | if (! do_warnings) { /* Handle empty here if no warnings needed */ | |
128 | if (retlen) *retlen = 0; | |
129 | return UNICODE_REPLACEMENT; | |
130 | } | |
131 | } | |
132 | ||
133 | # if { VERSION < 5.26.0 } && ! defined(EBCDIC) | |
134 | ||
135 | /* Perl did not properly detect overflow for much of its history on | |
136 | * non-EBCDIC platforms, often returning an overlong value which may or may | |
137 | * not have been tolerated in the call. Also, earlier versions, when they | |
138 | * did detect overflow, may have disallowed it completely. Modern ones can | |
139 | * replace it with the REPLACEMENT CHARACTER, depending on calling | |
140 | * parameters. Therefore detect it ourselves in releases it was | |
141 | * problematic in. */ | |
142 | ||
143 | if (curlen > 0 && UNLIKELY(*s >= 0xFE)) { | |
144 | ||
145 | /* First, on a 32-bit machine the first byte being at least \xFE | |
146 | * automatically is overflow, as it indicates something requiring more | |
147 | * than 31 bits */ | |
148 | if (sizeof(ret) < 8) { | |
149 | overflows = 1; | |
150 | overflow_length = 7; | |
151 | } | |
152 | else { | |
153 | const U8 highest[] = /* 2*63-1 */ | |
154 | "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF"; | |
155 | const U8 *cur_h = highest; | |
156 | ||
157 | for (cur_s = s; cur_s < send; cur_s++, cur_h++) { | |
158 | if (UNLIKELY(*cur_s == *cur_h)) { | |
159 | continue; | |
160 | } | |
161 | ||
162 | /* If this byte is larger than the corresponding highest UTF-8 | |
163 | * byte, the sequence overflows; otherwise the byte is less | |
164 | * than (as we handled the equality case above), and so the | |
165 | * sequence doesn't overflow */ | |
166 | overflows = *cur_s > *cur_h; | |
167 | break; | |
168 | ||
169 | } | |
170 | ||
171 | /* Here, either we set the bool and broke out of the loop, or got | |
172 | * to the end and all bytes are the same which indicates it doesn't | |
173 | * overflow. If it did overflow, it would be this number of bytes | |
174 | * */ | |
175 | overflow_length = 13; | |
176 | } | |
177 | } | |
178 | ||
179 | if (UNLIKELY(overflows)) { | |
180 | ret = 0; | |
181 | ||
182 | if (! do_warnings && retlen) { | |
183 | *retlen = overflow_length; | |
184 | } | |
185 | } | |
186 | else | |
187 | ||
188 | # endif /* < 5.26 */ | |
189 | ||
190 | /* Here, we are either in a release that properly detects overflow, or | |
191 | * we have checked for overflow and the next statement is executing as | |
192 | * part of the above conditional where we know we don't have overflow. | |
193 | * | |
194 | * The modern versions allow anything that evaluates to a legal UV, but | |
195 | * not overlongs nor an empty input */ | |
196 | ret = D_PPP_utf8_to_uvchr_buf_callee( | |
197 | s, curlen, retlen, (UTF8_ALLOW_ANYUV | |
198 | & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY))); | |
199 | ||
200 | # if { VERSION >= 5.26.0 } && { VERSION < 5.28.0 } | |
201 | ||
202 | /* But actually, more modern versions restrict the UV to being no more than | |
203 | * what * an IV can hold, so it could, so it could still have gotten it | |
204 | * wrong about overflowing. */ | |
205 | if (UNLIKELY(ret > IV_MAX)) { | |
206 | overflows = 1; | |
207 | } | |
208 | ||
209 | # endif | |
210 | ||
211 | if (UNLIKELY(overflows)) { | |
212 | if (! do_warnings) { | |
213 | if (retlen) { | |
214 | *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s)); | |
215 | *retlen = D_PPP_MIN(*retlen, curlen); | |
216 | } | |
217 | return UNICODE_REPLACEMENT; | |
218 | } | |
219 | else { | |
220 | ||
221 | /* We use the error message in use from 5.8-5.26 */ | |
222 | Perl_warner(aTHX_ packWARN(WARN_UTF8), | |
223 | "Malformed UTF-8 character (overflow at 0x%" UVxf | |
224 | ", byte 0x%02x, after start byte 0x%02x)", | |
225 | ret, *cur_s, *s); | |
226 | if (retlen) { | |
227 | *retlen = (STRLEN) -1; | |
228 | } | |
229 | return 0; | |
230 | } | |
231 | } | |
232 | ||
233 | /* Here, did not overflow, but if it failed for some other reason, and | |
234 | * warnings are off, to emulate the behavior of the real utf8_to_uvchr(), | |
235 | * try again, allowing anything. (Note a return of 0 is ok if the input | |
236 | * was '\0') */ | |
237 | if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) { | |
238 | ||
239 | /* If curlen is 0, we already handled the case where warnings are | |
240 | * disabled, so this 'if' will be true, and so later on, we know that | |
241 | * 's' is dereferencible */ | |
242 | if (do_warnings) { | |
243 | *retlen = (STRLEN) -1; | |
244 | } | |
245 | else { | |
246 | ret = D_PPP_utf8_to_uvchr_buf_callee( | |
247 | s, curlen, retlen, UTF8_ALLOW_ANY); | |
248 | /* Override with the REPLACEMENT character, as that is what the | |
249 | * modern version of this function returns */ | |
250 | ret = UNICODE_REPLACEMENT; | |
251 | ||
252 | # if { VERSION < 5.16.0 } | |
253 | ||
254 | /* Versions earlier than this don't necessarily return the proper | |
255 | * length. It should not extend past the end of string, nor past | |
256 | * what the first byte indicates the length is, nor past the | |
257 | * continuation characters */ | |
258 | if (retlen && *retlen >= 0) { | |
259 | unsigned int i = 1; | |
260 | ||
261 | *retlen = D_PPP_MIN(*retlen, curlen); | |
262 | *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s)); | |
263 | do { | |
264 | if (s[i] < 0x80 || s[i] > 0xBF) { | |
265 | *retlen = i; | |
266 | break; | |
267 | } | |
268 | } while (++i < *retlen); | |
269 | } | |
270 | ||
271 | # endif | |
272 | ||
273 | } | |
274 | } | |
275 | ||
276 | return ret; | |
277 | } | |
278 | ||
279 | # endif | |
280 | #endif | |
281 | ||
282 | #if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf) | |
283 | #undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses | |
284 | to read past a NUL, making it much less likely to read | |
285 | off the end of the buffer. A NUL indicates the start | |
286 | of the next character anyway. If the input isn't | |
287 | NUL-terminated, the function remains unsafe, as it | |
288 | always has been. */ | |
289 | ||
290 | __UNDEFINED__ utf8_to_uvchr(s, lp) \ | |
291 | ((*(s) == '\0') \ | |
292 | ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */ \ | |
293 | : utf8_to_uvchr_buf(s, (s) + my_strnlen((char *) (s), UTF8SKIP(s)), (lp))) | |
294 | ||
295 | #endif | |
296 | ||
0570adb7 P |
297 | #ifdef SV_NOSTEAL |
298 | /* Older Perl versions have broken sv_len_utf8() when passed sv does not have SVf_UTF8 flag set */ | |
299 | /* Also note that SvGETMAGIC() may change presence of SVf_UTF8 flag */ | |
300 | # if { VERSION < 5.17.5 } | |
301 | # undef sv_len_utf8 | |
302 | # if defined(__GNUC__) && !defined(PERL_GCC_BRACE_GROUPS_FORBIDDEN) | |
303 | # define sv_len_utf8_nomg(sv) ({ SV *_sv2 = (sv); (SvUTF8(_sv2) ? Perl_sv_len_utf8(aTHX_ (!SvGMAGICAL(_sv2) ? _sv2 : sv_mortalcopy_flags(_sv2, SV_NOSTEAL))) : ({ STRLEN _len; SvPV_nomg(_sv2, _len); _len; })); }) | |
304 | # define sv_len_utf8(sv) ({ SV *_sv1 = (sv); SvGETMAGIC(_sv1); sv_len_utf8_nomg(_sv1); }) | |
305 | # else | |
306 | # define sv_len_utf8_nomg(sv) (PL_Sv = (sv), (SvUTF8(PL_Sv) ? Perl_sv_len_utf8(aTHX_ (!SvGMAGICAL(PL_Sv) ? PL_Sv : sv_mortalcopy_flags(PL_Sv, SV_NOSTEAL))) : (SvPV_nomg(PL_Sv, PL_na), PL_na))) | |
307 | # define sv_len_utf8(sv) (PL_Sv = (sv), SvGETMAGIC(PL_Sv), sv_len_utf8_nomg(PL_Sv)) | |
308 | # endif | |
309 | # endif | |
310 | # if defined(__GNUC__) && !defined(PERL_GCC_BRACE_GROUPS_FORBIDDEN) | |
311 | __UNDEFINED__ sv_len_utf8_nomg(sv) ({ SV *_sv = (sv); sv_len_utf8(!SvGMAGICAL(_sv) ? _sv : sv_mortalcopy_flags(_sv, SV_NOSTEAL)); }) | |
312 | # else | |
313 | __UNDEFINED__ sv_len_utf8_nomg(sv) ((PL_Sv = (sv)), sv_len_utf8(!SvGMAGICAL(PL_Sv) ? PL_Sv : sv_mortalcopy_flags(PL_Sv, SV_NOSTEAL))) | |
314 | # endif | |
315 | #endif | |
316 | ||
7899b636 KW |
317 | =xsinit |
318 | ||
319 | #define NEED_utf8_to_uvchr_buf | |
320 | ||
321 | =xsubs | |
322 | ||
323 | #if defined(UTF8_SAFE_SKIP) && defined(UTF8SKIP) | |
324 | ||
325 | STRLEN | |
326 | UTF8_SAFE_SKIP(s, adjustment) | |
327 | char * s | |
328 | int adjustment | |
329 | PREINIT: | |
330 | const char *const_s; | |
331 | CODE: | |
332 | const_s = s; | |
333 | /* Instead of passing in an 'e' ptr, use the real end, adjusted */ | |
334 | RETVAL = UTF8_SAFE_SKIP(const_s, s + UTF8SKIP(s) + adjustment); | |
335 | OUTPUT: | |
336 | RETVAL | |
337 | ||
338 | #endif | |
339 | ||
340 | #ifdef isUTF8_CHAR | |
341 | ||
342 | STRLEN | |
343 | isUTF8_CHAR(s, adjustment) | |
344 | unsigned char * s | |
345 | int adjustment | |
346 | PREINIT: | |
347 | const unsigned char *const_s; | |
348 | const unsigned char *const_e; | |
349 | CODE: | |
350 | const_s = s; | |
351 | /* Instead of passing in an 'e' ptr, use the real end, adjusted */ | |
352 | const_e = const_s + UTF8SKIP(const_s) + adjustment; | |
353 | RETVAL = isUTF8_CHAR(const_s, const_e); | |
354 | OUTPUT: | |
355 | RETVAL | |
356 | ||
357 | #endif | |
358 | ||
2ff9e5e8 KW |
359 | |
360 | #ifdef foldEQ_utf8 | |
361 | ||
362 | STRLEN | |
363 | foldEQ_utf8(s1, l1, u1, s2, l2, u2) | |
364 | char *s1 | |
365 | UV l1 | |
366 | bool u1 | |
367 | char *s2 | |
368 | UV l2 | |
369 | bool u2 | |
370 | PREINIT: | |
371 | const char *const_s1; | |
372 | const char *const_s2; | |
373 | CODE: | |
374 | const_s1 = s1; | |
375 | const_s2 = s2; | |
376 | RETVAL = foldEQ_utf8(const_s1, NULL, l1, u1, const_s2, NULL, l2, u2); | |
377 | OUTPUT: | |
378 | RETVAL | |
379 | ||
380 | #endif | |
381 | ||
7899b636 KW |
382 | #ifdef utf8_to_uvchr_buf |
383 | ||
384 | AV * | |
385 | utf8_to_uvchr_buf(s, adjustment) | |
386 | unsigned char *s | |
387 | int adjustment | |
388 | PREINIT: | |
389 | AV *av; | |
390 | STRLEN len; | |
391 | const unsigned char *const_s; | |
392 | CODE: | |
393 | av = newAV(); | |
394 | const_s = s; | |
395 | av_push(av, newSVuv(utf8_to_uvchr_buf(const_s, | |
396 | s + UTF8SKIP(s) + adjustment, | |
397 | &len))); | |
398 | if (len == (STRLEN) -1) { | |
399 | av_push(av, newSViv(-1)); | |
400 | } | |
401 | else { | |
402 | av_push(av, newSVuv(len)); | |
403 | } | |
404 | RETVAL = av; | |
405 | OUTPUT: | |
406 | RETVAL | |
407 | ||
408 | #endif | |
409 | ||
410 | #ifdef utf8_to_uvchr | |
411 | ||
412 | AV * | |
413 | utf8_to_uvchr(s) | |
414 | unsigned char *s | |
415 | PREINIT: | |
416 | AV *av; | |
417 | STRLEN len; | |
418 | const unsigned char *const_s; | |
419 | CODE: | |
420 | av = newAV(); | |
421 | const_s = s; | |
422 | av_push(av, newSVuv(utf8_to_uvchr(const_s, &len))); | |
423 | if (len == (STRLEN) -1) { | |
424 | av_push(av, newSViv(-1)); | |
425 | } | |
426 | else { | |
427 | av_push(av, newSVuv(len)); | |
428 | } | |
429 | RETVAL = av; | |
430 | OUTPUT: | |
431 | RETVAL | |
432 | ||
433 | #endif | |
434 | ||
0570adb7 P |
435 | #ifdef SV_NOSTEAL |
436 | ||
437 | STRLEN | |
438 | sv_len_utf8(sv) | |
439 | SV *sv | |
440 | CODE: | |
441 | RETVAL = sv_len_utf8(sv); | |
442 | OUTPUT: | |
443 | RETVAL | |
444 | ||
445 | STRLEN | |
446 | sv_len_utf8_nomg(sv) | |
447 | SV *sv | |
448 | CODE: | |
449 | RETVAL = sv_len_utf8_nomg(sv); | |
450 | OUTPUT: | |
451 | RETVAL | |
452 | ||
453 | #endif | |
454 | ||
455 | =tests plan => 81 | |
7899b636 KW |
456 | |
457 | BEGIN { require warnings if "$]" gt '5.006' } | |
458 | ||
459 | # skip tests on 5.6.0 and earlier | |
460 | if ("$]" le '5.006') { | |
0570adb7 | 461 | skip 'skip: broken utf8 support', 0 for 1..81; |
7899b636 KW |
462 | exit; |
463 | } | |
464 | ||
465 | ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1); | |
466 | ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0); | |
467 | ||
468 | ok(&Devel::PPPort::isUTF8_CHAR("A", -1), 0); | |
469 | ok(&Devel::PPPort::isUTF8_CHAR("A", 0), 1); | |
470 | ok(&Devel::PPPort::isUTF8_CHAR("\x{100}", -1), 0); | |
471 | ok(&Devel::PPPort::isUTF8_CHAR("\x{100}", 0), 2); | |
472 | ||
2ff9e5e8 KW |
473 | if ("$]" lt '5.008') { |
474 | ok(1, 1) for 1 ..3 | |
475 | } | |
476 | else { | |
477 | ok(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "a\x{101}", 3, 1), 1); | |
478 | ok(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "a\x{102}", 3, 1), 0); | |
479 | ok(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "b\x{101}", 3, 1), 0); | |
480 | } | |
481 | ||
7899b636 KW |
482 | my $ret = &Devel::PPPort::utf8_to_uvchr("A"); |
483 | ok($ret->[0], ord("A")); | |
484 | ok($ret->[1], 1); | |
485 | ||
486 | $ret = &Devel::PPPort::utf8_to_uvchr("\0"); | |
487 | ok($ret->[0], 0); | |
488 | ok($ret->[1], 1); | |
489 | ||
490 | $ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0); | |
491 | ok($ret->[0], ord("A")); | |
492 | ok($ret->[1], 1); | |
493 | ||
494 | $ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0); | |
495 | ok($ret->[0], 0); | |
496 | ok($ret->[1], 1); | |
497 | ||
498 | if (ord("A") != 65) { # tests not valid for EBCDIC | |
499 | ok(1, 1) for 1 .. (2 + 4 + (7 * 5)); | |
500 | } | |
501 | else { | |
502 | $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0); | |
503 | ok($ret->[0], 0x100); | |
504 | ok($ret->[1], 2); | |
505 | ||
506 | my @warnings; | |
507 | local $SIG{__WARN__} = sub { push @warnings, @_; }; | |
508 | ||
509 | { | |
510 | BEGIN { 'warnings'->import('utf8') if "$]" gt '5.006' } | |
511 | $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80"); | |
512 | ok($ret->[0], 0); | |
513 | ok($ret->[1], -1); | |
514 | ||
515 | BEGIN { 'warnings'->unimport() if "$]" gt '5.006' } | |
516 | $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80"); | |
517 | ok($ret->[0], 0xFFFD); | |
518 | ok($ret->[1], 1); | |
519 | } | |
520 | ||
521 | my @buf_tests = ( | |
522 | { | |
523 | input => "A", | |
524 | adjustment => -1, | |
525 | warning => qr/empty/, | |
526 | no_warnings_returned_length => 0, | |
527 | }, | |
528 | { | |
529 | input => "\xc4\xc5", | |
530 | adjustment => 0, | |
531 | warning => qr/non-continuation/, | |
532 | no_warnings_returned_length => 1, | |
533 | }, | |
534 | { | |
535 | input => "\xc4\x80", | |
536 | adjustment => -1, | |
537 | warning => qr/short|1 byte, need 2/, | |
538 | no_warnings_returned_length => 1, | |
539 | }, | |
540 | { | |
541 | input => "\xc0\x81", | |
542 | adjustment => 0, | |
543 | warning => qr/overlong|2 bytes, need 1/, | |
544 | no_warnings_returned_length => 2, | |
545 | }, | |
546 | { | |
547 | input => "\xe0\x80\x81", | |
548 | adjustment => 0, | |
549 | warning => qr/overlong|3 bytes, need 1/, | |
550 | no_warnings_returned_length => 3, | |
551 | }, | |
552 | { | |
553 | input => "\xf0\x80\x80\x81", | |
554 | adjustment => 0, | |
555 | warning => qr/overlong|4 bytes, need 1/, | |
556 | no_warnings_returned_length => 4, | |
557 | }, | |
558 | { # Old algorithm failed to detect this | |
559 | input => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf", | |
560 | adjustment => 0, | |
561 | warning => qr/overflow/, | |
562 | no_warnings_returned_length => 13, | |
563 | }, | |
564 | ); | |
565 | ||
566 | # An empty input is an assertion failure on debugging builds. It is | |
567 | # deliberately the first test. | |
568 | require Config; import Config; | |
569 | use vars '%Config'; | |
570 | if ($Config{ccflags} =~ /-DDEBUGGING/) { | |
571 | shift @buf_tests; | |
572 | ok(1, 1) for 1..5; | |
573 | } | |
574 | ||
575 | for my $test (@buf_tests) { | |
576 | my $input = $test->{'input'}; | |
577 | my $adjustment = $test->{'adjustment'}; | |
578 | my $display = 'utf8_to_uvchr_buf("'; | |
579 | for (my $i = 0; $i < length($input) + $adjustment; $i++) { | |
580 | $display .= sprintf "\\x%02x", ord substr($input, $i, 1); | |
581 | } | |
582 | ||
583 | $display .= '")'; | |
584 | my $warning = $test->{'warning'}; | |
585 | ||
586 | undef @warnings; | |
587 | BEGIN { 'warnings'->import('utf8') if "$]" gt '5.006' } | |
588 | $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment); | |
589 | ok($ret->[0], 0, "returned value $display; warnings enabled"); | |
590 | ok($ret->[1], -1, "returned length $display; warnings enabled"); | |
591 | my $all_warnings = join "; ", @warnings; | |
592 | my $contains = grep { $_ =~ $warning } $all_warnings; | |
593 | ok($contains, 1, $display | |
594 | . "; Got: '$all_warnings', which should contain '$warning'"); | |
595 | ||
596 | undef @warnings; | |
597 | BEGIN { 'warnings'->unimport('utf8') if "$]" gt '5.006' } | |
598 | $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment); | |
599 | ok($ret->[0], 0xFFFD, "returned value $display; warnings disabled"); | |
600 | ok($ret->[1], $test->{'no_warnings_returned_length'}, | |
601 | "returned length $display; warnings disabled"); | |
602 | } | |
603 | } | |
0570adb7 P |
604 | |
605 | if ("$]" ge '5.008') { | |
606 | BEGIN { if ("$]" ge '5.008') { require utf8; "utf8"->import() } } | |
607 | ||
608 | ok(Devel::PPPort::sv_len_utf8("aščť"), 4); | |
609 | ok(Devel::PPPort::sv_len_utf8_nomg("aščť"), 4); | |
610 | ||
611 | my $str = "áíé"; | |
612 | utf8::downgrade($str); | |
613 | ok(Devel::PPPort::sv_len_utf8($str), 3); | |
614 | utf8::downgrade($str); | |
615 | ok(Devel::PPPort::sv_len_utf8_nomg($str), 3); | |
616 | utf8::upgrade($str); | |
617 | ok(Devel::PPPort::sv_len_utf8($str), 3); | |
618 | utf8::upgrade($str); | |
619 | ok(Devel::PPPort::sv_len_utf8_nomg($str), 3); | |
620 | ||
621 | tie my $scalar, 'TieScalarCounter', "é"; | |
622 | ||
623 | ok(tied($scalar)->{fetch}, 0); | |
624 | ok(tied($scalar)->{store}, 0); | |
625 | ok(Devel::PPPort::sv_len_utf8($scalar), 2); | |
626 | ok(tied($scalar)->{fetch}, 1); | |
627 | ok(tied($scalar)->{store}, 0); | |
628 | ok(Devel::PPPort::sv_len_utf8($scalar), 3); | |
629 | ok(tied($scalar)->{fetch}, 2); | |
630 | ok(tied($scalar)->{store}, 0); | |
631 | ok(Devel::PPPort::sv_len_utf8($scalar), 4); | |
632 | ok(tied($scalar)->{fetch}, 3); | |
633 | ok(tied($scalar)->{store}, 0); | |
634 | ok(Devel::PPPort::sv_len_utf8_nomg($scalar), 4); | |
635 | ok(tied($scalar)->{fetch}, 3); | |
636 | ok(tied($scalar)->{store}, 0); | |
637 | ok(Devel::PPPort::sv_len_utf8_nomg($scalar), 4); | |
638 | ok(tied($scalar)->{fetch}, 3); | |
639 | ok(tied($scalar)->{store}, 0); | |
640 | } else { | |
8583f9b2 KW |
641 | for (1..23) { |
642 | skip 'skip: no SV_NOSTEAL support', 0; | |
643 | } | |
0570adb7 P |
644 | } |
645 | ||
646 | package TieScalarCounter; | |
647 | ||
648 | sub TIESCALAR { | |
649 | my ($class, $value) = @_; | |
650 | return bless { fetch => 0, store => 0, value => $value }, $class; | |
651 | } | |
652 | ||
653 | sub FETCH { | |
654 | BEGIN { if ("$]" ge '5.008') { require utf8; "utf8"->import() } } | |
655 | my ($self) = @_; | |
656 | $self->{fetch}++; | |
657 | return $self->{value} .= "é"; | |
658 | } | |
659 | ||
660 | sub STORE { | |
661 | my ($self, $value) = @_; | |
662 | $self->{store}++; | |
663 | $self->{value} = $value; | |
664 | } |