This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
parts/inc/utf8: Add hint for utf8_to_uvchr()
[perl5.git] / dist / Devel-PPPort / parts / inc / utf8
CommitLineData
7899b636
KW
1=provides
2
3__UNDEFINED__
4utf8_to_uvchr_buf
0570adb7
P
5sv_len_utf8
6sv_len_utf8_nomg
7899b636
KW
7
8=implementation
9
10#define D_PPP_MIN(a,b) (((a) <= (b)) ? (a) : (b))
11
12__UNDEFINED__ UNICODE_REPLACEMENT 0xFFFD
13
d4cea7f8 14#ifdef UTF8_MAXLEN
7899b636 15__UNDEFINED__ UTF8_MAXBYTES UTF8_MAXLEN
d4cea7f8 16#endif
7899b636
KW
17
18__UNDEFINED__ UTF8_ALLOW_ANYUV 0
19__UNDEFINED__ UTF8_ALLOW_EMPTY 0x0001
20__UNDEFINED__ UTF8_ALLOW_CONTINUATION 0x0002
21__UNDEFINED__ UTF8_ALLOW_NON_CONTINUATION 0x0004
22__UNDEFINED__ UTF8_ALLOW_SHORT 0x0008
23__UNDEFINED__ UTF8_ALLOW_LONG 0x0010
24__UNDEFINED__ UTF8_ALLOW_OVERFLOW 0x0080
25__UNDEFINED__ UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION \
26 |UTF8_ALLOW_NON_CONTINUATION \
27 |UTF8_ALLOW_SHORT \
28 |UTF8_ALLOW_LONG \
29 |UTF8_ALLOW_OVERFLOW)
30
31#if defined UTF8SKIP
32
33/* Don't use official version because it uses MIN, which may not be available */
34#undef UTF8_SAFE_SKIP
35
36__UNDEFINED__ UTF8_SAFE_SKIP(s, e) ( \
37 ((((e) - (s)) <= 0) \
38 ? 0 \
39 : D_PPP_MIN(((e) - (s)), UTF8SKIP(s))))
40#endif
41
d2b44e2b
KW
42#ifdef is_ascii_string
43__UNDEFINED__ is_invariant_string(s,l) is_ascii_string(s,l)
44__UNDEFINED__ is_utf8_invariant_string(s,l) is_ascii_string(s,l)
45
1242cf1e
KW
46/* Hint: is_ascii_string, is_invariant_string
47 is_utf8_invariant_string() does the same thing and is preferred because its
48 name is more accurate as to what it does */
d2b44e2b
KW
49#endif
50
2ff9e5e8
KW
51#ifdef ibcmp_utf8
52__UNDEFINED__ foldEQ_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2) \
53 cBOOL(! ibcmp_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2))
54#endif
55
7899b636
KW
56#if defined(is_utf8_string) && defined(UTF8SKIP)
57__UNDEFINED__ isUTF8_CHAR(s0, e) ( \
58 (e) <= (s0) || ! is_utf8_string(s0, D_PPP_MIN(UTF8SKIP(s0), (e) - (s0))) \
59 ? 0 \
60 : UTF8SKIP(s0))
61#endif
62
8fc7db65
KW
63#if 'A' == 65
64__UNDEFINED__ BOM_UTF8 "\xEF\xBB\xBF"
65__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xEF\xBF\xBD"
66#elif '^' == 95
67__UNDEFINED__ BOM_UTF8 "\xDD\x73\x66\x73"
68__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xDD\x73\x73\x71"
69#elif '^' == 176
70__UNDEFINED__ BOM_UTF8 "\xDD\x72\x65\x72"
71__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xDD\x72\x72\x70"
72#else
73# error Unknown character set
74#endif
75
96656f64 76#if { VERSION < 5.31.4 }
7899b636
KW
77 /* Versions prior to this accepted things that are now considered
78 * malformations, and didn't return -1 on error with warnings enabled
79 * */
80# undef utf8_to_uvchr_buf
81#endif
82
83/* This implementation brings modern, generally more restricted standards to
84 * utf8_to_uvchr_buf. Some of these are security related, and clearly must
85 * be done. But its arguable that the others need not, and hence should not.
86 * The reason they're here is that a module that intends to play with the
87 * latest perls should be able to work the same in all releases. An example is
88 * that perl no longer accepts any UV for a code point, but limits them to
89 * IV_MAX or below. This is for future internal use of the larger code points.
90 * If it turns out that some of these changes are breaking code that isn't
91 * intended to work with modern perls, the tighter restrictions could be
92 * relaxed. khw thinks this is unlikely, but has been wrong in the past. */
93
94/* 5.6.0 is the first release with UTF-8, and we don't implement this function
95 * there due to its likely lack of still being in use, and the underlying
96 * implementation is very different from later ones, without the later
97 * safeguards, so would require extra work to deal with */
98#if { VERSION >= 5.6.1 } && ! defined(utf8_to_uvchr_buf)
99 /* Choose which underlying implementation to use. At least one must be
100 * present or the perl is too early to handle this function */
08811920 101# if defined(utf8n_to_uvchr) || defined(utf8_to_uvchr) || defined(utf8_to_uv)
7899b636
KW
102# if defined(utf8n_to_uvchr) /* This is the preferred implementation */
103# define D_PPP_utf8_to_uvchr_buf_callee utf8n_to_uvchr
08811920
KW
104# elif /* Must be at least 5.6.1 from #if above; \
105 If have both regular and _simple, regular has all args */ \
106 defined(utf8_to_uv) && defined(utf8_to_uv_simple)
107# define D_PPP_utf8_to_uvchr_buf_callee utf8_to_uv
108# elif defined(utf8_to_uvchr) /* The below won't work well on error input */
109# define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags) \
110 utf8_to_uvchr((U8 *)(s), (retlen))
111# else
112# define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags) \
113 utf8_to_uv((U8 *)(s), (retlen))
7899b636
KW
114# endif
115# endif
116
117# if { NEED utf8_to_uvchr_buf }
118
119UV
120utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
121{
122 UV ret;
123 STRLEN curlen;
124 bool overflows = 0;
125 const U8 *cur_s = s;
126 const bool do_warnings = ckWARN_d(WARN_UTF8);
127# if { VERSION < 5.26.0 } && ! defined(EBCDIC)
128 STRLEN overflow_length = 0;
129# endif
130
131 if (send > s) {
132 curlen = send - s;
133 }
134 else {
135 assert(0); /* Modern perls die under this circumstance */
136 curlen = 0;
137 if (! do_warnings) { /* Handle empty here if no warnings needed */
138 if (retlen) *retlen = 0;
139 return UNICODE_REPLACEMENT;
140 }
141 }
142
143# if { VERSION < 5.26.0 } && ! defined(EBCDIC)
144
145 /* Perl did not properly detect overflow for much of its history on
146 * non-EBCDIC platforms, often returning an overlong value which may or may
147 * not have been tolerated in the call. Also, earlier versions, when they
148 * did detect overflow, may have disallowed it completely. Modern ones can
149 * replace it with the REPLACEMENT CHARACTER, depending on calling
150 * parameters. Therefore detect it ourselves in releases it was
151 * problematic in. */
152
153 if (curlen > 0 && UNLIKELY(*s >= 0xFE)) {
154
155 /* First, on a 32-bit machine the first byte being at least \xFE
156 * automatically is overflow, as it indicates something requiring more
157 * than 31 bits */
158 if (sizeof(ret) < 8) {
159 overflows = 1;
160 overflow_length = 7;
161 }
162 else {
163 const U8 highest[] = /* 2*63-1 */
164 "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF";
165 const U8 *cur_h = highest;
166
167 for (cur_s = s; cur_s < send; cur_s++, cur_h++) {
168 if (UNLIKELY(*cur_s == *cur_h)) {
169 continue;
170 }
171
172 /* If this byte is larger than the corresponding highest UTF-8
173 * byte, the sequence overflows; otherwise the byte is less
174 * than (as we handled the equality case above), and so the
175 * sequence doesn't overflow */
176 overflows = *cur_s > *cur_h;
177 break;
178
179 }
180
181 /* Here, either we set the bool and broke out of the loop, or got
182 * to the end and all bytes are the same which indicates it doesn't
183 * overflow. If it did overflow, it would be this number of bytes
184 * */
185 overflow_length = 13;
186 }
187 }
188
189 if (UNLIKELY(overflows)) {
190 ret = 0;
191
192 if (! do_warnings && retlen) {
193 *retlen = overflow_length;
194 }
195 }
196 else
197
198# endif /* < 5.26 */
199
200 /* Here, we are either in a release that properly detects overflow, or
201 * we have checked for overflow and the next statement is executing as
202 * part of the above conditional where we know we don't have overflow.
203 *
204 * The modern versions allow anything that evaluates to a legal UV, but
205 * not overlongs nor an empty input */
206 ret = D_PPP_utf8_to_uvchr_buf_callee(
207 s, curlen, retlen, (UTF8_ALLOW_ANYUV
208 & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY)));
209
210# if { VERSION >= 5.26.0 } && { VERSION < 5.28.0 }
211
212 /* But actually, more modern versions restrict the UV to being no more than
f807bbfa
KW
213 * what an IV can hold, so it could still have gotten it wrong about
214 * overflowing. */
7899b636
KW
215 if (UNLIKELY(ret > IV_MAX)) {
216 overflows = 1;
217 }
218
219# endif
220
221 if (UNLIKELY(overflows)) {
222 if (! do_warnings) {
223 if (retlen) {
224 *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
225 *retlen = D_PPP_MIN(*retlen, curlen);
226 }
227 return UNICODE_REPLACEMENT;
228 }
229 else {
230
231 /* We use the error message in use from 5.8-5.26 */
232 Perl_warner(aTHX_ packWARN(WARN_UTF8),
233 "Malformed UTF-8 character (overflow at 0x%" UVxf
234 ", byte 0x%02x, after start byte 0x%02x)",
235 ret, *cur_s, *s);
236 if (retlen) {
237 *retlen = (STRLEN) -1;
238 }
239 return 0;
240 }
241 }
242
243 /* Here, did not overflow, but if it failed for some other reason, and
244 * warnings are off, to emulate the behavior of the real utf8_to_uvchr(),
245 * try again, allowing anything. (Note a return of 0 is ok if the input
246 * was '\0') */
247 if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) {
248
249 /* If curlen is 0, we already handled the case where warnings are
250 * disabled, so this 'if' will be true, and so later on, we know that
251 * 's' is dereferencible */
252 if (do_warnings) {
253 *retlen = (STRLEN) -1;
254 }
255 else {
256 ret = D_PPP_utf8_to_uvchr_buf_callee(
257 s, curlen, retlen, UTF8_ALLOW_ANY);
258 /* Override with the REPLACEMENT character, as that is what the
259 * modern version of this function returns */
260 ret = UNICODE_REPLACEMENT;
261
262# if { VERSION < 5.16.0 }
263
264 /* Versions earlier than this don't necessarily return the proper
265 * length. It should not extend past the end of string, nor past
266 * what the first byte indicates the length is, nor past the
267 * continuation characters */
268 if (retlen && *retlen >= 0) {
269 unsigned int i = 1;
270
271 *retlen = D_PPP_MIN(*retlen, curlen);
272 *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
273 do {
274 if (s[i] < 0x80 || s[i] > 0xBF) {
275 *retlen = i;
276 break;
277 }
278 } while (++i < *retlen);
279 }
280
281# endif
282
283 }
284 }
285
286 return ret;
287}
288
289# endif
290#endif
291
292#if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf)
293#undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses
294 to read past a NUL, making it much less likely to read
295 off the end of the buffer. A NUL indicates the start
296 of the next character anyway. If the input isn't
297 NUL-terminated, the function remains unsafe, as it
298 always has been. */
299
300__UNDEFINED__ utf8_to_uvchr(s, lp) \
301 ((*(s) == '\0') \
302 ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */ \
303 : utf8_to_uvchr_buf(s, (s) + my_strnlen((char *) (s), UTF8SKIP(s)), (lp)))
304
305#endif
306
7252d154
KW
307/* Hint: utf8_to_uvchr
308 Use utf8_to_uvchr_buf() instead. But ONLY if you KNOW the upper bound
309 of the input string (not resorting to using UTF8SKIP, etc., to infer it).
310 The backported utf8_to_uvchr() will do a better job to prevent most cases
311 of trying to read beyond the end of the buffer */
312
313/* Replace utf8_to_uvchr with utf8_to_uvchr_buf */
314
0570adb7
P
315#ifdef SV_NOSTEAL
316 /* Older Perl versions have broken sv_len_utf8() when passed sv does not have SVf_UTF8 flag set */
317 /* Also note that SvGETMAGIC() may change presence of SVf_UTF8 flag */
318# if { VERSION < 5.17.5 }
319# undef sv_len_utf8
320# if defined(__GNUC__) && !defined(PERL_GCC_BRACE_GROUPS_FORBIDDEN)
321# define sv_len_utf8_nomg(sv) ({ SV *_sv2 = (sv); (SvUTF8(_sv2) ? Perl_sv_len_utf8(aTHX_ (!SvGMAGICAL(_sv2) ? _sv2 : sv_mortalcopy_flags(_sv2, SV_NOSTEAL))) : ({ STRLEN _len; SvPV_nomg(_sv2, _len); _len; })); })
322# define sv_len_utf8(sv) ({ SV *_sv1 = (sv); SvGETMAGIC(_sv1); sv_len_utf8_nomg(_sv1); })
323# else
324# define sv_len_utf8_nomg(sv) (PL_Sv = (sv), (SvUTF8(PL_Sv) ? Perl_sv_len_utf8(aTHX_ (!SvGMAGICAL(PL_Sv) ? PL_Sv : sv_mortalcopy_flags(PL_Sv, SV_NOSTEAL))) : (SvPV_nomg(PL_Sv, PL_na), PL_na)))
325# define sv_len_utf8(sv) (PL_Sv = (sv), SvGETMAGIC(PL_Sv), sv_len_utf8_nomg(PL_Sv))
326# endif
327# endif
328# if defined(__GNUC__) && !defined(PERL_GCC_BRACE_GROUPS_FORBIDDEN)
329 __UNDEFINED__ sv_len_utf8_nomg(sv) ({ SV *_sv = (sv); sv_len_utf8(!SvGMAGICAL(_sv) ? _sv : sv_mortalcopy_flags(_sv, SV_NOSTEAL)); })
330# else
331 __UNDEFINED__ sv_len_utf8_nomg(sv) ((PL_Sv = (sv)), sv_len_utf8(!SvGMAGICAL(PL_Sv) ? PL_Sv : sv_mortalcopy_flags(PL_Sv, SV_NOSTEAL)))
332# endif
333#endif
334
7899b636
KW
335=xsinit
336
337#define NEED_utf8_to_uvchr_buf
338
339=xsubs
340
341#if defined(UTF8_SAFE_SKIP) && defined(UTF8SKIP)
342
343STRLEN
344UTF8_SAFE_SKIP(s, adjustment)
345 char * s
346 int adjustment
347 PREINIT:
348 const char *const_s;
349 CODE:
350 const_s = s;
351 /* Instead of passing in an 'e' ptr, use the real end, adjusted */
352 RETVAL = UTF8_SAFE_SKIP(const_s, s + UTF8SKIP(s) + adjustment);
353 OUTPUT:
354 RETVAL
355
356#endif
357
358#ifdef isUTF8_CHAR
359
360STRLEN
361isUTF8_CHAR(s, adjustment)
362 unsigned char * s
363 int adjustment
364 PREINIT:
365 const unsigned char *const_s;
366 const unsigned char *const_e;
367 CODE:
368 const_s = s;
369 /* Instead of passing in an 'e' ptr, use the real end, adjusted */
370 const_e = const_s + UTF8SKIP(const_s) + adjustment;
371 RETVAL = isUTF8_CHAR(const_s, const_e);
372 OUTPUT:
373 RETVAL
374
375#endif
376
2ff9e5e8
KW
377
378#ifdef foldEQ_utf8
379
380STRLEN
381foldEQ_utf8(s1, l1, u1, s2, l2, u2)
382 char *s1
383 UV l1
384 bool u1
385 char *s2
386 UV l2
387 bool u2
388 PREINIT:
389 const char *const_s1;
390 const char *const_s2;
391 CODE:
392 const_s1 = s1;
393 const_s2 = s2;
394 RETVAL = foldEQ_utf8(const_s1, NULL, l1, u1, const_s2, NULL, l2, u2);
395 OUTPUT:
396 RETVAL
397
398#endif
399
7899b636
KW
400#ifdef utf8_to_uvchr_buf
401
402AV *
403utf8_to_uvchr_buf(s, adjustment)
404 unsigned char *s
405 int adjustment
406 PREINIT:
407 AV *av;
408 STRLEN len;
409 const unsigned char *const_s;
410 CODE:
411 av = newAV();
412 const_s = s;
413 av_push(av, newSVuv(utf8_to_uvchr_buf(const_s,
414 s + UTF8SKIP(s) + adjustment,
415 &len)));
416 if (len == (STRLEN) -1) {
417 av_push(av, newSViv(-1));
418 }
419 else {
420 av_push(av, newSVuv(len));
421 }
422 RETVAL = av;
423 OUTPUT:
424 RETVAL
425
426#endif
427
428#ifdef utf8_to_uvchr
429
430AV *
431utf8_to_uvchr(s)
432 unsigned char *s
433 PREINIT:
434 AV *av;
435 STRLEN len;
436 const unsigned char *const_s;
437 CODE:
438 av = newAV();
439 const_s = s;
440 av_push(av, newSVuv(utf8_to_uvchr(const_s, &len)));
441 if (len == (STRLEN) -1) {
442 av_push(av, newSViv(-1));
443 }
444 else {
445 av_push(av, newSVuv(len));
446 }
447 RETVAL = av;
448 OUTPUT:
449 RETVAL
450
451#endif
452
0570adb7
P
453#ifdef SV_NOSTEAL
454
455STRLEN
456sv_len_utf8(sv)
457 SV *sv
458 CODE:
459 RETVAL = sv_len_utf8(sv);
460 OUTPUT:
461 RETVAL
462
463STRLEN
464sv_len_utf8_nomg(sv)
465 SV *sv
466 CODE:
467 RETVAL = sv_len_utf8_nomg(sv);
468 OUTPUT:
469 RETVAL
470
471#endif
472
473=tests plan => 81
7899b636 474
c45043a0 475BEGIN { require warnings if "$]" > '5.006' }
7899b636 476
08811920
KW
477# skip tests on 5.6.0 and earlier, plus 7.0
478if ("$]" <= '5.006' || "$]" == '5.007' ) {
479 for (1..81) {
480 skip 'skip: broken utf8 support', 0;
481 }
7899b636
KW
482 exit;
483}
484
485ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1);
486ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0);
487
488ok(&Devel::PPPort::isUTF8_CHAR("A", -1), 0);
489ok(&Devel::PPPort::isUTF8_CHAR("A", 0), 1);
490ok(&Devel::PPPort::isUTF8_CHAR("\x{100}", -1), 0);
491ok(&Devel::PPPort::isUTF8_CHAR("\x{100}", 0), 2);
492
c45043a0 493if ("$]" < '5.008') {
66da169a
KW
494 for (1 ..3) {
495 ok(1, 1)
496 }
2ff9e5e8
KW
497}
498else {
499 ok(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "a\x{101}", 3, 1), 1);
500 ok(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "a\x{102}", 3, 1), 0);
501 ok(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "b\x{101}", 3, 1), 0);
502}
503
7899b636
KW
504my $ret = &Devel::PPPort::utf8_to_uvchr("A");
505ok($ret->[0], ord("A"));
506ok($ret->[1], 1);
507
508$ret = &Devel::PPPort::utf8_to_uvchr("\0");
509ok($ret->[0], 0);
510ok($ret->[1], 1);
511
512$ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0);
513ok($ret->[0], ord("A"));
514ok($ret->[1], 1);
515
516$ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0);
517ok($ret->[0], 0);
518ok($ret->[1], 1);
519
520if (ord("A") != 65) { # tests not valid for EBCDIC
66da169a
KW
521 for (1 .. (2 + 4 + (7 * 5))) {
522 ok(1, 1);
523 }
7899b636
KW
524}
525else {
526 $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0);
527 ok($ret->[0], 0x100);
528 ok($ret->[1], 2);
529
530 my @warnings;
531 local $SIG{__WARN__} = sub { push @warnings, @_; };
532
533 {
c45043a0 534 BEGIN { 'warnings'->import('utf8') if "$]" > '5.006' }
7899b636
KW
535 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
536 ok($ret->[0], 0);
537 ok($ret->[1], -1);
538
c45043a0 539 BEGIN { 'warnings'->unimport() if "$]" > '5.006' }
7899b636
KW
540 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
541 ok($ret->[0], 0xFFFD);
542 ok($ret->[1], 1);
543 }
544
545 my @buf_tests = (
546 {
547 input => "A",
548 adjustment => -1,
21e22f08 549 warning => eval "qr/empty/",
7899b636
KW
550 no_warnings_returned_length => 0,
551 },
552 {
553 input => "\xc4\xc5",
554 adjustment => 0,
21e22f08 555 warning => eval "qr/non-continuation/",
7899b636
KW
556 no_warnings_returned_length => 1,
557 },
558 {
559 input => "\xc4\x80",
560 adjustment => -1,
21e22f08 561 warning => eval "qr/short|1 byte, need 2/",
7899b636
KW
562 no_warnings_returned_length => 1,
563 },
564 {
565 input => "\xc0\x81",
566 adjustment => 0,
21e22f08 567 warning => eval "qr/overlong|2 bytes, need 1/",
7899b636
KW
568 no_warnings_returned_length => 2,
569 },
570 {
571 input => "\xe0\x80\x81",
572 adjustment => 0,
21e22f08 573 warning => eval "qr/overlong|3 bytes, need 1/",
7899b636
KW
574 no_warnings_returned_length => 3,
575 },
576 {
577 input => "\xf0\x80\x80\x81",
578 adjustment => 0,
21e22f08 579 warning => eval "qr/overlong|4 bytes, need 1/",
7899b636
KW
580 no_warnings_returned_length => 4,
581 },
582 { # Old algorithm failed to detect this
583 input => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
584 adjustment => 0,
21e22f08 585 warning => eval "qr/overflow/",
7899b636
KW
586 no_warnings_returned_length => 13,
587 },
588 );
589
590 # An empty input is an assertion failure on debugging builds. It is
591 # deliberately the first test.
592 require Config; import Config;
593 use vars '%Config';
594 if ($Config{ccflags} =~ /-DDEBUGGING/) {
595 shift @buf_tests;
66da169a
KW
596 for (1..5) {
597 ok(1, 1);
598 }
7899b636
KW
599 }
600
66da169a
KW
601 my $test;
602 for $test (@buf_tests) {
7899b636
KW
603 my $input = $test->{'input'};
604 my $adjustment = $test->{'adjustment'};
605 my $display = 'utf8_to_uvchr_buf("';
66da169a
KW
606 my $i;
607 for ($i = 0; $i < length($input) + $adjustment; $i++) {
7899b636
KW
608 $display .= sprintf "\\x%02x", ord substr($input, $i, 1);
609 }
610
611 $display .= '")';
612 my $warning = $test->{'warning'};
613
614 undef @warnings;
c45043a0 615 BEGIN { 'warnings'->import('utf8') if "$]" > '5.006' }
7899b636
KW
616 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
617 ok($ret->[0], 0, "returned value $display; warnings enabled");
618 ok($ret->[1], -1, "returned length $display; warnings enabled");
619 my $all_warnings = join "; ", @warnings;
620 my $contains = grep { $_ =~ $warning } $all_warnings;
621 ok($contains, 1, $display
622 . "; Got: '$all_warnings', which should contain '$warning'");
623
624 undef @warnings;
c45043a0 625 BEGIN { 'warnings'->unimport('utf8') if "$]" > '5.006' }
7899b636
KW
626 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
627 ok($ret->[0], 0xFFFD, "returned value $display; warnings disabled");
628 ok($ret->[1], $test->{'no_warnings_returned_length'},
629 "returned length $display; warnings disabled");
630 }
631}
0570adb7
P
632
633if ("$]" ge '5.008') {
634 BEGIN { if ("$]" ge '5.008') { require utf8; "utf8"->import() } }
635
636 ok(Devel::PPPort::sv_len_utf8("aščť"), 4);
637 ok(Devel::PPPort::sv_len_utf8_nomg("aščť"), 4);
638
639 my $str = "áíé";
640 utf8::downgrade($str);
641 ok(Devel::PPPort::sv_len_utf8($str), 3);
642 utf8::downgrade($str);
643 ok(Devel::PPPort::sv_len_utf8_nomg($str), 3);
644 utf8::upgrade($str);
645 ok(Devel::PPPort::sv_len_utf8($str), 3);
646 utf8::upgrade($str);
647 ok(Devel::PPPort::sv_len_utf8_nomg($str), 3);
648
649 tie my $scalar, 'TieScalarCounter', "é";
650
651 ok(tied($scalar)->{fetch}, 0);
652 ok(tied($scalar)->{store}, 0);
653 ok(Devel::PPPort::sv_len_utf8($scalar), 2);
654 ok(tied($scalar)->{fetch}, 1);
655 ok(tied($scalar)->{store}, 0);
656 ok(Devel::PPPort::sv_len_utf8($scalar), 3);
657 ok(tied($scalar)->{fetch}, 2);
658 ok(tied($scalar)->{store}, 0);
659 ok(Devel::PPPort::sv_len_utf8($scalar), 4);
660 ok(tied($scalar)->{fetch}, 3);
661 ok(tied($scalar)->{store}, 0);
662 ok(Devel::PPPort::sv_len_utf8_nomg($scalar), 4);
663 ok(tied($scalar)->{fetch}, 3);
664 ok(tied($scalar)->{store}, 0);
665 ok(Devel::PPPort::sv_len_utf8_nomg($scalar), 4);
666 ok(tied($scalar)->{fetch}, 3);
667 ok(tied($scalar)->{store}, 0);
668} else {
8583f9b2
KW
669 for (1..23) {
670 skip 'skip: no SV_NOSTEAL support', 0;
671 }
0570adb7
P
672}
673
674package TieScalarCounter;
675
676sub TIESCALAR {
677 my ($class, $value) = @_;
678 return bless { fetch => 0, store => 0, value => $value }, $class;
679}
680
681sub FETCH {
682 BEGIN { if ("$]" ge '5.008') { require utf8; "utf8"->import() } }
683 my ($self) = @_;
684 $self->{fetch}++;
685 return $self->{value} .= "é";
686}
687
688sub STORE {
689 my ($self, $value) = @_;
690 $self->{store}++;
691 $self->{value} = $value;
692}