This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
isPSXSPC() is a synonym for isSPACE
[perl5.git] / dist / Devel-PPPort / parts / inc / utf8
CommitLineData
7899b636
KW
1=provides
2
3__UNDEFINED__
4utf8_to_uvchr_buf
0570adb7
P
5sv_len_utf8
6sv_len_utf8_nomg
7899b636
KW
7
8=implementation
9
10#define D_PPP_MIN(a,b) (((a) <= (b)) ? (a) : (b))
11
12__UNDEFINED__ UNICODE_REPLACEMENT 0xFFFD
13
d4cea7f8 14#ifdef UTF8_MAXLEN
7899b636 15__UNDEFINED__ UTF8_MAXBYTES UTF8_MAXLEN
d4cea7f8 16#endif
7899b636 17
a807d978
KW
18__UNDEF_NOT_PROVIDED__ UTF_START_MARK(len) \
19 (((len) > 7) ? 0xFF : (0xFF & (0xFE << (7-(len)))))
20
21#if 'A' == 65
22__UNDEF_NOT_PROVIDED__ UTF_ACCUMULATION_SHIFT 6
23#else
24__UNDEF_NOT_PROVIDED__ UTF_ACCUMULATION_SHIFT 5
25#endif
26
27#ifdef NATIVE_TO_UTF
28__UNDEF_NOT_PROVIDED__ NATIVE_UTF8_TO_I8(c) NATIVE_TO_UTF(c)
29#else /* System doesn't support EBCDIC */
30__UNDEF_NOT_PROVIDED__ NATIVE_UTF8_TO_I8(c) (c)
31#endif
32
33#ifdef UTF_TO_NATIVE
34__UNDEF_NOT_PROVIDED__ I8_TO_NATIVE_UTF8(c) UTF_TO_NATIVE(c)
35#else /* System doesn't support EBCDIC */
36__UNDEF_NOT_PROVIDED__ I8_TO_NATIVE_UTF8(c) (c)
37#endif
38
39__UNDEF_NOT_PROVIDED__ UTF_START_MASK(len) \
40 (((len) >= 7) ? 0x00 : (0x1F >> ((len)-2)))
41__UNDEF_NOT_PROVIDED__ UTF_IS_CONTINUATION_MASK \
42 ((U8) (0xFF << UTF_ACCUMULATION_SHIFT))
43__UNDEF_NOT_PROVIDED__ UTF_CONTINUATION_MARK \
44 (UTF_IS_CONTINUATION_MASK & 0xB0)
45__UNDEF_NOT_PROVIDED__ UTF_MIN_START_BYTE \
46 ((UTF_CONTINUATION_MARK >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
47
48__UNDEF_NOT_PROVIDED__ UTF_MIN_ABOVE_LATIN1_BYTE \
49 ((0x100 >> UTF_ACCUMULATION_SHIFT) | UTF_START_MARK(2))
50
51#if { VERSION < 5.007 } /* Was the complement of what should have been */
52# undef UTF8_IS_DOWNGRADEABLE_START
53#endif
54__UNDEF_NOT_PROVIDED__ UTF8_IS_DOWNGRADEABLE_START(c) \
55 inRANGE(NATIVE_UTF8_TO_I8(c), \
56 UTF_MIN_START_BYTE, UTF_MIN_ABOVE_LATIN1_BYTE - 1)
57__UNDEF_NOT_PROVIDED__ UTF_CONTINUATION_MASK \
58 ((U8) ((1U << UTF_ACCUMULATION_SHIFT) - 1))
59
60__UNDEF_NOT_PROVIDED__ UTF8_ACCUMULATE(base, added) \
61 (((base) << UTF_ACCUMULATION_SHIFT) \
62 | ((NATIVE_UTF8_TO_I8(added)) \
63 & UTF_CONTINUATION_MASK))
64
971bc248
KW
65__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_ANYUV 0
66__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_EMPTY 0x0001
67__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_CONTINUATION 0x0002
68__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_NON_CONTINUATION 0x0004
69__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_SHORT 0x0008
70__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_LONG 0x0010
71__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_OVERFLOW 0x0080
72__UNDEF_NOT_PROVIDED__ UTF8_ALLOW_ANY ( UTF8_ALLOW_CONTINUATION \
73 |UTF8_ALLOW_NON_CONTINUATION \
74 |UTF8_ALLOW_SHORT \
75 |UTF8_ALLOW_LONG \
76 |UTF8_ALLOW_OVERFLOW)
7899b636
KW
77
78#if defined UTF8SKIP
79
80/* Don't use official version because it uses MIN, which may not be available */
81#undef UTF8_SAFE_SKIP
82
83__UNDEFINED__ UTF8_SAFE_SKIP(s, e) ( \
84 ((((e) - (s)) <= 0) \
85 ? 0 \
86 : D_PPP_MIN(((e) - (s)), UTF8SKIP(s))))
be9a67f5 87
7f7c413a
KW
88__UNDEFINED__ UTF8_CHK_SKIP(s) \
89 (s[0] == '\0' ? 1 : ((U8) D_PPP_MIN(my_strnlen((char *) (s), UTF8SKIP(s)), \
90 UTF8SKIP(s))))
be9a67f5 91__UNDEFINED__ UTF8_SKIP(s) UTF8SKIP(s)
7899b636
KW
92#endif
93
e2da5d9e
KW
94#if 'A' == 65
95__UNDEFINED__ UTF8_IS_INVARIANT(c) isASCII(c)
96#else
97__UNDEFINED__ UTF8_IS_INVARIANT(c) (isASCII(c) || isCNTRL_L1(c))
98#endif
99
100__UNDEFINED__ UVCHR_IS_INVARIANT(c) UTF8_IS_INVARIANT(c)
101
08d12710
KW
102#ifdef UVCHR_IS_INVARIANT
103# if 'A' == 65
104# define D_PPP_BYTE_INFO_BITS 6 /* 6 bits meaningful in continuation bytes */
105# ifdef QUADKIND
106# define D_PPP_UVCHR_SKIP_UPPER(c) \
107 (WIDEST_UTYPE) (c) < \
108 (((WIDEST_UTYPE) 1) << (6 * D_PPP_BYTE_INFO_BITS)) ? 7 : 13
109# else
110# define D_PPP_UVCHR_SKIP_UPPER(c) 7 /* 32 bit platform */
111# endif
112# else
113# define D_PPP_BYTE_INFO_BITS 5 /* EBCDIC has only 5 meaningful bits */
114
115 /* In the releases this is backported to, UTF-EBCDIC had a max of 2**31-1 */
116# define D_PPP_UVCHR_SKIP_UPPER(c) 7
117# endif
118
119__UNDEFINED__ UVCHR_SKIP(c) \
120 UVCHR_IS_INVARIANT(c) ? 1 : \
121 (WIDEST_UTYPE) (c) < (32 * (1U << ( D_PPP_BYTE_INFO_BITS))) ? 2 : \
122 (WIDEST_UTYPE) (c) < (16 * (1U << (2 * D_PPP_BYTE_INFO_BITS))) ? 3 : \
123 (WIDEST_UTYPE) (c) < ( 8 * (1U << (3 * D_PPP_BYTE_INFO_BITS))) ? 4 : \
124 (WIDEST_UTYPE) (c) < ( 4 * (1U << (4 * D_PPP_BYTE_INFO_BITS))) ? 5 : \
125 (WIDEST_UTYPE) (c) < ( 2 * (1U << (5 * D_PPP_BYTE_INFO_BITS))) ? 6 : \
126 D_PPP_UVCHR_SKIP_UPPER(c)
127#endif
128
d2b44e2b
KW
129#ifdef is_ascii_string
130__UNDEFINED__ is_invariant_string(s,l) is_ascii_string(s,l)
131__UNDEFINED__ is_utf8_invariant_string(s,l) is_ascii_string(s,l)
132
1242cf1e
KW
133/* Hint: is_ascii_string, is_invariant_string
134 is_utf8_invariant_string() does the same thing and is preferred because its
135 name is more accurate as to what it does */
d2b44e2b
KW
136#endif
137
2ff9e5e8
KW
138#ifdef ibcmp_utf8
139__UNDEFINED__ foldEQ_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2) \
140 cBOOL(! ibcmp_utf8(s1,pe1,l1,u1,s2,pe2,l2,u2))
141#endif
142
7899b636
KW
143#if defined(is_utf8_string) && defined(UTF8SKIP)
144__UNDEFINED__ isUTF8_CHAR(s0, e) ( \
145 (e) <= (s0) || ! is_utf8_string(s0, D_PPP_MIN(UTF8SKIP(s0), (e) - (s0))) \
146 ? 0 \
147 : UTF8SKIP(s0))
148#endif
149
8fc7db65
KW
150#if 'A' == 65
151__UNDEFINED__ BOM_UTF8 "\xEF\xBB\xBF"
152__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xEF\xBF\xBD"
153#elif '^' == 95
154__UNDEFINED__ BOM_UTF8 "\xDD\x73\x66\x73"
155__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xDD\x73\x73\x71"
156#elif '^' == 176
157__UNDEFINED__ BOM_UTF8 "\xDD\x72\x65\x72"
158__UNDEFINED__ REPLACEMENT_CHARACTER_UTF8 "\xDD\x72\x72\x70"
159#else
160# error Unknown character set
161#endif
162
96656f64 163#if { VERSION < 5.31.4 }
7899b636
KW
164 /* Versions prior to this accepted things that are now considered
165 * malformations, and didn't return -1 on error with warnings enabled
166 * */
167# undef utf8_to_uvchr_buf
168#endif
169
170/* This implementation brings modern, generally more restricted standards to
171 * utf8_to_uvchr_buf. Some of these are security related, and clearly must
172 * be done. But its arguable that the others need not, and hence should not.
173 * The reason they're here is that a module that intends to play with the
174 * latest perls should be able to work the same in all releases. An example is
175 * that perl no longer accepts any UV for a code point, but limits them to
176 * IV_MAX or below. This is for future internal use of the larger code points.
177 * If it turns out that some of these changes are breaking code that isn't
178 * intended to work with modern perls, the tighter restrictions could be
179 * relaxed. khw thinks this is unlikely, but has been wrong in the past. */
180
181/* 5.6.0 is the first release with UTF-8, and we don't implement this function
182 * there due to its likely lack of still being in use, and the underlying
183 * implementation is very different from later ones, without the later
184 * safeguards, so would require extra work to deal with */
185#if { VERSION >= 5.6.1 } && ! defined(utf8_to_uvchr_buf)
186 /* Choose which underlying implementation to use. At least one must be
187 * present or the perl is too early to handle this function */
08811920 188# if defined(utf8n_to_uvchr) || defined(utf8_to_uvchr) || defined(utf8_to_uv)
7899b636
KW
189# if defined(utf8n_to_uvchr) /* This is the preferred implementation */
190# define D_PPP_utf8_to_uvchr_buf_callee utf8n_to_uvchr
08811920
KW
191# elif /* Must be at least 5.6.1 from #if above; \
192 If have both regular and _simple, regular has all args */ \
193 defined(utf8_to_uv) && defined(utf8_to_uv_simple)
194# define D_PPP_utf8_to_uvchr_buf_callee utf8_to_uv
195# elif defined(utf8_to_uvchr) /* The below won't work well on error input */
196# define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags) \
197 utf8_to_uvchr((U8 *)(s), (retlen))
198# else
199# define D_PPP_utf8_to_uvchr_buf_callee(s, curlen, retlen, flags) \
200 utf8_to_uv((U8 *)(s), (retlen))
7899b636
KW
201# endif
202# endif
203
204# if { NEED utf8_to_uvchr_buf }
205
206UV
207utf8_to_uvchr_buf(pTHX_ const U8 *s, const U8 *send, STRLEN *retlen)
208{
209 UV ret;
210 STRLEN curlen;
211 bool overflows = 0;
212 const U8 *cur_s = s;
213 const bool do_warnings = ckWARN_d(WARN_UTF8);
214# if { VERSION < 5.26.0 } && ! defined(EBCDIC)
215 STRLEN overflow_length = 0;
216# endif
217
218 if (send > s) {
219 curlen = send - s;
220 }
221 else {
222 assert(0); /* Modern perls die under this circumstance */
223 curlen = 0;
224 if (! do_warnings) { /* Handle empty here if no warnings needed */
225 if (retlen) *retlen = 0;
226 return UNICODE_REPLACEMENT;
227 }
228 }
229
230# if { VERSION < 5.26.0 } && ! defined(EBCDIC)
231
232 /* Perl did not properly detect overflow for much of its history on
233 * non-EBCDIC platforms, often returning an overlong value which may or may
234 * not have been tolerated in the call. Also, earlier versions, when they
235 * did detect overflow, may have disallowed it completely. Modern ones can
236 * replace it with the REPLACEMENT CHARACTER, depending on calling
237 * parameters. Therefore detect it ourselves in releases it was
238 * problematic in. */
239
240 if (curlen > 0 && UNLIKELY(*s >= 0xFE)) {
241
242 /* First, on a 32-bit machine the first byte being at least \xFE
243 * automatically is overflow, as it indicates something requiring more
244 * than 31 bits */
245 if (sizeof(ret) < 8) {
246 overflows = 1;
247 overflow_length = 7;
248 }
249 else {
250 const U8 highest[] = /* 2*63-1 */
251 "\xFF\x80\x87\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF\xBF";
252 const U8 *cur_h = highest;
253
254 for (cur_s = s; cur_s < send; cur_s++, cur_h++) {
255 if (UNLIKELY(*cur_s == *cur_h)) {
256 continue;
257 }
258
259 /* If this byte is larger than the corresponding highest UTF-8
260 * byte, the sequence overflows; otherwise the byte is less
261 * than (as we handled the equality case above), and so the
262 * sequence doesn't overflow */
263 overflows = *cur_s > *cur_h;
264 break;
265
266 }
267
268 /* Here, either we set the bool and broke out of the loop, or got
269 * to the end and all bytes are the same which indicates it doesn't
270 * overflow. If it did overflow, it would be this number of bytes
271 * */
272 overflow_length = 13;
273 }
274 }
275
276 if (UNLIKELY(overflows)) {
277 ret = 0;
278
279 if (! do_warnings && retlen) {
280 *retlen = overflow_length;
281 }
282 }
283 else
284
285# endif /* < 5.26 */
286
287 /* Here, we are either in a release that properly detects overflow, or
288 * we have checked for overflow and the next statement is executing as
289 * part of the above conditional where we know we don't have overflow.
290 *
291 * The modern versions allow anything that evaluates to a legal UV, but
292 * not overlongs nor an empty input */
293 ret = D_PPP_utf8_to_uvchr_buf_callee(
294 s, curlen, retlen, (UTF8_ALLOW_ANYUV
295 & ~(UTF8_ALLOW_LONG|UTF8_ALLOW_EMPTY)));
296
297# if { VERSION >= 5.26.0 } && { VERSION < 5.28.0 }
298
299 /* But actually, more modern versions restrict the UV to being no more than
f807bbfa
KW
300 * what an IV can hold, so it could still have gotten it wrong about
301 * overflowing. */
7899b636
KW
302 if (UNLIKELY(ret > IV_MAX)) {
303 overflows = 1;
304 }
305
306# endif
307
308 if (UNLIKELY(overflows)) {
309 if (! do_warnings) {
310 if (retlen) {
311 *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
312 *retlen = D_PPP_MIN(*retlen, curlen);
313 }
314 return UNICODE_REPLACEMENT;
315 }
316 else {
317
318 /* We use the error message in use from 5.8-5.26 */
319 Perl_warner(aTHX_ packWARN(WARN_UTF8),
320 "Malformed UTF-8 character (overflow at 0x%" UVxf
321 ", byte 0x%02x, after start byte 0x%02x)",
322 ret, *cur_s, *s);
323 if (retlen) {
324 *retlen = (STRLEN) -1;
325 }
326 return 0;
327 }
328 }
329
330 /* Here, did not overflow, but if it failed for some other reason, and
331 * warnings are off, to emulate the behavior of the real utf8_to_uvchr(),
332 * try again, allowing anything. (Note a return of 0 is ok if the input
333 * was '\0') */
334 if (UNLIKELY(ret == 0 && (curlen == 0 || *s != '\0'))) {
335
336 /* If curlen is 0, we already handled the case where warnings are
337 * disabled, so this 'if' will be true, and so later on, we know that
338 * 's' is dereferencible */
339 if (do_warnings) {
340 *retlen = (STRLEN) -1;
341 }
342 else {
343 ret = D_PPP_utf8_to_uvchr_buf_callee(
344 s, curlen, retlen, UTF8_ALLOW_ANY);
345 /* Override with the REPLACEMENT character, as that is what the
346 * modern version of this function returns */
347 ret = UNICODE_REPLACEMENT;
348
349# if { VERSION < 5.16.0 }
350
351 /* Versions earlier than this don't necessarily return the proper
352 * length. It should not extend past the end of string, nor past
353 * what the first byte indicates the length is, nor past the
354 * continuation characters */
355 if (retlen && *retlen >= 0) {
356 unsigned int i = 1;
357
358 *retlen = D_PPP_MIN(*retlen, curlen);
359 *retlen = D_PPP_MIN(*retlen, UTF8SKIP(s));
360 do {
d747fab5
KW
361# ifdef UTF8_IS_CONTINUATION
362 if (! UTF8_IS_CONTINUATION(s[i]))
363# else /* Versions without the above don't support EBCDIC anyway */
364 if (s[i] < 0x80 || s[i] > 0xBF)
365# endif
366 {
7899b636
KW
367 *retlen = i;
368 break;
369 }
370 } while (++i < *retlen);
371 }
372
373# endif
374
375 }
376 }
377
378 return ret;
379}
380
381# endif
382#endif
383
384#if defined(UTF8SKIP) && defined(utf8_to_uvchr_buf)
385#undef utf8_to_uvchr /* Always redefine this unsafe function so that it refuses
386 to read past a NUL, making it much less likely to read
387 off the end of the buffer. A NUL indicates the start
388 of the next character anyway. If the input isn't
389 NUL-terminated, the function remains unsafe, as it
390 always has been. */
391
392__UNDEFINED__ utf8_to_uvchr(s, lp) \
393 ((*(s) == '\0') \
394 ? utf8_to_uvchr_buf(s,((s)+1), lp) /* Handle single NUL specially */ \
7f7c413a 395 : utf8_to_uvchr_buf(s, (s) + UTF8_CHK_SKIP(s), (lp)))
7899b636
KW
396
397#endif
398
7252d154
KW
399/* Hint: utf8_to_uvchr
400 Use utf8_to_uvchr_buf() instead. But ONLY if you KNOW the upper bound
401 of the input string (not resorting to using UTF8SKIP, etc., to infer it).
402 The backported utf8_to_uvchr() will do a better job to prevent most cases
403 of trying to read beyond the end of the buffer */
404
405/* Replace utf8_to_uvchr with utf8_to_uvchr_buf */
406
0570adb7
P
407#ifdef SV_NOSTEAL
408 /* Older Perl versions have broken sv_len_utf8() when passed sv does not have SVf_UTF8 flag set */
409 /* Also note that SvGETMAGIC() may change presence of SVf_UTF8 flag */
410# if { VERSION < 5.17.5 }
411# undef sv_len_utf8
412# if defined(__GNUC__) && !defined(PERL_GCC_BRACE_GROUPS_FORBIDDEN)
413# define sv_len_utf8_nomg(sv) ({ SV *_sv2 = (sv); (SvUTF8(_sv2) ? Perl_sv_len_utf8(aTHX_ (!SvGMAGICAL(_sv2) ? _sv2 : sv_mortalcopy_flags(_sv2, SV_NOSTEAL))) : ({ STRLEN _len; SvPV_nomg(_sv2, _len); _len; })); })
414# define sv_len_utf8(sv) ({ SV *_sv1 = (sv); SvGETMAGIC(_sv1); sv_len_utf8_nomg(_sv1); })
415# else
416# define sv_len_utf8_nomg(sv) (PL_Sv = (sv), (SvUTF8(PL_Sv) ? Perl_sv_len_utf8(aTHX_ (!SvGMAGICAL(PL_Sv) ? PL_Sv : sv_mortalcopy_flags(PL_Sv, SV_NOSTEAL))) : (SvPV_nomg(PL_Sv, PL_na), PL_na)))
417# define sv_len_utf8(sv) (PL_Sv = (sv), SvGETMAGIC(PL_Sv), sv_len_utf8_nomg(PL_Sv))
418# endif
419# endif
420# if defined(__GNUC__) && !defined(PERL_GCC_BRACE_GROUPS_FORBIDDEN)
421 __UNDEFINED__ sv_len_utf8_nomg(sv) ({ SV *_sv = (sv); sv_len_utf8(!SvGMAGICAL(_sv) ? _sv : sv_mortalcopy_flags(_sv, SV_NOSTEAL)); })
422# else
423 __UNDEFINED__ sv_len_utf8_nomg(sv) ((PL_Sv = (sv)), sv_len_utf8(!SvGMAGICAL(PL_Sv) ? PL_Sv : sv_mortalcopy_flags(PL_Sv, SV_NOSTEAL)))
424# endif
425#endif
426
7899b636
KW
427=xsinit
428
429#define NEED_utf8_to_uvchr_buf
430
431=xsubs
432
a807d978
KW
433#if { VERSION >= 5.006 } /* This is just a helper fcn, not publicized */ \
434 /* as being available and params not what the */ \
435 /* API function has; works on EBCDIC too */
436
437SV *
438uvoffuni_to_utf8(uni)
439
440 UV uni
441 PREINIT:
442 int len;
443 U8 string[UTF8_MAXBYTES+1];
444 int i;
445 UV native;
446 CODE:
447 native = UNI_TO_NATIVE(uni);
448
449 len = UVCHR_SKIP(native);
450
451 for (i = 0; i < len; i++) {
452 string[i] = '\0';
453 }
454
455 if (len <= 1) {
456 string[0] = native;
457 }
458 else {
459 i = len;
460 while (i-- > 1) {
461 string[i] = I8_TO_NATIVE_UTF8((uni & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
462 uni >>= UTF_ACCUMULATION_SHIFT;
463 }
464 string[0] = I8_TO_NATIVE_UTF8((uni & UTF_START_MASK(len)) | UTF_START_MARK(len));
465 }
466
467 RETVAL = newSVpvn((char *) string, len);
468 SvUTF8_on(RETVAL);
469 OUTPUT:
470 RETVAL
471
472#endif
7899b636
KW
473#if defined(UTF8_SAFE_SKIP) && defined(UTF8SKIP)
474
475STRLEN
476UTF8_SAFE_SKIP(s, adjustment)
477 char * s
478 int adjustment
479 PREINIT:
480 const char *const_s;
481 CODE:
482 const_s = s;
483 /* Instead of passing in an 'e' ptr, use the real end, adjusted */
484 RETVAL = UTF8_SAFE_SKIP(const_s, s + UTF8SKIP(s) + adjustment);
485 OUTPUT:
486 RETVAL
487
488#endif
489
490#ifdef isUTF8_CHAR
491
492STRLEN
493isUTF8_CHAR(s, adjustment)
494 unsigned char * s
495 int adjustment
496 PREINIT:
497 const unsigned char *const_s;
498 const unsigned char *const_e;
499 CODE:
500 const_s = s;
501 /* Instead of passing in an 'e' ptr, use the real end, adjusted */
502 const_e = const_s + UTF8SKIP(const_s) + adjustment;
503 RETVAL = isUTF8_CHAR(const_s, const_e);
504 OUTPUT:
505 RETVAL
506
507#endif
508
2ff9e5e8
KW
509
510#ifdef foldEQ_utf8
511
512STRLEN
513foldEQ_utf8(s1, l1, u1, s2, l2, u2)
514 char *s1
515 UV l1
516 bool u1
517 char *s2
518 UV l2
519 bool u2
520 PREINIT:
521 const char *const_s1;
522 const char *const_s2;
523 CODE:
524 const_s1 = s1;
525 const_s2 = s2;
526 RETVAL = foldEQ_utf8(const_s1, NULL, l1, u1, const_s2, NULL, l2, u2);
527 OUTPUT:
528 RETVAL
529
530#endif
531
7899b636
KW
532#ifdef utf8_to_uvchr_buf
533
534AV *
535utf8_to_uvchr_buf(s, adjustment)
536 unsigned char *s
537 int adjustment
538 PREINIT:
539 AV *av;
540 STRLEN len;
541 const unsigned char *const_s;
542 CODE:
543 av = newAV();
544 const_s = s;
545 av_push(av, newSVuv(utf8_to_uvchr_buf(const_s,
546 s + UTF8SKIP(s) + adjustment,
547 &len)));
548 if (len == (STRLEN) -1) {
549 av_push(av, newSViv(-1));
550 }
551 else {
552 av_push(av, newSVuv(len));
553 }
554 RETVAL = av;
555 OUTPUT:
556 RETVAL
557
558#endif
559
560#ifdef utf8_to_uvchr
561
562AV *
563utf8_to_uvchr(s)
564 unsigned char *s
565 PREINIT:
566 AV *av;
567 STRLEN len;
568 const unsigned char *const_s;
569 CODE:
570 av = newAV();
571 const_s = s;
572 av_push(av, newSVuv(utf8_to_uvchr(const_s, &len)));
573 if (len == (STRLEN) -1) {
574 av_push(av, newSViv(-1));
575 }
576 else {
577 av_push(av, newSVuv(len));
578 }
579 RETVAL = av;
580 OUTPUT:
581 RETVAL
582
583#endif
584
0570adb7
P
585#ifdef SV_NOSTEAL
586
587STRLEN
588sv_len_utf8(sv)
589 SV *sv
590 CODE:
591 RETVAL = sv_len_utf8(sv);
592 OUTPUT:
593 RETVAL
594
595STRLEN
596sv_len_utf8_nomg(sv)
597 SV *sv
598 CODE:
599 RETVAL = sv_len_utf8_nomg(sv);
600 OUTPUT:
601 RETVAL
602
603#endif
604
e2da5d9e
KW
605#ifdef UVCHR_IS_INVARIANT
606
607bool
608UVCHR_IS_INVARIANT(c)
609 unsigned c
610 PREINIT:
611 CODE:
612 RETVAL = UVCHR_IS_INVARIANT(c);
613 OUTPUT:
614 RETVAL
615
616#endif
617
08d12710
KW
618#ifdef UVCHR_SKIP
619
620STRLEN
621UVCHR_SKIP(c)
622 UV c
623 PREINIT:
624 CODE:
625 RETVAL = UVCHR_SKIP(c);
626 OUTPUT:
627 RETVAL
628
629#endif
630
631=tests plan => 93
7899b636 632
c45043a0 633BEGIN { require warnings if "$]" > '5.006' }
7899b636 634
08811920
KW
635# skip tests on 5.6.0 and earlier, plus 7.0
636if ("$]" <= '5.006' || "$]" == '5.007' ) {
08d12710 637 for (1..93) {
08811920
KW
638 skip 'skip: broken utf8 support', 0;
639 }
7899b636
KW
640 exit;
641}
642
643ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", 0), 1);
644ok(&Devel::PPPort::UTF8_SAFE_SKIP("A", -1), 0);
645
646ok(&Devel::PPPort::isUTF8_CHAR("A", -1), 0);
647ok(&Devel::PPPort::isUTF8_CHAR("A", 0), 1);
648ok(&Devel::PPPort::isUTF8_CHAR("\x{100}", -1), 0);
649ok(&Devel::PPPort::isUTF8_CHAR("\x{100}", 0), 2);
650
e2da5d9e
KW
651ok(&Devel::PPPort::UVCHR_IS_INVARIANT(ord("A")), 1);
652ok(! &Devel::PPPort::UVCHR_IS_INVARIANT(0xb6));
653ok(! &Devel::PPPort::UVCHR_IS_INVARIANT(0x100));
654
08d12710
KW
655if ("$]" < '5.006') {
656 for (1 ..9) {
657 ok(1, 1)
658 }
659}
660else {
661 ok(&Devel::PPPort::UVCHR_SKIP(ord("A")), 1);
662 ok(&Devel::PPPort::UVCHR_SKIP(0xb6), 2, "This is a test");
663 ok(&Devel::PPPort::UVCHR_SKIP(0x3FF), 2);
664 ok(&Devel::PPPort::UVCHR_SKIP(0x3FFF), 3);
665 ok(&Devel::PPPort::UVCHR_SKIP(0x3FFFF), 4);
666 ok(&Devel::PPPort::UVCHR_SKIP(0x3FFFFF), 5);
667 ok(&Devel::PPPort::UVCHR_SKIP(0x3FFFFFF), ord("A") == 65 ? 5 : 6);
668 ok(&Devel::PPPort::UVCHR_SKIP(0x4000000), ord("A") == 65 ? 6 : 7);
669 if (ord("A") != 65) {
670 ok(1, 1)
671 }
672 else {
673 ok(&Devel::PPPort::UVCHR_SKIP(0xFFFFFFFF), 7);
674 }
675}
676
c45043a0 677if ("$]" < '5.008') {
66da169a
KW
678 for (1 ..3) {
679 ok(1, 1)
680 }
2ff9e5e8
KW
681}
682else {
683 ok(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "a\x{101}", 3, 1), 1);
684 ok(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "a\x{102}", 3, 1), 0);
685 ok(&Devel::PPPort::foldEQ_utf8("A\x{100}", 3, 1, "b\x{101}", 3, 1), 0);
686}
687
7899b636
KW
688my $ret = &Devel::PPPort::utf8_to_uvchr("A");
689ok($ret->[0], ord("A"));
690ok($ret->[1], 1);
691
692$ret = &Devel::PPPort::utf8_to_uvchr("\0");
693ok($ret->[0], 0);
694ok($ret->[1], 1);
695
696$ret = &Devel::PPPort::utf8_to_uvchr_buf("A", 0);
697ok($ret->[0], ord("A"));
698ok($ret->[1], 1);
699
700$ret = &Devel::PPPort::utf8_to_uvchr_buf("\0", 0);
701ok($ret->[0], 0);
702ok($ret->[1], 1);
703
704if (ord("A") != 65) { # tests not valid for EBCDIC
66da169a
KW
705 for (1 .. (2 + 4 + (7 * 5))) {
706 ok(1, 1);
707 }
7899b636
KW
708}
709else {
710 $ret = &Devel::PPPort::utf8_to_uvchr_buf("\xc4\x80", 0);
711 ok($ret->[0], 0x100);
712 ok($ret->[1], 2);
713
714 my @warnings;
715 local $SIG{__WARN__} = sub { push @warnings, @_; };
716
717 {
c45043a0 718 BEGIN { 'warnings'->import('utf8') if "$]" > '5.006' }
7899b636
KW
719 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
720 ok($ret->[0], 0);
721 ok($ret->[1], -1);
722
c45043a0 723 BEGIN { 'warnings'->unimport() if "$]" > '5.006' }
7899b636
KW
724 $ret = &Devel::PPPort::utf8_to_uvchr("\xe0\0\x80");
725 ok($ret->[0], 0xFFFD);
726 ok($ret->[1], 1);
727 }
728
729 my @buf_tests = (
730 {
731 input => "A",
732 adjustment => -1,
21e22f08 733 warning => eval "qr/empty/",
7899b636
KW
734 no_warnings_returned_length => 0,
735 },
736 {
737 input => "\xc4\xc5",
738 adjustment => 0,
21e22f08 739 warning => eval "qr/non-continuation/",
7899b636
KW
740 no_warnings_returned_length => 1,
741 },
742 {
743 input => "\xc4\x80",
744 adjustment => -1,
21e22f08 745 warning => eval "qr/short|1 byte, need 2/",
7899b636
KW
746 no_warnings_returned_length => 1,
747 },
748 {
749 input => "\xc0\x81",
750 adjustment => 0,
21e22f08 751 warning => eval "qr/overlong|2 bytes, need 1/",
7899b636
KW
752 no_warnings_returned_length => 2,
753 },
754 {
755 input => "\xe0\x80\x81",
756 adjustment => 0,
21e22f08 757 warning => eval "qr/overlong|3 bytes, need 1/",
7899b636
KW
758 no_warnings_returned_length => 3,
759 },
760 {
761 input => "\xf0\x80\x80\x81",
762 adjustment => 0,
21e22f08 763 warning => eval "qr/overlong|4 bytes, need 1/",
7899b636
KW
764 no_warnings_returned_length => 4,
765 },
766 { # Old algorithm failed to detect this
767 input => "\xff\x80\x90\x90\x90\xbf\xbf\xbf\xbf\xbf\xbf\xbf\xbf",
768 adjustment => 0,
21e22f08 769 warning => eval "qr/overflow/",
7899b636
KW
770 no_warnings_returned_length => 13,
771 },
772 );
773
774 # An empty input is an assertion failure on debugging builds. It is
775 # deliberately the first test.
776 require Config; import Config;
777 use vars '%Config';
778 if ($Config{ccflags} =~ /-DDEBUGGING/) {
779 shift @buf_tests;
66da169a
KW
780 for (1..5) {
781 ok(1, 1);
782 }
7899b636
KW
783 }
784
66da169a
KW
785 my $test;
786 for $test (@buf_tests) {
7899b636
KW
787 my $input = $test->{'input'};
788 my $adjustment = $test->{'adjustment'};
789 my $display = 'utf8_to_uvchr_buf("';
66da169a
KW
790 my $i;
791 for ($i = 0; $i < length($input) + $adjustment; $i++) {
7899b636
KW
792 $display .= sprintf "\\x%02x", ord substr($input, $i, 1);
793 }
794
795 $display .= '")';
796 my $warning = $test->{'warning'};
797
798 undef @warnings;
c45043a0 799 BEGIN { 'warnings'->import('utf8') if "$]" > '5.006' }
7899b636
KW
800 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
801 ok($ret->[0], 0, "returned value $display; warnings enabled");
802 ok($ret->[1], -1, "returned length $display; warnings enabled");
803 my $all_warnings = join "; ", @warnings;
804 my $contains = grep { $_ =~ $warning } $all_warnings;
805 ok($contains, 1, $display
806 . "; Got: '$all_warnings', which should contain '$warning'");
807
808 undef @warnings;
c45043a0 809 BEGIN { 'warnings'->unimport('utf8') if "$]" > '5.006' }
7899b636
KW
810 $ret = &Devel::PPPort::utf8_to_uvchr_buf($input, $adjustment);
811 ok($ret->[0], 0xFFFD, "returned value $display; warnings disabled");
812 ok($ret->[1], $test->{'no_warnings_returned_length'},
813 "returned length $display; warnings disabled");
814 }
815}
0570adb7
P
816
817if ("$]" ge '5.008') {
818 BEGIN { if ("$]" ge '5.008') { require utf8; "utf8"->import() } }
819
820 ok(Devel::PPPort::sv_len_utf8("aščť"), 4);
821 ok(Devel::PPPort::sv_len_utf8_nomg("aščť"), 4);
822
823 my $str = "áíé";
824 utf8::downgrade($str);
825 ok(Devel::PPPort::sv_len_utf8($str), 3);
826 utf8::downgrade($str);
827 ok(Devel::PPPort::sv_len_utf8_nomg($str), 3);
828 utf8::upgrade($str);
829 ok(Devel::PPPort::sv_len_utf8($str), 3);
830 utf8::upgrade($str);
831 ok(Devel::PPPort::sv_len_utf8_nomg($str), 3);
832
833 tie my $scalar, 'TieScalarCounter', "é";
834
835 ok(tied($scalar)->{fetch}, 0);
836 ok(tied($scalar)->{store}, 0);
837 ok(Devel::PPPort::sv_len_utf8($scalar), 2);
838 ok(tied($scalar)->{fetch}, 1);
839 ok(tied($scalar)->{store}, 0);
840 ok(Devel::PPPort::sv_len_utf8($scalar), 3);
841 ok(tied($scalar)->{fetch}, 2);
842 ok(tied($scalar)->{store}, 0);
843 ok(Devel::PPPort::sv_len_utf8($scalar), 4);
844 ok(tied($scalar)->{fetch}, 3);
845 ok(tied($scalar)->{store}, 0);
846 ok(Devel::PPPort::sv_len_utf8_nomg($scalar), 4);
847 ok(tied($scalar)->{fetch}, 3);
848 ok(tied($scalar)->{store}, 0);
849 ok(Devel::PPPort::sv_len_utf8_nomg($scalar), 4);
850 ok(tied($scalar)->{fetch}, 3);
851 ok(tied($scalar)->{store}, 0);
852} else {
8583f9b2
KW
853 for (1..23) {
854 skip 'skip: no SV_NOSTEAL support', 0;
855 }
0570adb7
P
856}
857
858package TieScalarCounter;
859
860sub TIESCALAR {
861 my ($class, $value) = @_;
862 return bless { fetch => 0, store => 0, value => $value }, $class;
863}
864
865sub FETCH {
866 BEGIN { if ("$]" ge '5.008') { require utf8; "utf8"->import() } }
867 my ($self) = @_;
868 $self->{fetch}++;
869 return $self->{value} .= "é";
870}
871
872sub STORE {
873 my ($self, $value) = @_;
874 $self->{store}++;
875 $self->{value} = $value;
876}