This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
pack with a human face: the sequel
[perl5.git] / utf8.c
... / ...
CommitLineData
1/* utf8.c
2 *
3 * Copyright (c) 1998-2002, Larry Wall
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
24#define PERL_IN_UTF8_C
25#include "perl.h"
26
27/*
28=head1 Unicode Support
29
30=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
31
32Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34bytes available. The return value is the pointer to the byte after the
35end of the new character. In other words,
36
37 d = uvuni_to_utf8_flags(d, uv, flags);
38
39or, in most cases,
40
41 d = uvuni_to_utf8(d, uv);
42
43(which is equivalent to)
44
45 d = uvuni_to_utf8_flags(d, uv, 0);
46
47is the recommended Unicode-aware way of saying
48
49 *(d++) = uv;
50
51=cut
52*/
53
54U8 *
55Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
56{
57 if (ckWARN(WARN_UTF8)) {
58 if (UNICODE_IS_SURROGATE(uv) &&
59 !(flags & UNICODE_ALLOW_SURROGATE))
60 Perl_warner(aTHX_ WARN_UTF8, "UTF-16 surrogate 0x%04"UVxf, uv);
61 else if (
62 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
63 !(flags & UNICODE_ALLOW_FDD0))
64 ||
65 ((uv & 0xFFFF) == 0xFFFE &&
66 !(flags & UNICODE_ALLOW_FFFE))
67 ||
68 ((uv & 0xFFFF) == 0xFFFF &&
69 !(flags & UNICODE_ALLOW_FFFF))) &&
70 /* UNICODE_ALLOW_SUPER includes
71 * FFFEs and FFFFs beyond 0x10FFFF. */
72 ((uv <= PERL_UNICODE_MAX) ||
73 !(flags & UNICODE_ALLOW_SUPER))
74 )
75 Perl_warner(aTHX_ WARN_UTF8,
76 "Unicode character 0x%04"UVxf" is illegal", uv);
77 }
78 if (UNI_IS_INVARIANT(uv)) {
79 *d++ = UTF_TO_NATIVE(uv);
80 return d;
81 }
82#if defined(EBCDIC)
83 else {
84 STRLEN len = UNISKIP(uv);
85 U8 *p = d+len-1;
86 while (p > d) {
87 *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
88 uv >>= UTF_ACCUMULATION_SHIFT;
89 }
90 *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
91 return d+len;
92 }
93#else /* Non loop style */
94 if (uv < 0x800) {
95 *d++ = (( uv >> 6) | 0xc0);
96 *d++ = (( uv & 0x3f) | 0x80);
97 return d;
98 }
99 if (uv < 0x10000) {
100 *d++ = (( uv >> 12) | 0xe0);
101 *d++ = (((uv >> 6) & 0x3f) | 0x80);
102 *d++ = (( uv & 0x3f) | 0x80);
103 return d;
104 }
105 if (uv < 0x200000) {
106 *d++ = (( uv >> 18) | 0xf0);
107 *d++ = (((uv >> 12) & 0x3f) | 0x80);
108 *d++ = (((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (( uv & 0x3f) | 0x80);
110 return d;
111 }
112 if (uv < 0x4000000) {
113 *d++ = (( uv >> 24) | 0xf8);
114 *d++ = (((uv >> 18) & 0x3f) | 0x80);
115 *d++ = (((uv >> 12) & 0x3f) | 0x80);
116 *d++ = (((uv >> 6) & 0x3f) | 0x80);
117 *d++ = (( uv & 0x3f) | 0x80);
118 return d;
119 }
120 if (uv < 0x80000000) {
121 *d++ = (( uv >> 30) | 0xfc);
122 *d++ = (((uv >> 24) & 0x3f) | 0x80);
123 *d++ = (((uv >> 18) & 0x3f) | 0x80);
124 *d++ = (((uv >> 12) & 0x3f) | 0x80);
125 *d++ = (((uv >> 6) & 0x3f) | 0x80);
126 *d++ = (( uv & 0x3f) | 0x80);
127 return d;
128 }
129#ifdef HAS_QUAD
130 if (uv < UTF8_QUAD_MAX)
131#endif
132 {
133 *d++ = 0xfe; /* Can't match U+FEFF! */
134 *d++ = (((uv >> 30) & 0x3f) | 0x80);
135 *d++ = (((uv >> 24) & 0x3f) | 0x80);
136 *d++ = (((uv >> 18) & 0x3f) | 0x80);
137 *d++ = (((uv >> 12) & 0x3f) | 0x80);
138 *d++ = (((uv >> 6) & 0x3f) | 0x80);
139 *d++ = (( uv & 0x3f) | 0x80);
140 return d;
141 }
142#ifdef HAS_QUAD
143 {
144 *d++ = 0xff; /* Can't match U+FFFE! */
145 *d++ = 0x80; /* 6 Reserved bits */
146 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
147 *d++ = (((uv >> 54) & 0x3f) | 0x80);
148 *d++ = (((uv >> 48) & 0x3f) | 0x80);
149 *d++ = (((uv >> 42) & 0x3f) | 0x80);
150 *d++ = (((uv >> 36) & 0x3f) | 0x80);
151 *d++ = (((uv >> 30) & 0x3f) | 0x80);
152 *d++ = (((uv >> 24) & 0x3f) | 0x80);
153 *d++ = (((uv >> 18) & 0x3f) | 0x80);
154 *d++ = (((uv >> 12) & 0x3f) | 0x80);
155 *d++ = (((uv >> 6) & 0x3f) | 0x80);
156 *d++ = (( uv & 0x3f) | 0x80);
157 return d;
158 }
159#endif
160#endif /* Loop style */
161}
162
163U8 *
164Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
165{
166 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
167}
168
169
170/*
171=for apidoc A|STRLEN|is_utf8_char|U8 *s
172
173Tests if some arbitrary number of bytes begins in a valid UTF-8
174character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
175The actual number of bytes in the UTF-8 character will be returned if
176it is valid, otherwise 0.
177
178=cut
179*/
180STRLEN
181Perl_is_utf8_char(pTHX_ U8 *s)
182{
183 U8 u = *s;
184 STRLEN slen, len;
185 UV uv, ouv;
186
187 if (UTF8_IS_INVARIANT(u))
188 return 1;
189
190 if (!UTF8_IS_START(u))
191 return 0;
192
193 len = UTF8SKIP(s);
194
195 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
196 return 0;
197
198 slen = len - 1;
199 s++;
200 u &= UTF_START_MASK(len);
201 uv = u;
202 ouv = uv;
203 while (slen--) {
204 if (!UTF8_IS_CONTINUATION(*s))
205 return 0;
206 uv = UTF8_ACCUMULATE(uv, *s);
207 if (uv < ouv)
208 return 0;
209 ouv = uv;
210 s++;
211 }
212
213 if (UNISKIP(uv) < len)
214 return 0;
215
216 return len;
217}
218
219/*
220=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
221
222Returns true if first C<len> bytes of the given string form a valid UTF8
223string, false otherwise. Note that 'a valid UTF8 string' does not mean
224'a string that contains UTF8' because a valid ASCII string is a valid
225UTF8 string.
226
227=cut
228*/
229
230bool
231Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
232{
233 U8* x = s;
234 U8* send;
235 STRLEN c;
236
237 if (!len)
238 len = strlen((char *)s);
239 send = s + len;
240
241 while (x < send) {
242 c = is_utf8_char(x);
243 if (!c)
244 return FALSE;
245 x += c;
246 }
247 if (x != send)
248 return FALSE;
249
250 return TRUE;
251}
252
253/*
254=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
255
256Bottom level UTF-8 decode routine.
257Returns the unicode code point value of the first character in the string C<s>
258which is assumed to be in UTF8 encoding and no longer than C<curlen>;
259C<retlen> will be set to the length, in bytes, of that character.
260
261If C<s> does not point to a well-formed UTF8 character, the behaviour
262is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
263it is assumed that the caller will raise a warning, and this function
264will silently just set C<retlen> to C<-1> and return zero. If the
265C<flags> does not contain UTF8_CHECK_ONLY, warnings about
266malformations will be given, C<retlen> will be set to the expected
267length of the UTF-8 character in bytes, and zero will be returned.
268
269The C<flags> can also contain various flags to allow deviations from
270the strict UTF-8 encoding (see F<utf8.h>).
271
272Most code should use utf8_to_uvchr() rather than call this directly.
273
274=cut
275*/
276
277UV
278Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
279{
280 U8 *s0 = s;
281 UV uv = *s, ouv = 0;
282 STRLEN len = 1;
283 bool dowarn = ckWARN_d(WARN_UTF8);
284 UV startbyte = *s;
285 STRLEN expectlen = 0;
286 U32 warning = 0;
287
288/* This list is a superset of the UTF8_ALLOW_XXX. */
289
290#define UTF8_WARN_EMPTY 1
291#define UTF8_WARN_CONTINUATION 2
292#define UTF8_WARN_NON_CONTINUATION 3
293#define UTF8_WARN_FE_FF 4
294#define UTF8_WARN_SHORT 5
295#define UTF8_WARN_OVERFLOW 6
296#define UTF8_WARN_SURROGATE 7
297#define UTF8_WARN_BOM 8
298#define UTF8_WARN_LONG 9
299#define UTF8_WARN_FFFF 10
300
301 if (curlen == 0 &&
302 !(flags & UTF8_ALLOW_EMPTY)) {
303 warning = UTF8_WARN_EMPTY;
304 goto malformed;
305 }
306
307 if (UTF8_IS_INVARIANT(uv)) {
308 if (retlen)
309 *retlen = 1;
310 return (UV) (NATIVE_TO_UTF(*s));
311 }
312
313 if (UTF8_IS_CONTINUATION(uv) &&
314 !(flags & UTF8_ALLOW_CONTINUATION)) {
315 warning = UTF8_WARN_CONTINUATION;
316 goto malformed;
317 }
318
319 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
320 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
321 warning = UTF8_WARN_NON_CONTINUATION;
322 goto malformed;
323 }
324
325#ifdef EBCDIC
326 uv = NATIVE_TO_UTF(uv);
327#else
328 if ((uv == 0xfe || uv == 0xff) &&
329 !(flags & UTF8_ALLOW_FE_FF)) {
330 warning = UTF8_WARN_FE_FF;
331 goto malformed;
332 }
333#endif
334
335 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
336 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
337 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
338 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
339#ifdef EBCDIC
340 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
341 else { len = 7; uv &= 0x01; }
342#else
343 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
344 else if (!(uv & 0x01)) { len = 7; uv = 0; }
345 else { len = 13; uv = 0; } /* whoa! */
346#endif
347
348 if (retlen)
349 *retlen = len;
350
351 expectlen = len;
352
353 if ((curlen < expectlen) &&
354 !(flags & UTF8_ALLOW_SHORT)) {
355 warning = UTF8_WARN_SHORT;
356 goto malformed;
357 }
358
359 len--;
360 s++;
361 ouv = uv;
362
363 while (len--) {
364 if (!UTF8_IS_CONTINUATION(*s) &&
365 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
366 s--;
367 warning = UTF8_WARN_NON_CONTINUATION;
368 goto malformed;
369 }
370 else
371 uv = UTF8_ACCUMULATE(uv, *s);
372 if (!(uv > ouv)) {
373 /* These cannot be allowed. */
374 if (uv == ouv) {
375 if (!(flags & UTF8_ALLOW_LONG)) {
376 warning = UTF8_WARN_LONG;
377 goto malformed;
378 }
379 }
380 else { /* uv < ouv */
381 /* This cannot be allowed. */
382 warning = UTF8_WARN_OVERFLOW;
383 goto malformed;
384 }
385 }
386 s++;
387 ouv = uv;
388 }
389
390 if (UNICODE_IS_SURROGATE(uv) &&
391 !(flags & UTF8_ALLOW_SURROGATE)) {
392 warning = UTF8_WARN_SURROGATE;
393 goto malformed;
394 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
395 !(flags & UTF8_ALLOW_BOM)) {
396 warning = UTF8_WARN_BOM;
397 goto malformed;
398 } else if ((expectlen > UNISKIP(uv)) &&
399 !(flags & UTF8_ALLOW_LONG)) {
400 warning = UTF8_WARN_LONG;
401 goto malformed;
402 } else if (UNICODE_IS_ILLEGAL(uv) &&
403 !(flags & UTF8_ALLOW_FFFF)) {
404 warning = UTF8_WARN_FFFF;
405 goto malformed;
406 }
407
408 return uv;
409
410malformed:
411
412 if (flags & UTF8_CHECK_ONLY) {
413 if (retlen)
414 *retlen = -1;
415 return 0;
416 }
417
418 if (dowarn) {
419 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
420
421 switch (warning) {
422 case 0: /* Intentionally empty. */ break;
423 case UTF8_WARN_EMPTY:
424 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
425 break;
426 case UTF8_WARN_CONTINUATION:
427 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
428 break;
429 case UTF8_WARN_NON_CONTINUATION:
430 if (s == s0)
431 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
432 (UV)s[1], startbyte);
433 else
434 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
435 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
436
437 break;
438 case UTF8_WARN_FE_FF:
439 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
440 break;
441 case UTF8_WARN_SHORT:
442 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
443 curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
444 expectlen = curlen; /* distance for caller to skip */
445 break;
446 case UTF8_WARN_OVERFLOW:
447 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
448 ouv, *s, startbyte);
449 break;
450 case UTF8_WARN_SURROGATE:
451 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
452 break;
453 case UTF8_WARN_BOM:
454 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
455 break;
456 case UTF8_WARN_LONG:
457 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
458 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
459 break;
460 case UTF8_WARN_FFFF:
461 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
462 break;
463 default:
464 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
465 break;
466 }
467
468 if (warning) {
469 char *s = SvPVX(sv);
470
471 if (PL_op)
472 Perl_warner(aTHX_ WARN_UTF8,
473 "%s in %s", s, OP_DESC(PL_op));
474 else
475 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
476 }
477 }
478
479 if (retlen)
480 *retlen = expectlen ? expectlen : len;
481
482 return 0;
483}
484
485/*
486=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
487
488Returns the native character value of the first character in the string C<s>
489which is assumed to be in UTF8 encoding; C<retlen> will be set to the
490length, in bytes, of that character.
491
492If C<s> does not point to a well-formed UTF8 character, zero is
493returned and retlen is set, if possible, to -1.
494
495=cut
496*/
497
498UV
499Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
500{
501 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
502}
503
504/*
505=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
506
507Returns the Unicode code point of the first character in the string C<s>
508which is assumed to be in UTF8 encoding; C<retlen> will be set to the
509length, in bytes, of that character.
510
511This function should only be used when returned UV is considered
512an index into the Unicode semantic tables (e.g. swashes).
513
514If C<s> does not point to a well-formed UTF8 character, zero is
515returned and retlen is set, if possible, to -1.
516
517=cut
518*/
519
520UV
521Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
522{
523 /* Call the low level routine asking for checks */
524 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
525}
526
527/*
528=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
529
530Return the length of the UTF-8 char encoded string C<s> in characters.
531Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
532up past C<e>, croaks.
533
534=cut
535*/
536
537STRLEN
538Perl_utf8_length(pTHX_ U8 *s, U8 *e)
539{
540 STRLEN len = 0;
541
542 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
543 * the bitops (especially ~) can create illegal UTF-8.
544 * In other words: in Perl UTF-8 is not just for Unicode. */
545
546 if (e < s)
547 Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
548 while (s < e) {
549 U8 t = UTF8SKIP(s);
550
551 if (e - s < t)
552 Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
553 s += t;
554 len++;
555 }
556
557 return len;
558}
559
560/*
561=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
562
563Returns the number of UTF8 characters between the UTF-8 pointers C<a>
564and C<b>.
565
566WARNING: use only if you *know* that the pointers point inside the
567same UTF-8 buffer.
568
569=cut
570*/
571
572IV
573Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
574{
575 IV off = 0;
576
577 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
578 * the bitops (especially ~) can create illegal UTF-8.
579 * In other words: in Perl UTF-8 is not just for Unicode. */
580
581 if (a < b) {
582 while (a < b) {
583 U8 c = UTF8SKIP(a);
584
585 if (b - a < c)
586 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
587 a += c;
588 off--;
589 }
590 }
591 else {
592 while (b < a) {
593 U8 c = UTF8SKIP(b);
594
595 if (a - b < c)
596 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
597 b += c;
598 off++;
599 }
600 }
601
602 return off;
603}
604
605/*
606=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
607
608Return the UTF-8 pointer C<s> displaced by C<off> characters, either
609forward or backward.
610
611WARNING: do not use the following unless you *know* C<off> is within
612the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
613on the first byte of character or just after the last byte of a character.
614
615=cut
616*/
617
618U8 *
619Perl_utf8_hop(pTHX_ U8 *s, I32 off)
620{
621 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
622 * the bitops (especially ~) can create illegal UTF-8.
623 * In other words: in Perl UTF-8 is not just for Unicode. */
624
625 if (off >= 0) {
626 while (off--)
627 s += UTF8SKIP(s);
628 }
629 else {
630 while (off++) {
631 s--;
632 while (UTF8_IS_CONTINUATION(*s))
633 s--;
634 }
635 }
636 return s;
637}
638
639/*
640=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
641
642Converts a string C<s> of length C<len> from UTF8 into byte encoding.
643Unlike C<bytes_to_utf8>, this over-writes the original string, and
644updates len to contain the new length.
645Returns zero on failure, setting C<len> to -1.
646
647=cut
648*/
649
650U8 *
651Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
652{
653 U8 *send;
654 U8 *d;
655 U8 *save = s;
656
657 /* ensure valid UTF8 and chars < 256 before updating string */
658 for (send = s + *len; s < send; ) {
659 U8 c = *s++;
660
661 if (!UTF8_IS_INVARIANT(c) &&
662 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
663 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
664 *len = -1;
665 return 0;
666 }
667 }
668
669 d = s = save;
670 while (s < send) {
671 STRLEN ulen;
672 *d++ = (U8)utf8_to_uvchr(s, &ulen);
673 s += ulen;
674 }
675 *d = '\0';
676 *len = d - save;
677 return save;
678}
679
680/*
681=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
682
683Converts a string C<s> of length C<len> from UTF8 into byte encoding.
684Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
685the newly-created string, and updates C<len> to contain the new
686length. Returns the original string if no conversion occurs, C<len>
687is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
6880 if C<s> is converted or contains all 7bit characters.
689
690=cut
691*/
692
693U8 *
694Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
695{
696 U8 *d;
697 U8 *start = s;
698 U8 *send;
699 I32 count = 0;
700
701 if (!*is_utf8)
702 return start;
703
704 /* ensure valid UTF8 and chars < 256 before converting string */
705 for (send = s + *len; s < send;) {
706 U8 c = *s++;
707 if (!UTF8_IS_INVARIANT(c)) {
708 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
709 (c = *s++) && UTF8_IS_CONTINUATION(c))
710 count++;
711 else
712 return start;
713 }
714 }
715
716 *is_utf8 = 0;
717
718 Newz(801, d, (*len) - count + 1, U8);
719 s = start; start = d;
720 while (s < send) {
721 U8 c = *s++;
722 if (!UTF8_IS_INVARIANT(c)) {
723 /* Then it is two-byte encoded */
724 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
725 c = ASCII_TO_NATIVE(c);
726 }
727 *d++ = c;
728 }
729 *d = '\0';
730 *len = d - start;
731 return start;
732}
733
734/*
735=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
736
737Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
738Returns a pointer to the newly-created string, and sets C<len> to
739reflect the new length.
740
741=cut
742*/
743
744U8*
745Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
746{
747 U8 *send;
748 U8 *d;
749 U8 *dst;
750 send = s + (*len);
751
752 Newz(801, d, (*len) * 2 + 1, U8);
753 dst = d;
754
755 while (s < send) {
756 UV uv = NATIVE_TO_ASCII(*s++);
757 if (UNI_IS_INVARIANT(uv))
758 *d++ = UTF_TO_NATIVE(uv);
759 else {
760 *d++ = UTF8_EIGHT_BIT_HI(uv);
761 *d++ = UTF8_EIGHT_BIT_LO(uv);
762 }
763 }
764 *d = '\0';
765 *len = d-dst;
766 return dst;
767}
768
769/*
770 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
771 *
772 * Destination must be pre-extended to 3/2 source. Do not use in-place.
773 * We optimize for native, for obvious reasons. */
774
775U8*
776Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
777{
778 U8* pend;
779 U8* dstart = d;
780
781 if (bytelen & 1)
782 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
783
784 pend = p + bytelen;
785
786 while (p < pend) {
787 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
788 p += 2;
789 if (uv < 0x80) {
790 *d++ = uv;
791 continue;
792 }
793 if (uv < 0x800) {
794 *d++ = (( uv >> 6) | 0xc0);
795 *d++ = (( uv & 0x3f) | 0x80);
796 continue;
797 }
798 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
799 UV low = *p++;
800 if (low < 0xdc00 || low >= 0xdfff)
801 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
802 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
803 }
804 if (uv < 0x10000) {
805 *d++ = (( uv >> 12) | 0xe0);
806 *d++ = (((uv >> 6) & 0x3f) | 0x80);
807 *d++ = (( uv & 0x3f) | 0x80);
808 continue;
809 }
810 else {
811 *d++ = (( uv >> 18) | 0xf0);
812 *d++ = (((uv >> 12) & 0x3f) | 0x80);
813 *d++ = (((uv >> 6) & 0x3f) | 0x80);
814 *d++ = (( uv & 0x3f) | 0x80);
815 continue;
816 }
817 }
818 *newlen = d - dstart;
819 return d;
820}
821
822/* Note: this one is slightly destructive of the source. */
823
824U8*
825Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
826{
827 U8* s = (U8*)p;
828 U8* send = s + bytelen;
829 while (s < send) {
830 U8 tmp = s[0];
831 s[0] = s[1];
832 s[1] = tmp;
833 s += 2;
834 }
835 return utf16_to_utf8(p, d, bytelen, newlen);
836}
837
838/* for now these are all defined (inefficiently) in terms of the utf8 versions */
839
840bool
841Perl_is_uni_alnum(pTHX_ UV c)
842{
843 U8 tmpbuf[UTF8_MAXLEN+1];
844 uvchr_to_utf8(tmpbuf, c);
845 return is_utf8_alnum(tmpbuf);
846}
847
848bool
849Perl_is_uni_alnumc(pTHX_ UV c)
850{
851 U8 tmpbuf[UTF8_MAXLEN+1];
852 uvchr_to_utf8(tmpbuf, c);
853 return is_utf8_alnumc(tmpbuf);
854}
855
856bool
857Perl_is_uni_idfirst(pTHX_ UV c)
858{
859 U8 tmpbuf[UTF8_MAXLEN+1];
860 uvchr_to_utf8(tmpbuf, c);
861 return is_utf8_idfirst(tmpbuf);
862}
863
864bool
865Perl_is_uni_alpha(pTHX_ UV c)
866{
867 U8 tmpbuf[UTF8_MAXLEN+1];
868 uvchr_to_utf8(tmpbuf, c);
869 return is_utf8_alpha(tmpbuf);
870}
871
872bool
873Perl_is_uni_ascii(pTHX_ UV c)
874{
875 U8 tmpbuf[UTF8_MAXLEN+1];
876 uvchr_to_utf8(tmpbuf, c);
877 return is_utf8_ascii(tmpbuf);
878}
879
880bool
881Perl_is_uni_space(pTHX_ UV c)
882{
883 U8 tmpbuf[UTF8_MAXLEN+1];
884 uvchr_to_utf8(tmpbuf, c);
885 return is_utf8_space(tmpbuf);
886}
887
888bool
889Perl_is_uni_digit(pTHX_ UV c)
890{
891 U8 tmpbuf[UTF8_MAXLEN+1];
892 uvchr_to_utf8(tmpbuf, c);
893 return is_utf8_digit(tmpbuf);
894}
895
896bool
897Perl_is_uni_upper(pTHX_ UV c)
898{
899 U8 tmpbuf[UTF8_MAXLEN+1];
900 uvchr_to_utf8(tmpbuf, c);
901 return is_utf8_upper(tmpbuf);
902}
903
904bool
905Perl_is_uni_lower(pTHX_ UV c)
906{
907 U8 tmpbuf[UTF8_MAXLEN+1];
908 uvchr_to_utf8(tmpbuf, c);
909 return is_utf8_lower(tmpbuf);
910}
911
912bool
913Perl_is_uni_cntrl(pTHX_ UV c)
914{
915 U8 tmpbuf[UTF8_MAXLEN+1];
916 uvchr_to_utf8(tmpbuf, c);
917 return is_utf8_cntrl(tmpbuf);
918}
919
920bool
921Perl_is_uni_graph(pTHX_ UV c)
922{
923 U8 tmpbuf[UTF8_MAXLEN+1];
924 uvchr_to_utf8(tmpbuf, c);
925 return is_utf8_graph(tmpbuf);
926}
927
928bool
929Perl_is_uni_print(pTHX_ UV c)
930{
931 U8 tmpbuf[UTF8_MAXLEN+1];
932 uvchr_to_utf8(tmpbuf, c);
933 return is_utf8_print(tmpbuf);
934}
935
936bool
937Perl_is_uni_punct(pTHX_ UV c)
938{
939 U8 tmpbuf[UTF8_MAXLEN+1];
940 uvchr_to_utf8(tmpbuf, c);
941 return is_utf8_punct(tmpbuf);
942}
943
944bool
945Perl_is_uni_xdigit(pTHX_ UV c)
946{
947 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
948 uvchr_to_utf8(tmpbuf, c);
949 return is_utf8_xdigit(tmpbuf);
950}
951
952UV
953Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
954{
955 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
956 uvchr_to_utf8(tmpbuf, c);
957 return to_utf8_upper(tmpbuf, p, lenp);
958}
959
960UV
961Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
962{
963 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
964 uvchr_to_utf8(tmpbuf, c);
965 return to_utf8_title(tmpbuf, p, lenp);
966}
967
968UV
969Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
970{
971 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
972 uvchr_to_utf8(tmpbuf, c);
973 return to_utf8_lower(tmpbuf, p, lenp);
974}
975
976UV
977Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
978{
979 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
980 uvchr_to_utf8(tmpbuf, c);
981 return to_utf8_fold(tmpbuf, p, lenp);
982}
983
984/* for now these all assume no locale info available for Unicode > 255 */
985
986bool
987Perl_is_uni_alnum_lc(pTHX_ UV c)
988{
989 return is_uni_alnum(c); /* XXX no locale support yet */
990}
991
992bool
993Perl_is_uni_alnumc_lc(pTHX_ UV c)
994{
995 return is_uni_alnumc(c); /* XXX no locale support yet */
996}
997
998bool
999Perl_is_uni_idfirst_lc(pTHX_ UV c)
1000{
1001 return is_uni_idfirst(c); /* XXX no locale support yet */
1002}
1003
1004bool
1005Perl_is_uni_alpha_lc(pTHX_ UV c)
1006{
1007 return is_uni_alpha(c); /* XXX no locale support yet */
1008}
1009
1010bool
1011Perl_is_uni_ascii_lc(pTHX_ UV c)
1012{
1013 return is_uni_ascii(c); /* XXX no locale support yet */
1014}
1015
1016bool
1017Perl_is_uni_space_lc(pTHX_ UV c)
1018{
1019 return is_uni_space(c); /* XXX no locale support yet */
1020}
1021
1022bool
1023Perl_is_uni_digit_lc(pTHX_ UV c)
1024{
1025 return is_uni_digit(c); /* XXX no locale support yet */
1026}
1027
1028bool
1029Perl_is_uni_upper_lc(pTHX_ UV c)
1030{
1031 return is_uni_upper(c); /* XXX no locale support yet */
1032}
1033
1034bool
1035Perl_is_uni_lower_lc(pTHX_ UV c)
1036{
1037 return is_uni_lower(c); /* XXX no locale support yet */
1038}
1039
1040bool
1041Perl_is_uni_cntrl_lc(pTHX_ UV c)
1042{
1043 return is_uni_cntrl(c); /* XXX no locale support yet */
1044}
1045
1046bool
1047Perl_is_uni_graph_lc(pTHX_ UV c)
1048{
1049 return is_uni_graph(c); /* XXX no locale support yet */
1050}
1051
1052bool
1053Perl_is_uni_print_lc(pTHX_ UV c)
1054{
1055 return is_uni_print(c); /* XXX no locale support yet */
1056}
1057
1058bool
1059Perl_is_uni_punct_lc(pTHX_ UV c)
1060{
1061 return is_uni_punct(c); /* XXX no locale support yet */
1062}
1063
1064bool
1065Perl_is_uni_xdigit_lc(pTHX_ UV c)
1066{
1067 return is_uni_xdigit(c); /* XXX no locale support yet */
1068}
1069
1070U32
1071Perl_to_uni_upper_lc(pTHX_ U32 c)
1072{
1073 /* XXX returns only the first character -- do not use XXX */
1074 /* XXX no locale support yet */
1075 STRLEN len;
1076 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1077 return (U32)to_uni_upper(c, tmpbuf, &len);
1078}
1079
1080U32
1081Perl_to_uni_title_lc(pTHX_ U32 c)
1082{
1083 /* XXX returns only the first character XXX -- do not use XXX */
1084 /* XXX no locale support yet */
1085 STRLEN len;
1086 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1087 return (U32)to_uni_title(c, tmpbuf, &len);
1088}
1089
1090U32
1091Perl_to_uni_lower_lc(pTHX_ U32 c)
1092{
1093 /* XXX returns only the first character -- do not use XXX */
1094 /* XXX no locale support yet */
1095 STRLEN len;
1096 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1097 return (U32)to_uni_lower(c, tmpbuf, &len);
1098}
1099
1100bool
1101Perl_is_utf8_alnum(pTHX_ U8 *p)
1102{
1103 if (!is_utf8_char(p))
1104 return FALSE;
1105 if (!PL_utf8_alnum)
1106 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1107 * descendant of isalnum(3), in other words, it doesn't
1108 * contain the '_'. --jhi */
1109 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1110 return swash_fetch(PL_utf8_alnum, p, TRUE);
1111/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1112#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1113 if (!PL_utf8_alnum)
1114 PL_utf8_alnum = swash_init("utf8", "",
1115 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1116 return swash_fetch(PL_utf8_alnum, p, TRUE);
1117#endif
1118}
1119
1120bool
1121Perl_is_utf8_alnumc(pTHX_ U8 *p)
1122{
1123 if (!is_utf8_char(p))
1124 return FALSE;
1125 if (!PL_utf8_alnum)
1126 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1127 return swash_fetch(PL_utf8_alnum, p, TRUE);
1128/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1129#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1130 if (!PL_utf8_alnum)
1131 PL_utf8_alnum = swash_init("utf8", "",
1132 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1133 return swash_fetch(PL_utf8_alnum, p, TRUE);
1134#endif
1135}
1136
1137bool
1138Perl_is_utf8_idfirst(pTHX_ U8 *p)
1139{
1140 return *p == '_' || is_utf8_alpha(p);
1141}
1142
1143bool
1144Perl_is_utf8_alpha(pTHX_ U8 *p)
1145{
1146 if (!is_utf8_char(p))
1147 return FALSE;
1148 if (!PL_utf8_alpha)
1149 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1150 return swash_fetch(PL_utf8_alpha, p, TRUE);
1151}
1152
1153bool
1154Perl_is_utf8_ascii(pTHX_ U8 *p)
1155{
1156 if (!is_utf8_char(p))
1157 return FALSE;
1158 if (!PL_utf8_ascii)
1159 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1160 return swash_fetch(PL_utf8_ascii, p, TRUE);
1161}
1162
1163bool
1164Perl_is_utf8_space(pTHX_ U8 *p)
1165{
1166 if (!is_utf8_char(p))
1167 return FALSE;
1168 if (!PL_utf8_space)
1169 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1170 return swash_fetch(PL_utf8_space, p, TRUE);
1171}
1172
1173bool
1174Perl_is_utf8_digit(pTHX_ U8 *p)
1175{
1176 if (!is_utf8_char(p))
1177 return FALSE;
1178 if (!PL_utf8_digit)
1179 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1180 return swash_fetch(PL_utf8_digit, p, TRUE);
1181}
1182
1183bool
1184Perl_is_utf8_upper(pTHX_ U8 *p)
1185{
1186 if (!is_utf8_char(p))
1187 return FALSE;
1188 if (!PL_utf8_upper)
1189 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1190 return swash_fetch(PL_utf8_upper, p, TRUE);
1191}
1192
1193bool
1194Perl_is_utf8_lower(pTHX_ U8 *p)
1195{
1196 if (!is_utf8_char(p))
1197 return FALSE;
1198 if (!PL_utf8_lower)
1199 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1200 return swash_fetch(PL_utf8_lower, p, TRUE);
1201}
1202
1203bool
1204Perl_is_utf8_cntrl(pTHX_ U8 *p)
1205{
1206 if (!is_utf8_char(p))
1207 return FALSE;
1208 if (!PL_utf8_cntrl)
1209 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1210 return swash_fetch(PL_utf8_cntrl, p, TRUE);
1211}
1212
1213bool
1214Perl_is_utf8_graph(pTHX_ U8 *p)
1215{
1216 if (!is_utf8_char(p))
1217 return FALSE;
1218 if (!PL_utf8_graph)
1219 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1220 return swash_fetch(PL_utf8_graph, p, TRUE);
1221}
1222
1223bool
1224Perl_is_utf8_print(pTHX_ U8 *p)
1225{
1226 if (!is_utf8_char(p))
1227 return FALSE;
1228 if (!PL_utf8_print)
1229 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1230 return swash_fetch(PL_utf8_print, p, TRUE);
1231}
1232
1233bool
1234Perl_is_utf8_punct(pTHX_ U8 *p)
1235{
1236 if (!is_utf8_char(p))
1237 return FALSE;
1238 if (!PL_utf8_punct)
1239 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1240 return swash_fetch(PL_utf8_punct, p, TRUE);
1241}
1242
1243bool
1244Perl_is_utf8_xdigit(pTHX_ U8 *p)
1245{
1246 if (!is_utf8_char(p))
1247 return FALSE;
1248 if (!PL_utf8_xdigit)
1249 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1250 return swash_fetch(PL_utf8_xdigit, p, TRUE);
1251}
1252
1253bool
1254Perl_is_utf8_mark(pTHX_ U8 *p)
1255{
1256 if (!is_utf8_char(p))
1257 return FALSE;
1258 if (!PL_utf8_mark)
1259 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1260 return swash_fetch(PL_utf8_mark, p, TRUE);
1261}
1262
1263/*
1264=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1265
1266The "p" contains the pointer to the UTF-8 string encoding
1267the character that is being converted.
1268
1269The "ustrp" is a pointer to the character buffer to put the
1270conversion result to. The "lenp" is a pointer to the length
1271of the result.
1272
1273The "swashp" is a pointer to the swash to use.
1274
1275Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1276and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1277but not always, a multicharacter mapping), is tried first.
1278
1279The "special" is a string like "utf8::ToSpecLower", which means the
1280hash %utf8::ToSpecLower. The access to the hash is through
1281Perl_to_utf8_case().
1282
1283The "normal" is a string like "ToLower" which means the swash
1284%utf8::ToLower.
1285
1286=cut */
1287
1288UV
1289Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
1290{
1291 UV uv0, uv1;
1292 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
1293 STRLEN len = 0;
1294
1295 uv0 = utf8_to_uvchr(p, 0);
1296 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1297 * are necessary in EBCDIC, they are redundant no-ops
1298 * in ASCII-ish platforms, and hopefully optimized away. */
1299 uv1 = NATIVE_TO_UNI(uv0);
1300 uvuni_to_utf8(tmpbuf, uv1);
1301
1302 if (!*swashp) /* load on-demand */
1303 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1304
1305 if (special) {
1306 /* It might be "special" (sometimes, but not always,
1307 * a multicharacter mapping) */
1308 HV *hv;
1309 SV *keysv;
1310 HE *he;
1311 SV *val;
1312
1313 if ((hv = get_hv(special, FALSE)) &&
1314 (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv1))) &&
1315 (he = hv_fetch_ent(hv, keysv, FALSE, 0)) &&
1316 (val = HeVAL(he))) {
1317 char *s;
1318
1319 s = SvPV(val, len);
1320 if (len == 1)
1321 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1322 else {
1323#ifdef EBCDIC
1324 /* If we have EBCDIC we need to remap the characters
1325 * since any characters in the low 256 are Unicode
1326 * code points, not EBCDIC. */
1327 U8 *t = (U8*)s, *tend = t + len, *d;
1328
1329 d = tmpbuf;
1330 if (SvUTF8(val)) {
1331 STRLEN tlen = 0;
1332
1333 while (t < tend) {
1334 UV c = utf8_to_uvchr(t, &tlen);
1335 if (tlen > 0) {
1336 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1337 t += tlen;
1338 }
1339 else
1340 break;
1341 }
1342 }
1343 else {
1344 while (t < tend) {
1345 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1346 t++;
1347 }
1348 }
1349 len = d - tmpbuf;
1350 Copy(tmpbuf, ustrp, len, U8);
1351#else
1352 Copy(s, ustrp, len, U8);
1353#endif
1354 }
1355 }
1356 }
1357
1358 if (!len && *swashp) {
1359 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1360
1361 if (uv2) {
1362 /* It was "normal" (a single character mapping). */
1363 UV uv3 = UNI_TO_NATIVE(uv2);
1364
1365 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1366 }
1367 }
1368
1369 if (!len) /* Neither: just copy. */
1370 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1371
1372 if (lenp)
1373 *lenp = len;
1374
1375 return len ? utf8_to_uvchr(ustrp, 0) : 0;
1376}
1377
1378/*
1379=for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1380
1381Convert the UTF-8 encoded character at p to its uppercase version and
1382store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1383that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1384uppercase version may be longer than the original character (up to two
1385characters).
1386
1387The first character of the uppercased version is returned
1388(but note, as explained above, that there may be more.)
1389
1390=cut */
1391
1392UV
1393Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1394{
1395 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1396 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1397}
1398
1399/*
1400=for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1401
1402Convert the UTF-8 encoded character at p to its titlecase version and
1403store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1404that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1405titlecase version may be longer than the original character (up to two
1406characters).
1407
1408The first character of the titlecased version is returned
1409(but note, as explained above, that there may be more.)
1410
1411=cut */
1412
1413UV
1414Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1415{
1416 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1417 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1418}
1419
1420/*
1421=for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1422
1423Convert the UTF-8 encoded character at p to its lowercase version and
1424store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1425that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1426lowercase version may be longer than the original character (up to two
1427characters).
1428
1429The first character of the lowercased version is returned
1430(but note, as explained above, that there may be more.)
1431
1432=cut */
1433
1434UV
1435Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1436{
1437 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1438 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1439}
1440
1441/*
1442=for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1443
1444Convert the UTF-8 encoded character at p to its foldcase version and
1445store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1446that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1447foldcase version may be longer than the original character (up to
1448three characters).
1449
1450The first character of the foldcased version is returned
1451(but note, as explained above, that there may be more.)
1452
1453=cut */
1454
1455UV
1456Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1457{
1458 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1459 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1460}
1461
1462/* a "swash" is a swatch hash */
1463
1464SV*
1465Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1466{
1467 SV* retval;
1468 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1469 dSP;
1470 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1471 SV* errsv_save;
1472
1473 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1474 ENTER;
1475 errsv_save = newSVsv(ERRSV);
1476 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1477 if (!SvTRUE(ERRSV))
1478 sv_setsv(ERRSV, errsv_save);
1479 SvREFCNT_dec(errsv_save);
1480 LEAVE;
1481 }
1482 SPAGAIN;
1483 PUSHSTACKi(PERLSI_MAGIC);
1484 PUSHMARK(SP);
1485 EXTEND(SP,5);
1486 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1487 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1488 PUSHs(listsv);
1489 PUSHs(sv_2mortal(newSViv(minbits)));
1490 PUSHs(sv_2mortal(newSViv(none)));
1491 PUTBACK;
1492 ENTER;
1493 SAVEI32(PL_hints);
1494 PL_hints = 0;
1495 save_re_context();
1496 if (PL_curcop == &PL_compiling)
1497 /* XXX ought to be handled by lex_start */
1498 sv_setpv(tokenbufsv, PL_tokenbuf);
1499 errsv_save = newSVsv(ERRSV);
1500 if (call_method("SWASHNEW", G_SCALAR))
1501 retval = newSVsv(*PL_stack_sp--);
1502 else
1503 retval = &PL_sv_undef;
1504 if (!SvTRUE(ERRSV))
1505 sv_setsv(ERRSV, errsv_save);
1506 SvREFCNT_dec(errsv_save);
1507 LEAVE;
1508 POPSTACK;
1509 if (PL_curcop == &PL_compiling) {
1510 STRLEN len;
1511 char* pv = SvPV(tokenbufsv, len);
1512
1513 Copy(pv, PL_tokenbuf, len+1, char);
1514 PL_curcop->op_private = PL_hints;
1515 }
1516 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1517 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1518 return retval;
1519}
1520
1521
1522/* This API is wrong for special case conversions since we may need to
1523 * return several Unicode characters for a single Unicode character
1524 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1525 * the lower-level routine, and it is similarly broken for returning
1526 * multiple values. --jhi */
1527UV
1528Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1529{
1530 HV* hv = (HV*)SvRV(sv);
1531 U32 klen;
1532 U32 off;
1533 STRLEN slen;
1534 STRLEN needents;
1535 U8 *tmps = NULL;
1536 U32 bit;
1537 SV *retval;
1538 U8 tmputf8[2];
1539 UV c = NATIVE_TO_ASCII(*ptr);
1540
1541 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1542 tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1543 tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1544 ptr = tmputf8;
1545 }
1546 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1547 * then the "swatch" is a vec() for al the chars which start
1548 * with 0xAA..0xYY
1549 * So the key in the hash (klen) is length of encoded char -1
1550 */
1551 klen = UTF8SKIP(ptr) - 1;
1552 off = ptr[klen];
1553
1554 if (klen == 0)
1555 {
1556 /* If char in invariant then swatch is for all the invariant chars
1557 * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1558 */
1559 needents = UTF_CONTINUATION_MARK;
1560 off = NATIVE_TO_UTF(ptr[klen]);
1561 }
1562 else
1563 {
1564 /* If char is encoded then swatch is for the prefix */
1565 needents = (1 << UTF_ACCUMULATION_SHIFT);
1566 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1567 }
1568
1569 /*
1570 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1571 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1572 * it's nothing to sniff at.) Pity we usually come through at least
1573 * two function calls to get here...
1574 *
1575 * NB: this code assumes that swatches are never modified, once generated!
1576 */
1577
1578 if (hv == PL_last_swash_hv &&
1579 klen == PL_last_swash_klen &&
1580 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1581 {
1582 tmps = PL_last_swash_tmps;
1583 slen = PL_last_swash_slen;
1584 }
1585 else {
1586 /* Try our second-level swatch cache, kept in a hash. */
1587 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1588
1589 /* If not cached, generate it via utf8::SWASHGET */
1590 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1591 dSP;
1592 /* We use utf8n_to_uvuni() as we want an index into
1593 Unicode tables, not a native character number.
1594 */
1595 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
1596 SV *errsv_save;
1597 ENTER;
1598 SAVETMPS;
1599 save_re_context();
1600 PUSHSTACKi(PERLSI_MAGIC);
1601 PUSHMARK(SP);
1602 EXTEND(SP,3);
1603 PUSHs((SV*)sv);
1604 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1605 PUSHs(sv_2mortal(newSViv((klen) ?
1606 (code_point & ~(needents - 1)) : 0)));
1607 PUSHs(sv_2mortal(newSViv(needents)));
1608 PUTBACK;
1609 errsv_save = newSVsv(ERRSV);
1610 if (call_method("SWASHGET", G_SCALAR))
1611 retval = newSVsv(*PL_stack_sp--);
1612 else
1613 retval = &PL_sv_undef;
1614 if (!SvTRUE(ERRSV))
1615 sv_setsv(ERRSV, errsv_save);
1616 SvREFCNT_dec(errsv_save);
1617 POPSTACK;
1618 FREETMPS;
1619 LEAVE;
1620 if (PL_curcop == &PL_compiling)
1621 PL_curcop->op_private = PL_hints;
1622
1623 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1624
1625 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1626 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1627 }
1628
1629 PL_last_swash_hv = hv;
1630 PL_last_swash_klen = klen;
1631 PL_last_swash_tmps = tmps;
1632 PL_last_swash_slen = slen;
1633 if (klen)
1634 Copy(ptr, PL_last_swash_key, klen, U8);
1635 }
1636
1637 switch ((int)((slen << 3) / needents)) {
1638 case 1:
1639 bit = 1 << (off & 7);
1640 off >>= 3;
1641 return (tmps[off] & bit) != 0;
1642 case 8:
1643 return tmps[off];
1644 case 16:
1645 off <<= 1;
1646 return (tmps[off] << 8) + tmps[off + 1] ;
1647 case 32:
1648 off <<= 2;
1649 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1650 }
1651 Perl_croak(aTHX_ "panic: swash_fetch");
1652 return 0;
1653}
1654
1655
1656/*
1657=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1658
1659Adds the UTF8 representation of the Native codepoint C<uv> to the end
1660of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1661bytes available. The return value is the pointer to the byte after the
1662end of the new character. In other words,
1663
1664 d = uvchr_to_utf8(d, uv);
1665
1666is the recommended wide native character-aware way of saying
1667
1668 *(d++) = uv;
1669
1670=cut
1671*/
1672
1673/* On ASCII machines this is normally a macro but we want a
1674 real function in case XS code wants it
1675*/
1676#undef Perl_uvchr_to_utf8
1677U8 *
1678Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1679{
1680 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1681}
1682
1683U8 *
1684Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1685{
1686 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1687}
1688
1689/*
1690=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1691
1692Returns the native character value of the first character in the string C<s>
1693which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1694length, in bytes, of that character.
1695
1696Allows length and flags to be passed to low level routine.
1697
1698=cut
1699*/
1700/* On ASCII machines this is normally a macro but we want
1701 a real function in case XS code wants it
1702*/
1703#undef Perl_utf8n_to_uvchr
1704UV
1705Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1706{
1707 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1708 return UNI_TO_NATIVE(uv);
1709}
1710
1711/*
1712=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1713
1714Build to the scalar dsv a displayable version of the string spv,
1715length len, the displayable version being at most pvlim bytes long
1716(if longer, the rest is truncated and "..." will be appended).
1717
1718The flags argument can have UNI_DISPLAY_ISPRINT set to display
1719isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1720to display the \\[nrfta\\] as the backslashed versions (like '\n')
1721(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1722UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1723UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1724
1725The pointer to the PV of the dsv is returned.
1726
1727=cut */
1728char *
1729Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1730{
1731 int truncated = 0;
1732 char *s, *e;
1733
1734 sv_setpvn(dsv, "", 0);
1735 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1736 UV u;
1737 bool ok = FALSE;
1738
1739 if (pvlim && SvCUR(dsv) >= pvlim) {
1740 truncated++;
1741 break;
1742 }
1743 u = utf8_to_uvchr((U8*)s, 0);
1744 if (u < 256) {
1745 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1746 switch (u & 0xFF) {
1747 case '\n':
1748 Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1749 case '\r':
1750 Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1751 case '\t':
1752 Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1753 case '\f':
1754 Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1755 case '\a':
1756 Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1757 case '\\':
1758 Perl_sv_catpvf(aTHX_ dsv, "\\" ); ok = TRUE; break;
1759 default: break;
1760 }
1761 }
1762 /* isPRINT() is the locale-blind version. */
1763 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
1764 Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
1765 ok = TRUE;
1766 }
1767 }
1768 if (!ok)
1769 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1770 }
1771 if (truncated)
1772 sv_catpvn(dsv, "...", 3);
1773
1774 return SvPVX(dsv);
1775}
1776
1777/*
1778=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1779
1780Build to the scalar dsv a displayable version of the scalar sv,
1781the displayable version being at most pvlim bytes long
1782(if longer, the rest is truncated and "..." will be appended).
1783
1784The flags argument is as in pv_uni_display().
1785
1786The pointer to the PV of the dsv is returned.
1787
1788=cut */
1789char *
1790Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1791{
1792 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1793 pvlim, flags);
1794}
1795
1796/*
1797=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1798
1799Return true if the strings s1 and s2 differ case-insensitively, false
1800if not (if they are equal case-insensitively). If u1 is true, the
1801string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
1802the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1803are false, the respective string is assumed to be in native 8-bit
1804encoding.
1805
1806If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1807in there (they will point at the beginning of the I<next> character).
1808If the pointers behind pe1 or pe2 are non-NULL, they are the end
1809pointers beyond which scanning will not continue under any
1810circustances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
1811s2+l2 will be used as goal end pointers that will also stop the scan,
1812and which qualify towards defining a successful match: all the scans
1813that define an explicit length must reach their goal pointers for
1814a match to succeed).
1815
1816For case-insensitiveness, the "casefolding" of Unicode is used
1817instead of upper/lowercasing both the characters, see
1818http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1819
1820=cut */
1821I32
1822Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
1823{
1824 register U8 *p1 = (U8*)s1;
1825 register U8 *p2 = (U8*)s2;
1826 register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1827 register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1828 STRLEN n1 = 0, n2 = 0;
1829 U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1830 U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
1831 U8 natbuf[1+1];
1832 STRLEN foldlen1, foldlen2;
1833 bool match;
1834
1835 if (pe1)
1836 e1 = *(U8**)pe1;
1837 if (e1 == 0 || (l1 && l1 < e1 - (U8*)s1))
1838 f1 = (U8*)s1 + l1;
1839 if (pe2)
1840 e2 = *(U8**)pe2;
1841 if (e2 == 0 || (l2 && l2 < e2 - (U8*)s2))
1842 f2 = (U8*)s2 + l2;
1843
1844 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1845 return 1; /* mismatch; possible infinite loop or false positive */
1846
1847 if (!u1 || !u2)
1848 natbuf[1] = 0; /* Need to terminate the buffer. */
1849
1850 while ((e1 == 0 || p1 < e1) &&
1851 (f1 == 0 || p1 < f1) &&
1852 (e2 == 0 || p2 < e2) &&
1853 (f2 == 0 || p2 < f2)) {
1854 if (n1 == 0) {
1855 if (u1)
1856 to_utf8_fold(p1, foldbuf1, &foldlen1);
1857 else {
1858 natbuf[0] = *p1;
1859 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1860 }
1861 q1 = foldbuf1;
1862 n1 = foldlen1;
1863 }
1864 if (n2 == 0) {
1865 if (u2)
1866 to_utf8_fold(p2, foldbuf2, &foldlen2);
1867 else {
1868 natbuf[0] = *p2;
1869 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1870 }
1871 q2 = foldbuf2;
1872 n2 = foldlen2;
1873 }
1874 while (n1 && n2) {
1875 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1876 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1877 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
1878 return 1; /* mismatch */
1879 n1 -= UTF8SKIP(q1);
1880 q1 += UTF8SKIP(q1);
1881 n2 -= UTF8SKIP(q2);
1882 q2 += UTF8SKIP(q2);
1883 }
1884 if (n1 == 0)
1885 p1 += u1 ? UTF8SKIP(p1) : 1;
1886 if (n2 == 0)
1887 p2 += u2 ? UTF8SKIP(p2) : 1;
1888
1889 }
1890
1891 /* A match is defined by all the scans that specified
1892 * an explicit length reaching their final goals. */
1893 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
1894
1895 if (match) {
1896 if (pe1)
1897 *pe1 = (char*)p1;
1898 if (pe2)
1899 *pe2 = (char*)p2;
1900 }
1901
1902 return match ? 0 : 1; /* 0 match, 1 mismatch */
1903}
1904