This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Retreat, retreat! (retract #14715 and #14716)
[perl5.git] / utf8.c
... / ...
CommitLineData
1/* utf8.c
2 *
3 * Copyright (c) 1998-2002, Larry Wall
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
24#define PERL_IN_UTF8_C
25#include "perl.h"
26
27/*
28=head1 Unicode Support
29
30=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
31
32Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34bytes available. The return value is the pointer to the byte after the
35end of the new character. In other words,
36
37 d = uvuni_to_utf8_flags(d, uv, flags);
38
39or, in most cases,
40
41 d = uvuni_to_utf8(d, uv);
42
43(which is equivalent to)
44
45 d = uvuni_to_utf8_flags(d, uv, 0);
46
47is the recommended Unicode-aware way of saying
48
49 *(d++) = uv;
50
51=cut
52*/
53
54U8 *
55Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
56{
57 if (ckWARN(WARN_UTF8)) {
58 if (UNICODE_IS_SURROGATE(uv) &&
59 !(flags & UNICODE_ALLOW_SURROGATE))
60 Perl_warner(aTHX_ WARN_UTF8, "UTF-16 surrogate 0x%04"UVxf, uv);
61 else if (
62 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
63 !(flags & UNICODE_ALLOW_FDD0))
64 ||
65 ((uv & 0xFFFF) == 0xFFFE &&
66 !(flags & UNICODE_ALLOW_FFFE))
67 ||
68 ((uv & 0xFFFF) == 0xFFFF &&
69 !(flags & UNICODE_ALLOW_FFFF))) &&
70 /* UNICODE_ALLOW_SUPER includes
71 * FFFEs and FFFFs beyond 0x10FFFF. */
72 ((uv <= PERL_UNICODE_MAX) ||
73 !(flags & UNICODE_ALLOW_SUPER))
74 )
75 Perl_warner(aTHX_ WARN_UTF8,
76 "Unicode character 0x%04"UVxf" is illegal", uv);
77 }
78 if (UNI_IS_INVARIANT(uv)) {
79 *d++ = UTF_TO_NATIVE(uv);
80 return d;
81 }
82#if defined(EBCDIC)
83 else {
84 STRLEN len = UNISKIP(uv);
85 U8 *p = d+len-1;
86 while (p > d) {
87 *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
88 uv >>= UTF_ACCUMULATION_SHIFT;
89 }
90 *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
91 return d+len;
92 }
93#else /* Non loop style */
94 if (uv < 0x800) {
95 *d++ = (( uv >> 6) | 0xc0);
96 *d++ = (( uv & 0x3f) | 0x80);
97 return d;
98 }
99 if (uv < 0x10000) {
100 *d++ = (( uv >> 12) | 0xe0);
101 *d++ = (((uv >> 6) & 0x3f) | 0x80);
102 *d++ = (( uv & 0x3f) | 0x80);
103 return d;
104 }
105 if (uv < 0x200000) {
106 *d++ = (( uv >> 18) | 0xf0);
107 *d++ = (((uv >> 12) & 0x3f) | 0x80);
108 *d++ = (((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (( uv & 0x3f) | 0x80);
110 return d;
111 }
112 if (uv < 0x4000000) {
113 *d++ = (( uv >> 24) | 0xf8);
114 *d++ = (((uv >> 18) & 0x3f) | 0x80);
115 *d++ = (((uv >> 12) & 0x3f) | 0x80);
116 *d++ = (((uv >> 6) & 0x3f) | 0x80);
117 *d++ = (( uv & 0x3f) | 0x80);
118 return d;
119 }
120 if (uv < 0x80000000) {
121 *d++ = (( uv >> 30) | 0xfc);
122 *d++ = (((uv >> 24) & 0x3f) | 0x80);
123 *d++ = (((uv >> 18) & 0x3f) | 0x80);
124 *d++ = (((uv >> 12) & 0x3f) | 0x80);
125 *d++ = (((uv >> 6) & 0x3f) | 0x80);
126 *d++ = (( uv & 0x3f) | 0x80);
127 return d;
128 }
129#ifdef HAS_QUAD
130 if (uv < UTF8_QUAD_MAX)
131#endif
132 {
133 *d++ = 0xfe; /* Can't match U+FEFF! */
134 *d++ = (((uv >> 30) & 0x3f) | 0x80);
135 *d++ = (((uv >> 24) & 0x3f) | 0x80);
136 *d++ = (((uv >> 18) & 0x3f) | 0x80);
137 *d++ = (((uv >> 12) & 0x3f) | 0x80);
138 *d++ = (((uv >> 6) & 0x3f) | 0x80);
139 *d++ = (( uv & 0x3f) | 0x80);
140 return d;
141 }
142#ifdef HAS_QUAD
143 {
144 *d++ = 0xff; /* Can't match U+FFFE! */
145 *d++ = 0x80; /* 6 Reserved bits */
146 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
147 *d++ = (((uv >> 54) & 0x3f) | 0x80);
148 *d++ = (((uv >> 48) & 0x3f) | 0x80);
149 *d++ = (((uv >> 42) & 0x3f) | 0x80);
150 *d++ = (((uv >> 36) & 0x3f) | 0x80);
151 *d++ = (((uv >> 30) & 0x3f) | 0x80);
152 *d++ = (((uv >> 24) & 0x3f) | 0x80);
153 *d++ = (((uv >> 18) & 0x3f) | 0x80);
154 *d++ = (((uv >> 12) & 0x3f) | 0x80);
155 *d++ = (((uv >> 6) & 0x3f) | 0x80);
156 *d++ = (( uv & 0x3f) | 0x80);
157 return d;
158 }
159#endif
160#endif /* Loop style */
161}
162
163U8 *
164Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
165{
166 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
167}
168
169
170/*
171=for apidoc A|STRLEN|is_utf8_char|U8 *s
172
173Tests if some arbitrary number of bytes begins in a valid UTF-8
174character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
175The actual number of bytes in the UTF-8 character will be returned if
176it is valid, otherwise 0.
177
178=cut
179*/
180STRLEN
181Perl_is_utf8_char(pTHX_ U8 *s)
182{
183 U8 u = *s;
184 STRLEN slen, len;
185 UV uv, ouv;
186
187 if (UTF8_IS_INVARIANT(u))
188 return 1;
189
190 if (!UTF8_IS_START(u))
191 return 0;
192
193 len = UTF8SKIP(s);
194
195 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
196 return 0;
197
198 slen = len - 1;
199 s++;
200 u &= UTF_START_MASK(len);
201 uv = u;
202 ouv = uv;
203 while (slen--) {
204 if (!UTF8_IS_CONTINUATION(*s))
205 return 0;
206 uv = UTF8_ACCUMULATE(uv, *s);
207 if (uv < ouv)
208 return 0;
209 ouv = uv;
210 s++;
211 }
212
213 if (UNISKIP(uv) < len)
214 return 0;
215
216 return len;
217}
218
219/*
220=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
221
222Returns true if first C<len> bytes of the given string form a valid UTF8
223string, false otherwise. Note that 'a valid UTF8 string' does not mean
224'a string that contains UTF8' because a valid ASCII string is a valid
225UTF8 string.
226
227=cut
228*/
229
230bool
231Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
232{
233 U8* x = s;
234 U8* send;
235 STRLEN c;
236
237 if (!len)
238 len = strlen((char *)s);
239 send = s + len;
240
241 while (x < send) {
242 c = is_utf8_char(x);
243 if (!c)
244 return FALSE;
245 x += c;
246 }
247 if (x != send)
248 return FALSE;
249
250 return TRUE;
251}
252
253/*
254=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
255
256Bottom level UTF-8 decode routine.
257Returns the unicode code point value of the first character in the string C<s>
258which is assumed to be in UTF8 encoding and no longer than C<curlen>;
259C<retlen> will be set to the length, in bytes, of that character.
260
261If C<s> does not point to a well-formed UTF8 character, the behaviour
262is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
263it is assumed that the caller will raise a warning, and this function
264will silently just set C<retlen> to C<-1> and return zero. If the
265C<flags> does not contain UTF8_CHECK_ONLY, warnings about
266malformations will be given, C<retlen> will be set to the expected
267length of the UTF-8 character in bytes, and zero will be returned.
268
269The C<flags> can also contain various flags to allow deviations from
270the strict UTF-8 encoding (see F<utf8.h>).
271
272Most code should use utf8_to_uvchr() rather than call this directly.
273
274=cut
275*/
276
277UV
278Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
279{
280 U8 *s0 = s;
281 UV uv = *s, ouv = 0;
282 STRLEN len = 1;
283 bool dowarn = ckWARN_d(WARN_UTF8);
284 UV startbyte = *s;
285 STRLEN expectlen = 0;
286 U32 warning = 0;
287
288/* This list is a superset of the UTF8_ALLOW_XXX. */
289
290#define UTF8_WARN_EMPTY 1
291#define UTF8_WARN_CONTINUATION 2
292#define UTF8_WARN_NON_CONTINUATION 3
293#define UTF8_WARN_FE_FF 4
294#define UTF8_WARN_SHORT 5
295#define UTF8_WARN_OVERFLOW 6
296#define UTF8_WARN_SURROGATE 7
297#define UTF8_WARN_BOM 8
298#define UTF8_WARN_LONG 9
299#define UTF8_WARN_FFFF 10
300
301 if (curlen == 0 &&
302 !(flags & UTF8_ALLOW_EMPTY)) {
303 warning = UTF8_WARN_EMPTY;
304 goto malformed;
305 }
306
307 if (UTF8_IS_INVARIANT(uv)) {
308 if (retlen)
309 *retlen = 1;
310 return (UV) (NATIVE_TO_UTF(*s));
311 }
312
313 if (UTF8_IS_CONTINUATION(uv) &&
314 !(flags & UTF8_ALLOW_CONTINUATION)) {
315 warning = UTF8_WARN_CONTINUATION;
316 goto malformed;
317 }
318
319 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
320 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
321 warning = UTF8_WARN_NON_CONTINUATION;
322 goto malformed;
323 }
324
325#ifdef EBCDIC
326 uv = NATIVE_TO_UTF(uv);
327#else
328 if ((uv == 0xfe || uv == 0xff) &&
329 !(flags & UTF8_ALLOW_FE_FF)) {
330 warning = UTF8_WARN_FE_FF;
331 goto malformed;
332 }
333#endif
334
335 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
336 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
337 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
338 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
339#ifdef EBCDIC
340 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
341 else { len = 7; uv &= 0x01; }
342#else
343 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
344 else if (!(uv & 0x01)) { len = 7; uv = 0; }
345 else { len = 13; uv = 0; } /* whoa! */
346#endif
347
348 if (retlen)
349 *retlen = len;
350
351 expectlen = len;
352
353 if ((curlen < expectlen) &&
354 !(flags & UTF8_ALLOW_SHORT)) {
355 warning = UTF8_WARN_SHORT;
356 goto malformed;
357 }
358
359 len--;
360 s++;
361 ouv = uv;
362
363 while (len--) {
364 if (!UTF8_IS_CONTINUATION(*s) &&
365 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
366 s--;
367 warning = UTF8_WARN_NON_CONTINUATION;
368 goto malformed;
369 }
370 else
371 uv = UTF8_ACCUMULATE(uv, *s);
372 if (!(uv > ouv)) {
373 /* These cannot be allowed. */
374 if (uv == ouv) {
375 if (!(flags & UTF8_ALLOW_LONG)) {
376 warning = UTF8_WARN_LONG;
377 goto malformed;
378 }
379 }
380 else { /* uv < ouv */
381 /* This cannot be allowed. */
382 warning = UTF8_WARN_OVERFLOW;
383 goto malformed;
384 }
385 }
386 s++;
387 ouv = uv;
388 }
389
390 if (UNICODE_IS_SURROGATE(uv) &&
391 !(flags & UTF8_ALLOW_SURROGATE)) {
392 warning = UTF8_WARN_SURROGATE;
393 goto malformed;
394 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
395 !(flags & UTF8_ALLOW_BOM)) {
396 warning = UTF8_WARN_BOM;
397 goto malformed;
398 } else if ((expectlen > UNISKIP(uv)) &&
399 !(flags & UTF8_ALLOW_LONG)) {
400 warning = UTF8_WARN_LONG;
401 goto malformed;
402 } else if (UNICODE_IS_ILLEGAL(uv) &&
403 !(flags & UTF8_ALLOW_FFFF)) {
404 warning = UTF8_WARN_FFFF;
405 goto malformed;
406 }
407
408 return uv;
409
410malformed:
411
412 if (flags & UTF8_CHECK_ONLY) {
413 if (retlen)
414 *retlen = -1;
415 return 0;
416 }
417
418 if (dowarn) {
419 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
420
421 switch (warning) {
422 case 0: /* Intentionally empty. */ break;
423 case UTF8_WARN_EMPTY:
424 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
425 break;
426 case UTF8_WARN_CONTINUATION:
427 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
428 break;
429 case UTF8_WARN_NON_CONTINUATION:
430 if (s == s0)
431 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
432 (UV)s[1], startbyte);
433 else
434 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
435 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, expectlen);
436
437 break;
438 case UTF8_WARN_FE_FF:
439 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
440 break;
441 case UTF8_WARN_SHORT:
442 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
443 curlen, curlen == 1 ? "" : "s", expectlen, startbyte);
444 expectlen = curlen; /* distance for caller to skip */
445 break;
446 case UTF8_WARN_OVERFLOW:
447 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
448 ouv, *s, startbyte);
449 break;
450 case UTF8_WARN_SURROGATE:
451 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
452 break;
453 case UTF8_WARN_BOM:
454 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
455 break;
456 case UTF8_WARN_LONG:
457 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
458 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
459 break;
460 case UTF8_WARN_FFFF:
461 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
462 break;
463 default:
464 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
465 break;
466 }
467
468 if (warning) {
469 char *s = SvPVX(sv);
470
471 if (PL_op)
472 Perl_warner(aTHX_ WARN_UTF8,
473 "%s in %s", s, OP_DESC(PL_op));
474 else
475 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
476 }
477 }
478
479 if (retlen)
480 *retlen = expectlen ? expectlen : len;
481
482 return 0;
483}
484
485/*
486=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
487
488Returns the native character value of the first character in the string C<s>
489which is assumed to be in UTF8 encoding; C<retlen> will be set to the
490length, in bytes, of that character.
491
492If C<s> does not point to a well-formed UTF8 character, zero is
493returned and retlen is set, if possible, to -1.
494
495=cut
496*/
497
498UV
499Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
500{
501 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
502}
503
504/*
505=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
506
507Returns the Unicode code point of the first character in the string C<s>
508which is assumed to be in UTF8 encoding; C<retlen> will be set to the
509length, in bytes, of that character.
510
511This function should only be used when returned UV is considered
512an index into the Unicode semantic tables (e.g. swashes).
513
514If C<s> does not point to a well-formed UTF8 character, zero is
515returned and retlen is set, if possible, to -1.
516
517=cut
518*/
519
520UV
521Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
522{
523 /* Call the low level routine asking for checks */
524 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
525}
526
527/*
528=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
529
530Return the length of the UTF-8 char encoded string C<s> in characters.
531Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
532up past C<e>, croaks.
533
534=cut
535*/
536
537STRLEN
538Perl_utf8_length(pTHX_ U8 *s, U8 *e)
539{
540 STRLEN len = 0;
541
542 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
543 * the bitops (especially ~) can create illegal UTF-8.
544 * In other words: in Perl UTF-8 is not just for Unicode. */
545
546 if (e < s)
547 Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
548 while (s < e) {
549 U8 t = UTF8SKIP(s);
550
551 if (e - s < t)
552 Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
553 s += t;
554 len++;
555 }
556
557 return len;
558}
559
560/*
561=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
562
563Returns the number of UTF8 characters between the UTF-8 pointers C<a>
564and C<b>.
565
566WARNING: use only if you *know* that the pointers point inside the
567same UTF-8 buffer.
568
569=cut
570*/
571
572IV
573Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
574{
575 IV off = 0;
576
577 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
578 * the bitops (especially ~) can create illegal UTF-8.
579 * In other words: in Perl UTF-8 is not just for Unicode. */
580
581 if (a < b) {
582 while (a < b) {
583 U8 c = UTF8SKIP(a);
584
585 if (b - a < c)
586 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
587 a += c;
588 off--;
589 }
590 }
591 else {
592 while (b < a) {
593 U8 c = UTF8SKIP(b);
594
595 if (a - b < c)
596 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
597 b += c;
598 off++;
599 }
600 }
601
602 return off;
603}
604
605/*
606=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
607
608Return the UTF-8 pointer C<s> displaced by C<off> characters, either
609forward or backward.
610
611WARNING: do not use the following unless you *know* C<off> is within
612the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
613on the first byte of character or just after the last byte of a character.
614
615=cut
616*/
617
618U8 *
619Perl_utf8_hop(pTHX_ U8 *s, I32 off)
620{
621 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
622 * the bitops (especially ~) can create illegal UTF-8.
623 * In other words: in Perl UTF-8 is not just for Unicode. */
624
625 if (off >= 0) {
626 while (off--)
627 s += UTF8SKIP(s);
628 }
629 else {
630 while (off++) {
631 s--;
632 while (UTF8_IS_CONTINUATION(*s))
633 s--;
634 }
635 }
636 return s;
637}
638
639/*
640=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
641
642Converts a string C<s> of length C<len> from UTF8 into byte encoding.
643Unlike C<bytes_to_utf8>, this over-writes the original string, and
644updates len to contain the new length.
645Returns zero on failure, setting C<len> to -1.
646
647=cut
648*/
649
650U8 *
651Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
652{
653 U8 *send;
654 U8 *d;
655 U8 *save = s;
656
657 /* ensure valid UTF8 and chars < 256 before updating string */
658 for (send = s + *len; s < send; ) {
659 U8 c = *s++;
660
661 if (!UTF8_IS_INVARIANT(c) &&
662 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
663 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
664 *len = -1;
665 return 0;
666 }
667 }
668
669 d = s = save;
670 while (s < send) {
671 STRLEN ulen;
672 *d++ = (U8)utf8_to_uvchr(s, &ulen);
673 s += ulen;
674 }
675 *d = '\0';
676 *len = d - save;
677 return save;
678}
679
680/*
681=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
682
683Converts a string C<s> of length C<len> from UTF8 into byte encoding.
684Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
685the newly-created string, and updates C<len> to contain the new
686length. Returns the original string if no conversion occurs, C<len>
687is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
6880 if C<s> is converted or contains all 7bit characters.
689
690=cut
691*/
692
693U8 *
694Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
695{
696 U8 *d;
697 U8 *start = s;
698 U8 *send;
699 I32 count = 0;
700
701 if (!*is_utf8)
702 return start;
703
704 /* ensure valid UTF8 and chars < 256 before converting string */
705 for (send = s + *len; s < send;) {
706 U8 c = *s++;
707 if (!UTF8_IS_INVARIANT(c)) {
708 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
709 (c = *s++) && UTF8_IS_CONTINUATION(c))
710 count++;
711 else
712 return start;
713 }
714 }
715
716 *is_utf8 = 0;
717
718 Newz(801, d, (*len) - count + 1, U8);
719 s = start; start = d;
720 while (s < send) {
721 U8 c = *s++;
722 if (!UTF8_IS_INVARIANT(c)) {
723 /* Then it is two-byte encoded */
724 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
725 c = ASCII_TO_NATIVE(c);
726 }
727 *d++ = c;
728 }
729 *d = '\0';
730 *len = d - start;
731 return start;
732}
733
734/*
735=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
736
737Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
738Returns a pointer to the newly-created string, and sets C<len> to
739reflect the new length.
740
741=cut
742*/
743
744U8*
745Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
746{
747 U8 *send;
748 U8 *d;
749 U8 *dst;
750 send = s + (*len);
751
752 Newz(801, d, (*len) * 2 + 1, U8);
753 dst = d;
754
755 while (s < send) {
756 UV uv = NATIVE_TO_ASCII(*s++);
757 if (UNI_IS_INVARIANT(uv))
758 *d++ = UTF_TO_NATIVE(uv);
759 else {
760 *d++ = UTF8_EIGHT_BIT_HI(uv);
761 *d++ = UTF8_EIGHT_BIT_LO(uv);
762 }
763 }
764 *d = '\0';
765 *len = d-dst;
766 return dst;
767}
768
769/*
770 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
771 *
772 * Destination must be pre-extended to 3/2 source. Do not use in-place.
773 * We optimize for native, for obvious reasons. */
774
775U8*
776Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
777{
778 U8* pend;
779 U8* dstart = d;
780
781 if (bytelen & 1)
782 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
783
784 pend = p + bytelen;
785
786 while (p < pend) {
787 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
788 p += 2;
789 if (uv < 0x80) {
790 *d++ = uv;
791 continue;
792 }
793 if (uv < 0x800) {
794 *d++ = (( uv >> 6) | 0xc0);
795 *d++ = (( uv & 0x3f) | 0x80);
796 continue;
797 }
798 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
799 UV low = *p++;
800 if (low < 0xdc00 || low >= 0xdfff)
801 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
802 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
803 }
804 if (uv < 0x10000) {
805 *d++ = (( uv >> 12) | 0xe0);
806 *d++ = (((uv >> 6) & 0x3f) | 0x80);
807 *d++ = (( uv & 0x3f) | 0x80);
808 continue;
809 }
810 else {
811 *d++ = (( uv >> 18) | 0xf0);
812 *d++ = (((uv >> 12) & 0x3f) | 0x80);
813 *d++ = (((uv >> 6) & 0x3f) | 0x80);
814 *d++ = (( uv & 0x3f) | 0x80);
815 continue;
816 }
817 }
818 *newlen = d - dstart;
819 return d;
820}
821
822/* Note: this one is slightly destructive of the source. */
823
824U8*
825Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
826{
827 U8* s = (U8*)p;
828 U8* send = s + bytelen;
829 while (s < send) {
830 U8 tmp = s[0];
831 s[0] = s[1];
832 s[1] = tmp;
833 s += 2;
834 }
835 return utf16_to_utf8(p, d, bytelen, newlen);
836}
837
838/* for now these are all defined (inefficiently) in terms of the utf8 versions */
839
840bool
841Perl_is_uni_alnum(pTHX_ UV c)
842{
843 U8 tmpbuf[UTF8_MAXLEN+1];
844 uvchr_to_utf8(tmpbuf, (UV)c);
845 return is_utf8_alnum(tmpbuf);
846}
847
848bool
849Perl_is_uni_alnumc(pTHX_ UV c)
850{
851 U8 tmpbuf[UTF8_MAXLEN+1];
852 uvchr_to_utf8(tmpbuf, (UV)c);
853 return is_utf8_alnumc(tmpbuf);
854}
855
856bool
857Perl_is_uni_idfirst(pTHX_ UV c)
858{
859 U8 tmpbuf[UTF8_MAXLEN+1];
860 uvchr_to_utf8(tmpbuf, (UV)c);
861 return is_utf8_idfirst(tmpbuf);
862}
863
864bool
865Perl_is_uni_alpha(pTHX_ UV c)
866{
867 U8 tmpbuf[UTF8_MAXLEN+1];
868 uvchr_to_utf8(tmpbuf, (UV)c);
869 return is_utf8_alpha(tmpbuf);
870}
871
872bool
873Perl_is_uni_ascii(pTHX_ UV c)
874{
875 U8 tmpbuf[UTF8_MAXLEN+1];
876 uvchr_to_utf8(tmpbuf, (UV)c);
877 return is_utf8_ascii(tmpbuf);
878}
879
880bool
881Perl_is_uni_space(pTHX_ UV c)
882{
883 U8 tmpbuf[UTF8_MAXLEN+1];
884 uvchr_to_utf8(tmpbuf, (UV)c);
885 return is_utf8_space(tmpbuf);
886}
887
888bool
889Perl_is_uni_digit(pTHX_ UV c)
890{
891 U8 tmpbuf[UTF8_MAXLEN+1];
892 uvchr_to_utf8(tmpbuf, (UV)c);
893 return is_utf8_digit(tmpbuf);
894}
895
896bool
897Perl_is_uni_upper(pTHX_ UV c)
898{
899 U8 tmpbuf[UTF8_MAXLEN+1];
900 uvchr_to_utf8(tmpbuf, (UV)c);
901 return is_utf8_upper(tmpbuf);
902}
903
904bool
905Perl_is_uni_lower(pTHX_ UV c)
906{
907 U8 tmpbuf[UTF8_MAXLEN+1];
908 uvchr_to_utf8(tmpbuf, (UV)c);
909 return is_utf8_lower(tmpbuf);
910}
911
912bool
913Perl_is_uni_cntrl(pTHX_ UV c)
914{
915 U8 tmpbuf[UTF8_MAXLEN+1];
916 uvchr_to_utf8(tmpbuf, (UV)c);
917 return is_utf8_cntrl(tmpbuf);
918}
919
920bool
921Perl_is_uni_graph(pTHX_ UV c)
922{
923 U8 tmpbuf[UTF8_MAXLEN+1];
924 uvchr_to_utf8(tmpbuf, (UV)c);
925 return is_utf8_graph(tmpbuf);
926}
927
928bool
929Perl_is_uni_print(pTHX_ UV c)
930{
931 U8 tmpbuf[UTF8_MAXLEN+1];
932 uvchr_to_utf8(tmpbuf, (UV)c);
933 return is_utf8_print(tmpbuf);
934}
935
936bool
937Perl_is_uni_punct(pTHX_ UV c)
938{
939 U8 tmpbuf[UTF8_MAXLEN+1];
940 uvchr_to_utf8(tmpbuf, (UV)c);
941 return is_utf8_punct(tmpbuf);
942}
943
944bool
945Perl_is_uni_xdigit(pTHX_ UV c)
946{
947 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
948 uvchr_to_utf8(tmpbuf, (UV)c);
949 return is_utf8_xdigit(tmpbuf);
950}
951
952UV
953Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
954{
955 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
956 uvchr_to_utf8(tmpbuf, (UV)c);
957 return to_utf8_upper(tmpbuf, p, lenp);
958}
959
960UV
961Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
962{
963 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
964 uvchr_to_utf8(tmpbuf, (UV)c);
965 return to_utf8_title(tmpbuf, p, lenp);
966}
967
968UV
969Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
970{
971 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
972 uvchr_to_utf8(tmpbuf, (UV)c);
973 return to_utf8_lower(tmpbuf, p, lenp);
974}
975
976UV
977Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
978{
979 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
980 uvchr_to_utf8(tmpbuf, (UV)c);
981 return to_utf8_fold(tmpbuf, p, lenp);
982}
983
984/* for now these all assume no locale info available for Unicode > 255 */
985
986bool
987Perl_is_uni_alnum_lc(pTHX_ UV c)
988{
989 return is_uni_alnum(c); /* XXX no locale support yet */
990}
991
992bool
993Perl_is_uni_alnumc_lc(pTHX_ UV c)
994{
995 return is_uni_alnumc(c); /* XXX no locale support yet */
996}
997
998bool
999Perl_is_uni_idfirst_lc(pTHX_ UV c)
1000{
1001 return is_uni_idfirst(c); /* XXX no locale support yet */
1002}
1003
1004bool
1005Perl_is_uni_alpha_lc(pTHX_ UV c)
1006{
1007 return is_uni_alpha(c); /* XXX no locale support yet */
1008}
1009
1010bool
1011Perl_is_uni_ascii_lc(pTHX_ UV c)
1012{
1013 return is_uni_ascii(c); /* XXX no locale support yet */
1014}
1015
1016bool
1017Perl_is_uni_space_lc(pTHX_ UV c)
1018{
1019 return is_uni_space(c); /* XXX no locale support yet */
1020}
1021
1022bool
1023Perl_is_uni_digit_lc(pTHX_ UV c)
1024{
1025 return is_uni_digit(c); /* XXX no locale support yet */
1026}
1027
1028bool
1029Perl_is_uni_upper_lc(pTHX_ UV c)
1030{
1031 return is_uni_upper(c); /* XXX no locale support yet */
1032}
1033
1034bool
1035Perl_is_uni_lower_lc(pTHX_ UV c)
1036{
1037 return is_uni_lower(c); /* XXX no locale support yet */
1038}
1039
1040bool
1041Perl_is_uni_cntrl_lc(pTHX_ UV c)
1042{
1043 return is_uni_cntrl(c); /* XXX no locale support yet */
1044}
1045
1046bool
1047Perl_is_uni_graph_lc(pTHX_ UV c)
1048{
1049 return is_uni_graph(c); /* XXX no locale support yet */
1050}
1051
1052bool
1053Perl_is_uni_print_lc(pTHX_ UV c)
1054{
1055 return is_uni_print(c); /* XXX no locale support yet */
1056}
1057
1058bool
1059Perl_is_uni_punct_lc(pTHX_ UV c)
1060{
1061 return is_uni_punct(c); /* XXX no locale support yet */
1062}
1063
1064bool
1065Perl_is_uni_xdigit_lc(pTHX_ UV c)
1066{
1067 return is_uni_xdigit(c); /* XXX no locale support yet */
1068}
1069
1070U32
1071Perl_to_uni_upper_lc(pTHX_ U32 c)
1072{
1073 /* XXX returns only the first character -- do not use XXX */
1074 /* XXX no locale support yet */
1075 STRLEN len;
1076 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1077 return (U32)to_uni_upper(c, tmpbuf, &len);
1078}
1079
1080U32
1081Perl_to_uni_title_lc(pTHX_ U32 c)
1082{
1083 /* XXX returns only the first character XXX -- do not use XXX */
1084 /* XXX no locale support yet */
1085 STRLEN len;
1086 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1087 return (U32)to_uni_title(c, tmpbuf, &len);
1088}
1089
1090U32
1091Perl_to_uni_lower_lc(pTHX_ U32 c)
1092{
1093 /* XXX returns only the first character -- do not use XXX */
1094 /* XXX no locale support yet */
1095 STRLEN len;
1096 U8 tmpbuf[UTF8_MAXLEN_UCLC+1];
1097 return (U32)to_uni_lower(c, tmpbuf, &len);
1098}
1099
1100bool
1101Perl_is_utf8_alnum(pTHX_ U8 *p)
1102{
1103 if (!is_utf8_char(p))
1104 return FALSE;
1105 if (!PL_utf8_alnum)
1106 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1107 * descendant of isalnum(3), in other words, it doesn't
1108 * contain the '_'. --jhi */
1109 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1110 return swash_fetch(PL_utf8_alnum, p, TRUE);
1111/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1112#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1113 if (!PL_utf8_alnum)
1114 PL_utf8_alnum = swash_init("utf8", "",
1115 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1116 return swash_fetch(PL_utf8_alnum, p, TRUE);
1117#endif
1118}
1119
1120bool
1121Perl_is_utf8_alnumc(pTHX_ U8 *p)
1122{
1123 if (!is_utf8_char(p))
1124 return FALSE;
1125 if (!PL_utf8_alnum)
1126 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1127 return swash_fetch(PL_utf8_alnum, p, TRUE);
1128/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1129#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1130 if (!PL_utf8_alnum)
1131 PL_utf8_alnum = swash_init("utf8", "",
1132 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1133 return swash_fetch(PL_utf8_alnum, p, TRUE);
1134#endif
1135}
1136
1137bool
1138Perl_is_utf8_idfirst(pTHX_ U8 *p)
1139{
1140 return *p == '_' || is_utf8_alpha(p);
1141}
1142
1143bool
1144Perl_is_utf8_alpha(pTHX_ U8 *p)
1145{
1146 if (!is_utf8_char(p))
1147 return FALSE;
1148 if (!PL_utf8_alpha)
1149 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1150 return swash_fetch(PL_utf8_alpha, p, TRUE);
1151}
1152
1153bool
1154Perl_is_utf8_ascii(pTHX_ U8 *p)
1155{
1156 if (!is_utf8_char(p))
1157 return FALSE;
1158 if (!PL_utf8_ascii)
1159 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1160 return swash_fetch(PL_utf8_ascii, p, TRUE);
1161}
1162
1163bool
1164Perl_is_utf8_space(pTHX_ U8 *p)
1165{
1166 if (!is_utf8_char(p))
1167 return FALSE;
1168 if (!PL_utf8_space)
1169 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1170 return swash_fetch(PL_utf8_space, p, TRUE);
1171}
1172
1173bool
1174Perl_is_utf8_digit(pTHX_ U8 *p)
1175{
1176 if (!is_utf8_char(p))
1177 return FALSE;
1178 if (!PL_utf8_digit)
1179 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1180 return swash_fetch(PL_utf8_digit, p, TRUE);
1181}
1182
1183bool
1184Perl_is_utf8_upper(pTHX_ U8 *p)
1185{
1186 if (!is_utf8_char(p))
1187 return FALSE;
1188 if (!PL_utf8_upper)
1189 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1190 return swash_fetch(PL_utf8_upper, p, TRUE);
1191}
1192
1193bool
1194Perl_is_utf8_lower(pTHX_ U8 *p)
1195{
1196 if (!is_utf8_char(p))
1197 return FALSE;
1198 if (!PL_utf8_lower)
1199 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1200 return swash_fetch(PL_utf8_lower, p, TRUE);
1201}
1202
1203bool
1204Perl_is_utf8_cntrl(pTHX_ U8 *p)
1205{
1206 if (!is_utf8_char(p))
1207 return FALSE;
1208 if (!PL_utf8_cntrl)
1209 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1210 return swash_fetch(PL_utf8_cntrl, p, TRUE);
1211}
1212
1213bool
1214Perl_is_utf8_graph(pTHX_ U8 *p)
1215{
1216 if (!is_utf8_char(p))
1217 return FALSE;
1218 if (!PL_utf8_graph)
1219 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1220 return swash_fetch(PL_utf8_graph, p, TRUE);
1221}
1222
1223bool
1224Perl_is_utf8_print(pTHX_ U8 *p)
1225{
1226 if (!is_utf8_char(p))
1227 return FALSE;
1228 if (!PL_utf8_print)
1229 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1230 return swash_fetch(PL_utf8_print, p, TRUE);
1231}
1232
1233bool
1234Perl_is_utf8_punct(pTHX_ U8 *p)
1235{
1236 if (!is_utf8_char(p))
1237 return FALSE;
1238 if (!PL_utf8_punct)
1239 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1240 return swash_fetch(PL_utf8_punct, p, TRUE);
1241}
1242
1243bool
1244Perl_is_utf8_xdigit(pTHX_ U8 *p)
1245{
1246 if (!is_utf8_char(p))
1247 return FALSE;
1248 if (!PL_utf8_xdigit)
1249 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1250 return swash_fetch(PL_utf8_xdigit, p, TRUE);
1251}
1252
1253bool
1254Perl_is_utf8_mark(pTHX_ U8 *p)
1255{
1256 if (!is_utf8_char(p))
1257 return FALSE;
1258 if (!PL_utf8_mark)
1259 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1260 return swash_fetch(PL_utf8_mark, p, TRUE);
1261}
1262
1263/*
1264=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1265
1266The "p" contains the pointer to the UTF-8 string encoding
1267the character that is being converted.
1268
1269The "ustrp" is a pointer to the character buffer to put the
1270conversion result to. The "lenp" is a pointer to the length
1271of the result.
1272
1273The "swash" is a pointer to the swash to use.
1274
1275The "normal" is a string like "ToLower" which means the swash
1276$utf8::ToLower, which is stored in lib/unicore/To/Lower.pl,
1277and loaded by SWASHGET, using lib/utf8_heavy.pl.
1278
1279The "special" is a string like "utf8::ToSpecLower", which means
1280the hash %utf8::ToSpecLower, which is stored in the same file,
1281lib/unicore/To/Lower.pl, and also loaded by SWASHGET. The access
1282to the hash is by Perl_to_utf8_case().
1283
1284=cut
1285 */
1286
1287UV
1288Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
1289{
1290 UV uv;
1291
1292 if (!*swashp)
1293 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1294 uv = swash_fetch(*swashp, p, TRUE);
1295 if (!uv) {
1296 HV *hv;
1297 SV *keysv;
1298 HE *he;
1299
1300 uv = utf8_to_uvchr(p, 0);
1301
1302 if ((hv = get_hv(special, FALSE)) &&
1303 (keysv = sv_2mortal(Perl_newSVpvf(aTHX_ "%04"UVXf, uv))) &&
1304 (he = hv_fetch_ent(hv, keysv, FALSE, 0))) {
1305 SV *val = HeVAL(he);
1306 STRLEN len;
1307 char *s = SvPV(val, len);
1308
1309 if (len > 1) {
1310 Copy(s, ustrp, len, U8);
1311#ifdef EBCDIC
1312 {
1313 /* If we have EBCDIC we need to remap the
1314 * characters coming in from the "special"
1315 * (usually, but not always multicharacter)
1316 * mapping, since any characters in the low 256
1317 * are in Unicode code points, not EBCDIC.
1318 * --jhi */
1319
1320 U8 tmpbuf[UTF8_MAXLEN_FOLD+1];
1321 U8 *d = tmpbuf;
1322 U8 *t, *tend;
1323
1324 if (SvUTF8(val)) {
1325 STRLEN tlen = 0;
1326
1327 for (t = ustrp, tend = t + len;
1328 t < tend; t += tlen) {
1329 UV c = utf8_to_uvchr(t, &tlen);
1330
1331 if (tlen > 0)
1332 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1333 else
1334 break;
1335 }
1336 } else {
1337 for (t = ustrp, tend = t + len;
1338 t < tend; t++)
1339 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1340 }
1341 len = d - tmpbuf;
1342 Copy(tmpbuf, ustrp, len, U8);
1343 }
1344#endif
1345 }
1346 else {
1347 UV c = UNI_TO_NATIVE(*(U8*)s);
1348 U8 *d = uvchr_to_utf8(ustrp, c);
1349
1350 len = d - ustrp;
1351 }
1352 if (lenp)
1353 *lenp = len;
1354 return utf8_to_uvchr(ustrp, 0);
1355 }
1356 uv = NATIVE_TO_UNI(uv);
1357 }
1358 if (lenp)
1359 *lenp = UNISKIP(uv);
1360 uvuni_to_utf8(ustrp, uv);
1361 return uv;
1362}
1363
1364/*
1365=for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1366
1367Convert the UTF-8 encoded character at p to its uppercase version and
1368store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1369that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1370uppercase version may be longer than the original character (up to two
1371characters).
1372
1373The first character of the uppercased version is returned
1374(but note, as explained above, that there may be more.)
1375
1376=cut */
1377
1378UV
1379Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1380{
1381 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1382 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1383}
1384
1385/*
1386=for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1387
1388Convert the UTF-8 encoded character at p to its titlecase version and
1389store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1390that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1391titlecase version may be longer than the original character (up to two
1392characters).
1393
1394The first character of the titlecased version is returned
1395(but note, as explained above, that there may be more.)
1396
1397=cut */
1398
1399UV
1400Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1401{
1402 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1403 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1404}
1405
1406/*
1407=for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1408
1409Convert the UTF-8 encoded character at p to its lowercase version and
1410store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1411that the ustrp needs to be at least UTF8_MAXLEN_UCLC+1 bytes since the
1412lowercase version may be longer than the original character (up to two
1413characters).
1414
1415The first character of the lowercased version is returned
1416(but note, as explained above, that there may be more.)
1417
1418=cut */
1419
1420UV
1421Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1422{
1423 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1424 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1425}
1426
1427/*
1428=for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1429
1430Convert the UTF-8 encoded character at p to its foldcase version and
1431store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1432that the ustrp needs to be at least UTF8_MAXLEN_FOLD+1 bytes since the
1433foldcase version may be longer than the original character (up to
1434three characters).
1435
1436The first character of the foldcased version is returned
1437(but note, as explained above, that there may be more.)
1438
1439=cut */
1440
1441UV
1442Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1443{
1444 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1445 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1446}
1447
1448/* a "swash" is a swatch hash */
1449
1450SV*
1451Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1452{
1453 SV* retval;
1454 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1455 dSP;
1456 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1457 SV* errsv_save;
1458
1459 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1460 ENTER;
1461 errsv_save = newSVsv(ERRSV);
1462 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1463 if (!SvTRUE(ERRSV))
1464 sv_setsv(ERRSV, errsv_save);
1465 SvREFCNT_dec(errsv_save);
1466 LEAVE;
1467 }
1468 SPAGAIN;
1469 PUSHSTACKi(PERLSI_MAGIC);
1470 PUSHMARK(SP);
1471 EXTEND(SP,5);
1472 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1473 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1474 PUSHs(listsv);
1475 PUSHs(sv_2mortal(newSViv(minbits)));
1476 PUSHs(sv_2mortal(newSViv(none)));
1477 PUTBACK;
1478 ENTER;
1479 SAVEI32(PL_hints);
1480 PL_hints = 0;
1481 save_re_context();
1482 if (PL_curcop == &PL_compiling)
1483 /* XXX ought to be handled by lex_start */
1484 sv_setpv(tokenbufsv, PL_tokenbuf);
1485 errsv_save = newSVsv(ERRSV);
1486 if (call_method("SWASHNEW", G_SCALAR))
1487 retval = newSVsv(*PL_stack_sp--);
1488 else
1489 retval = &PL_sv_undef;
1490 if (!SvTRUE(ERRSV))
1491 sv_setsv(ERRSV, errsv_save);
1492 SvREFCNT_dec(errsv_save);
1493 LEAVE;
1494 POPSTACK;
1495 if (PL_curcop == &PL_compiling) {
1496 STRLEN len;
1497 char* pv = SvPV(tokenbufsv, len);
1498
1499 Copy(pv, PL_tokenbuf, len+1, char);
1500 PL_curcop->op_private = PL_hints;
1501 }
1502 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1503 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1504 return retval;
1505}
1506
1507
1508/* This API is wrong for special case conversions since we may need to
1509 * return several Unicode characters for a single Unicode character
1510 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1511 * the lower-level routine, and it is similarly broken for returning
1512 * multiple values. --jhi */
1513UV
1514Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1515{
1516 HV* hv = (HV*)SvRV(sv);
1517 U32 klen;
1518 U32 off;
1519 STRLEN slen;
1520 STRLEN needents;
1521 U8 *tmps = NULL;
1522 U32 bit;
1523 SV *retval;
1524 U8 tmputf8[2];
1525 UV c = NATIVE_TO_ASCII(*ptr);
1526
1527 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1528 tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1529 tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1530 ptr = tmputf8;
1531 }
1532 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1533 * then the "swatch" is a vec() for al the chars which start
1534 * with 0xAA..0xYY
1535 * So the key in the hash (klen) is length of encoded char -1
1536 */
1537 klen = UTF8SKIP(ptr) - 1;
1538 off = ptr[klen];
1539
1540 if (klen == 0)
1541 {
1542 /* If char in invariant then swatch is for all the invariant chars
1543 * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1544 */
1545 needents = UTF_CONTINUATION_MARK;
1546 off = NATIVE_TO_UTF(ptr[klen]);
1547 }
1548 else
1549 {
1550 /* If char is encoded then swatch is for the prefix */
1551 needents = (1 << UTF_ACCUMULATION_SHIFT);
1552 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1553 }
1554
1555 /*
1556 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1557 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1558 * it's nothing to sniff at.) Pity we usually come through at least
1559 * two function calls to get here...
1560 *
1561 * NB: this code assumes that swatches are never modified, once generated!
1562 */
1563
1564 if (hv == PL_last_swash_hv &&
1565 klen == PL_last_swash_klen &&
1566 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1567 {
1568 tmps = PL_last_swash_tmps;
1569 slen = PL_last_swash_slen;
1570 }
1571 else {
1572 /* Try our second-level swatch cache, kept in a hash. */
1573 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1574
1575 /* If not cached, generate it via utf8::SWASHGET */
1576 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1577 dSP;
1578 /* We use utf8n_to_uvuni() as we want an index into
1579 Unicode tables, not a native character number.
1580 */
1581 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
1582 SV *errsv_save;
1583 ENTER;
1584 SAVETMPS;
1585 save_re_context();
1586 PUSHSTACKi(PERLSI_MAGIC);
1587 PUSHMARK(SP);
1588 EXTEND(SP,3);
1589 PUSHs((SV*)sv);
1590 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1591 PUSHs(sv_2mortal(newSViv((klen) ?
1592 (code_point & ~(needents - 1)) : 0)));
1593 PUSHs(sv_2mortal(newSViv(needents)));
1594 PUTBACK;
1595 errsv_save = newSVsv(ERRSV);
1596 if (call_method("SWASHGET", G_SCALAR))
1597 retval = newSVsv(*PL_stack_sp--);
1598 else
1599 retval = &PL_sv_undef;
1600 if (!SvTRUE(ERRSV))
1601 sv_setsv(ERRSV, errsv_save);
1602 SvREFCNT_dec(errsv_save);
1603 POPSTACK;
1604 FREETMPS;
1605 LEAVE;
1606 if (PL_curcop == &PL_compiling)
1607 PL_curcop->op_private = PL_hints;
1608
1609 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1610
1611 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1612 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1613 }
1614
1615 PL_last_swash_hv = hv;
1616 PL_last_swash_klen = klen;
1617 PL_last_swash_tmps = tmps;
1618 PL_last_swash_slen = slen;
1619 if (klen)
1620 Copy(ptr, PL_last_swash_key, klen, U8);
1621 }
1622
1623 switch ((int)((slen << 3) / needents)) {
1624 case 1:
1625 bit = 1 << (off & 7);
1626 off >>= 3;
1627 return (tmps[off] & bit) != 0;
1628 case 8:
1629 return tmps[off];
1630 case 16:
1631 off <<= 1;
1632 return (tmps[off] << 8) + tmps[off + 1] ;
1633 case 32:
1634 off <<= 2;
1635 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1636 }
1637 Perl_croak(aTHX_ "panic: swash_fetch");
1638 return 0;
1639}
1640
1641
1642/*
1643=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1644
1645Adds the UTF8 representation of the Native codepoint C<uv> to the end
1646of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1647bytes available. The return value is the pointer to the byte after the
1648end of the new character. In other words,
1649
1650 d = uvchr_to_utf8(d, uv);
1651
1652is the recommended wide native character-aware way of saying
1653
1654 *(d++) = uv;
1655
1656=cut
1657*/
1658
1659/* On ASCII machines this is normally a macro but we want a
1660 real function in case XS code wants it
1661*/
1662#undef Perl_uvchr_to_utf8
1663U8 *
1664Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1665{
1666 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
1667}
1668
1669U8 *
1670Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1671{
1672 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1673}
1674
1675/*
1676=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1677
1678Returns the native character value of the first character in the string C<s>
1679which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1680length, in bytes, of that character.
1681
1682Allows length and flags to be passed to low level routine.
1683
1684=cut
1685*/
1686/* On ASCII machines this is normally a macro but we want
1687 a real function in case XS code wants it
1688*/
1689#undef Perl_utf8n_to_uvchr
1690UV
1691Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1692{
1693 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1694 return UNI_TO_NATIVE(uv);
1695}
1696
1697/*
1698=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1699
1700Build to the scalar dsv a displayable version of the string spv,
1701length len, the displayable version being at most pvlim bytes long
1702(if longer, the rest is truncated and "..." will be appended).
1703
1704The flags argument can have UNI_DISPLAY_ISPRINT set to display
1705isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
1706to display the \\[nrfta\\] as the backslashed versions (like '\n')
1707(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1708UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1709UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1710
1711The pointer to the PV of the dsv is returned.
1712
1713=cut */
1714char *
1715Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1716{
1717 int truncated = 0;
1718 char *s, *e;
1719
1720 sv_setpvn(dsv, "", 0);
1721 for (s = (char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
1722 UV u;
1723 bool ok = FALSE;
1724
1725 if (pvlim && SvCUR(dsv) >= pvlim) {
1726 truncated++;
1727 break;
1728 }
1729 u = utf8_to_uvchr((U8*)s, 0);
1730 if (u < 256) {
1731 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
1732 switch (u & 0xFF) {
1733 case '\n':
1734 Perl_sv_catpvf(aTHX_ dsv, "\\n"); ok = TRUE; break;
1735 case '\r':
1736 Perl_sv_catpvf(aTHX_ dsv, "\\r"); ok = TRUE; break;
1737 case '\t':
1738 Perl_sv_catpvf(aTHX_ dsv, "\\t"); ok = TRUE; break;
1739 case '\f':
1740 Perl_sv_catpvf(aTHX_ dsv, "\\f"); ok = TRUE; break;
1741 case '\a':
1742 Perl_sv_catpvf(aTHX_ dsv, "\\a"); ok = TRUE; break;
1743 case '\\':
1744 Perl_sv_catpvf(aTHX_ dsv, "\\" ); ok = TRUE; break;
1745 default: break;
1746 }
1747 }
1748 /* isPRINT() is the locale-blind version. */
1749 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(u & 0xFF)) {
1750 Perl_sv_catpvf(aTHX_ dsv, "%c", (char)(u & 0xFF));
1751 ok = TRUE;
1752 }
1753 }
1754 if (!ok)
1755 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
1756 }
1757 if (truncated)
1758 sv_catpvn(dsv, "...", 3);
1759
1760 return SvPVX(dsv);
1761}
1762
1763/*
1764=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1765
1766Build to the scalar dsv a displayable version of the scalar sv,
1767the displayable version being at most pvlim bytes long
1768(if longer, the rest is truncated and "..." will be appended).
1769
1770The flags argument is as in pv_uni_display().
1771
1772The pointer to the PV of the dsv is returned.
1773
1774=cut */
1775char *
1776Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1777{
1778 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX(ssv), SvCUR(ssv),
1779 pvlim, flags);
1780}
1781
1782/*
1783=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
1784
1785Return true if the strings s1 and s2 differ case-insensitively, false
1786if not (if they are equal case-insensitively). If u1 is true, the
1787string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
1788the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1789are false, the respective string is assumed to be in native 8-bit
1790encoding.
1791
1792If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1793in there (they will point at the beginning of the I<next> character).
1794If the pointers behind pe1 or pe2 are non-NULL, they are the end
1795pointers beyond which scanning will not continue under any
1796circustances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
1797s2+l2 will be used as goal end pointers that will also stop the scan,
1798and which qualify towards defining a successful match: all the scans
1799that define an explicit length must reach their goal pointers for
1800a match to succeed).
1801
1802For case-insensitiveness, the "casefolding" of Unicode is used
1803instead of upper/lowercasing both the characters, see
1804http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1805
1806=cut */
1807I32
1808Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
1809{
1810 register U8 *p1 = (U8*)s1;
1811 register U8 *p2 = (U8*)s2;
1812 register U8 *e1 = 0, *f1 = 0, *q1 = 0;
1813 register U8 *e2 = 0, *f2 = 0, *q2 = 0;
1814 STRLEN n1 = 0, n2 = 0;
1815 U8 foldbuf1[UTF8_MAXLEN_FOLD+1];
1816 U8 foldbuf2[UTF8_MAXLEN_FOLD+1];
1817 U8 natbuf[1+1];
1818 STRLEN foldlen1, foldlen2;
1819 bool match;
1820
1821 if (pe1)
1822 e1 = *(U8**)pe1;
1823 if (e1 == 0 || (l1 && l1 < e1 - (U8*)s1))
1824 f1 = (U8*)s1 + l1;
1825 if (pe2)
1826 e2 = *(U8**)pe2;
1827 if (e2 == 0 || (l2 && l2 < e2 - (U8*)s2))
1828 f2 = (U8*)s2 + l2;
1829
1830 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
1831 return 1; /* mismatch; possible infinite loop or false positive */
1832
1833 if (!u1 || !u2)
1834 natbuf[1] = 0; /* Need to terminate the buffer. */
1835
1836 while ((e1 == 0 || p1 < e1) &&
1837 (f1 == 0 || p1 < f1) &&
1838 (e2 == 0 || p2 < e2) &&
1839 (f2 == 0 || p2 < f2)) {
1840 if (n1 == 0) {
1841 if (u1)
1842 to_utf8_fold(p1, foldbuf1, &foldlen1);
1843 else {
1844 natbuf[0] = NATIVE_TO_UNI(*p1);
1845 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
1846 }
1847 q1 = foldbuf1;
1848 n1 = foldlen1;
1849 }
1850 if (n2 == 0) {
1851 if (u2)
1852 to_utf8_fold(p2, foldbuf2, &foldlen2);
1853 else {
1854 natbuf[0] = NATIVE_TO_UNI(*p2);
1855 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
1856 }
1857 q2 = foldbuf2;
1858 n2 = foldlen2;
1859 }
1860 while (n1 && n2) {
1861 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
1862 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
1863 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
1864 return 1; /* mismatch */
1865 n1 -= UTF8SKIP(q1);
1866 q1 += UTF8SKIP(q1);
1867 n2 -= UTF8SKIP(q2);
1868 q2 += UTF8SKIP(q2);
1869 }
1870 if (n1 == 0)
1871 p1 += u1 ? UTF8SKIP(p1) : 1;
1872 if (n2 == 0)
1873 p2 += u2 ? UTF8SKIP(p2) : 1;
1874
1875 }
1876
1877 /* A match is defined by all the scans that specified
1878 * an explicit length reaching their final goals. */
1879 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
1880
1881 if (match) {
1882 if (pe1)
1883 *pe1 = (char*)p1;
1884 if (pe2)
1885 *pe2 = (char*)p2;
1886 }
1887
1888 return match ? 0 : 1; /* 0 match, 1 mismatch */
1889}
1890