This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
remove PL_na from typemap
[perl5.git] / utf8.c
... / ...
CommitLineData
1/* utf8.c
2 *
3 * Copyright (c) 1998-2001, Larry Wall
4 *
5 * You may distribute under the terms of either the GNU General Public
6 * License or the Artistic License, as specified in the README file.
7 *
8 */
9
10/*
11 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
12 * heard of that we don't want to see any closer; and that's the one place
13 * we're trying to get to! And that's just where we can't get, nohow.'
14 *
15 * 'Well do I understand your speech,' he answered in the same language;
16 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
17 * as is the custom in the West, if you wish to be answered?'
18 *
19 * ...the travellers perceived that the floor was paved with stones of many
20 * hues; branching runes and strange devices intertwined beneath their feet.
21 */
22
23#include "EXTERN.h"
24#define PERL_IN_UTF8_C
25#include "perl.h"
26
27/* Unicode support */
28
29/*
30=for apidoc A|U8 *|uvuni_to_utf8|U8 *d|UV uv
31
32Adds the UTF8 representation of the Unicode codepoint C<uv> to the end
33of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
34bytes available. The return value is the pointer to the byte after the
35end of the new character. In other words,
36
37 d = uvuni_to_utf8(d, uv);
38
39is the recommended Unicode-aware way of saying
40
41 *(d++) = uv;
42
43=cut
44*/
45
46U8 *
47Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
48{
49 if (UNI_IS_INVARIANT(uv)) {
50 *d++ = UTF_TO_NATIVE(uv);
51 return d;
52 }
53#if defined(EBCDIC) || 1 /* always for testing */
54 else {
55 STRLEN len = UNISKIP(uv);
56 U8 *p = d+len-1;
57 while (p > d) {
58 *p-- = UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
59 uv >>= UTF_ACCUMULATION_SHIFT;
60 }
61 *p = UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
62 return d+len;
63 }
64#else /* Non loop style */
65 if (uv < 0x800) {
66 *d++ = (( uv >> 6) | 0xc0);
67 *d++ = (( uv & 0x3f) | 0x80);
68 return d;
69 }
70 if (uv < 0x10000) {
71 *d++ = (( uv >> 12) | 0xe0);
72 *d++ = (((uv >> 6) & 0x3f) | 0x80);
73 *d++ = (( uv & 0x3f) | 0x80);
74 return d;
75 }
76 if (uv < 0x200000) {
77 *d++ = (( uv >> 18) | 0xf0);
78 *d++ = (((uv >> 12) & 0x3f) | 0x80);
79 *d++ = (((uv >> 6) & 0x3f) | 0x80);
80 *d++ = (( uv & 0x3f) | 0x80);
81 return d;
82 }
83 if (uv < 0x4000000) {
84 *d++ = (( uv >> 24) | 0xf8);
85 *d++ = (((uv >> 18) & 0x3f) | 0x80);
86 *d++ = (((uv >> 12) & 0x3f) | 0x80);
87 *d++ = (((uv >> 6) & 0x3f) | 0x80);
88 *d++ = (( uv & 0x3f) | 0x80);
89 return d;
90 }
91 if (uv < 0x80000000) {
92 *d++ = (( uv >> 30) | 0xfc);
93 *d++ = (((uv >> 24) & 0x3f) | 0x80);
94 *d++ = (((uv >> 18) & 0x3f) | 0x80);
95 *d++ = (((uv >> 12) & 0x3f) | 0x80);
96 *d++ = (((uv >> 6) & 0x3f) | 0x80);
97 *d++ = (( uv & 0x3f) | 0x80);
98 return d;
99 }
100#ifdef HAS_QUAD
101 if (uv < UTF8_QUAD_MAX)
102#endif
103 {
104 *d++ = 0xfe; /* Can't match U+FEFF! */
105 *d++ = (((uv >> 30) & 0x3f) | 0x80);
106 *d++ = (((uv >> 24) & 0x3f) | 0x80);
107 *d++ = (((uv >> 18) & 0x3f) | 0x80);
108 *d++ = (((uv >> 12) & 0x3f) | 0x80);
109 *d++ = (((uv >> 6) & 0x3f) | 0x80);
110 *d++ = (( uv & 0x3f) | 0x80);
111 return d;
112 }
113#ifdef HAS_QUAD
114 {
115 *d++ = 0xff; /* Can't match U+FFFE! */
116 *d++ = 0x80; /* 6 Reserved bits */
117 *d++ = (((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
118 *d++ = (((uv >> 54) & 0x3f) | 0x80);
119 *d++ = (((uv >> 48) & 0x3f) | 0x80);
120 *d++ = (((uv >> 42) & 0x3f) | 0x80);
121 *d++ = (((uv >> 36) & 0x3f) | 0x80);
122 *d++ = (((uv >> 30) & 0x3f) | 0x80);
123 *d++ = (((uv >> 24) & 0x3f) | 0x80);
124 *d++ = (((uv >> 18) & 0x3f) | 0x80);
125 *d++ = (((uv >> 12) & 0x3f) | 0x80);
126 *d++ = (((uv >> 6) & 0x3f) | 0x80);
127 *d++ = (( uv & 0x3f) | 0x80);
128 return d;
129 }
130#endif
131#endif /* Loop style */
132}
133
134
135
136/*
137=for apidoc A|STRLEN|is_utf8_char|U8 *s
138
139Tests if some arbitrary number of bytes begins in a valid UTF-8
140character. Note that an INVARIANT (i.e. ASCII) character is a valid UTF-8 character.
141The actual number of bytes in the UTF-8 character will be returned if
142it is valid, otherwise 0.
143
144=cut
145*/
146STRLEN
147Perl_is_utf8_char(pTHX_ U8 *s)
148{
149 U8 u = *s;
150 STRLEN slen, len;
151 UV uv, ouv;
152
153 if (UTF8_IS_INVARIANT(u))
154 return 1;
155
156 if (!UTF8_IS_START(u))
157 return 0;
158
159 len = UTF8SKIP(s);
160
161 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
162 return 0;
163
164 slen = len - 1;
165 s++;
166 u &= UTF_START_MASK(len);
167 uv = u;
168 ouv = uv;
169 while (slen--) {
170 if (!UTF8_IS_CONTINUATION(*s))
171 return 0;
172 uv = UTF8_ACCUMULATE(uv, *s);
173 if (uv < ouv)
174 return 0;
175 ouv = uv;
176 s++;
177 }
178
179 if (UNISKIP(uv) < len)
180 return 0;
181
182 return len;
183}
184
185/*
186=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
187
188Returns true if first C<len> bytes of the given string form a valid UTF8
189string, false otherwise. Note that 'a valid UTF8 string' does not mean
190'a string that contains UTF8' because a valid ASCII string is a valid
191UTF8 string.
192
193=cut
194*/
195
196bool
197Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
198{
199 U8* x = s;
200 U8* send;
201 STRLEN c;
202
203 if (!len)
204 len = strlen((char *)s);
205 send = s + len;
206
207 while (x < send) {
208 c = is_utf8_char(x);
209 if (!c)
210 return FALSE;
211 x += c;
212 }
213 if (x != send)
214 return FALSE;
215
216 return TRUE;
217}
218
219/*
220=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
221
222Bottom level UTF-8 decode routine.
223Returns the unicode code point value of the first character in the string C<s>
224which is assumed to be in UTF8 encoding and no longer than C<curlen>;
225C<retlen> will be set to the length, in bytes, of that character.
226
227If C<s> does not point to a well-formed UTF8 character, the behaviour
228is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
229it is assumed that the caller will raise a warning, and this function
230will silently just set C<retlen> to C<-1> and return zero. If the
231C<flags> does not contain UTF8_CHECK_ONLY, warnings about
232malformations will be given, C<retlen> will be set to the expected
233length of the UTF-8 character in bytes, and zero will be returned.
234
235The C<flags> can also contain various flags to allow deviations from
236the strict UTF-8 encoding (see F<utf8.h>).
237
238Most code should use utf8_to_uvchr() rather than call this directly.
239
240=cut
241*/
242
243UV
244Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
245{
246 UV uv = *s, ouv = 0;
247 STRLEN len = 1;
248 bool dowarn = ckWARN_d(WARN_UTF8);
249 STRLEN expectlen = 0;
250 U32 warning = 0;
251
252/* This list is a superset of the UTF8_ALLOW_XXX. */
253
254#define UTF8_WARN_EMPTY 1
255#define UTF8_WARN_CONTINUATION 2
256#define UTF8_WARN_NON_CONTINUATION 3
257#define UTF8_WARN_FE_FF 4
258#define UTF8_WARN_SHORT 5
259#define UTF8_WARN_OVERFLOW 6
260#define UTF8_WARN_SURROGATE 7
261#define UTF8_WARN_BOM 8
262#define UTF8_WARN_LONG 9
263#define UTF8_WARN_FFFF 10
264
265 if (curlen == 0 &&
266 !(flags & UTF8_ALLOW_EMPTY)) {
267 warning = UTF8_WARN_EMPTY;
268 goto malformed;
269 }
270
271 if (UTF8_IS_INVARIANT(uv)) {
272 if (retlen)
273 *retlen = 1;
274 return (UV) (NATIVE_TO_UTF(*s));
275 }
276
277 if (UTF8_IS_CONTINUATION(uv) &&
278 !(flags & UTF8_ALLOW_CONTINUATION)) {
279 warning = UTF8_WARN_CONTINUATION;
280 goto malformed;
281 }
282
283 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
284 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
285 warning = UTF8_WARN_NON_CONTINUATION;
286 goto malformed;
287 }
288
289#ifdef EBCDIC
290 uv = NATIVE_TO_UTF(uv);
291#else
292 if ((uv == 0xfe || uv == 0xff) &&
293 !(flags & UTF8_ALLOW_FE_FF)) {
294 warning = UTF8_WARN_FE_FF;
295 goto malformed;
296 }
297#endif
298
299 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
300 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
301 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
302 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
303#ifdef EBCDIC
304 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
305 else { len = 7; uv &= 0x01; }
306#else
307 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
308 else if (!(uv & 0x01)) { len = 7; uv = 0; }
309 else { len = 13; uv = 0; } /* whoa! */
310#endif
311
312 if (retlen)
313 *retlen = len;
314
315 expectlen = len;
316
317 if ((curlen < expectlen) &&
318 !(flags & UTF8_ALLOW_SHORT)) {
319 warning = UTF8_WARN_SHORT;
320 goto malformed;
321 }
322
323 len--;
324 s++;
325 ouv = uv;
326
327 while (len--) {
328 if (!UTF8_IS_CONTINUATION(*s) &&
329 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
330 s--;
331 warning = UTF8_WARN_NON_CONTINUATION;
332 goto malformed;
333 }
334 else
335 uv = UTF8_ACCUMULATE(uv, *s);
336 if (!(uv > ouv)) {
337 /* These cannot be allowed. */
338 if (uv == ouv) {
339 if (!(flags & UTF8_ALLOW_LONG)) {
340 warning = UTF8_WARN_LONG;
341 goto malformed;
342 }
343 }
344 else { /* uv < ouv */
345 /* This cannot be allowed. */
346 warning = UTF8_WARN_OVERFLOW;
347 goto malformed;
348 }
349 }
350 s++;
351 ouv = uv;
352 }
353
354 if (UNICODE_IS_SURROGATE(uv) &&
355 !(flags & UTF8_ALLOW_SURROGATE)) {
356 warning = UTF8_WARN_SURROGATE;
357 goto malformed;
358 } else if (UNICODE_IS_BYTE_ORDER_MARK(uv) &&
359 !(flags & UTF8_ALLOW_BOM)) {
360 warning = UTF8_WARN_BOM;
361 goto malformed;
362 } else if ((expectlen > UNISKIP(uv)) &&
363 !(flags & UTF8_ALLOW_LONG)) {
364 warning = UTF8_WARN_LONG;
365 goto malformed;
366 } else if (UNICODE_IS_ILLEGAL(uv) &&
367 !(flags & UTF8_ALLOW_FFFF)) {
368 warning = UTF8_WARN_FFFF;
369 goto malformed;
370 }
371
372 return uv;
373
374malformed:
375
376 if (flags & UTF8_CHECK_ONLY) {
377 if (retlen)
378 *retlen = -1;
379 return 0;
380 }
381
382 if (dowarn) {
383 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
384
385 switch (warning) {
386 case 0: /* Intentionally empty. */ break;
387 case UTF8_WARN_EMPTY:
388 Perl_sv_catpvf(aTHX_ sv, "(empty string)");
389 break;
390 case UTF8_WARN_CONTINUATION:
391 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf")", uv);
392 break;
393 case UTF8_WARN_NON_CONTINUATION:
394 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf" after start byte 0x%02"UVxf")",
395 (UV)s[1], uv);
396 break;
397 case UTF8_WARN_FE_FF:
398 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
399 break;
400 case UTF8_WARN_SHORT:
401 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
402 curlen, curlen == 1 ? "" : "s", expectlen);
403 break;
404 case UTF8_WARN_OVERFLOW:
405 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x)",
406 ouv, *s);
407 break;
408 case UTF8_WARN_SURROGATE:
409 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
410 break;
411 case UTF8_WARN_BOM:
412 Perl_sv_catpvf(aTHX_ sv, "(byte order mark 0x%04"UVxf")", uv);
413 break;
414 case UTF8_WARN_LONG:
415 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d)",
416 expectlen, expectlen == 1 ? "": "s", UNISKIP(uv));
417 break;
418 case UTF8_WARN_FFFF:
419 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
420 break;
421 default:
422 Perl_sv_catpvf(aTHX_ sv, "(unknown reason)");
423 break;
424 }
425
426 if (warning) {
427 char *s = SvPVX(sv);
428
429 if (PL_op)
430 Perl_warner(aTHX_ WARN_UTF8,
431 "%s in %s", s, PL_op_desc[PL_op->op_type]);
432 else
433 Perl_warner(aTHX_ WARN_UTF8, "%s", s);
434 }
435 }
436
437 if (retlen)
438 *retlen = expectlen ? expectlen : len;
439
440 return 0;
441}
442
443/*
444=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
445
446Returns the native character value of the first character in the string C<s>
447which is assumed to be in UTF8 encoding; C<retlen> will be set to the
448length, in bytes, of that character.
449
450If C<s> does not point to a well-formed UTF8 character, zero is
451returned and retlen is set, if possible, to -1.
452
453=cut
454*/
455
456UV
457Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
458{
459 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXLEN, retlen, 0);
460}
461
462/*
463=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
464
465Returns the Unicode code point of the first character in the string C<s>
466which is assumed to be in UTF8 encoding; C<retlen> will be set to the
467length, in bytes, of that character.
468
469This function should only be used when returned UV is considered
470an index into the Unicode semantic tables (e.g. swashes).
471
472If C<s> does not point to a well-formed UTF8 character, zero is
473returned and retlen is set, if possible, to -1.
474
475=cut
476*/
477
478UV
479Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
480{
481 /* Call the low level routine asking for checks */
482 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXLEN, retlen, 0);
483}
484
485/*
486=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
487
488Return the length of the UTF-8 char encoded string C<s> in characters.
489Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
490up past C<e>, croaks.
491
492=cut
493*/
494
495STRLEN
496Perl_utf8_length(pTHX_ U8 *s, U8 *e)
497{
498 STRLEN len = 0;
499
500 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
501 * the bitops (especially ~) can create illegal UTF-8.
502 * In other words: in Perl UTF-8 is not just for Unicode. */
503
504 if (e < s)
505 Perl_croak(aTHX_ "panic: utf8_length: unexpected end");
506 while (s < e) {
507 U8 t = UTF8SKIP(s);
508
509 if (e - s < t)
510 Perl_croak(aTHX_ "panic: utf8_length: unaligned end");
511 s += t;
512 len++;
513 }
514
515 return len;
516}
517
518/*
519=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
520
521Returns the number of UTF8 characters between the UTF-8 pointers C<a>
522and C<b>.
523
524WARNING: use only if you *know* that the pointers point inside the
525same UTF-8 buffer.
526
527=cut
528*/
529
530IV
531Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
532{
533 IV off = 0;
534
535 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
536 * the bitops (especially ~) can create illegal UTF-8.
537 * In other words: in Perl UTF-8 is not just for Unicode. */
538
539 if (a < b) {
540 while (a < b) {
541 U8 c = UTF8SKIP(a);
542
543 if (b - a < c)
544 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
545 a += c;
546 off--;
547 }
548 }
549 else {
550 while (b < a) {
551 U8 c = UTF8SKIP(b);
552
553 if (a - b < c)
554 Perl_croak(aTHX_ "panic: utf8_distance: unaligned end");
555 b += c;
556 off++;
557 }
558 }
559
560 return off;
561}
562
563/*
564=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
565
566Return the UTF-8 pointer C<s> displaced by C<off> characters, either
567forward or backward.
568
569WARNING: do not use the following unless you *know* C<off> is within
570the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
571on the first byte of character or just after the last byte of a character.
572
573=cut
574*/
575
576U8 *
577Perl_utf8_hop(pTHX_ U8 *s, I32 off)
578{
579 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
580 * the bitops (especially ~) can create illegal UTF-8.
581 * In other words: in Perl UTF-8 is not just for Unicode. */
582
583 if (off >= 0) {
584 while (off--)
585 s += UTF8SKIP(s);
586 }
587 else {
588 while (off++) {
589 s--;
590 while (UTF8_IS_CONTINUATION(*s))
591 s--;
592 }
593 }
594 return s;
595}
596
597/*
598=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
599
600Converts a string C<s> of length C<len> from UTF8 into byte encoding.
601Unlike C<bytes_to_utf8>, this over-writes the original string, and
602updates len to contain the new length.
603Returns zero on failure, setting C<len> to -1.
604
605=cut
606*/
607
608U8 *
609Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
610{
611 U8 *send;
612 U8 *d;
613 U8 *save = s;
614
615 /* ensure valid UTF8 and chars < 256 before updating string */
616 for (send = s + *len; s < send; ) {
617 U8 c = *s++;
618
619 if (!UTF8_IS_INVARIANT(c) &&
620 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
621 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
622 *len = -1;
623 return 0;
624 }
625 }
626
627 d = s = save;
628 while (s < send) {
629 STRLEN ulen;
630 *d++ = (U8)utf8_to_uvchr(s, &ulen);
631 s += ulen;
632 }
633 *d = '\0';
634 *len = d - save;
635 return save;
636}
637
638/*
639=for apidoc A|U8 *|bytes_from_utf8|U8 *s|STRLEN *len|bool *is_utf8
640
641Converts a string C<s> of length C<len> from UTF8 into byte encoding.
642Unlike <utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
643the newly-created string, and updates C<len> to contain the new
644length. Returns the original string if no conversion occurs, C<len>
645is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
6460 if C<s> is converted or contains all 7bit characters.
647
648=cut
649*/
650
651U8 *
652Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
653{
654 U8 *d;
655 U8 *start = s;
656 U8 *send;
657 I32 count = 0;
658
659 if (!*is_utf8)
660 return start;
661
662 /* ensure valid UTF8 and chars < 256 before converting string */
663 for (send = s + *len; s < send;) {
664 U8 c = *s++;
665 if (!UTF8_IS_INVARIANT(c)) {
666 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
667 (c = *s++) && UTF8_IS_CONTINUATION(c))
668 count++;
669 else
670 return start;
671 }
672 }
673
674 *is_utf8 = 0;
675
676 Newz(801, d, (*len) - count + 1, U8);
677 s = start; start = d;
678 while (s < send) {
679 U8 c = *s++;
680 if (!UTF8_IS_INVARIANT(c)) {
681 /* Then it is two-byte encoded */
682 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
683 c = ASCII_TO_NATIVE(c);
684 }
685 *d++ = c;
686 }
687 *d = '\0';
688 *len = d - start;
689 return start;
690}
691
692/*
693=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
694
695Converts a string C<s> of length C<len> from ASCII into UTF8 encoding.
696Returns a pointer to the newly-created string, and sets C<len> to
697reflect the new length.
698
699=cut
700*/
701
702U8*
703Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
704{
705 U8 *send;
706 U8 *d;
707 U8 *dst;
708 send = s + (*len);
709
710 Newz(801, d, (*len) * 2 + 1, U8);
711 dst = d;
712
713 while (s < send) {
714 UV uv = NATIVE_TO_ASCII(*s++);
715 if (UNI_IS_INVARIANT(uv))
716 *d++ = UTF_TO_NATIVE(uv);
717 else {
718 *d++ = UTF8_EIGHT_BIT_HI(uv);
719 *d++ = UTF8_EIGHT_BIT_LO(uv);
720 }
721 }
722 *d = '\0';
723 *len = d-dst;
724 return dst;
725}
726
727/*
728 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
729 *
730 * Destination must be pre-extended to 3/2 source. Do not use in-place.
731 * We optimize for native, for obvious reasons. */
732
733U8*
734Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
735{
736 U8* pend;
737 U8* dstart = d;
738
739 if (bytelen & 1)
740 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen");
741
742 pend = p + bytelen;
743
744 while (p < pend) {
745 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
746 p += 2;
747 if (uv < 0x80) {
748 *d++ = uv;
749 continue;
750 }
751 if (uv < 0x800) {
752 *d++ = (( uv >> 6) | 0xc0);
753 *d++ = (( uv & 0x3f) | 0x80);
754 continue;
755 }
756 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
757 UV low = *p++;
758 if (low < 0xdc00 || low >= 0xdfff)
759 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
760 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
761 }
762 if (uv < 0x10000) {
763 *d++ = (( uv >> 12) | 0xe0);
764 *d++ = (((uv >> 6) & 0x3f) | 0x80);
765 *d++ = (( uv & 0x3f) | 0x80);
766 continue;
767 }
768 else {
769 *d++ = (( uv >> 18) | 0xf0);
770 *d++ = (((uv >> 12) & 0x3f) | 0x80);
771 *d++ = (((uv >> 6) & 0x3f) | 0x80);
772 *d++ = (( uv & 0x3f) | 0x80);
773 continue;
774 }
775 }
776 *newlen = d - dstart;
777 return d;
778}
779
780/* Note: this one is slightly destructive of the source. */
781
782U8*
783Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
784{
785 U8* s = (U8*)p;
786 U8* send = s + bytelen;
787 while (s < send) {
788 U8 tmp = s[0];
789 s[0] = s[1];
790 s[1] = tmp;
791 s += 2;
792 }
793 return utf16_to_utf8(p, d, bytelen, newlen);
794}
795
796/* for now these are all defined (inefficiently) in terms of the utf8 versions */
797
798bool
799Perl_is_uni_alnum(pTHX_ U32 c)
800{
801 U8 tmpbuf[UTF8_MAXLEN+1];
802 uvchr_to_utf8(tmpbuf, (UV)c);
803 return is_utf8_alnum(tmpbuf);
804}
805
806bool
807Perl_is_uni_alnumc(pTHX_ U32 c)
808{
809 U8 tmpbuf[UTF8_MAXLEN+1];
810 uvchr_to_utf8(tmpbuf, (UV)c);
811 return is_utf8_alnumc(tmpbuf);
812}
813
814bool
815Perl_is_uni_idfirst(pTHX_ U32 c)
816{
817 U8 tmpbuf[UTF8_MAXLEN+1];
818 uvchr_to_utf8(tmpbuf, (UV)c);
819 return is_utf8_idfirst(tmpbuf);
820}
821
822bool
823Perl_is_uni_alpha(pTHX_ U32 c)
824{
825 U8 tmpbuf[UTF8_MAXLEN+1];
826 uvchr_to_utf8(tmpbuf, (UV)c);
827 return is_utf8_alpha(tmpbuf);
828}
829
830bool
831Perl_is_uni_ascii(pTHX_ U32 c)
832{
833 U8 tmpbuf[UTF8_MAXLEN+1];
834 uvchr_to_utf8(tmpbuf, (UV)c);
835 return is_utf8_ascii(tmpbuf);
836}
837
838bool
839Perl_is_uni_space(pTHX_ U32 c)
840{
841 U8 tmpbuf[UTF8_MAXLEN+1];
842 uvchr_to_utf8(tmpbuf, (UV)c);
843 return is_utf8_space(tmpbuf);
844}
845
846bool
847Perl_is_uni_digit(pTHX_ U32 c)
848{
849 U8 tmpbuf[UTF8_MAXLEN+1];
850 uvchr_to_utf8(tmpbuf, (UV)c);
851 return is_utf8_digit(tmpbuf);
852}
853
854bool
855Perl_is_uni_upper(pTHX_ U32 c)
856{
857 U8 tmpbuf[UTF8_MAXLEN+1];
858 uvchr_to_utf8(tmpbuf, (UV)c);
859 return is_utf8_upper(tmpbuf);
860}
861
862bool
863Perl_is_uni_lower(pTHX_ U32 c)
864{
865 U8 tmpbuf[UTF8_MAXLEN+1];
866 uvchr_to_utf8(tmpbuf, (UV)c);
867 return is_utf8_lower(tmpbuf);
868}
869
870bool
871Perl_is_uni_cntrl(pTHX_ U32 c)
872{
873 U8 tmpbuf[UTF8_MAXLEN+1];
874 uvchr_to_utf8(tmpbuf, (UV)c);
875 return is_utf8_cntrl(tmpbuf);
876}
877
878bool
879Perl_is_uni_graph(pTHX_ U32 c)
880{
881 U8 tmpbuf[UTF8_MAXLEN+1];
882 uvchr_to_utf8(tmpbuf, (UV)c);
883 return is_utf8_graph(tmpbuf);
884}
885
886bool
887Perl_is_uni_print(pTHX_ U32 c)
888{
889 U8 tmpbuf[UTF8_MAXLEN+1];
890 uvchr_to_utf8(tmpbuf, (UV)c);
891 return is_utf8_print(tmpbuf);
892}
893
894bool
895Perl_is_uni_punct(pTHX_ U32 c)
896{
897 U8 tmpbuf[UTF8_MAXLEN+1];
898 uvchr_to_utf8(tmpbuf, (UV)c);
899 return is_utf8_punct(tmpbuf);
900}
901
902bool
903Perl_is_uni_xdigit(pTHX_ U32 c)
904{
905 U8 tmpbuf[UTF8_MAXLEN+1];
906 uvchr_to_utf8(tmpbuf, (UV)c);
907 return is_utf8_xdigit(tmpbuf);
908}
909
910U32
911Perl_to_uni_upper(pTHX_ U32 c)
912{
913 U8 tmpbuf[UTF8_MAXLEN+1];
914 uvchr_to_utf8(tmpbuf, (UV)c);
915 return to_utf8_upper(tmpbuf);
916}
917
918U32
919Perl_to_uni_title(pTHX_ U32 c)
920{
921 U8 tmpbuf[UTF8_MAXLEN+1];
922 uvchr_to_utf8(tmpbuf, (UV)c);
923 return to_utf8_title(tmpbuf);
924}
925
926U32
927Perl_to_uni_lower(pTHX_ U32 c)
928{
929 U8 tmpbuf[UTF8_MAXLEN+1];
930 uvchr_to_utf8(tmpbuf, (UV)c);
931 return to_utf8_lower(tmpbuf);
932}
933
934/* for now these all assume no locale info available for Unicode > 255 */
935
936bool
937Perl_is_uni_alnum_lc(pTHX_ U32 c)
938{
939 return is_uni_alnum(c); /* XXX no locale support yet */
940}
941
942bool
943Perl_is_uni_alnumc_lc(pTHX_ U32 c)
944{
945 return is_uni_alnumc(c); /* XXX no locale support yet */
946}
947
948bool
949Perl_is_uni_idfirst_lc(pTHX_ U32 c)
950{
951 return is_uni_idfirst(c); /* XXX no locale support yet */
952}
953
954bool
955Perl_is_uni_alpha_lc(pTHX_ U32 c)
956{
957 return is_uni_alpha(c); /* XXX no locale support yet */
958}
959
960bool
961Perl_is_uni_ascii_lc(pTHX_ U32 c)
962{
963 return is_uni_ascii(c); /* XXX no locale support yet */
964}
965
966bool
967Perl_is_uni_space_lc(pTHX_ U32 c)
968{
969 return is_uni_space(c); /* XXX no locale support yet */
970}
971
972bool
973Perl_is_uni_digit_lc(pTHX_ U32 c)
974{
975 return is_uni_digit(c); /* XXX no locale support yet */
976}
977
978bool
979Perl_is_uni_upper_lc(pTHX_ U32 c)
980{
981 return is_uni_upper(c); /* XXX no locale support yet */
982}
983
984bool
985Perl_is_uni_lower_lc(pTHX_ U32 c)
986{
987 return is_uni_lower(c); /* XXX no locale support yet */
988}
989
990bool
991Perl_is_uni_cntrl_lc(pTHX_ U32 c)
992{
993 return is_uni_cntrl(c); /* XXX no locale support yet */
994}
995
996bool
997Perl_is_uni_graph_lc(pTHX_ U32 c)
998{
999 return is_uni_graph(c); /* XXX no locale support yet */
1000}
1001
1002bool
1003Perl_is_uni_print_lc(pTHX_ U32 c)
1004{
1005 return is_uni_print(c); /* XXX no locale support yet */
1006}
1007
1008bool
1009Perl_is_uni_punct_lc(pTHX_ U32 c)
1010{
1011 return is_uni_punct(c); /* XXX no locale support yet */
1012}
1013
1014bool
1015Perl_is_uni_xdigit_lc(pTHX_ U32 c)
1016{
1017 return is_uni_xdigit(c); /* XXX no locale support yet */
1018}
1019
1020U32
1021Perl_to_uni_upper_lc(pTHX_ U32 c)
1022{
1023 return to_uni_upper(c); /* XXX no locale support yet */
1024}
1025
1026U32
1027Perl_to_uni_title_lc(pTHX_ U32 c)
1028{
1029 return to_uni_title(c); /* XXX no locale support yet */
1030}
1031
1032U32
1033Perl_to_uni_lower_lc(pTHX_ U32 c)
1034{
1035 return to_uni_lower(c); /* XXX no locale support yet */
1036}
1037
1038bool
1039Perl_is_utf8_alnum(pTHX_ U8 *p)
1040{
1041 if (!is_utf8_char(p))
1042 return FALSE;
1043 if (!PL_utf8_alnum)
1044 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1045 * descendant of isalnum(3), in other words, it doesn't
1046 * contain the '_'. --jhi */
1047 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
1048 return swash_fetch(PL_utf8_alnum, p, TRUE);
1049/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1050#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1051 if (!PL_utf8_alnum)
1052 PL_utf8_alnum = swash_init("utf8", "",
1053 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1054 return swash_fetch(PL_utf8_alnum, p, TRUE);
1055#endif
1056}
1057
1058bool
1059Perl_is_utf8_alnumc(pTHX_ U8 *p)
1060{
1061 if (!is_utf8_char(p))
1062 return FALSE;
1063 if (!PL_utf8_alnum)
1064 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
1065 return swash_fetch(PL_utf8_alnum, p, TRUE);
1066/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1067#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1068 if (!PL_utf8_alnum)
1069 PL_utf8_alnum = swash_init("utf8", "",
1070 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
1071 return swash_fetch(PL_utf8_alnum, p, TRUE);
1072#endif
1073}
1074
1075bool
1076Perl_is_utf8_idfirst(pTHX_ U8 *p)
1077{
1078 return *p == '_' || is_utf8_alpha(p);
1079}
1080
1081bool
1082Perl_is_utf8_alpha(pTHX_ U8 *p)
1083{
1084 if (!is_utf8_char(p))
1085 return FALSE;
1086 if (!PL_utf8_alpha)
1087 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
1088 return swash_fetch(PL_utf8_alpha, p, TRUE);
1089}
1090
1091bool
1092Perl_is_utf8_ascii(pTHX_ U8 *p)
1093{
1094 if (!is_utf8_char(p))
1095 return FALSE;
1096 if (!PL_utf8_ascii)
1097 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
1098 return swash_fetch(PL_utf8_ascii, p, TRUE);
1099}
1100
1101bool
1102Perl_is_utf8_space(pTHX_ U8 *p)
1103{
1104 if (!is_utf8_char(p))
1105 return FALSE;
1106 if (!PL_utf8_space)
1107 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
1108 return swash_fetch(PL_utf8_space, p, TRUE);
1109}
1110
1111bool
1112Perl_is_utf8_digit(pTHX_ U8 *p)
1113{
1114 if (!is_utf8_char(p))
1115 return FALSE;
1116 if (!PL_utf8_digit)
1117 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
1118 return swash_fetch(PL_utf8_digit, p, TRUE);
1119}
1120
1121bool
1122Perl_is_utf8_upper(pTHX_ U8 *p)
1123{
1124 if (!is_utf8_char(p))
1125 return FALSE;
1126 if (!PL_utf8_upper)
1127 PL_utf8_upper = swash_init("utf8", "IsUpper", &PL_sv_undef, 0, 0);
1128 return swash_fetch(PL_utf8_upper, p, TRUE);
1129}
1130
1131bool
1132Perl_is_utf8_lower(pTHX_ U8 *p)
1133{
1134 if (!is_utf8_char(p))
1135 return FALSE;
1136 if (!PL_utf8_lower)
1137 PL_utf8_lower = swash_init("utf8", "IsLower", &PL_sv_undef, 0, 0);
1138 return swash_fetch(PL_utf8_lower, p, TRUE);
1139}
1140
1141bool
1142Perl_is_utf8_cntrl(pTHX_ U8 *p)
1143{
1144 if (!is_utf8_char(p))
1145 return FALSE;
1146 if (!PL_utf8_cntrl)
1147 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
1148 return swash_fetch(PL_utf8_cntrl, p, TRUE);
1149}
1150
1151bool
1152Perl_is_utf8_graph(pTHX_ U8 *p)
1153{
1154 if (!is_utf8_char(p))
1155 return FALSE;
1156 if (!PL_utf8_graph)
1157 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
1158 return swash_fetch(PL_utf8_graph, p, TRUE);
1159}
1160
1161bool
1162Perl_is_utf8_print(pTHX_ U8 *p)
1163{
1164 if (!is_utf8_char(p))
1165 return FALSE;
1166 if (!PL_utf8_print)
1167 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
1168 return swash_fetch(PL_utf8_print, p, TRUE);
1169}
1170
1171bool
1172Perl_is_utf8_punct(pTHX_ U8 *p)
1173{
1174 if (!is_utf8_char(p))
1175 return FALSE;
1176 if (!PL_utf8_punct)
1177 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
1178 return swash_fetch(PL_utf8_punct, p, TRUE);
1179}
1180
1181bool
1182Perl_is_utf8_xdigit(pTHX_ U8 *p)
1183{
1184 if (!is_utf8_char(p))
1185 return FALSE;
1186 if (!PL_utf8_xdigit)
1187 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
1188 return swash_fetch(PL_utf8_xdigit, p, TRUE);
1189}
1190
1191bool
1192Perl_is_utf8_mark(pTHX_ U8 *p)
1193{
1194 if (!is_utf8_char(p))
1195 return FALSE;
1196 if (!PL_utf8_mark)
1197 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
1198 return swash_fetch(PL_utf8_mark, p, TRUE);
1199}
1200
1201UV
1202Perl_to_utf8_upper(pTHX_ U8 *p)
1203{
1204 UV uv;
1205
1206 if (!PL_utf8_toupper)
1207 PL_utf8_toupper = swash_init("utf8", "ToUpper", &PL_sv_undef, 4, 0);
1208 uv = swash_fetch(PL_utf8_toupper, p, TRUE);
1209 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1210}
1211
1212UV
1213Perl_to_utf8_title(pTHX_ U8 *p)
1214{
1215 UV uv;
1216
1217 if (!PL_utf8_totitle)
1218 PL_utf8_totitle = swash_init("utf8", "ToTitle", &PL_sv_undef, 4, 0);
1219 uv = swash_fetch(PL_utf8_totitle, p, TRUE);
1220 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1221}
1222
1223UV
1224Perl_to_utf8_lower(pTHX_ U8 *p)
1225{
1226 UV uv;
1227
1228 if (!PL_utf8_tolower)
1229 PL_utf8_tolower = swash_init("utf8", "ToLower", &PL_sv_undef, 4, 0);
1230 uv = swash_fetch(PL_utf8_tolower, p, TRUE);
1231 return uv ? UNI_TO_NATIVE(uv) : utf8_to_uvchr(p,0);
1232}
1233
1234/* a "swash" is a swatch hash */
1235
1236SV*
1237Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
1238{
1239 SV* retval;
1240 SV* tokenbufsv = sv_2mortal(NEWSV(0,0));
1241 dSP;
1242 HV *stash = gv_stashpvn(pkg, strlen(pkg), FALSE);
1243 SV* errsv_save;
1244
1245 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1246 ENTER;
1247 errsv_save = newSVsv(ERRSV);
1248 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpv(pkg,0), Nullsv);
1249 if (!SvTRUE(ERRSV))
1250 sv_setsv(ERRSV, errsv_save);
1251 SvREFCNT_dec(errsv_save);
1252 LEAVE;
1253 }
1254 SPAGAIN;
1255 PUSHSTACKi(PERLSI_MAGIC);
1256 PUSHMARK(SP);
1257 EXTEND(SP,5);
1258 PUSHs(sv_2mortal(newSVpvn(pkg, strlen(pkg))));
1259 PUSHs(sv_2mortal(newSVpvn(name, strlen(name))));
1260 PUSHs(listsv);
1261 PUSHs(sv_2mortal(newSViv(minbits)));
1262 PUSHs(sv_2mortal(newSViv(none)));
1263 PUTBACK;
1264 ENTER;
1265 SAVEI32(PL_hints);
1266 PL_hints = 0;
1267 save_re_context();
1268 if (PL_curcop == &PL_compiling)
1269 /* XXX ought to be handled by lex_start */
1270 sv_setpv(tokenbufsv, PL_tokenbuf);
1271 errsv_save = newSVsv(ERRSV);
1272 if (call_method("SWASHNEW", G_SCALAR))
1273 retval = newSVsv(*PL_stack_sp--);
1274 else
1275 retval = &PL_sv_undef;
1276 if (!SvTRUE(ERRSV))
1277 sv_setsv(ERRSV, errsv_save);
1278 SvREFCNT_dec(errsv_save);
1279 LEAVE;
1280 POPSTACK;
1281 if (PL_curcop == &PL_compiling) {
1282 STRLEN len;
1283 char* pv = SvPV(tokenbufsv, len);
1284
1285 Copy(pv, PL_tokenbuf, len+1, char);
1286 PL_curcop->op_private = PL_hints;
1287 }
1288 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV)
1289 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1290 return retval;
1291}
1292
1293UV
1294Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
1295{
1296 HV* hv = (HV*)SvRV(sv);
1297 U32 klen;
1298 U32 off;
1299 STRLEN slen;
1300 STRLEN needents;
1301 U8 *tmps;
1302 U32 bit;
1303 SV *retval;
1304 U8 tmputf8[2];
1305 UV c = NATIVE_TO_ASCII(*ptr);
1306
1307 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1308 tmputf8[0] = UTF8_EIGHT_BIT_HI(c);
1309 tmputf8[1] = UTF8_EIGHT_BIT_LO(c);
1310 ptr = tmputf8;
1311 }
1312 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1313 * then the "swatch" is a vec() for al the chars which start
1314 * with 0xAA..0xYY
1315 * So the key in the hash (klen) is length of encoded char -1
1316 */
1317 klen = UTF8SKIP(ptr) - 1;
1318 off = ptr[klen];
1319
1320 if (klen == 0)
1321 {
1322 /* If char in invariant then swatch is for all the invariant chars
1323 * In both UTF-8 and UTF8-MOD that happens to be UTF_CONTINUATION_MARK
1324 */
1325 needents = UTF_CONTINUATION_MARK;
1326 off = NATIVE_TO_UTF(ptr[klen]);
1327 }
1328 else
1329 {
1330 /* If char is encoded then swatch is for the prefix */
1331 needents = (1 << UTF_ACCUMULATION_SHIFT);
1332 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1333 }
1334
1335 /*
1336 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1337 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1338 * it's nothing to sniff at.) Pity we usually come through at least
1339 * two function calls to get here...
1340 *
1341 * NB: this code assumes that swatches are never modified, once generated!
1342 */
1343
1344 if (hv == PL_last_swash_hv &&
1345 klen == PL_last_swash_klen &&
1346 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1347 {
1348 tmps = PL_last_swash_tmps;
1349 slen = PL_last_swash_slen;
1350 }
1351 else {
1352 /* Try our second-level swatch cache, kept in a hash. */
1353 SV** svp = hv_fetch(hv, (char*)ptr, klen, FALSE);
1354
1355 /* If not cached, generate it via utf8::SWASHGET */
1356 if (!svp || !SvPOK(*svp) || !(tmps = (U8*)SvPV(*svp, slen))) {
1357 dSP;
1358 /* We use utf8n_to_uvuni() as we want an index into
1359 Unicode tables, not a native character number.
1360 */
1361 UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXLEN, NULL, 0);
1362 SV *errsv_save;
1363 ENTER;
1364 SAVETMPS;
1365 save_re_context();
1366 PUSHSTACKi(PERLSI_MAGIC);
1367 PUSHMARK(SP);
1368 EXTEND(SP,3);
1369 PUSHs((SV*)sv);
1370 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1371 PUSHs(sv_2mortal(newSViv((klen) ?
1372 (code_point & ~(needents - 1)) : 0)));
1373 PUSHs(sv_2mortal(newSViv(needents)));
1374 PUTBACK;
1375 errsv_save = newSVsv(ERRSV);
1376 if (call_method("SWASHGET", G_SCALAR))
1377 retval = newSVsv(*PL_stack_sp--);
1378 else
1379 retval = &PL_sv_undef;
1380 if (!SvTRUE(ERRSV))
1381 sv_setsv(ERRSV, errsv_save);
1382 SvREFCNT_dec(errsv_save);
1383 POPSTACK;
1384 FREETMPS;
1385 LEAVE;
1386 if (PL_curcop == &PL_compiling)
1387 PL_curcop->op_private = PL_hints;
1388
1389 svp = hv_store(hv, (char*)ptr, klen, retval, 0);
1390
1391 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
1392 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
1393 }
1394
1395 PL_last_swash_hv = hv;
1396 PL_last_swash_klen = klen;
1397 PL_last_swash_tmps = tmps;
1398 PL_last_swash_slen = slen;
1399 if (klen)
1400 Copy(ptr, PL_last_swash_key, klen, U8);
1401 }
1402
1403 switch ((int)((slen << 3) / needents)) {
1404 case 1:
1405 bit = 1 << (off & 7);
1406 off >>= 3;
1407 return (tmps[off] & bit) != 0;
1408 case 8:
1409 return tmps[off];
1410 case 16:
1411 off <<= 1;
1412 return (tmps[off] << 8) + tmps[off + 1] ;
1413 case 32:
1414 off <<= 2;
1415 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1416 }
1417 Perl_croak(aTHX_ "panic: swash_fetch");
1418 return 0;
1419}
1420
1421
1422/*
1423=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
1424
1425Adds the UTF8 representation of the Native codepoint C<uv> to the end
1426of the string C<d>; C<d> should be have at least C<UTF8_MAXLEN+1> free
1427bytes available. The return value is the pointer to the byte after the
1428end of the new character. In other words,
1429
1430 d = uvchr_to_utf8(d, uv);
1431
1432is the recommended wide native character-aware way of saying
1433
1434 *(d++) = uv;
1435
1436=cut
1437*/
1438
1439/* On ASCII machines this is normally a macro but we want a
1440 real function in case XS code wants it
1441*/
1442#undef Perl_uvchr_to_utf8
1443U8 *
1444Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1445{
1446 return Perl_uvuni_to_utf8(aTHX_ d, NATIVE_TO_UNI(uv));
1447}
1448
1449
1450/*
1451=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
1452
1453Returns the native character value of the first character in the string C<s>
1454which is assumed to be in UTF8 encoding; C<retlen> will be set to the
1455length, in bytes, of that character.
1456
1457Allows length and flags to be passed to low level routine.
1458
1459=cut
1460*/
1461/* On ASCII machines this is normally a macro but we want a
1462 real function in case XS code wants it
1463*/
1464#undef Perl_utf8n_to_uvchr
1465UV
1466Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
1467{
1468 UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
1469 return UNI_TO_NATIVE(uv);
1470}
1471
1472