This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
I'll finally get it right. (thanks to Sadahiro Tomoyuki)
[perl5.git] / utf8.c
... / ...
CommitLineData
1/* utf8.c
2 *
3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006,
4 * by Larry Wall and others
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
19 *
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
22 */
23
24#include "EXTERN.h"
25#define PERL_IN_UTF8_C
26#include "perl.h"
27
28#ifndef EBCDIC
29/* Separate prototypes needed because in ASCII systems these
30 * usually macros but they still are compiled as code, too. */
31PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
32PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
33#endif
34
35static const char unees[] =
36 "Malformed UTF-8 character (unexpected end of string)";
37
38/*
39=head1 Unicode Support
40
41This file contains various utility functions for manipulating UTF8-encoded
42strings. For the uninitiated, this is a method of representing arbitrary
43Unicode characters as a variable number of bytes, in such a way that
44characters in the ASCII range are unmodified, and a zero byte never appears
45within non-zero characters.
46
47=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
48
49Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
50of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
51bytes available. The return value is the pointer to the byte after the
52end of the new character. In other words,
53
54 d = uvuni_to_utf8_flags(d, uv, flags);
55
56or, in most cases,
57
58 d = uvuni_to_utf8(d, uv);
59
60(which is equivalent to)
61
62 d = uvuni_to_utf8_flags(d, uv, 0);
63
64is the recommended Unicode-aware way of saying
65
66 *(d++) = uv;
67
68=cut
69*/
70
71U8 *
72Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
73{
74 if (ckWARN(WARN_UTF8)) {
75 if (UNICODE_IS_SURROGATE(uv) &&
76 !(flags & UNICODE_ALLOW_SURROGATE))
77 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
78 else if (
79 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
80 !(flags & UNICODE_ALLOW_FDD0))
81 ||
82 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
83 !(flags & UNICODE_ALLOW_FFFF))) &&
84 /* UNICODE_ALLOW_SUPER includes
85 * FFFEs and FFFFs beyond 0x10FFFF. */
86 ((uv <= PERL_UNICODE_MAX) ||
87 !(flags & UNICODE_ALLOW_SUPER))
88 )
89 Perl_warner(aTHX_ packWARN(WARN_UTF8),
90 "Unicode character 0x%04"UVxf" is illegal", uv);
91 }
92 if (UNI_IS_INVARIANT(uv)) {
93 *d++ = (U8)UTF_TO_NATIVE(uv);
94 return d;
95 }
96#if defined(EBCDIC)
97 else {
98 STRLEN len = UNISKIP(uv);
99 U8 *p = d+len-1;
100 while (p > d) {
101 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
102 uv >>= UTF_ACCUMULATION_SHIFT;
103 }
104 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
105 return d+len;
106 }
107#else /* Non loop style */
108 if (uv < 0x800) {
109 *d++ = (U8)(( uv >> 6) | 0xc0);
110 *d++ = (U8)(( uv & 0x3f) | 0x80);
111 return d;
112 }
113 if (uv < 0x10000) {
114 *d++ = (U8)(( uv >> 12) | 0xe0);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
117 return d;
118 }
119 if (uv < 0x200000) {
120 *d++ = (U8)(( uv >> 18) | 0xf0);
121 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
123 *d++ = (U8)(( uv & 0x3f) | 0x80);
124 return d;
125 }
126 if (uv < 0x4000000) {
127 *d++ = (U8)(( uv >> 24) | 0xf8);
128 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
129 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
131 *d++ = (U8)(( uv & 0x3f) | 0x80);
132 return d;
133 }
134 if (uv < 0x80000000) {
135 *d++ = (U8)(( uv >> 30) | 0xfc);
136 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
137 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
138 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
139 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
140 *d++ = (U8)(( uv & 0x3f) | 0x80);
141 return d;
142 }
143#ifdef HAS_QUAD
144 if (uv < UTF8_QUAD_MAX)
145#endif
146 {
147 *d++ = 0xfe; /* Can't match U+FEFF! */
148 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
149 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
150 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
151 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
152 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
153 *d++ = (U8)(( uv & 0x3f) | 0x80);
154 return d;
155 }
156#ifdef HAS_QUAD
157 {
158 *d++ = 0xff; /* Can't match U+FFFE! */
159 *d++ = 0x80; /* 6 Reserved bits */
160 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
161 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
163 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
164 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
165 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
166 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
167 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
168 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
169 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
170 *d++ = (U8)(( uv & 0x3f) | 0x80);
171 return d;
172 }
173#endif
174#endif /* Loop style */
175}
176
177/*
178
179Tests if some arbitrary number of bytes begins in a valid UTF-8
180character. Note that an INVARIANT (i.e. ASCII) character is a valid
181UTF-8 character. The actual number of bytes in the UTF-8 character
182will be returned if it is valid, otherwise 0.
183
184This is the "slow" version as opposed to the "fast" version which is
185the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
186difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
187or less you should use the IS_UTF8_CHAR(), for lengths of five or more
188you should use the _slow(). In practice this means that the _slow()
189will be used very rarely, since the maximum Unicode code point (as of
190Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
191the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
192five bytes or more.
193
194=cut */
195STATIC STRLEN
196S_is_utf8_char_slow(const U8 *s, const STRLEN len)
197{
198 U8 u = *s;
199 STRLEN slen;
200 UV uv, ouv;
201
202 if (UTF8_IS_INVARIANT(u))
203 return 1;
204
205 if (!UTF8_IS_START(u))
206 return 0;
207
208 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
209 return 0;
210
211 slen = len - 1;
212 s++;
213#ifdef EBCDIC
214 u = NATIVE_TO_UTF(u);
215#endif
216 u &= UTF_START_MASK(len);
217 uv = u;
218 ouv = uv;
219 while (slen--) {
220 if (!UTF8_IS_CONTINUATION(*s))
221 return 0;
222 uv = UTF8_ACCUMULATE(uv, *s);
223 if (uv < ouv)
224 return 0;
225 ouv = uv;
226 s++;
227 }
228
229 if ((STRLEN)UNISKIP(uv) < len)
230 return 0;
231
232 return len;
233}
234
235/*
236=for apidoc A|STRLEN|is_utf8_char|const U8 *s
237
238Tests if some arbitrary number of bytes begins in a valid UTF-8
239character. Note that an INVARIANT (i.e. ASCII) character is a valid
240UTF-8 character. The actual number of bytes in the UTF-8 character
241will be returned if it is valid, otherwise 0.
242
243=cut */
244STRLEN
245Perl_is_utf8_char(pTHX_ const U8 *s)
246{
247 const STRLEN len = UTF8SKIP(s);
248 PERL_UNUSED_CONTEXT;
249#ifdef IS_UTF8_CHAR
250 if (IS_UTF8_CHAR_FAST(len))
251 return IS_UTF8_CHAR(s, len) ? len : 0;
252#endif /* #ifdef IS_UTF8_CHAR */
253 return is_utf8_char_slow(s, len);
254}
255
256/*
257=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
258
259Returns true if first C<len> bytes of the given string form a valid
260UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
261not mean 'a string that contains code points above 0x7F encoded in UTF-8'
262because a valid ASCII string is a valid UTF-8 string.
263
264See also is_utf8_string_loclen() and is_utf8_string_loc().
265
266=cut
267*/
268
269bool
270Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
271{
272 const U8* const send = s + (len ? len : strlen((const char *)s));
273 const U8* x = s;
274
275 PERL_UNUSED_CONTEXT;
276
277 while (x < send) {
278 STRLEN c;
279 /* Inline the easy bits of is_utf8_char() here for speed... */
280 if (UTF8_IS_INVARIANT(*x))
281 c = 1;
282 else if (!UTF8_IS_START(*x))
283 goto out;
284 else {
285 /* ... and call is_utf8_char() only if really needed. */
286#ifdef IS_UTF8_CHAR
287 c = UTF8SKIP(x);
288 if (IS_UTF8_CHAR_FAST(c)) {
289 if (!IS_UTF8_CHAR(x, c))
290 c = 0;
291 }
292 else
293 c = is_utf8_char_slow(x, c);
294#else
295 c = is_utf8_char(x);
296#endif /* #ifdef IS_UTF8_CHAR */
297 if (!c)
298 goto out;
299 }
300 x += c;
301 }
302
303 out:
304 if (x != send)
305 return FALSE;
306
307 return TRUE;
308}
309
310/*
311Implemented as a macro in utf8.h
312
313=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
314
315Like is_utf8_string() but stores the location of the failure (in the
316case of "utf8ness failure") or the location s+len (in the case of
317"utf8ness success") in the C<ep>.
318
319See also is_utf8_string_loclen() and is_utf8_string().
320
321=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
322
323Like is_utf8_string() but stores the location of the failure (in the
324case of "utf8ness failure") or the location s+len (in the case of
325"utf8ness success") in the C<ep>, and the number of UTF-8
326encoded characters in the C<el>.
327
328See also is_utf8_string_loc() and is_utf8_string().
329
330=cut
331*/
332
333bool
334Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
335{
336 const U8* const send = s + (len ? len : strlen((const char *)s));
337 const U8* x = s;
338 STRLEN c;
339 STRLEN outlen = 0;
340 PERL_UNUSED_CONTEXT;
341
342 while (x < send) {
343 /* Inline the easy bits of is_utf8_char() here for speed... */
344 if (UTF8_IS_INVARIANT(*x))
345 c = 1;
346 else if (!UTF8_IS_START(*x))
347 goto out;
348 else {
349 /* ... and call is_utf8_char() only if really needed. */
350#ifdef IS_UTF8_CHAR
351 c = UTF8SKIP(x);
352 if (IS_UTF8_CHAR_FAST(c)) {
353 if (!IS_UTF8_CHAR(x, c))
354 c = 0;
355 } else
356 c = is_utf8_char_slow(x, c);
357#else
358 c = is_utf8_char(x);
359#endif /* #ifdef IS_UTF8_CHAR */
360 if (!c)
361 goto out;
362 }
363 x += c;
364 outlen++;
365 }
366
367 out:
368 if (el)
369 *el = outlen;
370
371 if (ep)
372 *ep = x;
373 return (x == send);
374}
375
376/*
377
378=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
379
380Bottom level UTF-8 decode routine.
381Returns the unicode code point value of the first character in the string C<s>
382which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
383C<retlen> will be set to the length, in bytes, of that character.
384
385If C<s> does not point to a well-formed UTF-8 character, the behaviour
386is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
387it is assumed that the caller will raise a warning, and this function
388will silently just set C<retlen> to C<-1> and return zero. If the
389C<flags> does not contain UTF8_CHECK_ONLY, warnings about
390malformations will be given, C<retlen> will be set to the expected
391length of the UTF-8 character in bytes, and zero will be returned.
392
393The C<flags> can also contain various flags to allow deviations from
394the strict UTF-8 encoding (see F<utf8.h>).
395
396Most code should use utf8_to_uvchr() rather than call this directly.
397
398=cut
399*/
400
401UV
402Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
403{
404 dVAR;
405 const U8 * const s0 = s;
406 UV uv = *s, ouv = 0;
407 STRLEN len = 1;
408 const bool dowarn = ckWARN_d(WARN_UTF8);
409 const UV startbyte = *s;
410 STRLEN expectlen = 0;
411 U32 warning = 0;
412
413/* This list is a superset of the UTF8_ALLOW_XXX. */
414
415#define UTF8_WARN_EMPTY 1
416#define UTF8_WARN_CONTINUATION 2
417#define UTF8_WARN_NON_CONTINUATION 3
418#define UTF8_WARN_FE_FF 4
419#define UTF8_WARN_SHORT 5
420#define UTF8_WARN_OVERFLOW 6
421#define UTF8_WARN_SURROGATE 7
422#define UTF8_WARN_LONG 8
423#define UTF8_WARN_FFFF 9 /* Also FFFE. */
424
425 if (curlen == 0 &&
426 !(flags & UTF8_ALLOW_EMPTY)) {
427 warning = UTF8_WARN_EMPTY;
428 goto malformed;
429 }
430
431 if (UTF8_IS_INVARIANT(uv)) {
432 if (retlen)
433 *retlen = 1;
434 return (UV) (NATIVE_TO_UTF(*s));
435 }
436
437 if (UTF8_IS_CONTINUATION(uv) &&
438 !(flags & UTF8_ALLOW_CONTINUATION)) {
439 warning = UTF8_WARN_CONTINUATION;
440 goto malformed;
441 }
442
443 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
444 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
445 warning = UTF8_WARN_NON_CONTINUATION;
446 goto malformed;
447 }
448
449#ifdef EBCDIC
450 uv = NATIVE_TO_UTF(uv);
451#else
452 if ((uv == 0xfe || uv == 0xff) &&
453 !(flags & UTF8_ALLOW_FE_FF)) {
454 warning = UTF8_WARN_FE_FF;
455 goto malformed;
456 }
457#endif
458
459 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
460 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
461 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
462 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
463#ifdef EBCDIC
464 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
465 else { len = 7; uv &= 0x01; }
466#else
467 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
468 else if (!(uv & 0x01)) { len = 7; uv = 0; }
469 else { len = 13; uv = 0; } /* whoa! */
470#endif
471
472 if (retlen)
473 *retlen = len;
474
475 expectlen = len;
476
477 if ((curlen < expectlen) &&
478 !(flags & UTF8_ALLOW_SHORT)) {
479 warning = UTF8_WARN_SHORT;
480 goto malformed;
481 }
482
483 len--;
484 s++;
485 ouv = uv;
486
487 while (len--) {
488 if (!UTF8_IS_CONTINUATION(*s) &&
489 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
490 s--;
491 warning = UTF8_WARN_NON_CONTINUATION;
492 goto malformed;
493 }
494 else
495 uv = UTF8_ACCUMULATE(uv, *s);
496 if (!(uv > ouv)) {
497 /* These cannot be allowed. */
498 if (uv == ouv) {
499 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
500 warning = UTF8_WARN_LONG;
501 goto malformed;
502 }
503 }
504 else { /* uv < ouv */
505 /* This cannot be allowed. */
506 warning = UTF8_WARN_OVERFLOW;
507 goto malformed;
508 }
509 }
510 s++;
511 ouv = uv;
512 }
513
514 if (UNICODE_IS_SURROGATE(uv) &&
515 !(flags & UTF8_ALLOW_SURROGATE)) {
516 warning = UTF8_WARN_SURROGATE;
517 goto malformed;
518 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
519 !(flags & UTF8_ALLOW_LONG)) {
520 warning = UTF8_WARN_LONG;
521 goto malformed;
522 } else if (UNICODE_IS_ILLEGAL(uv) &&
523 !(flags & UTF8_ALLOW_FFFF)) {
524 warning = UTF8_WARN_FFFF;
525 goto malformed;
526 }
527
528 return uv;
529
530malformed:
531
532 if (flags & UTF8_CHECK_ONLY) {
533 if (retlen)
534 *retlen = ((STRLEN) -1);
535 return 0;
536 }
537
538 if (dowarn) {
539 SV* const sv = sv_2mortal(newSVpvs("Malformed UTF-8 character "));
540
541 switch (warning) {
542 case 0: /* Intentionally empty. */ break;
543 case UTF8_WARN_EMPTY:
544 sv_catpvs(sv, "(empty string)");
545 break;
546 case UTF8_WARN_CONTINUATION:
547 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
548 break;
549 case UTF8_WARN_NON_CONTINUATION:
550 if (s == s0)
551 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
552 (UV)s[1], startbyte);
553 else {
554 const int len = (int)(s-s0);
555 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
556 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
557 }
558
559 break;
560 case UTF8_WARN_FE_FF:
561 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
562 break;
563 case UTF8_WARN_SHORT:
564 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
565 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
566 expectlen = curlen; /* distance for caller to skip */
567 break;
568 case UTF8_WARN_OVERFLOW:
569 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
570 ouv, *s, startbyte);
571 break;
572 case UTF8_WARN_SURROGATE:
573 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
574 break;
575 case UTF8_WARN_LONG:
576 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
577 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
578 break;
579 case UTF8_WARN_FFFF:
580 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
581 break;
582 default:
583 sv_catpvs(sv, "(unknown reason)");
584 break;
585 }
586
587 if (warning) {
588 const char * const s = SvPVX_const(sv);
589
590 if (PL_op)
591 Perl_warner(aTHX_ packWARN(WARN_UTF8),
592 "%s in %s", s, OP_DESC(PL_op));
593 else
594 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
595 }
596 }
597
598 if (retlen)
599 *retlen = expectlen ? expectlen : len;
600
601 return 0;
602}
603
604/*
605=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
606
607Returns the native character value of the first character in the string C<s>
608which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
609length, in bytes, of that character.
610
611If C<s> does not point to a well-formed UTF-8 character, zero is
612returned and retlen is set, if possible, to -1.
613
614=cut
615*/
616
617UV
618Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
619{
620 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
621 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
622}
623
624/*
625=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
626
627Returns the Unicode code point of the first character in the string C<s>
628which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
629length, in bytes, of that character.
630
631This function should only be used when returned UV is considered
632an index into the Unicode semantic tables (e.g. swashes).
633
634If C<s> does not point to a well-formed UTF-8 character, zero is
635returned and retlen is set, if possible, to -1.
636
637=cut
638*/
639
640UV
641Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
642{
643 /* Call the low level routine asking for checks */
644 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
645 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
646}
647
648/*
649=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
650
651Return the length of the UTF-8 char encoded string C<s> in characters.
652Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
653up past C<e>, croaks.
654
655=cut
656*/
657
658STRLEN
659Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
660{
661 dVAR;
662 STRLEN len = 0;
663 U8 t = 0;
664
665 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
666 * the bitops (especially ~) can create illegal UTF-8.
667 * In other words: in Perl UTF-8 is not just for Unicode. */
668
669 if (e < s)
670 goto warn_and_return;
671 while (s < e) {
672 t = UTF8SKIP(s);
673 if (e - s < t) {
674 warn_and_return:
675 if (ckWARN_d(WARN_UTF8)) {
676 if (PL_op)
677 Perl_warner(aTHX_ packWARN(WARN_UTF8),
678 "%s in %s", unees, OP_DESC(PL_op));
679 else
680 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
681 }
682 return len;
683 }
684 s += t;
685 len++;
686 }
687
688 return len;
689}
690
691/*
692=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
693
694Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
695and C<b>.
696
697WARNING: use only if you *know* that the pointers point inside the
698same UTF-8 buffer.
699
700=cut
701*/
702
703IV
704Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
705{
706 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
707}
708
709/*
710=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
711
712Return the UTF-8 pointer C<s> displaced by C<off> characters, either
713forward or backward.
714
715WARNING: do not use the following unless you *know* C<off> is within
716the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
717on the first byte of character or just after the last byte of a character.
718
719=cut
720*/
721
722U8 *
723Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
724{
725 PERL_UNUSED_CONTEXT;
726 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
727 * the bitops (especially ~) can create illegal UTF-8.
728 * In other words: in Perl UTF-8 is not just for Unicode. */
729
730 if (off >= 0) {
731 while (off--)
732 s += UTF8SKIP(s);
733 }
734 else {
735 while (off++) {
736 s--;
737 while (UTF8_IS_CONTINUATION(*s))
738 s--;
739 }
740 }
741 return (U8 *)s;
742}
743
744/*
745=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
746
747Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
748Unlike C<bytes_to_utf8>, this over-writes the original string, and
749updates len to contain the new length.
750Returns zero on failure, setting C<len> to -1.
751
752If you need a copy of the string, see C<bytes_from_utf8>.
753
754=cut
755*/
756
757U8 *
758Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
759{
760 U8 * const save = s;
761 U8 * const send = s + *len;
762 U8 *d;
763
764 /* ensure valid UTF-8 and chars < 256 before updating string */
765 while (s < send) {
766 U8 c = *s++;
767
768 if (!UTF8_IS_INVARIANT(c) &&
769 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
770 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
771 *len = ((STRLEN) -1);
772 return 0;
773 }
774 }
775
776 d = s = save;
777 while (s < send) {
778 STRLEN ulen;
779 *d++ = (U8)utf8_to_uvchr(s, &ulen);
780 s += ulen;
781 }
782 *d = '\0';
783 *len = d - save;
784 return save;
785}
786
787/*
788=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
789
790Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
791Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
792the newly-created string, and updates C<len> to contain the new
793length. Returns the original string if no conversion occurs, C<len>
794is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
7950 if C<s> is converted or contains all 7bit characters.
796
797=cut
798*/
799
800U8 *
801Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
802{
803 U8 *d;
804 const U8 *start = s;
805 const U8 *send;
806 I32 count = 0;
807
808 PERL_UNUSED_CONTEXT;
809 if (!*is_utf8)
810 return (U8 *)start;
811
812 /* ensure valid UTF-8 and chars < 256 before converting string */
813 for (send = s + *len; s < send;) {
814 U8 c = *s++;
815 if (!UTF8_IS_INVARIANT(c)) {
816 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
817 (c = *s++) && UTF8_IS_CONTINUATION(c))
818 count++;
819 else
820 return (U8 *)start;
821 }
822 }
823
824 *is_utf8 = FALSE;
825
826 Newx(d, (*len) - count + 1, U8);
827 s = start; start = d;
828 while (s < send) {
829 U8 c = *s++;
830 if (!UTF8_IS_INVARIANT(c)) {
831 /* Then it is two-byte encoded */
832 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
833 c = ASCII_TO_NATIVE(c);
834 }
835 *d++ = c;
836 }
837 *d = '\0';
838 *len = d - start;
839 return (U8 *)start;
840}
841
842/*
843=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
844
845Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
846Returns a pointer to the newly-created string, and sets C<len> to
847reflect the new length.
848
849If you want to convert to UTF-8 from other encodings than ASCII,
850see sv_recode_to_utf8().
851
852=cut
853*/
854
855U8*
856Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
857{
858 const U8 * const send = s + (*len);
859 U8 *d;
860 U8 *dst;
861 PERL_UNUSED_CONTEXT;
862
863 Newx(d, (*len) * 2 + 1, U8);
864 dst = d;
865
866 while (s < send) {
867 const UV uv = NATIVE_TO_ASCII(*s++);
868 if (UNI_IS_INVARIANT(uv))
869 *d++ = (U8)UTF_TO_NATIVE(uv);
870 else {
871 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
872 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
873 }
874 }
875 *d = '\0';
876 *len = d-dst;
877 return dst;
878}
879
880/*
881 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
882 *
883 * Destination must be pre-extended to 3/2 source. Do not use in-place.
884 * We optimize for native, for obvious reasons. */
885
886U8*
887Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
888{
889 U8* pend;
890 U8* dstart = d;
891
892 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
893 d[0] = 0;
894 *newlen = 1;
895 return d;
896 }
897
898 if (bytelen & 1)
899 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
900
901 pend = p + bytelen;
902
903 while (p < pend) {
904 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
905 p += 2;
906 if (uv < 0x80) {
907#ifdef EBCDIC
908 *d++ = UNI_TO_NATIVE(uv);
909#else
910 *d++ = (U8)uv;
911#endif
912 continue;
913 }
914 if (uv < 0x800) {
915 *d++ = (U8)(( uv >> 6) | 0xc0);
916 *d++ = (U8)(( uv & 0x3f) | 0x80);
917 continue;
918 }
919 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
920 UV low = (p[0] << 8) + p[1];
921 p += 2;
922 if (low < 0xdc00 || low >= 0xdfff)
923 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
924 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
925 }
926 if (uv < 0x10000) {
927 *d++ = (U8)(( uv >> 12) | 0xe0);
928 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
929 *d++ = (U8)(( uv & 0x3f) | 0x80);
930 continue;
931 }
932 else {
933 *d++ = (U8)(( uv >> 18) | 0xf0);
934 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
935 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
936 *d++ = (U8)(( uv & 0x3f) | 0x80);
937 continue;
938 }
939 }
940 *newlen = d - dstart;
941 return d;
942}
943
944/* Note: this one is slightly destructive of the source. */
945
946U8*
947Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
948{
949 U8* s = (U8*)p;
950 U8* const send = s + bytelen;
951 while (s < send) {
952 const U8 tmp = s[0];
953 s[0] = s[1];
954 s[1] = tmp;
955 s += 2;
956 }
957 return utf16_to_utf8(p, d, bytelen, newlen);
958}
959
960/* for now these are all defined (inefficiently) in terms of the utf8 versions */
961
962bool
963Perl_is_uni_alnum(pTHX_ UV c)
964{
965 U8 tmpbuf[UTF8_MAXBYTES+1];
966 uvchr_to_utf8(tmpbuf, c);
967 return is_utf8_alnum(tmpbuf);
968}
969
970bool
971Perl_is_uni_alnumc(pTHX_ UV c)
972{
973 U8 tmpbuf[UTF8_MAXBYTES+1];
974 uvchr_to_utf8(tmpbuf, c);
975 return is_utf8_alnumc(tmpbuf);
976}
977
978bool
979Perl_is_uni_idfirst(pTHX_ UV c)
980{
981 U8 tmpbuf[UTF8_MAXBYTES+1];
982 uvchr_to_utf8(tmpbuf, c);
983 return is_utf8_idfirst(tmpbuf);
984}
985
986bool
987Perl_is_uni_alpha(pTHX_ UV c)
988{
989 U8 tmpbuf[UTF8_MAXBYTES+1];
990 uvchr_to_utf8(tmpbuf, c);
991 return is_utf8_alpha(tmpbuf);
992}
993
994bool
995Perl_is_uni_ascii(pTHX_ UV c)
996{
997 U8 tmpbuf[UTF8_MAXBYTES+1];
998 uvchr_to_utf8(tmpbuf, c);
999 return is_utf8_ascii(tmpbuf);
1000}
1001
1002bool
1003Perl_is_uni_space(pTHX_ UV c)
1004{
1005 U8 tmpbuf[UTF8_MAXBYTES+1];
1006 uvchr_to_utf8(tmpbuf, c);
1007 return is_utf8_space(tmpbuf);
1008}
1009
1010bool
1011Perl_is_uni_digit(pTHX_ UV c)
1012{
1013 U8 tmpbuf[UTF8_MAXBYTES+1];
1014 uvchr_to_utf8(tmpbuf, c);
1015 return is_utf8_digit(tmpbuf);
1016}
1017
1018bool
1019Perl_is_uni_upper(pTHX_ UV c)
1020{
1021 U8 tmpbuf[UTF8_MAXBYTES+1];
1022 uvchr_to_utf8(tmpbuf, c);
1023 return is_utf8_upper(tmpbuf);
1024}
1025
1026bool
1027Perl_is_uni_lower(pTHX_ UV c)
1028{
1029 U8 tmpbuf[UTF8_MAXBYTES+1];
1030 uvchr_to_utf8(tmpbuf, c);
1031 return is_utf8_lower(tmpbuf);
1032}
1033
1034bool
1035Perl_is_uni_cntrl(pTHX_ UV c)
1036{
1037 U8 tmpbuf[UTF8_MAXBYTES+1];
1038 uvchr_to_utf8(tmpbuf, c);
1039 return is_utf8_cntrl(tmpbuf);
1040}
1041
1042bool
1043Perl_is_uni_graph(pTHX_ UV c)
1044{
1045 U8 tmpbuf[UTF8_MAXBYTES+1];
1046 uvchr_to_utf8(tmpbuf, c);
1047 return is_utf8_graph(tmpbuf);
1048}
1049
1050bool
1051Perl_is_uni_print(pTHX_ UV c)
1052{
1053 U8 tmpbuf[UTF8_MAXBYTES+1];
1054 uvchr_to_utf8(tmpbuf, c);
1055 return is_utf8_print(tmpbuf);
1056}
1057
1058bool
1059Perl_is_uni_punct(pTHX_ UV c)
1060{
1061 U8 tmpbuf[UTF8_MAXBYTES+1];
1062 uvchr_to_utf8(tmpbuf, c);
1063 return is_utf8_punct(tmpbuf);
1064}
1065
1066bool
1067Perl_is_uni_xdigit(pTHX_ UV c)
1068{
1069 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1070 uvchr_to_utf8(tmpbuf, c);
1071 return is_utf8_xdigit(tmpbuf);
1072}
1073
1074UV
1075Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1076{
1077 uvchr_to_utf8(p, c);
1078 return to_utf8_upper(p, p, lenp);
1079}
1080
1081UV
1082Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1083{
1084 uvchr_to_utf8(p, c);
1085 return to_utf8_title(p, p, lenp);
1086}
1087
1088UV
1089Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1090{
1091 uvchr_to_utf8(p, c);
1092 return to_utf8_lower(p, p, lenp);
1093}
1094
1095UV
1096Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1097{
1098 uvchr_to_utf8(p, c);
1099 return to_utf8_fold(p, p, lenp);
1100}
1101
1102/* for now these all assume no locale info available for Unicode > 255 */
1103
1104bool
1105Perl_is_uni_alnum_lc(pTHX_ UV c)
1106{
1107 return is_uni_alnum(c); /* XXX no locale support yet */
1108}
1109
1110bool
1111Perl_is_uni_alnumc_lc(pTHX_ UV c)
1112{
1113 return is_uni_alnumc(c); /* XXX no locale support yet */
1114}
1115
1116bool
1117Perl_is_uni_idfirst_lc(pTHX_ UV c)
1118{
1119 return is_uni_idfirst(c); /* XXX no locale support yet */
1120}
1121
1122bool
1123Perl_is_uni_alpha_lc(pTHX_ UV c)
1124{
1125 return is_uni_alpha(c); /* XXX no locale support yet */
1126}
1127
1128bool
1129Perl_is_uni_ascii_lc(pTHX_ UV c)
1130{
1131 return is_uni_ascii(c); /* XXX no locale support yet */
1132}
1133
1134bool
1135Perl_is_uni_space_lc(pTHX_ UV c)
1136{
1137 return is_uni_space(c); /* XXX no locale support yet */
1138}
1139
1140bool
1141Perl_is_uni_digit_lc(pTHX_ UV c)
1142{
1143 return is_uni_digit(c); /* XXX no locale support yet */
1144}
1145
1146bool
1147Perl_is_uni_upper_lc(pTHX_ UV c)
1148{
1149 return is_uni_upper(c); /* XXX no locale support yet */
1150}
1151
1152bool
1153Perl_is_uni_lower_lc(pTHX_ UV c)
1154{
1155 return is_uni_lower(c); /* XXX no locale support yet */
1156}
1157
1158bool
1159Perl_is_uni_cntrl_lc(pTHX_ UV c)
1160{
1161 return is_uni_cntrl(c); /* XXX no locale support yet */
1162}
1163
1164bool
1165Perl_is_uni_graph_lc(pTHX_ UV c)
1166{
1167 return is_uni_graph(c); /* XXX no locale support yet */
1168}
1169
1170bool
1171Perl_is_uni_print_lc(pTHX_ UV c)
1172{
1173 return is_uni_print(c); /* XXX no locale support yet */
1174}
1175
1176bool
1177Perl_is_uni_punct_lc(pTHX_ UV c)
1178{
1179 return is_uni_punct(c); /* XXX no locale support yet */
1180}
1181
1182bool
1183Perl_is_uni_xdigit_lc(pTHX_ UV c)
1184{
1185 return is_uni_xdigit(c); /* XXX no locale support yet */
1186}
1187
1188U32
1189Perl_to_uni_upper_lc(pTHX_ U32 c)
1190{
1191 /* XXX returns only the first character -- do not use XXX */
1192 /* XXX no locale support yet */
1193 STRLEN len;
1194 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1195 return (U32)to_uni_upper(c, tmpbuf, &len);
1196}
1197
1198U32
1199Perl_to_uni_title_lc(pTHX_ U32 c)
1200{
1201 /* XXX returns only the first character XXX -- do not use XXX */
1202 /* XXX no locale support yet */
1203 STRLEN len;
1204 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1205 return (U32)to_uni_title(c, tmpbuf, &len);
1206}
1207
1208U32
1209Perl_to_uni_lower_lc(pTHX_ U32 c)
1210{
1211 /* XXX returns only the first character -- do not use XXX */
1212 /* XXX no locale support yet */
1213 STRLEN len;
1214 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1215 return (U32)to_uni_lower(c, tmpbuf, &len);
1216}
1217
1218static bool
1219S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1220 const char *const swashname)
1221{
1222 dVAR;
1223 if (!is_utf8_char(p))
1224 return FALSE;
1225 if (!*swash)
1226 *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1227 return swash_fetch(*swash, p, TRUE) != 0;
1228}
1229
1230bool
1231Perl_is_utf8_alnum(pTHX_ const U8 *p)
1232{
1233 dVAR;
1234 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1235 * descendant of isalnum(3), in other words, it doesn't
1236 * contain the '_'. --jhi */
1237 return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1238}
1239
1240bool
1241Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1242{
1243 dVAR;
1244 return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
1245}
1246
1247bool
1248Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1249{
1250 dVAR;
1251 if (*p == '_')
1252 return TRUE;
1253 /* is_utf8_idstart would be more logical. */
1254 return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1255}
1256
1257bool
1258Perl_is_utf8_idcont(pTHX_ const U8 *p)
1259{
1260 dVAR;
1261 if (*p == '_')
1262 return TRUE;
1263 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1264}
1265
1266bool
1267Perl_is_utf8_alpha(pTHX_ const U8 *p)
1268{
1269 dVAR;
1270 return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1271}
1272
1273bool
1274Perl_is_utf8_ascii(pTHX_ const U8 *p)
1275{
1276 dVAR;
1277 return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
1278}
1279
1280bool
1281Perl_is_utf8_space(pTHX_ const U8 *p)
1282{
1283 dVAR;
1284 return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
1285}
1286
1287bool
1288Perl_is_utf8_digit(pTHX_ const U8 *p)
1289{
1290 dVAR;
1291 return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1292}
1293
1294bool
1295Perl_is_utf8_upper(pTHX_ const U8 *p)
1296{
1297 dVAR;
1298 return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1299}
1300
1301bool
1302Perl_is_utf8_lower(pTHX_ const U8 *p)
1303{
1304 dVAR;
1305 return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1306}
1307
1308bool
1309Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1310{
1311 dVAR;
1312 return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
1313}
1314
1315bool
1316Perl_is_utf8_graph(pTHX_ const U8 *p)
1317{
1318 dVAR;
1319 return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1320}
1321
1322bool
1323Perl_is_utf8_print(pTHX_ const U8 *p)
1324{
1325 dVAR;
1326 return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1327}
1328
1329bool
1330Perl_is_utf8_punct(pTHX_ const U8 *p)
1331{
1332 dVAR;
1333 return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1334}
1335
1336bool
1337Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1338{
1339 dVAR;
1340 return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
1341}
1342
1343bool
1344Perl_is_utf8_mark(pTHX_ const U8 *p)
1345{
1346 dVAR;
1347 return is_utf8_common(p, &PL_utf8_mark, "IsM");
1348}
1349
1350/*
1351=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1352
1353The "p" contains the pointer to the UTF-8 string encoding
1354the character that is being converted.
1355
1356The "ustrp" is a pointer to the character buffer to put the
1357conversion result to. The "lenp" is a pointer to the length
1358of the result.
1359
1360The "swashp" is a pointer to the swash to use.
1361
1362Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1363and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
1364but not always, a multicharacter mapping), is tried first.
1365
1366The "special" is a string like "utf8::ToSpecLower", which means the
1367hash %utf8::ToSpecLower. The access to the hash is through
1368Perl_to_utf8_case().
1369
1370The "normal" is a string like "ToLower" which means the swash
1371%utf8::ToLower.
1372
1373=cut */
1374
1375UV
1376Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1377 SV **swashp, const char *normal, const char *special)
1378{
1379 dVAR;
1380 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1381 STRLEN len = 0;
1382
1383 const UV uv0 = utf8_to_uvchr(p, NULL);
1384 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1385 * are necessary in EBCDIC, they are redundant no-ops
1386 * in ASCII-ish platforms, and hopefully optimized away. */
1387 const UV uv1 = NATIVE_TO_UNI(uv0);
1388 uvuni_to_utf8(tmpbuf, uv1);
1389
1390 if (!*swashp) /* load on-demand */
1391 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1392
1393 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1394 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1395 /* It might be "special" (sometimes, but not always,
1396 * a multicharacter mapping) */
1397 HV * const hv = get_hv(special, FALSE);
1398 SV **svp;
1399
1400 if (hv &&
1401 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1402 (*svp)) {
1403 const char *s;
1404
1405 s = SvPV_const(*svp, len);
1406 if (len == 1)
1407 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1408 else {
1409#ifdef EBCDIC
1410 /* If we have EBCDIC we need to remap the characters
1411 * since any characters in the low 256 are Unicode
1412 * code points, not EBCDIC. */
1413 U8 *t = (U8*)s, *tend = t + len, *d;
1414
1415 d = tmpbuf;
1416 if (SvUTF8(*svp)) {
1417 STRLEN tlen = 0;
1418
1419 while (t < tend) {
1420 const UV c = utf8_to_uvchr(t, &tlen);
1421 if (tlen > 0) {
1422 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1423 t += tlen;
1424 }
1425 else
1426 break;
1427 }
1428 }
1429 else {
1430 while (t < tend) {
1431 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1432 t++;
1433 }
1434 }
1435 len = d - tmpbuf;
1436 Copy(tmpbuf, ustrp, len, U8);
1437#else
1438 Copy(s, ustrp, len, U8);
1439#endif
1440 }
1441 }
1442 }
1443
1444 if (!len && *swashp) {
1445 const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1446
1447 if (uv2) {
1448 /* It was "normal" (a single character mapping). */
1449 const UV uv3 = UNI_TO_NATIVE(uv2);
1450 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1451 }
1452 }
1453
1454 if (!len) /* Neither: just copy. */
1455 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1456
1457 if (lenp)
1458 *lenp = len;
1459
1460 return len ? utf8_to_uvchr(ustrp, 0) : 0;
1461}
1462
1463/*
1464=for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
1465
1466Convert the UTF-8 encoded character at p to its uppercase version and
1467store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1468that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1469the uppercase version may be longer than the original character.
1470
1471The first character of the uppercased version is returned
1472(but note, as explained above, that there may be more.)
1473
1474=cut */
1475
1476UV
1477Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1478{
1479 dVAR;
1480 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1481 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1482}
1483
1484/*
1485=for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
1486
1487Convert the UTF-8 encoded character at p to its titlecase version and
1488store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1489that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1490titlecase version may be longer than the original character.
1491
1492The first character of the titlecased version is returned
1493(but note, as explained above, that there may be more.)
1494
1495=cut */
1496
1497UV
1498Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1499{
1500 dVAR;
1501 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1502 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1503}
1504
1505/*
1506=for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
1507
1508Convert the UTF-8 encoded character at p to its lowercase version and
1509store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1510that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1511lowercase version may be longer than the original character.
1512
1513The first character of the lowercased version is returned
1514(but note, as explained above, that there may be more.)
1515
1516=cut */
1517
1518UV
1519Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1520{
1521 dVAR;
1522 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1523 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1524}
1525
1526/*
1527=for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
1528
1529Convert the UTF-8 encoded character at p to its foldcase version and
1530store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1531that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1532foldcase version may be longer than the original character (up to
1533three characters).
1534
1535The first character of the foldcased version is returned
1536(but note, as explained above, that there may be more.)
1537
1538=cut */
1539
1540UV
1541Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1542{
1543 dVAR;
1544 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1545 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1546}
1547
1548/* Note:
1549 * A "swash" is a swatch hash.
1550 * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1551 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1552 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1553 */
1554SV*
1555Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1556{
1557 dVAR;
1558 SV* retval;
1559 SV* const tokenbufsv = sv_newmortal();
1560 dSP;
1561 const size_t pkg_len = strlen(pkg);
1562 const size_t name_len = strlen(name);
1563 HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
1564 SV* errsv_save;
1565
1566 PUSHSTACKi(PERLSI_MAGIC);
1567 ENTER;
1568 SAVEI32(PL_hints);
1569 PL_hints = 0;
1570 save_re_context();
1571 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1572 ENTER;
1573 errsv_save = newSVsv(ERRSV);
1574 /* It is assumed that callers of this routine are not passing in any
1575 user derived data. */
1576 /* Need to do this after save_re_context() as it will set PL_tainted to
1577 1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1578 Even line to create errsv_save can turn on PL_tainted. */
1579 SAVEBOOL(PL_tainted);
1580 PL_tainted = 0;
1581 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1582 NULL);
1583 if (!SvTRUE(ERRSV))
1584 sv_setsv(ERRSV, errsv_save);
1585 SvREFCNT_dec(errsv_save);
1586 LEAVE;
1587 }
1588 SPAGAIN;
1589 PUSHMARK(SP);
1590 EXTEND(SP,5);
1591 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1592 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1593 PUSHs(listsv);
1594 PUSHs(sv_2mortal(newSViv(minbits)));
1595 PUSHs(sv_2mortal(newSViv(none)));
1596 PUTBACK;
1597 if (IN_PERL_COMPILETIME) {
1598 /* XXX ought to be handled by lex_start */
1599 SAVEI32(PL_in_my);
1600 PL_in_my = 0;
1601 sv_setpv(tokenbufsv, PL_tokenbuf);
1602 }
1603 errsv_save = newSVsv(ERRSV);
1604 if (call_method("SWASHNEW", G_SCALAR))
1605 retval = newSVsv(*PL_stack_sp--);
1606 else
1607 retval = &PL_sv_undef;
1608 if (!SvTRUE(ERRSV))
1609 sv_setsv(ERRSV, errsv_save);
1610 SvREFCNT_dec(errsv_save);
1611 LEAVE;
1612 POPSTACK;
1613 if (IN_PERL_COMPILETIME) {
1614 STRLEN len;
1615 const char* const pv = SvPV_const(tokenbufsv, len);
1616
1617 Copy(pv, PL_tokenbuf, len+1, char);
1618 CopHINTS_set(PL_curcop, PL_hints);
1619 }
1620 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1621 if (SvPOK(retval))
1622 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1623 (void*)retval);
1624 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1625 }
1626 return retval;
1627}
1628
1629
1630/* This API is wrong for special case conversions since we may need to
1631 * return several Unicode characters for a single Unicode character
1632 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1633 * the lower-level routine, and it is similarly broken for returning
1634 * multiple values. --jhi */
1635/* Now SWASHGET is recasted into S_swash_get in this file. */
1636
1637/* Note:
1638 * Returns the value of property/mapping C<swash> for the first character
1639 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1640 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1641 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1642 */
1643UV
1644Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1645{
1646 dVAR;
1647 HV* const hv = (HV*)SvRV(swash);
1648 U32 klen;
1649 U32 off;
1650 STRLEN slen;
1651 STRLEN needents;
1652 const U8 *tmps = NULL;
1653 U32 bit;
1654 SV *swatch;
1655 U8 tmputf8[2];
1656 const UV c = NATIVE_TO_ASCII(*ptr);
1657
1658 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1659 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1660 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1661 ptr = tmputf8;
1662 }
1663 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1664 * then the "swatch" is a vec() for al the chars which start
1665 * with 0xAA..0xYY
1666 * So the key in the hash (klen) is length of encoded char -1
1667 */
1668 klen = UTF8SKIP(ptr) - 1;
1669 off = ptr[klen];
1670
1671 if (klen == 0) {
1672 /* If char in invariant then swatch is for all the invariant chars
1673 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1674 */
1675 needents = UTF_CONTINUATION_MARK;
1676 off = NATIVE_TO_UTF(ptr[klen]);
1677 }
1678 else {
1679 /* If char is encoded then swatch is for the prefix */
1680 needents = (1 << UTF_ACCUMULATION_SHIFT);
1681 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1682 }
1683
1684 /*
1685 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1686 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1687 * it's nothing to sniff at.) Pity we usually come through at least
1688 * two function calls to get here...
1689 *
1690 * NB: this code assumes that swatches are never modified, once generated!
1691 */
1692
1693 if (hv == PL_last_swash_hv &&
1694 klen == PL_last_swash_klen &&
1695 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1696 {
1697 tmps = PL_last_swash_tmps;
1698 slen = PL_last_swash_slen;
1699 }
1700 else {
1701 /* Try our second-level swatch cache, kept in a hash. */
1702 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1703
1704 /* If not cached, generate it via swash_get */
1705 if (!svp || !SvPOK(*svp)
1706 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1707 /* We use utf8n_to_uvuni() as we want an index into
1708 Unicode tables, not a native character number.
1709 */
1710 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1711 ckWARN(WARN_UTF8) ?
1712 0 : UTF8_ALLOW_ANY);
1713 swatch = swash_get(swash,
1714 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1715 (klen) ? (code_point & ~(needents - 1)) : 0,
1716 needents);
1717
1718 if (IN_PERL_COMPILETIME)
1719 CopHINTS_set(PL_curcop, PL_hints);
1720
1721 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1722
1723 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1724 || (slen << 3) < needents)
1725 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1726 }
1727
1728 PL_last_swash_hv = hv;
1729 PL_last_swash_klen = klen;
1730 /* FIXME change interpvar.h? */
1731 PL_last_swash_tmps = (U8 *) tmps;
1732 PL_last_swash_slen = slen;
1733 if (klen)
1734 Copy(ptr, PL_last_swash_key, klen, U8);
1735 }
1736
1737 switch ((int)((slen << 3) / needents)) {
1738 case 1:
1739 bit = 1 << (off & 7);
1740 off >>= 3;
1741 return (tmps[off] & bit) != 0;
1742 case 8:
1743 return tmps[off];
1744 case 16:
1745 off <<= 1;
1746 return (tmps[off] << 8) + tmps[off + 1] ;
1747 case 32:
1748 off <<= 2;
1749 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1750 }
1751 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1752 NORETURN_FUNCTION_END;
1753}
1754
1755/* Note:
1756 * Returns a swatch (a bit vector string) for a code point sequence
1757 * that starts from the value C<start> and comprises the number C<span>.
1758 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1759 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1760 */
1761STATIC SV*
1762S_swash_get(pTHX_ SV* swash, UV start, UV span)
1763{
1764 SV *swatch;
1765 U8 *l, *lend, *x, *xend, *s;
1766 STRLEN lcur, xcur, scur;
1767
1768 HV* const hv = (HV*)SvRV(swash);
1769 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1770 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1771 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1772 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1773 SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
1774 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1775 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
1776 const STRLEN bits = SvUV(*bitssvp);
1777 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1778 const UV none = SvUV(*nonesvp);
1779 const UV end = start + span;
1780
1781 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1782 Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1783 (UV)bits);
1784 }
1785
1786 /* create and initialize $swatch */
1787 swatch = newSVpvs("");
1788 scur = octets ? (span * octets) : (span + 7) / 8;
1789 SvGROW(swatch, scur + 1);
1790 s = (U8*)SvPVX(swatch);
1791 if (octets && none) {
1792 const U8* const e = s + scur;
1793 while (s < e) {
1794 if (bits == 8)
1795 *s++ = (U8)(none & 0xff);
1796 else if (bits == 16) {
1797 *s++ = (U8)((none >> 8) & 0xff);
1798 *s++ = (U8)( none & 0xff);
1799 }
1800 else if (bits == 32) {
1801 *s++ = (U8)((none >> 24) & 0xff);
1802 *s++ = (U8)((none >> 16) & 0xff);
1803 *s++ = (U8)((none >> 8) & 0xff);
1804 *s++ = (U8)( none & 0xff);
1805 }
1806 }
1807 *s = '\0';
1808 }
1809 else {
1810 (void)memzero((U8*)s, scur + 1);
1811 }
1812 SvCUR_set(swatch, scur);
1813 s = (U8*)SvPVX(swatch);
1814
1815 /* read $swash->{LIST} */
1816 l = (U8*)SvPV(*listsvp, lcur);
1817 lend = l + lcur;
1818 while (l < lend) {
1819 UV min, max, val;
1820 STRLEN numlen;
1821 I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1822
1823 U8* const nl = (U8*)memchr(l, '\n', lend - l);
1824
1825 numlen = lend - l;
1826 min = grok_hex((char *)l, &numlen, &flags, NULL);
1827 if (numlen)
1828 l += numlen;
1829 else if (nl) {
1830 l = nl + 1; /* 1 is length of "\n" */
1831 continue;
1832 }
1833 else {
1834 l = lend; /* to LIST's end at which \n is not found */
1835 break;
1836 }
1837
1838 if (isBLANK(*l)) {
1839 ++l;
1840 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1841 numlen = lend - l;
1842 max = grok_hex((char *)l, &numlen, &flags, NULL);
1843 if (numlen)
1844 l += numlen;
1845 else
1846 max = min;
1847
1848 if (octets) {
1849 if (isBLANK(*l)) {
1850 ++l;
1851 flags = PERL_SCAN_SILENT_ILLDIGIT |
1852 PERL_SCAN_DISALLOW_PREFIX;
1853 numlen = lend - l;
1854 val = grok_hex((char *)l, &numlen, &flags, NULL);
1855 if (numlen)
1856 l += numlen;
1857 else
1858 val = 0;
1859 }
1860 else {
1861 val = 0;
1862 if (typeto) {
1863 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1864 typestr, l);
1865 }
1866 }
1867 }
1868 else
1869 val = 0; /* bits == 1, then val should be ignored */
1870 }
1871 else {
1872 max = min;
1873 if (octets) {
1874 val = 0;
1875 if (typeto) {
1876 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1877 }
1878 }
1879 else
1880 val = 0; /* bits == 1, then val should be ignored */
1881 }
1882
1883 if (nl)
1884 l = nl + 1;
1885 else
1886 l = lend;
1887
1888 if (max < start)
1889 continue;
1890
1891 if (octets) {
1892 UV key;
1893 if (min < start) {
1894 if (!none || val < none) {
1895 val += start - min;
1896 }
1897 min = start;
1898 }
1899 for (key = min; key <= max; key++) {
1900 STRLEN offset;
1901 if (key >= end)
1902 goto go_out_list;
1903 /* offset must be non-negative (start <= min <= key < end) */
1904 offset = octets * (key - start);
1905 if (bits == 8)
1906 s[offset] = (U8)(val & 0xff);
1907 else if (bits == 16) {
1908 s[offset ] = (U8)((val >> 8) & 0xff);
1909 s[offset + 1] = (U8)( val & 0xff);
1910 }
1911 else if (bits == 32) {
1912 s[offset ] = (U8)((val >> 24) & 0xff);
1913 s[offset + 1] = (U8)((val >> 16) & 0xff);
1914 s[offset + 2] = (U8)((val >> 8) & 0xff);
1915 s[offset + 3] = (U8)( val & 0xff);
1916 }
1917
1918 if (!none || val < none)
1919 ++val;
1920 }
1921 }
1922 else { /* bits == 1, then val should be ignored */
1923 UV key;
1924 if (min < start)
1925 min = start;
1926 for (key = min; key <= max; key++) {
1927 const STRLEN offset = (STRLEN)(key - start);
1928 if (key >= end)
1929 goto go_out_list;
1930 s[offset >> 3] |= 1 << (offset & 7);
1931 }
1932 }
1933 } /* while */
1934 go_out_list:
1935
1936 /* read $swash->{EXTRAS} */
1937 x = (U8*)SvPV(*extssvp, xcur);
1938 xend = x + xcur;
1939 while (x < xend) {
1940 STRLEN namelen;
1941 U8 *namestr;
1942 SV** othersvp;
1943 HV* otherhv;
1944 STRLEN otherbits;
1945 SV **otherbitssvp, *other;
1946 U8 *s, *o, *nl;
1947 STRLEN slen, olen;
1948
1949 const U8 opc = *x++;
1950 if (opc == '\n')
1951 continue;
1952
1953 nl = (U8*)memchr(x, '\n', xend - x);
1954
1955 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
1956 if (nl) {
1957 x = nl + 1; /* 1 is length of "\n" */
1958 continue;
1959 }
1960 else {
1961 x = xend; /* to EXTRAS' end at which \n is not found */
1962 break;
1963 }
1964 }
1965
1966 namestr = x;
1967 if (nl) {
1968 namelen = nl - namestr;
1969 x = nl + 1;
1970 }
1971 else {
1972 namelen = xend - namestr;
1973 x = xend;
1974 }
1975
1976 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
1977 otherhv = (HV*)SvRV(*othersvp);
1978 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
1979 otherbits = (STRLEN)SvUV(*otherbitssvp);
1980 if (bits < otherbits)
1981 Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
1982
1983 /* The "other" swatch must be destroyed after. */
1984 other = swash_get(*othersvp, start, span);
1985 o = (U8*)SvPV(other, olen);
1986
1987 if (!olen)
1988 Perl_croak(aTHX_ "panic: swash_get got improper swatch");
1989
1990 s = (U8*)SvPV(swatch, slen);
1991 if (bits == 1 && otherbits == 1) {
1992 if (slen != olen)
1993 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
1994
1995 switch (opc) {
1996 case '+':
1997 while (slen--)
1998 *s++ |= *o++;
1999 break;
2000 case '!':
2001 while (slen--)
2002 *s++ |= ~*o++;
2003 break;
2004 case '-':
2005 while (slen--)
2006 *s++ &= ~*o++;
2007 break;
2008 case '&':
2009 while (slen--)
2010 *s++ &= *o++;
2011 break;
2012 default:
2013 break;
2014 }
2015 }
2016 else {
2017 STRLEN otheroctets = otherbits >> 3;
2018 STRLEN offset = 0;
2019 U8* const send = s + slen;
2020
2021 while (s < send) {
2022 UV otherval = 0;
2023
2024 if (otherbits == 1) {
2025 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2026 ++offset;
2027 }
2028 else {
2029 STRLEN vlen = otheroctets;
2030 otherval = *o++;
2031 while (--vlen) {
2032 otherval <<= 8;
2033 otherval |= *o++;
2034 }
2035 }
2036
2037 if (opc == '+' && otherval)
2038 NOOP; /* replace with otherval */
2039 else if (opc == '!' && !otherval)
2040 otherval = 1;
2041 else if (opc == '-' && otherval)
2042 otherval = 0;
2043 else if (opc == '&' && !otherval)
2044 otherval = 0;
2045 else {
2046 s += octets; /* no replacement */
2047 continue;
2048 }
2049
2050 if (bits == 8)
2051 *s++ = (U8)( otherval & 0xff);
2052 else if (bits == 16) {
2053 *s++ = (U8)((otherval >> 8) & 0xff);
2054 *s++ = (U8)( otherval & 0xff);
2055 }
2056 else if (bits == 32) {
2057 *s++ = (U8)((otherval >> 24) & 0xff);
2058 *s++ = (U8)((otherval >> 16) & 0xff);
2059 *s++ = (U8)((otherval >> 8) & 0xff);
2060 *s++ = (U8)( otherval & 0xff);
2061 }
2062 }
2063 }
2064 sv_free(other); /* through with it! */
2065 } /* while */
2066 return swatch;
2067}
2068
2069/*
2070=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2071
2072Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2073of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2074bytes available. The return value is the pointer to the byte after the
2075end of the new character. In other words,
2076
2077 d = uvchr_to_utf8(d, uv);
2078
2079is the recommended wide native character-aware way of saying
2080
2081 *(d++) = uv;
2082
2083=cut
2084*/
2085
2086/* On ASCII machines this is normally a macro but we want a
2087 real function in case XS code wants it
2088*/
2089U8 *
2090Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2091{
2092 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2093}
2094
2095U8 *
2096Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2097{
2098 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2099}
2100
2101/*
2102=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2103flags
2104
2105Returns the native character value of the first character in the string
2106C<s>
2107which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2108length, in bytes, of that character.
2109
2110Allows length and flags to be passed to low level routine.
2111
2112=cut
2113*/
2114/* On ASCII machines this is normally a macro but we want
2115 a real function in case XS code wants it
2116*/
2117UV
2118Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2119U32 flags)
2120{
2121 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2122 return UNI_TO_NATIVE(uv);
2123}
2124
2125/*
2126=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2127
2128Build to the scalar dsv a displayable version of the string spv,
2129length len, the displayable version being at most pvlim bytes long
2130(if longer, the rest is truncated and "..." will be appended).
2131
2132The flags argument can have UNI_DISPLAY_ISPRINT set to display
2133isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2134to display the \\[nrfta\\] as the backslashed versions (like '\n')
2135(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2136UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2137UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2138
2139The pointer to the PV of the dsv is returned.
2140
2141=cut */
2142char *
2143Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2144{
2145 int truncated = 0;
2146 const char *s, *e;
2147
2148 sv_setpvn(dsv, "", 0);
2149 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2150 UV u;
2151 /* This serves double duty as a flag and a character to print after
2152 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2153 */
2154 char ok = 0;
2155
2156 if (pvlim && SvCUR(dsv) >= pvlim) {
2157 truncated++;
2158 break;
2159 }
2160 u = utf8_to_uvchr((U8*)s, 0);
2161 if (u < 256) {
2162 const unsigned char c = (unsigned char)u & 0xFF;
2163 if (flags & UNI_DISPLAY_BACKSLASH) {
2164 switch (c) {
2165 case '\n':
2166 ok = 'n'; break;
2167 case '\r':
2168 ok = 'r'; break;
2169 case '\t':
2170 ok = 't'; break;
2171 case '\f':
2172 ok = 'f'; break;
2173 case '\a':
2174 ok = 'a'; break;
2175 case '\\':
2176 ok = '\\'; break;
2177 default: break;
2178 }
2179 if (ok) {
2180 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
2181 }
2182 }
2183 /* isPRINT() is the locale-blind version. */
2184 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2185 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
2186 ok = 1;
2187 }
2188 }
2189 if (!ok)
2190 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2191 }
2192 if (truncated)
2193 sv_catpvs(dsv, "...");
2194
2195 return SvPVX(dsv);
2196}
2197
2198/*
2199=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2200
2201Build to the scalar dsv a displayable version of the scalar sv,
2202the displayable version being at most pvlim bytes long
2203(if longer, the rest is truncated and "..." will be appended).
2204
2205The flags argument is as in pv_uni_display().
2206
2207The pointer to the PV of the dsv is returned.
2208
2209=cut
2210*/
2211char *
2212Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2213{
2214 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2215 SvCUR(ssv), pvlim, flags);
2216}
2217
2218/*
2219=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
2220
2221Return true if the strings s1 and s2 differ case-insensitively, false
2222if not (if they are equal case-insensitively). If u1 is true, the
2223string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
2224the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
2225are false, the respective string is assumed to be in native 8-bit
2226encoding.
2227
2228If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2229in there (they will point at the beginning of the I<next> character).
2230If the pointers behind pe1 or pe2 are non-NULL, they are the end
2231pointers beyond which scanning will not continue under any
2232circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
2233s2+l2 will be used as goal end pointers that will also stop the scan,
2234and which qualify towards defining a successful match: all the scans
2235that define an explicit length must reach their goal pointers for
2236a match to succeed).
2237
2238For case-insensitiveness, the "casefolding" of Unicode is used
2239instead of upper/lowercasing both the characters, see
2240http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2241
2242=cut */
2243I32
2244Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2245{
2246 dVAR;
2247 register const U8 *p1 = (const U8*)s1;
2248 register const U8 *p2 = (const U8*)s2;
2249 register const U8 *f1 = NULL;
2250 register const U8 *f2 = NULL;
2251 register U8 *e1 = NULL;
2252 register U8 *q1 = NULL;
2253 register U8 *e2 = NULL;
2254 register U8 *q2 = NULL;
2255 STRLEN n1 = 0, n2 = 0;
2256 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2257 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2258 U8 natbuf[1+1];
2259 STRLEN foldlen1, foldlen2;
2260 bool match;
2261
2262 if (pe1)
2263 e1 = *(U8**)pe1;
2264 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2265 f1 = (const U8*)s1 + l1;
2266 if (pe2)
2267 e2 = *(U8**)pe2;
2268 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2269 f2 = (const U8*)s2 + l2;
2270
2271 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2272 return 1; /* mismatch; possible infinite loop or false positive */
2273
2274 if (!u1 || !u2)
2275 natbuf[1] = 0; /* Need to terminate the buffer. */
2276
2277 while ((e1 == 0 || p1 < e1) &&
2278 (f1 == 0 || p1 < f1) &&
2279 (e2 == 0 || p2 < e2) &&
2280 (f2 == 0 || p2 < f2)) {
2281 if (n1 == 0) {
2282 if (u1)
2283 to_utf8_fold(p1, foldbuf1, &foldlen1);
2284 else {
2285 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2286 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2287 }
2288 q1 = foldbuf1;
2289 n1 = foldlen1;
2290 }
2291 if (n2 == 0) {
2292 if (u2)
2293 to_utf8_fold(p2, foldbuf2, &foldlen2);
2294 else {
2295 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2296 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2297 }
2298 q2 = foldbuf2;
2299 n2 = foldlen2;
2300 }
2301 while (n1 && n2) {
2302 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2303 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2304 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2305 return 1; /* mismatch */
2306 n1 -= UTF8SKIP(q1);
2307 q1 += UTF8SKIP(q1);
2308 n2 -= UTF8SKIP(q2);
2309 q2 += UTF8SKIP(q2);
2310 }
2311 if (n1 == 0)
2312 p1 += u1 ? UTF8SKIP(p1) : 1;
2313 if (n2 == 0)
2314 p2 += u2 ? UTF8SKIP(p2) : 1;
2315
2316 }
2317
2318 /* A match is defined by all the scans that specified
2319 * an explicit length reaching their final goals. */
2320 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2321
2322 if (match) {
2323 if (pe1)
2324 *pe1 = (char*)p1;
2325 if (pe2)
2326 *pe2 = (char*)p2;
2327 }
2328
2329 return match ? 0 : 1; /* 0 match, 1 mismatch */
2330}
2331
2332/*
2333 * Local variables:
2334 * c-indentation-style: bsd
2335 * c-basic-offset: 4
2336 * indent-tabs-mode: t
2337 * End:
2338 *
2339 * ex: set ts=8 sts=4 sw=4 noet:
2340 */