This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Remove details that changed over seven years ago.
[perl5.git] / utf8.c
... / ...
CommitLineData
1/* utf8.c
2 *
3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008
4 * by Larry Wall and others
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * [p.603 of _The Lord of the Rings_, IV/I: "The Taming of Sméagol"]
17 *
18 * 'Well do I understand your speech,' he answered in the same language;
19 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
20 * as is the custom in the West, if you wish to be answered?'
21 * --Gandalf, addressing Théoden's door wardens
22 *
23 * [p.508 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
24 *
25 * ...the travellers perceived that the floor was paved with stones of many
26 * hues; branching runes and strange devices intertwined beneath their feet.
27 *
28 * [p.512 of _The Lord of the Rings_, III/vi: "The King of the Golden Hall"]
29 */
30
31#include "EXTERN.h"
32#define PERL_IN_UTF8_C
33#include "perl.h"
34
35#ifndef EBCDIC
36/* Separate prototypes needed because in ASCII systems these are
37 * usually macros but they still are compiled as code, too. */
38PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
39PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
40#endif
41
42static const char unees[] =
43 "Malformed UTF-8 character (unexpected end of string)";
44
45/*
46=head1 Unicode Support
47
48This file contains various utility functions for manipulating UTF8-encoded
49strings. For the uninitiated, this is a method of representing arbitrary
50Unicode characters as a variable number of bytes, in such a way that
51characters in the ASCII range are unmodified, and a zero byte never appears
52within non-zero characters.
53
54=cut
55*/
56
57/*
58=for apidoc is_ascii_string
59
60Returns true if the first C<len> bytes of the given string are the same whether
61or not the string is encoded in UTF-8 (or UTF-EBCDIC on EBCDIC machines). That
62is, if they are invariant. On ASCII-ish machines, only ASCII characters
63fit this definition, hence the function's name.
64
65See also is_utf8_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
66
67=cut
68*/
69
70bool
71Perl_is_ascii_string(const U8 *s, STRLEN len)
72{
73 const U8* const send = s + (len ? len : strlen((const char *)s));
74 const U8* x = s;
75
76 PERL_ARGS_ASSERT_IS_ASCII_STRING;
77
78 for (; x < send; ++x) {
79 if (!UTF8_IS_INVARIANT(*x))
80 break;
81 }
82
83 return x == send;
84}
85
86/*
87=for apidoc uvuni_to_utf8_flags
88
89Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
91bytes available. The return value is the pointer to the byte after the
92end of the new character. In other words,
93
94 d = uvuni_to_utf8_flags(d, uv, flags);
95
96or, in most cases,
97
98 d = uvuni_to_utf8(d, uv);
99
100(which is equivalent to)
101
102 d = uvuni_to_utf8_flags(d, uv, 0);
103
104is the recommended Unicode-aware way of saying
105
106 *(d++) = uv;
107
108=cut
109*/
110
111U8 *
112Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
113{
114 PERL_ARGS_ASSERT_UVUNI_TO_UTF8_FLAGS;
115
116 if (ckWARN(WARN_UTF8)) {
117 if (UNICODE_IS_SURROGATE(uv) &&
118 !(flags & UNICODE_ALLOW_SURROGATE))
119 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
120 else if (
121 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
122 !(flags & UNICODE_ALLOW_FDD0))
123 ||
124 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
125 !(flags & UNICODE_ALLOW_FFFF))) &&
126 /* UNICODE_ALLOW_SUPER includes
127 * FFFEs and FFFFs beyond 0x10FFFF. */
128 ((uv <= PERL_UNICODE_MAX) ||
129 !(flags & UNICODE_ALLOW_SUPER))
130 )
131 Perl_warner(aTHX_ packWARN(WARN_UTF8),
132 "Unicode non-character 0x%04"UVxf" is illegal for interchange", uv);
133 }
134 if (UNI_IS_INVARIANT(uv)) {
135 *d++ = (U8)UTF_TO_NATIVE(uv);
136 return d;
137 }
138#if defined(EBCDIC)
139 else {
140 STRLEN len = UNISKIP(uv);
141 U8 *p = d+len-1;
142 while (p > d) {
143 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
144 uv >>= UTF_ACCUMULATION_SHIFT;
145 }
146 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
147 return d+len;
148 }
149#else /* Non loop style */
150 if (uv < 0x800) {
151 *d++ = (U8)(( uv >> 6) | 0xc0);
152 *d++ = (U8)(( uv & 0x3f) | 0x80);
153 return d;
154 }
155 if (uv < 0x10000) {
156 *d++ = (U8)(( uv >> 12) | 0xe0);
157 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
158 *d++ = (U8)(( uv & 0x3f) | 0x80);
159 return d;
160 }
161 if (uv < 0x200000) {
162 *d++ = (U8)(( uv >> 18) | 0xf0);
163 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
164 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
165 *d++ = (U8)(( uv & 0x3f) | 0x80);
166 return d;
167 }
168 if (uv < 0x4000000) {
169 *d++ = (U8)(( uv >> 24) | 0xf8);
170 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
171 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
172 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
173 *d++ = (U8)(( uv & 0x3f) | 0x80);
174 return d;
175 }
176 if (uv < 0x80000000) {
177 *d++ = (U8)(( uv >> 30) | 0xfc);
178 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
179 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
180 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
181 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
182 *d++ = (U8)(( uv & 0x3f) | 0x80);
183 return d;
184 }
185#ifdef HAS_QUAD
186 if (uv < UTF8_QUAD_MAX)
187#endif
188 {
189 *d++ = 0xfe; /* Can't match U+FEFF! */
190 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
191 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
192 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
193 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
194 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
195 *d++ = (U8)(( uv & 0x3f) | 0x80);
196 return d;
197 }
198#ifdef HAS_QUAD
199 {
200 *d++ = 0xff; /* Can't match U+FFFE! */
201 *d++ = 0x80; /* 6 Reserved bits */
202 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
203 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
204 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
205 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
206 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
207 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
208 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
209 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
210 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
211 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
212 *d++ = (U8)(( uv & 0x3f) | 0x80);
213 return d;
214 }
215#endif
216#endif /* Loop style */
217}
218
219/*
220
221Tests if some arbitrary number of bytes begins in a valid UTF-8
222character. Note that an INVARIANT (i.e. ASCII) character is a valid
223UTF-8 character. The actual number of bytes in the UTF-8 character
224will be returned if it is valid, otherwise 0.
225
226This is the "slow" version as opposed to the "fast" version which is
227the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
228difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
229or less you should use the IS_UTF8_CHAR(), for lengths of five or more
230you should use the _slow(). In practice this means that the _slow()
231will be used very rarely, since the maximum Unicode code point (as of
232Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
233the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
234five bytes or more.
235
236=cut */
237STATIC STRLEN
238S_is_utf8_char_slow(const U8 *s, const STRLEN len)
239{
240 U8 u = *s;
241 STRLEN slen;
242 UV uv, ouv;
243
244 PERL_ARGS_ASSERT_IS_UTF8_CHAR_SLOW;
245
246 if (UTF8_IS_INVARIANT(u))
247 return 1;
248
249 if (!UTF8_IS_START(u))
250 return 0;
251
252 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
253 return 0;
254
255 slen = len - 1;
256 s++;
257#ifdef EBCDIC
258 u = NATIVE_TO_UTF(u);
259#endif
260 u &= UTF_START_MASK(len);
261 uv = u;
262 ouv = uv;
263 while (slen--) {
264 if (!UTF8_IS_CONTINUATION(*s))
265 return 0;
266 uv = UTF8_ACCUMULATE(uv, *s);
267 if (uv < ouv)
268 return 0;
269 ouv = uv;
270 s++;
271 }
272
273 if ((STRLEN)UNISKIP(uv) < len)
274 return 0;
275
276 return len;
277}
278
279/*
280=for apidoc is_utf8_char
281
282Tests if some arbitrary number of bytes begins in a valid UTF-8
283character. Note that an INVARIANT (i.e. ASCII on non-EBCDIC machines)
284character is a valid UTF-8 character. The actual number of bytes in the UTF-8
285character will be returned if it is valid, otherwise 0.
286
287=cut */
288STRLEN
289Perl_is_utf8_char(const U8 *s)
290{
291 const STRLEN len = UTF8SKIP(s);
292
293 PERL_ARGS_ASSERT_IS_UTF8_CHAR;
294#ifdef IS_UTF8_CHAR
295 if (IS_UTF8_CHAR_FAST(len))
296 return IS_UTF8_CHAR(s, len) ? len : 0;
297#endif /* #ifdef IS_UTF8_CHAR */
298 return is_utf8_char_slow(s, len);
299}
300
301
302/*
303=for apidoc is_utf8_string
304
305Returns true if first C<len> bytes of the given string form a valid
306UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
307not mean 'a string that contains code points above 0x7F encoded in UTF-8'
308because a valid ASCII string is a valid UTF-8 string.
309
310See also is_ascii_string(), is_utf8_string_loclen(), and is_utf8_string_loc().
311
312=cut
313*/
314
315bool
316Perl_is_utf8_string(const U8 *s, STRLEN len)
317{
318 const U8* const send = s + (len ? len : strlen((const char *)s));
319 const U8* x = s;
320
321 PERL_ARGS_ASSERT_IS_UTF8_STRING;
322
323 while (x < send) {
324 STRLEN c;
325 /* Inline the easy bits of is_utf8_char() here for speed... */
326 if (UTF8_IS_INVARIANT(*x))
327 c = 1;
328 else if (!UTF8_IS_START(*x))
329 goto out;
330 else {
331 /* ... and call is_utf8_char() only if really needed. */
332#ifdef IS_UTF8_CHAR
333 c = UTF8SKIP(x);
334 if (IS_UTF8_CHAR_FAST(c)) {
335 if (!IS_UTF8_CHAR(x, c))
336 c = 0;
337 }
338 else
339 c = is_utf8_char_slow(x, c);
340#else
341 c = is_utf8_char(x);
342#endif /* #ifdef IS_UTF8_CHAR */
343 if (!c)
344 goto out;
345 }
346 x += c;
347 }
348
349 out:
350 if (x != send)
351 return FALSE;
352
353 return TRUE;
354}
355
356/*
357Implemented as a macro in utf8.h
358
359=for apidoc is_utf8_string_loc
360
361Like is_utf8_string() but stores the location of the failure (in the
362case of "utf8ness failure") or the location s+len (in the case of
363"utf8ness success") in the C<ep>.
364
365See also is_utf8_string_loclen() and is_utf8_string().
366
367=for apidoc is_utf8_string_loclen
368
369Like is_utf8_string() but stores the location of the failure (in the
370case of "utf8ness failure") or the location s+len (in the case of
371"utf8ness success") in the C<ep>, and the number of UTF-8
372encoded characters in the C<el>.
373
374See also is_utf8_string_loc() and is_utf8_string().
375
376=cut
377*/
378
379bool
380Perl_is_utf8_string_loclen(const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
381{
382 const U8* const send = s + (len ? len : strlen((const char *)s));
383 const U8* x = s;
384 STRLEN c;
385 STRLEN outlen = 0;
386
387 PERL_ARGS_ASSERT_IS_UTF8_STRING_LOCLEN;
388
389 while (x < send) {
390 /* Inline the easy bits of is_utf8_char() here for speed... */
391 if (UTF8_IS_INVARIANT(*x))
392 c = 1;
393 else if (!UTF8_IS_START(*x))
394 goto out;
395 else {
396 /* ... and call is_utf8_char() only if really needed. */
397#ifdef IS_UTF8_CHAR
398 c = UTF8SKIP(x);
399 if (IS_UTF8_CHAR_FAST(c)) {
400 if (!IS_UTF8_CHAR(x, c))
401 c = 0;
402 } else
403 c = is_utf8_char_slow(x, c);
404#else
405 c = is_utf8_char(x);
406#endif /* #ifdef IS_UTF8_CHAR */
407 if (!c)
408 goto out;
409 }
410 x += c;
411 outlen++;
412 }
413
414 out:
415 if (el)
416 *el = outlen;
417
418 if (ep)
419 *ep = x;
420 return (x == send);
421}
422
423/*
424
425=for apidoc utf8n_to_uvuni
426
427Bottom level UTF-8 decode routine.
428Returns the Unicode code point value of the first character in the string C<s>
429which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
430C<retlen> will be set to the length, in bytes, of that character.
431
432If C<s> does not point to a well-formed UTF-8 character, the behaviour
433is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
434it is assumed that the caller will raise a warning, and this function
435will silently just set C<retlen> to C<-1> and return zero. If the
436C<flags> does not contain UTF8_CHECK_ONLY, warnings about
437malformations will be given, C<retlen> will be set to the expected
438length of the UTF-8 character in bytes, and zero will be returned.
439
440The C<flags> can also contain various flags to allow deviations from
441the strict UTF-8 encoding (see F<utf8.h>).
442
443Most code should use utf8_to_uvchr() rather than call this directly.
444
445=cut
446*/
447
/* Decode the single UTF-8 encoded character that starts at s, looking at no
 * more than curlen bytes unless a flag permits otherwise, and return its code
 * point.
 *
 *   s       first byte of the character to decode
 *   curlen  number of bytes available at s
 *   retlen  if non-NULL, receives the byte length of the character; on a
 *           malformation it receives (STRLEN)-1 when UTF8_CHECK_ONLY is set,
 *           otherwise a length for the caller to skip
 *   flags   UTF8_ALLOW_* bits, each of which disables one of the checks
 *           below, plus UTF8_CHECK_ONLY to suppress warnings
 *
 * Returns the decoded value, or 0 for any malformation that is not allowed
 * by flags. */
UV
Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
{
    dVAR;
    const U8 * const s0 = s;            /* start of the character, for messages */
    /* NOTE(review): the first byte is read here (and for startbyte below)
     * before the curlen == 0 test further down has run. */
    UV uv = *s, ouv = 0;
    STRLEN len = 1;
    const bool dowarn = ckWARN_d(WARN_UTF8);
    const UV startbyte = *s;            /* original first byte, for messages */
    STRLEN expectlen = 0;               /* length implied by the start byte */
    U32 warning = 0;                    /* which UTF8_WARN_* was detected */
    SV* sv;                             /* only assigned and used when dowarn */

    PERL_ARGS_ASSERT_UTF8N_TO_UVUNI;

/* This list is a superset of the UTF8_ALLOW_XXX.  BUT it isn't, eg SUPER missing XXX */

/* Internal codes identifying the malformation; they select the message text
 * in the switch under the "malformed" label. */
#define UTF8_WARN_EMPTY                          1
#define UTF8_WARN_CONTINUATION                   2
#define UTF8_WARN_NON_CONTINUATION               3
#define UTF8_WARN_FE_FF                          4
#define UTF8_WARN_SHORT                          5
#define UTF8_WARN_OVERFLOW                       6
#define UTF8_WARN_SURROGATE                      7
#define UTF8_WARN_LONG                           8
#define UTF8_WARN_FFFF                           9 /* Also FFFE. */

    /* No input at all. */
    if (curlen == 0 &&
        !(flags & UTF8_ALLOW_EMPTY)) {
        warning = UTF8_WARN_EMPTY;
        goto malformed;
    }

    /* Single-byte (invariant) character: nothing more to decode. */
    if (UTF8_IS_INVARIANT(uv)) {
        if (retlen)
            *retlen = 1;
        return (UV) (NATIVE_TO_UTF(*s));
    }

    /* A continuation byte cannot begin a character. */
    if (UTF8_IS_CONTINUATION(uv) &&
        !(flags & UTF8_ALLOW_CONTINUATION)) {
        warning = UTF8_WARN_CONTINUATION;
        goto malformed;
    }

    /* Early check of the byte right after the start byte.  Here s still
     * equals s0, which selects the "immediately after start byte" wording
     * of the message. */
    if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
        !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
        warning = UTF8_WARN_NON_CONTINUATION;
        goto malformed;
    }

#ifdef EBCDIC
    uv = NATIVE_TO_UTF(uv);
#else
    /* Start bytes 0xFE and 0xFF are rejected unless explicitly allowed. */
    if ((uv == 0xfe || uv == 0xff) &&
        !(flags & UTF8_ALLOW_FE_FF)) {
        warning = UTF8_WARN_FE_FF;
        goto malformed;
    }
#endif

    /* Work out the sequence length from the high bits of the start byte, and
     * mask uv down to the payload bits that byte contributes. */
    if      (!(uv & 0x20))      { len =  2; uv &= 0x1f; }
    else if (!(uv & 0x10))      { len =  3; uv &= 0x0f; }
    else if (!(uv & 0x08))      { len =  4; uv &= 0x07; }
    else if (!(uv & 0x04))      { len =  5; uv &= 0x03; }
#ifdef EBCDIC
    else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
    else                        { len =  7; uv &= 0x01; }
#else
    else if (!(uv & 0x02))      { len =  6; uv &= 0x01; }
    else if (!(uv & 0x01))      { len =  7; uv = 0; }
    else                        { len = 13; uv = 0; } /* whoa! */
#endif

    /* Report the expected length now; the malformed path may overwrite it. */
    if (retlen)
        *retlen = len;

    expectlen = len;

    /* Fewer bytes available than the start byte calls for. */
    if ((curlen < expectlen) &&
        !(flags & UTF8_ALLOW_SHORT)) {
        warning = UTF8_WARN_SHORT;
        goto malformed;
    }

    /* From here on len counts the continuation bytes still to be consumed. */
    len--;
    s++;
    ouv = uv;

    while (len--) {
        if (!UTF8_IS_CONTINUATION(*s) &&
            !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
            /* Step back one byte so that s[1] in the message below is the
             * offending byte and s - s0 is its distance from the start. */
            s--;
            warning = UTF8_WARN_NON_CONTINUATION;
            goto malformed;
        }
        else
            uv = UTF8_ACCUMULATE(uv, *s);
        if (!(uv > ouv)) {
            /* These cannot be allowed. */
            if (uv == ouv) {
                /* The value did not change after adding this byte: reported
                 * as an over-long sequence, except for the 13-byte form or
                 * when UTF8_ALLOW_LONG is set. */
                if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
                    warning = UTF8_WARN_LONG;
                    goto malformed;
                }
            }
            else { /* uv < ouv */
                /* This cannot be allowed. */
                warning = UTF8_WARN_OVERFLOW;
                goto malformed;
            }
        }
        s++;
        ouv = uv;
    }

    /* Checks on the fully decoded value. */
    if (UNICODE_IS_SURROGATE(uv) &&
        !(flags & UTF8_ALLOW_SURROGATE)) {
        warning = UTF8_WARN_SURROGATE;
        goto malformed;
    } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
               !(flags & UTF8_ALLOW_LONG)) {
        /* More bytes were used than UNISKIP() says this value needs. */
        warning = UTF8_WARN_LONG;
        goto malformed;
    } else if (UNICODE_IS_ILLEGAL(uv) &&
               !(flags & UTF8_ALLOW_FFFF)) {
        warning = UTF8_WARN_FFFF;
        goto malformed;
    }

    return uv;

malformed:

    /* Silent mode: report failure through retlen only, without any warning. */
    if (flags & UTF8_CHECK_ONLY) {
        if (retlen)
            *retlen = ((STRLEN) -1);
        return 0;
    }

    if (dowarn) {
        /* Build the message text in a temporary SV. */
        if (warning == UTF8_WARN_FFFF) {
            sv = newSVpvs_flags("Unicode non-character ", SVs_TEMP);
            Perl_sv_catpvf(aTHX_ sv, "0x%04"UVxf" is illegal for interchange", uv);
        }
        else {
            sv = newSVpvs_flags("Malformed UTF-8 character ", SVs_TEMP);

            switch (warning) {
            case 0: /* Intentionally empty. */ break;
            case UTF8_WARN_EMPTY:
                sv_catpvs(sv, "(empty string)");
                break;
            case UTF8_WARN_CONTINUATION:
                Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
                break;
            case UTF8_WARN_NON_CONTINUATION:
                /* s == s0 means the bad byte directly follows the start
                 * byte; otherwise s - s0 is how far into the character the
                 * byte before it lies. */
                if (s == s0)
                    Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
                                   (UV)s[1], startbyte);
                else {
                    /* Note: this len shadows the function-level len. */
                    const int len = (int)(s-s0);
                    Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
                                   (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
                }

                break;
            case UTF8_WARN_FE_FF:
                Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
                break;
            case UTF8_WARN_SHORT:
                Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
                               (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
                /* NOTE(review): this adjustment sits inside the dowarn
                 * branch, so the value later stored in *retlen for a short
                 * character depends on whether the warning is enabled. */
                expectlen = curlen;             /* distance for caller to skip */
                break;
            case UTF8_WARN_OVERFLOW:
                Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
                               ouv, *s, startbyte);
                break;
            case UTF8_WARN_SURROGATE:
                Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
                break;
            case UTF8_WARN_LONG:
                Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
                               (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
                break;
            default:
                sv_catpvs(sv, "(unknown reason)");
                break;
            }
        }

        if (warning) {
            /* Note: this s shadows the parameter; it is the message text. */
            const char * const s = SvPVX_const(sv);

            /* Name the current op in the message when there is one. */
            if (PL_op)
                Perl_warner(aTHX_ packWARN(WARN_UTF8),
                            "%s in %s", s, OP_DESC(PL_op));
            else
                Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
        }
    }

    /* Tell the caller how far to advance: the expected length when one was
     * determined, otherwise len. */
    if (retlen)
        *retlen = expectlen ? expectlen : len;

    return 0;
}
656
657/*
658=for apidoc utf8_to_uvchr
659
660Returns the native character value of the first character in the string C<s>
661which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
662length, in bytes, of that character.
663
664If C<s> does not point to a well-formed UTF-8 character, zero is
665returned and retlen is set, if possible, to -1.
666
667=cut
668*/
669
670UV
671Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
672{
673 PERL_ARGS_ASSERT_UTF8_TO_UVCHR;
674
675 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
676 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
677}
678
679/*
680=for apidoc utf8_to_uvuni
681
682Returns the Unicode code point of the first character in the string C<s>
683which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
684length, in bytes, of that character.
685
686This function should only be used when the returned UV is considered
687an index into the Unicode semantic tables (e.g. swashes).
688
689If C<s> does not point to a well-formed UTF-8 character, zero is
690returned and retlen is set, if possible, to -1.
691
692=cut
693*/
694
695UV
696Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
697{
698 PERL_ARGS_ASSERT_UTF8_TO_UVUNI;
699
700 /* Call the low level routine asking for checks */
701 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
702 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
703}
704
705/*
706=for apidoc utf8_length
707
708Return the length of the UTF-8 char encoded string C<s> in characters.
Stops at C<e> (inclusive).  If C<e E<lt> s> or if the scan would end
up past C<e>, it raises a UTF8 warning (if enabled) and returns the number
of complete characters counted up to that point.
711
712=cut
713*/
714
715STRLEN
716Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
717{
718 dVAR;
719 STRLEN len = 0;
720
721 PERL_ARGS_ASSERT_UTF8_LENGTH;
722
723 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
724 * the bitops (especially ~) can create illegal UTF-8.
725 * In other words: in Perl UTF-8 is not just for Unicode. */
726
727 if (e < s)
728 goto warn_and_return;
729 while (s < e) {
730 if (!UTF8_IS_INVARIANT(*s))
731 s += UTF8SKIP(s);
732 else
733 s++;
734 len++;
735 }
736
737 if (e != s) {
738 len--;
739 warn_and_return:
740 if (PL_op)
741 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8),
742 "%s in %s", unees, OP_DESC(PL_op));
743 else
744 Perl_ck_warner_d(aTHX_ packWARN(WARN_UTF8), unees);
745 }
746
747 return len;
748}
749
750/*
751=for apidoc utf8_distance
752
753Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
754and C<b>.
755
756WARNING: use only if you *know* that the pointers point inside the
757same UTF-8 buffer.
758
759=cut
760*/
761
762IV
763Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
764{
765 PERL_ARGS_ASSERT_UTF8_DISTANCE;
766
767 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
768}
769
770/*
771=for apidoc utf8_hop
772
773Return the UTF-8 pointer C<s> displaced by C<off> characters, either
774forward or backward.
775
776WARNING: do not use the following unless you *know* C<off> is within
777the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
778on the first byte of character or just after the last byte of a character.
779
780=cut
781*/
782
783U8 *
784Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
785{
786 PERL_ARGS_ASSERT_UTF8_HOP;
787
788 PERL_UNUSED_CONTEXT;
789 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
790 * the bitops (especially ~) can create illegal UTF-8.
791 * In other words: in Perl UTF-8 is not just for Unicode. */
792
793 if (off >= 0) {
794 while (off--)
795 s += UTF8SKIP(s);
796 }
797 else {
798 while (off++) {
799 s--;
800 while (UTF8_IS_CONTINUATION(*s))
801 s--;
802 }
803 }
804 return (U8 *)s;
805}
806
807/*
808=for apidoc utf8_to_bytes
809
810Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
811Unlike C<bytes_to_utf8>, this over-writes the original string, and
812updates len to contain the new length.
813Returns zero on failure, setting C<len> to -1.
814
815If you need a copy of the string, see C<bytes_from_utf8>.
816
817=cut
818*/
819
820U8 *
821Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
822{
823 U8 * const save = s;
824 U8 * const send = s + *len;
825 U8 *d;
826
827 PERL_ARGS_ASSERT_UTF8_TO_BYTES;
828
829 /* ensure valid UTF-8 and chars < 256 before updating string */
830 while (s < send) {
831 U8 c = *s++;
832
833 if (!UTF8_IS_INVARIANT(c) &&
834 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
835 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
836 *len = ((STRLEN) -1);
837 return 0;
838 }
839 }
840
841 d = s = save;
842 while (s < send) {
843 STRLEN ulen;
844 *d++ = (U8)utf8_to_uvchr(s, &ulen);
845 s += ulen;
846 }
847 *d = '\0';
848 *len = d - save;
849 return save;
850}
851
852/*
853=for apidoc bytes_from_utf8
854
855Converts a string C<s> of length C<len> from UTF-8 into native byte encoding.
856Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
857the newly-created string, and updates C<len> to contain the new
858length. Returns the original string if no conversion occurs, C<len>
859is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
8600 if C<s> is converted or consisted entirely of characters that are invariant
861in utf8 (i.e., US-ASCII on non-EBCDIC machines).
862
863=cut
864*/
865
866U8 *
867Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
868{
869 U8 *d;
870 const U8 *start = s;
871 const U8 *send;
872 I32 count = 0;
873
874 PERL_ARGS_ASSERT_BYTES_FROM_UTF8;
875
876 PERL_UNUSED_CONTEXT;
877 if (!*is_utf8)
878 return (U8 *)start;
879
880 /* ensure valid UTF-8 and chars < 256 before converting string */
881 for (send = s + *len; s < send;) {
882 U8 c = *s++;
883 if (!UTF8_IS_INVARIANT(c)) {
884 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
885 (c = *s++) && UTF8_IS_CONTINUATION(c))
886 count++;
887 else
888 return (U8 *)start;
889 }
890 }
891
892 *is_utf8 = FALSE;
893
894 Newx(d, (*len) - count + 1, U8);
895 s = start; start = d;
896 while (s < send) {
897 U8 c = *s++;
898 if (!UTF8_IS_INVARIANT(c)) {
899 /* Then it is two-byte encoded */
900 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
901 c = ASCII_TO_NATIVE(c);
902 }
903 *d++ = c;
904 }
905 *d = '\0';
906 *len = d - start;
907 return (U8 *)start;
908}
909
910/*
911=for apidoc bytes_to_utf8
912
913Converts a string C<s> of length C<len> from the native encoding into UTF-8.
914Returns a pointer to the newly-created string, and sets C<len> to
915reflect the new length.
916
917A NUL character will be written after the end of the string.
918
919If you want to convert to UTF-8 from encodings other than
920the native (Latin1 or EBCDIC),
921see sv_recode_to_utf8().
922
923=cut
924*/
925
926U8*
927Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
928{
929 const U8 * const send = s + (*len);
930 U8 *d;
931 U8 *dst;
932
933 PERL_ARGS_ASSERT_BYTES_TO_UTF8;
934 PERL_UNUSED_CONTEXT;
935
936 Newx(d, (*len) * 2 + 1, U8);
937 dst = d;
938
939 while (s < send) {
940 const UV uv = NATIVE_TO_ASCII(*s++);
941 if (UNI_IS_INVARIANT(uv))
942 *d++ = (U8)UTF_TO_NATIVE(uv);
943 else {
944 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
945 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
946 }
947 }
948 *d = '\0';
949 *len = d-dst;
950 return dst;
951}
952
953/*
954 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
955 *
956 * Destination must be pre-extended to 3/2 source. Do not use in-place.
957 * We optimize for native, for obvious reasons. */
958
959U8*
960Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
961{
962 U8* pend;
963 U8* dstart = d;
964
965 PERL_ARGS_ASSERT_UTF16_TO_UTF8;
966
967 if (bytelen & 1)
968 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
969
970 pend = p + bytelen;
971
972 while (p < pend) {
973 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
974 p += 2;
975 if (uv < 0x80) {
976#ifdef EBCDIC
977 *d++ = UNI_TO_NATIVE(uv);
978#else
979 *d++ = (U8)uv;
980#endif
981 continue;
982 }
983 if (uv < 0x800) {
984 *d++ = (U8)(( uv >> 6) | 0xc0);
985 *d++ = (U8)(( uv & 0x3f) | 0x80);
986 continue;
987 }
988 if (uv >= 0xd800 && uv <= 0xdbff) { /* surrogates */
989 if (p >= pend) {
990 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
991 } else {
992 UV low = (p[0] << 8) + p[1];
993 p += 2;
994 if (low < 0xdc00 || low > 0xdfff)
995 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
996 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
997 }
998 } else if (uv >= 0xdc00 && uv <= 0xdfff) {
999 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
1000 }
1001 if (uv < 0x10000) {
1002 *d++ = (U8)(( uv >> 12) | 0xe0);
1003 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1004 *d++ = (U8)(( uv & 0x3f) | 0x80);
1005 continue;
1006 }
1007 else {
1008 *d++ = (U8)(( uv >> 18) | 0xf0);
1009 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
1010 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
1011 *d++ = (U8)(( uv & 0x3f) | 0x80);
1012 continue;
1013 }
1014 }
1015 *newlen = d - dstart;
1016 return d;
1017}
1018
1019/* Note: this one is slightly destructive of the source. */
1020
1021U8*
1022Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
1023{
1024 U8* s = (U8*)p;
1025 U8* const send = s + bytelen;
1026
1027 PERL_ARGS_ASSERT_UTF16_TO_UTF8_REVERSED;
1028
1029 if (bytelen & 1)
1030 Perl_croak(aTHX_ "panic: utf16_to_utf8_reversed: odd bytelen %"UVuf,
1031 (UV)bytelen);
1032
1033 while (s < send) {
1034 const U8 tmp = s[0];
1035 s[0] = s[1];
1036 s[1] = tmp;
1037 s += 2;
1038 }
1039 return utf16_to_utf8(p, d, bytelen, newlen);
1040}
1041
1042/* for now these are all defined (inefficiently) in terms of the utf8 versions */
1043
1044bool
1045Perl_is_uni_alnum(pTHX_ UV c)
1046{
1047 U8 tmpbuf[UTF8_MAXBYTES+1];
1048 uvchr_to_utf8(tmpbuf, c);
1049 return is_utf8_alnum(tmpbuf);
1050}
1051
1052bool
1053Perl_is_uni_idfirst(pTHX_ UV c)
1054{
1055 U8 tmpbuf[UTF8_MAXBYTES+1];
1056 uvchr_to_utf8(tmpbuf, c);
1057 return is_utf8_idfirst(tmpbuf);
1058}
1059
1060bool
1061Perl_is_uni_alpha(pTHX_ UV c)
1062{
1063 U8 tmpbuf[UTF8_MAXBYTES+1];
1064 uvchr_to_utf8(tmpbuf, c);
1065 return is_utf8_alpha(tmpbuf);
1066}
1067
1068bool
1069Perl_is_uni_ascii(pTHX_ UV c)
1070{
1071 U8 tmpbuf[UTF8_MAXBYTES+1];
1072 uvchr_to_utf8(tmpbuf, c);
1073 return is_utf8_ascii(tmpbuf);
1074}
1075
1076bool
1077Perl_is_uni_space(pTHX_ UV c)
1078{
1079 U8 tmpbuf[UTF8_MAXBYTES+1];
1080 uvchr_to_utf8(tmpbuf, c);
1081 return is_utf8_space(tmpbuf);
1082}
1083
1084bool
1085Perl_is_uni_digit(pTHX_ UV c)
1086{
1087 U8 tmpbuf[UTF8_MAXBYTES+1];
1088 uvchr_to_utf8(tmpbuf, c);
1089 return is_utf8_digit(tmpbuf);
1090}
1091
1092bool
1093Perl_is_uni_upper(pTHX_ UV c)
1094{
1095 U8 tmpbuf[UTF8_MAXBYTES+1];
1096 uvchr_to_utf8(tmpbuf, c);
1097 return is_utf8_upper(tmpbuf);
1098}
1099
1100bool
1101Perl_is_uni_lower(pTHX_ UV c)
1102{
1103 U8 tmpbuf[UTF8_MAXBYTES+1];
1104 uvchr_to_utf8(tmpbuf, c);
1105 return is_utf8_lower(tmpbuf);
1106}
1107
1108bool
1109Perl_is_uni_cntrl(pTHX_ UV c)
1110{
1111 U8 tmpbuf[UTF8_MAXBYTES+1];
1112 uvchr_to_utf8(tmpbuf, c);
1113 return is_utf8_cntrl(tmpbuf);
1114}
1115
1116bool
1117Perl_is_uni_graph(pTHX_ UV c)
1118{
1119 U8 tmpbuf[UTF8_MAXBYTES+1];
1120 uvchr_to_utf8(tmpbuf, c);
1121 return is_utf8_graph(tmpbuf);
1122}
1123
1124bool
1125Perl_is_uni_print(pTHX_ UV c)
1126{
1127 U8 tmpbuf[UTF8_MAXBYTES+1];
1128 uvchr_to_utf8(tmpbuf, c);
1129 return is_utf8_print(tmpbuf);
1130}
1131
1132bool
1133Perl_is_uni_punct(pTHX_ UV c)
1134{
1135 U8 tmpbuf[UTF8_MAXBYTES+1];
1136 uvchr_to_utf8(tmpbuf, c);
1137 return is_utf8_punct(tmpbuf);
1138}
1139
1140bool
1141Perl_is_uni_xdigit(pTHX_ UV c)
1142{
1143 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1144 uvchr_to_utf8(tmpbuf, c);
1145 return is_utf8_xdigit(tmpbuf);
1146}
1147
1148UV
1149Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1150{
1151 PERL_ARGS_ASSERT_TO_UNI_UPPER;
1152
1153 uvchr_to_utf8(p, c);
1154 return to_utf8_upper(p, p, lenp);
1155}
1156
1157UV
1158Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1159{
1160 PERL_ARGS_ASSERT_TO_UNI_TITLE;
1161
1162 uvchr_to_utf8(p, c);
1163 return to_utf8_title(p, p, lenp);
1164}
1165
1166UV
1167Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1168{
1169 PERL_ARGS_ASSERT_TO_UNI_LOWER;
1170
1171 uvchr_to_utf8(p, c);
1172 return to_utf8_lower(p, p, lenp);
1173}
1174
1175UV
1176Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1177{
1178 PERL_ARGS_ASSERT_TO_UNI_FOLD;
1179
1180 uvchr_to_utf8(p, c);
1181 return to_utf8_fold(p, p, lenp);
1182}
1183
1184/* for now these all assume no locale info available for Unicode > 255 */
1185
1186bool
1187Perl_is_uni_alnum_lc(pTHX_ UV c)
1188{
1189 return is_uni_alnum(c); /* XXX no locale support yet */
1190}
1191
1192bool
1193Perl_is_uni_idfirst_lc(pTHX_ UV c)
1194{
1195 return is_uni_idfirst(c); /* XXX no locale support yet */
1196}
1197
1198bool
1199Perl_is_uni_alpha_lc(pTHX_ UV c)
1200{
1201 return is_uni_alpha(c); /* XXX no locale support yet */
1202}
1203
1204bool
1205Perl_is_uni_ascii_lc(pTHX_ UV c)
1206{
1207 return is_uni_ascii(c); /* XXX no locale support yet */
1208}
1209
1210bool
1211Perl_is_uni_space_lc(pTHX_ UV c)
1212{
1213 return is_uni_space(c); /* XXX no locale support yet */
1214}
1215
1216bool
1217Perl_is_uni_digit_lc(pTHX_ UV c)
1218{
1219 return is_uni_digit(c); /* XXX no locale support yet */
1220}
1221
1222bool
1223Perl_is_uni_upper_lc(pTHX_ UV c)
1224{
1225 return is_uni_upper(c); /* XXX no locale support yet */
1226}
1227
1228bool
1229Perl_is_uni_lower_lc(pTHX_ UV c)
1230{
1231 return is_uni_lower(c); /* XXX no locale support yet */
1232}
1233
1234bool
1235Perl_is_uni_cntrl_lc(pTHX_ UV c)
1236{
1237 return is_uni_cntrl(c); /* XXX no locale support yet */
1238}
1239
1240bool
1241Perl_is_uni_graph_lc(pTHX_ UV c)
1242{
1243 return is_uni_graph(c); /* XXX no locale support yet */
1244}
1245
1246bool
1247Perl_is_uni_print_lc(pTHX_ UV c)
1248{
1249 return is_uni_print(c); /* XXX no locale support yet */
1250}
1251
1252bool
1253Perl_is_uni_punct_lc(pTHX_ UV c)
1254{
1255 return is_uni_punct(c); /* XXX no locale support yet */
1256}
1257
1258bool
1259Perl_is_uni_xdigit_lc(pTHX_ UV c)
1260{
1261 return is_uni_xdigit(c); /* XXX no locale support yet */
1262}
1263
1264U32
1265Perl_to_uni_upper_lc(pTHX_ U32 c)
1266{
1267 /* XXX returns only the first character -- do not use XXX */
1268 /* XXX no locale support yet */
1269 STRLEN len;
1270 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1271 return (U32)to_uni_upper(c, tmpbuf, &len);
1272}
1273
1274U32
1275Perl_to_uni_title_lc(pTHX_ U32 c)
1276{
1277 /* XXX returns only the first character XXX -- do not use XXX */
1278 /* XXX no locale support yet */
1279 STRLEN len;
1280 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1281 return (U32)to_uni_title(c, tmpbuf, &len);
1282}
1283
1284U32
1285Perl_to_uni_lower_lc(pTHX_ U32 c)
1286{
1287 /* XXX returns only the first character -- do not use XXX */
1288 /* XXX no locale support yet */
1289 STRLEN len;
1290 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1291 return (U32)to_uni_lower(c, tmpbuf, &len);
1292}
1293
1294static bool
1295S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1296 const char *const swashname)
1297{
1298 dVAR;
1299
1300 PERL_ARGS_ASSERT_IS_UTF8_COMMON;
1301
1302 if (!is_utf8_char(p))
1303 return FALSE;
1304 if (!*swash)
1305 *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1306 return swash_fetch(*swash, p, TRUE) != 0;
1307}
1308
1309bool
1310Perl_is_utf8_alnum(pTHX_ const U8 *p)
1311{
1312 dVAR;
1313
1314 PERL_ARGS_ASSERT_IS_UTF8_ALNUM;
1315
1316 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1317 * descendant of isalnum(3), in other words, it doesn't
1318 * contain the '_'. --jhi */
1319 return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
1320}
1321
1322bool
1323Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1324{
1325 dVAR;
1326
1327 PERL_ARGS_ASSERT_IS_UTF8_IDFIRST;
1328
1329 if (*p == '_')
1330 return TRUE;
1331 /* is_utf8_idstart would be more logical. */
1332 return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
1333}
1334
1335bool
1336Perl_is_utf8_idcont(pTHX_ const U8 *p)
1337{
1338 dVAR;
1339
1340 PERL_ARGS_ASSERT_IS_UTF8_IDCONT;
1341
1342 if (*p == '_')
1343 return TRUE;
1344 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
1345}
1346
1347bool
1348Perl_is_utf8_alpha(pTHX_ const U8 *p)
1349{
1350 dVAR;
1351
1352 PERL_ARGS_ASSERT_IS_UTF8_ALPHA;
1353
1354 return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
1355}
1356
1357bool
1358Perl_is_utf8_ascii(pTHX_ const U8 *p)
1359{
1360 dVAR;
1361
1362 PERL_ARGS_ASSERT_IS_UTF8_ASCII;
1363
1364 return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
1365}
1366
1367bool
1368Perl_is_utf8_space(pTHX_ const U8 *p)
1369{
1370 dVAR;
1371
1372 PERL_ARGS_ASSERT_IS_UTF8_SPACE;
1373
1374 return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
1375}
1376
1377bool
1378Perl_is_utf8_perl_space(pTHX_ const U8 *p)
1379{
1380 dVAR;
1381
1382 PERL_ARGS_ASSERT_IS_UTF8_PERL_SPACE;
1383
1384 return is_utf8_common(p, &PL_utf8_perl_space, "IsPerlSpace");
1385}
1386
1387bool
1388Perl_is_utf8_perl_word(pTHX_ const U8 *p)
1389{
1390 dVAR;
1391
1392 PERL_ARGS_ASSERT_IS_UTF8_PERL_WORD;
1393
1394 return is_utf8_common(p, &PL_utf8_perl_word, "IsPerlWord");
1395}
1396
1397bool
1398Perl_is_utf8_digit(pTHX_ const U8 *p)
1399{
1400 dVAR;
1401
1402 PERL_ARGS_ASSERT_IS_UTF8_DIGIT;
1403
1404 return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
1405}
1406
1407bool
1408Perl_is_utf8_posix_digit(pTHX_ const U8 *p)
1409{
1410 dVAR;
1411
1412 PERL_ARGS_ASSERT_IS_UTF8_POSIX_DIGIT;
1413
1414 return is_utf8_common(p, &PL_utf8_posix_digit, "IsPosixDigit");
1415}
1416
1417bool
1418Perl_is_utf8_upper(pTHX_ const U8 *p)
1419{
1420 dVAR;
1421
1422 PERL_ARGS_ASSERT_IS_UTF8_UPPER;
1423
1424 return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
1425}
1426
1427bool
1428Perl_is_utf8_lower(pTHX_ const U8 *p)
1429{
1430 dVAR;
1431
1432 PERL_ARGS_ASSERT_IS_UTF8_LOWER;
1433
1434 return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
1435}
1436
1437bool
1438Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1439{
1440 dVAR;
1441
1442 PERL_ARGS_ASSERT_IS_UTF8_CNTRL;
1443
1444 return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
1445}
1446
1447bool
1448Perl_is_utf8_graph(pTHX_ const U8 *p)
1449{
1450 dVAR;
1451
1452 PERL_ARGS_ASSERT_IS_UTF8_GRAPH;
1453
1454 return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
1455}
1456
1457bool
1458Perl_is_utf8_print(pTHX_ const U8 *p)
1459{
1460 dVAR;
1461
1462 PERL_ARGS_ASSERT_IS_UTF8_PRINT;
1463
1464 return is_utf8_common(p, &PL_utf8_print, "IsPrint");
1465}
1466
1467bool
1468Perl_is_utf8_punct(pTHX_ const U8 *p)
1469{
1470 dVAR;
1471
1472 PERL_ARGS_ASSERT_IS_UTF8_PUNCT;
1473
1474 return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
1475}
1476
1477bool
1478Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1479{
1480 dVAR;
1481
1482 PERL_ARGS_ASSERT_IS_UTF8_XDIGIT;
1483
1484 return is_utf8_common(p, &PL_utf8_xdigit, "IsXDigit");
1485}
1486
1487bool
1488Perl_is_utf8_mark(pTHX_ const U8 *p)
1489{
1490 dVAR;
1491
1492 PERL_ARGS_ASSERT_IS_UTF8_MARK;
1493
1494 return is_utf8_common(p, &PL_utf8_mark, "IsM");
1495}
1496
1497bool
1498Perl_is_utf8_X_begin(pTHX_ const U8 *p)
1499{
1500 dVAR;
1501
1502 PERL_ARGS_ASSERT_IS_UTF8_X_BEGIN;
1503
1504 return is_utf8_common(p, &PL_utf8_X_begin, "_X_Begin");
1505}
1506
1507bool
1508Perl_is_utf8_X_extend(pTHX_ const U8 *p)
1509{
1510 dVAR;
1511
1512 PERL_ARGS_ASSERT_IS_UTF8_X_EXTEND;
1513
1514 return is_utf8_common(p, &PL_utf8_X_extend, "_X_Extend");
1515}
1516
1517bool
1518Perl_is_utf8_X_prepend(pTHX_ const U8 *p)
1519{
1520 dVAR;
1521
1522 PERL_ARGS_ASSERT_IS_UTF8_X_PREPEND;
1523
1524 return is_utf8_common(p, &PL_utf8_X_prepend, "GCB=Prepend");
1525}
1526
1527bool
1528Perl_is_utf8_X_non_hangul(pTHX_ const U8 *p)
1529{
1530 dVAR;
1531
1532 PERL_ARGS_ASSERT_IS_UTF8_X_NON_HANGUL;
1533
1534 return is_utf8_common(p, &PL_utf8_X_non_hangul, "HST=Not_Applicable");
1535}
1536
1537bool
1538Perl_is_utf8_X_L(pTHX_ const U8 *p)
1539{
1540 dVAR;
1541
1542 PERL_ARGS_ASSERT_IS_UTF8_X_L;
1543
1544 return is_utf8_common(p, &PL_utf8_X_L, "GCB=L");
1545}
1546
1547bool
1548Perl_is_utf8_X_LV(pTHX_ const U8 *p)
1549{
1550 dVAR;
1551
1552 PERL_ARGS_ASSERT_IS_UTF8_X_LV;
1553
1554 return is_utf8_common(p, &PL_utf8_X_LV, "GCB=LV");
1555}
1556
1557bool
1558Perl_is_utf8_X_LVT(pTHX_ const U8 *p)
1559{
1560 dVAR;
1561
1562 PERL_ARGS_ASSERT_IS_UTF8_X_LVT;
1563
1564 return is_utf8_common(p, &PL_utf8_X_LVT, "GCB=LVT");
1565}
1566
1567bool
1568Perl_is_utf8_X_T(pTHX_ const U8 *p)
1569{
1570 dVAR;
1571
1572 PERL_ARGS_ASSERT_IS_UTF8_X_T;
1573
1574 return is_utf8_common(p, &PL_utf8_X_T, "GCB=T");
1575}
1576
1577bool
1578Perl_is_utf8_X_V(pTHX_ const U8 *p)
1579{
1580 dVAR;
1581
1582 PERL_ARGS_ASSERT_IS_UTF8_X_V;
1583
1584 return is_utf8_common(p, &PL_utf8_X_V, "GCB=V");
1585}
1586
1587bool
1588Perl_is_utf8_X_LV_LVT_V(pTHX_ const U8 *p)
1589{
1590 dVAR;
1591
1592 PERL_ARGS_ASSERT_IS_UTF8_X_LV_LVT_V;
1593
1594 return is_utf8_common(p, &PL_utf8_X_LV_LVT_V, "_X_LV_LVT_V");
1595}
1596
1597/*
1598=for apidoc to_utf8_case
1599
1600The "p" contains the pointer to the UTF-8 string encoding
1601the character that is being converted.
1602
1603The "ustrp" is a pointer to the character buffer to put the
1604conversion result to. The "lenp" is a pointer to the length
1605of the result.
1606
1607The "swashp" is a pointer to the swash to use.
1608
1609Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1610and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
1611but not always, a multicharacter mapping), is tried first.
1612
1613The "special" is a string like "utf8::ToSpecLower", which means the
1614hash %utf8::ToSpecLower. The access to the hash is through
1615Perl_to_utf8_case().
1616
1617The "normal" is a string like "ToLower" which means the swash
1618%utf8::ToLower.
1619
1620=cut */
1621
1622UV
1623Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1624 SV **swashp, const char *normal, const char *special)
1625{
1626 dVAR;
1627 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1628 STRLEN len = 0;
1629 const UV uv0 = utf8_to_uvchr(p, NULL);
1630 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1631 * are necessary in EBCDIC, they are redundant no-ops
1632 * in ASCII-ish platforms, and hopefully optimized away. */
1633 const UV uv1 = NATIVE_TO_UNI(uv0);
1634
1635 PERL_ARGS_ASSERT_TO_UTF8_CASE;
1636
1637 uvuni_to_utf8(tmpbuf, uv1);
1638
1639 if (!*swashp) /* load on-demand */
1640 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1641 /* This is the beginnings of a skeleton of code to read the info section
1642 * that is in all the swashes in case we ever want to do that, so one can
1643 * read things whose maps aren't code points, and whose default if missing
1644 * is not to the code point itself. This was just to see if it actually
1645 * worked. Details on what the possibilities are are in perluniprops.pod
1646 HV * const hv = get_hv("utf8::SwashInfo", 0);
1647 if (hv) {
1648 SV **svp;
1649 svp = hv_fetch(hv, (const char*)normal, strlen(normal), FALSE);
1650 const char *s;
1651
1652 HV * const this_hash = SvRV(*svp);
1653 svp = hv_fetch(this_hash, "type", strlen("type"), FALSE);
1654 s = SvPV_const(*svp, len);
1655 }
1656 }*/
1657
1658 if (special) {
1659 /* It might be "special" (sometimes, but not always,
1660 * a multicharacter mapping) */
1661 HV * const hv = get_hv(special, 0);
1662 SV **svp;
1663
1664 if (hv &&
1665 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1666 (*svp)) {
1667 const char *s;
1668
1669 s = SvPV_const(*svp, len);
1670 if (len == 1)
1671 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1672 else {
1673#ifdef EBCDIC
1674 /* If we have EBCDIC we need to remap the characters
1675 * since any characters in the low 256 are Unicode
1676 * code points, not EBCDIC. */
1677 U8 *t = (U8*)s, *tend = t + len, *d;
1678
1679 d = tmpbuf;
1680 if (SvUTF8(*svp)) {
1681 STRLEN tlen = 0;
1682
1683 while (t < tend) {
1684 const UV c = utf8_to_uvchr(t, &tlen);
1685 if (tlen > 0) {
1686 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1687 t += tlen;
1688 }
1689 else
1690 break;
1691 }
1692 }
1693 else {
1694 while (t < tend) {
1695 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1696 t++;
1697 }
1698 }
1699 len = d - tmpbuf;
1700 Copy(tmpbuf, ustrp, len, U8);
1701#else
1702 Copy(s, ustrp, len, U8);
1703#endif
1704 }
1705 }
1706 }
1707
1708 if (!len && *swashp) {
1709 const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1710
1711 if (uv2) {
1712 /* It was "normal" (a single character mapping). */
1713 const UV uv3 = UNI_TO_NATIVE(uv2);
1714 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1715 }
1716 }
1717
1718 if (!len) /* Neither: just copy. In other words, there was no mapping
1719 defined, which means that the code point maps to itself */
1720 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1721
1722 if (lenp)
1723 *lenp = len;
1724
1725 return len ? utf8_to_uvchr(ustrp, 0) : 0;
1726}
1727
1728/*
1729=for apidoc to_utf8_upper
1730
1731Convert the UTF-8 encoded character at p to its uppercase version and
1732store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1733that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1734the uppercase version may be longer than the original character.
1735
1736The first character of the uppercased version is returned
1737(but note, as explained above, that there may be more.)
1738
1739=cut */
1740
1741UV
1742Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1743{
1744 dVAR;
1745
1746 PERL_ARGS_ASSERT_TO_UTF8_UPPER;
1747
1748 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1749 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1750}
1751
1752/*
1753=for apidoc to_utf8_title
1754
1755Convert the UTF-8 encoded character at p to its titlecase version and
1756store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1757that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1758titlecase version may be longer than the original character.
1759
1760The first character of the titlecased version is returned
1761(but note, as explained above, that there may be more.)
1762
1763=cut */
1764
1765UV
1766Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1767{
1768 dVAR;
1769
1770 PERL_ARGS_ASSERT_TO_UTF8_TITLE;
1771
1772 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1773 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1774}
1775
1776/*
1777=for apidoc to_utf8_lower
1778
1779Convert the UTF-8 encoded character at p to its lowercase version and
1780store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1781that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1782lowercase version may be longer than the original character.
1783
1784The first character of the lowercased version is returned
1785(but note, as explained above, that there may be more.)
1786
1787=cut */
1788
1789UV
1790Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1791{
1792 dVAR;
1793
1794 PERL_ARGS_ASSERT_TO_UTF8_LOWER;
1795
1796 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1797 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1798}
1799
1800/*
1801=for apidoc to_utf8_fold
1802
1803Convert the UTF-8 encoded character at p to its foldcase version and
1804store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1805that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1806foldcase version may be longer than the original character (up to
1807three characters).
1808
1809The first character of the foldcased version is returned
1810(but note, as explained above, that there may be more.)
1811
1812=cut */
1813
1814UV
1815Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1816{
1817 dVAR;
1818
1819 PERL_ARGS_ASSERT_TO_UTF8_FOLD;
1820
1821 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1822 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1823}
1824
1825/* Note:
1826 * A "swash" is a swatch hash.
1827 * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1828 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1829 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1830 */
1831SV*
1832Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1833{
1834 dVAR;
1835 SV* retval;
1836 dSP;
1837 const size_t pkg_len = strlen(pkg);
1838 const size_t name_len = strlen(name);
1839 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
1840 SV* errsv_save;
1841
1842 PERL_ARGS_ASSERT_SWASH_INIT;
1843
1844 PUSHSTACKi(PERLSI_MAGIC);
1845 ENTER;
1846 SAVEHINTS();
1847 save_re_context();
1848 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1849 ENTER;
1850 errsv_save = newSVsv(ERRSV);
1851 /* It is assumed that callers of this routine are not passing in any
1852 user derived data. */
1853 /* Need to do this after save_re_context() as it will set PL_tainted to
1854 1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1855 Even line to create errsv_save can turn on PL_tainted. */
1856 SAVEBOOL(PL_tainted);
1857 PL_tainted = 0;
1858 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1859 NULL);
1860 if (!SvTRUE(ERRSV))
1861 sv_setsv(ERRSV, errsv_save);
1862 SvREFCNT_dec(errsv_save);
1863 LEAVE;
1864 }
1865 SPAGAIN;
1866 PUSHMARK(SP);
1867 EXTEND(SP,5);
1868 mPUSHp(pkg, pkg_len);
1869 mPUSHp(name, name_len);
1870 PUSHs(listsv);
1871 mPUSHi(minbits);
1872 mPUSHi(none);
1873 PUTBACK;
1874 errsv_save = newSVsv(ERRSV);
1875 if (call_method("SWASHNEW", G_SCALAR))
1876 retval = newSVsv(*PL_stack_sp--);
1877 else
1878 retval = &PL_sv_undef;
1879 if (!SvTRUE(ERRSV))
1880 sv_setsv(ERRSV, errsv_save);
1881 SvREFCNT_dec(errsv_save);
1882 LEAVE;
1883 POPSTACK;
1884 if (IN_PERL_COMPILETIME) {
1885 CopHINTS_set(PL_curcop, PL_hints);
1886 }
1887 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1888 if (SvPOK(retval))
1889 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1890 SVfARG(retval));
1891 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1892 }
1893 return retval;
1894}
1895
1896
1897/* This API is wrong for special case conversions since we may need to
1898 * return several Unicode characters for a single Unicode character
1899 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1900 * the lower-level routine, and it is similarly broken for returning
1901 * multiple values. --jhi */
/* SWASHGET has since been recast as S_swash_get in this file. */
1903
1904/* Note:
1905 * Returns the value of property/mapping C<swash> for the first character
1906 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1907 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1908 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1909 */
1910UV
1911Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1912{
1913 dVAR;
1914 HV *const hv = MUTABLE_HV(SvRV(swash));
1915 U32 klen;
1916 U32 off;
1917 STRLEN slen;
1918 STRLEN needents;
1919 const U8 *tmps = NULL;
1920 U32 bit;
1921 SV *swatch;
1922 U8 tmputf8[2];
1923 const UV c = NATIVE_TO_ASCII(*ptr);
1924
1925 PERL_ARGS_ASSERT_SWASH_FETCH;
1926
1927 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1928 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1929 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1930 ptr = tmputf8;
1931 }
1932 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1933 * then the "swatch" is a vec() for all the chars which start
1934 * with 0xAA..0xYY
1935 * So the key in the hash (klen) is length of encoded char -1
1936 */
1937 klen = UTF8SKIP(ptr) - 1;
1938 off = ptr[klen];
1939
1940 if (klen == 0) {
1941 /* If char is invariant then swatch is for all the invariant chars
1942 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1943 */
1944 needents = UTF_CONTINUATION_MARK;
1945 off = NATIVE_TO_UTF(ptr[klen]);
1946 }
1947 else {
1948 /* If char is encoded then swatch is for the prefix */
1949 needents = (1 << UTF_ACCUMULATION_SHIFT);
1950 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1951 }
1952
1953 /*
1954 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1955 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1956 * it's nothing to sniff at.) Pity we usually come through at least
1957 * two function calls to get here...
1958 *
1959 * NB: this code assumes that swatches are never modified, once generated!
1960 */
1961
1962 if (hv == PL_last_swash_hv &&
1963 klen == PL_last_swash_klen &&
1964 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1965 {
1966 tmps = PL_last_swash_tmps;
1967 slen = PL_last_swash_slen;
1968 }
1969 else {
1970 /* Try our second-level swatch cache, kept in a hash. */
1971 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1972
1973 /* If not cached, generate it via swash_get */
1974 if (!svp || !SvPOK(*svp)
1975 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1976 /* We use utf8n_to_uvuni() as we want an index into
1977 Unicode tables, not a native character number.
1978 */
1979 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1980 ckWARN(WARN_UTF8) ?
1981 0 : UTF8_ALLOW_ANY);
1982 swatch = swash_get(swash,
1983 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1984 (klen) ? (code_point & ~(needents - 1)) : 0,
1985 needents);
1986
1987 if (IN_PERL_COMPILETIME)
1988 CopHINTS_set(PL_curcop, PL_hints);
1989
1990 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1991
1992 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1993 || (slen << 3) < needents)
1994 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1995 }
1996
1997 PL_last_swash_hv = hv;
1998 assert(klen <= sizeof(PL_last_swash_key));
1999 PL_last_swash_klen = (U8)klen;
2000 /* FIXME change interpvar.h? */
2001 PL_last_swash_tmps = (U8 *) tmps;
2002 PL_last_swash_slen = slen;
2003 if (klen)
2004 Copy(ptr, PL_last_swash_key, klen, U8);
2005 }
2006
2007 switch ((int)((slen << 3) / needents)) {
2008 case 1:
2009 bit = 1 << (off & 7);
2010 off >>= 3;
2011 return (tmps[off] & bit) != 0;
2012 case 8:
2013 return tmps[off];
2014 case 16:
2015 off <<= 1;
2016 return (tmps[off] << 8) + tmps[off + 1] ;
2017 case 32:
2018 off <<= 2;
2019 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
2020 }
2021 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
2022 NORETURN_FUNCTION_END;
2023}
2024
2025/* Note:
2026 * Returns a swatch (a bit vector string) for a code point sequence
2027 * that starts from the value C<start> and comprises the number C<span>.
2028 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
2029 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
2030 */
2031STATIC SV*
2032S_swash_get(pTHX_ SV* swash, UV start, UV span)
2033{
2034 SV *swatch;
2035 U8 *l, *lend, *x, *xend, *s;
2036 STRLEN lcur, xcur, scur;
2037 HV *const hv = MUTABLE_HV(SvRV(swash));
2038 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
2039 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
2040 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
2041 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
2042 SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
2043 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
2044 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
2045 const STRLEN bits = SvUV(*bitssvp);
2046 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
2047 const UV none = SvUV(*nonesvp);
2048 const UV end = start + span;
2049
2050 PERL_ARGS_ASSERT_SWASH_GET;
2051
2052 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
2053 Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
2054 (UV)bits);
2055 }
2056
2057 /* create and initialize $swatch */
2058 scur = octets ? (span * octets) : (span + 7) / 8;
2059 swatch = newSV(scur);
2060 SvPOK_on(swatch);
2061 s = (U8*)SvPVX(swatch);
2062 if (octets && none) {
2063 const U8* const e = s + scur;
2064 while (s < e) {
2065 if (bits == 8)
2066 *s++ = (U8)(none & 0xff);
2067 else if (bits == 16) {
2068 *s++ = (U8)((none >> 8) & 0xff);
2069 *s++ = (U8)( none & 0xff);
2070 }
2071 else if (bits == 32) {
2072 *s++ = (U8)((none >> 24) & 0xff);
2073 *s++ = (U8)((none >> 16) & 0xff);
2074 *s++ = (U8)((none >> 8) & 0xff);
2075 *s++ = (U8)( none & 0xff);
2076 }
2077 }
2078 *s = '\0';
2079 }
2080 else {
2081 (void)memzero((U8*)s, scur + 1);
2082 }
2083 SvCUR_set(swatch, scur);
2084 s = (U8*)SvPVX(swatch);
2085
2086 /* read $swash->{LIST} */
2087 l = (U8*)SvPV(*listsvp, lcur);
2088 lend = l + lcur;
2089 while (l < lend) {
2090 UV min, max, val;
2091 STRLEN numlen;
2092 I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
2093
2094 U8* const nl = (U8*)memchr(l, '\n', lend - l);
2095
2096 numlen = lend - l;
2097 min = grok_hex((char *)l, &numlen, &flags, NULL);
2098 if (numlen)
2099 l += numlen;
2100 else if (nl) {
2101 l = nl + 1; /* 1 is length of "\n" */
2102 continue;
2103 }
2104 else {
2105 l = lend; /* to LIST's end at which \n is not found */
2106 break;
2107 }
2108
2109 if (isBLANK(*l)) {
2110 ++l;
2111 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
2112 numlen = lend - l;
2113 max = grok_hex((char *)l, &numlen, &flags, NULL);
2114 if (numlen)
2115 l += numlen;
2116 else
2117 max = min;
2118
2119 if (octets) {
2120 if (isBLANK(*l)) {
2121 ++l;
2122 flags = PERL_SCAN_SILENT_ILLDIGIT |
2123 PERL_SCAN_DISALLOW_PREFIX;
2124 numlen = lend - l;
2125 val = grok_hex((char *)l, &numlen, &flags, NULL);
2126 if (numlen)
2127 l += numlen;
2128 else
2129 val = 0;
2130 }
2131 else {
2132 val = 0;
2133 if (typeto) {
2134 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
2135 typestr, l);
2136 }
2137 }
2138 }
2139 else
2140 val = 0; /* bits == 1, then val should be ignored */
2141 }
2142 else {
2143 max = min;
2144 if (octets) {
2145 val = 0;
2146 if (typeto) {
2147 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
2148 }
2149 }
2150 else
2151 val = 0; /* bits == 1, then val should be ignored */
2152 }
2153
2154 if (nl)
2155 l = nl + 1;
2156 else
2157 l = lend;
2158
2159 if (max < start)
2160 continue;
2161
2162 if (octets) {
2163 UV key;
2164 if (min < start) {
2165 if (!none || val < none) {
2166 val += start - min;
2167 }
2168 min = start;
2169 }
2170 for (key = min; key <= max; key++) {
2171 STRLEN offset;
2172 if (key >= end)
2173 goto go_out_list;
2174 /* offset must be non-negative (start <= min <= key < end) */
2175 offset = octets * (key - start);
2176 if (bits == 8)
2177 s[offset] = (U8)(val & 0xff);
2178 else if (bits == 16) {
2179 s[offset ] = (U8)((val >> 8) & 0xff);
2180 s[offset + 1] = (U8)( val & 0xff);
2181 }
2182 else if (bits == 32) {
2183 s[offset ] = (U8)((val >> 24) & 0xff);
2184 s[offset + 1] = (U8)((val >> 16) & 0xff);
2185 s[offset + 2] = (U8)((val >> 8) & 0xff);
2186 s[offset + 3] = (U8)( val & 0xff);
2187 }
2188
2189 if (!none || val < none)
2190 ++val;
2191 }
2192 }
2193 else { /* bits == 1, then val should be ignored */
2194 UV key;
2195 if (min < start)
2196 min = start;
2197 for (key = min; key <= max; key++) {
2198 const STRLEN offset = (STRLEN)(key - start);
2199 if (key >= end)
2200 goto go_out_list;
2201 s[offset >> 3] |= 1 << (offset & 7);
2202 }
2203 }
2204 } /* while */
2205 go_out_list:
2206
2207 /* read $swash->{EXTRAS} */
2208 x = (U8*)SvPV(*extssvp, xcur);
2209 xend = x + xcur;
2210 while (x < xend) {
2211 STRLEN namelen;
2212 U8 *namestr;
2213 SV** othersvp;
2214 HV* otherhv;
2215 STRLEN otherbits;
2216 SV **otherbitssvp, *other;
2217 U8 *s, *o, *nl;
2218 STRLEN slen, olen;
2219
2220 const U8 opc = *x++;
2221 if (opc == '\n')
2222 continue;
2223
2224 nl = (U8*)memchr(x, '\n', xend - x);
2225
2226 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
2227 if (nl) {
2228 x = nl + 1; /* 1 is length of "\n" */
2229 continue;
2230 }
2231 else {
2232 x = xend; /* to EXTRAS' end at which \n is not found */
2233 break;
2234 }
2235 }
2236
2237 namestr = x;
2238 if (nl) {
2239 namelen = nl - namestr;
2240 x = nl + 1;
2241 }
2242 else {
2243 namelen = xend - namestr;
2244 x = xend;
2245 }
2246
2247 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
2248 otherhv = MUTABLE_HV(SvRV(*othersvp));
2249 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
2250 otherbits = (STRLEN)SvUV(*otherbitssvp);
2251 if (bits < otherbits)
2252 Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
2253
2254 /* The "other" swatch must be destroyed after. */
2255 other = swash_get(*othersvp, start, span);
2256 o = (U8*)SvPV(other, olen);
2257
2258 if (!olen)
2259 Perl_croak(aTHX_ "panic: swash_get got improper swatch");
2260
2261 s = (U8*)SvPV(swatch, slen);
2262 if (bits == 1 && otherbits == 1) {
2263 if (slen != olen)
2264 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
2265
2266 switch (opc) {
2267 case '+':
2268 while (slen--)
2269 *s++ |= *o++;
2270 break;
2271 case '!':
2272 while (slen--)
2273 *s++ |= ~*o++;
2274 break;
2275 case '-':
2276 while (slen--)
2277 *s++ &= ~*o++;
2278 break;
2279 case '&':
2280 while (slen--)
2281 *s++ &= *o++;
2282 break;
2283 default:
2284 break;
2285 }
2286 }
2287 else {
2288 STRLEN otheroctets = otherbits >> 3;
2289 STRLEN offset = 0;
2290 U8* const send = s + slen;
2291
2292 while (s < send) {
2293 UV otherval = 0;
2294
2295 if (otherbits == 1) {
2296 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2297 ++offset;
2298 }
2299 else {
2300 STRLEN vlen = otheroctets;
2301 otherval = *o++;
2302 while (--vlen) {
2303 otherval <<= 8;
2304 otherval |= *o++;
2305 }
2306 }
2307
2308 if (opc == '+' && otherval)
2309 NOOP; /* replace with otherval */
2310 else if (opc == '!' && !otherval)
2311 otherval = 1;
2312 else if (opc == '-' && otherval)
2313 otherval = 0;
2314 else if (opc == '&' && !otherval)
2315 otherval = 0;
2316 else {
2317 s += octets; /* no replacement */
2318 continue;
2319 }
2320
2321 if (bits == 8)
2322 *s++ = (U8)( otherval & 0xff);
2323 else if (bits == 16) {
2324 *s++ = (U8)((otherval >> 8) & 0xff);
2325 *s++ = (U8)( otherval & 0xff);
2326 }
2327 else if (bits == 32) {
2328 *s++ = (U8)((otherval >> 24) & 0xff);
2329 *s++ = (U8)((otherval >> 16) & 0xff);
2330 *s++ = (U8)((otherval >> 8) & 0xff);
2331 *s++ = (U8)( otherval & 0xff);
2332 }
2333 }
2334 }
2335 sv_free(other); /* through with it! */
2336 } /* while */
2337 return swatch;
2338}
2339
2340/*
2341=for apidoc uvchr_to_utf8
2342
2343Adds the UTF-8 representation of the Native codepoint C<uv> to the end
of the string C<d>; C<d> should have at least C<UTF8_MAXBYTES+1> free
2345bytes available. The return value is the pointer to the byte after the
2346end of the new character. In other words,
2347
2348 d = uvchr_to_utf8(d, uv);
2349
2350is the recommended wide native character-aware way of saying
2351
2352 *(d++) = uv;
2353
2354=cut
2355*/
2356
2357/* On ASCII machines this is normally a macro but we want a
2358 real function in case XS code wants it
2359*/
2360U8 *
2361Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2362{
2363 PERL_ARGS_ASSERT_UVCHR_TO_UTF8;
2364
2365 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2366}
2367
2368U8 *
2369Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2370{
2371 PERL_ARGS_ASSERT_UVCHR_TO_UTF8_FLAGS;
2372
2373 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2374}
2375
2376/*
=for apidoc utf8n_to_uvchr
2379
2380Returns the native character value of the first character in the string
2381C<s>
2382which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2383length, in bytes, of that character.
2384
2385Allows length and flags to be passed to low level routine.
2386
2387=cut
2388*/
2389/* On ASCII machines this is normally a macro but we want
2390 a real function in case XS code wants it
2391*/
2392UV
2393Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2394U32 flags)
2395{
2396 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2397
2398 PERL_ARGS_ASSERT_UTF8N_TO_UVCHR;
2399
2400 return UNI_TO_NATIVE(uv);
2401}
2402
2403/*
2404=for apidoc pv_uni_display
2405
2406Build to the scalar dsv a displayable version of the string spv,
2407length len, the displayable version being at most pvlim bytes long
2408(if longer, the rest is truncated and "..." will be appended).
2409
2410The flags argument can have UNI_DISPLAY_ISPRINT set to display
2411isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2412to display the \\[nrfta\\] as the backslashed versions (like '\n')
2413(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2414UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2415UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2416
2417The pointer to the PV of the dsv is returned.
2418
2419=cut */
2420char *
2421Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2422{
2423 int truncated = 0;
2424 const char *s, *e;
2425
2426 PERL_ARGS_ASSERT_PV_UNI_DISPLAY;
2427
2428 sv_setpvs(dsv, "");
2429 SvUTF8_off(dsv);
2430 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2431 UV u;
2432 /* This serves double duty as a flag and a character to print after
2433 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2434 */
2435 char ok = 0;
2436
2437 if (pvlim && SvCUR(dsv) >= pvlim) {
2438 truncated++;
2439 break;
2440 }
2441 u = utf8_to_uvchr((U8*)s, 0);
2442 if (u < 256) {
2443 const unsigned char c = (unsigned char)u & 0xFF;
2444 if (flags & UNI_DISPLAY_BACKSLASH) {
2445 switch (c) {
2446 case '\n':
2447 ok = 'n'; break;
2448 case '\r':
2449 ok = 'r'; break;
2450 case '\t':
2451 ok = 't'; break;
2452 case '\f':
2453 ok = 'f'; break;
2454 case '\a':
2455 ok = 'a'; break;
2456 case '\\':
2457 ok = '\\'; break;
2458 default: break;
2459 }
2460 if (ok) {
2461 const char string = ok;
2462 sv_catpvs(dsv, "\\");
2463 sv_catpvn(dsv, &string, 1);
2464 }
2465 }
2466 /* isPRINT() is the locale-blind version. */
2467 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2468 const char string = c;
2469 sv_catpvn(dsv, &string, 1);
2470 ok = 1;
2471 }
2472 }
2473 if (!ok)
2474 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2475 }
2476 if (truncated)
2477 sv_catpvs(dsv, "...");
2478
2479 return SvPVX(dsv);
2480}
2481
2482/*
2483=for apidoc sv_uni_display
2484
2485Build to the scalar dsv a displayable version of the scalar sv,
2486the displayable version being at most pvlim bytes long
2487(if longer, the rest is truncated and "..." will be appended).
2488
2489The flags argument is as in pv_uni_display().
2490
2491The pointer to the PV of the dsv is returned.
2492
2493=cut
2494*/
2495char *
2496Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2497{
2498 PERL_ARGS_ASSERT_SV_UNI_DISPLAY;
2499
2500 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2501 SvCUR(ssv), pvlim, flags);
2502}
2503
2504/*
2505=for apidoc foldEQ_utf8
2506
2507Returns true if the leading portions of the strings s1 and s2 (either or both
2508of which may be in UTF-8) are the same case-insensitively; false otherwise.
2509How far into the strings to compare is determined by other input parameters.
2510
2511If u1 is true, the string s1 is assumed to be in UTF-8-encoded Unicode;
2512otherwise it is assumed to be in native 8-bit encoding. Correspondingly for u2
2513with respect to s2.
2514
2515If the byte length l1 is non-zero, it says how far into s1 to check for fold
2516equality. In other words, s1+l1 will be used as a goal to reach. The
2517scan will not be considered to be a match unless the goal is reached, and
2518scanning won't continue past that goal. Correspondingly for l2 with respect to
2519s2.
2520
2521If pe1 is non-NULL and the pointer it points to is not NULL, that pointer is
2522considered an end pointer beyond which scanning of s1 will not continue under
2523any circumstances. This means that if both l1 and pe1 are specified, and pe1
2524is less than s1+l1, the match will never be successful because it can never
2525get as far as its goal (and in fact is asserted against). Correspondingly for
2526pe2 with respect to s2.
2527
2528At least one of s1 and s2 must have a goal (at least one of l1 and l2 must be
2529non-zero), and if both do, both have to be
2530reached for a successful match. Also, if the fold of a character is multiple
2531characters, all of them must be matched (see tr21 reference below for
2532'folding').
2533
2534Upon a successful match, if pe1 is non-NULL,
2535it will be set to point to the beginning of the I<next> character of s1 beyond
2536what was matched. Correspondingly for pe2 and s2.
2537
2538For case-insensitiveness, the "casefolding" of Unicode is used
2539instead of upper/lowercasing both the characters, see
2540http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2541
2542=cut */
2543I32
2544Perl_foldEQ_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2545{
2546 dVAR;
2547 register const U8 *p1 = (const U8*)s1; /* Point to current char */
2548 register const U8 *p2 = (const U8*)s2;
2549 register const U8 *g1 = NULL; /* goal for s1 */
2550 register const U8 *g2 = NULL;
2551 register const U8 *e1 = NULL; /* Don't scan s1 past this */
2552 register U8 *f1 = NULL; /* Point to current folded */
2553 register const U8 *e2 = NULL;
2554 register U8 *f2 = NULL;
2555 STRLEN n1 = 0, n2 = 0; /* Number of bytes in current char */
2556 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2557 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2558 U8 natbuf[2]; /* Holds native 8-bit char converted to utf8;
2559 these always fit in 2 bytes */
2560
2561 PERL_ARGS_ASSERT_FOLDEQ_UTF8;
2562
2563 if (pe1) {
2564 e1 = *(U8**)pe1;
2565 }
2566
2567 if (l1) {
2568 g1 = (const U8*)s1 + l1;
2569 }
2570
2571 if (pe2) {
2572 e2 = *(U8**)pe2;
2573 }
2574
2575 if (l2) {
2576 g2 = (const U8*)s2 + l2;
2577 }
2578
2579 /* Must have at least one goal */
2580 assert(g1 || g2);
2581
2582 if (g1) {
2583
2584 /* Will never match if goal is out-of-bounds */
2585 assert(! e1 || e1 >= g1);
2586
2587 /* Here, there isn't an end pointer, or it is beyond the goal. We
2588 * only go as far as the goal */
2589 e1 = g1;
2590 }
2591 else {
2592 assert(e1); /* Must have an end for looking at s1 */
2593 }
2594
2595 /* Same for goal for s2 */
2596 if (g2) {
2597 assert(! e2 || e2 >= g2);
2598 e2 = g2;
2599 }
2600 else {
2601 assert(e2);
2602 }
2603
2604 /* Look through both strings, a character at a time */
2605 while (p1 < e1 && p2 < e2) {
2606
2607 /* If at the beginning of a new character in s1, get its fold to use
2608 * and the length of the fold */
2609 if (n1 == 0) {
2610 if (u1) {
2611 to_utf8_fold(p1, foldbuf1, &n1);
2612 }
2613 else { /* Not utf8, convert to it first and then get fold */
2614 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2615 to_utf8_fold(natbuf, foldbuf1, &n1);
2616 }
2617 f1 = foldbuf1;
2618 }
2619
2620 if (n2 == 0) { /* Same for s2 */
2621 if (u2) {
2622 to_utf8_fold(p2, foldbuf2, &n2);
2623 }
2624 else {
2625 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2626 to_utf8_fold(natbuf, foldbuf2, &n2);
2627 }
2628 f2 = foldbuf2;
2629 }
2630
2631 /* While there is more to look for in both folds, see if they
2632 * continue to match */
2633 while (n1 && n2) {
2634 U8 fold_length = UTF8SKIP(f1);
2635 if (fold_length != UTF8SKIP(f2)
2636 || (fold_length == 1 && *f1 != *f2) /* Short circuit memNE
2637 function call for single
2638 character */
2639 || memNE((char*)f1, (char*)f2, fold_length))
2640 {
2641 return 0; /* mismatch */
2642 }
2643
2644 /* Here, they matched, advance past them */
2645 n1 -= fold_length;
2646 f1 += fold_length;
2647 n2 -= fold_length;
2648 f2 += fold_length;
2649 }
2650
2651 /* When reach the end of any fold, advance the input past it */
2652 if (n1 == 0) {
2653 p1 += u1 ? UTF8SKIP(p1) : 1;
2654 }
2655 if (n2 == 0) {
2656 p2 += u2 ? UTF8SKIP(p2) : 1;
2657 }
2658 } /* End of loop through both strings */
2659
2660 /* A match is defined by each scan that specified an explicit length
2661 * reaching its final goal, and the other not having matched a partial
2662 * character (which can happen when the fold of a character is more than one
2663 * character). */
2664 if (! ((g1 == 0 || p1 == g1) && (g2 == 0 || p2 == g2)) || n1 || n2) {
2665 return 0;
2666 }
2667
2668 /* Successful match. Set output pointers */
2669 if (pe1) {
2670 *pe1 = (char*)p1;
2671 }
2672 if (pe2) {
2673 *pe2 = (char*)p2;
2674 }
2675 return 1;
2676}
2677
2678/*
2679 * Local variables:
2680 * c-indentation-style: bsd
2681 * c-basic-offset: 4
2682 * indent-tabs-mode: t
2683 * End:
2684 *
2685 * ex: set ts=8 sts=4 sw=4 noet:
2686 */