This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Test that constant overloading is propagated into eval
[perl5.git] / utf8.c
... / ...
CommitLineData
1/* utf8.c
2 *
3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
4 * others
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
19 *
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
22 */
23
24#include "EXTERN.h"
25#define PERL_IN_UTF8_C
26#include "perl.h"
27
28static const char unees[] =
29 "Malformed UTF-8 character (unexpected end of string)";
30
31/*
32=head1 Unicode Support
33
34This file contains various utility functions for manipulating UTF8-encoded
35strings. For the uninitiated, this is a method of representing arbitrary
36Unicode characters as a variable number of bytes, in such a way that
37characters in the ASCII range are unmodified, and a zero byte never appears
38within non-zero characters.
39
40=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
41
42Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
43of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
44bytes available. The return value is the pointer to the byte after the
45end of the new character. In other words,
46
47 d = uvuni_to_utf8_flags(d, uv, flags);
48
49or, in most cases,
50
51 d = uvuni_to_utf8(d, uv);
52
53(which is equivalent to)
54
55 d = uvuni_to_utf8_flags(d, uv, 0);
56
57is the recommended Unicode-aware way of saying
58
59 *(d++) = uv;
60
61=cut
62*/
63
64U8 *
65Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
66{
67 if (ckWARN(WARN_UTF8)) {
68 if (UNICODE_IS_SURROGATE(uv) &&
69 !(flags & UNICODE_ALLOW_SURROGATE))
70 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
71 else if (
72 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
73 !(flags & UNICODE_ALLOW_FDD0))
74 ||
75 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
76 !(flags & UNICODE_ALLOW_FFFF))) &&
77 /* UNICODE_ALLOW_SUPER includes
78 * FFFEs and FFFFs beyond 0x10FFFF. */
79 ((uv <= PERL_UNICODE_MAX) ||
80 !(flags & UNICODE_ALLOW_SUPER))
81 )
82 Perl_warner(aTHX_ packWARN(WARN_UTF8),
83 "Unicode character 0x%04"UVxf" is illegal", uv);
84 }
85 if (UNI_IS_INVARIANT(uv)) {
86 *d++ = (U8)UTF_TO_NATIVE(uv);
87 return d;
88 }
89#if defined(EBCDIC)
90 else {
91 STRLEN len = UNISKIP(uv);
92 U8 *p = d+len-1;
93 while (p > d) {
94 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
95 uv >>= UTF_ACCUMULATION_SHIFT;
96 }
97 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
98 return d+len;
99 }
100#else /* Non loop style */
101 if (uv < 0x800) {
102 *d++ = (U8)(( uv >> 6) | 0xc0);
103 *d++ = (U8)(( uv & 0x3f) | 0x80);
104 return d;
105 }
106 if (uv < 0x10000) {
107 *d++ = (U8)(( uv >> 12) | 0xe0);
108 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (U8)(( uv & 0x3f) | 0x80);
110 return d;
111 }
112 if (uv < 0x200000) {
113 *d++ = (U8)(( uv >> 18) | 0xf0);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
117 return d;
118 }
119 if (uv < 0x4000000) {
120 *d++ = (U8)(( uv >> 24) | 0xf8);
121 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
124 *d++ = (U8)(( uv & 0x3f) | 0x80);
125 return d;
126 }
127 if (uv < 0x80000000) {
128 *d++ = (U8)(( uv >> 30) | 0xfc);
129 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
133 *d++ = (U8)(( uv & 0x3f) | 0x80);
134 return d;
135 }
136#ifdef HAS_QUAD
137 if (uv < UTF8_QUAD_MAX)
138#endif
139 {
140 *d++ = 0xfe; /* Can't match U+FEFF! */
141 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
142 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
143 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
144 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
145 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
146 *d++ = (U8)(( uv & 0x3f) | 0x80);
147 return d;
148 }
149#ifdef HAS_QUAD
150 {
151 *d++ = 0xff; /* Can't match U+FFFE! */
152 *d++ = 0x80; /* 6 Reserved bits */
153 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
154 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
155 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
156 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
157 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
163 *d++ = (U8)(( uv & 0x3f) | 0x80);
164 return d;
165 }
166#endif
167#endif /* Loop style */
168}
169
170/*
171
172Tests if some arbitrary number of bytes begins in a valid UTF-8
173character. Note that an INVARIANT (i.e. ASCII) character is a valid
174UTF-8 character. The actual number of bytes in the UTF-8 character
175will be returned if it is valid, otherwise 0.
176
177This is the "slow" version as opposed to the "fast" version which is
178the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
179difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
180or less you should use the IS_UTF8_CHAR(), for lengths of five or more
181you should use the _slow(). In practice this means that the _slow()
182will be used very rarely, since the maximum Unicode code point (as of
183Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
184the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
185five bytes or more.
186
187=cut */
188STATIC STRLEN
189S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
190{
191 U8 u = *s;
192 STRLEN slen;
193 UV uv, ouv;
194
195 if (UTF8_IS_INVARIANT(u))
196 return 1;
197
198 if (!UTF8_IS_START(u))
199 return 0;
200
201 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
202 return 0;
203
204 slen = len - 1;
205 s++;
206#ifdef EBCDIC
207 u = NATIVE_TO_UTF(u);
208#endif
209 u &= UTF_START_MASK(len);
210 uv = u;
211 ouv = uv;
212 while (slen--) {
213 if (!UTF8_IS_CONTINUATION(*s))
214 return 0;
215 uv = UTF8_ACCUMULATE(uv, *s);
216 if (uv < ouv)
217 return 0;
218 ouv = uv;
219 s++;
220 }
221
222 if ((STRLEN)UNISKIP(uv) < len)
223 return 0;
224
225 return len;
226}
227
228/*
229=for apidoc A|STRLEN|is_utf8_char|const U8 *s
230
231Tests if some arbitrary number of bytes begins in a valid UTF-8
232character. Note that an INVARIANT (i.e. ASCII) character is a valid
233UTF-8 character. The actual number of bytes in the UTF-8 character
234will be returned if it is valid, otherwise 0.
235
236=cut */
237STRLEN
238Perl_is_utf8_char(pTHX_ const U8 *s)
239{
240 const STRLEN len = UTF8SKIP(s);
241#ifdef IS_UTF8_CHAR
242 if (IS_UTF8_CHAR_FAST(len))
243 return IS_UTF8_CHAR(s, len) ? len : 0;
244#endif /* #ifdef IS_UTF8_CHAR */
245 return is_utf8_char_slow(s, len);
246}
247
248/*
249=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
250
251Returns true if first C<len> bytes of the given string form a valid
252UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
253not mean 'a string that contains code points above 0x7F encoded in UTF-8'
254because a valid ASCII string is a valid UTF-8 string.
255
256See also is_utf8_string_loclen() and is_utf8_string_loc().
257
258=cut
259*/
260
261bool
262Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
263{
264 const U8* x = s;
265 const U8* send;
266
267 if (!len)
268 len = strlen((const char *)s);
269 send = s + len;
270
271 while (x < send) {
272 STRLEN c;
273 /* Inline the easy bits of is_utf8_char() here for speed... */
274 if (UTF8_IS_INVARIANT(*x))
275 c = 1;
276 else if (!UTF8_IS_START(*x))
277 goto out;
278 else {
279 /* ... and call is_utf8_char() only if really needed. */
280#ifdef IS_UTF8_CHAR
281 c = UTF8SKIP(x);
282 if (IS_UTF8_CHAR_FAST(c)) {
283 if (!IS_UTF8_CHAR(x, c))
284 goto out;
285 } else if (!is_utf8_char_slow(x, c))
286 goto out;
287#else
288 c = is_utf8_char(x);
289#endif /* #ifdef IS_UTF8_CHAR */
290 if (!c)
291 goto out;
292 }
293 x += c;
294 }
295
296 out:
297 if (x != send)
298 return FALSE;
299
300 return TRUE;
301}
302
303/*
304Implemented as a macro in utf8.h
305
306=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
307
308Like is_utf8_string() but stores the location of the failure (in the
309case of "utf8ness failure") or the location s+len (in the case of
310"utf8ness success") in the C<ep>.
311
312See also is_utf8_string_loclen() and is_utf8_string().
313
314=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
315
316Like is_utf8_string() but stores the location of the failure (in the
317case of "utf8ness failure") or the location s+len (in the case of
318"utf8ness success") in the C<ep>, and the number of UTF-8
319encoded characters in the C<el>.
320
321See also is_utf8_string_loc() and is_utf8_string().
322
323=cut
324*/
325
326bool
327Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
328{
329 const U8* x = s;
330 const U8* send;
331 STRLEN c;
332
333 if (!len)
334 len = strlen((const char *)s);
335 send = s + len;
336 if (el)
337 *el = 0;
338
339 while (x < send) {
340 /* Inline the easy bits of is_utf8_char() here for speed... */
341 if (UTF8_IS_INVARIANT(*x))
342 c = 1;
343 else if (!UTF8_IS_START(*x))
344 goto out;
345 else {
346 /* ... and call is_utf8_char() only if really needed. */
347#ifdef IS_UTF8_CHAR
348 c = UTF8SKIP(x);
349 if (IS_UTF8_CHAR_FAST(c)) {
350 if (!IS_UTF8_CHAR(x, c))
351 c = 0;
352 } else
353 c = is_utf8_char_slow(x, c);
354#else
355 c = is_utf8_char(x);
356#endif /* #ifdef IS_UTF8_CHAR */
357 if (!c)
358 goto out;
359 }
360 x += c;
361 if (el)
362 (*el)++;
363 }
364
365 out:
366 if (ep)
367 *ep = x;
368 if (x != send)
369 return FALSE;
370
371 return TRUE;
372}
373
374/*
375
376=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
377
378Bottom level UTF-8 decode routine.
379Returns the unicode code point value of the first character in the string C<s>
380which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
381C<retlen> will be set to the length, in bytes, of that character.
382
383If C<s> does not point to a well-formed UTF-8 character, the behaviour
384is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
385it is assumed that the caller will raise a warning, and this function
386will silently just set C<retlen> to C<-1> and return zero. If the
387C<flags> does not contain UTF8_CHECK_ONLY, warnings about
388malformations will be given, C<retlen> will be set to the expected
389length of the UTF-8 character in bytes, and zero will be returned.
390
391The C<flags> can also contain various flags to allow deviations from
392the strict UTF-8 encoding (see F<utf8.h>).
393
394Most code should use utf8_to_uvchr() rather than call this directly.
395
396=cut
397*/
398
399UV
400Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
401{
402 const U8 *s0 = s;
403 UV uv = *s, ouv = 0;
404 STRLEN len = 1;
405 const bool dowarn = ckWARN_d(WARN_UTF8);
406 const UV startbyte = *s;
407 STRLEN expectlen = 0;
408 U32 warning = 0;
409
410/* This list is a superset of the UTF8_ALLOW_XXX. */
411
412#define UTF8_WARN_EMPTY 1
413#define UTF8_WARN_CONTINUATION 2
414#define UTF8_WARN_NON_CONTINUATION 3
415#define UTF8_WARN_FE_FF 4
416#define UTF8_WARN_SHORT 5
417#define UTF8_WARN_OVERFLOW 6
418#define UTF8_WARN_SURROGATE 7
419#define UTF8_WARN_LONG 8
420#define UTF8_WARN_FFFF 9 /* Also FFFE. */
421
422 if (curlen == 0 &&
423 !(flags & UTF8_ALLOW_EMPTY)) {
424 warning = UTF8_WARN_EMPTY;
425 goto malformed;
426 }
427
428 if (UTF8_IS_INVARIANT(uv)) {
429 if (retlen)
430 *retlen = 1;
431 return (UV) (NATIVE_TO_UTF(*s));
432 }
433
434 if (UTF8_IS_CONTINUATION(uv) &&
435 !(flags & UTF8_ALLOW_CONTINUATION)) {
436 warning = UTF8_WARN_CONTINUATION;
437 goto malformed;
438 }
439
440 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
441 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
442 warning = UTF8_WARN_NON_CONTINUATION;
443 goto malformed;
444 }
445
446#ifdef EBCDIC
447 uv = NATIVE_TO_UTF(uv);
448#else
449 if ((uv == 0xfe || uv == 0xff) &&
450 !(flags & UTF8_ALLOW_FE_FF)) {
451 warning = UTF8_WARN_FE_FF;
452 goto malformed;
453 }
454#endif
455
456 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
457 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
458 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
459 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
460#ifdef EBCDIC
461 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
462 else { len = 7; uv &= 0x01; }
463#else
464 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
465 else if (!(uv & 0x01)) { len = 7; uv = 0; }
466 else { len = 13; uv = 0; } /* whoa! */
467#endif
468
469 if (retlen)
470 *retlen = len;
471
472 expectlen = len;
473
474 if ((curlen < expectlen) &&
475 !(flags & UTF8_ALLOW_SHORT)) {
476 warning = UTF8_WARN_SHORT;
477 goto malformed;
478 }
479
480 len--;
481 s++;
482 ouv = uv;
483
484 while (len--) {
485 if (!UTF8_IS_CONTINUATION(*s) &&
486 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
487 s--;
488 warning = UTF8_WARN_NON_CONTINUATION;
489 goto malformed;
490 }
491 else
492 uv = UTF8_ACCUMULATE(uv, *s);
493 if (!(uv > ouv)) {
494 /* These cannot be allowed. */
495 if (uv == ouv) {
496 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
497 warning = UTF8_WARN_LONG;
498 goto malformed;
499 }
500 }
501 else { /* uv < ouv */
502 /* This cannot be allowed. */
503 warning = UTF8_WARN_OVERFLOW;
504 goto malformed;
505 }
506 }
507 s++;
508 ouv = uv;
509 }
510
511 if (UNICODE_IS_SURROGATE(uv) &&
512 !(flags & UTF8_ALLOW_SURROGATE)) {
513 warning = UTF8_WARN_SURROGATE;
514 goto malformed;
515 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
516 !(flags & UTF8_ALLOW_LONG)) {
517 warning = UTF8_WARN_LONG;
518 goto malformed;
519 } else if (UNICODE_IS_ILLEGAL(uv) &&
520 !(flags & UTF8_ALLOW_FFFF)) {
521 warning = UTF8_WARN_FFFF;
522 goto malformed;
523 }
524
525 return uv;
526
527malformed:
528
529 if (flags & UTF8_CHECK_ONLY) {
530 if (retlen)
531 *retlen = -1;
532 return 0;
533 }
534
535 if (dowarn) {
536 SV* const sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
537
538 switch (warning) {
539 case 0: /* Intentionally empty. */ break;
540 case UTF8_WARN_EMPTY:
541 Perl_sv_catpv(aTHX_ sv, "(empty string)");
542 break;
543 case UTF8_WARN_CONTINUATION:
544 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
545 break;
546 case UTF8_WARN_NON_CONTINUATION:
547 if (s == s0)
548 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
549 (UV)s[1], startbyte);
550 else {
551 const int len = (int)(s-s0);
552 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
553 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
554 }
555
556 break;
557 case UTF8_WARN_FE_FF:
558 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
559 break;
560 case UTF8_WARN_SHORT:
561 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
562 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
563 expectlen = curlen; /* distance for caller to skip */
564 break;
565 case UTF8_WARN_OVERFLOW:
566 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
567 ouv, *s, startbyte);
568 break;
569 case UTF8_WARN_SURROGATE:
570 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
571 break;
572 case UTF8_WARN_LONG:
573 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
574 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
575 break;
576 case UTF8_WARN_FFFF:
577 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
578 break;
579 default:
580 Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
581 break;
582 }
583
584 if (warning) {
585 const char * const s = SvPVX_const(sv);
586
587 if (PL_op)
588 Perl_warner(aTHX_ packWARN(WARN_UTF8),
589 "%s in %s", s, OP_DESC(PL_op));
590 else
591 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
592 }
593 }
594
595 if (retlen)
596 *retlen = expectlen ? expectlen : len;
597
598 return 0;
599}
600
601/*
602=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
603
604Returns the native character value of the first character in the string C<s>
605which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
606length, in bytes, of that character.
607
608If C<s> does not point to a well-formed UTF-8 character, zero is
609returned and retlen is set, if possible, to -1.
610
611=cut
612*/
613
614UV
615Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
616{
617 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
618 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
619}
620
621/*
622=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
623
624Returns the Unicode code point of the first character in the string C<s>
625which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
626length, in bytes, of that character.
627
628This function should only be used when returned UV is considered
629an index into the Unicode semantic tables (e.g. swashes).
630
631If C<s> does not point to a well-formed UTF-8 character, zero is
632returned and retlen is set, if possible, to -1.
633
634=cut
635*/
636
637UV
638Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
639{
640 /* Call the low level routine asking for checks */
641 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
642 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
643}
644
645/*
646=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
647
648Return the length of the UTF-8 char encoded string C<s> in characters.
649Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
650up past C<e>, croaks.
651
652=cut
653*/
654
655STRLEN
656Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
657{
658 STRLEN len = 0;
659
660 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
661 * the bitops (especially ~) can create illegal UTF-8.
662 * In other words: in Perl UTF-8 is not just for Unicode. */
663
664 if (e < s)
665 goto warn_and_return;
666 while (s < e) {
667 const U8 t = UTF8SKIP(s);
668 if (e - s < t) {
669 warn_and_return:
670 if (ckWARN_d(WARN_UTF8)) {
671 if (PL_op)
672 Perl_warner(aTHX_ packWARN(WARN_UTF8),
673 "%s in %s", unees, OP_DESC(PL_op));
674 else
675 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
676 }
677 return len;
678 }
679 s += t;
680 len++;
681 }
682
683 return len;
684}
685
686/*
687=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
688
689Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
690and C<b>.
691
692WARNING: use only if you *know* that the pointers point inside the
693same UTF-8 buffer.
694
695=cut
696*/
697
698IV
699Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
700{
701 IV off = 0;
702
703 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
704 * the bitops (especially ~) can create illegal UTF-8.
705 * In other words: in Perl UTF-8 is not just for Unicode. */
706
707 if (a < b) {
708 while (a < b) {
709 const U8 c = UTF8SKIP(a);
710 if (b - a < c)
711 goto warn_and_return;
712 a += c;
713 off--;
714 }
715 }
716 else {
717 while (b < a) {
718 const U8 c = UTF8SKIP(b);
719
720 if (a - b < c) {
721 warn_and_return:
722 if (ckWARN_d(WARN_UTF8)) {
723 if (PL_op)
724 Perl_warner(aTHX_ packWARN(WARN_UTF8),
725 "%s in %s", unees, OP_DESC(PL_op));
726 else
727 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
728 }
729 return off;
730 }
731 b += c;
732 off++;
733 }
734 }
735
736 return off;
737}
738
739/*
740=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
741
742Return the UTF-8 pointer C<s> displaced by C<off> characters, either
743forward or backward.
744
745WARNING: do not use the following unless you *know* C<off> is within
746the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
747on the first byte of character or just after the last byte of a character.
748
749=cut
750*/
751
752U8 *
753Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
754{
755 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
756 * the bitops (especially ~) can create illegal UTF-8.
757 * In other words: in Perl UTF-8 is not just for Unicode. */
758
759 if (off >= 0) {
760 while (off--)
761 s += UTF8SKIP(s);
762 }
763 else {
764 while (off++) {
765 s--;
766 while (UTF8_IS_CONTINUATION(*s))
767 s--;
768 }
769 }
770 return (U8 *)s;
771}
772
773/*
774=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
775
776Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
777Unlike C<bytes_to_utf8>, this over-writes the original string, and
778updates len to contain the new length.
779Returns zero on failure, setting C<len> to -1.
780
781=cut
782*/
783
784U8 *
785Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
786{
787 U8 *send;
788 U8 *d;
789 U8 *save = s;
790
791 /* ensure valid UTF-8 and chars < 256 before updating string */
792 for (send = s + *len; s < send; ) {
793 U8 c = *s++;
794
795 if (!UTF8_IS_INVARIANT(c) &&
796 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
797 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
798 *len = -1;
799 return 0;
800 }
801 }
802
803 d = s = save;
804 while (s < send) {
805 STRLEN ulen;
806 *d++ = (U8)utf8_to_uvchr(s, &ulen);
807 s += ulen;
808 }
809 *d = '\0';
810 *len = d - save;
811 return save;
812}
813
814/*
815=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
816
817Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
818Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
819the newly-created string, and updates C<len> to contain the new
820length. Returns the original string if no conversion occurs, C<len>
821is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
8220 if C<s> is converted or contains all 7bit characters.
823
824=cut
825*/
826
827U8 *
828Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
829{
830 U8 *d;
831 const U8 *start = s;
832 const U8 *send;
833 I32 count = 0;
834
835 if (!*is_utf8)
836 return (U8 *)start;
837
838 /* ensure valid UTF-8 and chars < 256 before converting string */
839 for (send = s + *len; s < send;) {
840 U8 c = *s++;
841 if (!UTF8_IS_INVARIANT(c)) {
842 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
843 (c = *s++) && UTF8_IS_CONTINUATION(c))
844 count++;
845 else
846 return (U8 *)start;
847 }
848 }
849
850 *is_utf8 = 0;
851
852 Newxz(d, (*len) - count + 1, U8);
853 s = start; start = d;
854 while (s < send) {
855 U8 c = *s++;
856 if (!UTF8_IS_INVARIANT(c)) {
857 /* Then it is two-byte encoded */
858 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
859 c = ASCII_TO_NATIVE(c);
860 }
861 *d++ = c;
862 }
863 *d = '\0';
864 *len = d - start;
865 return (U8 *)start;
866}
867
868/*
869=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
870
871Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
872Returns a pointer to the newly-created string, and sets C<len> to
873reflect the new length.
874
875If you want to convert to UTF-8 from other encodings than ASCII,
876see sv_recode_to_utf8().
877
878=cut
879*/
880
881U8*
882Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
883{
884 const U8 * const send = s + (*len);
885 U8 *d;
886 U8 *dst;
887
888 Newxz(d, (*len) * 2 + 1, U8);
889 dst = d;
890
891 while (s < send) {
892 const UV uv = NATIVE_TO_ASCII(*s++);
893 if (UNI_IS_INVARIANT(uv))
894 *d++ = (U8)UTF_TO_NATIVE(uv);
895 else {
896 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
897 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
898 }
899 }
900 *d = '\0';
901 *len = d-dst;
902 return dst;
903}
904
905/*
906 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
907 *
908 * Destination must be pre-extended to 3/2 source. Do not use in-place.
909 * We optimize for native, for obvious reasons. */
910
911U8*
912Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
913{
914 U8* pend;
915 U8* dstart = d;
916
917 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
918 d[0] = 0;
919 *newlen = 1;
920 return d;
921 }
922
923 if (bytelen & 1)
924 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
925
926 pend = p + bytelen;
927
928 while (p < pend) {
929 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
930 p += 2;
931 if (uv < 0x80) {
932 *d++ = (U8)uv;
933 continue;
934 }
935 if (uv < 0x800) {
936 *d++ = (U8)(( uv >> 6) | 0xc0);
937 *d++ = (U8)(( uv & 0x3f) | 0x80);
938 continue;
939 }
940 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
941 UV low = (p[0] << 8) + p[1];
942 p += 2;
943 if (low < 0xdc00 || low >= 0xdfff)
944 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
945 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
946 }
947 if (uv < 0x10000) {
948 *d++ = (U8)(( uv >> 12) | 0xe0);
949 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
950 *d++ = (U8)(( uv & 0x3f) | 0x80);
951 continue;
952 }
953 else {
954 *d++ = (U8)(( uv >> 18) | 0xf0);
955 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
956 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
957 *d++ = (U8)(( uv & 0x3f) | 0x80);
958 continue;
959 }
960 }
961 *newlen = d - dstart;
962 return d;
963}
964
965/* Note: this one is slightly destructive of the source. */
966
967U8*
968Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
969{
970 U8* s = (U8*)p;
971 U8* send = s + bytelen;
972 while (s < send) {
973 U8 tmp = s[0];
974 s[0] = s[1];
975 s[1] = tmp;
976 s += 2;
977 }
978 return utf16_to_utf8(p, d, bytelen, newlen);
979}
980
981/* for now these are all defined (inefficiently) in terms of the utf8 versions */
982
983bool
984Perl_is_uni_alnum(pTHX_ UV c)
985{
986 U8 tmpbuf[UTF8_MAXBYTES+1];
987 uvchr_to_utf8(tmpbuf, c);
988 return is_utf8_alnum(tmpbuf);
989}
990
991bool
992Perl_is_uni_alnumc(pTHX_ UV c)
993{
994 U8 tmpbuf[UTF8_MAXBYTES+1];
995 uvchr_to_utf8(tmpbuf, c);
996 return is_utf8_alnumc(tmpbuf);
997}
998
999bool
1000Perl_is_uni_idfirst(pTHX_ UV c)
1001{
1002 U8 tmpbuf[UTF8_MAXBYTES+1];
1003 uvchr_to_utf8(tmpbuf, c);
1004 return is_utf8_idfirst(tmpbuf);
1005}
1006
1007bool
1008Perl_is_uni_alpha(pTHX_ UV c)
1009{
1010 U8 tmpbuf[UTF8_MAXBYTES+1];
1011 uvchr_to_utf8(tmpbuf, c);
1012 return is_utf8_alpha(tmpbuf);
1013}
1014
1015bool
1016Perl_is_uni_ascii(pTHX_ UV c)
1017{
1018 U8 tmpbuf[UTF8_MAXBYTES+1];
1019 uvchr_to_utf8(tmpbuf, c);
1020 return is_utf8_ascii(tmpbuf);
1021}
1022
1023bool
1024Perl_is_uni_space(pTHX_ UV c)
1025{
1026 U8 tmpbuf[UTF8_MAXBYTES+1];
1027 uvchr_to_utf8(tmpbuf, c);
1028 return is_utf8_space(tmpbuf);
1029}
1030
1031bool
1032Perl_is_uni_digit(pTHX_ UV c)
1033{
1034 U8 tmpbuf[UTF8_MAXBYTES+1];
1035 uvchr_to_utf8(tmpbuf, c);
1036 return is_utf8_digit(tmpbuf);
1037}
1038
1039bool
1040Perl_is_uni_upper(pTHX_ UV c)
1041{
1042 U8 tmpbuf[UTF8_MAXBYTES+1];
1043 uvchr_to_utf8(tmpbuf, c);
1044 return is_utf8_upper(tmpbuf);
1045}
1046
1047bool
1048Perl_is_uni_lower(pTHX_ UV c)
1049{
1050 U8 tmpbuf[UTF8_MAXBYTES+1];
1051 uvchr_to_utf8(tmpbuf, c);
1052 return is_utf8_lower(tmpbuf);
1053}
1054
1055bool
1056Perl_is_uni_cntrl(pTHX_ UV c)
1057{
1058 U8 tmpbuf[UTF8_MAXBYTES+1];
1059 uvchr_to_utf8(tmpbuf, c);
1060 return is_utf8_cntrl(tmpbuf);
1061}
1062
1063bool
1064Perl_is_uni_graph(pTHX_ UV c)
1065{
1066 U8 tmpbuf[UTF8_MAXBYTES+1];
1067 uvchr_to_utf8(tmpbuf, c);
1068 return is_utf8_graph(tmpbuf);
1069}
1070
1071bool
1072Perl_is_uni_print(pTHX_ UV c)
1073{
1074 U8 tmpbuf[UTF8_MAXBYTES+1];
1075 uvchr_to_utf8(tmpbuf, c);
1076 return is_utf8_print(tmpbuf);
1077}
1078
1079bool
1080Perl_is_uni_punct(pTHX_ UV c)
1081{
1082 U8 tmpbuf[UTF8_MAXBYTES+1];
1083 uvchr_to_utf8(tmpbuf, c);
1084 return is_utf8_punct(tmpbuf);
1085}
1086
1087bool
1088Perl_is_uni_xdigit(pTHX_ UV c)
1089{
1090 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1091 uvchr_to_utf8(tmpbuf, c);
1092 return is_utf8_xdigit(tmpbuf);
1093}
1094
1095UV
1096Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
1097{
1098 uvchr_to_utf8(p, c);
1099 return to_utf8_upper(p, p, lenp);
1100}
1101
1102UV
1103Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
1104{
1105 uvchr_to_utf8(p, c);
1106 return to_utf8_title(p, p, lenp);
1107}
1108
1109UV
1110Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
1111{
1112 uvchr_to_utf8(p, c);
1113 return to_utf8_lower(p, p, lenp);
1114}
1115
1116UV
1117Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1118{
1119 uvchr_to_utf8(p, c);
1120 return to_utf8_fold(p, p, lenp);
1121}
1122
1123/* for now these all assume no locale info available for Unicode > 255 */
1124
1125bool
1126Perl_is_uni_alnum_lc(pTHX_ UV c)
1127{
1128 return is_uni_alnum(c); /* XXX no locale support yet */
1129}
1130
1131bool
1132Perl_is_uni_alnumc_lc(pTHX_ UV c)
1133{
1134 return is_uni_alnumc(c); /* XXX no locale support yet */
1135}
1136
1137bool
1138Perl_is_uni_idfirst_lc(pTHX_ UV c)
1139{
1140 return is_uni_idfirst(c); /* XXX no locale support yet */
1141}
1142
1143bool
1144Perl_is_uni_alpha_lc(pTHX_ UV c)
1145{
1146 return is_uni_alpha(c); /* XXX no locale support yet */
1147}
1148
1149bool
1150Perl_is_uni_ascii_lc(pTHX_ UV c)
1151{
1152 return is_uni_ascii(c); /* XXX no locale support yet */
1153}
1154
1155bool
1156Perl_is_uni_space_lc(pTHX_ UV c)
1157{
1158 return is_uni_space(c); /* XXX no locale support yet */
1159}
1160
1161bool
1162Perl_is_uni_digit_lc(pTHX_ UV c)
1163{
1164 return is_uni_digit(c); /* XXX no locale support yet */
1165}
1166
1167bool
1168Perl_is_uni_upper_lc(pTHX_ UV c)
1169{
1170 return is_uni_upper(c); /* XXX no locale support yet */
1171}
1172
1173bool
1174Perl_is_uni_lower_lc(pTHX_ UV c)
1175{
1176 return is_uni_lower(c); /* XXX no locale support yet */
1177}
1178
1179bool
1180Perl_is_uni_cntrl_lc(pTHX_ UV c)
1181{
1182 return is_uni_cntrl(c); /* XXX no locale support yet */
1183}
1184
1185bool
1186Perl_is_uni_graph_lc(pTHX_ UV c)
1187{
1188 return is_uni_graph(c); /* XXX no locale support yet */
1189}
1190
1191bool
1192Perl_is_uni_print_lc(pTHX_ UV c)
1193{
1194 return is_uni_print(c); /* XXX no locale support yet */
1195}
1196
1197bool
1198Perl_is_uni_punct_lc(pTHX_ UV c)
1199{
1200 return is_uni_punct(c); /* XXX no locale support yet */
1201}
1202
1203bool
1204Perl_is_uni_xdigit_lc(pTHX_ UV c)
1205{
1206 return is_uni_xdigit(c); /* XXX no locale support yet */
1207}
1208
1209U32
1210Perl_to_uni_upper_lc(pTHX_ U32 c)
1211{
1212 /* XXX returns only the first character -- do not use XXX */
1213 /* XXX no locale support yet */
1214 STRLEN len;
1215 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1216 return (U32)to_uni_upper(c, tmpbuf, &len);
1217}
1218
1219U32
1220Perl_to_uni_title_lc(pTHX_ U32 c)
1221{
1222 /* XXX returns only the first character XXX -- do not use XXX */
1223 /* XXX no locale support yet */
1224 STRLEN len;
1225 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1226 return (U32)to_uni_title(c, tmpbuf, &len);
1227}
1228
1229U32
1230Perl_to_uni_lower_lc(pTHX_ U32 c)
1231{
1232 /* XXX returns only the first character -- do not use XXX */
1233 /* XXX no locale support yet */
1234 STRLEN len;
1235 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1236 return (U32)to_uni_lower(c, tmpbuf, &len);
1237}
1238
1239static bool
1240S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1241 const char *const swashname)
1242{
1243 if (!is_utf8_char(p))
1244 return FALSE;
1245 if (!*swash)
1246 *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
1247 return swash_fetch(*swash, p, TRUE) != 0;
1248}
1249
1250bool
1251Perl_is_utf8_alnum(pTHX_ const U8 *p)
1252{
1253 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1254 * descendant of isalnum(3), in other words, it doesn't
1255 * contain the '_'. --jhi */
1256 return S_is_utf8_common(aTHX_ p, &PL_utf8_alnum, "IsWord");
1257}
1258
1259bool
1260Perl_is_utf8_alnumc(pTHX_ const U8 *p)
1261{
1262 return S_is_utf8_common(aTHX_ p, &PL_utf8_alnumc, "IsAlnumC");
1263}
1264
1265bool
1266Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
1267{
1268 if (*p == '_')
1269 return TRUE;
1270 /* is_utf8_idstart would be more logical. */
1271 return S_is_utf8_common(aTHX_ p, &PL_utf8_idstart, "IdStart");
1272}
1273
1274bool
1275Perl_is_utf8_idcont(pTHX_ const U8 *p)
1276{
1277 if (*p == '_')
1278 return TRUE;
1279 return S_is_utf8_common(aTHX_ p, &PL_utf8_idcont, "IdContinue");
1280}
1281
1282bool
1283Perl_is_utf8_alpha(pTHX_ const U8 *p)
1284{
1285 return S_is_utf8_common(aTHX_ p, &PL_utf8_alpha, "IsAlpha");
1286}
1287
1288bool
1289Perl_is_utf8_ascii(pTHX_ const U8 *p)
1290{
1291 return S_is_utf8_common(aTHX_ p, &PL_utf8_ascii, "IsAscii");
1292}
1293
1294bool
1295Perl_is_utf8_space(pTHX_ const U8 *p)
1296{
1297 return S_is_utf8_common(aTHX_ p, &PL_utf8_space, "IsSpacePerl");
1298}
1299
1300bool
1301Perl_is_utf8_digit(pTHX_ const U8 *p)
1302{
1303 return S_is_utf8_common(aTHX_ p, &PL_utf8_digit, "IsDigit");
1304}
1305
1306bool
1307Perl_is_utf8_upper(pTHX_ const U8 *p)
1308{
1309 return S_is_utf8_common(aTHX_ p, &PL_utf8_upper, "IsUppercase");
1310}
1311
1312bool
1313Perl_is_utf8_lower(pTHX_ const U8 *p)
1314{
1315 return S_is_utf8_common(aTHX_ p, &PL_utf8_lower, "IsLowercase");
1316}
1317
1318bool
1319Perl_is_utf8_cntrl(pTHX_ const U8 *p)
1320{
1321 return S_is_utf8_common(aTHX_ p, &PL_utf8_cntrl, "IsCntrl");
1322}
1323
1324bool
1325Perl_is_utf8_graph(pTHX_ const U8 *p)
1326{
1327 return S_is_utf8_common(aTHX_ p, &PL_utf8_graph, "IsGraph");
1328}
1329
1330bool
1331Perl_is_utf8_print(pTHX_ const U8 *p)
1332{
1333 return S_is_utf8_common(aTHX_ p, &PL_utf8_print, "IsPrint");
1334}
1335
1336bool
1337Perl_is_utf8_punct(pTHX_ const U8 *p)
1338{
1339 return S_is_utf8_common(aTHX_ p, &PL_utf8_punct, "IsPunct");
1340}
1341
1342bool
1343Perl_is_utf8_xdigit(pTHX_ const U8 *p)
1344{
1345 return S_is_utf8_common(aTHX_ p, &PL_utf8_xdigit, "Isxdigit");
1346}
1347
1348bool
1349Perl_is_utf8_mark(pTHX_ const U8 *p)
1350{
1351 return S_is_utf8_common(aTHX_ p, &PL_utf8_mark, "IsM");
1352}
1353
1354/*
1355=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1356
1357The "p" contains the pointer to the UTF-8 string encoding
1358the character that is being converted.
1359
1360The "ustrp" is a pointer to the character buffer to put the
1361conversion result to. The "lenp" is a pointer to the length
1362of the result.
1363
1364The "swashp" is a pointer to the swash to use.
1365
1366Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1367and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
1368but not always, a multicharacter mapping), is tried first.
1369
1370The "special" is a string like "utf8::ToSpecLower", which means the
1371hash %utf8::ToSpecLower. The access to the hash is through
1372Perl_to_utf8_case().
1373
1374The "normal" is a string like "ToLower" which means the swash
1375%utf8::ToLower.
1376
1377=cut */
1378
1379UV
1380Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1381 SV **swashp, const char *normal, const char *special)
1382{
1383 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
1384 STRLEN len = 0;
1385
1386 const UV uv0 = utf8_to_uvchr(p, NULL);
1387 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1388 * are necessary in EBCDIC, they are redundant no-ops
1389 * in ASCII-ish platforms, and hopefully optimized away. */
1390 const UV uv1 = NATIVE_TO_UNI(uv0);
1391 uvuni_to_utf8(tmpbuf, uv1);
1392
1393 if (!*swashp) /* load on-demand */
1394 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1395
1396 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1397 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
1398 /* It might be "special" (sometimes, but not always,
1399 * a multicharacter mapping) */
1400 HV *hv;
1401 SV **svp;
1402
1403 if ((hv = get_hv(special, FALSE)) &&
1404 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1405 (*svp)) {
1406 const char *s;
1407
1408 s = SvPV_const(*svp, len);
1409 if (len == 1)
1410 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
1411 else {
1412#ifdef EBCDIC
1413 /* If we have EBCDIC we need to remap the characters
1414 * since any characters in the low 256 are Unicode
1415 * code points, not EBCDIC. */
1416 U8 *t = (U8*)s, *tend = t + len, *d;
1417
1418 d = tmpbuf;
1419 if (SvUTF8(*svp)) {
1420 STRLEN tlen = 0;
1421
1422 while (t < tend) {
1423 UV c = utf8_to_uvchr(t, &tlen);
1424 if (tlen > 0) {
1425 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1426 t += tlen;
1427 }
1428 else
1429 break;
1430 }
1431 }
1432 else {
1433 while (t < tend) {
1434 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1435 t++;
1436 }
1437 }
1438 len = d - tmpbuf;
1439 Copy(tmpbuf, ustrp, len, U8);
1440#else
1441 Copy(s, ustrp, len, U8);
1442#endif
1443 }
1444 }
1445 }
1446
1447 if (!len && *swashp) {
1448 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1449
1450 if (uv2) {
1451 /* It was "normal" (a single character mapping). */
1452 UV uv3 = UNI_TO_NATIVE(uv2);
1453
1454 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
1455 }
1456 }
1457
1458 if (!len) /* Neither: just copy. */
1459 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1460
1461 if (lenp)
1462 *lenp = len;
1463
1464 return len ? utf8_to_uvchr(ustrp, 0) : 0;
1465}
1466
1467/*
1468=for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
1469
1470Convert the UTF-8 encoded character at p to its uppercase version and
1471store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1472that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1473the uppercase version may be longer than the original character.
1474
1475The first character of the uppercased version is returned
1476(but note, as explained above, that there may be more.)
1477
1478=cut */
1479
1480UV
1481Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1482{
1483 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1484 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
1485}
1486
1487/*
1488=for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
1489
1490Convert the UTF-8 encoded character at p to its titlecase version and
1491store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1492that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1493titlecase version may be longer than the original character.
1494
1495The first character of the titlecased version is returned
1496(but note, as explained above, that there may be more.)
1497
1498=cut */
1499
1500UV
1501Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1502{
1503 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1504 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
1505}
1506
1507/*
1508=for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
1509
1510Convert the UTF-8 encoded character at p to its lowercase version and
1511store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1512that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1513lowercase version may be longer than the original character.
1514
1515The first character of the lowercased version is returned
1516(but note, as explained above, that there may be more.)
1517
1518=cut */
1519
1520UV
1521Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1522{
1523 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1524 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1525}
1526
1527/*
1528=for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
1529
1530Convert the UTF-8 encoded character at p to its foldcase version and
1531store that in UTF-8 in ustrp and its length in bytes in lenp. Note
1532that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1533foldcase version may be longer than the original character (up to
1534three characters).
1535
1536The first character of the foldcased version is returned
1537(but note, as explained above, that there may be more.)
1538
1539=cut */
1540
1541UV
1542Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
1543{
1544 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1545 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
1546}
1547
1548/* Note:
1549 * A "swash" is a swatch hash.
1550 * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1551 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1552 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1553 */
1554SV*
1555Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
1556{
1557 dVAR;
1558 SV* retval;
1559 SV* const tokenbufsv = sv_newmortal();
1560 dSP;
1561 const size_t pkg_len = strlen(pkg);
1562 const size_t name_len = strlen(name);
1563 HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
1564 SV* errsv_save;
1565
1566 PUSHSTACKi(PERLSI_MAGIC);
1567 ENTER;
1568 SAVEI32(PL_hints);
1569 PL_hints = 0;
1570 save_re_context();
1571 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
1572 ENTER;
1573 errsv_save = newSVsv(ERRSV);
1574 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1575 Nullsv);
1576 if (!SvTRUE(ERRSV))
1577 sv_setsv(ERRSV, errsv_save);
1578 SvREFCNT_dec(errsv_save);
1579 LEAVE;
1580 }
1581 SPAGAIN;
1582 PUSHMARK(SP);
1583 EXTEND(SP,5);
1584 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1585 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
1586 PUSHs(listsv);
1587 PUSHs(sv_2mortal(newSViv(minbits)));
1588 PUSHs(sv_2mortal(newSViv(none)));
1589 PUTBACK;
1590 if (IN_PERL_COMPILETIME) {
1591 /* XXX ought to be handled by lex_start */
1592 SAVEI32(PL_in_my);
1593 PL_in_my = 0;
1594 sv_setpv(tokenbufsv, PL_tokenbuf);
1595 }
1596 errsv_save = newSVsv(ERRSV);
1597 if (call_method("SWASHNEW", G_SCALAR))
1598 retval = newSVsv(*PL_stack_sp--);
1599 else
1600 retval = &PL_sv_undef;
1601 if (!SvTRUE(ERRSV))
1602 sv_setsv(ERRSV, errsv_save);
1603 SvREFCNT_dec(errsv_save);
1604 LEAVE;
1605 POPSTACK;
1606 if (IN_PERL_COMPILETIME) {
1607 STRLEN len;
1608 const char* const pv = SvPV_const(tokenbufsv, len);
1609
1610 Copy(pv, PL_tokenbuf, len+1, char);
1611 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1612 }
1613 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1614 if (SvPOK(retval))
1615 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1616 retval);
1617 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
1618 }
1619 return retval;
1620}
1621
1622
1623/* This API is wrong for special case conversions since we may need to
1624 * return several Unicode characters for a single Unicode character
1625 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1626 * the lower-level routine, and it is similarly broken for returning
1627 * multiple values. --jhi */
1628/* Now SWASHGET is recasted into S_swash_get in this file. */
1629
1630/* Note:
1631 * Returns the value of property/mapping C<swash> for the first character
1632 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1633 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1634 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1635 */
1636UV
1637Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
1638{
1639 dVAR;
1640 HV* const hv = (HV*)SvRV(swash);
1641 U32 klen;
1642 U32 off;
1643 STRLEN slen;
1644 STRLEN needents;
1645 const U8 *tmps = NULL;
1646 U32 bit;
1647 SV *swatch;
1648 U8 tmputf8[2];
1649 UV c = NATIVE_TO_ASCII(*ptr);
1650
1651 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
1652 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1653 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1654 ptr = tmputf8;
1655 }
1656 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1657 * then the "swatch" is a vec() for al the chars which start
1658 * with 0xAA..0xYY
1659 * So the key in the hash (klen) is length of encoded char -1
1660 */
1661 klen = UTF8SKIP(ptr) - 1;
1662 off = ptr[klen];
1663
1664 if (klen == 0) {
1665 /* If char in invariant then swatch is for all the invariant chars
1666 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
1667 */
1668 needents = UTF_CONTINUATION_MARK;
1669 off = NATIVE_TO_UTF(ptr[klen]);
1670 }
1671 else {
1672 /* If char is encoded then swatch is for the prefix */
1673 needents = (1 << UTF_ACCUMULATION_SHIFT);
1674 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1675 }
1676
1677 /*
1678 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1679 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1680 * it's nothing to sniff at.) Pity we usually come through at least
1681 * two function calls to get here...
1682 *
1683 * NB: this code assumes that swatches are never modified, once generated!
1684 */
1685
1686 if (hv == PL_last_swash_hv &&
1687 klen == PL_last_swash_klen &&
1688 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
1689 {
1690 tmps = PL_last_swash_tmps;
1691 slen = PL_last_swash_slen;
1692 }
1693 else {
1694 /* Try our second-level swatch cache, kept in a hash. */
1695 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
1696
1697 /* If not cached, generate it via swash_get */
1698 if (!svp || !SvPOK(*svp)
1699 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
1700 /* We use utf8n_to_uvuni() as we want an index into
1701 Unicode tables, not a native character number.
1702 */
1703 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
1704 ckWARN(WARN_UTF8) ?
1705 0 : UTF8_ALLOW_ANY);
1706 swatch = swash_get(swash,
1707 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1708 (klen) ? (code_point & ~(needents - 1)) : 0,
1709 needents);
1710
1711 if (IN_PERL_COMPILETIME)
1712 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
1713
1714 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
1715
1716 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1717 || (slen << 3) < needents)
1718 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
1719 }
1720
1721 PL_last_swash_hv = hv;
1722 PL_last_swash_klen = klen;
1723 /* FIXME change interpvar.h? */
1724 PL_last_swash_tmps = (U8 *) tmps;
1725 PL_last_swash_slen = slen;
1726 if (klen)
1727 Copy(ptr, PL_last_swash_key, klen, U8);
1728 }
1729
1730 switch ((int)((slen << 3) / needents)) {
1731 case 1:
1732 bit = 1 << (off & 7);
1733 off >>= 3;
1734 return (tmps[off] & bit) != 0;
1735 case 8:
1736 return tmps[off];
1737 case 16:
1738 off <<= 1;
1739 return (tmps[off] << 8) + tmps[off + 1] ;
1740 case 32:
1741 off <<= 2;
1742 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1743 }
1744 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
1745 return 0;
1746}
1747
1748/* Note:
1749 * Returns a swatch (a bit vector string) for a code point sequence
1750 * that starts from the value C<start> and comprises the number C<span>.
1751 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1752 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1753 */
1754STATIC SV*
1755S_swash_get(pTHX_ SV* swash, UV start, UV span)
1756{
1757 SV *swatch;
1758 U8 *l, *lend, *x, *xend, *s;
1759 STRLEN lcur, xcur, scur;
1760
1761 HV* const hv = (HV*)SvRV(swash);
1762 SV** const listsvp = hv_fetch(hv, "LIST", 4, FALSE);
1763 SV** const typesvp = hv_fetch(hv, "TYPE", 4, FALSE);
1764 SV** const bitssvp = hv_fetch(hv, "BITS", 4, FALSE);
1765 SV** const nonesvp = hv_fetch(hv, "NONE", 4, FALSE);
1766 SV** const extssvp = hv_fetch(hv, "EXTRAS", 6, FALSE);
1767 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1768 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
1769 const STRLEN bits = SvUV(*bitssvp);
1770 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1771 const UV none = SvUV(*nonesvp);
1772 const UV end = start + span;
1773
1774 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1775 Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1776 (UV)bits);
1777 }
1778
1779 /* create and initialize $swatch */
1780 swatch = newSVpvn("",0);
1781 scur = octets ? (span * octets) : (span + 7) / 8;
1782 SvGROW(swatch, scur + 1);
1783 s = (U8*)SvPVX(swatch);
1784 if (octets && none) {
1785 const U8* const e = s + scur;
1786 while (s < e) {
1787 if (bits == 8)
1788 *s++ = (U8)(none & 0xff);
1789 else if (bits == 16) {
1790 *s++ = (U8)((none >> 8) & 0xff);
1791 *s++ = (U8)( none & 0xff);
1792 }
1793 else if (bits == 32) {
1794 *s++ = (U8)((none >> 24) & 0xff);
1795 *s++ = (U8)((none >> 16) & 0xff);
1796 *s++ = (U8)((none >> 8) & 0xff);
1797 *s++ = (U8)( none & 0xff);
1798 }
1799 }
1800 *s = '\0';
1801 }
1802 else {
1803 (void)memzero((U8*)s, scur + 1);
1804 }
1805 SvCUR_set(swatch, scur);
1806 s = (U8*)SvPVX(swatch);
1807
1808 /* read $swash->{LIST} */
1809 l = (U8*)SvPV(*listsvp, lcur);
1810 lend = l + lcur;
1811 while (l < lend) {
1812 UV min, max, val, key;
1813 STRLEN numlen;
1814 I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1815
1816 U8* const nl = (U8*)memchr(l, '\n', lend - l);
1817
1818 numlen = lend - l;
1819 min = grok_hex((char *)l, &numlen, &flags, NULL);
1820 if (numlen)
1821 l += numlen;
1822 else if (nl) {
1823 l = nl + 1; /* 1 is length of "\n" */
1824 continue;
1825 }
1826 else {
1827 l = lend; /* to LIST's end at which \n is not found */
1828 break;
1829 }
1830
1831 if (isBLANK(*l)) {
1832 ++l;
1833 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1834 numlen = lend - l;
1835 max = grok_hex((char *)l, &numlen, &flags, NULL);
1836 if (numlen)
1837 l += numlen;
1838 else
1839 max = min;
1840
1841 if (octets) {
1842 if (isBLANK(*l)) {
1843 ++l;
1844 flags = PERL_SCAN_SILENT_ILLDIGIT |
1845 PERL_SCAN_DISALLOW_PREFIX;
1846 numlen = lend - l;
1847 val = grok_hex((char *)l, &numlen, &flags, NULL);
1848 if (numlen)
1849 l += numlen;
1850 else
1851 val = 0;
1852 }
1853 else {
1854 val = 0;
1855 if (typeto) {
1856 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1857 typestr, l);
1858 }
1859 }
1860 }
1861 else
1862 val = 0; /* bits == 1, then val should be ignored */
1863 }
1864 else {
1865 max = min;
1866 if (octets) {
1867 val = 0;
1868 if (typeto) {
1869 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1870 }
1871 }
1872 else
1873 val = 0; /* bits == 1, then val should be ignored */
1874 }
1875
1876 if (nl)
1877 l = nl + 1;
1878 else
1879 l = lend;
1880
1881 if (max < start)
1882 continue;
1883
1884 if (octets) {
1885 if (min < start) {
1886 if (!none || val < none) {
1887 val += start - min;
1888 }
1889 min = start;
1890 }
1891 for (key = min; key <= max; key++) {
1892 STRLEN offset;
1893 if (key >= end)
1894 goto go_out_list;
1895 /* offset must be non-negative (start <= min <= key < end) */
1896 offset = octets * (key - start);
1897 if (bits == 8)
1898 s[offset] = (U8)(val & 0xff);
1899 else if (bits == 16) {
1900 s[offset ] = (U8)((val >> 8) & 0xff);
1901 s[offset + 1] = (U8)( val & 0xff);
1902 }
1903 else if (bits == 32) {
1904 s[offset ] = (U8)((val >> 24) & 0xff);
1905 s[offset + 1] = (U8)((val >> 16) & 0xff);
1906 s[offset + 2] = (U8)((val >> 8) & 0xff);
1907 s[offset + 3] = (U8)( val & 0xff);
1908 }
1909
1910 if (!none || val < none)
1911 ++val;
1912 }
1913 }
1914 else { /* bits == 1, then val should be ignored */
1915 if (min < start)
1916 min = start;
1917 for (key = min; key <= max; key++) {
1918 const STRLEN offset = (STRLEN)(key - start);
1919 if (key >= end)
1920 goto go_out_list;
1921 s[offset >> 3] |= 1 << (offset & 7);
1922 }
1923 }
1924 } /* while */
1925 go_out_list:
1926
1927 /* read $swash->{EXTRAS} */
1928 x = (U8*)SvPV(*extssvp, xcur);
1929 xend = x + xcur;
1930 while (x < xend) {
1931 STRLEN namelen;
1932 U8 *namestr;
1933 SV** othersvp;
1934 HV* otherhv;
1935 STRLEN otherbits;
1936 SV **otherbitssvp, *other;
1937 U8 *s, *o, *nl;
1938 STRLEN slen, olen;
1939
1940 U8 opc = *x++;
1941 if (opc == '\n')
1942 continue;
1943
1944 nl = (U8*)memchr(x, '\n', xend - x);
1945
1946 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
1947 if (nl) {
1948 x = nl + 1; /* 1 is length of "\n" */
1949 continue;
1950 }
1951 else {
1952 x = xend; /* to EXTRAS' end at which \n is not found */
1953 break;
1954 }
1955 }
1956
1957 namestr = x;
1958 if (nl) {
1959 namelen = nl - namestr;
1960 x = nl + 1;
1961 }
1962 else {
1963 namelen = xend - namestr;
1964 x = xend;
1965 }
1966
1967 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
1968 otherhv = (HV*)SvRV(*othersvp);
1969 otherbitssvp = hv_fetch(otherhv, "BITS", 4, FALSE);
1970 otherbits = (STRLEN)SvUV(*otherbitssvp);
1971 if (bits < otherbits)
1972 Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
1973
1974 /* The "other" swatch must be destroyed after. */
1975 other = swash_get(*othersvp, start, span);
1976 o = (U8*)SvPV(other, olen);
1977
1978 if (!olen)
1979 Perl_croak(aTHX_ "panic: swash_get got improper swatch");
1980
1981 s = (U8*)SvPV(swatch, slen);
1982 if (bits == 1 && otherbits == 1) {
1983 if (slen != olen)
1984 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
1985
1986 switch (opc) {
1987 case '+':
1988 while (slen--)
1989 *s++ |= *o++;
1990 break;
1991 case '!':
1992 while (slen--)
1993 *s++ |= ~*o++;
1994 break;
1995 case '-':
1996 while (slen--)
1997 *s++ &= ~*o++;
1998 break;
1999 case '&':
2000 while (slen--)
2001 *s++ &= *o++;
2002 break;
2003 default:
2004 break;
2005 }
2006 }
2007 else {
2008 STRLEN otheroctets = otherbits >> 3;
2009 STRLEN offset = 0;
2010 U8* send = s + slen;
2011
2012 while (s < send) {
2013 UV otherval = 0;
2014
2015 if (otherbits == 1) {
2016 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2017 ++offset;
2018 }
2019 else {
2020 STRLEN vlen = otheroctets;
2021 otherval = *o++;
2022 while (--vlen) {
2023 otherval <<= 8;
2024 otherval |= *o++;
2025 }
2026 }
2027
2028 if (opc == '+' && otherval)
2029 ; /* replace with otherval */
2030 else if (opc == '!' && !otherval)
2031 otherval = 1;
2032 else if (opc == '-' && otherval)
2033 otherval = 0;
2034 else if (opc == '&' && !otherval)
2035 otherval = 0;
2036 else {
2037 s += octets; /* no replacement */
2038 continue;
2039 }
2040
2041 if (bits == 8)
2042 *s++ = (U8)( otherval & 0xff);
2043 else if (bits == 16) {
2044 *s++ = (U8)((otherval >> 8) & 0xff);
2045 *s++ = (U8)( otherval & 0xff);
2046 }
2047 else if (bits == 32) {
2048 *s++ = (U8)((otherval >> 24) & 0xff);
2049 *s++ = (U8)((otherval >> 16) & 0xff);
2050 *s++ = (U8)((otherval >> 8) & 0xff);
2051 *s++ = (U8)( otherval & 0xff);
2052 }
2053 }
2054 }
2055 sv_free(other); /* through with it! */
2056 } /* while */
2057 return swatch;
2058}
2059
2060/*
2061=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2062
2063Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2064of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2065bytes available. The return value is the pointer to the byte after the
2066end of the new character. In other words,
2067
2068 d = uvchr_to_utf8(d, uv);
2069
2070is the recommended wide native character-aware way of saying
2071
2072 *(d++) = uv;
2073
2074=cut
2075*/
2076
2077/* On ASCII machines this is normally a macro but we want a
2078 real function in case XS code wants it
2079*/
2080U8 *
2081Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2082{
2083 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2084}
2085
2086U8 *
2087Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2088{
2089 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2090}
2091
2092/*
2093=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2094flags
2095
2096Returns the native character value of the first character in the string
2097C<s>
2098which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2099length, in bytes, of that character.
2100
2101Allows length and flags to be passed to low level routine.
2102
2103=cut
2104*/
2105/* On ASCII machines this is normally a macro but we want
2106 a real function in case XS code wants it
2107*/
2108UV
2109Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2110U32 flags)
2111{
2112 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2113 return UNI_TO_NATIVE(uv);
2114}
2115
2116/*
2117=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2118
2119Build to the scalar dsv a displayable version of the string spv,
2120length len, the displayable version being at most pvlim bytes long
2121(if longer, the rest is truncated and "..." will be appended).
2122
2123The flags argument can have UNI_DISPLAY_ISPRINT set to display
2124isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
2125to display the \\[nrfta\\] as the backslashed versions (like '\n')
2126(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2127UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2128UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2129
2130The pointer to the PV of the dsv is returned.
2131
2132=cut */
2133char *
2134Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2135{
2136 int truncated = 0;
2137 const char *s, *e;
2138
2139 sv_setpvn(dsv, "", 0);
2140 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
2141 UV u;
2142 /* This serves double duty as a flag and a character to print after
2143 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2144 */
2145 char ok = 0;
2146
2147 if (pvlim && SvCUR(dsv) >= pvlim) {
2148 truncated++;
2149 break;
2150 }
2151 u = utf8_to_uvchr((U8*)s, 0);
2152 if (u < 256) {
2153 const unsigned char c = (unsigned char)u & 0xFF;
2154 if (flags & UNI_DISPLAY_BACKSLASH) {
2155 switch (c) {
2156 case '\n':
2157 ok = 'n'; break;
2158 case '\r':
2159 ok = 'r'; break;
2160 case '\t':
2161 ok = 't'; break;
2162 case '\f':
2163 ok = 'f'; break;
2164 case '\a':
2165 ok = 'a'; break;
2166 case '\\':
2167 ok = '\\'; break;
2168 default: break;
2169 }
2170 if (ok) {
2171 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
2172 }
2173 }
2174 /* isPRINT() is the locale-blind version. */
2175 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2176 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
2177 ok = 1;
2178 }
2179 }
2180 if (!ok)
2181 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
2182 }
2183 if (truncated)
2184 sv_catpvn(dsv, "...", 3);
2185
2186 return SvPVX(dsv);
2187}
2188
2189/*
2190=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2191
2192Build to the scalar dsv a displayable version of the scalar sv,
2193the displayable version being at most pvlim bytes long
2194(if longer, the rest is truncated and "..." will be appended).
2195
2196The flags argument is as in pv_uni_display().
2197
2198The pointer to the PV of the dsv is returned.
2199
2200=cut */
2201char *
2202Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2203{
2204 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2205 SvCUR(ssv), pvlim, flags);
2206}
2207
2208/*
2209=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
2210
2211Return true if the strings s1 and s2 differ case-insensitively, false
2212if not (if they are equal case-insensitively). If u1 is true, the
2213string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
2214the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
2215are false, the respective string is assumed to be in native 8-bit
2216encoding.
2217
2218If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2219in there (they will point at the beginning of the I<next> character).
2220If the pointers behind pe1 or pe2 are non-NULL, they are the end
2221pointers beyond which scanning will not continue under any
2222circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
2223s2+l2 will be used as goal end pointers that will also stop the scan,
2224and which qualify towards defining a successful match: all the scans
2225that define an explicit length must reach their goal pointers for
2226a match to succeed).
2227
2228For case-insensitiveness, the "casefolding" of Unicode is used
2229instead of upper/lowercasing both the characters, see
2230http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2231
2232=cut */
2233I32
2234Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
2235{
2236 register const U8 *p1 = (const U8*)s1;
2237 register const U8 *p2 = (const U8*)s2;
2238 register const U8 *f1 = 0, *f2 = 0;
2239 register U8 *e1 = 0, *q1 = 0;
2240 register U8 *e2 = 0, *q2 = 0;
2241 STRLEN n1 = 0, n2 = 0;
2242 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2243 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
2244 U8 natbuf[1+1];
2245 STRLEN foldlen1, foldlen2;
2246 bool match;
2247
2248 if (pe1)
2249 e1 = *(U8**)pe1;
2250 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2251 f1 = (const U8*)s1 + l1;
2252 if (pe2)
2253 e2 = *(U8**)pe2;
2254 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2255 f2 = (const U8*)s2 + l2;
2256
2257 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2258 return 1; /* mismatch; possible infinite loop or false positive */
2259
2260 if (!u1 || !u2)
2261 natbuf[1] = 0; /* Need to terminate the buffer. */
2262
2263 while ((e1 == 0 || p1 < e1) &&
2264 (f1 == 0 || p1 < f1) &&
2265 (e2 == 0 || p2 < e2) &&
2266 (f2 == 0 || p2 < f2)) {
2267 if (n1 == 0) {
2268 if (u1)
2269 to_utf8_fold(p1, foldbuf1, &foldlen1);
2270 else {
2271 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
2272 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2273 }
2274 q1 = foldbuf1;
2275 n1 = foldlen1;
2276 }
2277 if (n2 == 0) {
2278 if (u2)
2279 to_utf8_fold(p2, foldbuf2, &foldlen2);
2280 else {
2281 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
2282 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2283 }
2284 q2 = foldbuf2;
2285 n2 = foldlen2;
2286 }
2287 while (n1 && n2) {
2288 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2289 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2290 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
2291 return 1; /* mismatch */
2292 n1 -= UTF8SKIP(q1);
2293 q1 += UTF8SKIP(q1);
2294 n2 -= UTF8SKIP(q2);
2295 q2 += UTF8SKIP(q2);
2296 }
2297 if (n1 == 0)
2298 p1 += u1 ? UTF8SKIP(p1) : 1;
2299 if (n2 == 0)
2300 p2 += u2 ? UTF8SKIP(p2) : 1;
2301
2302 }
2303
2304 /* A match is defined by all the scans that specified
2305 * an explicit length reaching their final goals. */
2306 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
2307
2308 if (match) {
2309 if (pe1)
2310 *pe1 = (char*)p1;
2311 if (pe2)
2312 *pe2 = (char*)p2;
2313 }
2314
2315 return match ? 0 : 1; /* 0 match, 1 mismatch */
2316}
2317
2318/*
2319 * Local variables:
2320 * c-indentation-style: bsd
2321 * c-basic-offset: 4
2322 * indent-tabs-mode: t
2323 * End:
2324 *
2325 * ex: set ts=8 sts=4 sw=4 noet:
2326 */