This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Oops. Didn't mean to commit 27426 to maint first. Integrate it:
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
b94e2f88
RGS
3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006,
4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
19 *
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
22 */
23
24#include "EXTERN.h"
864dbfa3 25#define PERL_IN_UTF8_C
a0ed51b3
LW
26#include "perl.h"
27
27da23d5
JH
28static const char unees[] =
29 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 30
ccfc67b7
JH
31/*
32=head1 Unicode Support
a0ed51b3 33
166f8a29
DM
34This file contains various utility functions for manipulating UTF8-encoded
35strings. For the uninitiated, this is a method of representing arbitrary
61296642 36Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
37characters in the ASCII range are unmodified, and a zero byte never appears
38within non-zero characters.
166f8a29 39
b851fbc1 40=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 41
1e54db1a 42Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89ebb4a3 43of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 44bytes available. The return value is the pointer to the byte after the
9041c2e3 45end of the new character. In other words,
eebe1485 46
b851fbc1
JH
47 d = uvuni_to_utf8_flags(d, uv, flags);
48
49or, in most cases,
50
9041c2e3 51 d = uvuni_to_utf8(d, uv);
eebe1485 52
b851fbc1
JH
53(which is equivalent to)
54
55 d = uvuni_to_utf8_flags(d, uv, 0);
56
eebe1485
SC
57is the recommended Unicode-aware way of saying
58
59 *(d++) = uv;
60
61=cut
62*/
63
dfe13c55 64U8 *
b851fbc1 65Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 66{
62961d2e 67 if (ckWARN(WARN_UTF8)) {
b851fbc1
JH
68 if (UNICODE_IS_SURROGATE(uv) &&
69 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 70 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1
JH
71 else if (
72 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
73 !(flags & UNICODE_ALLOW_FDD0))
74 ||
c867b360 75 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1
JH
76 !(flags & UNICODE_ALLOW_FFFF))) &&
77 /* UNICODE_ALLOW_SUPER includes
2a20b9da 78 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1
JH
79 ((uv <= PERL_UNICODE_MAX) ||
80 !(flags & UNICODE_ALLOW_SUPER))
81 )
9014280d 82 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800
JH
83 "Unicode character 0x%04"UVxf" is illegal", uv);
84 }
c4d5f83a 85 if (UNI_IS_INVARIANT(uv)) {
eb160463 86 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3
LW
87 return d;
88 }
2d331972 89#if defined(EBCDIC)
1d72bdf6
NIS
90 else {
91 STRLEN len = UNISKIP(uv);
92 U8 *p = d+len-1;
93 while (p > d) {
eb160463 94 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
95 uv >>= UTF_ACCUMULATION_SHIFT;
96 }
eb160463 97 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
98 return d+len;
99 }
100#else /* Non loop style */
a0ed51b3 101 if (uv < 0x800) {
eb160463
GS
102 *d++ = (U8)(( uv >> 6) | 0xc0);
103 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
104 return d;
105 }
106 if (uv < 0x10000) {
eb160463
GS
107 *d++ = (U8)(( uv >> 12) | 0xe0);
108 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
110 return d;
111 }
112 if (uv < 0x200000) {
eb160463
GS
113 *d++ = (U8)(( uv >> 18) | 0xf0);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
117 return d;
118 }
119 if (uv < 0x4000000) {
eb160463
GS
120 *d++ = (U8)(( uv >> 24) | 0xf8);
121 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
124 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
125 return d;
126 }
127 if (uv < 0x80000000) {
eb160463
GS
128 *d++ = (U8)(( uv >> 30) | 0xfc);
129 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
133 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
134 return d;
135 }
6b8eaf93 136#ifdef HAS_QUAD
d7578b48 137 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
138#endif
139 {
eb160463
GS
140 *d++ = 0xfe; /* Can't match U+FEFF! */
141 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
142 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
143 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
144 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
145 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
146 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
147 return d;
148 }
6b8eaf93 149#ifdef HAS_QUAD
a0ed51b3 150 {
eb160463
GS
151 *d++ = 0xff; /* Can't match U+FFFE! */
152 *d++ = 0x80; /* 6 Reserved bits */
153 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
154 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
155 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
156 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
157 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
163 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
164 return d;
165 }
166#endif
1d72bdf6 167#endif /* Loop style */
a0ed51b3 168}
9041c2e3 169
646ca15d
JH
170/*
171
172Tests if some arbitrary number of bytes begins in a valid UTF-8
173character. Note that an INVARIANT (i.e. ASCII) character is a valid
174UTF-8 character. The actual number of bytes in the UTF-8 character
175will be returned if it is valid, otherwise 0.
176
177This is the "slow" version as opposed to the "fast" version which is
178the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
179difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
180or less you should use the IS_UTF8_CHAR(), for lengths of five or more
181you should use the _slow(). In practice this means that the _slow()
182will be used very rarely, since the maximum Unicode code point (as of
183Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
184the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
185five bytes or more.
186
187=cut */
c053b435 188STATIC STRLEN
5f66b61c 189S_is_utf8_char_slow(const U8 *s, const STRLEN len)
646ca15d
JH
190{
191 U8 u = *s;
192 STRLEN slen;
193 UV uv, ouv;
194
195 if (UTF8_IS_INVARIANT(u))
196 return 1;
197
198 if (!UTF8_IS_START(u))
199 return 0;
200
201 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
202 return 0;
203
204 slen = len - 1;
205 s++;
77263263
TS
206#ifdef EBCDIC
207 u = NATIVE_TO_UTF(u);
208#endif
646ca15d
JH
209 u &= UTF_START_MASK(len);
210 uv = u;
211 ouv = uv;
212 while (slen--) {
213 if (!UTF8_IS_CONTINUATION(*s))
214 return 0;
215 uv = UTF8_ACCUMULATE(uv, *s);
216 if (uv < ouv)
217 return 0;
218 ouv = uv;
219 s++;
220 }
221
222 if ((STRLEN)UNISKIP(uv) < len)
223 return 0;
224
225 return len;
226}
9041c2e3
NIS
227
228/*
7fc63493 229=for apidoc A|STRLEN|is_utf8_char|const U8 *s
eebe1485 230
5da9da9e 231Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01
JH
232character. Note that an INVARIANT (i.e. ASCII) character is a valid
233UTF-8 character. The actual number of bytes in the UTF-8 character
234will be returned if it is valid, otherwise 0.
9041c2e3 235
82686b01 236=cut */
067a85ef 237STRLEN
7fc63493 238Perl_is_utf8_char(pTHX_ const U8 *s)
386d01d6 239{
44f8325f 240 const STRLEN len = UTF8SKIP(s);
96a5add6 241 PERL_UNUSED_CONTEXT;
3b0fc154 242#ifdef IS_UTF8_CHAR
768c67ee 243 if (IS_UTF8_CHAR_FAST(len))
3b0fc154
JH
244 return IS_UTF8_CHAR(s, len) ? len : 0;
245#endif /* #ifdef IS_UTF8_CHAR */
2c0c5f92 246 return is_utf8_char_slow(s, len);
386d01d6
GS
247}
248
6662521e 249/*
7fc63493 250=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
6662521e 251
c9ada85f 252Returns true if first C<len> bytes of the given string form a valid
1e54db1a
JH
253UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
254not mean 'a string that contains code points above 0x7F encoded in UTF-8'
255because a valid ASCII string is a valid UTF-8 string.
6662521e 256
768c67ee
JH
257See also is_utf8_string_loclen() and is_utf8_string_loc().
258
6662521e
GS
259=cut
260*/
261
8e84507e 262bool
7fc63493 263Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
6662521e 264{
7fc63493
AL
265 const U8* x = s;
266 const U8* send;
067a85ef 267
96a5add6 268 PERL_UNUSED_CONTEXT;
44f8325f 269 if (!len)
e1ec3a88 270 len = strlen((const char *)s);
1aa99e6b
IH
271 send = s + len;
272
6662521e 273 while (x < send) {
a3b680e6 274 STRLEN c;
1acdb0da
JH
275 /* Inline the easy bits of is_utf8_char() here for speed... */
276 if (UTF8_IS_INVARIANT(*x))
277 c = 1;
278 else if (!UTF8_IS_START(*x))
768c67ee 279 goto out;
1acdb0da
JH
280 else {
281 /* ... and call is_utf8_char() only if really needed. */
646ca15d
JH
282#ifdef IS_UTF8_CHAR
283 c = UTF8SKIP(x);
768c67ee
JH
284 if (IS_UTF8_CHAR_FAST(c)) {
285 if (!IS_UTF8_CHAR(x, c))
286 goto out;
287 } else if (!is_utf8_char_slow(x, c))
288 goto out;
646ca15d
JH
289#else
290 c = is_utf8_char(x);
291#endif /* #ifdef IS_UTF8_CHAR */
1acdb0da 292 if (!c)
768c67ee 293 goto out;
1acdb0da 294 }
6662521e 295 x += c;
6662521e 296 }
768c67ee
JH
297
298 out:
60006e79
JH
299 if (x != send)
300 return FALSE;
067a85ef
A
301
302 return TRUE;
6662521e
GS
303}
304
67e989fb 305/*
814fafa7
NC
306Implemented as a macro in utf8.h
307
308=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
309
310Like is_utf8_string() but stores the location of the failure (in the
311case of "utf8ness failure") or the location s+len (in the case of
312"utf8ness success") in the C<ep>.
313
314See also is_utf8_string_loclen() and is_utf8_string().
315
768c67ee 316=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
81cd54e3 317
e3e4599f 318Like is_utf8_string() but stores the location of the failure (in the
768c67ee
JH
319case of "utf8ness failure") or the location s+len (in the case of
320"utf8ness success") in the C<ep>, and the number of UTF-8
321encoded characters in the C<el>.
322
323See also is_utf8_string_loc() and is_utf8_string().
81cd54e3
JH
324
325=cut
326*/
327
328bool
768c67ee 329Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 330{
7fc63493
AL
331 const U8* x = s;
332 const U8* send;
81cd54e3 333 STRLEN c;
96a5add6 334 PERL_UNUSED_CONTEXT;
81cd54e3 335
44f8325f 336 if (!len)
e1ec3a88 337 len = strlen((const char *)s);
81cd54e3 338 send = s + len;
768c67ee
JH
339 if (el)
340 *el = 0;
81cd54e3
JH
341
342 while (x < send) {
343 /* Inline the easy bits of is_utf8_char() here for speed... */
344 if (UTF8_IS_INVARIANT(*x))
768c67ee
JH
345 c = 1;
346 else if (!UTF8_IS_START(*x))
347 goto out;
81cd54e3 348 else {
768c67ee
JH
349 /* ... and call is_utf8_char() only if really needed. */
350#ifdef IS_UTF8_CHAR
351 c = UTF8SKIP(x);
352 if (IS_UTF8_CHAR_FAST(c)) {
353 if (!IS_UTF8_CHAR(x, c))
354 c = 0;
355 } else
356 c = is_utf8_char_slow(x, c);
357#else
358 c = is_utf8_char(x);
359#endif /* #ifdef IS_UTF8_CHAR */
360 if (!c)
361 goto out;
81cd54e3 362 }
768c67ee
JH
363 x += c;
364 if (el)
365 (*el)++;
81cd54e3 366 }
768c67ee
JH
367
368 out:
369 if (ep)
370 *ep = x;
371 if (x != send)
81cd54e3 372 return FALSE;
81cd54e3
JH
373
374 return TRUE;
375}
376
377/*
768c67ee 378
7fc63493 379=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 380
9041c2e3
NIS
381Bottom level UTF-8 decode routine.
382Returns the unicode code point value of the first character in the string C<s>
1e54db1a 383which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 384C<retlen> will be set to the length, in bytes, of that character.
67e989fb 385
1e54db1a 386If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880
JH
387is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
388it is assumed that the caller will raise a warning, and this function
28d3d195
JH
389will silently just set C<retlen> to C<-1> and return zero. If the
390C<flags> does not contain UTF8_CHECK_ONLY, warnings about
391malformations will be given, C<retlen> will be set to the expected
392length of the UTF-8 character in bytes, and zero will be returned.
393
394The C<flags> can also contain various flags to allow deviations from
395the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 396
9041c2e3
NIS
397Most code should use utf8_to_uvchr() rather than call this directly.
398
37607a96
PK
399=cut
400*/
67e989fb 401
a0ed51b3 402UV
7fc63493 403Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 404{
97aff369 405 dVAR;
7fc63493 406 const U8 *s0 = s;
9c5ffd7c 407 UV uv = *s, ouv = 0;
ba210ebe 408 STRLEN len = 1;
7fc63493
AL
409 const bool dowarn = ckWARN_d(WARN_UTF8);
410 const UV startbyte = *s;
ba210ebe 411 STRLEN expectlen = 0;
a0dbb045
JH
412 U32 warning = 0;
413
414/* This list is a superset of the UTF8_ALLOW_XXX. */
415
416#define UTF8_WARN_EMPTY 1
417#define UTF8_WARN_CONTINUATION 2
418#define UTF8_WARN_NON_CONTINUATION 3
419#define UTF8_WARN_FE_FF 4
420#define UTF8_WARN_SHORT 5
421#define UTF8_WARN_OVERFLOW 6
422#define UTF8_WARN_SURROGATE 7
c867b360
JH
423#define UTF8_WARN_LONG 8
424#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045
JH
425
426 if (curlen == 0 &&
427 !(flags & UTF8_ALLOW_EMPTY)) {
428 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
429 goto malformed;
430 }
431
1d72bdf6 432 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
433 if (retlen)
434 *retlen = 1;
c4d5f83a 435 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 436 }
67e989fb 437
421a8bf2 438 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 439 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 440 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
441 goto malformed;
442 }
443
421a8bf2 444 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 445 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 446 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
447 goto malformed;
448 }
9041c2e3 449
1d72bdf6 450#ifdef EBCDIC
75383841 451 uv = NATIVE_TO_UTF(uv);
1d72bdf6 452#else
fcc8fcf6
JH
453 if ((uv == 0xfe || uv == 0xff) &&
454 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 455 warning = UTF8_WARN_FE_FF;
ba210ebe 456 goto malformed;
a0ed51b3 457 }
1d72bdf6
NIS
458#endif
459
ba210ebe
JH
460 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
461 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
462 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
463 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
464#ifdef EBCDIC
465 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
466 else { len = 7; uv &= 0x01; }
467#else
ba210ebe
JH
468 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
469 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
470 else { len = 13; uv = 0; } /* whoa! */
471#endif
472
a0ed51b3
LW
473 if (retlen)
474 *retlen = len;
9041c2e3 475
ba210ebe
JH
476 expectlen = len;
477
fcc8fcf6
JH
478 if ((curlen < expectlen) &&
479 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 480 warning = UTF8_WARN_SHORT;
ba210ebe
JH
481 goto malformed;
482 }
483
484 len--;
a0ed51b3 485 s++;
ba210ebe
JH
486 ouv = uv;
487
a0ed51b3 488 while (len--) {
421a8bf2
JH
489 if (!UTF8_IS_CONTINUATION(*s) &&
490 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
491 s--;
492 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 493 goto malformed;
a0ed51b3
LW
494 }
495 else
8850bf83 496 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
497 if (!(uv > ouv)) {
498 /* These cannot be allowed. */
499 if (uv == ouv) {
75dbc644 500 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045
JH
501 warning = UTF8_WARN_LONG;
502 goto malformed;
503 }
504 }
505 else { /* uv < ouv */
506 /* This cannot be allowed. */
507 warning = UTF8_WARN_OVERFLOW;
508 goto malformed;
509 }
ba210ebe
JH
510 }
511 s++;
512 ouv = uv;
513 }
514
421a8bf2 515 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 516 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 517 warning = UTF8_WARN_SURROGATE;
ba210ebe 518 goto malformed;
eb160463 519 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 520 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 521 warning = UTF8_WARN_LONG;
ba210ebe 522 goto malformed;
421a8bf2 523 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 524 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 525 warning = UTF8_WARN_FFFF;
a9917092 526 goto malformed;
a0ed51b3 527 }
ba210ebe 528
a0ed51b3 529 return uv;
ba210ebe
JH
530
531malformed:
532
fcc8fcf6 533 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 534 if (retlen)
cc366d4b 535 *retlen = -1;
ba210ebe
JH
536 return 0;
537 }
538
a0dbb045 539 if (dowarn) {
396482e1 540 SV* const sv = sv_2mortal(newSVpvs("Malformed UTF-8 character "));
a0dbb045
JH
541
542 switch (warning) {
543 case 0: /* Intentionally empty. */ break;
544 case UTF8_WARN_EMPTY:
396482e1 545 sv_catpvs(sv, "(empty string)");
a0dbb045
JH
546 break;
547 case UTF8_WARN_CONTINUATION:
097fb8e2 548 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045
JH
549 break;
550 case UTF8_WARN_NON_CONTINUATION:
097fb8e2
JH
551 if (s == s0)
552 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
553 (UV)s[1], startbyte);
551405c4
AL
554 else {
555 const int len = (int)(s-s0);
097fb8e2 556 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
551405c4
AL
557 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
558 }
559
a0dbb045
JH
560 break;
561 case UTF8_WARN_FE_FF:
562 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
563 break;
564 case UTF8_WARN_SHORT:
097fb8e2 565 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 566 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 567 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
568 break;
569 case UTF8_WARN_OVERFLOW:
097fb8e2
JH
570 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
571 ouv, *s, startbyte);
a0dbb045
JH
572 break;
573 case UTF8_WARN_SURROGATE:
574 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
575 break;
a0dbb045 576 case UTF8_WARN_LONG:
097fb8e2 577 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 578 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045
JH
579 break;
580 case UTF8_WARN_FFFF:
581 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
582 break;
583 default:
396482e1 584 sv_catpvs(sv, "(unknown reason)");
a0dbb045
JH
585 break;
586 }
587
588 if (warning) {
44f8325f 589 const char * const s = SvPVX_const(sv);
a0dbb045
JH
590
591 if (PL_op)
9014280d 592 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 593 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 594 else
9014280d 595 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045
JH
596 }
597 }
598
ba210ebe 599 if (retlen)
28d3d195 600 *retlen = expectlen ? expectlen : len;
ba210ebe 601
28d3d195 602 return 0;
a0ed51b3
LW
603}
604
8e84507e 605/*
7fc63493 606=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
9041c2e3
NIS
607
608Returns the native character value of the first character in the string C<s>
1e54db1a 609which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
610length, in bytes, of that character.
611
1e54db1a 612If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3
NIS
613returned and retlen is set, if possible, to -1.
614
615=cut
616*/
617
618UV
7fc63493 619Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 620{
1754c1a1
NC
621 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
622 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3
NIS
623}
624
625/*
7fc63493 626=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
9041c2e3
NIS
627
628Returns the Unicode code point of the first character in the string C<s>
1e54db1a 629which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
630length, in bytes, of that character.
631
632This function should only be used when returned UV is considered
633an index into the Unicode semantic tables (e.g. swashes).
634
1e54db1a 635If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 636returned and retlen is set, if possible, to -1.
8e84507e
NIS
637
638=cut
639*/
640
641UV
7fc63493 642Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 643{
9041c2e3 644 /* Call the low level routine asking for checks */
89ebb4a3 645 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 646 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e
NIS
647}
648
b76347f2 649/*
35a4481c 650=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
b76347f2
JH
651
652Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
653Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
654up past C<e>, croaks.
b76347f2
JH
655
656=cut
657*/
658
659STRLEN
35a4481c 660Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2 661{
97aff369 662 dVAR;
b76347f2
JH
663 STRLEN len = 0;
664
8850bf83
JH
665 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
666 * the bitops (especially ~) can create illegal UTF-8.
667 * In other words: in Perl UTF-8 is not just for Unicode. */
668
a3b680e6
AL
669 if (e < s)
670 goto warn_and_return;
b76347f2 671 while (s < e) {
4373e329 672 const U8 t = UTF8SKIP(s);
901b21bf 673 if (e - s < t) {
a3b680e6 674 warn_and_return:
901b21bf
JH
675 if (ckWARN_d(WARN_UTF8)) {
676 if (PL_op)
677 Perl_warner(aTHX_ packWARN(WARN_UTF8),
a3b680e6 678 "%s in %s", unees, OP_DESC(PL_op));
901b21bf
JH
679 else
680 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
681 }
682 return len;
683 }
b76347f2
JH
684 s += t;
685 len++;
686 }
687
688 return len;
689}
690
b06226ff 691/*
35a4481c 692=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
b06226ff 693
1e54db1a 694Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
695and C<b>.
696
697WARNING: use only if you *know* that the pointers point inside the
698same UTF-8 buffer.
699
37607a96
PK
700=cut
701*/
a0ed51b3 702
02eb7b47 703IV
35a4481c 704Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 705{
97aff369 706 dVAR;
02eb7b47
JH
707 IV off = 0;
708
8850bf83
JH
709 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
710 * the bitops (especially ~) can create illegal UTF-8.
711 * In other words: in Perl UTF-8 is not just for Unicode. */
712
a0ed51b3
LW
713 if (a < b) {
714 while (a < b) {
35a4481c 715 const U8 c = UTF8SKIP(a);
a3b680e6
AL
716 if (b - a < c)
717 goto warn_and_return;
02eb7b47 718 a += c;
a0ed51b3
LW
719 off--;
720 }
721 }
722 else {
723 while (b < a) {
4373e329 724 const U8 c = UTF8SKIP(b);
02eb7b47 725
901b21bf 726 if (a - b < c) {
a3b680e6 727 warn_and_return:
901b21bf
JH
728 if (ckWARN_d(WARN_UTF8)) {
729 if (PL_op)
730 Perl_warner(aTHX_ packWARN(WARN_UTF8),
731 "%s in %s", unees, OP_DESC(PL_op));
732 else
733 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
734 }
735 return off;
736 }
02eb7b47 737 b += c;
a0ed51b3
LW
738 off++;
739 }
740 }
02eb7b47 741
a0ed51b3
LW
742 return off;
743}
744
b06226ff 745/*
37607a96 746=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 747
8850bf83
JH
748Return the UTF-8 pointer C<s> displaced by C<off> characters, either
749forward or backward.
b06226ff
JH
750
751WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
752the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
753on the first byte of character or just after the last byte of a character.
b06226ff 754
37607a96
PK
755=cut
756*/
a0ed51b3
LW
757
758U8 *
4373e329 759Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 760{
96a5add6 761 PERL_UNUSED_CONTEXT;
8850bf83
JH
762 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
763 * the bitops (especially ~) can create illegal UTF-8.
764 * In other words: in Perl UTF-8 is not just for Unicode. */
765
a0ed51b3
LW
766 if (off >= 0) {
767 while (off--)
768 s += UTF8SKIP(s);
769 }
770 else {
771 while (off++) {
772 s--;
8850bf83
JH
773 while (UTF8_IS_CONTINUATION(*s))
774 s--;
a0ed51b3
LW
775 }
776 }
4373e329 777 return (U8 *)s;
a0ed51b3
LW
778}
779
6940069f 780/*
eebe1485 781=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 782
1e54db1a 783Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53
MG
784Unlike C<bytes_to_utf8>, this over-writes the original string, and
785updates len to contain the new length.
67e989fb 786Returns zero on failure, setting C<len> to -1.
6940069f
GS
787
788=cut
789*/
790
791U8 *
37607a96 792Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 793{
6940069f
GS
794 U8 *send;
795 U8 *d;
dcad2880 796 U8 *save = s;
246fae53 797
1e54db1a 798 /* ensure valid UTF-8 and chars < 256 before updating string */
dcad2880
JH
799 for (send = s + *len; s < send; ) {
800 U8 c = *s++;
801
1d72bdf6
NIS
802 if (!UTF8_IS_INVARIANT(c) &&
803 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
804 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880
JH
805 *len = -1;
806 return 0;
807 }
246fae53 808 }
dcad2880
JH
809
810 d = s = save;
6940069f 811 while (s < send) {
ed646e6e 812 STRLEN ulen;
9041c2e3 813 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 814 s += ulen;
6940069f
GS
815 }
816 *d = '\0';
246fae53 817 *len = d - save;
6940069f
GS
818 return save;
819}
820
821/*
e1ec3a88 822=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
f9a63242 823
1e54db1a 824Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
35a4481c 825Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
826the newly-created string, and updates C<len> to contain the new
827length. Returns the original string if no conversion occurs, C<len>
828is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
8290 if C<s> is converted or contains all 7bit characters.
f9a63242 830
37607a96
PK
831=cut
832*/
f9a63242
JH
833
834U8 *
e1ec3a88 835Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 836{
f9a63242 837 U8 *d;
e1ec3a88
AL
838 const U8 *start = s;
839 const U8 *send;
f9a63242
JH
840 I32 count = 0;
841
96a5add6 842 PERL_UNUSED_CONTEXT;
f9a63242 843 if (!*is_utf8)
73d840c0 844 return (U8 *)start;
f9a63242 845
1e54db1a 846 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 847 for (send = s + *len; s < send;) {
e1ec3a88 848 U8 c = *s++;
1d72bdf6 849 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
850 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
851 (c = *s++) && UTF8_IS_CONTINUATION(c))
852 count++;
853 else
73d840c0 854 return (U8 *)start;
db42d148 855 }
f9a63242
JH
856 }
857
858 *is_utf8 = 0;
859
212542aa 860 Newx(d, (*len) - count + 1, U8);
ef9edfd0 861 s = start; start = d;
f9a63242
JH
862 while (s < send) {
863 U8 c = *s++;
c4d5f83a
NIS
864 if (!UTF8_IS_INVARIANT(c)) {
865 /* Then it is two-byte encoded */
866 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
867 c = ASCII_TO_NATIVE(c);
868 }
869 *d++ = c;
f9a63242
JH
870 }
871 *d = '\0';
872 *len = d - start;
73d840c0 873 return (U8 *)start;
f9a63242
JH
874}
875
876/*
35a4481c 877=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
6940069f 878
1e54db1a 879Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e
GS
880Returns a pointer to the newly-created string, and sets C<len> to
881reflect the new length.
6940069f 882
1e54db1a 883If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f
JH
884see sv_recode_to_utf8().
885
497711e7 886=cut
6940069f
GS
887*/
888
889U8*
35a4481c 890Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 891{
35a4481c 892 const U8 * const send = s + (*len);
6940069f
GS
893 U8 *d;
894 U8 *dst;
96a5add6 895 PERL_UNUSED_CONTEXT;
6940069f 896
212542aa 897 Newx(d, (*len) * 2 + 1, U8);
6940069f
GS
898 dst = d;
899
900 while (s < send) {
35a4481c 901 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 902 if (UNI_IS_INVARIANT(uv))
eb160463 903 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 904 else {
eb160463
GS
905 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
906 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
907 }
908 }
909 *d = '\0';
6662521e 910 *len = d-dst;
6940069f
GS
911 return dst;
912}
913
a0ed51b3 914/*
dea0fc0b 915 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
916 *
917 * Destination must be pre-extended to 3/2 source. Do not use in-place.
918 * We optimize for native, for obvious reasons. */
919
920U8*
dea0fc0b 921Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 922{
dea0fc0b
JH
923 U8* pend;
924 U8* dstart = d;
925
1de9afcd
RGS
926 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
927 d[0] = 0;
928 *newlen = 1;
929 return d;
930 }
931
dea0fc0b 932 if (bytelen & 1)
014ead4b 933 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
dea0fc0b
JH
934
935 pend = p + bytelen;
936
a0ed51b3 937 while (p < pend) {
dea0fc0b
JH
938 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
939 p += 2;
a0ed51b3 940 if (uv < 0x80) {
eb160463 941 *d++ = (U8)uv;
a0ed51b3
LW
942 continue;
943 }
944 if (uv < 0x800) {
eb160463
GS
945 *d++ = (U8)(( uv >> 6) | 0xc0);
946 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
947 continue;
948 }
949 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e
TD
950 UV low = (p[0] << 8) + p[1];
951 p += 2;
dea0fc0b
JH
952 if (low < 0xdc00 || low >= 0xdfff)
953 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
954 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
955 }
956 if (uv < 0x10000) {
eb160463
GS
957 *d++ = (U8)(( uv >> 12) | 0xe0);
958 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
959 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
960 continue;
961 }
962 else {
eb160463
GS
963 *d++ = (U8)(( uv >> 18) | 0xf0);
964 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
965 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
966 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
967 continue;
968 }
969 }
dea0fc0b 970 *newlen = d - dstart;
a0ed51b3
LW
971 return d;
972}
973
974/* Note: this one is slightly destructive of the source. */
975
976U8*
dea0fc0b 977Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
978{
979 U8* s = (U8*)p;
980 U8* send = s + bytelen;
981 while (s < send) {
982 U8 tmp = s[0];
983 s[0] = s[1];
984 s[1] = tmp;
985 s += 2;
986 }
dea0fc0b 987 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
988}
989
990/* for now these are all defined (inefficiently) in terms of the utf8 versions */
991
992bool
84afefe6 993Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 994{
89ebb4a3 995 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 996 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
997 return is_utf8_alnum(tmpbuf);
998}
999
1000bool
84afefe6 1001Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 1002{
89ebb4a3 1003 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1004 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1005 return is_utf8_alnumc(tmpbuf);
1006}
1007
1008bool
84afefe6 1009Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 1010{
89ebb4a3 1011 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1012 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1013 return is_utf8_idfirst(tmpbuf);
1014}
1015
1016bool
84afefe6 1017Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 1018{
89ebb4a3 1019 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1020 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1021 return is_utf8_alpha(tmpbuf);
1022}
1023
1024bool
84afefe6 1025Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 1026{
89ebb4a3 1027 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1028 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1029 return is_utf8_ascii(tmpbuf);
1030}
1031
1032bool
84afefe6 1033Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1034{
89ebb4a3 1035 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1036 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1037 return is_utf8_space(tmpbuf);
1038}
1039
1040bool
84afefe6 1041Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1042{
89ebb4a3 1043 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1044 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1045 return is_utf8_digit(tmpbuf);
1046}
1047
1048bool
84afefe6 1049Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1050{
89ebb4a3 1051 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1052 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1053 return is_utf8_upper(tmpbuf);
1054}
1055
1056bool
84afefe6 1057Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1058{
89ebb4a3 1059 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1060 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1061 return is_utf8_lower(tmpbuf);
1062}
1063
1064bool
84afefe6 1065Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1066{
89ebb4a3 1067 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1068 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1069 return is_utf8_cntrl(tmpbuf);
1070}
1071
1072bool
84afefe6 1073Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1074{
89ebb4a3 1075 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1076 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1077 return is_utf8_graph(tmpbuf);
1078}
1079
1080bool
84afefe6 1081Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1082{
89ebb4a3 1083 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1084 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1085 return is_utf8_print(tmpbuf);
1086}
1087
b8c5462f 1088bool
84afefe6 1089Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1090{
89ebb4a3 1091 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1092 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1093 return is_utf8_punct(tmpbuf);
1094}
1095
4d61ec05 1096bool
84afefe6 1097Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1098{
89ebb4a3 1099 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1100 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1101 return is_utf8_xdigit(tmpbuf);
1102}
1103
84afefe6
JH
1104UV
1105Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1106{
0ebc6274
JH
1107 uvchr_to_utf8(p, c);
1108 return to_utf8_upper(p, p, lenp);
a0ed51b3
LW
1109}
1110
84afefe6
JH
1111UV
1112Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1113{
0ebc6274
JH
1114 uvchr_to_utf8(p, c);
1115 return to_utf8_title(p, p, lenp);
a0ed51b3
LW
1116}
1117
84afefe6
JH
1118UV
1119Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1120{
0ebc6274
JH
1121 uvchr_to_utf8(p, c);
1122 return to_utf8_lower(p, p, lenp);
a0ed51b3
LW
1123}
1124
84afefe6
JH
1125UV
1126Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1127{
0ebc6274
JH
1128 uvchr_to_utf8(p, c);
1129 return to_utf8_fold(p, p, lenp);
84afefe6
JH
1130}
1131
a0ed51b3
LW
1132/* for now these all assume no locale info available for Unicode > 255 */
1133
1134bool
84afefe6 1135Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
1136{
1137 return is_uni_alnum(c); /* XXX no locale support yet */
1138}
1139
1140bool
84afefe6 1141Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f
JH
1142{
1143 return is_uni_alnumc(c); /* XXX no locale support yet */
1144}
1145
1146bool
84afefe6 1147Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
1148{
1149 return is_uni_idfirst(c); /* XXX no locale support yet */
1150}
1151
1152bool
84afefe6 1153Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
1154{
1155 return is_uni_alpha(c); /* XXX no locale support yet */
1156}
1157
1158bool
84afefe6 1159Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
1160{
1161 return is_uni_ascii(c); /* XXX no locale support yet */
1162}
1163
1164bool
84afefe6 1165Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
1166{
1167 return is_uni_space(c); /* XXX no locale support yet */
1168}
1169
1170bool
84afefe6 1171Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
1172{
1173 return is_uni_digit(c); /* XXX no locale support yet */
1174}
1175
1176bool
84afefe6 1177Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1178{
1179 return is_uni_upper(c); /* XXX no locale support yet */
1180}
1181
1182bool
84afefe6 1183Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1184{
1185 return is_uni_lower(c); /* XXX no locale support yet */
1186}
1187
1188bool
84afefe6 1189Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1190{
1191 return is_uni_cntrl(c); /* XXX no locale support yet */
1192}
1193
1194bool
84afefe6 1195Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1196{
1197 return is_uni_graph(c); /* XXX no locale support yet */
1198}
1199
1200bool
84afefe6 1201Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1202{
1203 return is_uni_print(c); /* XXX no locale support yet */
1204}
1205
b8c5462f 1206bool
84afefe6 1207Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1208{
1209 return is_uni_punct(c); /* XXX no locale support yet */
1210}
1211
4d61ec05 1212bool
84afefe6 1213Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1214{
1215 return is_uni_xdigit(c); /* XXX no locale support yet */
1216}
1217
b7ac61fa
JH
1218U32
1219Perl_to_uni_upper_lc(pTHX_ U32 c)
1220{
ee099d14
JH
1221 /* XXX returns only the first character -- do not use XXX */
1222 /* XXX no locale support yet */
1223 STRLEN len;
89ebb4a3 1224 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1225 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1226}
1227
1228U32
1229Perl_to_uni_title_lc(pTHX_ U32 c)
1230{
ee099d14
JH
1231 /* XXX returns only the first character XXX -- do not use XXX */
1232 /* XXX no locale support yet */
1233 STRLEN len;
89ebb4a3 1234 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1235 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1236}
1237
1238U32
1239Perl_to_uni_lower_lc(pTHX_ U32 c)
1240{
ee099d14
JH
1241 /* XXX returns only the first character -- do not use XXX */
1242 /* XXX no locale support yet */
1243 STRLEN len;
89ebb4a3 1244 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1245 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1246}
1247
7452cf6a 1248static bool
5141f98e 1249S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
bde6a22d
NC
1250 const char *const swashname)
1251{
97aff369 1252 dVAR;
bde6a22d
NC
1253 if (!is_utf8_char(p))
1254 return FALSE;
1255 if (!*swash)
711a919c 1256 *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
bde6a22d
NC
1257 return swash_fetch(*swash, p, TRUE) != 0;
1258}
1259
1260bool
7fc63493 1261Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1262{
97aff369 1263 dVAR;
671c33bf
NC
1264 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1265 * descendant of isalnum(3), in other words, it doesn't
1266 * contain the '_'. --jhi */
1267 return S_is_utf8_common(aTHX_ p, &PL_utf8_alnum, "IsWord");
a0ed51b3
LW
1268}
1269
1270bool
7fc63493 1271Perl_is_utf8_alnumc(pTHX_ const U8 *p)
b8c5462f 1272{
97aff369 1273 dVAR;
671c33bf 1274 return S_is_utf8_common(aTHX_ p, &PL_utf8_alnumc, "IsAlnumC");
b8c5462f
JH
1275}
1276
1277bool
7fc63493 1278Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1279{
97aff369 1280 dVAR;
82686b01
JH
1281 if (*p == '_')
1282 return TRUE;
bde6a22d
NC
1283 /* is_utf8_idstart would be more logical. */
1284 return S_is_utf8_common(aTHX_ p, &PL_utf8_idstart, "IdStart");
82686b01
JH
1285}
1286
1287bool
7fc63493 1288Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01 1289{
97aff369 1290 dVAR;
82686b01
JH
1291 if (*p == '_')
1292 return TRUE;
bde6a22d 1293 return S_is_utf8_common(aTHX_ p, &PL_utf8_idcont, "IdContinue");
a0ed51b3
LW
1294}
1295
1296bool
7fc63493 1297Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1298{
97aff369 1299 dVAR;
bde6a22d 1300 return S_is_utf8_common(aTHX_ p, &PL_utf8_alpha, "IsAlpha");
a0ed51b3
LW
1301}
1302
1303bool
7fc63493 1304Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1305{
97aff369 1306 dVAR;
bde6a22d 1307 return S_is_utf8_common(aTHX_ p, &PL_utf8_ascii, "IsAscii");
b8c5462f
JH
1308}
1309
1310bool
7fc63493 1311Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1312{
97aff369 1313 dVAR;
bde6a22d 1314 return S_is_utf8_common(aTHX_ p, &PL_utf8_space, "IsSpacePerl");
a0ed51b3
LW
1315}
1316
1317bool
7fc63493 1318Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1319{
97aff369 1320 dVAR;
bde6a22d 1321 return S_is_utf8_common(aTHX_ p, &PL_utf8_digit, "IsDigit");
a0ed51b3
LW
1322}
1323
1324bool
7fc63493 1325Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1326{
97aff369 1327 dVAR;
bde6a22d 1328 return S_is_utf8_common(aTHX_ p, &PL_utf8_upper, "IsUppercase");
a0ed51b3
LW
1329}
1330
1331bool
7fc63493 1332Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1333{
97aff369 1334 dVAR;
bde6a22d 1335 return S_is_utf8_common(aTHX_ p, &PL_utf8_lower, "IsLowercase");
a0ed51b3
LW
1336}
1337
1338bool
7fc63493 1339Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1340{
97aff369 1341 dVAR;
bde6a22d 1342 return S_is_utf8_common(aTHX_ p, &PL_utf8_cntrl, "IsCntrl");
b8c5462f
JH
1343}
1344
1345bool
7fc63493 1346Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1347{
97aff369 1348 dVAR;
bde6a22d 1349 return S_is_utf8_common(aTHX_ p, &PL_utf8_graph, "IsGraph");
b8c5462f
JH
1350}
1351
1352bool
7fc63493 1353Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1354{
97aff369 1355 dVAR;
bde6a22d 1356 return S_is_utf8_common(aTHX_ p, &PL_utf8_print, "IsPrint");
a0ed51b3
LW
1357}
1358
1359bool
7fc63493 1360Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1361{
97aff369 1362 dVAR;
bde6a22d 1363 return S_is_utf8_common(aTHX_ p, &PL_utf8_punct, "IsPunct");
b8c5462f
JH
1364}
1365
1366bool
7fc63493 1367Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1368{
97aff369 1369 dVAR;
bde6a22d 1370 return S_is_utf8_common(aTHX_ p, &PL_utf8_xdigit, "Isxdigit");
b8c5462f
JH
1371}
1372
1373bool
7fc63493 1374Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1375{
97aff369 1376 dVAR;
bde6a22d 1377 return S_is_utf8_common(aTHX_ p, &PL_utf8_mark, "IsM");
a0ed51b3
LW
1378}
1379
6b5c0936
JH
1380/*
1381=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1382
1383The "p" contains the pointer to the UTF-8 string encoding
1384the character that is being converted.
1385
1386The "ustrp" is a pointer to the character buffer to put the
1387conversion result to. The "lenp" is a pointer to the length
1388of the result.
1389
0134edef 1390The "swashp" is a pointer to the swash to use.
6b5c0936 1391
0134edef 1392Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
8fe4d5b2 1393and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
0134edef 1394but not always, a multicharacter mapping), is tried first.
6b5c0936 1395
0134edef
JH
1396The "special" is a string like "utf8::ToSpecLower", which means the
1397hash %utf8::ToSpecLower. The access to the hash is through
1398Perl_to_utf8_case().
6b5c0936 1399
0134edef
JH
1400The "normal" is a string like "ToLower" which means the swash
1401%utf8::ToLower.
1402
1403=cut */
6b5c0936 1404
2104c8d9 1405UV
9a957fbc
AL
1406Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1407 SV **swashp, const char *normal, const char *special)
a0ed51b3 1408{
97aff369 1409 dVAR;
89ebb4a3 1410 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1411 STRLEN len = 0;
a0ed51b3 1412
aec46f14 1413 const UV uv0 = utf8_to_uvchr(p, NULL);
1feea2c7
JH
1414 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1415 * are necessary in EBCDIC, they are redundant no-ops
1416 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 1417 const UV uv1 = NATIVE_TO_UNI(uv0);
1feea2c7 1418 uvuni_to_utf8(tmpbuf, uv1);
0134edef
JH
1419
1420 if (!*swashp) /* load on-demand */
1421 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1422
b08cf34e
JH
1423 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1424 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1425 /* It might be "special" (sometimes, but not always,
2a37f04d 1426 * a multicharacter mapping) */
983ffd37 1427 HV *hv;
b08cf34e
JH
1428 SV **svp;
1429
1430 if ((hv = get_hv(special, FALSE)) &&
1431 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1432 (*svp)) {
cfd0369c 1433 const char *s;
47654450 1434
cfd0369c 1435 s = SvPV_const(*svp, len);
47654450
JH
1436 if (len == 1)
1437 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1438 else {
2f9475ad
JH
1439#ifdef EBCDIC
1440 /* If we have EBCDIC we need to remap the characters
1441 * since any characters in the low 256 are Unicode
1442 * code points, not EBCDIC. */
7cda7a3d 1443 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad
JH
1444
1445 d = tmpbuf;
b08cf34e 1446 if (SvUTF8(*svp)) {
2f9475ad
JH
1447 STRLEN tlen = 0;
1448
1449 while (t < tend) {
1450 UV c = utf8_to_uvchr(t, &tlen);
1451 if (tlen > 0) {
1452 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1453 t += tlen;
1454 }
1455 else
1456 break;
1457 }
1458 }
1459 else {
36fec512
JH
1460 while (t < tend) {
1461 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1462 t++;
1463 }
2f9475ad
JH
1464 }
1465 len = d - tmpbuf;
1466 Copy(tmpbuf, ustrp, len, U8);
1467#else
d2dcd0fb 1468 Copy(s, ustrp, len, U8);
2f9475ad 1469#endif
29e98929 1470 }
983ffd37 1471 }
0134edef
JH
1472 }
1473
1474 if (!len && *swashp) {
1475 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1476
1477 if (uv2) {
1478 /* It was "normal" (a single character mapping). */
1479 UV uv3 = UNI_TO_NATIVE(uv2);
1480
e9101d72 1481 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d
JH
1482 }
1483 }
1feea2c7 1484
0134edef
JH
1485 if (!len) /* Neither: just copy. */
1486 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1487
2a37f04d
JH
1488 if (lenp)
1489 *lenp = len;
1490
0134edef 1491 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3
LW
1492}
1493
d3e79532 1494/*
7fc63493 1495=for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1496
1497Convert the UTF-8 encoded character at p to its uppercase version and
1498store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1499that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1500the uppercase version may be longer than the original character.
d3e79532
JH
1501
1502The first character of the uppercased version is returned
1503(but note, as explained above, that there may be more.)
1504
1505=cut */
1506
2104c8d9 1507UV
7fc63493 1508Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1509{
97aff369 1510 dVAR;
983ffd37 1511 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1512 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1513}
a0ed51b3 1514
d3e79532 1515/*
7fc63493 1516=for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1517
1518Convert the UTF-8 encoded character at p to its titlecase version and
1519store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1520that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1521titlecase version may be longer than the original character.
d3e79532
JH
1522
1523The first character of the titlecased version is returned
1524(but note, as explained above, that there may be more.)
1525
1526=cut */
1527
983ffd37 1528UV
7fc63493 1529Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
983ffd37 1530{
97aff369 1531 dVAR;
983ffd37 1532 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1533 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1534}
1535
d3e79532 1536/*
7fc63493 1537=for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1538
1539Convert the UTF-8 encoded character at p to its lowercase version and
1540store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1541that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1542lowercase version may be longer than the original character.
d3e79532
JH
1543
1544The first character of the lowercased version is returned
1545(but note, as explained above, that there may be more.)
1546
1547=cut */
1548
2104c8d9 1549UV
7fc63493 1550Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1551{
97aff369 1552 dVAR;
983ffd37 1553 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1554 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1555}
1556
d3e79532 1557/*
7fc63493 1558=for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1559
1560Convert the UTF-8 encoded character at p to its foldcase version and
1561store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1562that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532
JH
1563foldcase version may be longer than the original character (up to
1564three characters).
1565
1566The first character of the foldcased version is returned
1567(but note, as explained above, that there may be more.)
1568
1569=cut */
1570
b4e400f9 1571UV
7fc63493 1572Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
b4e400f9 1573{
97aff369 1574 dVAR;
b4e400f9
JH
1575 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1576 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1577}
1578
711a919c
TS
1579/* Note:
1580 * A "swash" is a swatch hash.
1581 * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1582 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1583 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1584 */
a0ed51b3 1585SV*
7fc63493 1586Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1587{
27da23d5 1588 dVAR;
a0ed51b3 1589 SV* retval;
9a957fbc 1590 SV* const tokenbufsv = sv_newmortal();
8e84507e 1591 dSP;
7fc63493
AL
1592 const size_t pkg_len = strlen(pkg);
1593 const size_t name_len = strlen(name);
aec46f14 1594 HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
f8be5cf0 1595 SV* errsv_save;
ce3b816e 1596
96ca9f55
DM
1597 PUSHSTACKi(PERLSI_MAGIC);
1598 ENTER;
1599 SAVEI32(PL_hints);
1600 PL_hints = 0;
1601 save_re_context();
1b026014 1602 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1603 ENTER;
f8be5cf0 1604 errsv_save = newSVsv(ERRSV);
dc0c6abb
NC
1605 /* It is assumed that callers of this routine are not passing in any
1606 user derived data. */
1607 /* Need to do this after save_re_context() as it will set PL_tainted to
1608 1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1609 Even line to create errsv_save can turn on PL_tainted. */
1610 SAVEBOOL(PL_tainted);
1611 PL_tainted = 0;
71bed85a 1612 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
a0714e2c 1613 NULL);
f8be5cf0
JH
1614 if (!SvTRUE(ERRSV))
1615 sv_setsv(ERRSV, errsv_save);
1616 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1617 LEAVE;
1618 }
1619 SPAGAIN;
a0ed51b3
LW
1620 PUSHMARK(SP);
1621 EXTEND(SP,5);
71bed85a
NC
1622 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1623 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
a0ed51b3
LW
1624 PUSHs(listsv);
1625 PUSHs(sv_2mortal(newSViv(minbits)));
1626 PUSHs(sv_2mortal(newSViv(none)));
1627 PUTBACK;
923e4eb5 1628 if (IN_PERL_COMPILETIME) {
bf1fed83 1629 /* XXX ought to be handled by lex_start */
82686b01 1630 SAVEI32(PL_in_my);
2b4bd638 1631 PL_in_my = 0;
bf1fed83 1632 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1633 }
f8be5cf0 1634 errsv_save = newSVsv(ERRSV);
864dbfa3 1635 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1636 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1637 else
e24b16f9 1638 retval = &PL_sv_undef;
f8be5cf0
JH
1639 if (!SvTRUE(ERRSV))
1640 sv_setsv(ERRSV, errsv_save);
1641 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1642 LEAVE;
1643 POPSTACK;
923e4eb5 1644 if (IN_PERL_COMPILETIME) {
bf1fed83 1645 STRLEN len;
aec46f14 1646 const char* const pv = SvPV_const(tokenbufsv, len);
bf1fed83
JH
1647
1648 Copy(pv, PL_tokenbuf, len+1, char);
eb160463 1649 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1650 }
bc45ce41
JH
1651 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1652 if (SvPOK(retval))
35c1215d
NC
1653 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1654 retval);
cea2e8a9 1655 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1656 }
a0ed51b3
LW
1657 return retval;
1658}
1659
035d37be
JH
1660
1661/* This API is wrong for special case conversions since we may need to
1662 * return several Unicode characters for a single Unicode character
1663 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1664 * the lower-level routine, and it is similarly broken for returning
1665 * multiple values. --jhi */
979f2922 1666/* Now SWASHGET is recasted into S_swash_get in this file. */
680c470c
TS
1667
1668/* Note:
1669 * Returns the value of property/mapping C<swash> for the first character
1670 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1671 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1672 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1673 */
a0ed51b3 1674UV
680c470c 1675Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
a0ed51b3 1676{
27da23d5 1677 dVAR;
680c470c 1678 HV* const hv = (HV*)SvRV(swash);
3568d838
JH
1679 U32 klen;
1680 U32 off;
a0ed51b3 1681 STRLEN slen;
7d85a32c 1682 STRLEN needents;
cfd0369c 1683 const U8 *tmps = NULL;
a0ed51b3 1684 U32 bit;
979f2922 1685 SV *swatch;
3568d838
JH
1686 U8 tmputf8[2];
1687 UV c = NATIVE_TO_ASCII(*ptr);
1688
1689 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
979f2922
TS
1690 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1691 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1692 ptr = tmputf8;
3568d838
JH
1693 }
1694 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1695 * then the "swatch" is a vec() for al the chars which start
1696 * with 0xAA..0xYY
1697 * So the key in the hash (klen) is length of encoded char -1
1698 */
1699 klen = UTF8SKIP(ptr) - 1;
1700 off = ptr[klen];
a0ed51b3 1701
979f2922 1702 if (klen == 0) {
7d85a32c 1703 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1704 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c 1705 */
979f2922
TS
1706 needents = UTF_CONTINUATION_MARK;
1707 off = NATIVE_TO_UTF(ptr[klen]);
1708 }
1709 else {
7d85a32c 1710 /* If char is encoded then swatch is for the prefix */
979f2922
TS
1711 needents = (1 << UTF_ACCUMULATION_SHIFT);
1712 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1713 }
7d85a32c 1714
a0ed51b3
LW
1715 /*
1716 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1717 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1718 * it's nothing to sniff at.) Pity we usually come through at least
1719 * two function calls to get here...
1720 *
1721 * NB: this code assumes that swatches are never modified, once generated!
1722 */
1723
3568d838 1724 if (hv == PL_last_swash_hv &&
a0ed51b3 1725 klen == PL_last_swash_klen &&
27da23d5 1726 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1727 {
1728 tmps = PL_last_swash_tmps;
1729 slen = PL_last_swash_slen;
1730 }
1731 else {
1732 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 1733 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 1734
979f2922
TS
1735 /* If not cached, generate it via swash_get */
1736 if (!svp || !SvPOK(*svp)
1737 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
2b9d42f0
NIS
1738 /* We use utf8n_to_uvuni() as we want an index into
1739 Unicode tables, not a native character number.
1740 */
aec46f14 1741 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae
JH
1742 ckWARN(WARN_UTF8) ?
1743 0 : UTF8_ALLOW_ANY);
680c470c 1744 swatch = swash_get(swash,
979f2922
TS
1745 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1746 (klen) ? (code_point & ~(needents - 1)) : 0,
1747 needents);
1748
923e4eb5 1749 if (IN_PERL_COMPILETIME)
eb160463 1750 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1751
979f2922 1752 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 1753
979f2922
TS
1754 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1755 || (slen << 3) < needents)
660a4616 1756 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
a0ed51b3
LW
1757 }
1758
1759 PL_last_swash_hv = hv;
1760 PL_last_swash_klen = klen;
cfd0369c
NC
1761 /* FIXME change interpvar.h? */
1762 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
1763 PL_last_swash_slen = slen;
1764 if (klen)
1765 Copy(ptr, PL_last_swash_key, klen, U8);
1766 }
1767
9faf8d75 1768 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1769 case 1:
1770 bit = 1 << (off & 7);
1771 off >>= 3;
1772 return (tmps[off] & bit) != 0;
1773 case 8:
1774 return tmps[off];
1775 case 16:
1776 off <<= 1;
1777 return (tmps[off] << 8) + tmps[off + 1] ;
1778 case 32:
1779 off <<= 2;
1780 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1781 }
660a4616 1782 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
a0ed51b3
LW
1783 return 0;
1784}
2b9d42f0 1785
979f2922
TS
1786/* Note:
1787 * Returns a swatch (a bit vector string) for a code point sequence
1788 * that starts from the value C<start> and comprises the number C<span>.
1789 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1790 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1791 */
1792STATIC SV*
1793S_swash_get(pTHX_ SV* swash, UV start, UV span)
1794{
1795 SV *swatch;
711a919c 1796 U8 *l, *lend, *x, *xend, *s;
979f2922
TS
1797 STRLEN lcur, xcur, scur;
1798
1799 HV* const hv = (HV*)SvRV(swash);
017a3ce5
GA
1800 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1801 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1802 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1803 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1804 SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
0bd48802
AL
1805 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1806 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
1807 const STRLEN bits = SvUV(*bitssvp);
1808 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1809 const UV none = SvUV(*nonesvp);
1810 const UV end = start + span;
979f2922
TS
1811
1812 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
660a4616
TS
1813 Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1814 (UV)bits);
979f2922
TS
1815 }
1816
1817 /* create and initialize $swatch */
396482e1 1818 swatch = newSVpvs("");
979f2922
TS
1819 scur = octets ? (span * octets) : (span + 7) / 8;
1820 SvGROW(swatch, scur + 1);
1821 s = (U8*)SvPVX(swatch);
1822 if (octets && none) {
0bd48802 1823 const U8* const e = s + scur;
979f2922
TS
1824 while (s < e) {
1825 if (bits == 8)
1826 *s++ = (U8)(none & 0xff);
1827 else if (bits == 16) {
1828 *s++ = (U8)((none >> 8) & 0xff);
1829 *s++ = (U8)( none & 0xff);
1830 }
1831 else if (bits == 32) {
1832 *s++ = (U8)((none >> 24) & 0xff);
1833 *s++ = (U8)((none >> 16) & 0xff);
1834 *s++ = (U8)((none >> 8) & 0xff);
1835 *s++ = (U8)( none & 0xff);
1836 }
1837 }
1838 *s = '\0';
1839 }
1840 else {
1841 (void)memzero((U8*)s, scur + 1);
1842 }
1843 SvCUR_set(swatch, scur);
1844 s = (U8*)SvPVX(swatch);
1845
1846 /* read $swash->{LIST} */
1847 l = (U8*)SvPV(*listsvp, lcur);
1848 lend = l + lcur;
1849 while (l < lend) {
1850 UV min, max, val, key;
1851 STRLEN numlen;
1852 I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1853
0bd48802 1854 U8* const nl = (U8*)memchr(l, '\n', lend - l);
979f2922
TS
1855
1856 numlen = lend - l;
1857 min = grok_hex((char *)l, &numlen, &flags, NULL);
1858 if (numlen)
1859 l += numlen;
1860 else if (nl) {
1861 l = nl + 1; /* 1 is length of "\n" */
1862 continue;
1863 }
1864 else {
1865 l = lend; /* to LIST's end at which \n is not found */
1866 break;
1867 }
1868
1869 if (isBLANK(*l)) {
1870 ++l;
1871 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1872 numlen = lend - l;
1873 max = grok_hex((char *)l, &numlen, &flags, NULL);
1874 if (numlen)
1875 l += numlen;
1876 else
1877 max = min;
1878
1879 if (octets) {
1880 if (isBLANK(*l)) {
1881 ++l;
1882 flags = PERL_SCAN_SILENT_ILLDIGIT |
1883 PERL_SCAN_DISALLOW_PREFIX;
1884 numlen = lend - l;
1885 val = grok_hex((char *)l, &numlen, &flags, NULL);
1886 if (numlen)
1887 l += numlen;
1888 else
1889 val = 0;
1890 }
1891 else {
1892 val = 0;
1893 if (typeto) {
1894 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1895 typestr, l);
1896 }
1897 }
1898 }
711a919c
TS
1899 else
1900 val = 0; /* bits == 1, then val should be ignored */
979f2922
TS
1901 }
1902 else {
1903 max = min;
1904 if (octets) {
1905 val = 0;
1906 if (typeto) {
1907 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1908 }
1909 }
711a919c
TS
1910 else
1911 val = 0; /* bits == 1, then val should be ignored */
979f2922
TS
1912 }
1913
1914 if (nl)
1915 l = nl + 1;
1916 else
1917 l = lend;
1918
1919 if (max < start)
1920 continue;
1921
1922 if (octets) {
1923 if (min < start) {
1924 if (!none || val < none) {
1925 val += start - min;
1926 }
1927 min = start;
1928 }
1929 for (key = min; key <= max; key++) {
1930 STRLEN offset;
1931 if (key >= end)
1932 goto go_out_list;
1933 /* offset must be non-negative (start <= min <= key < end) */
1934 offset = octets * (key - start);
1935 if (bits == 8)
1936 s[offset] = (U8)(val & 0xff);
1937 else if (bits == 16) {
1938 s[offset ] = (U8)((val >> 8) & 0xff);
1939 s[offset + 1] = (U8)( val & 0xff);
1940 }
1941 else if (bits == 32) {
1942 s[offset ] = (U8)((val >> 24) & 0xff);
1943 s[offset + 1] = (U8)((val >> 16) & 0xff);
1944 s[offset + 2] = (U8)((val >> 8) & 0xff);
1945 s[offset + 3] = (U8)( val & 0xff);
1946 }
1947
1948 if (!none || val < none)
1949 ++val;
1950 }
1951 }
711a919c 1952 else { /* bits == 1, then val should be ignored */
979f2922
TS
1953 if (min < start)
1954 min = start;
1955 for (key = min; key <= max; key++) {
0bd48802 1956 const STRLEN offset = (STRLEN)(key - start);
979f2922
TS
1957 if (key >= end)
1958 goto go_out_list;
1959 s[offset >> 3] |= 1 << (offset & 7);
1960 }
1961 }
1962 } /* while */
1963 go_out_list:
1964
1965 /* read $swash->{EXTRAS} */
1966 x = (U8*)SvPV(*extssvp, xcur);
1967 xend = x + xcur;
1968 while (x < xend) {
1969 STRLEN namelen;
1970 U8 *namestr;
1971 SV** othersvp;
1972 HV* otherhv;
1973 STRLEN otherbits;
1974 SV **otherbitssvp, *other;
711a919c 1975 U8 *s, *o, *nl;
979f2922
TS
1976 STRLEN slen, olen;
1977
1978 U8 opc = *x++;
1979 if (opc == '\n')
1980 continue;
1981
1982 nl = (U8*)memchr(x, '\n', xend - x);
1983
1984 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
1985 if (nl) {
1986 x = nl + 1; /* 1 is length of "\n" */
1987 continue;
1988 }
1989 else {
1990 x = xend; /* to EXTRAS' end at which \n is not found */
1991 break;
1992 }
1993 }
1994
1995 namestr = x;
1996 if (nl) {
1997 namelen = nl - namestr;
1998 x = nl + 1;
1999 }
2000 else {
2001 namelen = xend - namestr;
2002 x = xend;
2003 }
2004
2005 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
660a4616 2006 otherhv = (HV*)SvRV(*othersvp);
017a3ce5 2007 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
979f2922
TS
2008 otherbits = (STRLEN)SvUV(*otherbitssvp);
2009 if (bits < otherbits)
660a4616 2010 Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
979f2922
TS
2011
2012 /* The "other" swatch must be destroyed after. */
2013 other = swash_get(*othersvp, start, span);
2014 o = (U8*)SvPV(other, olen);
2015
2016 if (!olen)
660a4616 2017 Perl_croak(aTHX_ "panic: swash_get got improper swatch");
979f2922
TS
2018
2019 s = (U8*)SvPV(swatch, slen);
2020 if (bits == 1 && otherbits == 1) {
2021 if (slen != olen)
660a4616 2022 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
979f2922
TS
2023
2024 switch (opc) {
2025 case '+':
2026 while (slen--)
2027 *s++ |= *o++;
2028 break;
2029 case '!':
2030 while (slen--)
2031 *s++ |= ~*o++;
2032 break;
2033 case '-':
2034 while (slen--)
2035 *s++ &= ~*o++;
2036 break;
2037 case '&':
2038 while (slen--)
2039 *s++ &= *o++;
2040 break;
2041 default:
2042 break;
2043 }
2044 }
711a919c 2045 else {
979f2922
TS
2046 STRLEN otheroctets = otherbits >> 3;
2047 STRLEN offset = 0;
2048 U8* send = s + slen;
2049
2050 while (s < send) {
2051 UV otherval = 0;
2052
2053 if (otherbits == 1) {
2054 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2055 ++offset;
2056 }
2057 else {
2058 STRLEN vlen = otheroctets;
2059 otherval = *o++;
2060 while (--vlen) {
2061 otherval <<= 8;
2062 otherval |= *o++;
2063 }
2064 }
2065
711a919c 2066 if (opc == '+' && otherval)
bb263b4e 2067 /*EMPTY*/; /* replace with otherval */
979f2922
TS
2068 else if (opc == '!' && !otherval)
2069 otherval = 1;
2070 else if (opc == '-' && otherval)
2071 otherval = 0;
2072 else if (opc == '&' && !otherval)
2073 otherval = 0;
2074 else {
711a919c 2075 s += octets; /* no replacement */
979f2922
TS
2076 continue;
2077 }
2078
2079 if (bits == 8)
2080 *s++ = (U8)( otherval & 0xff);
2081 else if (bits == 16) {
2082 *s++ = (U8)((otherval >> 8) & 0xff);
2083 *s++ = (U8)( otherval & 0xff);
2084 }
2085 else if (bits == 32) {
2086 *s++ = (U8)((otherval >> 24) & 0xff);
2087 *s++ = (U8)((otherval >> 16) & 0xff);
2088 *s++ = (U8)((otherval >> 8) & 0xff);
2089 *s++ = (U8)( otherval & 0xff);
2090 }
2091 }
2092 }
2093 sv_free(other); /* through with it! */
2094 } /* while */
2095 return swatch;
2096}
2097
0f830e0b
NC
2098/*
2099=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2100
2101Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2102of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2103bytes available. The return value is the pointer to the byte after the
2104end of the new character. In other words,
2105
2106 d = uvchr_to_utf8(d, uv);
2107
2108is the recommended wide native character-aware way of saying
2109
2110 *(d++) = uv;
2111
2112=cut
2113*/
2114
2115/* On ASCII machines this is normally a macro but we want a
2116 real function in case XS code wants it
2117*/
0f830e0b
NC
2118U8 *
2119Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2120{
2121 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2122}
2123
b851fbc1
JH
2124U8 *
2125Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2126{
2127 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2128}
2b9d42f0
NIS
2129
2130/*
0f830e0b
NC
2131=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2132flags
2133
2134Returns the native character value of the first character in the string
2135C<s>
2136which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2137length, in bytes, of that character.
2138
2139Allows length and flags to be passed to low level routine.
2140
2141=cut
2142*/
2143/* On ASCII machines this is normally a macro but we want
2144 a real function in case XS code wants it
2145*/
0f830e0b
NC
2146UV
2147Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2148U32 flags)
2149{
2150 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2151 return UNI_TO_NATIVE(uv);
2152}
2153
2154/*
d2cc3551
JH
2155=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2156
2157Build to the scalar dsv a displayable version of the string spv,
2158length len, the displayable version being at most pvlim bytes long
2159(if longer, the rest is truncated and "..." will be appended).
0a2ef054 2160
9e55ce06 2161The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 2162isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054
JH
2163to display the \\[nrfta\\] as the backslashed versions (like '\n')
2164(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2165UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2166UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2167
d2cc3551
JH
2168The pointer to the PV of the dsv is returned.
2169
2170=cut */
e6b2e755 2171char *
e1ec3a88 2172Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755
JH
2173{
2174 int truncated = 0;
e1ec3a88 2175 const char *s, *e;
e6b2e755
JH
2176
2177 sv_setpvn(dsv, "", 0);
e1ec3a88 2178 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 2179 UV u;
a49f32c6
NC
2180 /* This serves double duty as a flag and a character to print after
2181 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2182 */
2183 char ok = 0;
c728cb41 2184
e6b2e755
JH
2185 if (pvlim && SvCUR(dsv) >= pvlim) {
2186 truncated++;
2187 break;
2188 }
2189 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 2190 if (u < 256) {
a3b680e6 2191 const unsigned char c = (unsigned char)u & 0xFF;
0bd48802 2192 if (flags & UNI_DISPLAY_BACKSLASH) {
a49f32c6 2193 switch (c) {
c728cb41 2194 case '\n':
a49f32c6 2195 ok = 'n'; break;
c728cb41 2196 case '\r':
a49f32c6 2197 ok = 'r'; break;
c728cb41 2198 case '\t':
a49f32c6 2199 ok = 't'; break;
c728cb41 2200 case '\f':
a49f32c6 2201 ok = 'f'; break;
c728cb41 2202 case '\a':
a49f32c6 2203 ok = 'a'; break;
c728cb41 2204 case '\\':
a49f32c6 2205 ok = '\\'; break;
c728cb41
JH
2206 default: break;
2207 }
a49f32c6
NC
2208 if (ok) {
2209 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
2210 }
c728cb41 2211 }
00e86452 2212 /* isPRINT() is the locale-blind version. */
a49f32c6
NC
2213 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2214 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
2215 ok = 1;
0a2ef054 2216 }
c728cb41
JH
2217 }
2218 if (!ok)
9e55ce06 2219 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755
JH
2220 }
2221 if (truncated)
396482e1 2222 sv_catpvs(dsv, "...");
e6b2e755
JH
2223
2224 return SvPVX(dsv);
2225}
2b9d42f0 2226
d2cc3551
JH
2227/*
2228=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2229
2230Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 2231the displayable version being at most pvlim bytes long
d2cc3551 2232(if longer, the rest is truncated and "..." will be appended).
0a2ef054
JH
2233
2234The flags argument is as in pv_uni_display().
2235
d2cc3551
JH
2236The pointer to the PV of the dsv is returned.
2237
2238=cut */
e6b2e755
JH
2239char *
2240Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2241{
cfd0369c
NC
2242 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2243 SvCUR(ssv), pvlim, flags);
701a277b
JH
2244}
2245
d2cc3551 2246/*
d07ddd77 2247=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551
JH
2248
2249Return true if the strings s1 and s2 differ case-insensitively, false
2250if not (if they are equal case-insensitively). If u1 is true, the
2251string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77
JH
2252the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
2253are false, the respective string is assumed to be in native 8-bit
2254encoding.
2255
2256If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2257in there (they will point at the beginning of the I<next> character).
2258If the pointers behind pe1 or pe2 are non-NULL, they are the end
2259pointers beyond which scanning will not continue under any
4cdaeff7 2260circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77
JH
2261s2+l2 will be used as goal end pointers that will also stop the scan,
2262and which qualify towards defining a successful match: all the scans
2263that define an explicit length must reach their goal pointers for
2264a match to succeed).
d2cc3551
JH
2265
2266For case-insensitiveness, the "casefolding" of Unicode is used
2267instead of upper/lowercasing both the characters, see
2268http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2269
2270=cut */
701a277b 2271I32
d07ddd77 2272Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 2273{
97aff369 2274 dVAR;
e1ec3a88
AL
2275 register const U8 *p1 = (const U8*)s1;
2276 register const U8 *p2 = (const U8*)s2;
cbbf8932 2277 register const U8 *f1 = NULL;
2f73348c 2278 register const U8 *f2 = NULL;
cbbf8932
AL
2279 register U8 *e1 = NULL;
2280 register U8 *q1 = NULL;
2281 register U8 *e2 = NULL;
2282 register U8 *q2 = NULL;
d07ddd77 2283 STRLEN n1 = 0, n2 = 0;
89ebb4a3
JH
2284 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2285 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8
JH
2286 U8 natbuf[1+1];
2287 STRLEN foldlen1, foldlen2;
d07ddd77 2288 bool match;
332ddc25 2289
d07ddd77
JH
2290 if (pe1)
2291 e1 = *(U8**)pe1;
e1ec3a88
AL
2292 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2293 f1 = (const U8*)s1 + l1;
d07ddd77
JH
2294 if (pe2)
2295 e2 = *(U8**)pe2;
e1ec3a88
AL
2296 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2297 f2 = (const U8*)s2 + l2;
d07ddd77
JH
2298
2299 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2300 return 1; /* mismatch; possible infinite loop or false positive */
2301
a6872d42
JH
2302 if (!u1 || !u2)
2303 natbuf[1] = 0; /* Need to terminate the buffer. */
2304
d07ddd77
JH
2305 while ((e1 == 0 || p1 < e1) &&
2306 (f1 == 0 || p1 < f1) &&
2307 (e2 == 0 || p2 < e2) &&
2308 (f2 == 0 || p2 < f2)) {
2309 if (n1 == 0) {
d7f013c8
JH
2310 if (u1)
2311 to_utf8_fold(p1, foldbuf1, &foldlen1);
2312 else {
809e8e66 2313 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
d7f013c8
JH
2314 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2315 }
2316 q1 = foldbuf1;
d07ddd77 2317 n1 = foldlen1;
332ddc25 2318 }
d07ddd77 2319 if (n2 == 0) {
d7f013c8
JH
2320 if (u2)
2321 to_utf8_fold(p2, foldbuf2, &foldlen2);
2322 else {
809e8e66 2323 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
d7f013c8
JH
2324 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2325 }
2326 q2 = foldbuf2;
d07ddd77 2327 n2 = foldlen2;
332ddc25 2328 }
d07ddd77
JH
2329 while (n1 && n2) {
2330 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2331 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2332 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 2333 return 1; /* mismatch */
d07ddd77 2334 n1 -= UTF8SKIP(q1);
d7f013c8 2335 q1 += UTF8SKIP(q1);
d07ddd77 2336 n2 -= UTF8SKIP(q2);
d7f013c8 2337 q2 += UTF8SKIP(q2);
701a277b 2338 }
d07ddd77 2339 if (n1 == 0)
d7f013c8 2340 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2341 if (n2 == 0)
d7f013c8
JH
2342 p2 += u2 ? UTF8SKIP(p2) : 1;
2343
d2cc3551 2344 }
5469e704 2345
d07ddd77
JH
2346 /* A match is defined by all the scans that specified
2347 * an explicit length reaching their final goals. */
2348 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704
JH
2349
2350 if (match) {
d07ddd77
JH
2351 if (pe1)
2352 *pe1 = (char*)p1;
2353 if (pe2)
2354 *pe2 = (char*)p2;
5469e704
JH
2355 }
2356
2357 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2358}
701a277b 2359
a49f32c6
NC
2360/*
2361 * Local variables:
2362 * c-indentation-style: bsd
2363 * c-basic-offset: 4
2364 * indent-tabs-mode: t
2365 * End:
2366 *
37442d52
RGS
2367 * ex: set ts=8 sts=4 sw=4 noet:
2368 */