This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
The last parameter to gv_stashpv/gv_stashpvn/gv_stashsv is a bitmask
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
fdf8c088 3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007
b94e2f88 4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
19 *
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
22 */
23
24#include "EXTERN.h"
864dbfa3 25#define PERL_IN_UTF8_C
a0ed51b3
LW
26#include "perl.h"
27
a0c21aa1
JH
28#ifndef EBCDIC
29/* Separate prototypes needed because in ASCII systems these
30 * usually macros but they still are compiled as code, too. */
31PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
32PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
33#endif
34
27da23d5
JH
35static const char unees[] =
36 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 37
ccfc67b7
JH
38/*
39=head1 Unicode Support
a0ed51b3 40
166f8a29
DM
41This file contains various utility functions for manipulating UTF8-encoded
42strings. For the uninitiated, this is a method of representing arbitrary
61296642 43Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
44characters in the ASCII range are unmodified, and a zero byte never appears
45within non-zero characters.
166f8a29 46
b851fbc1 47=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 48
1e54db1a 49Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89ebb4a3 50of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 51bytes available. The return value is the pointer to the byte after the
9041c2e3 52end of the new character. In other words,
eebe1485 53
b851fbc1
JH
54 d = uvuni_to_utf8_flags(d, uv, flags);
55
56or, in most cases,
57
9041c2e3 58 d = uvuni_to_utf8(d, uv);
eebe1485 59
b851fbc1
JH
60(which is equivalent to)
61
62 d = uvuni_to_utf8_flags(d, uv, 0);
63
eebe1485
SC
64is the recommended Unicode-aware way of saying
65
66 *(d++) = uv;
67
68=cut
69*/
70
dfe13c55 71U8 *
b851fbc1 72Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 73{
62961d2e 74 if (ckWARN(WARN_UTF8)) {
b851fbc1
JH
75 if (UNICODE_IS_SURROGATE(uv) &&
76 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 77 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1
JH
78 else if (
79 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
80 !(flags & UNICODE_ALLOW_FDD0))
81 ||
c867b360 82 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1
JH
83 !(flags & UNICODE_ALLOW_FFFF))) &&
84 /* UNICODE_ALLOW_SUPER includes
2a20b9da 85 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1
JH
86 ((uv <= PERL_UNICODE_MAX) ||
87 !(flags & UNICODE_ALLOW_SUPER))
88 )
9014280d 89 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800
JH
90 "Unicode character 0x%04"UVxf" is illegal", uv);
91 }
c4d5f83a 92 if (UNI_IS_INVARIANT(uv)) {
eb160463 93 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3
LW
94 return d;
95 }
2d331972 96#if defined(EBCDIC)
1d72bdf6
NIS
97 else {
98 STRLEN len = UNISKIP(uv);
99 U8 *p = d+len-1;
100 while (p > d) {
eb160463 101 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
102 uv >>= UTF_ACCUMULATION_SHIFT;
103 }
eb160463 104 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
105 return d+len;
106 }
107#else /* Non loop style */
a0ed51b3 108 if (uv < 0x800) {
eb160463
GS
109 *d++ = (U8)(( uv >> 6) | 0xc0);
110 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
111 return d;
112 }
113 if (uv < 0x10000) {
eb160463
GS
114 *d++ = (U8)(( uv >> 12) | 0xe0);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
117 return d;
118 }
119 if (uv < 0x200000) {
eb160463
GS
120 *d++ = (U8)(( uv >> 18) | 0xf0);
121 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
123 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
124 return d;
125 }
126 if (uv < 0x4000000) {
eb160463
GS
127 *d++ = (U8)(( uv >> 24) | 0xf8);
128 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
129 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
131 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
132 return d;
133 }
134 if (uv < 0x80000000) {
eb160463
GS
135 *d++ = (U8)(( uv >> 30) | 0xfc);
136 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
137 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
138 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
139 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
140 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
141 return d;
142 }
6b8eaf93 143#ifdef HAS_QUAD
d7578b48 144 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
145#endif
146 {
eb160463
GS
147 *d++ = 0xfe; /* Can't match U+FEFF! */
148 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
149 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
150 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
151 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
152 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
153 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
154 return d;
155 }
6b8eaf93 156#ifdef HAS_QUAD
a0ed51b3 157 {
eb160463
GS
158 *d++ = 0xff; /* Can't match U+FFFE! */
159 *d++ = 0x80; /* 6 Reserved bits */
160 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
161 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
163 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
164 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
165 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
166 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
167 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
168 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
169 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
170 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
171 return d;
172 }
173#endif
1d72bdf6 174#endif /* Loop style */
a0ed51b3 175}
9041c2e3 176
646ca15d
JH
177/*
178
179Tests if some arbitrary number of bytes begins in a valid UTF-8
180character. Note that an INVARIANT (i.e. ASCII) character is a valid
181UTF-8 character. The actual number of bytes in the UTF-8 character
182will be returned if it is valid, otherwise 0.
183
184This is the "slow" version as opposed to the "fast" version which is
185the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
186difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
187or less you should use the IS_UTF8_CHAR(), for lengths of five or more
188you should use the _slow(). In practice this means that the _slow()
189will be used very rarely, since the maximum Unicode code point (as of
190Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
191the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
192five bytes or more.
193
194=cut */
c053b435 195STATIC STRLEN
5f66b61c 196S_is_utf8_char_slow(const U8 *s, const STRLEN len)
646ca15d
JH
197{
198 U8 u = *s;
199 STRLEN slen;
200 UV uv, ouv;
201
202 if (UTF8_IS_INVARIANT(u))
203 return 1;
204
205 if (!UTF8_IS_START(u))
206 return 0;
207
208 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
209 return 0;
210
211 slen = len - 1;
212 s++;
77263263
ST
213#ifdef EBCDIC
214 u = NATIVE_TO_UTF(u);
215#endif
646ca15d
JH
216 u &= UTF_START_MASK(len);
217 uv = u;
218 ouv = uv;
219 while (slen--) {
220 if (!UTF8_IS_CONTINUATION(*s))
221 return 0;
222 uv = UTF8_ACCUMULATE(uv, *s);
223 if (uv < ouv)
224 return 0;
225 ouv = uv;
226 s++;
227 }
228
229 if ((STRLEN)UNISKIP(uv) < len)
230 return 0;
231
232 return len;
233}
9041c2e3
NIS
234
235/*
7fc63493 236=for apidoc A|STRLEN|is_utf8_char|const U8 *s
eebe1485 237
5da9da9e 238Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01
JH
239character. Note that an INVARIANT (i.e. ASCII) character is a valid
240UTF-8 character. The actual number of bytes in the UTF-8 character
241will be returned if it is valid, otherwise 0.
9041c2e3 242
82686b01 243=cut */
067a85ef 244STRLEN
7fc63493 245Perl_is_utf8_char(pTHX_ const U8 *s)
386d01d6 246{
44f8325f 247 const STRLEN len = UTF8SKIP(s);
96a5add6 248 PERL_UNUSED_CONTEXT;
3b0fc154 249#ifdef IS_UTF8_CHAR
768c67ee 250 if (IS_UTF8_CHAR_FAST(len))
3b0fc154
JH
251 return IS_UTF8_CHAR(s, len) ? len : 0;
252#endif /* #ifdef IS_UTF8_CHAR */
2c0c5f92 253 return is_utf8_char_slow(s, len);
386d01d6
GS
254}
255
6662521e 256/*
7fc63493 257=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
6662521e 258
c9ada85f 259Returns true if first C<len> bytes of the given string form a valid
1e54db1a
JH
260UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
261not mean 'a string that contains code points above 0x7F encoded in UTF-8'
262because a valid ASCII string is a valid UTF-8 string.
6662521e 263
768c67ee
JH
264See also is_utf8_string_loclen() and is_utf8_string_loc().
265
6662521e
GS
266=cut
267*/
268
8e84507e 269bool
7fc63493 270Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
6662521e 271{
35da51f7 272 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 273 const U8* x = s;
067a85ef 274
96a5add6 275 PERL_UNUSED_CONTEXT;
1aa99e6b 276
6662521e 277 while (x < send) {
a3b680e6 278 STRLEN c;
1acdb0da
JH
279 /* Inline the easy bits of is_utf8_char() here for speed... */
280 if (UTF8_IS_INVARIANT(*x))
281 c = 1;
282 else if (!UTF8_IS_START(*x))
768c67ee 283 goto out;
1acdb0da
JH
284 else {
285 /* ... and call is_utf8_char() only if really needed. */
646ca15d
JH
286#ifdef IS_UTF8_CHAR
287 c = UTF8SKIP(x);
768c67ee
JH
288 if (IS_UTF8_CHAR_FAST(c)) {
289 if (!IS_UTF8_CHAR(x, c))
3c614e38
AD
290 c = 0;
291 }
292 else
293 c = is_utf8_char_slow(x, c);
646ca15d
JH
294#else
295 c = is_utf8_char(x);
296#endif /* #ifdef IS_UTF8_CHAR */
1acdb0da 297 if (!c)
768c67ee 298 goto out;
1acdb0da 299 }
6662521e 300 x += c;
6662521e 301 }
768c67ee
JH
302
303 out:
60006e79
JH
304 if (x != send)
305 return FALSE;
067a85ef
A
306
307 return TRUE;
6662521e
GS
308}
309
67e989fb 310/*
814fafa7
NC
311Implemented as a macro in utf8.h
312
313=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
314
315Like is_utf8_string() but stores the location of the failure (in the
316case of "utf8ness failure") or the location s+len (in the case of
317"utf8ness success") in the C<ep>.
318
319See also is_utf8_string_loclen() and is_utf8_string().
320
768c67ee 321=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
81cd54e3 322
e3e4599f 323Like is_utf8_string() but stores the location of the failure (in the
768c67ee
JH
324case of "utf8ness failure") or the location s+len (in the case of
325"utf8ness success") in the C<ep>, and the number of UTF-8
326encoded characters in the C<el>.
327
328See also is_utf8_string_loc() and is_utf8_string().
81cd54e3
JH
329
330=cut
331*/
332
333bool
768c67ee 334Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 335{
35da51f7 336 const U8* const send = s + (len ? len : strlen((const char *)s));
7fc63493 337 const U8* x = s;
81cd54e3 338 STRLEN c;
3ebfea28 339 STRLEN outlen = 0;
96a5add6 340 PERL_UNUSED_CONTEXT;
81cd54e3 341
81cd54e3
JH
342 while (x < send) {
343 /* Inline the easy bits of is_utf8_char() here for speed... */
344 if (UTF8_IS_INVARIANT(*x))
768c67ee
JH
345 c = 1;
346 else if (!UTF8_IS_START(*x))
347 goto out;
81cd54e3 348 else {
768c67ee
JH
349 /* ... and call is_utf8_char() only if really needed. */
350#ifdef IS_UTF8_CHAR
351 c = UTF8SKIP(x);
352 if (IS_UTF8_CHAR_FAST(c)) {
353 if (!IS_UTF8_CHAR(x, c))
354 c = 0;
355 } else
356 c = is_utf8_char_slow(x, c);
357#else
358 c = is_utf8_char(x);
359#endif /* #ifdef IS_UTF8_CHAR */
360 if (!c)
361 goto out;
81cd54e3 362 }
768c67ee 363 x += c;
3ebfea28 364 outlen++;
81cd54e3 365 }
768c67ee
JH
366
367 out:
3ebfea28
AL
368 if (el)
369 *el = outlen;
370
768c67ee
JH
371 if (ep)
372 *ep = x;
3ebfea28 373 return (x == send);
81cd54e3
JH
374}
375
376/*
768c67ee 377
7fc63493 378=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 379
9041c2e3
NIS
380Bottom level UTF-8 decode routine.
381Returns the unicode code point value of the first character in the string C<s>
1e54db1a 382which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 383C<retlen> will be set to the length, in bytes, of that character.
67e989fb 384
1e54db1a 385If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880
JH
386is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
387it is assumed that the caller will raise a warning, and this function
28d3d195
JH
388will silently just set C<retlen> to C<-1> and return zero. If the
389C<flags> does not contain UTF8_CHECK_ONLY, warnings about
390malformations will be given, C<retlen> will be set to the expected
391length of the UTF-8 character in bytes, and zero will be returned.
392
393The C<flags> can also contain various flags to allow deviations from
394the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 395
9041c2e3
NIS
396Most code should use utf8_to_uvchr() rather than call this directly.
397
37607a96
PK
398=cut
399*/
67e989fb 400
a0ed51b3 401UV
7fc63493 402Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 403{
97aff369 404 dVAR;
d4c19fe8 405 const U8 * const s0 = s;
9c5ffd7c 406 UV uv = *s, ouv = 0;
ba210ebe 407 STRLEN len = 1;
7fc63493
AL
408 const bool dowarn = ckWARN_d(WARN_UTF8);
409 const UV startbyte = *s;
ba210ebe 410 STRLEN expectlen = 0;
a0dbb045
JH
411 U32 warning = 0;
412
413/* This list is a superset of the UTF8_ALLOW_XXX. */
414
415#define UTF8_WARN_EMPTY 1
416#define UTF8_WARN_CONTINUATION 2
417#define UTF8_WARN_NON_CONTINUATION 3
418#define UTF8_WARN_FE_FF 4
419#define UTF8_WARN_SHORT 5
420#define UTF8_WARN_OVERFLOW 6
421#define UTF8_WARN_SURROGATE 7
c867b360
JH
422#define UTF8_WARN_LONG 8
423#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045
JH
424
425 if (curlen == 0 &&
426 !(flags & UTF8_ALLOW_EMPTY)) {
427 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
428 goto malformed;
429 }
430
1d72bdf6 431 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
432 if (retlen)
433 *retlen = 1;
c4d5f83a 434 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 435 }
67e989fb 436
421a8bf2 437 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 438 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 439 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
440 goto malformed;
441 }
442
421a8bf2 443 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 444 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 445 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
446 goto malformed;
447 }
9041c2e3 448
1d72bdf6 449#ifdef EBCDIC
75383841 450 uv = NATIVE_TO_UTF(uv);
1d72bdf6 451#else
fcc8fcf6
JH
452 if ((uv == 0xfe || uv == 0xff) &&
453 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 454 warning = UTF8_WARN_FE_FF;
ba210ebe 455 goto malformed;
a0ed51b3 456 }
1d72bdf6
NIS
457#endif
458
ba210ebe
JH
459 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
460 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
461 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
462 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
463#ifdef EBCDIC
464 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
465 else { len = 7; uv &= 0x01; }
466#else
ba210ebe
JH
467 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
468 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
469 else { len = 13; uv = 0; } /* whoa! */
470#endif
471
a0ed51b3
LW
472 if (retlen)
473 *retlen = len;
9041c2e3 474
ba210ebe
JH
475 expectlen = len;
476
fcc8fcf6
JH
477 if ((curlen < expectlen) &&
478 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 479 warning = UTF8_WARN_SHORT;
ba210ebe
JH
480 goto malformed;
481 }
482
483 len--;
a0ed51b3 484 s++;
ba210ebe
JH
485 ouv = uv;
486
a0ed51b3 487 while (len--) {
421a8bf2
JH
488 if (!UTF8_IS_CONTINUATION(*s) &&
489 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
490 s--;
491 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 492 goto malformed;
a0ed51b3
LW
493 }
494 else
8850bf83 495 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
496 if (!(uv > ouv)) {
497 /* These cannot be allowed. */
498 if (uv == ouv) {
75dbc644 499 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045
JH
500 warning = UTF8_WARN_LONG;
501 goto malformed;
502 }
503 }
504 else { /* uv < ouv */
505 /* This cannot be allowed. */
506 warning = UTF8_WARN_OVERFLOW;
507 goto malformed;
508 }
ba210ebe
JH
509 }
510 s++;
511 ouv = uv;
512 }
513
421a8bf2 514 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 515 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 516 warning = UTF8_WARN_SURROGATE;
ba210ebe 517 goto malformed;
eb160463 518 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 519 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 520 warning = UTF8_WARN_LONG;
ba210ebe 521 goto malformed;
421a8bf2 522 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 523 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 524 warning = UTF8_WARN_FFFF;
a9917092 525 goto malformed;
a0ed51b3 526 }
ba210ebe 527
a0ed51b3 528 return uv;
ba210ebe
JH
529
530malformed:
531
fcc8fcf6 532 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 533 if (retlen)
10edeb5d 534 *retlen = ((STRLEN) -1);
ba210ebe
JH
535 return 0;
536 }
537
a0dbb045 538 if (dowarn) {
396482e1 539 SV* const sv = sv_2mortal(newSVpvs("Malformed UTF-8 character "));
a0dbb045
JH
540
541 switch (warning) {
542 case 0: /* Intentionally empty. */ break;
543 case UTF8_WARN_EMPTY:
396482e1 544 sv_catpvs(sv, "(empty string)");
a0dbb045
JH
545 break;
546 case UTF8_WARN_CONTINUATION:
097fb8e2 547 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045
JH
548 break;
549 case UTF8_WARN_NON_CONTINUATION:
097fb8e2
JH
550 if (s == s0)
551 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
552 (UV)s[1], startbyte);
551405c4
AL
553 else {
554 const int len = (int)(s-s0);
097fb8e2 555 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
551405c4
AL
556 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
557 }
558
a0dbb045
JH
559 break;
560 case UTF8_WARN_FE_FF:
561 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
562 break;
563 case UTF8_WARN_SHORT:
097fb8e2 564 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 565 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 566 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
567 break;
568 case UTF8_WARN_OVERFLOW:
097fb8e2
JH
569 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
570 ouv, *s, startbyte);
a0dbb045
JH
571 break;
572 case UTF8_WARN_SURROGATE:
573 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
574 break;
a0dbb045 575 case UTF8_WARN_LONG:
097fb8e2 576 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 577 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045
JH
578 break;
579 case UTF8_WARN_FFFF:
580 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
581 break;
582 default:
396482e1 583 sv_catpvs(sv, "(unknown reason)");
a0dbb045
JH
584 break;
585 }
586
587 if (warning) {
44f8325f 588 const char * const s = SvPVX_const(sv);
a0dbb045
JH
589
590 if (PL_op)
9014280d 591 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 592 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 593 else
9014280d 594 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045
JH
595 }
596 }
597
ba210ebe 598 if (retlen)
28d3d195 599 *retlen = expectlen ? expectlen : len;
ba210ebe 600
28d3d195 601 return 0;
a0ed51b3
LW
602}
603
8e84507e 604/*
7fc63493 605=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
9041c2e3
NIS
606
607Returns the native character value of the first character in the string C<s>
1e54db1a 608which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
609length, in bytes, of that character.
610
1e54db1a 611If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3
NIS
612returned and retlen is set, if possible, to -1.
613
614=cut
615*/
616
617UV
7fc63493 618Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 619{
1754c1a1
NC
620 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
621 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3
NIS
622}
623
624/*
7fc63493 625=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
9041c2e3
NIS
626
627Returns the Unicode code point of the first character in the string C<s>
1e54db1a 628which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
629length, in bytes, of that character.
630
631This function should only be used when returned UV is considered
632an index into the Unicode semantic tables (e.g. swashes).
633
1e54db1a 634If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 635returned and retlen is set, if possible, to -1.
8e84507e
NIS
636
637=cut
638*/
639
640UV
7fc63493 641Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 642{
9041c2e3 643 /* Call the low level routine asking for checks */
89ebb4a3 644 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 645 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e
NIS
646}
647
b76347f2 648/*
35a4481c 649=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
b76347f2
JH
650
651Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
652Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
653up past C<e>, croaks.
b76347f2
JH
654
655=cut
656*/
657
658STRLEN
35a4481c 659Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2 660{
97aff369 661 dVAR;
b76347f2 662 STRLEN len = 0;
10edeb5d 663 U8 t = 0;
b76347f2 664
8850bf83
JH
665 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
666 * the bitops (especially ~) can create illegal UTF-8.
667 * In other words: in Perl UTF-8 is not just for Unicode. */
668
a3b680e6
AL
669 if (e < s)
670 goto warn_and_return;
b76347f2 671 while (s < e) {
10edeb5d 672 t = UTF8SKIP(s);
901b21bf 673 if (e - s < t) {
a3b680e6 674 warn_and_return:
901b21bf
JH
675 if (ckWARN_d(WARN_UTF8)) {
676 if (PL_op)
677 Perl_warner(aTHX_ packWARN(WARN_UTF8),
a3b680e6 678 "%s in %s", unees, OP_DESC(PL_op));
901b21bf
JH
679 else
680 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
681 }
682 return len;
683 }
b76347f2
JH
684 s += t;
685 len++;
686 }
687
688 return len;
689}
690
b06226ff 691/*
35a4481c 692=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
b06226ff 693
1e54db1a 694Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
695and C<b>.
696
697WARNING: use only if you *know* that the pointers point inside the
698same UTF-8 buffer.
699
37607a96
PK
700=cut
701*/
a0ed51b3 702
02eb7b47 703IV
35a4481c 704Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 705{
bf1665bc 706 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
a0ed51b3
LW
707}
708
b06226ff 709/*
37607a96 710=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 711
8850bf83
JH
712Return the UTF-8 pointer C<s> displaced by C<off> characters, either
713forward or backward.
b06226ff
JH
714
715WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
716the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
717on the first byte of character or just after the last byte of a character.
b06226ff 718
37607a96
PK
719=cut
720*/
a0ed51b3
LW
721
722U8 *
4373e329 723Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 724{
96a5add6 725 PERL_UNUSED_CONTEXT;
8850bf83
JH
726 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
727 * the bitops (especially ~) can create illegal UTF-8.
728 * In other words: in Perl UTF-8 is not just for Unicode. */
729
a0ed51b3
LW
730 if (off >= 0) {
731 while (off--)
732 s += UTF8SKIP(s);
733 }
734 else {
735 while (off++) {
736 s--;
8850bf83
JH
737 while (UTF8_IS_CONTINUATION(*s))
738 s--;
a0ed51b3
LW
739 }
740 }
4373e329 741 return (U8 *)s;
a0ed51b3
LW
742}
743
6940069f 744/*
eebe1485 745=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 746
1e54db1a 747Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53
MG
748Unlike C<bytes_to_utf8>, this over-writes the original string, and
749updates len to contain the new length.
67e989fb 750Returns zero on failure, setting C<len> to -1.
6940069f 751
95be277c
NC
752If you need a copy of the string, see C<bytes_from_utf8>.
753
6940069f
GS
754=cut
755*/
756
757U8 *
37607a96 758Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 759{
d4c19fe8
AL
760 U8 * const save = s;
761 U8 * const send = s + *len;
6940069f 762 U8 *d;
246fae53 763
1e54db1a 764 /* ensure valid UTF-8 and chars < 256 before updating string */
d4c19fe8 765 while (s < send) {
dcad2880
JH
766 U8 c = *s++;
767
1d72bdf6
NIS
768 if (!UTF8_IS_INVARIANT(c) &&
769 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
770 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
10edeb5d 771 *len = ((STRLEN) -1);
dcad2880
JH
772 return 0;
773 }
246fae53 774 }
dcad2880
JH
775
776 d = s = save;
6940069f 777 while (s < send) {
ed646e6e 778 STRLEN ulen;
9041c2e3 779 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 780 s += ulen;
6940069f
GS
781 }
782 *d = '\0';
246fae53 783 *len = d - save;
6940069f
GS
784 return save;
785}
786
787/*
e1ec3a88 788=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
f9a63242 789
1e54db1a 790Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
35a4481c 791Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
792the newly-created string, and updates C<len> to contain the new
793length. Returns the original string if no conversion occurs, C<len>
794is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
7950 if C<s> is converted or contains all 7bit characters.
f9a63242 796
37607a96
PK
797=cut
798*/
f9a63242
JH
799
800U8 *
e1ec3a88 801Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 802{
f9a63242 803 U8 *d;
e1ec3a88
AL
804 const U8 *start = s;
805 const U8 *send;
f9a63242
JH
806 I32 count = 0;
807
96a5add6 808 PERL_UNUSED_CONTEXT;
f9a63242 809 if (!*is_utf8)
73d840c0 810 return (U8 *)start;
f9a63242 811
1e54db1a 812 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 813 for (send = s + *len; s < send;) {
e1ec3a88 814 U8 c = *s++;
1d72bdf6 815 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
816 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
817 (c = *s++) && UTF8_IS_CONTINUATION(c))
818 count++;
819 else
73d840c0 820 return (U8 *)start;
db42d148 821 }
f9a63242
JH
822 }
823
35da51f7 824 *is_utf8 = FALSE;
f9a63242 825
212542aa 826 Newx(d, (*len) - count + 1, U8);
ef9edfd0 827 s = start; start = d;
f9a63242
JH
828 while (s < send) {
829 U8 c = *s++;
c4d5f83a
NIS
830 if (!UTF8_IS_INVARIANT(c)) {
831 /* Then it is two-byte encoded */
832 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
833 c = ASCII_TO_NATIVE(c);
834 }
835 *d++ = c;
f9a63242
JH
836 }
837 *d = '\0';
838 *len = d - start;
73d840c0 839 return (U8 *)start;
f9a63242
JH
840}
841
842/*
35a4481c 843=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
6940069f 844
1e54db1a 845Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e
GS
846Returns a pointer to the newly-created string, and sets C<len> to
847reflect the new length.
6940069f 848
1e54db1a 849If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f
JH
850see sv_recode_to_utf8().
851
497711e7 852=cut
6940069f
GS
853*/
854
855U8*
35a4481c 856Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 857{
35a4481c 858 const U8 * const send = s + (*len);
6940069f
GS
859 U8 *d;
860 U8 *dst;
96a5add6 861 PERL_UNUSED_CONTEXT;
6940069f 862
212542aa 863 Newx(d, (*len) * 2 + 1, U8);
6940069f
GS
864 dst = d;
865
866 while (s < send) {
35a4481c 867 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 868 if (UNI_IS_INVARIANT(uv))
eb160463 869 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 870 else {
eb160463
GS
871 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
872 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
873 }
874 }
875 *d = '\0';
6662521e 876 *len = d-dst;
6940069f
GS
877 return dst;
878}
879
a0ed51b3 880/*
dea0fc0b 881 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
882 *
883 * Destination must be pre-extended to 3/2 source. Do not use in-place.
884 * We optimize for native, for obvious reasons. */
885
886U8*
dea0fc0b 887Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 888{
dea0fc0b
JH
889 U8* pend;
890 U8* dstart = d;
891
1de9afcd
RGS
892 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
893 d[0] = 0;
894 *newlen = 1;
895 return d;
896 }
897
dea0fc0b 898 if (bytelen & 1)
f5992bc4 899 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
dea0fc0b
JH
900
901 pend = p + bytelen;
902
a0ed51b3 903 while (p < pend) {
dea0fc0b
JH
904 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
905 p += 2;
a0ed51b3 906 if (uv < 0x80) {
e294cc5d
JH
907#ifdef EBCDIC
908 *d++ = UNI_TO_NATIVE(uv);
909#else
eb160463 910 *d++ = (U8)uv;
e294cc5d 911#endif
a0ed51b3
LW
912 continue;
913 }
914 if (uv < 0x800) {
eb160463
GS
915 *d++ = (U8)(( uv >> 6) | 0xc0);
916 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
917 continue;
918 }
919 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e
DT
920 UV low = (p[0] << 8) + p[1];
921 p += 2;
dea0fc0b
JH
922 if (low < 0xdc00 || low >= 0xdfff)
923 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
924 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
925 }
926 if (uv < 0x10000) {
eb160463
GS
927 *d++ = (U8)(( uv >> 12) | 0xe0);
928 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
929 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
930 continue;
931 }
932 else {
eb160463
GS
933 *d++ = (U8)(( uv >> 18) | 0xf0);
934 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
935 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
936 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
937 continue;
938 }
939 }
dea0fc0b 940 *newlen = d - dstart;
a0ed51b3
LW
941 return d;
942}
943
944/* Note: this one is slightly destructive of the source. */
945
946U8*
dea0fc0b 947Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
948{
949 U8* s = (U8*)p;
d4c19fe8 950 U8* const send = s + bytelen;
a0ed51b3 951 while (s < send) {
d4c19fe8 952 const U8 tmp = s[0];
a0ed51b3
LW
953 s[0] = s[1];
954 s[1] = tmp;
955 s += 2;
956 }
dea0fc0b 957 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
958}
959
960/* for now these are all defined (inefficiently) in terms of the utf8 versions */
961
962bool
84afefe6 963Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 964{
89ebb4a3 965 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 966 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
967 return is_utf8_alnum(tmpbuf);
968}
969
970bool
84afefe6 971Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 972{
89ebb4a3 973 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 974 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
975 return is_utf8_alnumc(tmpbuf);
976}
977
978bool
84afefe6 979Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 980{
89ebb4a3 981 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 982 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
983 return is_utf8_idfirst(tmpbuf);
984}
985
986bool
84afefe6 987Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 988{
89ebb4a3 989 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 990 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
991 return is_utf8_alpha(tmpbuf);
992}
993
994bool
84afefe6 995Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 996{
89ebb4a3 997 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 998 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
999 return is_utf8_ascii(tmpbuf);
1000}
1001
1002bool
84afefe6 1003Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1004{
89ebb4a3 1005 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1006 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1007 return is_utf8_space(tmpbuf);
1008}
1009
1010bool
84afefe6 1011Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1012{
89ebb4a3 1013 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1014 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1015 return is_utf8_digit(tmpbuf);
1016}
1017
1018bool
84afefe6 1019Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1020{
89ebb4a3 1021 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1022 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1023 return is_utf8_upper(tmpbuf);
1024}
1025
1026bool
84afefe6 1027Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1028{
89ebb4a3 1029 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1030 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1031 return is_utf8_lower(tmpbuf);
1032}
1033
1034bool
84afefe6 1035Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1036{
89ebb4a3 1037 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1038 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1039 return is_utf8_cntrl(tmpbuf);
1040}
1041
1042bool
84afefe6 1043Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1044{
89ebb4a3 1045 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1046 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1047 return is_utf8_graph(tmpbuf);
1048}
1049
1050bool
84afefe6 1051Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1052{
89ebb4a3 1053 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1054 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1055 return is_utf8_print(tmpbuf);
1056}
1057
b8c5462f 1058bool
84afefe6 1059Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1060{
89ebb4a3 1061 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1062 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1063 return is_utf8_punct(tmpbuf);
1064}
1065
4d61ec05 1066bool
84afefe6 1067Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1068{
89ebb4a3 1069 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1070 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1071 return is_utf8_xdigit(tmpbuf);
1072}
1073
84afefe6
JH
1074UV
1075Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1076{
0ebc6274
JH
1077 uvchr_to_utf8(p, c);
1078 return to_utf8_upper(p, p, lenp);
a0ed51b3
LW
1079}
1080
84afefe6
JH
1081UV
1082Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1083{
0ebc6274
JH
1084 uvchr_to_utf8(p, c);
1085 return to_utf8_title(p, p, lenp);
a0ed51b3
LW
1086}
1087
84afefe6
JH
1088UV
1089Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1090{
0ebc6274
JH
1091 uvchr_to_utf8(p, c);
1092 return to_utf8_lower(p, p, lenp);
a0ed51b3
LW
1093}
1094
84afefe6
JH
1095UV
1096Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1097{
0ebc6274
JH
1098 uvchr_to_utf8(p, c);
1099 return to_utf8_fold(p, p, lenp);
84afefe6
JH
1100}
1101
a0ed51b3
LW
1102/* for now these all assume no locale info available for Unicode > 255 */
1103
1104bool
84afefe6 1105Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
1106{
1107 return is_uni_alnum(c); /* XXX no locale support yet */
1108}
1109
1110bool
84afefe6 1111Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f
JH
1112{
1113 return is_uni_alnumc(c); /* XXX no locale support yet */
1114}
1115
1116bool
84afefe6 1117Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
1118{
1119 return is_uni_idfirst(c); /* XXX no locale support yet */
1120}
1121
1122bool
84afefe6 1123Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
1124{
1125 return is_uni_alpha(c); /* XXX no locale support yet */
1126}
1127
1128bool
84afefe6 1129Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
1130{
1131 return is_uni_ascii(c); /* XXX no locale support yet */
1132}
1133
1134bool
84afefe6 1135Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
1136{
1137 return is_uni_space(c); /* XXX no locale support yet */
1138}
1139
1140bool
84afefe6 1141Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
1142{
1143 return is_uni_digit(c); /* XXX no locale support yet */
1144}
1145
1146bool
84afefe6 1147Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1148{
1149 return is_uni_upper(c); /* XXX no locale support yet */
1150}
1151
1152bool
84afefe6 1153Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1154{
1155 return is_uni_lower(c); /* XXX no locale support yet */
1156}
1157
1158bool
84afefe6 1159Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1160{
1161 return is_uni_cntrl(c); /* XXX no locale support yet */
1162}
1163
1164bool
84afefe6 1165Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1166{
1167 return is_uni_graph(c); /* XXX no locale support yet */
1168}
1169
1170bool
84afefe6 1171Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1172{
1173 return is_uni_print(c); /* XXX no locale support yet */
1174}
1175
b8c5462f 1176bool
84afefe6 1177Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1178{
1179 return is_uni_punct(c); /* XXX no locale support yet */
1180}
1181
4d61ec05 1182bool
84afefe6 1183Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1184{
1185 return is_uni_xdigit(c); /* XXX no locale support yet */
1186}
1187
b7ac61fa
JH
1188U32
1189Perl_to_uni_upper_lc(pTHX_ U32 c)
1190{
ee099d14
JH
1191 /* XXX returns only the first character -- do not use XXX */
1192 /* XXX no locale support yet */
1193 STRLEN len;
89ebb4a3 1194 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1195 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1196}
1197
1198U32
1199Perl_to_uni_title_lc(pTHX_ U32 c)
1200{
ee099d14
JH
1201 /* XXX returns only the first character XXX -- do not use XXX */
1202 /* XXX no locale support yet */
1203 STRLEN len;
89ebb4a3 1204 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1205 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1206}
1207
1208U32
1209Perl_to_uni_lower_lc(pTHX_ U32 c)
1210{
ee099d14
JH
1211 /* XXX returns only the first character -- do not use XXX */
1212 /* XXX no locale support yet */
1213 STRLEN len;
89ebb4a3 1214 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1215 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1216}
1217
7452cf6a 1218static bool
5141f98e 1219S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
bde6a22d
NC
1220 const char *const swashname)
1221{
97aff369 1222 dVAR;
bde6a22d
NC
1223 if (!is_utf8_char(p))
1224 return FALSE;
1225 if (!*swash)
711a919c 1226 *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
bde6a22d
NC
1227 return swash_fetch(*swash, p, TRUE) != 0;
1228}
1229
1230bool
7fc63493 1231Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1232{
97aff369 1233 dVAR;
671c33bf
NC
1234 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1235 * descendant of isalnum(3), in other words, it doesn't
1236 * contain the '_'. --jhi */
d4c19fe8 1237 return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
a0ed51b3
LW
1238}
1239
1240bool
7fc63493 1241Perl_is_utf8_alnumc(pTHX_ const U8 *p)
b8c5462f 1242{
97aff369 1243 dVAR;
d4c19fe8 1244 return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
b8c5462f
JH
1245}
1246
1247bool
7fc63493 1248Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1249{
97aff369 1250 dVAR;
82686b01
JH
1251 if (*p == '_')
1252 return TRUE;
bde6a22d 1253 /* is_utf8_idstart would be more logical. */
d4c19fe8 1254 return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
82686b01
JH
1255}
1256
1257bool
7fc63493 1258Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01 1259{
97aff369 1260 dVAR;
82686b01
JH
1261 if (*p == '_')
1262 return TRUE;
d4c19fe8 1263 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
a0ed51b3
LW
1264}
1265
1266bool
7fc63493 1267Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1268{
97aff369 1269 dVAR;
d4c19fe8 1270 return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
a0ed51b3
LW
1271}
1272
1273bool
7fc63493 1274Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1275{
97aff369 1276 dVAR;
d4c19fe8 1277 return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
b8c5462f
JH
1278}
1279
1280bool
7fc63493 1281Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1282{
97aff369 1283 dVAR;
d4c19fe8 1284 return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
a0ed51b3
LW
1285}
1286
1287bool
7fc63493 1288Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1289{
97aff369 1290 dVAR;
d4c19fe8 1291 return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
a0ed51b3
LW
1292}
1293
1294bool
7fc63493 1295Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1296{
97aff369 1297 dVAR;
d4c19fe8 1298 return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
a0ed51b3
LW
1299}
1300
1301bool
7fc63493 1302Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1303{
97aff369 1304 dVAR;
d4c19fe8 1305 return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
a0ed51b3
LW
1306}
1307
1308bool
7fc63493 1309Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1310{
97aff369 1311 dVAR;
d4c19fe8 1312 return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
b8c5462f
JH
1313}
1314
1315bool
7fc63493 1316Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1317{
97aff369 1318 dVAR;
d4c19fe8 1319 return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
b8c5462f
JH
1320}
1321
1322bool
7fc63493 1323Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1324{
97aff369 1325 dVAR;
d4c19fe8 1326 return is_utf8_common(p, &PL_utf8_print, "IsPrint");
a0ed51b3
LW
1327}
1328
1329bool
7fc63493 1330Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1331{
97aff369 1332 dVAR;
d4c19fe8 1333 return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
b8c5462f
JH
1334}
1335
1336bool
7fc63493 1337Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1338{
97aff369 1339 dVAR;
d4c19fe8 1340 return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
b8c5462f
JH
1341}
1342
1343bool
7fc63493 1344Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1345{
97aff369 1346 dVAR;
d4c19fe8 1347 return is_utf8_common(p, &PL_utf8_mark, "IsM");
a0ed51b3
LW
1348}
1349
6b5c0936
JH
1350/*
1351=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1352
1353The "p" contains the pointer to the UTF-8 string encoding
1354the character that is being converted.
1355
1356The "ustrp" is a pointer to the character buffer to put the
1357conversion result to. The "lenp" is a pointer to the length
1358of the result.
1359
0134edef 1360The "swashp" is a pointer to the swash to use.
6b5c0936 1361
0134edef 1362Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
8fe4d5b2 1363and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
0134edef 1364but not always, a multicharacter mapping), is tried first.
6b5c0936 1365
0134edef
JH
1366The "special" is a string like "utf8::ToSpecLower", which means the
1367hash %utf8::ToSpecLower. The access to the hash is through
1368Perl_to_utf8_case().
6b5c0936 1369
0134edef
JH
1370The "normal" is a string like "ToLower" which means the swash
1371%utf8::ToLower.
1372
1373=cut */
6b5c0936 1374
2104c8d9 1375UV
9a957fbc
AL
1376Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1377 SV **swashp, const char *normal, const char *special)
a0ed51b3 1378{
97aff369 1379 dVAR;
89ebb4a3 1380 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1381 STRLEN len = 0;
a0ed51b3 1382
aec46f14 1383 const UV uv0 = utf8_to_uvchr(p, NULL);
1feea2c7
JH
1384 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1385 * are necessary in EBCDIC, they are redundant no-ops
1386 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 1387 const UV uv1 = NATIVE_TO_UNI(uv0);
1feea2c7 1388 uvuni_to_utf8(tmpbuf, uv1);
0134edef
JH
1389
1390 if (!*swashp) /* load on-demand */
1391 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1392
b08cf34e
JH
1393 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1394 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1395 /* It might be "special" (sometimes, but not always,
2a37f04d 1396 * a multicharacter mapping) */
35da51f7 1397 HV * const hv = get_hv(special, FALSE);
b08cf34e
JH
1398 SV **svp;
1399
35da51f7 1400 if (hv &&
b08cf34e
JH
1401 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1402 (*svp)) {
cfd0369c 1403 const char *s;
47654450 1404
cfd0369c 1405 s = SvPV_const(*svp, len);
47654450
JH
1406 if (len == 1)
1407 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1408 else {
2f9475ad
JH
1409#ifdef EBCDIC
1410 /* If we have EBCDIC we need to remap the characters
1411 * since any characters in the low 256 are Unicode
1412 * code points, not EBCDIC. */
7cda7a3d 1413 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad
JH
1414
1415 d = tmpbuf;
b08cf34e 1416 if (SvUTF8(*svp)) {
2f9475ad
JH
1417 STRLEN tlen = 0;
1418
1419 while (t < tend) {
d4c19fe8 1420 const UV c = utf8_to_uvchr(t, &tlen);
2f9475ad
JH
1421 if (tlen > 0) {
1422 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1423 t += tlen;
1424 }
1425 else
1426 break;
1427 }
1428 }
1429 else {
36fec512
JH
1430 while (t < tend) {
1431 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1432 t++;
1433 }
2f9475ad
JH
1434 }
1435 len = d - tmpbuf;
1436 Copy(tmpbuf, ustrp, len, U8);
1437#else
d2dcd0fb 1438 Copy(s, ustrp, len, U8);
2f9475ad 1439#endif
29e98929 1440 }
983ffd37 1441 }
0134edef
JH
1442 }
1443
1444 if (!len && *swashp) {
d4c19fe8
AL
1445 const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1446
0134edef
JH
1447 if (uv2) {
1448 /* It was "normal" (a single character mapping). */
d4c19fe8 1449 const UV uv3 = UNI_TO_NATIVE(uv2);
e9101d72 1450 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d
JH
1451 }
1452 }
1feea2c7 1453
0134edef
JH
1454 if (!len) /* Neither: just copy. */
1455 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1456
2a37f04d
JH
1457 if (lenp)
1458 *lenp = len;
1459
0134edef 1460 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3
LW
1461}
1462
d3e79532 1463/*
7fc63493 1464=for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1465
1466Convert the UTF-8 encoded character at p to its uppercase version and
1467store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1468that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1469the uppercase version may be longer than the original character.
d3e79532
JH
1470
1471The first character of the uppercased version is returned
1472(but note, as explained above, that there may be more.)
1473
1474=cut */
1475
2104c8d9 1476UV
7fc63493 1477Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1478{
97aff369 1479 dVAR;
983ffd37 1480 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1481 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1482}
a0ed51b3 1483
d3e79532 1484/*
7fc63493 1485=for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1486
1487Convert the UTF-8 encoded character at p to its titlecase version and
1488store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1489that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1490titlecase version may be longer than the original character.
d3e79532
JH
1491
1492The first character of the titlecased version is returned
1493(but note, as explained above, that there may be more.)
1494
1495=cut */
1496
983ffd37 1497UV
7fc63493 1498Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
983ffd37 1499{
97aff369 1500 dVAR;
983ffd37 1501 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1502 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1503}
1504
d3e79532 1505/*
7fc63493 1506=for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1507
1508Convert the UTF-8 encoded character at p to its lowercase version and
1509store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1510that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1511lowercase version may be longer than the original character.
d3e79532
JH
1512
1513The first character of the lowercased version is returned
1514(but note, as explained above, that there may be more.)
1515
1516=cut */
1517
2104c8d9 1518UV
7fc63493 1519Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1520{
97aff369 1521 dVAR;
983ffd37 1522 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1523 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1524}
1525
d3e79532 1526/*
7fc63493 1527=for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1528
1529Convert the UTF-8 encoded character at p to its foldcase version and
1530store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1531that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532
JH
1532foldcase version may be longer than the original character (up to
1533three characters).
1534
1535The first character of the foldcased version is returned
1536(but note, as explained above, that there may be more.)
1537
1538=cut */
1539
b4e400f9 1540UV
7fc63493 1541Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
b4e400f9 1542{
97aff369 1543 dVAR;
b4e400f9
JH
1544 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1545 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1546}
1547
711a919c
ST
1548/* Note:
1549 * A "swash" is a swatch hash.
1550 * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1551 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1552 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1553 */
a0ed51b3 1554SV*
7fc63493 1555Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1556{
27da23d5 1557 dVAR;
a0ed51b3 1558 SV* retval;
9a957fbc 1559 SV* const tokenbufsv = sv_newmortal();
8e84507e 1560 dSP;
7fc63493
AL
1561 const size_t pkg_len = strlen(pkg);
1562 const size_t name_len = strlen(name);
da51bb9b 1563 HV * const stash = gv_stashpvn(pkg, pkg_len, 0);
f8be5cf0 1564 SV* errsv_save;
ce3b816e 1565
96ca9f55
DM
1566 PUSHSTACKi(PERLSI_MAGIC);
1567 ENTER;
1568 SAVEI32(PL_hints);
1569 PL_hints = 0;
1570 save_re_context();
1b026014 1571 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1572 ENTER;
f8be5cf0 1573 errsv_save = newSVsv(ERRSV);
dc0c6abb
NC
1574 /* It is assumed that callers of this routine are not passing in any
1575 user derived data. */
1576 /* Need to do this after save_re_context() as it will set PL_tainted to
1577 1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1578 Even line to create errsv_save can turn on PL_tainted. */
1579 SAVEBOOL(PL_tainted);
1580 PL_tainted = 0;
71bed85a 1581 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
a0714e2c 1582 NULL);
f8be5cf0
JH
1583 if (!SvTRUE(ERRSV))
1584 sv_setsv(ERRSV, errsv_save);
1585 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1586 LEAVE;
1587 }
1588 SPAGAIN;
a0ed51b3
LW
1589 PUSHMARK(SP);
1590 EXTEND(SP,5);
71bed85a
NC
1591 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1592 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
a0ed51b3
LW
1593 PUSHs(listsv);
1594 PUSHs(sv_2mortal(newSViv(minbits)));
1595 PUSHs(sv_2mortal(newSViv(none)));
1596 PUTBACK;
923e4eb5 1597 if (IN_PERL_COMPILETIME) {
bf1fed83 1598 /* XXX ought to be handled by lex_start */
82686b01 1599 SAVEI32(PL_in_my);
2b4bd638 1600 PL_in_my = 0;
bf1fed83 1601 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1602 }
f8be5cf0 1603 errsv_save = newSVsv(ERRSV);
864dbfa3 1604 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1605 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1606 else
e24b16f9 1607 retval = &PL_sv_undef;
f8be5cf0
JH
1608 if (!SvTRUE(ERRSV))
1609 sv_setsv(ERRSV, errsv_save);
1610 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1611 LEAVE;
1612 POPSTACK;
923e4eb5 1613 if (IN_PERL_COMPILETIME) {
bf1fed83 1614 STRLEN len;
aec46f14 1615 const char* const pv = SvPV_const(tokenbufsv, len);
bf1fed83
JH
1616
1617 Copy(pv, PL_tokenbuf, len+1, char);
623e6609 1618 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 1619 }
bc45ce41
JH
1620 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1621 if (SvPOK(retval))
35c1215d 1622 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
be2597df 1623 SVfARG(retval));
cea2e8a9 1624 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1625 }
a0ed51b3
LW
1626 return retval;
1627}
1628
035d37be
JH
1629
1630/* This API is wrong for special case conversions since we may need to
1631 * return several Unicode characters for a single Unicode character
1632 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1633 * the lower-level routine, and it is similarly broken for returning
1634 * multiple values. --jhi */
979f2922 1635/* Now SWASHGET is recasted into S_swash_get in this file. */
680c470c
ST
1636
1637/* Note:
1638 * Returns the value of property/mapping C<swash> for the first character
1639 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1640 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1641 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1642 */
a0ed51b3 1643UV
680c470c 1644Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
a0ed51b3 1645{
27da23d5 1646 dVAR;
680c470c 1647 HV* const hv = (HV*)SvRV(swash);
3568d838
JH
1648 U32 klen;
1649 U32 off;
a0ed51b3 1650 STRLEN slen;
7d85a32c 1651 STRLEN needents;
cfd0369c 1652 const U8 *tmps = NULL;
a0ed51b3 1653 U32 bit;
979f2922 1654 SV *swatch;
3568d838 1655 U8 tmputf8[2];
35da51f7 1656 const UV c = NATIVE_TO_ASCII(*ptr);
3568d838
JH
1657
1658 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
979f2922
ST
1659 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1660 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1661 ptr = tmputf8;
3568d838
JH
1662 }
1663 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1664 * then the "swatch" is a vec() for al the chars which start
1665 * with 0xAA..0xYY
1666 * So the key in the hash (klen) is length of encoded char -1
1667 */
1668 klen = UTF8SKIP(ptr) - 1;
1669 off = ptr[klen];
a0ed51b3 1670
979f2922 1671 if (klen == 0) {
7d85a32c 1672 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1673 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c 1674 */
979f2922
ST
1675 needents = UTF_CONTINUATION_MARK;
1676 off = NATIVE_TO_UTF(ptr[klen]);
1677 }
1678 else {
7d85a32c 1679 /* If char is encoded then swatch is for the prefix */
979f2922
ST
1680 needents = (1 << UTF_ACCUMULATION_SHIFT);
1681 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1682 }
7d85a32c 1683
a0ed51b3
LW
1684 /*
1685 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1686 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1687 * it's nothing to sniff at.) Pity we usually come through at least
1688 * two function calls to get here...
1689 *
1690 * NB: this code assumes that swatches are never modified, once generated!
1691 */
1692
3568d838 1693 if (hv == PL_last_swash_hv &&
a0ed51b3 1694 klen == PL_last_swash_klen &&
27da23d5 1695 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1696 {
1697 tmps = PL_last_swash_tmps;
1698 slen = PL_last_swash_slen;
1699 }
1700 else {
1701 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 1702 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 1703
979f2922
ST
1704 /* If not cached, generate it via swash_get */
1705 if (!svp || !SvPOK(*svp)
1706 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
2b9d42f0
NIS
1707 /* We use utf8n_to_uvuni() as we want an index into
1708 Unicode tables, not a native character number.
1709 */
aec46f14 1710 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae
JH
1711 ckWARN(WARN_UTF8) ?
1712 0 : UTF8_ALLOW_ANY);
680c470c 1713 swatch = swash_get(swash,
979f2922
ST
1714 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1715 (klen) ? (code_point & ~(needents - 1)) : 0,
1716 needents);
1717
923e4eb5 1718 if (IN_PERL_COMPILETIME)
623e6609 1719 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 1720
979f2922 1721 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 1722
979f2922
ST
1723 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1724 || (slen << 3) < needents)
660a4616 1725 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
a0ed51b3
LW
1726 }
1727
1728 PL_last_swash_hv = hv;
1729 PL_last_swash_klen = klen;
cfd0369c
NC
1730 /* FIXME change interpvar.h? */
1731 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
1732 PL_last_swash_slen = slen;
1733 if (klen)
1734 Copy(ptr, PL_last_swash_key, klen, U8);
1735 }
1736
9faf8d75 1737 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1738 case 1:
1739 bit = 1 << (off & 7);
1740 off >>= 3;
1741 return (tmps[off] & bit) != 0;
1742 case 8:
1743 return tmps[off];
1744 case 16:
1745 off <<= 1;
1746 return (tmps[off] << 8) + tmps[off + 1] ;
1747 case 32:
1748 off <<= 2;
1749 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1750 }
660a4616 1751 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
670f1322 1752 NORETURN_FUNCTION_END;
a0ed51b3 1753}
2b9d42f0 1754
979f2922
ST
1755/* Note:
1756 * Returns a swatch (a bit vector string) for a code point sequence
1757 * that starts from the value C<start> and comprises the number C<span>.
1758 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1759 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1760 */
1761STATIC SV*
1762S_swash_get(pTHX_ SV* swash, UV start, UV span)
1763{
1764 SV *swatch;
711a919c 1765 U8 *l, *lend, *x, *xend, *s;
979f2922
ST
1766 STRLEN lcur, xcur, scur;
1767
1768 HV* const hv = (HV*)SvRV(swash);
017a3ce5
GA
1769 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1770 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1771 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1772 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1773 SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
0bd48802
AL
1774 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1775 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
1776 const STRLEN bits = SvUV(*bitssvp);
1777 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1778 const UV none = SvUV(*nonesvp);
1779 const UV end = start + span;
979f2922
ST
1780
1781 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
660a4616
ST
1782 Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1783 (UV)bits);
979f2922
ST
1784 }
1785
1786 /* create and initialize $swatch */
396482e1 1787 swatch = newSVpvs("");
979f2922
ST
1788 scur = octets ? (span * octets) : (span + 7) / 8;
1789 SvGROW(swatch, scur + 1);
1790 s = (U8*)SvPVX(swatch);
1791 if (octets && none) {
0bd48802 1792 const U8* const e = s + scur;
979f2922
ST
1793 while (s < e) {
1794 if (bits == 8)
1795 *s++ = (U8)(none & 0xff);
1796 else if (bits == 16) {
1797 *s++ = (U8)((none >> 8) & 0xff);
1798 *s++ = (U8)( none & 0xff);
1799 }
1800 else if (bits == 32) {
1801 *s++ = (U8)((none >> 24) & 0xff);
1802 *s++ = (U8)((none >> 16) & 0xff);
1803 *s++ = (U8)((none >> 8) & 0xff);
1804 *s++ = (U8)( none & 0xff);
1805 }
1806 }
1807 *s = '\0';
1808 }
1809 else {
1810 (void)memzero((U8*)s, scur + 1);
1811 }
1812 SvCUR_set(swatch, scur);
1813 s = (U8*)SvPVX(swatch);
1814
1815 /* read $swash->{LIST} */
1816 l = (U8*)SvPV(*listsvp, lcur);
1817 lend = l + lcur;
1818 while (l < lend) {
35da51f7 1819 UV min, max, val;
979f2922
ST
1820 STRLEN numlen;
1821 I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1822
0bd48802 1823 U8* const nl = (U8*)memchr(l, '\n', lend - l);
979f2922
ST
1824
1825 numlen = lend - l;
1826 min = grok_hex((char *)l, &numlen, &flags, NULL);
1827 if (numlen)
1828 l += numlen;
1829 else if (nl) {
1830 l = nl + 1; /* 1 is length of "\n" */
1831 continue;
1832 }
1833 else {
1834 l = lend; /* to LIST's end at which \n is not found */
1835 break;
1836 }
1837
1838 if (isBLANK(*l)) {
1839 ++l;
1840 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1841 numlen = lend - l;
1842 max = grok_hex((char *)l, &numlen, &flags, NULL);
1843 if (numlen)
1844 l += numlen;
1845 else
1846 max = min;
1847
1848 if (octets) {
1849 if (isBLANK(*l)) {
1850 ++l;
1851 flags = PERL_SCAN_SILENT_ILLDIGIT |
1852 PERL_SCAN_DISALLOW_PREFIX;
1853 numlen = lend - l;
1854 val = grok_hex((char *)l, &numlen, &flags, NULL);
1855 if (numlen)
1856 l += numlen;
1857 else
1858 val = 0;
1859 }
1860 else {
1861 val = 0;
1862 if (typeto) {
1863 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1864 typestr, l);
1865 }
1866 }
1867 }
711a919c
ST
1868 else
1869 val = 0; /* bits == 1, then val should be ignored */
979f2922
ST
1870 }
1871 else {
1872 max = min;
1873 if (octets) {
1874 val = 0;
1875 if (typeto) {
1876 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1877 }
1878 }
711a919c
ST
1879 else
1880 val = 0; /* bits == 1, then val should be ignored */
979f2922
ST
1881 }
1882
1883 if (nl)
1884 l = nl + 1;
1885 else
1886 l = lend;
1887
1888 if (max < start)
1889 continue;
1890
1891 if (octets) {
35da51f7 1892 UV key;
979f2922
ST
1893 if (min < start) {
1894 if (!none || val < none) {
1895 val += start - min;
1896 }
1897 min = start;
1898 }
1899 for (key = min; key <= max; key++) {
1900 STRLEN offset;
1901 if (key >= end)
1902 goto go_out_list;
1903 /* offset must be non-negative (start <= min <= key < end) */
1904 offset = octets * (key - start);
1905 if (bits == 8)
1906 s[offset] = (U8)(val & 0xff);
1907 else if (bits == 16) {
1908 s[offset ] = (U8)((val >> 8) & 0xff);
1909 s[offset + 1] = (U8)( val & 0xff);
1910 }
1911 else if (bits == 32) {
1912 s[offset ] = (U8)((val >> 24) & 0xff);
1913 s[offset + 1] = (U8)((val >> 16) & 0xff);
1914 s[offset + 2] = (U8)((val >> 8) & 0xff);
1915 s[offset + 3] = (U8)( val & 0xff);
1916 }
1917
1918 if (!none || val < none)
1919 ++val;
1920 }
1921 }
711a919c 1922 else { /* bits == 1, then val should be ignored */
35da51f7 1923 UV key;
979f2922
ST
1924 if (min < start)
1925 min = start;
1926 for (key = min; key <= max; key++) {
0bd48802 1927 const STRLEN offset = (STRLEN)(key - start);
979f2922
ST
1928 if (key >= end)
1929 goto go_out_list;
1930 s[offset >> 3] |= 1 << (offset & 7);
1931 }
1932 }
1933 } /* while */
1934 go_out_list:
1935
1936 /* read $swash->{EXTRAS} */
1937 x = (U8*)SvPV(*extssvp, xcur);
1938 xend = x + xcur;
1939 while (x < xend) {
1940 STRLEN namelen;
1941 U8 *namestr;
1942 SV** othersvp;
1943 HV* otherhv;
1944 STRLEN otherbits;
1945 SV **otherbitssvp, *other;
711a919c 1946 U8 *s, *o, *nl;
979f2922
ST
1947 STRLEN slen, olen;
1948
35da51f7 1949 const U8 opc = *x++;
979f2922
ST
1950 if (opc == '\n')
1951 continue;
1952
1953 nl = (U8*)memchr(x, '\n', xend - x);
1954
1955 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
1956 if (nl) {
1957 x = nl + 1; /* 1 is length of "\n" */
1958 continue;
1959 }
1960 else {
1961 x = xend; /* to EXTRAS' end at which \n is not found */
1962 break;
1963 }
1964 }
1965
1966 namestr = x;
1967 if (nl) {
1968 namelen = nl - namestr;
1969 x = nl + 1;
1970 }
1971 else {
1972 namelen = xend - namestr;
1973 x = xend;
1974 }
1975
1976 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
660a4616 1977 otherhv = (HV*)SvRV(*othersvp);
017a3ce5 1978 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
979f2922
ST
1979 otherbits = (STRLEN)SvUV(*otherbitssvp);
1980 if (bits < otherbits)
660a4616 1981 Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
979f2922
ST
1982
1983 /* The "other" swatch must be destroyed after. */
1984 other = swash_get(*othersvp, start, span);
1985 o = (U8*)SvPV(other, olen);
1986
1987 if (!olen)
660a4616 1988 Perl_croak(aTHX_ "panic: swash_get got improper swatch");
979f2922
ST
1989
1990 s = (U8*)SvPV(swatch, slen);
1991 if (bits == 1 && otherbits == 1) {
1992 if (slen != olen)
660a4616 1993 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
979f2922
ST
1994
1995 switch (opc) {
1996 case '+':
1997 while (slen--)
1998 *s++ |= *o++;
1999 break;
2000 case '!':
2001 while (slen--)
2002 *s++ |= ~*o++;
2003 break;
2004 case '-':
2005 while (slen--)
2006 *s++ &= ~*o++;
2007 break;
2008 case '&':
2009 while (slen--)
2010 *s++ &= *o++;
2011 break;
2012 default:
2013 break;
2014 }
2015 }
711a919c 2016 else {
979f2922
ST
2017 STRLEN otheroctets = otherbits >> 3;
2018 STRLEN offset = 0;
35da51f7 2019 U8* const send = s + slen;
979f2922
ST
2020
2021 while (s < send) {
2022 UV otherval = 0;
2023
2024 if (otherbits == 1) {
2025 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2026 ++offset;
2027 }
2028 else {
2029 STRLEN vlen = otheroctets;
2030 otherval = *o++;
2031 while (--vlen) {
2032 otherval <<= 8;
2033 otherval |= *o++;
2034 }
2035 }
2036
711a919c 2037 if (opc == '+' && otherval)
6f207bd3 2038 NOOP; /* replace with otherval */
979f2922
ST
2039 else if (opc == '!' && !otherval)
2040 otherval = 1;
2041 else if (opc == '-' && otherval)
2042 otherval = 0;
2043 else if (opc == '&' && !otherval)
2044 otherval = 0;
2045 else {
711a919c 2046 s += octets; /* no replacement */
979f2922
ST
2047 continue;
2048 }
2049
2050 if (bits == 8)
2051 *s++ = (U8)( otherval & 0xff);
2052 else if (bits == 16) {
2053 *s++ = (U8)((otherval >> 8) & 0xff);
2054 *s++ = (U8)( otherval & 0xff);
2055 }
2056 else if (bits == 32) {
2057 *s++ = (U8)((otherval >> 24) & 0xff);
2058 *s++ = (U8)((otherval >> 16) & 0xff);
2059 *s++ = (U8)((otherval >> 8) & 0xff);
2060 *s++ = (U8)( otherval & 0xff);
2061 }
2062 }
2063 }
2064 sv_free(other); /* through with it! */
2065 } /* while */
2066 return swatch;
2067}
2068
0f830e0b
NC
2069/*
2070=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2071
2072Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2073of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2074bytes available. The return value is the pointer to the byte after the
2075end of the new character. In other words,
2076
2077 d = uvchr_to_utf8(d, uv);
2078
2079is the recommended wide native character-aware way of saying
2080
2081 *(d++) = uv;
2082
2083=cut
2084*/
2085
2086/* On ASCII machines this is normally a macro but we want a
2087 real function in case XS code wants it
2088*/
0f830e0b
NC
2089U8 *
2090Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2091{
2092 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2093}
2094
b851fbc1
JH
2095U8 *
2096Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2097{
2098 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2099}
2b9d42f0
NIS
2100
2101/*
0f830e0b
NC
2102=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2103flags
2104
2105Returns the native character value of the first character in the string
2106C<s>
2107which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2108length, in bytes, of that character.
2109
2110Allows length and flags to be passed to low level routine.
2111
2112=cut
2113*/
2114/* On ASCII machines this is normally a macro but we want
2115 a real function in case XS code wants it
2116*/
0f830e0b
NC
2117UV
2118Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2119U32 flags)
2120{
2121 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2122 return UNI_TO_NATIVE(uv);
2123}
2124
2125/*
d2cc3551
JH
2126=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2127
2128Build to the scalar dsv a displayable version of the string spv,
2129length len, the displayable version being at most pvlim bytes long
2130(if longer, the rest is truncated and "..." will be appended).
0a2ef054 2131
9e55ce06 2132The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 2133isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054
JH
2134to display the \\[nrfta\\] as the backslashed versions (like '\n')
2135(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2136UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2137UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2138
d2cc3551
JH
2139The pointer to the PV of the dsv is returned.
2140
2141=cut */
e6b2e755 2142char *
e1ec3a88 2143Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755
JH
2144{
2145 int truncated = 0;
e1ec3a88 2146 const char *s, *e;
e6b2e755
JH
2147
2148 sv_setpvn(dsv, "", 0);
e1ec3a88 2149 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 2150 UV u;
a49f32c6
NC
2151 /* This serves double duty as a flag and a character to print after
2152 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2153 */
2154 char ok = 0;
c728cb41 2155
e6b2e755
JH
2156 if (pvlim && SvCUR(dsv) >= pvlim) {
2157 truncated++;
2158 break;
2159 }
2160 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 2161 if (u < 256) {
a3b680e6 2162 const unsigned char c = (unsigned char)u & 0xFF;
0bd48802 2163 if (flags & UNI_DISPLAY_BACKSLASH) {
a49f32c6 2164 switch (c) {
c728cb41 2165 case '\n':
a49f32c6 2166 ok = 'n'; break;
c728cb41 2167 case '\r':
a49f32c6 2168 ok = 'r'; break;
c728cb41 2169 case '\t':
a49f32c6 2170 ok = 't'; break;
c728cb41 2171 case '\f':
a49f32c6 2172 ok = 'f'; break;
c728cb41 2173 case '\a':
a49f32c6 2174 ok = 'a'; break;
c728cb41 2175 case '\\':
a49f32c6 2176 ok = '\\'; break;
c728cb41
JH
2177 default: break;
2178 }
a49f32c6
NC
2179 if (ok) {
2180 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
2181 }
c728cb41 2182 }
00e86452 2183 /* isPRINT() is the locale-blind version. */
a49f32c6
NC
2184 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2185 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
2186 ok = 1;
0a2ef054 2187 }
c728cb41
JH
2188 }
2189 if (!ok)
9e55ce06 2190 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755
JH
2191 }
2192 if (truncated)
396482e1 2193 sv_catpvs(dsv, "...");
e6b2e755
JH
2194
2195 return SvPVX(dsv);
2196}
2b9d42f0 2197
d2cc3551
JH
2198/*
2199=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2200
2201Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 2202the displayable version being at most pvlim bytes long
d2cc3551 2203(if longer, the rest is truncated and "..." will be appended).
0a2ef054
JH
2204
2205The flags argument is as in pv_uni_display().
2206
d2cc3551
JH
2207The pointer to the PV of the dsv is returned.
2208
d4c19fe8
AL
2209=cut
2210*/
e6b2e755
JH
2211char *
2212Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2213{
cfd0369c
NC
2214 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2215 SvCUR(ssv), pvlim, flags);
701a277b
JH
2216}
2217
d2cc3551 2218/*
d07ddd77 2219=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551
JH
2220
2221Return true if the strings s1 and s2 differ case-insensitively, false
2222if not (if they are equal case-insensitively). If u1 is true, the
2223string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77
JH
2224the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
2225are false, the respective string is assumed to be in native 8-bit
2226encoding.
2227
2228If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2229in there (they will point at the beginning of the I<next> character).
2230If the pointers behind pe1 or pe2 are non-NULL, they are the end
2231pointers beyond which scanning will not continue under any
4cdaeff7 2232circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77
JH
2233s2+l2 will be used as goal end pointers that will also stop the scan,
2234and which qualify towards defining a successful match: all the scans
2235that define an explicit length must reach their goal pointers for
2236a match to succeed).
d2cc3551
JH
2237
2238For case-insensitiveness, the "casefolding" of Unicode is used
2239instead of upper/lowercasing both the characters, see
2240http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2241
2242=cut */
701a277b 2243I32
d07ddd77 2244Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 2245{
97aff369 2246 dVAR;
e1ec3a88
AL
2247 register const U8 *p1 = (const U8*)s1;
2248 register const U8 *p2 = (const U8*)s2;
cbbf8932 2249 register const U8 *f1 = NULL;
2f73348c 2250 register const U8 *f2 = NULL;
cbbf8932
AL
2251 register U8 *e1 = NULL;
2252 register U8 *q1 = NULL;
2253 register U8 *e2 = NULL;
2254 register U8 *q2 = NULL;
d07ddd77 2255 STRLEN n1 = 0, n2 = 0;
89ebb4a3
JH
2256 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2257 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8
JH
2258 U8 natbuf[1+1];
2259 STRLEN foldlen1, foldlen2;
d07ddd77 2260 bool match;
332ddc25 2261
d07ddd77
JH
2262 if (pe1)
2263 e1 = *(U8**)pe1;
e1ec3a88
AL
2264 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2265 f1 = (const U8*)s1 + l1;
d07ddd77
JH
2266 if (pe2)
2267 e2 = *(U8**)pe2;
e1ec3a88
AL
2268 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2269 f2 = (const U8*)s2 + l2;
d07ddd77
JH
2270
2271 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2272 return 1; /* mismatch; possible infinite loop or false positive */
2273
a6872d42
JH
2274 if (!u1 || !u2)
2275 natbuf[1] = 0; /* Need to terminate the buffer. */
2276
d07ddd77
JH
2277 while ((e1 == 0 || p1 < e1) &&
2278 (f1 == 0 || p1 < f1) &&
2279 (e2 == 0 || p2 < e2) &&
2280 (f2 == 0 || p2 < f2)) {
2281 if (n1 == 0) {
d7f013c8
JH
2282 if (u1)
2283 to_utf8_fold(p1, foldbuf1, &foldlen1);
2284 else {
809e8e66 2285 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
d7f013c8
JH
2286 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2287 }
2288 q1 = foldbuf1;
d07ddd77 2289 n1 = foldlen1;
332ddc25 2290 }
d07ddd77 2291 if (n2 == 0) {
d7f013c8
JH
2292 if (u2)
2293 to_utf8_fold(p2, foldbuf2, &foldlen2);
2294 else {
809e8e66 2295 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
d7f013c8
JH
2296 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2297 }
2298 q2 = foldbuf2;
d07ddd77 2299 n2 = foldlen2;
332ddc25 2300 }
d07ddd77
JH
2301 while (n1 && n2) {
2302 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2303 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2304 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 2305 return 1; /* mismatch */
d07ddd77 2306 n1 -= UTF8SKIP(q1);
d7f013c8 2307 q1 += UTF8SKIP(q1);
d07ddd77 2308 n2 -= UTF8SKIP(q2);
d7f013c8 2309 q2 += UTF8SKIP(q2);
701a277b 2310 }
d07ddd77 2311 if (n1 == 0)
d7f013c8 2312 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2313 if (n2 == 0)
d7f013c8
JH
2314 p2 += u2 ? UTF8SKIP(p2) : 1;
2315
d2cc3551 2316 }
5469e704 2317
d07ddd77
JH
2318 /* A match is defined by all the scans that specified
2319 * an explicit length reaching their final goals. */
2320 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704
JH
2321
2322 if (match) {
d07ddd77
JH
2323 if (pe1)
2324 *pe1 = (char*)p1;
2325 if (pe2)
2326 *pe2 = (char*)p2;
5469e704
JH
2327 }
2328
2329 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2330}
701a277b 2331
a49f32c6
NC
2332/*
2333 * Local variables:
2334 * c-indentation-style: bsd
2335 * c-basic-offset: 4
2336 * indent-tabs-mode: t
2337 * End:
2338 *
37442d52
RGS
2339 * ex: set ts=8 sts=4 sw=4 noet:
2340 */