This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Integrate:
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
2c351e65
NC
3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006,
4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
19 *
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
22 */
23
24#include "EXTERN.h"
864dbfa3 25#define PERL_IN_UTF8_C
a0ed51b3
LW
26#include "perl.h"
27
0bfdb59a
NC
28#ifndef EBCDIC
29/* Separate prototypes needed because in ASCII systems these
30 * usually macros but they still are compiled as code, too. */
31PERL_CALLCONV UV Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags);
32PERL_CALLCONV U8* Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv);
33#endif
34
fe20fd30
JH
35static const char unees[] =
36 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 37
ccfc67b7
JH
38/*
39=head1 Unicode Support
a0ed51b3 40
40d34c0d
SB
41This file contains various utility functions for manipulating UTF8-encoded
42strings. For the uninitiated, this is a method of representing arbitrary
43Unicode characters as a variable number of bytes, in such a way that
44characters in the ASCII range are unmodified, and a zero byte never appears
45within non-zero characters.
46
b851fbc1 47=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 48
cd458e05 49Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
a2a469f9 50of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 51bytes available. The return value is the pointer to the byte after the
9041c2e3 52end of the new character. In other words,
eebe1485 53
b851fbc1
JH
54 d = uvuni_to_utf8_flags(d, uv, flags);
55
56or, in most cases,
57
9041c2e3 58 d = uvuni_to_utf8(d, uv);
eebe1485 59
b851fbc1
JH
60(which is equivalent to)
61
62 d = uvuni_to_utf8_flags(d, uv, 0);
63
eebe1485
SC
64is the recommended Unicode-aware way of saying
65
66 *(d++) = uv;
67
68=cut
69*/
70
dfe13c55 71U8 *
b851fbc1 72Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 73{
62961d2e 74 if (ckWARN(WARN_UTF8)) {
b851fbc1
JH
75 if (UNICODE_IS_SURROGATE(uv) &&
76 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 77 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1
JH
78 else if (
79 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
80 !(flags & UNICODE_ALLOW_FDD0))
81 ||
c867b360 82 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1
JH
83 !(flags & UNICODE_ALLOW_FFFF))) &&
84 /* UNICODE_ALLOW_SUPER includes
2a20b9da 85 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1
JH
86 ((uv <= PERL_UNICODE_MAX) ||
87 !(flags & UNICODE_ALLOW_SUPER))
88 )
9014280d 89 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800
JH
90 "Unicode character 0x%04"UVxf" is illegal", uv);
91 }
c4d5f83a 92 if (UNI_IS_INVARIANT(uv)) {
eb160463 93 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3
LW
94 return d;
95 }
2d331972 96#if defined(EBCDIC)
1d72bdf6
NIS
97 else {
98 STRLEN len = UNISKIP(uv);
99 U8 *p = d+len-1;
100 while (p > d) {
eb160463 101 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
102 uv >>= UTF_ACCUMULATION_SHIFT;
103 }
eb160463 104 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
105 return d+len;
106 }
107#else /* Non loop style */
a0ed51b3 108 if (uv < 0x800) {
eb160463
GS
109 *d++ = (U8)(( uv >> 6) | 0xc0);
110 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
111 return d;
112 }
113 if (uv < 0x10000) {
eb160463
GS
114 *d++ = (U8)(( uv >> 12) | 0xe0);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
117 return d;
118 }
119 if (uv < 0x200000) {
eb160463
GS
120 *d++ = (U8)(( uv >> 18) | 0xf0);
121 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
123 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
124 return d;
125 }
126 if (uv < 0x4000000) {
eb160463
GS
127 *d++ = (U8)(( uv >> 24) | 0xf8);
128 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
129 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
131 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
132 return d;
133 }
134 if (uv < 0x80000000) {
eb160463
GS
135 *d++ = (U8)(( uv >> 30) | 0xfc);
136 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
137 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
138 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
139 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
140 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
141 return d;
142 }
6b8eaf93 143#ifdef HAS_QUAD
d7578b48 144 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
145#endif
146 {
eb160463
GS
147 *d++ = 0xfe; /* Can't match U+FEFF! */
148 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
149 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
150 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
151 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
152 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
153 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
154 return d;
155 }
6b8eaf93 156#ifdef HAS_QUAD
a0ed51b3 157 {
eb160463
GS
158 *d++ = 0xff; /* Can't match U+FFFE! */
159 *d++ = 0x80; /* 6 Reserved bits */
160 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
161 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
163 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
164 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
165 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
166 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
167 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
168 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
169 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
170 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
171 return d;
172 }
173#endif
1d72bdf6 174#endif /* Loop style */
a0ed51b3 175}
9041c2e3 176
9041c2e3 177/*
eebe1485 178
5da9da9e 179Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01
JH
180character. Note that an INVARIANT (i.e. ASCII) character is a valid
181UTF-8 character. The actual number of bytes in the UTF-8 character
182will be returned if it is valid, otherwise 0.
9041c2e3 183
d7a71dc3
NC
184This is the "slow" version as opposed to the "fast" version which is
185the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
186difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
187or less you should use the IS_UTF8_CHAR(), for lengths of five or more
188you should use the _slow(). In practice this means that the _slow()
189will be used very rarely, since the maximum Unicode code point (as of
190Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
191the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
192five bytes or more.
193
82686b01 194=cut */
767c9e37 195STATIC STRLEN
5900599a 196S_is_utf8_char_slow(const U8 *s, const STRLEN len)
386d01d6
GS
197{
198 U8 u = *s;
d7a71dc3 199 STRLEN slen;
067a85ef 200 UV uv, ouv;
386d01d6 201
1d72bdf6 202 if (UTF8_IS_INVARIANT(u))
386d01d6
GS
203 return 1;
204
60006e79 205 if (!UTF8_IS_START(u))
386d01d6
GS
206 return 0;
207
60006e79 208 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
067a85ef
A
209 return 0;
210
386d01d6
GS
211 slen = len - 1;
212 s++;
48ea04fa
TS
213#ifdef EBCDIC
214 u = NATIVE_TO_UTF(u);
215#endif
6eb6869e 216 u &= UTF_START_MASK(len);
1d72bdf6 217 uv = u;
067a85ef 218 ouv = uv;
386d01d6 219 while (slen--) {
60006e79 220 if (!UTF8_IS_CONTINUATION(*s))
386d01d6 221 return 0;
8850bf83 222 uv = UTF8_ACCUMULATE(uv, *s);
209a9bc1 223 if (uv < ouv)
067a85ef
A
224 return 0;
225 ouv = uv;
386d01d6
GS
226 s++;
227 }
067a85ef 228
eb160463 229 if ((STRLEN)UNISKIP(uv) < len)
067a85ef
A
230 return 0;
231
386d01d6
GS
232 return len;
233}
234
6662521e 235/*
d7a71dc3
NC
236=for apidoc A|STRLEN|is_utf8_char|U8 *s
237
238Tests if some arbitrary number of bytes begins in a valid UTF-8
239character. Note that an INVARIANT (i.e. ASCII) character is a valid
240UTF-8 character. The actual number of bytes in the UTF-8 character
241will be returned if it is valid, otherwise 0.
242
243=cut */
244STRLEN
245Perl_is_utf8_char(pTHX_ U8 *s)
246{
d015a557 247 const STRLEN len = UTF8SKIP(s);
1e7ed80e 248 PERL_UNUSED_CONTEXT;
d7a71dc3
NC
249#ifdef IS_UTF8_CHAR
250 if (IS_UTF8_CHAR_FAST(len))
251 return IS_UTF8_CHAR(s, len) ? len : 0;
252#endif /* #ifdef IS_UTF8_CHAR */
253 return is_utf8_char_slow(s, len);
254}
255
256/*
eebe1485 257=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
6662521e 258
5835a535 259Returns true if first C<len> bytes of the given string form a valid
cd458e05
JH
260UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
261not mean 'a string that contains code points above 0x7F encoded in UTF-8'
262because a valid ASCII string is a valid UTF-8 string.
6662521e 263
d7a71dc3
NC
264See also is_utf8_string_loclen() and is_utf8_string_loc().
265
6662521e
GS
266=cut
267*/
268
8e84507e 269bool
6662521e
GS
270Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
271{
8e7b0921 272 const U8* const send = s + (len ? len : strlen((const char *)s));
1f49be52 273 const U8* x = s;
067a85ef 274
1e7ed80e 275 PERL_UNUSED_CONTEXT;
1aa99e6b 276
6662521e 277 while (x < send) {
8c18bf38 278 STRLEN c;
1acdb0da
JH
279 /* Inline the easy bits of is_utf8_char() here for speed... */
280 if (UTF8_IS_INVARIANT(*x))
281 c = 1;
282 else if (!UTF8_IS_START(*x))
d7a71dc3 283 goto out;
1acdb0da
JH
284 else {
285 /* ... and call is_utf8_char() only if really needed. */
d7a71dc3
NC
286#ifdef IS_UTF8_CHAR
287 c = UTF8SKIP(x);
288 if (IS_UTF8_CHAR_FAST(c)) {
289 if (!IS_UTF8_CHAR(x, c))
161d0ac1
NC
290 c = 0;
291 }
292 else
293 c = is_utf8_char_slow(x, c);
d7a71dc3
NC
294#else
295 c = is_utf8_char(x);
296#endif /* #ifdef IS_UTF8_CHAR */
1acdb0da 297 if (!c)
d7a71dc3 298 goto out;
1acdb0da 299 }
6662521e 300 x += c;
6662521e 301 }
d7a71dc3
NC
302
303 out:
60006e79
JH
304 if (x != send)
305 return FALSE;
067a85ef
A
306
307 return TRUE;
6662521e
GS
308}
309
67e989fb 310/*
5082993f
NC
311Implemented as a macro in utf8.h
312
313=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
314
315Like is_utf8_string() but stores the location of the failure (in the
316case of "utf8ness failure") or the location s+len (in the case of
317"utf8ness success") in the C<ep>.
318
319See also is_utf8_string_loclen() and is_utf8_string().
320
d7a71dc3 321=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
9c20fa4a 322
8f53e8d5 323Like is_utf8_string() but stores the location of the failure (in the
d7a71dc3
NC
324case of "utf8ness failure") or the location s+len (in the case of
325"utf8ness success") in the C<ep>, and the number of UTF-8
326encoded characters in the C<el>.
327
328See also is_utf8_string_loc() and is_utf8_string().
9c20fa4a
JH
329
330=cut
331*/
332
333bool
d7a71dc3 334Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9c20fa4a 335{
8e7b0921 336 const U8* const send = s + (len ? len : strlen((const char *)s));
1f49be52 337 const U8* x = s;
9c20fa4a 338 STRLEN c;
86d06bda 339 STRLEN outlen = 0;
1e7ed80e 340 PERL_UNUSED_CONTEXT;
9c20fa4a 341
9c20fa4a
JH
342 while (x < send) {
343 /* Inline the easy bits of is_utf8_char() here for speed... */
344 if (UTF8_IS_INVARIANT(*x))
d7a71dc3
NC
345 c = 1;
346 else if (!UTF8_IS_START(*x))
347 goto out;
9c20fa4a 348 else {
d7a71dc3
NC
349 /* ... and call is_utf8_char() only if really needed. */
350#ifdef IS_UTF8_CHAR
351 c = UTF8SKIP(x);
352 if (IS_UTF8_CHAR_FAST(c)) {
353 if (!IS_UTF8_CHAR(x, c))
354 c = 0;
355 } else
356 c = is_utf8_char_slow(x, c);
357#else
358 c = is_utf8_char(x);
359#endif /* #ifdef IS_UTF8_CHAR */
360 if (!c)
361 goto out;
9c20fa4a 362 }
d7a71dc3 363 x += c;
86d06bda 364 outlen++;
9c20fa4a 365 }
d7a71dc3
NC
366
367 out:
86d06bda
AL
368 if (el)
369 *el = outlen;
370
d7a71dc3
NC
371 if (ep)
372 *ep = x;
86d06bda 373 return (x == send);
9c20fa4a
JH
374}
375
376/*
d7a71dc3 377
9041c2e3 378=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 379
9041c2e3
NIS
380Bottom level UTF-8 decode routine.
381Returns the unicode code point value of the first character in the string C<s>
cd458e05 382which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 383C<retlen> will be set to the length, in bytes, of that character.
67e989fb 384
cd458e05 385If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880
JH
386is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
387it is assumed that the caller will raise a warning, and this function
28d3d195
JH
388will silently just set C<retlen> to C<-1> and return zero. If the
389C<flags> does not contain UTF8_CHECK_ONLY, warnings about
390malformations will be given, C<retlen> will be set to the expected
391length of the UTF-8 character in bytes, and zero will be returned.
392
393The C<flags> can also contain various flags to allow deviations from
394the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 395
9041c2e3
NIS
396Most code should use utf8_to_uvchr() rather than call this directly.
397
37607a96
PK
398=cut
399*/
67e989fb 400
a0ed51b3 401UV
37607a96 402Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 403{
43b2f713 404 const U8 * const s0 = s;
9c5ffd7c 405 UV uv = *s, ouv = 0;
ba210ebe 406 STRLEN len = 1;
1f49be52
AL
407 const bool dowarn = ckWARN_d(WARN_UTF8);
408 const UV startbyte = *s;
ba210ebe 409 STRLEN expectlen = 0;
a0dbb045
JH
410 U32 warning = 0;
411
412/* This list is a superset of the UTF8_ALLOW_XXX. */
413
414#define UTF8_WARN_EMPTY 1
415#define UTF8_WARN_CONTINUATION 2
416#define UTF8_WARN_NON_CONTINUATION 3
417#define UTF8_WARN_FE_FF 4
418#define UTF8_WARN_SHORT 5
419#define UTF8_WARN_OVERFLOW 6
420#define UTF8_WARN_SURROGATE 7
c867b360
JH
421#define UTF8_WARN_LONG 8
422#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045
JH
423
424 if (curlen == 0 &&
425 !(flags & UTF8_ALLOW_EMPTY)) {
426 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
427 goto malformed;
428 }
429
1d72bdf6 430 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
431 if (retlen)
432 *retlen = 1;
c4d5f83a 433 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 434 }
67e989fb 435
421a8bf2 436 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 437 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 438 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
439 goto malformed;
440 }
441
421a8bf2 442 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 443 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 444 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
445 goto malformed;
446 }
9041c2e3 447
1d72bdf6 448#ifdef EBCDIC
75383841 449 uv = NATIVE_TO_UTF(uv);
1d72bdf6 450#else
fcc8fcf6
JH
451 if ((uv == 0xfe || uv == 0xff) &&
452 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 453 warning = UTF8_WARN_FE_FF;
ba210ebe 454 goto malformed;
a0ed51b3 455 }
1d72bdf6
NIS
456#endif
457
ba210ebe
JH
458 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
459 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
460 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
461 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
462#ifdef EBCDIC
463 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
464 else { len = 7; uv &= 0x01; }
465#else
ba210ebe
JH
466 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
467 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
468 else { len = 13; uv = 0; } /* whoa! */
469#endif
470
a0ed51b3
LW
471 if (retlen)
472 *retlen = len;
9041c2e3 473
ba210ebe
JH
474 expectlen = len;
475
fcc8fcf6
JH
476 if ((curlen < expectlen) &&
477 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 478 warning = UTF8_WARN_SHORT;
ba210ebe
JH
479 goto malformed;
480 }
481
482 len--;
a0ed51b3 483 s++;
ba210ebe
JH
484 ouv = uv;
485
a0ed51b3 486 while (len--) {
421a8bf2
JH
487 if (!UTF8_IS_CONTINUATION(*s) &&
488 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
489 s--;
490 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 491 goto malformed;
a0ed51b3
LW
492 }
493 else
8850bf83 494 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
495 if (!(uv > ouv)) {
496 /* These cannot be allowed. */
497 if (uv == ouv) {
f66b99ab 498 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045
JH
499 warning = UTF8_WARN_LONG;
500 goto malformed;
501 }
502 }
503 else { /* uv < ouv */
504 /* This cannot be allowed. */
505 warning = UTF8_WARN_OVERFLOW;
506 goto malformed;
507 }
ba210ebe
JH
508 }
509 s++;
510 ouv = uv;
511 }
512
421a8bf2 513 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 514 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 515 warning = UTF8_WARN_SURROGATE;
ba210ebe 516 goto malformed;
eb160463 517 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 518 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 519 warning = UTF8_WARN_LONG;
ba210ebe 520 goto malformed;
421a8bf2 521 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 522 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 523 warning = UTF8_WARN_FFFF;
a9917092 524 goto malformed;
a0ed51b3 525 }
ba210ebe 526
a0ed51b3 527 return uv;
ba210ebe
JH
528
529malformed:
530
fcc8fcf6 531 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 532 if (retlen)
d2ae4405 533 *retlen = ((STRLEN) -1);
ba210ebe
JH
534 return 0;
535 }
536
a0dbb045 537 if (dowarn) {
d7559646 538 SV* const sv = sv_2mortal(newSVpvs("Malformed UTF-8 character "));
a0dbb045
JH
539
540 switch (warning) {
541 case 0: /* Intentionally empty. */ break;
542 case UTF8_WARN_EMPTY:
d7559646 543 sv_catpvs(sv, "(empty string)");
a0dbb045
JH
544 break;
545 case UTF8_WARN_CONTINUATION:
097fb8e2 546 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045
JH
547 break;
548 case UTF8_WARN_NON_CONTINUATION:
097fb8e2
JH
549 if (s == s0)
550 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
551 (UV)s[1], startbyte);
78fd745e
AL
552 else {
553 const int len = (int)(s-s0);
097fb8e2 554 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
78fd745e
AL
555 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
556 }
557
a0dbb045
JH
558 break;
559 case UTF8_WARN_FE_FF:
560 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
561 break;
562 case UTF8_WARN_SHORT:
097fb8e2 563 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
228fe6e6 564 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 565 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
566 break;
567 case UTF8_WARN_OVERFLOW:
097fb8e2
JH
568 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
569 ouv, *s, startbyte);
a0dbb045
JH
570 break;
571 case UTF8_WARN_SURROGATE:
572 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
573 break;
a0dbb045 574 case UTF8_WARN_LONG:
097fb8e2 575 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
228fe6e6 576 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045
JH
577 break;
578 case UTF8_WARN_FFFF:
579 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
580 break;
581 default:
d7559646 582 sv_catpvs(sv, "(unknown reason)");
a0dbb045
JH
583 break;
584 }
585
586 if (warning) {
d015a557 587 const char * const s = SvPVX_const(sv);
a0dbb045
JH
588
589 if (PL_op)
9014280d 590 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 591 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 592 else
9014280d 593 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045
JH
594 }
595 }
596
ba210ebe 597 if (retlen)
28d3d195 598 *retlen = expectlen ? expectlen : len;
ba210ebe 599
28d3d195 600 return 0;
a0ed51b3
LW
601}
602
8e84507e 603/*
37607a96 604=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
9041c2e3
NIS
605
606Returns the native character value of the first character in the string C<s>
cd458e05 607which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
608length, in bytes, of that character.
609
cd458e05 610If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3
NIS
611returned and retlen is set, if possible, to -1.
612
613=cut
614*/
615
616UV
37607a96 617Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
9041c2e3 618{
5082993f
NC
619 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
620 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3
NIS
621}
622
623/*
37607a96 624=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
9041c2e3
NIS
625
626Returns the Unicode code point of the first character in the string C<s>
cd458e05 627which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
628length, in bytes, of that character.
629
630This function should only be used when returned UV is considered
631an index into the Unicode semantic tables (e.g. swashes).
632
cd458e05 633If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 634returned and retlen is set, if possible, to -1.
8e84507e
NIS
635
636=cut
637*/
638
639UV
37607a96 640Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
8e84507e 641{
9041c2e3 642 /* Call the low level routine asking for checks */
a2a469f9 643 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 644 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e
NIS
645}
646
b76347f2 647/*
37607a96 648=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
b76347f2
JH
649
650Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
651Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
652up past C<e>, croaks.
b76347f2
JH
653
654=cut
655*/
656
657STRLEN
37607a96 658Perl_utf8_length(pTHX_ U8 *s, U8 *e)
b76347f2
JH
659{
660 STRLEN len = 0;
d2ae4405 661 U8 t = 0;
b76347f2 662
8850bf83
JH
663 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
664 * the bitops (especially ~) can create illegal UTF-8.
665 * In other words: in Perl UTF-8 is not just for Unicode. */
666
8c18bf38
AL
667 if (e < s)
668 goto warn_and_return;
b76347f2 669 while (s < e) {
d2ae4405 670 t = UTF8SKIP(s);
901b21bf 671 if (e - s < t) {
8c18bf38 672 warn_and_return:
901b21bf
JH
673 if (ckWARN_d(WARN_UTF8)) {
674 if (PL_op)
675 Perl_warner(aTHX_ packWARN(WARN_UTF8),
8c18bf38 676 "%s in %s", unees, OP_DESC(PL_op));
901b21bf
JH
677 else
678 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
679 }
680 return len;
681 }
b76347f2
JH
682 s += t;
683 len++;
684 }
685
686 return len;
687}
688
b06226ff 689/*
eebe1485 690=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
b06226ff 691
cd458e05 692Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
693and C<b>.
694
695WARNING: use only if you *know* that the pointers point inside the
696same UTF-8 buffer.
697
37607a96
PK
698=cut
699*/
a0ed51b3 700
02eb7b47 701IV
864dbfa3 702Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 703{
2f583b3c 704 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
a0ed51b3
LW
705}
706
b06226ff 707/*
37607a96 708=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 709
8850bf83
JH
710Return the UTF-8 pointer C<s> displaced by C<off> characters, either
711forward or backward.
b06226ff
JH
712
713WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
714the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
715on the first byte of character or just after the last byte of a character.
b06226ff 716
37607a96
PK
717=cut
718*/
a0ed51b3
LW
719
720U8 *
864dbfa3 721Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 722{
1e7ed80e 723 PERL_UNUSED_CONTEXT;
8850bf83
JH
724 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
725 * the bitops (especially ~) can create illegal UTF-8.
726 * In other words: in Perl UTF-8 is not just for Unicode. */
727
a0ed51b3
LW
728 if (off >= 0) {
729 while (off--)
730 s += UTF8SKIP(s);
731 }
732 else {
733 while (off++) {
734 s--;
8850bf83
JH
735 while (UTF8_IS_CONTINUATION(*s))
736 s--;
a0ed51b3
LW
737 }
738 }
065cbbe5 739 return (U8 *)s;
a0ed51b3
LW
740}
741
6940069f 742/*
eebe1485 743=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 744
cd458e05 745Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53
MG
746Unlike C<bytes_to_utf8>, this over-writes the original string, and
747updates len to contain the new length.
67e989fb 748Returns zero on failure, setting C<len> to -1.
6940069f
GS
749
750=cut
751*/
752
753U8 *
37607a96 754Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 755{
43b2f713
AL
756 U8 * const save = s;
757 U8 * const send = s + *len;
6940069f 758 U8 *d;
246fae53 759
cd458e05 760 /* ensure valid UTF-8 and chars < 256 before updating string */
43b2f713 761 while (s < send) {
dcad2880
JH
762 U8 c = *s++;
763
1d72bdf6
NIS
764 if (!UTF8_IS_INVARIANT(c) &&
765 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
766 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
d2ae4405 767 *len = ((STRLEN) -1);
dcad2880
JH
768 return 0;
769 }
246fae53 770 }
dcad2880
JH
771
772 d = s = save;
6940069f 773 while (s < send) {
ed646e6e 774 STRLEN ulen;
9041c2e3 775 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 776 s += ulen;
6940069f
GS
777 }
778 *d = '\0';
246fae53 779 *len = d - save;
6940069f
GS
780 return save;
781}
782
783/*
c05e0e2f 784=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
f9a63242 785
cd458e05 786Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
7120cae1 787Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
788the newly-created string, and updates C<len> to contain the new
789length. Returns the original string if no conversion occurs, C<len>
790is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
7910 if C<s> is converted or contains all 7bit characters.
f9a63242 792
37607a96
PK
793=cut
794*/
f9a63242
JH
795
796U8 *
37607a96 797Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 798{
f9a63242 799 U8 *d;
c05e0e2f
AL
800 const U8 *start = s;
801 const U8 *send;
f9a63242 802 I32 count = 0;
ed6a78eb 803 const U8 *s2;
f9a63242 804
1e7ed80e 805 PERL_UNUSED_CONTEXT;
f9a63242 806 if (!*is_utf8)
e2b56717 807 return (U8 *)start;
f9a63242 808
cd458e05 809 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 810 for (send = s + *len; s < send;) {
c05e0e2f 811 U8 c = *s++;
1d72bdf6 812 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
813 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
814 (c = *s++) && UTF8_IS_CONTINUATION(c))
815 count++;
816 else
e2b56717 817 return (U8 *)start;
db42d148 818 }
f9a63242
JH
819 }
820
8e7b0921 821 *is_utf8 = FALSE;
f9a63242 822
e4720ea4 823 Newx(d, (*len) - count + 1, U8);
ed6a78eb
NC
824 s2 = start; start = d;
825 while (s2 < send) {
826 U8 c = *s2++;
c4d5f83a
NIS
827 if (!UTF8_IS_INVARIANT(c)) {
828 /* Then it is two-byte encoded */
ed6a78eb 829 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s2++);
c4d5f83a
NIS
830 c = ASCII_TO_NATIVE(c);
831 }
832 *d++ = c;
f9a63242
JH
833 }
834 *d = '\0';
835 *len = d - start;
e2b56717 836 return (U8 *)start;
f9a63242
JH
837}
838
839/*
eebe1485 840=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f 841
cd458e05 842Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e
GS
843Returns a pointer to the newly-created string, and sets C<len> to
844reflect the new length.
6940069f 845
cd458e05 846If you want to convert to UTF-8 from other encodings than ASCII,
5835a535
JH
847see sv_recode_to_utf8().
848
497711e7 849=cut
6940069f
GS
850*/
851
852U8*
37607a96 853Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
6940069f 854{
7120cae1 855 const U8 * const send = s + (*len);
6940069f
GS
856 U8 *d;
857 U8 *dst;
1e7ed80e 858 PERL_UNUSED_CONTEXT;
6940069f 859
e4720ea4 860 Newx(d, (*len) * 2 + 1, U8);
6940069f
GS
861 dst = d;
862
863 while (s < send) {
7120cae1 864 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 865 if (UNI_IS_INVARIANT(uv))
eb160463 866 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 867 else {
eb160463
GS
868 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
869 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
870 }
871 }
872 *d = '\0';
6662521e 873 *len = d-dst;
6940069f
GS
874 return dst;
875}
876
a0ed51b3 877/*
dea0fc0b 878 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
879 *
880 * Destination must be pre-extended to 3/2 source. Do not use in-place.
881 * We optimize for native, for obvious reasons. */
882
883U8*
dea0fc0b 884Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 885{
dea0fc0b
JH
886 U8* pend;
887 U8* dstart = d;
888
6dc97886
NC
889 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
890 d[0] = 0;
891 *newlen = 1;
892 return d;
893 }
894
dea0fc0b 895 if (bytelen & 1)
481ddcff 896 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVuf, (UV)bytelen);
dea0fc0b
JH
897
898 pend = p + bytelen;
899
a0ed51b3 900 while (p < pend) {
dea0fc0b
JH
901 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
902 p += 2;
a0ed51b3 903 if (uv < 0x80) {
209f66d7
NC
904#ifdef EBCDIC
905 *d++ = UNI_TO_NATIVE(uv);
906#else
eb160463 907 *d++ = (U8)uv;
209f66d7 908#endif
a0ed51b3
LW
909 continue;
910 }
911 if (uv < 0x800) {
eb160463
GS
912 *d++ = (U8)(( uv >> 6) | 0xc0);
913 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
914 continue;
915 }
916 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
ac388100
JH
917 UV low = (p[0] << 8) + p[1];
918 p += 2;
dea0fc0b
JH
919 if (low < 0xdc00 || low >= 0xdfff)
920 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
921 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
922 }
923 if (uv < 0x10000) {
eb160463
GS
924 *d++ = (U8)(( uv >> 12) | 0xe0);
925 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
926 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
927 continue;
928 }
929 else {
eb160463
GS
930 *d++ = (U8)(( uv >> 18) | 0xf0);
931 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
932 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
933 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
934 continue;
935 }
936 }
dea0fc0b 937 *newlen = d - dstart;
a0ed51b3
LW
938 return d;
939}
940
941/* Note: this one is slightly destructive of the source. */
942
943U8*
dea0fc0b 944Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
945{
946 U8* s = (U8*)p;
43b2f713 947 U8* const send = s + bytelen;
a0ed51b3 948 while (s < send) {
43b2f713 949 const U8 tmp = s[0];
a0ed51b3
LW
950 s[0] = s[1];
951 s[1] = tmp;
952 s += 2;
953 }
dea0fc0b 954 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
955}
956
957/* for now these are all defined (inefficiently) in terms of the utf8 versions */
958
959bool
84afefe6 960Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 961{
a2a469f9 962 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 963 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
964 return is_utf8_alnum(tmpbuf);
965}
966
967bool
84afefe6 968Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 969{
a2a469f9 970 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 971 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
972 return is_utf8_alnumc(tmpbuf);
973}
974
975bool
84afefe6 976Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 977{
a2a469f9 978 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 979 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
980 return is_utf8_idfirst(tmpbuf);
981}
982
983bool
84afefe6 984Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 985{
a2a469f9 986 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 987 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
988 return is_utf8_alpha(tmpbuf);
989}
990
991bool
84afefe6 992Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 993{
a2a469f9 994 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 995 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
996 return is_utf8_ascii(tmpbuf);
997}
998
999bool
84afefe6 1000Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1001{
a2a469f9 1002 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1003 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1004 return is_utf8_space(tmpbuf);
1005}
1006
1007bool
84afefe6 1008Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1009{
a2a469f9 1010 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1011 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1012 return is_utf8_digit(tmpbuf);
1013}
1014
1015bool
84afefe6 1016Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1017{
a2a469f9 1018 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1019 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1020 return is_utf8_upper(tmpbuf);
1021}
1022
1023bool
84afefe6 1024Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1025{
a2a469f9 1026 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1027 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1028 return is_utf8_lower(tmpbuf);
1029}
1030
1031bool
84afefe6 1032Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1033{
a2a469f9 1034 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1035 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1036 return is_utf8_cntrl(tmpbuf);
1037}
1038
1039bool
84afefe6 1040Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1041{
a2a469f9 1042 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1043 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1044 return is_utf8_graph(tmpbuf);
1045}
1046
1047bool
84afefe6 1048Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1049{
a2a469f9 1050 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1051 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1052 return is_utf8_print(tmpbuf);
1053}
1054
b8c5462f 1055bool
84afefe6 1056Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1057{
a2a469f9 1058 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1059 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1060 return is_utf8_punct(tmpbuf);
1061}
1062
4d61ec05 1063bool
84afefe6 1064Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1065{
a2a469f9 1066 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1067 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1068 return is_utf8_xdigit(tmpbuf);
1069}
1070
84afefe6
JH
1071UV
1072Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1073{
0ebc6274
JH
1074 uvchr_to_utf8(p, c);
1075 return to_utf8_upper(p, p, lenp);
a0ed51b3
LW
1076}
1077
84afefe6
JH
1078UV
1079Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1080{
0ebc6274
JH
1081 uvchr_to_utf8(p, c);
1082 return to_utf8_title(p, p, lenp);
a0ed51b3
LW
1083}
1084
84afefe6
JH
1085UV
1086Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1087{
0ebc6274
JH
1088 uvchr_to_utf8(p, c);
1089 return to_utf8_lower(p, p, lenp);
a0ed51b3
LW
1090}
1091
84afefe6
JH
1092UV
1093Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1094{
0ebc6274
JH
1095 uvchr_to_utf8(p, c);
1096 return to_utf8_fold(p, p, lenp);
84afefe6
JH
1097}
1098
a0ed51b3
LW
1099/* for now these all assume no locale info available for Unicode > 255 */
1100
1101bool
84afefe6 1102Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
1103{
1104 return is_uni_alnum(c); /* XXX no locale support yet */
1105}
1106
1107bool
84afefe6 1108Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f
JH
1109{
1110 return is_uni_alnumc(c); /* XXX no locale support yet */
1111}
1112
1113bool
84afefe6 1114Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
1115{
1116 return is_uni_idfirst(c); /* XXX no locale support yet */
1117}
1118
1119bool
84afefe6 1120Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
1121{
1122 return is_uni_alpha(c); /* XXX no locale support yet */
1123}
1124
1125bool
84afefe6 1126Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
1127{
1128 return is_uni_ascii(c); /* XXX no locale support yet */
1129}
1130
1131bool
84afefe6 1132Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
1133{
1134 return is_uni_space(c); /* XXX no locale support yet */
1135}
1136
1137bool
84afefe6 1138Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
1139{
1140 return is_uni_digit(c); /* XXX no locale support yet */
1141}
1142
1143bool
84afefe6 1144Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1145{
1146 return is_uni_upper(c); /* XXX no locale support yet */
1147}
1148
1149bool
84afefe6 1150Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1151{
1152 return is_uni_lower(c); /* XXX no locale support yet */
1153}
1154
1155bool
84afefe6 1156Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1157{
1158 return is_uni_cntrl(c); /* XXX no locale support yet */
1159}
1160
1161bool
84afefe6 1162Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1163{
1164 return is_uni_graph(c); /* XXX no locale support yet */
1165}
1166
1167bool
84afefe6 1168Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1169{
1170 return is_uni_print(c); /* XXX no locale support yet */
1171}
1172
b8c5462f 1173bool
84afefe6 1174Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1175{
1176 return is_uni_punct(c); /* XXX no locale support yet */
1177}
1178
4d61ec05 1179bool
84afefe6 1180Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1181{
1182 return is_uni_xdigit(c); /* XXX no locale support yet */
1183}
1184
b7ac61fa
JH
1185U32
1186Perl_to_uni_upper_lc(pTHX_ U32 c)
1187{
ee099d14
JH
1188 /* XXX returns only the first character -- do not use XXX */
1189 /* XXX no locale support yet */
1190 STRLEN len;
a2a469f9 1191 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1192 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1193}
1194
1195U32
1196Perl_to_uni_title_lc(pTHX_ U32 c)
1197{
ee099d14
JH
1198 /* XXX returns only the first character XXX -- do not use XXX */
1199 /* XXX no locale support yet */
1200 STRLEN len;
a2a469f9 1201 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1202 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1203}
1204
1205U32
1206Perl_to_uni_lower_lc(pTHX_ U32 c)
1207{
ee099d14
JH
1208 /* XXX returns only the first character -- do not use XXX */
1209 /* XXX no locale support yet */
1210 STRLEN len;
a2a469f9 1211 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1212 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1213}
1214
6d29369a 1215static bool
4127a31c
RH
1216S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
1217 const char *const swashname)
a0ed51b3 1218{
4127a31c 1219 if (!is_utf8_char((U8 *)p))
386d01d6 1220 return FALSE;
5082993f 1221 if (!*swash)
ef049b7c 1222 *swash = swash_init("utf8", (char *)swashname, &PL_sv_undef, 1, 0);
4127a31c 1223 return swash_fetch(*swash, (U8 *)p, TRUE) != 0;
5082993f
NC
1224}
1225
1226bool
1227Perl_is_utf8_alnum(pTHX_ U8 *p)
1228{
1229 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1230 * descendant of isalnum(3), in other words, it doesn't
1231 * contain the '_'. --jhi */
43b2f713 1232 return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
a0ed51b3
LW
1233}
1234
1235bool
b8c5462f
JH
1236Perl_is_utf8_alnumc(pTHX_ U8 *p)
1237{
43b2f713 1238 return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
b8c5462f
JH
1239}
1240
1241bool
82686b01 1242Perl_is_utf8_idfirst(pTHX_ U8 *p) /* The naming is historical. */
a0ed51b3 1243{
82686b01
JH
1244 if (*p == '_')
1245 return TRUE;
5082993f 1246 /* is_utf8_idstart would be more logical. */
43b2f713 1247 return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
82686b01
JH
1248}
1249
1250bool
1251Perl_is_utf8_idcont(pTHX_ U8 *p)
1252{
1253 if (*p == '_')
1254 return TRUE;
43b2f713 1255 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
a0ed51b3
LW
1256}
1257
1258bool
864dbfa3 1259Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 1260{
43b2f713 1261 return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
a0ed51b3
LW
1262}
1263
1264bool
b8c5462f
JH
1265Perl_is_utf8_ascii(pTHX_ U8 *p)
1266{
43b2f713 1267 return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
b8c5462f
JH
1268}
1269
1270bool
864dbfa3 1271Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 1272{
43b2f713 1273 return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
a0ed51b3
LW
1274}
1275
1276bool
864dbfa3 1277Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 1278{
43b2f713 1279 return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
a0ed51b3
LW
1280}
1281
1282bool
864dbfa3 1283Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1284{
43b2f713 1285 return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
a0ed51b3
LW
1286}
1287
1288bool
864dbfa3 1289Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1290{
43b2f713 1291 return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
a0ed51b3
LW
1292}
1293
1294bool
b8c5462f
JH
1295Perl_is_utf8_cntrl(pTHX_ U8 *p)
1296{
43b2f713 1297 return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
b8c5462f
JH
1298}
1299
1300bool
1301Perl_is_utf8_graph(pTHX_ U8 *p)
1302{
43b2f713 1303 return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
b8c5462f
JH
1304}
1305
1306bool
864dbfa3 1307Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 1308{
43b2f713 1309 return is_utf8_common(p, &PL_utf8_print, "IsPrint");
a0ed51b3
LW
1310}
1311
1312bool
b8c5462f
JH
1313Perl_is_utf8_punct(pTHX_ U8 *p)
1314{
43b2f713 1315 return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
b8c5462f
JH
1316}
1317
1318bool
1319Perl_is_utf8_xdigit(pTHX_ U8 *p)
1320{
43b2f713 1321 return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
b8c5462f
JH
1322}
1323
1324bool
864dbfa3 1325Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 1326{
43b2f713 1327 return is_utf8_common(p, &PL_utf8_mark, "IsM");
a0ed51b3
LW
1328}
1329
6b5c0936
JH
1330/*
1331=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1332
1333The "p" contains the pointer to the UTF-8 string encoding
1334the character that is being converted.
1335
1336The "ustrp" is a pointer to the character buffer to put the
1337conversion result to. The "lenp" is a pointer to the length
1338of the result.
1339
0134edef 1340The "swashp" is a pointer to the swash to use.
6b5c0936 1341
0134edef 1342Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
ef049b7c 1343and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
0134edef 1344but not always, a multicharacter mapping), is tried first.
6b5c0936 1345
0134edef
JH
1346The "special" is a string like "utf8::ToSpecLower", which means the
1347hash %utf8::ToSpecLower. The access to the hash is through
1348Perl_to_utf8_case().
6b5c0936 1349
0134edef
JH
1350The "normal" is a string like "ToLower" which means the swash
1351%utf8::ToLower.
1352
1353=cut */
6b5c0936 1354
2104c8d9 1355UV
a6872d42 1356Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
a0ed51b3 1357{
a2a469f9 1358 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1359 STRLEN len = 0;
a0ed51b3 1360
c9dc1ff4 1361 const UV uv0 = utf8_to_uvchr(p, NULL);
1feea2c7
JH
1362 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1363 * are necessary in EBCDIC, they are redundant no-ops
1364 * in ASCII-ish platforms, and hopefully optimized away. */
24c2fff4 1365 const UV uv1 = NATIVE_TO_UNI(uv0);
1feea2c7 1366 uvuni_to_utf8(tmpbuf, uv1);
0134edef
JH
1367
1368 if (!*swashp) /* load on-demand */
1369 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1370
08952adc
JH
1371 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1372 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1373 /* It might be "special" (sometimes, but not always,
2a37f04d 1374 * a multicharacter mapping) */
8e7b0921 1375 HV * const hv = get_hv(special, FALSE);
08952adc
JH
1376 SV **svp;
1377
8e7b0921 1378 if (hv &&
08952adc
JH
1379 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1380 (*svp)) {
71a0dd65 1381 const char *s;
47654450 1382
71a0dd65 1383 s = SvPV_const(*svp, len);
47654450
JH
1384 if (len == 1)
1385 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1386 else {
2f9475ad
JH
1387#ifdef EBCDIC
1388 /* If we have EBCDIC we need to remap the characters
1389 * since any characters in the low 256 are Unicode
1390 * code points, not EBCDIC. */
7cda7a3d 1391 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad
JH
1392
1393 d = tmpbuf;
08952adc 1394 if (SvUTF8(*svp)) {
2f9475ad
JH
1395 STRLEN tlen = 0;
1396
1397 while (t < tend) {
43b2f713 1398 const UV c = utf8_to_uvchr(t, &tlen);
2f9475ad
JH
1399 if (tlen > 0) {
1400 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1401 t += tlen;
1402 }
1403 else
1404 break;
1405 }
1406 }
1407 else {
36fec512
JH
1408 while (t < tend) {
1409 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1410 t++;
1411 }
2f9475ad
JH
1412 }
1413 len = d - tmpbuf;
1414 Copy(tmpbuf, ustrp, len, U8);
1415#else
d2dcd0fb 1416 Copy(s, ustrp, len, U8);
2f9475ad 1417#endif
29e98929 1418 }
983ffd37 1419 }
0134edef
JH
1420 }
1421
1422 if (!len && *swashp) {
43b2f713
AL
1423 const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1424
0134edef
JH
1425 if (uv2) {
1426 /* It was "normal" (a single character mapping). */
43b2f713 1427 const UV uv3 = UNI_TO_NATIVE(uv2);
e9101d72 1428 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d
JH
1429 }
1430 }
1feea2c7 1431
0134edef
JH
1432 if (!len) /* Neither: just copy. */
1433 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1434
2a37f04d
JH
1435 if (lenp)
1436 *lenp = len;
1437
0134edef 1438 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3
LW
1439}
1440
d3e79532
JH
1441/*
1442=for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1443
1444Convert the UTF-8 encoded character at p to its uppercase version and
1445store that in UTF-8 in ustrp and its length in bytes in lenp. Note
a2a469f9
JH
1446that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1447the uppercase version may be longer than the original character.
d3e79532
JH
1448
1449The first character of the uppercased version is returned
1450(but note, as explained above, that there may be more.)
1451
1452=cut */
1453
2104c8d9 1454UV
983ffd37 1455Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1456{
983ffd37 1457 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1458 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1459}
a0ed51b3 1460
d3e79532
JH
1461/*
1462=for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1463
1464Convert the UTF-8 encoded character at p to its titlecase version and
1465store that in UTF-8 in ustrp and its length in bytes in lenp. Note
a2a469f9
JH
1466that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1467titlecase version may be longer than the original character.
d3e79532
JH
1468
1469The first character of the titlecased version is returned
1470(but note, as explained above, that there may be more.)
1471
1472=cut */
1473
983ffd37
JH
1474UV
1475Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1476{
1477 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1478 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1479}
1480
d3e79532
JH
1481/*
1482=for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1483
1484Convert the UTF-8 encoded character at p to its lowercase version and
1485store that in UTF-8 in ustrp and its length in bytes in lenp. Note
a2a469f9
JH
1486that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1487lowercase version may be longer than the original character.
d3e79532
JH
1488
1489The first character of the lowercased version is returned
1490(but note, as explained above, that there may be more.)
1491
1492=cut */
1493
2104c8d9 1494UV
a2a2844f 1495Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1496{
983ffd37 1497 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1498 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1499}
1500
d3e79532
JH
1501/*
1502=for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1503
1504Convert the UTF-8 encoded character at p to its foldcase version and
1505store that in UTF-8 in ustrp and its length in bytes in lenp. Note
a2a469f9 1506that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532
JH
1507foldcase version may be longer than the original character (up to
1508three characters).
1509
1510The first character of the foldcased version is returned
1511(but note, as explained above, that there may be more.)
1512
1513=cut */
1514
b4e400f9
JH
1515UV
1516Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1517{
1518 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1519 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1520}
1521
ef049b7c
TS
1522/* Note:
1523 * A "swash" is a swatch hash.
1524 * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1525 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1526 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1527 */
a0ed51b3 1528SV*
864dbfa3 1529Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3
LW
1530{
1531 SV* retval;
e7b555a6 1532 SV* const tokenbufsv = sv_newmortal();
8e84507e 1533 dSP;
1f49be52
AL
1534 const size_t pkg_len = strlen(pkg);
1535 const size_t name_len = strlen(name);
c9dc1ff4 1536 HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
f8be5cf0 1537 SV* errsv_save;
ce3b816e 1538
f66b99ab
NC
1539 PUSHSTACKi(PERLSI_MAGIC);
1540 ENTER;
1541 SAVEI32(PL_hints);
1542 PL_hints = 0;
1543 save_re_context();
1b026014 1544 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1545 ENTER;
f8be5cf0 1546 errsv_save = newSVsv(ERRSV);
50dbc71b
NC
1547 /* It is assumed that callers of this routine are not passing in any
1548 user derived data. */
1549 /* Need to do this after save_re_context() as it will set PL_tainted to
1550 1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1551 Even line to create errsv_save can turn on PL_tainted. */
1552 SAVEBOOL(PL_tainted);
1553 PL_tainted = 0;
b035a42e 1554 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
0e2d6244 1555 NULL);
f8be5cf0
JH
1556 if (!SvTRUE(ERRSV))
1557 sv_setsv(ERRSV, errsv_save);
1558 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1559 LEAVE;
1560 }
1561 SPAGAIN;
a0ed51b3
LW
1562 PUSHMARK(SP);
1563 EXTEND(SP,5);
b035a42e
NC
1564 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1565 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
a0ed51b3
LW
1566 PUSHs(listsv);
1567 PUSHs(sv_2mortal(newSViv(minbits)));
1568 PUSHs(sv_2mortal(newSViv(none)));
1569 PUTBACK;
ef7b71f0 1570 if (IN_PERL_COMPILETIME) {
bf1fed83 1571 /* XXX ought to be handled by lex_start */
82686b01 1572 SAVEI32(PL_in_my);
5b7ea690 1573 PL_in_my = 0;
bf1fed83 1574 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1575 }
f8be5cf0 1576 errsv_save = newSVsv(ERRSV);
864dbfa3 1577 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1578 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1579 else
e24b16f9 1580 retval = &PL_sv_undef;
f8be5cf0
JH
1581 if (!SvTRUE(ERRSV))
1582 sv_setsv(ERRSV, errsv_save);
1583 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1584 LEAVE;
1585 POPSTACK;
ef7b71f0 1586 if (IN_PERL_COMPILETIME) {
bf1fed83 1587 STRLEN len;
c9dc1ff4 1588 const char* const pv = SvPV_const(tokenbufsv, len);
bf1fed83
JH
1589
1590 Copy(pv, PL_tokenbuf, len+1, char);
2cb86c03 1591 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 1592 }
bc45ce41
JH
1593 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1594 if (SvPOK(retval))
c293eb2b 1595 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
f2139bd3 1596 (void*)retval);
cea2e8a9 1597 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1598 }
a0ed51b3
LW
1599 return retval;
1600}
1601
035d37be
JH
1602
1603/* This API is wrong for special case conversions since we may need to
1604 * return several Unicode characters for a single Unicode character
1605 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1606 * the lower-level routine, and it is similarly broken for returning
1607 * multiple values. --jhi */
ef049b7c
TS
1608/* Now SWASHGET is recasted into S_swash_get in this file. */
1609
1610/* Note:
1611 * Returns the value of property/mapping C<swash> for the first character
1612 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1613 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1614 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1615 */
a0ed51b3 1616UV
ef049b7c 1617Perl_swash_fetch(pTHX_ SV *swash, U8 *ptr, bool do_utf8)
a0ed51b3 1618{
ef049b7c 1619 HV* const hv = (HV*)SvRV(swash);
3568d838
JH
1620 U32 klen;
1621 U32 off;
a0ed51b3 1622 STRLEN slen;
7d85a32c 1623 STRLEN needents;
71a0dd65 1624 const U8 *tmps = NULL;
a0ed51b3 1625 U32 bit;
ef049b7c 1626 SV *swatch;
3568d838 1627 U8 tmputf8[2];
8e7b0921 1628 const UV c = NATIVE_TO_ASCII(*ptr);
3568d838
JH
1629
1630 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
ef049b7c
TS
1631 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1632 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1633 ptr = tmputf8;
3568d838
JH
1634 }
1635 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1636 * then the "swatch" is a vec() for al the chars which start
1637 * with 0xAA..0xYY
1638 * So the key in the hash (klen) is length of encoded char -1
1639 */
1640 klen = UTF8SKIP(ptr) - 1;
1641 off = ptr[klen];
a0ed51b3 1642
ef049b7c 1643 if (klen == 0) {
7d85a32c 1644 /* If char in invariant then swatch is for all the invariant chars
cd458e05 1645 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c 1646 */
ef049b7c
TS
1647 needents = UTF_CONTINUATION_MARK;
1648 off = NATIVE_TO_UTF(ptr[klen]);
1649 }
1650 else {
7d85a32c 1651 /* If char is encoded then swatch is for the prefix */
ef049b7c
TS
1652 needents = (1 << UTF_ACCUMULATION_SHIFT);
1653 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1654 }
7d85a32c 1655
a0ed51b3
LW
1656 /*
1657 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1658 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1659 * it's nothing to sniff at.) Pity we usually come through at least
1660 * two function calls to get here...
1661 *
1662 * NB: this code assumes that swatches are never modified, once generated!
1663 */
1664
3568d838 1665 if (hv == PL_last_swash_hv &&
a0ed51b3 1666 klen == PL_last_swash_klen &&
fe20fd30 1667 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1668 {
1669 tmps = PL_last_swash_tmps;
1670 slen = PL_last_swash_slen;
1671 }
1672 else {
1673 /* Try our second-level swatch cache, kept in a hash. */
c05e0e2f 1674 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 1675
ef049b7c
TS
1676 /* If not cached, generate it via swash_get */
1677 if (!svp || !SvPOK(*svp)
1678 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
2b9d42f0
NIS
1679 /* We use utf8n_to_uvuni() as we want an index into
1680 Unicode tables, not a native character number.
1681 */
c9dc1ff4 1682 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae
JH
1683 ckWARN(WARN_UTF8) ?
1684 0 : UTF8_ALLOW_ANY);
ef049b7c
TS
1685 swatch = swash_get(swash,
1686 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1687 (klen) ? (code_point & ~(needents - 1)) : 0,
1688 needents);
1689
ef7b71f0 1690 if (IN_PERL_COMPILETIME)
2cb86c03 1691 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 1692
ef049b7c 1693 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 1694
ef049b7c
TS
1695 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1696 || (slen << 3) < needents)
1697 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
a0ed51b3
LW
1698 }
1699
1700 PL_last_swash_hv = hv;
1701 PL_last_swash_klen = klen;
71a0dd65
NC
1702 /* FIXME change interpvar.h? */
1703 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
1704 PL_last_swash_slen = slen;
1705 if (klen)
1706 Copy(ptr, PL_last_swash_key, klen, U8);
1707 }
1708
9faf8d75 1709 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1710 case 1:
1711 bit = 1 << (off & 7);
1712 off >>= 3;
1713 return (tmps[off] & bit) != 0;
1714 case 8:
1715 return tmps[off];
1716 case 16:
1717 off <<= 1;
1718 return (tmps[off] << 8) + tmps[off + 1] ;
1719 case 32:
1720 off <<= 2;
1721 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1722 }
ef049b7c 1723 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
75056b2a 1724 NORETURN_FUNCTION_END;
a0ed51b3 1725}
2b9d42f0 1726
ef049b7c
TS
1727/* Note:
1728 * Returns a swatch (a bit vector string) for a code point sequence
1729 * that starts from the value C<start> and comprises the number C<span>.
1730 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1731 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1732 */
1733STATIC SV*
1734S_swash_get(pTHX_ SV* swash, UV start, UV span)
1735{
1736 SV *swatch;
1737 U8 *l, *lend, *x, *xend, *s;
1738 STRLEN lcur, xcur, scur;
1739
1740 HV* const hv = (HV*)SvRV(swash);
d7559646
AL
1741 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1742 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1743 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1744 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1745 SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
8c89da26
AL
1746 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1747 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
1748 const STRLEN bits = SvUV(*bitssvp);
1749 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1750 const UV none = SvUV(*nonesvp);
1751 const UV end = start + span;
ef049b7c
TS
1752
1753 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
1754 Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1755 (UV)bits);
1756 }
1757
1758 /* create and initialize $swatch */
d7559646 1759 swatch = newSVpvs("");
ef049b7c
TS
1760 scur = octets ? (span * octets) : (span + 7) / 8;
1761 SvGROW(swatch, scur + 1);
1762 s = (U8*)SvPVX(swatch);
1763 if (octets && none) {
8c89da26 1764 const U8* const e = s + scur;
ef049b7c
TS
1765 while (s < e) {
1766 if (bits == 8)
1767 *s++ = (U8)(none & 0xff);
1768 else if (bits == 16) {
1769 *s++ = (U8)((none >> 8) & 0xff);
1770 *s++ = (U8)( none & 0xff);
1771 }
1772 else if (bits == 32) {
1773 *s++ = (U8)((none >> 24) & 0xff);
1774 *s++ = (U8)((none >> 16) & 0xff);
1775 *s++ = (U8)((none >> 8) & 0xff);
1776 *s++ = (U8)( none & 0xff);
1777 }
1778 }
1779 *s = '\0';
1780 }
1781 else {
1782 (void)memzero((U8*)s, scur + 1);
1783 }
1784 SvCUR_set(swatch, scur);
1785 s = (U8*)SvPVX(swatch);
1786
1787 /* read $swash->{LIST} */
1788 l = (U8*)SvPV(*listsvp, lcur);
1789 lend = l + lcur;
1790 while (l < lend) {
8e7b0921 1791 UV min, max, val;
ef049b7c
TS
1792 STRLEN numlen;
1793 I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1794
8c89da26 1795 U8* const nl = (U8*)memchr(l, '\n', lend - l);
ef049b7c
TS
1796
1797 numlen = lend - l;
1798 min = grok_hex((char *)l, &numlen, &flags, NULL);
1799 if (numlen)
1800 l += numlen;
1801 else if (nl) {
1802 l = nl + 1; /* 1 is length of "\n" */
1803 continue;
1804 }
1805 else {
1806 l = lend; /* to LIST's end at which \n is not found */
1807 break;
1808 }
1809
1810 if (isBLANK(*l)) {
1811 ++l;
1812 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1813 numlen = lend - l;
1814 max = grok_hex((char *)l, &numlen, &flags, NULL);
1815 if (numlen)
1816 l += numlen;
1817 else
1818 max = min;
1819
1820 if (octets) {
1821 if (isBLANK(*l)) {
1822 ++l;
1823 flags = PERL_SCAN_SILENT_ILLDIGIT |
1824 PERL_SCAN_DISALLOW_PREFIX;
1825 numlen = lend - l;
1826 val = grok_hex((char *)l, &numlen, &flags, NULL);
1827 if (numlen)
1828 l += numlen;
1829 else
1830 val = 0;
1831 }
1832 else {
1833 val = 0;
1834 if (typeto) {
1835 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1836 typestr, l);
1837 }
1838 }
1839 }
1840 else
1841 val = 0; /* bits == 1, then val should be ignored */
1842 }
1843 else {
1844 max = min;
1845 if (octets) {
1846 val = 0;
1847 if (typeto) {
1848 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1849 }
1850 }
1851 else
1852 val = 0; /* bits == 1, then val should be ignored */
1853 }
1854
1855 if (nl)
1856 l = nl + 1;
1857 else
1858 l = lend;
1859
1860 if (max < start)
1861 continue;
1862
1863 if (octets) {
8e7b0921 1864 UV key;
ef049b7c
TS
1865 if (min < start) {
1866 if (!none || val < none) {
1867 val += start - min;
1868 }
1869 min = start;
1870 }
1871 for (key = min; key <= max; key++) {
1872 STRLEN offset;
1873 if (key >= end)
1874 goto go_out_list;
1875 /* offset must be non-negative (start <= min <= key < end) */
1876 offset = octets * (key - start);
1877 if (bits == 8)
1878 s[offset] = (U8)(val & 0xff);
1879 else if (bits == 16) {
1880 s[offset ] = (U8)((val >> 8) & 0xff);
1881 s[offset + 1] = (U8)( val & 0xff);
1882 }
1883 else if (bits == 32) {
1884 s[offset ] = (U8)((val >> 24) & 0xff);
1885 s[offset + 1] = (U8)((val >> 16) & 0xff);
1886 s[offset + 2] = (U8)((val >> 8) & 0xff);
1887 s[offset + 3] = (U8)( val & 0xff);
1888 }
1889
1890 if (!none || val < none)
1891 ++val;
1892 }
1893 }
1894 else { /* bits == 1, then val should be ignored */
8e7b0921 1895 UV key;
ef049b7c
TS
1896 if (min < start)
1897 min = start;
1898 for (key = min; key <= max; key++) {
8c89da26 1899 const STRLEN offset = (STRLEN)(key - start);
ef049b7c
TS
1900 if (key >= end)
1901 goto go_out_list;
1902 s[offset >> 3] |= 1 << (offset & 7);
1903 }
1904 }
1905 } /* while */
1906 go_out_list:
1907
1908 /* read $swash->{EXTRAS} */
1909 x = (U8*)SvPV(*extssvp, xcur);
1910 xend = x + xcur;
1911 while (x < xend) {
1912 STRLEN namelen;
1913 U8 *namestr;
1914 SV** othersvp;
1915 HV* otherhv;
1916 STRLEN otherbits;
1917 SV **otherbitssvp, *other;
1918 U8 *s, *o, *nl;
1919 STRLEN slen, olen;
1920
8e7b0921 1921 const U8 opc = *x++;
ef049b7c
TS
1922 if (opc == '\n')
1923 continue;
1924
1925 nl = (U8*)memchr(x, '\n', xend - x);
1926
1927 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
1928 if (nl) {
1929 x = nl + 1; /* 1 is length of "\n" */
1930 continue;
1931 }
1932 else {
1933 x = xend; /* to EXTRAS' end at which \n is not found */
1934 break;
1935 }
1936 }
1937
1938 namestr = x;
1939 if (nl) {
1940 namelen = nl - namestr;
1941 x = nl + 1;
1942 }
1943 else {
1944 namelen = xend - namestr;
1945 x = xend;
1946 }
1947
1948 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
1949 otherhv = (HV*)SvRV(*othersvp);
d7559646 1950 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
ef049b7c
TS
1951 otherbits = (STRLEN)SvUV(*otherbitssvp);
1952 if (bits < otherbits)
1953 Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
1954
1955 /* The "other" swatch must be destroyed after. */
1956 other = swash_get(*othersvp, start, span);
1957 o = (U8*)SvPV(other, olen);
1958
1959 if (!olen)
1960 Perl_croak(aTHX_ "panic: swash_get got improper swatch");
1961
1962 s = (U8*)SvPV(swatch, slen);
1963 if (bits == 1 && otherbits == 1) {
1964 if (slen != olen)
1965 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
1966
1967 switch (opc) {
1968 case '+':
1969 while (slen--)
1970 *s++ |= *o++;
1971 break;
1972 case '!':
1973 while (slen--)
1974 *s++ |= ~*o++;
1975 break;
1976 case '-':
1977 while (slen--)
1978 *s++ &= ~*o++;
1979 break;
1980 case '&':
1981 while (slen--)
1982 *s++ &= *o++;
1983 break;
1984 default:
1985 break;
1986 }
1987 }
1988 else {
1989 STRLEN otheroctets = otherbits >> 3;
1990 STRLEN offset = 0;
8e7b0921 1991 U8* const send = s + slen;
ef049b7c
TS
1992
1993 while (s < send) {
1994 UV otherval = 0;
1995
1996 if (otherbits == 1) {
1997 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
1998 ++offset;
1999 }
2000 else {
2001 STRLEN vlen = otheroctets;
2002 otherval = *o++;
2003 while (--vlen) {
2004 otherval <<= 8;
2005 otherval |= *o++;
2006 }
2007 }
2008
2009 if (opc == '+' && otherval)
c9c8c8e1 2010 NOOP; /* replace with otherval */
ef049b7c
TS
2011 else if (opc == '!' && !otherval)
2012 otherval = 1;
2013 else if (opc == '-' && otherval)
2014 otherval = 0;
2015 else if (opc == '&' && !otherval)
2016 otherval = 0;
2017 else {
2018 s += octets; /* no replacement */
2019 continue;
2020 }
2021
2022 if (bits == 8)
2023 *s++ = (U8)( otherval & 0xff);
2024 else if (bits == 16) {
2025 *s++ = (U8)((otherval >> 8) & 0xff);
2026 *s++ = (U8)( otherval & 0xff);
2027 }
2028 else if (bits == 32) {
2029 *s++ = (U8)((otherval >> 24) & 0xff);
2030 *s++ = (U8)((otherval >> 16) & 0xff);
2031 *s++ = (U8)((otherval >> 8) & 0xff);
2032 *s++ = (U8)( otherval & 0xff);
2033 }
2034 }
2035 }
2036 sv_free(other); /* through with it! */
2037 } /* while */
2038 return swatch;
2039}
2040
5082993f
NC
2041/*
2042=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2043
2044Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2045of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2046bytes available. The return value is the pointer to the byte after the
2047end of the new character. In other words,
2048
2049 d = uvchr_to_utf8(d, uv);
2050
2051is the recommended wide native character-aware way of saying
2052
2053 *(d++) = uv;
2054
2055=cut
2056*/
2057
2058/* On ASCII machines this is normally a macro but we want a
2059 real function in case XS code wants it
2060*/
2061U8 *
2062Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2063{
2064 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2065}
2066
b851fbc1
JH
2067U8 *
2068Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2069{
2070 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2071}
2b9d42f0
NIS
2072
2073/*
5082993f
NC
2074=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2075flags
2076
2077Returns the native character value of the first character in the string
2078C<s>
2079which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2080length, in bytes, of that character.
2081
2082Allows length and flags to be passed to low level routine.
2083
2084=cut
2085*/
2086/* On ASCII machines this is normally a macro but we want
2087 a real function in case XS code wants it
2088*/
2089UV
2090Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2091{
2092 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2093 return UNI_TO_NATIVE(uv);
2094}
2095
2096/*
d2cc3551
JH
2097=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2098
2099Build to the scalar dsv a displayable version of the string spv,
2100length len, the displayable version being at most pvlim bytes long
2101(if longer, the rest is truncated and "..." will be appended).
0a2ef054 2102
9e55ce06 2103The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 2104isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054
JH
2105to display the \\[nrfta\\] as the backslashed versions (like '\n')
2106(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2107UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2108UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2109
d2cc3551
JH
2110The pointer to the PV of the dsv is returned.
2111
2112=cut */
e6b2e755
JH
2113char *
2114Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
2115{
2116 int truncated = 0;
c05e0e2f 2117 const char *s, *e;
e6b2e755
JH
2118
2119 sv_setpvn(dsv, "", 0);
c05e0e2f 2120 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 2121 UV u;
04851bb3
NC
2122 /* This serves double duty as a flag and a character to print after
2123 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2124 */
2125 char ok = 0;
c728cb41 2126
e6b2e755
JH
2127 if (pvlim && SvCUR(dsv) >= pvlim) {
2128 truncated++;
2129 break;
2130 }
2131 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 2132 if (u < 256) {
8c18bf38 2133 const unsigned char c = (unsigned char)u & 0xFF;
8c89da26 2134 if (flags & UNI_DISPLAY_BACKSLASH) {
04851bb3 2135 switch (c) {
c728cb41 2136 case '\n':
04851bb3 2137 ok = 'n'; break;
c728cb41 2138 case '\r':
04851bb3 2139 ok = 'r'; break;
c728cb41 2140 case '\t':
04851bb3 2141 ok = 't'; break;
c728cb41 2142 case '\f':
04851bb3 2143 ok = 'f'; break;
c728cb41 2144 case '\a':
04851bb3 2145 ok = 'a'; break;
c728cb41 2146 case '\\':
04851bb3 2147 ok = '\\'; break;
c728cb41
JH
2148 default: break;
2149 }
04851bb3
NC
2150 if (ok) {
2151 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
2152 }
c728cb41 2153 }
00e86452 2154 /* isPRINT() is the locale-blind version. */
04851bb3
NC
2155 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2156 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
2157 ok = 1;
0a2ef054 2158 }
c728cb41
JH
2159 }
2160 if (!ok)
9e55ce06 2161 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755
JH
2162 }
2163 if (truncated)
d7559646 2164 sv_catpvs(dsv, "...");
e6b2e755
JH
2165
2166 return SvPVX(dsv);
2167}
2b9d42f0 2168
d2cc3551
JH
2169/*
2170=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2171
2172Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 2173the displayable version being at most pvlim bytes long
d2cc3551 2174(if longer, the rest is truncated and "..." will be appended).
0a2ef054
JH
2175
2176The flags argument is as in pv_uni_display().
2177
d2cc3551
JH
2178The pointer to the PV of the dsv is returned.
2179
43b2f713
AL
2180=cut
2181*/
e6b2e755
JH
2182char *
2183Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2184{
ed6a78eb 2185 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX_const(ssv),
71a0dd65 2186 SvCUR(ssv), pvlim, flags);
701a277b
JH
2187}
2188
d2cc3551 2189/*
d07ddd77 2190=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551
JH
2191
2192Return true if the strings s1 and s2 differ case-insensitively, false
2193if not (if they are equal case-insensitively). If u1 is true, the
2194string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77
JH
2195the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
2196are false, the respective string is assumed to be in native 8-bit
2197encoding.
2198
2199If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2200in there (they will point at the beginning of the I<next> character).
2201If the pointers behind pe1 or pe2 are non-NULL, they are the end
2202pointers beyond which scanning will not continue under any
359e6eb4 2203circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77
JH
2204s2+l2 will be used as goal end pointers that will also stop the scan,
2205and which qualify towards defining a successful match: all the scans
2206that define an explicit length must reach their goal pointers for
2207a match to succeed).
d2cc3551
JH
2208
2209For case-insensitiveness, the "casefolding" of Unicode is used
2210instead of upper/lowercasing both the characters, see
2211http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2212
2213=cut */
701a277b 2214I32
d07ddd77 2215Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 2216{
c05e0e2f
AL
2217 register const U8 *p1 = (const U8*)s1;
2218 register const U8 *p2 = (const U8*)s2;
4358d444 2219 register const U8 *f1 = NULL;
0bfdb59a 2220 register const U8 *f2 = NULL;
4358d444
NC
2221 register U8 *e1 = NULL;
2222 register U8 *q1 = NULL;
2223 register U8 *e2 = NULL;
2224 register U8 *q2 = NULL;
d07ddd77 2225 STRLEN n1 = 0, n2 = 0;
a2a469f9
JH
2226 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2227 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8
JH
2228 U8 natbuf[1+1];
2229 STRLEN foldlen1, foldlen2;
d07ddd77 2230 bool match;
332ddc25 2231
d07ddd77
JH
2232 if (pe1)
2233 e1 = *(U8**)pe1;
c05e0e2f
AL
2234 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2235 f1 = (const U8*)s1 + l1;
d07ddd77
JH
2236 if (pe2)
2237 e2 = *(U8**)pe2;
c05e0e2f
AL
2238 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2239 f2 = (const U8*)s2 + l2;
d07ddd77
JH
2240
2241 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2242 return 1; /* mismatch; possible infinite loop or false positive */
2243
a6872d42
JH
2244 if (!u1 || !u2)
2245 natbuf[1] = 0; /* Need to terminate the buffer. */
2246
d07ddd77
JH
2247 while ((e1 == 0 || p1 < e1) &&
2248 (f1 == 0 || p1 < f1) &&
2249 (e2 == 0 || p2 < e2) &&
2250 (f2 == 0 || p2 < f2)) {
2251 if (n1 == 0) {
d7f013c8 2252 if (u1)
ed6a78eb 2253 to_utf8_fold((U8 *)p1, foldbuf1, &foldlen1);
d7f013c8 2254 else {
e0138122 2255 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
d7f013c8
JH
2256 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2257 }
2258 q1 = foldbuf1;
d07ddd77 2259 n1 = foldlen1;
332ddc25 2260 }
d07ddd77 2261 if (n2 == 0) {
d7f013c8 2262 if (u2)
ed6a78eb 2263 to_utf8_fold((U8 *)p2, foldbuf2, &foldlen2);
d7f013c8 2264 else {
e0138122 2265 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
d7f013c8
JH
2266 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2267 }
2268 q2 = foldbuf2;
d07ddd77 2269 n2 = foldlen2;
332ddc25 2270 }
d07ddd77
JH
2271 while (n1 && n2) {
2272 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2273 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2274 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 2275 return 1; /* mismatch */
d07ddd77 2276 n1 -= UTF8SKIP(q1);
d7f013c8 2277 q1 += UTF8SKIP(q1);
d07ddd77 2278 n2 -= UTF8SKIP(q2);
d7f013c8 2279 q2 += UTF8SKIP(q2);
701a277b 2280 }
d07ddd77 2281 if (n1 == 0)
d7f013c8 2282 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2283 if (n2 == 0)
d7f013c8
JH
2284 p2 += u2 ? UTF8SKIP(p2) : 1;
2285
d2cc3551 2286 }
5469e704 2287
d07ddd77
JH
2288 /* A match is defined by all the scans that specified
2289 * an explicit length reaching their final goals. */
2290 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704
JH
2291
2292 if (match) {
d07ddd77
JH
2293 if (pe1)
2294 *pe1 = (char*)p1;
2295 if (pe2)
2296 *pe2 = (char*)p2;
5469e704
JH
2297 }
2298
2299 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2300}
701a277b 2301
04851bb3
NC
2302/*
2303 * Local variables:
2304 * c-indentation-style: bsd
2305 * c-basic-offset: 4
2306 * indent-tabs-mode: t
2307 * End:
2308 *
d8294a4d
NC
2309 * ex: set ts=8 sts=4 sw=4 noet:
2310 */