This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade to Digest-MD5-2.36
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
af3babe4
NC
3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
4 * others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
19 *
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
22 */
23
24#include "EXTERN.h"
864dbfa3 25#define PERL_IN_UTF8_C
a0ed51b3
LW
26#include "perl.h"
27
27da23d5
JH
28static const char unees[] =
29 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 30
ccfc67b7
JH
31/*
32=head1 Unicode Support
a0ed51b3 33
166f8a29
DM
34This file contains various utility functions for manipulating UTF8-encoded
35strings. For the uninitiated, this is a method of representing arbitrary
61296642 36Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
37characters in the ASCII range are unmodified, and a zero byte never appears
38within non-zero characters.
166f8a29 39
b851fbc1 40=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 41
1e54db1a 42Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89ebb4a3 43of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 44bytes available. The return value is the pointer to the byte after the
9041c2e3 45end of the new character. In other words,
eebe1485 46
b851fbc1
JH
47 d = uvuni_to_utf8_flags(d, uv, flags);
48
49or, in most cases,
50
9041c2e3 51 d = uvuni_to_utf8(d, uv);
eebe1485 52
b851fbc1
JH
53(which is equivalent to)
54
55 d = uvuni_to_utf8_flags(d, uv, 0);
56
eebe1485
SC
57is the recommended Unicode-aware way of saying
58
59 *(d++) = uv;
60
61=cut
62*/
63
dfe13c55 64U8 *
b851fbc1 65Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 66{
62961d2e 67 if (ckWARN(WARN_UTF8)) {
b851fbc1
JH
68 if (UNICODE_IS_SURROGATE(uv) &&
69 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 70 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1
JH
71 else if (
72 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
73 !(flags & UNICODE_ALLOW_FDD0))
74 ||
c867b360 75 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1
JH
76 !(flags & UNICODE_ALLOW_FFFF))) &&
77 /* UNICODE_ALLOW_SUPER includes
2a20b9da 78 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1
JH
79 ((uv <= PERL_UNICODE_MAX) ||
80 !(flags & UNICODE_ALLOW_SUPER))
81 )
9014280d 82 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800
JH
83 "Unicode character 0x%04"UVxf" is illegal", uv);
84 }
c4d5f83a 85 if (UNI_IS_INVARIANT(uv)) {
eb160463 86 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3
LW
87 return d;
88 }
2d331972 89#if defined(EBCDIC)
1d72bdf6
NIS
90 else {
91 STRLEN len = UNISKIP(uv);
92 U8 *p = d+len-1;
93 while (p > d) {
eb160463 94 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
95 uv >>= UTF_ACCUMULATION_SHIFT;
96 }
eb160463 97 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
98 return d+len;
99 }
100#else /* Non loop style */
a0ed51b3 101 if (uv < 0x800) {
eb160463
GS
102 *d++ = (U8)(( uv >> 6) | 0xc0);
103 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
104 return d;
105 }
106 if (uv < 0x10000) {
eb160463
GS
107 *d++ = (U8)(( uv >> 12) | 0xe0);
108 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
110 return d;
111 }
112 if (uv < 0x200000) {
eb160463
GS
113 *d++ = (U8)(( uv >> 18) | 0xf0);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
117 return d;
118 }
119 if (uv < 0x4000000) {
eb160463
GS
120 *d++ = (U8)(( uv >> 24) | 0xf8);
121 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
124 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
125 return d;
126 }
127 if (uv < 0x80000000) {
eb160463
GS
128 *d++ = (U8)(( uv >> 30) | 0xfc);
129 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
133 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
134 return d;
135 }
6b8eaf93 136#ifdef HAS_QUAD
d7578b48 137 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
138#endif
139 {
eb160463
GS
140 *d++ = 0xfe; /* Can't match U+FEFF! */
141 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
142 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
143 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
144 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
145 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
146 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
147 return d;
148 }
6b8eaf93 149#ifdef HAS_QUAD
a0ed51b3 150 {
eb160463
GS
151 *d++ = 0xff; /* Can't match U+FFFE! */
152 *d++ = 0x80; /* 6 Reserved bits */
153 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
154 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
155 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
156 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
157 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
163 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
164 return d;
165 }
166#endif
1d72bdf6 167#endif /* Loop style */
a0ed51b3 168}
9041c2e3 169
646ca15d
JH
170/*
171
172Tests if some arbitrary number of bytes begins in a valid UTF-8
173character. Note that an INVARIANT (i.e. ASCII) character is a valid
174UTF-8 character. The actual number of bytes in the UTF-8 character
175will be returned if it is valid, otherwise 0.
176
177This is the "slow" version as opposed to the "fast" version which is
178the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
179difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
180or less you should use the IS_UTF8_CHAR(), for lengths of five or more
181you should use the _slow(). In practice this means that the _slow()
182will be used very rarely, since the maximum Unicode code point (as of
183Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
184the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
185five bytes or more.
186
187=cut */
c053b435 188STATIC STRLEN
646ca15d
JH
189S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
190{
191 U8 u = *s;
192 STRLEN slen;
193 UV uv, ouv;
194
195 if (UTF8_IS_INVARIANT(u))
196 return 1;
197
198 if (!UTF8_IS_START(u))
199 return 0;
200
201 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
202 return 0;
203
204 slen = len - 1;
205 s++;
77263263
TS
206#ifdef EBCDIC
207 u = NATIVE_TO_UTF(u);
208#endif
646ca15d
JH
209 u &= UTF_START_MASK(len);
210 uv = u;
211 ouv = uv;
212 while (slen--) {
213 if (!UTF8_IS_CONTINUATION(*s))
214 return 0;
215 uv = UTF8_ACCUMULATE(uv, *s);
216 if (uv < ouv)
217 return 0;
218 ouv = uv;
219 s++;
220 }
221
222 if ((STRLEN)UNISKIP(uv) < len)
223 return 0;
224
225 return len;
226}
9041c2e3
NIS
227
228/*
7fc63493 229=for apidoc A|STRLEN|is_utf8_char|const U8 *s
eebe1485 230
5da9da9e 231Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01
JH
232character. Note that an INVARIANT (i.e. ASCII) character is a valid
233UTF-8 character. The actual number of bytes in the UTF-8 character
234will be returned if it is valid, otherwise 0.
9041c2e3 235
82686b01 236=cut */
067a85ef 237STRLEN
7fc63493 238Perl_is_utf8_char(pTHX_ const U8 *s)
386d01d6 239{
44f8325f 240 const STRLEN len = UTF8SKIP(s);
3b0fc154 241#ifdef IS_UTF8_CHAR
768c67ee 242 if (IS_UTF8_CHAR_FAST(len))
3b0fc154
JH
243 return IS_UTF8_CHAR(s, len) ? len : 0;
244#endif /* #ifdef IS_UTF8_CHAR */
2c0c5f92 245 return is_utf8_char_slow(s, len);
386d01d6
GS
246}
247
6662521e 248/*
7fc63493 249=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
6662521e 250
c9ada85f 251Returns true if first C<len> bytes of the given string form a valid
1e54db1a
JH
252UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
253not mean 'a string that contains code points above 0x7F encoded in UTF-8'
254because a valid ASCII string is a valid UTF-8 string.
6662521e 255
768c67ee
JH
256See also is_utf8_string_loclen() and is_utf8_string_loc().
257
6662521e
GS
258=cut
259*/
260
8e84507e 261bool
7fc63493 262Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
6662521e 263{
7fc63493
AL
264 const U8* x = s;
265 const U8* send;
067a85ef 266
44f8325f 267 if (!len)
e1ec3a88 268 len = strlen((const char *)s);
1aa99e6b
IH
269 send = s + len;
270
6662521e 271 while (x < send) {
a3b680e6 272 STRLEN c;
1acdb0da
JH
273 /* Inline the easy bits of is_utf8_char() here for speed... */
274 if (UTF8_IS_INVARIANT(*x))
275 c = 1;
276 else if (!UTF8_IS_START(*x))
768c67ee 277 goto out;
1acdb0da
JH
278 else {
279 /* ... and call is_utf8_char() only if really needed. */
646ca15d
JH
280#ifdef IS_UTF8_CHAR
281 c = UTF8SKIP(x);
768c67ee
JH
282 if (IS_UTF8_CHAR_FAST(c)) {
283 if (!IS_UTF8_CHAR(x, c))
284 goto out;
285 } else if (!is_utf8_char_slow(x, c))
286 goto out;
646ca15d
JH
287#else
288 c = is_utf8_char(x);
289#endif /* #ifdef IS_UTF8_CHAR */
1acdb0da 290 if (!c)
768c67ee 291 goto out;
1acdb0da 292 }
6662521e 293 x += c;
6662521e 294 }
768c67ee
JH
295
296 out:
60006e79
JH
297 if (x != send)
298 return FALSE;
067a85ef
A
299
300 return TRUE;
6662521e
GS
301}
302
67e989fb 303/*
814fafa7
NC
304Implemented as a macro in utf8.h
305
306=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
307
308Like is_utf8_string() but stores the location of the failure (in the
309case of "utf8ness failure") or the location s+len (in the case of
310"utf8ness success") in the C<ep>.
311
312See also is_utf8_string_loclen() and is_utf8_string().
313
768c67ee 314=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
81cd54e3 315
e3e4599f 316Like is_utf8_string() but stores the location of the failure (in the
768c67ee
JH
317case of "utf8ness failure") or the location s+len (in the case of
318"utf8ness success") in the C<ep>, and the number of UTF-8
319encoded characters in the C<el>.
320
321See also is_utf8_string_loc() and is_utf8_string().
81cd54e3
JH
322
323=cut
324*/
325
326bool
768c67ee 327Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 328{
7fc63493
AL
329 const U8* x = s;
330 const U8* send;
81cd54e3
JH
331 STRLEN c;
332
44f8325f 333 if (!len)
e1ec3a88 334 len = strlen((const char *)s);
81cd54e3 335 send = s + len;
768c67ee
JH
336 if (el)
337 *el = 0;
81cd54e3
JH
338
339 while (x < send) {
340 /* Inline the easy bits of is_utf8_char() here for speed... */
341 if (UTF8_IS_INVARIANT(*x))
768c67ee
JH
342 c = 1;
343 else if (!UTF8_IS_START(*x))
344 goto out;
81cd54e3 345 else {
768c67ee
JH
346 /* ... and call is_utf8_char() only if really needed. */
347#ifdef IS_UTF8_CHAR
348 c = UTF8SKIP(x);
349 if (IS_UTF8_CHAR_FAST(c)) {
350 if (!IS_UTF8_CHAR(x, c))
351 c = 0;
352 } else
353 c = is_utf8_char_slow(x, c);
354#else
355 c = is_utf8_char(x);
356#endif /* #ifdef IS_UTF8_CHAR */
357 if (!c)
358 goto out;
81cd54e3 359 }
768c67ee
JH
360 x += c;
361 if (el)
362 (*el)++;
81cd54e3 363 }
768c67ee
JH
364
365 out:
366 if (ep)
367 *ep = x;
368 if (x != send)
81cd54e3 369 return FALSE;
81cd54e3
JH
370
371 return TRUE;
372}
373
374/*
768c67ee 375
7fc63493 376=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 377
9041c2e3
NIS
378Bottom level UTF-8 decode routine.
379Returns the unicode code point value of the first character in the string C<s>
1e54db1a 380which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 381C<retlen> will be set to the length, in bytes, of that character.
67e989fb 382
1e54db1a 383If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880
JH
384is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
385it is assumed that the caller will raise a warning, and this function
28d3d195
JH
386will silently just set C<retlen> to C<-1> and return zero. If the
387C<flags> does not contain UTF8_CHECK_ONLY, warnings about
388malformations will be given, C<retlen> will be set to the expected
389length of the UTF-8 character in bytes, and zero will be returned.
390
391The C<flags> can also contain various flags to allow deviations from
392the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 393
9041c2e3
NIS
394Most code should use utf8_to_uvchr() rather than call this directly.
395
37607a96
PK
396=cut
397*/
67e989fb 398
a0ed51b3 399UV
7fc63493 400Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 401{
7fc63493 402 const U8 *s0 = s;
9c5ffd7c 403 UV uv = *s, ouv = 0;
ba210ebe 404 STRLEN len = 1;
7fc63493
AL
405 const bool dowarn = ckWARN_d(WARN_UTF8);
406 const UV startbyte = *s;
ba210ebe 407 STRLEN expectlen = 0;
a0dbb045
JH
408 U32 warning = 0;
409
410/* This list is a superset of the UTF8_ALLOW_XXX. */
411
412#define UTF8_WARN_EMPTY 1
413#define UTF8_WARN_CONTINUATION 2
414#define UTF8_WARN_NON_CONTINUATION 3
415#define UTF8_WARN_FE_FF 4
416#define UTF8_WARN_SHORT 5
417#define UTF8_WARN_OVERFLOW 6
418#define UTF8_WARN_SURROGATE 7
c867b360
JH
419#define UTF8_WARN_LONG 8
420#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045
JH
421
422 if (curlen == 0 &&
423 !(flags & UTF8_ALLOW_EMPTY)) {
424 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
425 goto malformed;
426 }
427
1d72bdf6 428 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
429 if (retlen)
430 *retlen = 1;
c4d5f83a 431 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 432 }
67e989fb 433
421a8bf2 434 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 435 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 436 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
437 goto malformed;
438 }
439
421a8bf2 440 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 441 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 442 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
443 goto malformed;
444 }
9041c2e3 445
1d72bdf6 446#ifdef EBCDIC
75383841 447 uv = NATIVE_TO_UTF(uv);
1d72bdf6 448#else
fcc8fcf6
JH
449 if ((uv == 0xfe || uv == 0xff) &&
450 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 451 warning = UTF8_WARN_FE_FF;
ba210ebe 452 goto malformed;
a0ed51b3 453 }
1d72bdf6
NIS
454#endif
455
ba210ebe
JH
456 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
457 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
458 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
459 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
460#ifdef EBCDIC
461 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
462 else { len = 7; uv &= 0x01; }
463#else
ba210ebe
JH
464 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
465 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
466 else { len = 13; uv = 0; } /* whoa! */
467#endif
468
a0ed51b3
LW
469 if (retlen)
470 *retlen = len;
9041c2e3 471
ba210ebe
JH
472 expectlen = len;
473
fcc8fcf6
JH
474 if ((curlen < expectlen) &&
475 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 476 warning = UTF8_WARN_SHORT;
ba210ebe
JH
477 goto malformed;
478 }
479
480 len--;
a0ed51b3 481 s++;
ba210ebe
JH
482 ouv = uv;
483
a0ed51b3 484 while (len--) {
421a8bf2
JH
485 if (!UTF8_IS_CONTINUATION(*s) &&
486 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
487 s--;
488 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 489 goto malformed;
a0ed51b3
LW
490 }
491 else
8850bf83 492 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
493 if (!(uv > ouv)) {
494 /* These cannot be allowed. */
495 if (uv == ouv) {
75dbc644 496 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045
JH
497 warning = UTF8_WARN_LONG;
498 goto malformed;
499 }
500 }
501 else { /* uv < ouv */
502 /* This cannot be allowed. */
503 warning = UTF8_WARN_OVERFLOW;
504 goto malformed;
505 }
ba210ebe
JH
506 }
507 s++;
508 ouv = uv;
509 }
510
421a8bf2 511 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 512 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 513 warning = UTF8_WARN_SURROGATE;
ba210ebe 514 goto malformed;
eb160463 515 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 516 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 517 warning = UTF8_WARN_LONG;
ba210ebe 518 goto malformed;
421a8bf2 519 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 520 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 521 warning = UTF8_WARN_FFFF;
a9917092 522 goto malformed;
a0ed51b3 523 }
ba210ebe 524
a0ed51b3 525 return uv;
ba210ebe
JH
526
527malformed:
528
fcc8fcf6 529 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 530 if (retlen)
cc366d4b 531 *retlen = -1;
ba210ebe
JH
532 return 0;
533 }
534
a0dbb045 535 if (dowarn) {
44f8325f 536 SV* const sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
a0dbb045
JH
537
538 switch (warning) {
539 case 0: /* Intentionally empty. */ break;
540 case UTF8_WARN_EMPTY:
54667de8 541 Perl_sv_catpv(aTHX_ sv, "(empty string)");
a0dbb045
JH
542 break;
543 case UTF8_WARN_CONTINUATION:
097fb8e2 544 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045
JH
545 break;
546 case UTF8_WARN_NON_CONTINUATION:
097fb8e2
JH
547 if (s == s0)
548 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
549 (UV)s[1], startbyte);
551405c4
AL
550 else {
551 const int len = (int)(s-s0);
097fb8e2 552 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
551405c4
AL
553 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
554 }
555
a0dbb045
JH
556 break;
557 case UTF8_WARN_FE_FF:
558 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
559 break;
560 case UTF8_WARN_SHORT:
097fb8e2 561 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 562 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 563 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
564 break;
565 case UTF8_WARN_OVERFLOW:
097fb8e2
JH
566 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
567 ouv, *s, startbyte);
a0dbb045
JH
568 break;
569 case UTF8_WARN_SURROGATE:
570 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
571 break;
a0dbb045 572 case UTF8_WARN_LONG:
097fb8e2 573 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 574 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045
JH
575 break;
576 case UTF8_WARN_FFFF:
577 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
578 break;
579 default:
54667de8 580 Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
a0dbb045
JH
581 break;
582 }
583
584 if (warning) {
44f8325f 585 const char * const s = SvPVX_const(sv);
a0dbb045
JH
586
587 if (PL_op)
9014280d 588 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 589 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 590 else
9014280d 591 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045
JH
592 }
593 }
594
ba210ebe 595 if (retlen)
28d3d195 596 *retlen = expectlen ? expectlen : len;
ba210ebe 597
28d3d195 598 return 0;
a0ed51b3
LW
599}
600
8e84507e 601/*
7fc63493 602=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
9041c2e3
NIS
603
604Returns the native character value of the first character in the string C<s>
1e54db1a 605which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
606length, in bytes, of that character.
607
1e54db1a 608If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3
NIS
609returned and retlen is set, if possible, to -1.
610
611=cut
612*/
613
614UV
7fc63493 615Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 616{
1754c1a1
NC
617 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
618 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3
NIS
619}
620
621/*
7fc63493 622=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
9041c2e3
NIS
623
624Returns the Unicode code point of the first character in the string C<s>
1e54db1a 625which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
626length, in bytes, of that character.
627
628This function should only be used when returned UV is considered
629an index into the Unicode semantic tables (e.g. swashes).
630
1e54db1a 631If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 632returned and retlen is set, if possible, to -1.
8e84507e
NIS
633
634=cut
635*/
636
637UV
7fc63493 638Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 639{
9041c2e3 640 /* Call the low level routine asking for checks */
89ebb4a3 641 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 642 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e
NIS
643}
644
b76347f2 645/*
35a4481c 646=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
b76347f2
JH
647
648Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
649Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
650up past C<e>, croaks.
b76347f2
JH
651
652=cut
653*/
654
655STRLEN
35a4481c 656Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2
JH
657{
658 STRLEN len = 0;
659
8850bf83
JH
660 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
661 * the bitops (especially ~) can create illegal UTF-8.
662 * In other words: in Perl UTF-8 is not just for Unicode. */
663
a3b680e6
AL
664 if (e < s)
665 goto warn_and_return;
b76347f2 666 while (s < e) {
4373e329 667 const U8 t = UTF8SKIP(s);
901b21bf 668 if (e - s < t) {
a3b680e6 669 warn_and_return:
901b21bf
JH
670 if (ckWARN_d(WARN_UTF8)) {
671 if (PL_op)
672 Perl_warner(aTHX_ packWARN(WARN_UTF8),
a3b680e6 673 "%s in %s", unees, OP_DESC(PL_op));
901b21bf
JH
674 else
675 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
676 }
677 return len;
678 }
b76347f2
JH
679 s += t;
680 len++;
681 }
682
683 return len;
684}
685
b06226ff 686/*
35a4481c 687=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
b06226ff 688
1e54db1a 689Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
690and C<b>.
691
692WARNING: use only if you *know* that the pointers point inside the
693same UTF-8 buffer.
694
37607a96
PK
695=cut
696*/
a0ed51b3 697
02eb7b47 698IV
35a4481c 699Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 700{
02eb7b47
JH
701 IV off = 0;
702
8850bf83
JH
703 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
704 * the bitops (especially ~) can create illegal UTF-8.
705 * In other words: in Perl UTF-8 is not just for Unicode. */
706
a0ed51b3
LW
707 if (a < b) {
708 while (a < b) {
35a4481c 709 const U8 c = UTF8SKIP(a);
a3b680e6
AL
710 if (b - a < c)
711 goto warn_and_return;
02eb7b47 712 a += c;
a0ed51b3
LW
713 off--;
714 }
715 }
716 else {
717 while (b < a) {
4373e329 718 const U8 c = UTF8SKIP(b);
02eb7b47 719
901b21bf 720 if (a - b < c) {
a3b680e6 721 warn_and_return:
901b21bf
JH
722 if (ckWARN_d(WARN_UTF8)) {
723 if (PL_op)
724 Perl_warner(aTHX_ packWARN(WARN_UTF8),
725 "%s in %s", unees, OP_DESC(PL_op));
726 else
727 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
728 }
729 return off;
730 }
02eb7b47 731 b += c;
a0ed51b3
LW
732 off++;
733 }
734 }
02eb7b47 735
a0ed51b3
LW
736 return off;
737}
738
b06226ff 739/*
37607a96 740=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 741
8850bf83
JH
742Return the UTF-8 pointer C<s> displaced by C<off> characters, either
743forward or backward.
b06226ff
JH
744
745WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
746the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
747on the first byte of character or just after the last byte of a character.
b06226ff 748
37607a96
PK
749=cut
750*/
a0ed51b3
LW
751
752U8 *
4373e329 753Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 754{
8850bf83
JH
755 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
756 * the bitops (especially ~) can create illegal UTF-8.
757 * In other words: in Perl UTF-8 is not just for Unicode. */
758
a0ed51b3
LW
759 if (off >= 0) {
760 while (off--)
761 s += UTF8SKIP(s);
762 }
763 else {
764 while (off++) {
765 s--;
8850bf83
JH
766 while (UTF8_IS_CONTINUATION(*s))
767 s--;
a0ed51b3
LW
768 }
769 }
4373e329 770 return (U8 *)s;
a0ed51b3
LW
771}
772
6940069f 773/*
eebe1485 774=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 775
1e54db1a 776Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53
MG
777Unlike C<bytes_to_utf8>, this over-writes the original string, and
778updates len to contain the new length.
67e989fb 779Returns zero on failure, setting C<len> to -1.
6940069f
GS
780
781=cut
782*/
783
784U8 *
37607a96 785Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 786{
6940069f
GS
787 U8 *send;
788 U8 *d;
dcad2880 789 U8 *save = s;
246fae53 790
1e54db1a 791 /* ensure valid UTF-8 and chars < 256 before updating string */
dcad2880
JH
792 for (send = s + *len; s < send; ) {
793 U8 c = *s++;
794
1d72bdf6
NIS
795 if (!UTF8_IS_INVARIANT(c) &&
796 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
797 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880
JH
798 *len = -1;
799 return 0;
800 }
246fae53 801 }
dcad2880
JH
802
803 d = s = save;
6940069f 804 while (s < send) {
ed646e6e 805 STRLEN ulen;
9041c2e3 806 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 807 s += ulen;
6940069f
GS
808 }
809 *d = '\0';
246fae53 810 *len = d - save;
6940069f
GS
811 return save;
812}
813
814/*
e1ec3a88 815=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
f9a63242 816
1e54db1a 817Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
35a4481c 818Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
819the newly-created string, and updates C<len> to contain the new
820length. Returns the original string if no conversion occurs, C<len>
821is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
8220 if C<s> is converted or contains all 7bit characters.
f9a63242 823
37607a96
PK
824=cut
825*/
f9a63242
JH
826
827U8 *
e1ec3a88 828Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 829{
f9a63242 830 U8 *d;
e1ec3a88
AL
831 const U8 *start = s;
832 const U8 *send;
f9a63242
JH
833 I32 count = 0;
834
835 if (!*is_utf8)
73d840c0 836 return (U8 *)start;
f9a63242 837
1e54db1a 838 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 839 for (send = s + *len; s < send;) {
e1ec3a88 840 U8 c = *s++;
1d72bdf6 841 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
842 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
843 (c = *s++) && UTF8_IS_CONTINUATION(c))
844 count++;
845 else
73d840c0 846 return (U8 *)start;
db42d148 847 }
f9a63242
JH
848 }
849
850 *is_utf8 = 0;
851
a02a5408 852 Newxz(d, (*len) - count + 1, U8);
ef9edfd0 853 s = start; start = d;
f9a63242
JH
854 while (s < send) {
855 U8 c = *s++;
c4d5f83a
NIS
856 if (!UTF8_IS_INVARIANT(c)) {
857 /* Then it is two-byte encoded */
858 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
859 c = ASCII_TO_NATIVE(c);
860 }
861 *d++ = c;
f9a63242
JH
862 }
863 *d = '\0';
864 *len = d - start;
73d840c0 865 return (U8 *)start;
f9a63242
JH
866}
867
868/*
35a4481c 869=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
6940069f 870
1e54db1a 871Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e
GS
872Returns a pointer to the newly-created string, and sets C<len> to
873reflect the new length.
6940069f 874
1e54db1a 875If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f
JH
876see sv_recode_to_utf8().
877
497711e7 878=cut
6940069f
GS
879*/
880
881U8*
35a4481c 882Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 883{
35a4481c 884 const U8 * const send = s + (*len);
6940069f
GS
885 U8 *d;
886 U8 *dst;
6940069f 887
a02a5408 888 Newxz(d, (*len) * 2 + 1, U8);
6940069f
GS
889 dst = d;
890
891 while (s < send) {
35a4481c 892 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 893 if (UNI_IS_INVARIANT(uv))
eb160463 894 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 895 else {
eb160463
GS
896 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
897 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
898 }
899 }
900 *d = '\0';
6662521e 901 *len = d-dst;
6940069f
GS
902 return dst;
903}
904
a0ed51b3 905/*
dea0fc0b 906 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
907 *
908 * Destination must be pre-extended to 3/2 source. Do not use in-place.
909 * We optimize for native, for obvious reasons. */
910
911U8*
dea0fc0b 912Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 913{
dea0fc0b
JH
914 U8* pend;
915 U8* dstart = d;
916
1de9afcd
RGS
917 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
918 d[0] = 0;
919 *newlen = 1;
920 return d;
921 }
922
dea0fc0b 923 if (bytelen & 1)
014ead4b 924 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
dea0fc0b
JH
925
926 pend = p + bytelen;
927
a0ed51b3 928 while (p < pend) {
dea0fc0b
JH
929 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
930 p += 2;
a0ed51b3 931 if (uv < 0x80) {
eb160463 932 *d++ = (U8)uv;
a0ed51b3
LW
933 continue;
934 }
935 if (uv < 0x800) {
eb160463
GS
936 *d++ = (U8)(( uv >> 6) | 0xc0);
937 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
938 continue;
939 }
940 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e
TD
941 UV low = (p[0] << 8) + p[1];
942 p += 2;
dea0fc0b
JH
943 if (low < 0xdc00 || low >= 0xdfff)
944 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
945 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
946 }
947 if (uv < 0x10000) {
eb160463
GS
948 *d++ = (U8)(( uv >> 12) | 0xe0);
949 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
950 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
951 continue;
952 }
953 else {
eb160463
GS
954 *d++ = (U8)(( uv >> 18) | 0xf0);
955 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
956 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
957 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
958 continue;
959 }
960 }
dea0fc0b 961 *newlen = d - dstart;
a0ed51b3
LW
962 return d;
963}
964
965/* Note: this one is slightly destructive of the source. */
966
967U8*
dea0fc0b 968Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
969{
970 U8* s = (U8*)p;
971 U8* send = s + bytelen;
972 while (s < send) {
973 U8 tmp = s[0];
974 s[0] = s[1];
975 s[1] = tmp;
976 s += 2;
977 }
dea0fc0b 978 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
979}
980
981/* for now these are all defined (inefficiently) in terms of the utf8 versions */
982
983bool
84afefe6 984Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 985{
89ebb4a3 986 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 987 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
988 return is_utf8_alnum(tmpbuf);
989}
990
991bool
84afefe6 992Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 993{
89ebb4a3 994 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 995 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
996 return is_utf8_alnumc(tmpbuf);
997}
998
999bool
84afefe6 1000Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 1001{
89ebb4a3 1002 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1003 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1004 return is_utf8_idfirst(tmpbuf);
1005}
1006
1007bool
84afefe6 1008Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 1009{
89ebb4a3 1010 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1011 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1012 return is_utf8_alpha(tmpbuf);
1013}
1014
1015bool
84afefe6 1016Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 1017{
89ebb4a3 1018 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1019 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1020 return is_utf8_ascii(tmpbuf);
1021}
1022
1023bool
84afefe6 1024Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1025{
89ebb4a3 1026 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1027 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1028 return is_utf8_space(tmpbuf);
1029}
1030
1031bool
84afefe6 1032Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1033{
89ebb4a3 1034 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1035 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1036 return is_utf8_digit(tmpbuf);
1037}
1038
1039bool
84afefe6 1040Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1041{
89ebb4a3 1042 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1043 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1044 return is_utf8_upper(tmpbuf);
1045}
1046
1047bool
84afefe6 1048Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1049{
89ebb4a3 1050 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1051 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1052 return is_utf8_lower(tmpbuf);
1053}
1054
1055bool
84afefe6 1056Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1057{
89ebb4a3 1058 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1059 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1060 return is_utf8_cntrl(tmpbuf);
1061}
1062
1063bool
84afefe6 1064Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1065{
89ebb4a3 1066 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1067 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1068 return is_utf8_graph(tmpbuf);
1069}
1070
1071bool
84afefe6 1072Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1073{
89ebb4a3 1074 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1075 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1076 return is_utf8_print(tmpbuf);
1077}
1078
b8c5462f 1079bool
84afefe6 1080Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1081{
89ebb4a3 1082 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1083 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1084 return is_utf8_punct(tmpbuf);
1085}
1086
4d61ec05 1087bool
84afefe6 1088Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1089{
89ebb4a3 1090 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1091 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1092 return is_utf8_xdigit(tmpbuf);
1093}
1094
84afefe6
JH
1095UV
1096Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1097{
0ebc6274
JH
1098 uvchr_to_utf8(p, c);
1099 return to_utf8_upper(p, p, lenp);
a0ed51b3
LW
1100}
1101
84afefe6
JH
1102UV
1103Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1104{
0ebc6274
JH
1105 uvchr_to_utf8(p, c);
1106 return to_utf8_title(p, p, lenp);
a0ed51b3
LW
1107}
1108
84afefe6
JH
1109UV
1110Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1111{
0ebc6274
JH
1112 uvchr_to_utf8(p, c);
1113 return to_utf8_lower(p, p, lenp);
a0ed51b3
LW
1114}
1115
84afefe6
JH
1116UV
1117Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1118{
0ebc6274
JH
1119 uvchr_to_utf8(p, c);
1120 return to_utf8_fold(p, p, lenp);
84afefe6
JH
1121}
1122
a0ed51b3
LW
1123/* for now these all assume no locale info available for Unicode > 255 */
1124
1125bool
84afefe6 1126Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
1127{
1128 return is_uni_alnum(c); /* XXX no locale support yet */
1129}
1130
1131bool
84afefe6 1132Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f
JH
1133{
1134 return is_uni_alnumc(c); /* XXX no locale support yet */
1135}
1136
1137bool
84afefe6 1138Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
1139{
1140 return is_uni_idfirst(c); /* XXX no locale support yet */
1141}
1142
1143bool
84afefe6 1144Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
1145{
1146 return is_uni_alpha(c); /* XXX no locale support yet */
1147}
1148
1149bool
84afefe6 1150Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
1151{
1152 return is_uni_ascii(c); /* XXX no locale support yet */
1153}
1154
1155bool
84afefe6 1156Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
1157{
1158 return is_uni_space(c); /* XXX no locale support yet */
1159}
1160
1161bool
84afefe6 1162Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
1163{
1164 return is_uni_digit(c); /* XXX no locale support yet */
1165}
1166
1167bool
84afefe6 1168Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1169{
1170 return is_uni_upper(c); /* XXX no locale support yet */
1171}
1172
1173bool
84afefe6 1174Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1175{
1176 return is_uni_lower(c); /* XXX no locale support yet */
1177}
1178
1179bool
84afefe6 1180Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1181{
1182 return is_uni_cntrl(c); /* XXX no locale support yet */
1183}
1184
1185bool
84afefe6 1186Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1187{
1188 return is_uni_graph(c); /* XXX no locale support yet */
1189}
1190
1191bool
84afefe6 1192Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1193{
1194 return is_uni_print(c); /* XXX no locale support yet */
1195}
1196
b8c5462f 1197bool
84afefe6 1198Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1199{
1200 return is_uni_punct(c); /* XXX no locale support yet */
1201}
1202
4d61ec05 1203bool
84afefe6 1204Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1205{
1206 return is_uni_xdigit(c); /* XXX no locale support yet */
1207}
1208
b7ac61fa
JH
1209U32
1210Perl_to_uni_upper_lc(pTHX_ U32 c)
1211{
ee099d14
JH
1212 /* XXX returns only the first character -- do not use XXX */
1213 /* XXX no locale support yet */
1214 STRLEN len;
89ebb4a3 1215 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1216 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1217}
1218
1219U32
1220Perl_to_uni_title_lc(pTHX_ U32 c)
1221{
ee099d14
JH
1222 /* XXX returns only the first character XXX -- do not use XXX */
1223 /* XXX no locale support yet */
1224 STRLEN len;
89ebb4a3 1225 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1226 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1227}
1228
1229U32
1230Perl_to_uni_lower_lc(pTHX_ U32 c)
1231{
ee099d14
JH
1232 /* XXX returns only the first character -- do not use XXX */
1233 /* XXX no locale support yet */
1234 STRLEN len;
89ebb4a3 1235 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1236 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1237}
1238
7452cf6a 1239static bool
5141f98e 1240S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
bde6a22d
NC
1241 const char *const swashname)
1242{
1243 if (!is_utf8_char(p))
1244 return FALSE;
1245 if (!*swash)
711a919c 1246 *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
bde6a22d
NC
1247 return swash_fetch(*swash, p, TRUE) != 0;
1248}
1249
1250bool
7fc63493 1251Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1252{
671c33bf
NC
1253 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1254 * descendant of isalnum(3), in other words, it doesn't
1255 * contain the '_'. --jhi */
1256 return S_is_utf8_common(aTHX_ p, &PL_utf8_alnum, "IsWord");
a0ed51b3
LW
1257}
1258
1259bool
7fc63493 1260Perl_is_utf8_alnumc(pTHX_ const U8 *p)
b8c5462f 1261{
671c33bf 1262 return S_is_utf8_common(aTHX_ p, &PL_utf8_alnumc, "IsAlnumC");
b8c5462f
JH
1263}
1264
1265bool
7fc63493 1266Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1267{
82686b01
JH
1268 if (*p == '_')
1269 return TRUE;
bde6a22d
NC
1270 /* is_utf8_idstart would be more logical. */
1271 return S_is_utf8_common(aTHX_ p, &PL_utf8_idstart, "IdStart");
82686b01
JH
1272}
1273
1274bool
7fc63493 1275Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01
JH
1276{
1277 if (*p == '_')
1278 return TRUE;
bde6a22d 1279 return S_is_utf8_common(aTHX_ p, &PL_utf8_idcont, "IdContinue");
a0ed51b3
LW
1280}
1281
1282bool
7fc63493 1283Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1284{
bde6a22d 1285 return S_is_utf8_common(aTHX_ p, &PL_utf8_alpha, "IsAlpha");
a0ed51b3
LW
1286}
1287
1288bool
7fc63493 1289Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1290{
bde6a22d 1291 return S_is_utf8_common(aTHX_ p, &PL_utf8_ascii, "IsAscii");
b8c5462f
JH
1292}
1293
1294bool
7fc63493 1295Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1296{
bde6a22d 1297 return S_is_utf8_common(aTHX_ p, &PL_utf8_space, "IsSpacePerl");
a0ed51b3
LW
1298}
1299
1300bool
7fc63493 1301Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1302{
bde6a22d 1303 return S_is_utf8_common(aTHX_ p, &PL_utf8_digit, "IsDigit");
a0ed51b3
LW
1304}
1305
1306bool
7fc63493 1307Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1308{
bde6a22d 1309 return S_is_utf8_common(aTHX_ p, &PL_utf8_upper, "IsUppercase");
a0ed51b3
LW
1310}
1311
1312bool
7fc63493 1313Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1314{
bde6a22d 1315 return S_is_utf8_common(aTHX_ p, &PL_utf8_lower, "IsLowercase");
a0ed51b3
LW
1316}
1317
1318bool
7fc63493 1319Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1320{
bde6a22d 1321 return S_is_utf8_common(aTHX_ p, &PL_utf8_cntrl, "IsCntrl");
b8c5462f
JH
1322}
1323
1324bool
7fc63493 1325Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1326{
bde6a22d 1327 return S_is_utf8_common(aTHX_ p, &PL_utf8_graph, "IsGraph");
b8c5462f
JH
1328}
1329
1330bool
7fc63493 1331Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1332{
bde6a22d 1333 return S_is_utf8_common(aTHX_ p, &PL_utf8_print, "IsPrint");
a0ed51b3
LW
1334}
1335
1336bool
7fc63493 1337Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1338{
bde6a22d 1339 return S_is_utf8_common(aTHX_ p, &PL_utf8_punct, "IsPunct");
b8c5462f
JH
1340}
1341
1342bool
7fc63493 1343Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1344{
bde6a22d 1345 return S_is_utf8_common(aTHX_ p, &PL_utf8_xdigit, "Isxdigit");
b8c5462f
JH
1346}
1347
1348bool
7fc63493 1349Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1350{
bde6a22d 1351 return S_is_utf8_common(aTHX_ p, &PL_utf8_mark, "IsM");
a0ed51b3
LW
1352}
1353
6b5c0936
JH
1354/*
1355=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1356
1357The "p" contains the pointer to the UTF-8 string encoding
1358the character that is being converted.
1359
1360The "ustrp" is a pointer to the character buffer to put the
1361conversion result to. The "lenp" is a pointer to the length
1362of the result.
1363
0134edef 1364The "swashp" is a pointer to the swash to use.
6b5c0936 1365
0134edef 1366Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
8fe4d5b2 1367and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
0134edef 1368but not always, a multicharacter mapping), is tried first.
6b5c0936 1369
0134edef
JH
1370The "special" is a string like "utf8::ToSpecLower", which means the
1371hash %utf8::ToSpecLower. The access to the hash is through
1372Perl_to_utf8_case().
6b5c0936 1373
0134edef
JH
1374The "normal" is a string like "ToLower" which means the swash
1375%utf8::ToLower.
1376
1377=cut */
6b5c0936 1378
2104c8d9 1379UV
9a957fbc
AL
1380Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1381 SV **swashp, const char *normal, const char *special)
a0ed51b3 1382{
89ebb4a3 1383 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1384 STRLEN len = 0;
a0ed51b3 1385
aec46f14 1386 const UV uv0 = utf8_to_uvchr(p, NULL);
1feea2c7
JH
1387 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1388 * are necessary in EBCDIC, they are redundant no-ops
1389 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 1390 const UV uv1 = NATIVE_TO_UNI(uv0);
1feea2c7 1391 uvuni_to_utf8(tmpbuf, uv1);
0134edef
JH
1392
1393 if (!*swashp) /* load on-demand */
1394 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1395
b08cf34e
JH
1396 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1397 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1398 /* It might be "special" (sometimes, but not always,
2a37f04d 1399 * a multicharacter mapping) */
983ffd37 1400 HV *hv;
b08cf34e
JH
1401 SV **svp;
1402
1403 if ((hv = get_hv(special, FALSE)) &&
1404 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1405 (*svp)) {
cfd0369c 1406 const char *s;
47654450 1407
cfd0369c 1408 s = SvPV_const(*svp, len);
47654450
JH
1409 if (len == 1)
1410 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1411 else {
2f9475ad
JH
1412#ifdef EBCDIC
1413 /* If we have EBCDIC we need to remap the characters
1414 * since any characters in the low 256 are Unicode
1415 * code points, not EBCDIC. */
7cda7a3d 1416 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad
JH
1417
1418 d = tmpbuf;
b08cf34e 1419 if (SvUTF8(*svp)) {
2f9475ad
JH
1420 STRLEN tlen = 0;
1421
1422 while (t < tend) {
1423 UV c = utf8_to_uvchr(t, &tlen);
1424 if (tlen > 0) {
1425 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1426 t += tlen;
1427 }
1428 else
1429 break;
1430 }
1431 }
1432 else {
36fec512
JH
1433 while (t < tend) {
1434 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1435 t++;
1436 }
2f9475ad
JH
1437 }
1438 len = d - tmpbuf;
1439 Copy(tmpbuf, ustrp, len, U8);
1440#else
d2dcd0fb 1441 Copy(s, ustrp, len, U8);
2f9475ad 1442#endif
29e98929 1443 }
983ffd37 1444 }
0134edef
JH
1445 }
1446
1447 if (!len && *swashp) {
1448 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1449
1450 if (uv2) {
1451 /* It was "normal" (a single character mapping). */
1452 UV uv3 = UNI_TO_NATIVE(uv2);
1453
e9101d72 1454 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d
JH
1455 }
1456 }
1feea2c7 1457
0134edef
JH
1458 if (!len) /* Neither: just copy. */
1459 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1460
2a37f04d
JH
1461 if (lenp)
1462 *lenp = len;
1463
0134edef 1464 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3
LW
1465}
1466
d3e79532 1467/*
7fc63493 1468=for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1469
1470Convert the UTF-8 encoded character at p to its uppercase version and
1471store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1472that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1473the uppercase version may be longer than the original character.
d3e79532
JH
1474
1475The first character of the uppercased version is returned
1476(but note, as explained above, that there may be more.)
1477
1478=cut */
1479
2104c8d9 1480UV
7fc63493 1481Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1482{
983ffd37 1483 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1484 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1485}
a0ed51b3 1486
d3e79532 1487/*
7fc63493 1488=for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1489
1490Convert the UTF-8 encoded character at p to its titlecase version and
1491store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1492that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1493titlecase version may be longer than the original character.
d3e79532
JH
1494
1495The first character of the titlecased version is returned
1496(but note, as explained above, that there may be more.)
1497
1498=cut */
1499
983ffd37 1500UV
7fc63493 1501Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
983ffd37
JH
1502{
1503 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1504 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1505}
1506
d3e79532 1507/*
7fc63493 1508=for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1509
1510Convert the UTF-8 encoded character at p to its lowercase version and
1511store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1512that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1513lowercase version may be longer than the original character.
d3e79532
JH
1514
1515The first character of the lowercased version is returned
1516(but note, as explained above, that there may be more.)
1517
1518=cut */
1519
2104c8d9 1520UV
7fc63493 1521Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1522{
983ffd37 1523 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1524 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1525}
1526
d3e79532 1527/*
7fc63493 1528=for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1529
1530Convert the UTF-8 encoded character at p to its foldcase version and
1531store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1532that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532
JH
1533foldcase version may be longer than the original character (up to
1534three characters).
1535
1536The first character of the foldcased version is returned
1537(but note, as explained above, that there may be more.)
1538
1539=cut */
1540
b4e400f9 1541UV
7fc63493 1542Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
b4e400f9
JH
1543{
1544 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1545 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1546}
1547
711a919c
TS
1548/* Note:
1549 * A "swash" is a swatch hash.
1550 * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1551 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1552 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1553 */
a0ed51b3 1554SV*
7fc63493 1555Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1556{
27da23d5 1557 dVAR;
a0ed51b3 1558 SV* retval;
9a957fbc 1559 SV* const tokenbufsv = sv_newmortal();
8e84507e 1560 dSP;
7fc63493
AL
1561 const size_t pkg_len = strlen(pkg);
1562 const size_t name_len = strlen(name);
aec46f14 1563 HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
f8be5cf0 1564 SV* errsv_save;
ce3b816e 1565
96ca9f55
DM
1566 PUSHSTACKi(PERLSI_MAGIC);
1567 ENTER;
1568 SAVEI32(PL_hints);
1569 PL_hints = 0;
1570 save_re_context();
1b026014 1571 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1572 ENTER;
f8be5cf0 1573 errsv_save = newSVsv(ERRSV);
71bed85a
NC
1574 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1575 Nullsv);
f8be5cf0
JH
1576 if (!SvTRUE(ERRSV))
1577 sv_setsv(ERRSV, errsv_save);
1578 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1579 LEAVE;
1580 }
1581 SPAGAIN;
a0ed51b3
LW
1582 PUSHMARK(SP);
1583 EXTEND(SP,5);
71bed85a
NC
1584 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1585 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
a0ed51b3
LW
1586 PUSHs(listsv);
1587 PUSHs(sv_2mortal(newSViv(minbits)));
1588 PUSHs(sv_2mortal(newSViv(none)));
1589 PUTBACK;
923e4eb5 1590 if (IN_PERL_COMPILETIME) {
bf1fed83 1591 /* XXX ought to be handled by lex_start */
82686b01 1592 SAVEI32(PL_in_my);
2b4bd638 1593 PL_in_my = 0;
bf1fed83 1594 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1595 }
f8be5cf0 1596 errsv_save = newSVsv(ERRSV);
864dbfa3 1597 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1598 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1599 else
e24b16f9 1600 retval = &PL_sv_undef;
f8be5cf0
JH
1601 if (!SvTRUE(ERRSV))
1602 sv_setsv(ERRSV, errsv_save);
1603 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1604 LEAVE;
1605 POPSTACK;
923e4eb5 1606 if (IN_PERL_COMPILETIME) {
bf1fed83 1607 STRLEN len;
aec46f14 1608 const char* const pv = SvPV_const(tokenbufsv, len);
bf1fed83
JH
1609
1610 Copy(pv, PL_tokenbuf, len+1, char);
eb160463 1611 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1612 }
bc45ce41
JH
1613 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1614 if (SvPOK(retval))
35c1215d
NC
1615 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1616 retval);
cea2e8a9 1617 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1618 }
a0ed51b3
LW
1619 return retval;
1620}
1621
035d37be
JH
1622
1623/* This API is wrong for special case conversions since we may need to
1624 * return several Unicode characters for a single Unicode character
1625 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1626 * the lower-level routine, and it is similarly broken for returning
1627 * multiple values. --jhi */
979f2922 1628/* Now SWASHGET is recasted into S_swash_get in this file. */
680c470c
TS
1629
1630/* Note:
1631 * Returns the value of property/mapping C<swash> for the first character
1632 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1633 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1634 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1635 */
a0ed51b3 1636UV
680c470c 1637Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
a0ed51b3 1638{
27da23d5 1639 dVAR;
680c470c 1640 HV* const hv = (HV*)SvRV(swash);
3568d838
JH
1641 U32 klen;
1642 U32 off;
a0ed51b3 1643 STRLEN slen;
7d85a32c 1644 STRLEN needents;
cfd0369c 1645 const U8 *tmps = NULL;
a0ed51b3 1646 U32 bit;
979f2922 1647 SV *swatch;
3568d838
JH
1648 U8 tmputf8[2];
1649 UV c = NATIVE_TO_ASCII(*ptr);
1650
1651 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
979f2922
TS
1652 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1653 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1654 ptr = tmputf8;
3568d838
JH
1655 }
1656 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1657 * then the "swatch" is a vec() for al the chars which start
1658 * with 0xAA..0xYY
1659 * So the key in the hash (klen) is length of encoded char -1
1660 */
1661 klen = UTF8SKIP(ptr) - 1;
1662 off = ptr[klen];
a0ed51b3 1663
979f2922 1664 if (klen == 0) {
7d85a32c 1665 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1666 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c 1667 */
979f2922
TS
1668 needents = UTF_CONTINUATION_MARK;
1669 off = NATIVE_TO_UTF(ptr[klen]);
1670 }
1671 else {
7d85a32c 1672 /* If char is encoded then swatch is for the prefix */
979f2922
TS
1673 needents = (1 << UTF_ACCUMULATION_SHIFT);
1674 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1675 }
7d85a32c 1676
a0ed51b3
LW
1677 /*
1678 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1679 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1680 * it's nothing to sniff at.) Pity we usually come through at least
1681 * two function calls to get here...
1682 *
1683 * NB: this code assumes that swatches are never modified, once generated!
1684 */
1685
3568d838 1686 if (hv == PL_last_swash_hv &&
a0ed51b3 1687 klen == PL_last_swash_klen &&
27da23d5 1688 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1689 {
1690 tmps = PL_last_swash_tmps;
1691 slen = PL_last_swash_slen;
1692 }
1693 else {
1694 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 1695 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 1696
979f2922
TS
1697 /* If not cached, generate it via swash_get */
1698 if (!svp || !SvPOK(*svp)
1699 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
2b9d42f0
NIS
1700 /* We use utf8n_to_uvuni() as we want an index into
1701 Unicode tables, not a native character number.
1702 */
aec46f14 1703 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae
JH
1704 ckWARN(WARN_UTF8) ?
1705 0 : UTF8_ALLOW_ANY);
680c470c 1706 swatch = swash_get(swash,
979f2922
TS
1707 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1708 (klen) ? (code_point & ~(needents - 1)) : 0,
1709 needents);
1710
923e4eb5 1711 if (IN_PERL_COMPILETIME)
eb160463 1712 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1713
979f2922 1714 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 1715
979f2922
TS
1716 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1717 || (slen << 3) < needents)
660a4616 1718 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
a0ed51b3
LW
1719 }
1720
1721 PL_last_swash_hv = hv;
1722 PL_last_swash_klen = klen;
cfd0369c
NC
1723 /* FIXME change interpvar.h? */
1724 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
1725 PL_last_swash_slen = slen;
1726 if (klen)
1727 Copy(ptr, PL_last_swash_key, klen, U8);
1728 }
1729
9faf8d75 1730 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1731 case 1:
1732 bit = 1 << (off & 7);
1733 off >>= 3;
1734 return (tmps[off] & bit) != 0;
1735 case 8:
1736 return tmps[off];
1737 case 16:
1738 off <<= 1;
1739 return (tmps[off] << 8) + tmps[off + 1] ;
1740 case 32:
1741 off <<= 2;
1742 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1743 }
660a4616 1744 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
a0ed51b3
LW
1745 return 0;
1746}
2b9d42f0 1747
979f2922
TS
1748/* Note:
1749 * Returns a swatch (a bit vector string) for a code point sequence
1750 * that starts from the value C<start> and comprises the number C<span>.
1751 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1752 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1753 */
1754STATIC SV*
1755S_swash_get(pTHX_ SV* swash, UV start, UV span)
1756{
1757 SV *swatch;
711a919c 1758 U8 *l, *lend, *x, *xend, *s;
979f2922
TS
1759 STRLEN lcur, xcur, scur;
1760
1761 HV* const hv = (HV*)SvRV(swash);
1762 SV** listsvp = hv_fetch(hv, "LIST", 4, FALSE);
1763 SV** typesvp = hv_fetch(hv, "TYPE", 4, FALSE);
1764 SV** bitssvp = hv_fetch(hv, "BITS", 4, FALSE);
1765 SV** nonesvp = hv_fetch(hv, "NONE", 4, FALSE);
1766 SV** extssvp = hv_fetch(hv, "EXTRAS", 6, FALSE);
1767 U8* typestr = (U8*)SvPV_nolen(*typesvp);
1768 int typeto = typestr[0] == 'T' && typestr[1] == 'o';
1769 STRLEN bits = SvUV(*bitssvp);
1770 STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1771 UV none = SvUV(*nonesvp);
1772 UV end = start + span;
1773
1774 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
660a4616
TS
1775 Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1776 (UV)bits);
979f2922
TS
1777 }
1778
1779 /* create and initialize $swatch */
1780 swatch = newSVpvn("",0);
1781 scur = octets ? (span * octets) : (span + 7) / 8;
1782 SvGROW(swatch, scur + 1);
1783 s = (U8*)SvPVX(swatch);
1784 if (octets && none) {
1785 const U8* e = s + scur;
1786 while (s < e) {
1787 if (bits == 8)
1788 *s++ = (U8)(none & 0xff);
1789 else if (bits == 16) {
1790 *s++ = (U8)((none >> 8) & 0xff);
1791 *s++ = (U8)( none & 0xff);
1792 }
1793 else if (bits == 32) {
1794 *s++ = (U8)((none >> 24) & 0xff);
1795 *s++ = (U8)((none >> 16) & 0xff);
1796 *s++ = (U8)((none >> 8) & 0xff);
1797 *s++ = (U8)( none & 0xff);
1798 }
1799 }
1800 *s = '\0';
1801 }
1802 else {
1803 (void)memzero((U8*)s, scur + 1);
1804 }
1805 SvCUR_set(swatch, scur);
1806 s = (U8*)SvPVX(swatch);
1807
1808 /* read $swash->{LIST} */
1809 l = (U8*)SvPV(*listsvp, lcur);
1810 lend = l + lcur;
1811 while (l < lend) {
1812 UV min, max, val, key;
1813 STRLEN numlen;
1814 I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1815
711a919c 1816 U8* nl = (U8*)memchr(l, '\n', lend - l);
979f2922
TS
1817
1818 numlen = lend - l;
1819 min = grok_hex((char *)l, &numlen, &flags, NULL);
1820 if (numlen)
1821 l += numlen;
1822 else if (nl) {
1823 l = nl + 1; /* 1 is length of "\n" */
1824 continue;
1825 }
1826 else {
1827 l = lend; /* to LIST's end at which \n is not found */
1828 break;
1829 }
1830
1831 if (isBLANK(*l)) {
1832 ++l;
1833 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1834 numlen = lend - l;
1835 max = grok_hex((char *)l, &numlen, &flags, NULL);
1836 if (numlen)
1837 l += numlen;
1838 else
1839 max = min;
1840
1841 if (octets) {
1842 if (isBLANK(*l)) {
1843 ++l;
1844 flags = PERL_SCAN_SILENT_ILLDIGIT |
1845 PERL_SCAN_DISALLOW_PREFIX;
1846 numlen = lend - l;
1847 val = grok_hex((char *)l, &numlen, &flags, NULL);
1848 if (numlen)
1849 l += numlen;
1850 else
1851 val = 0;
1852 }
1853 else {
1854 val = 0;
1855 if (typeto) {
1856 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1857 typestr, l);
1858 }
1859 }
1860 }
711a919c
TS
1861 else
1862 val = 0; /* bits == 1, then val should be ignored */
979f2922
TS
1863 }
1864 else {
1865 max = min;
1866 if (octets) {
1867 val = 0;
1868 if (typeto) {
1869 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1870 }
1871 }
711a919c
TS
1872 else
1873 val = 0; /* bits == 1, then val should be ignored */
979f2922
TS
1874 }
1875
1876 if (nl)
1877 l = nl + 1;
1878 else
1879 l = lend;
1880
1881 if (max < start)
1882 continue;
1883
1884 if (octets) {
1885 if (min < start) {
1886 if (!none || val < none) {
1887 val += start - min;
1888 }
1889 min = start;
1890 }
1891 for (key = min; key <= max; key++) {
1892 STRLEN offset;
1893 if (key >= end)
1894 goto go_out_list;
1895 /* offset must be non-negative (start <= min <= key < end) */
1896 offset = octets * (key - start);
1897 if (bits == 8)
1898 s[offset] = (U8)(val & 0xff);
1899 else if (bits == 16) {
1900 s[offset ] = (U8)((val >> 8) & 0xff);
1901 s[offset + 1] = (U8)( val & 0xff);
1902 }
1903 else if (bits == 32) {
1904 s[offset ] = (U8)((val >> 24) & 0xff);
1905 s[offset + 1] = (U8)((val >> 16) & 0xff);
1906 s[offset + 2] = (U8)((val >> 8) & 0xff);
1907 s[offset + 3] = (U8)( val & 0xff);
1908 }
1909
1910 if (!none || val < none)
1911 ++val;
1912 }
1913 }
711a919c 1914 else { /* bits == 1, then val should be ignored */
979f2922
TS
1915 if (min < start)
1916 min = start;
1917 for (key = min; key <= max; key++) {
1918 STRLEN offset = (STRLEN)(key - start);
1919 if (key >= end)
1920 goto go_out_list;
1921 s[offset >> 3] |= 1 << (offset & 7);
1922 }
1923 }
1924 } /* while */
1925 go_out_list:
1926
1927 /* read $swash->{EXTRAS} */
1928 x = (U8*)SvPV(*extssvp, xcur);
1929 xend = x + xcur;
1930 while (x < xend) {
1931 STRLEN namelen;
1932 U8 *namestr;
1933 SV** othersvp;
1934 HV* otherhv;
1935 STRLEN otherbits;
1936 SV **otherbitssvp, *other;
711a919c 1937 U8 *s, *o, *nl;
979f2922
TS
1938 STRLEN slen, olen;
1939
1940 U8 opc = *x++;
1941 if (opc == '\n')
1942 continue;
1943
1944 nl = (U8*)memchr(x, '\n', xend - x);
1945
1946 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
1947 if (nl) {
1948 x = nl + 1; /* 1 is length of "\n" */
1949 continue;
1950 }
1951 else {
1952 x = xend; /* to EXTRAS' end at which \n is not found */
1953 break;
1954 }
1955 }
1956
1957 namestr = x;
1958 if (nl) {
1959 namelen = nl - namestr;
1960 x = nl + 1;
1961 }
1962 else {
1963 namelen = xend - namestr;
1964 x = xend;
1965 }
1966
1967 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
660a4616 1968 otherhv = (HV*)SvRV(*othersvp);
979f2922
TS
1969 otherbitssvp = hv_fetch(otherhv, "BITS", 4, FALSE);
1970 otherbits = (STRLEN)SvUV(*otherbitssvp);
1971 if (bits < otherbits)
660a4616 1972 Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
979f2922
TS
1973
1974 /* The "other" swatch must be destroyed after. */
1975 other = swash_get(*othersvp, start, span);
1976 o = (U8*)SvPV(other, olen);
1977
1978 if (!olen)
660a4616 1979 Perl_croak(aTHX_ "panic: swash_get got improper swatch");
979f2922
TS
1980
1981 s = (U8*)SvPV(swatch, slen);
1982 if (bits == 1 && otherbits == 1) {
1983 if (slen != olen)
660a4616 1984 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
979f2922
TS
1985
1986 switch (opc) {
1987 case '+':
1988 while (slen--)
1989 *s++ |= *o++;
1990 break;
1991 case '!':
1992 while (slen--)
1993 *s++ |= ~*o++;
1994 break;
1995 case '-':
1996 while (slen--)
1997 *s++ &= ~*o++;
1998 break;
1999 case '&':
2000 while (slen--)
2001 *s++ &= *o++;
2002 break;
2003 default:
2004 break;
2005 }
2006 }
711a919c 2007 else {
979f2922
TS
2008 STRLEN otheroctets = otherbits >> 3;
2009 STRLEN offset = 0;
2010 U8* send = s + slen;
2011
2012 while (s < send) {
2013 UV otherval = 0;
2014
2015 if (otherbits == 1) {
2016 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2017 ++offset;
2018 }
2019 else {
2020 STRLEN vlen = otheroctets;
2021 otherval = *o++;
2022 while (--vlen) {
2023 otherval <<= 8;
2024 otherval |= *o++;
2025 }
2026 }
2027
711a919c
TS
2028 if (opc == '+' && otherval)
2029 ; /* replace with otherval */
979f2922
TS
2030 else if (opc == '!' && !otherval)
2031 otherval = 1;
2032 else if (opc == '-' && otherval)
2033 otherval = 0;
2034 else if (opc == '&' && !otherval)
2035 otherval = 0;
2036 else {
711a919c 2037 s += octets; /* no replacement */
979f2922
TS
2038 continue;
2039 }
2040
2041 if (bits == 8)
2042 *s++ = (U8)( otherval & 0xff);
2043 else if (bits == 16) {
2044 *s++ = (U8)((otherval >> 8) & 0xff);
2045 *s++ = (U8)( otherval & 0xff);
2046 }
2047 else if (bits == 32) {
2048 *s++ = (U8)((otherval >> 24) & 0xff);
2049 *s++ = (U8)((otherval >> 16) & 0xff);
2050 *s++ = (U8)((otherval >> 8) & 0xff);
2051 *s++ = (U8)( otherval & 0xff);
2052 }
2053 }
2054 }
2055 sv_free(other); /* through with it! */
2056 } /* while */
2057 return swatch;
2058}
2059
0f830e0b
NC
2060/*
2061=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2062
2063Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2064of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2065bytes available. The return value is the pointer to the byte after the
2066end of the new character. In other words,
2067
2068 d = uvchr_to_utf8(d, uv);
2069
2070is the recommended wide native character-aware way of saying
2071
2072 *(d++) = uv;
2073
2074=cut
2075*/
2076
2077/* On ASCII machines this is normally a macro but we want a
2078 real function in case XS code wants it
2079*/
0f830e0b
NC
2080U8 *
2081Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2082{
2083 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2084}
2085
b851fbc1
JH
2086U8 *
2087Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2088{
2089 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2090}
2b9d42f0
NIS
2091
2092/*
0f830e0b
NC
2093=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2094flags
2095
2096Returns the native character value of the first character in the string
2097C<s>
2098which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2099length, in bytes, of that character.
2100
2101Allows length and flags to be passed to low level routine.
2102
2103=cut
2104*/
2105/* On ASCII machines this is normally a macro but we want
2106 a real function in case XS code wants it
2107*/
0f830e0b
NC
2108UV
2109Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2110U32 flags)
2111{
2112 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2113 return UNI_TO_NATIVE(uv);
2114}
2115
2116/*
d2cc3551
JH
2117=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2118
2119Build to the scalar dsv a displayable version of the string spv,
2120length len, the displayable version being at most pvlim bytes long
2121(if longer, the rest is truncated and "..." will be appended).
0a2ef054 2122
9e55ce06 2123The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 2124isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054
JH
2125to display the \\[nrfta\\] as the backslashed versions (like '\n')
2126(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2127UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2128UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2129
d2cc3551
JH
2130The pointer to the PV of the dsv is returned.
2131
2132=cut */
e6b2e755 2133char *
e1ec3a88 2134Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755
JH
2135{
2136 int truncated = 0;
e1ec3a88 2137 const char *s, *e;
e6b2e755
JH
2138
2139 sv_setpvn(dsv, "", 0);
e1ec3a88 2140 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 2141 UV u;
a49f32c6
NC
2142 /* This serves double duty as a flag and a character to print after
2143 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2144 */
2145 char ok = 0;
c728cb41 2146
e6b2e755
JH
2147 if (pvlim && SvCUR(dsv) >= pvlim) {
2148 truncated++;
2149 break;
2150 }
2151 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 2152 if (u < 256) {
a3b680e6 2153 const unsigned char c = (unsigned char)u & 0xFF;
c728cb41 2154 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
a49f32c6 2155 switch (c) {
c728cb41 2156 case '\n':
a49f32c6 2157 ok = 'n'; break;
c728cb41 2158 case '\r':
a49f32c6 2159 ok = 'r'; break;
c728cb41 2160 case '\t':
a49f32c6 2161 ok = 't'; break;
c728cb41 2162 case '\f':
a49f32c6 2163 ok = 'f'; break;
c728cb41 2164 case '\a':
a49f32c6 2165 ok = 'a'; break;
c728cb41 2166 case '\\':
a49f32c6 2167 ok = '\\'; break;
c728cb41
JH
2168 default: break;
2169 }
a49f32c6
NC
2170 if (ok) {
2171 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
2172 }
c728cb41 2173 }
00e86452 2174 /* isPRINT() is the locale-blind version. */
a49f32c6
NC
2175 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2176 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
2177 ok = 1;
0a2ef054 2178 }
c728cb41
JH
2179 }
2180 if (!ok)
9e55ce06 2181 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755
JH
2182 }
2183 if (truncated)
2184 sv_catpvn(dsv, "...", 3);
2185
2186 return SvPVX(dsv);
2187}
2b9d42f0 2188
d2cc3551
JH
2189/*
2190=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2191
2192Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 2193the displayable version being at most pvlim bytes long
d2cc3551 2194(if longer, the rest is truncated and "..." will be appended).
0a2ef054
JH
2195
2196The flags argument is as in pv_uni_display().
2197
d2cc3551
JH
2198The pointer to the PV of the dsv is returned.
2199
2200=cut */
e6b2e755
JH
2201char *
2202Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2203{
cfd0369c
NC
2204 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2205 SvCUR(ssv), pvlim, flags);
701a277b
JH
2206}
2207
d2cc3551 2208/*
d07ddd77 2209=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551
JH
2210
2211Return true if the strings s1 and s2 differ case-insensitively, false
2212if not (if they are equal case-insensitively). If u1 is true, the
2213string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77
JH
2214the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
2215are false, the respective string is assumed to be in native 8-bit
2216encoding.
2217
2218If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2219in there (they will point at the beginning of the I<next> character).
2220If the pointers behind pe1 or pe2 are non-NULL, they are the end
2221pointers beyond which scanning will not continue under any
4cdaeff7 2222circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77
JH
2223s2+l2 will be used as goal end pointers that will also stop the scan,
2224and which qualify towards defining a successful match: all the scans
2225that define an explicit length must reach their goal pointers for
2226a match to succeed).
d2cc3551
JH
2227
2228For case-insensitiveness, the "casefolding" of Unicode is used
2229instead of upper/lowercasing both the characters, see
2230http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2231
2232=cut */
701a277b 2233I32
d07ddd77 2234Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 2235{
e1ec3a88
AL
2236 register const U8 *p1 = (const U8*)s1;
2237 register const U8 *p2 = (const U8*)s2;
2238 register const U8 *f1 = 0, *f2 = 0;
2239 register U8 *e1 = 0, *q1 = 0;
2240 register U8 *e2 = 0, *q2 = 0;
d07ddd77 2241 STRLEN n1 = 0, n2 = 0;
89ebb4a3
JH
2242 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2243 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8
JH
2244 U8 natbuf[1+1];
2245 STRLEN foldlen1, foldlen2;
d07ddd77 2246 bool match;
332ddc25 2247
d07ddd77
JH
2248 if (pe1)
2249 e1 = *(U8**)pe1;
e1ec3a88
AL
2250 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2251 f1 = (const U8*)s1 + l1;
d07ddd77
JH
2252 if (pe2)
2253 e2 = *(U8**)pe2;
e1ec3a88
AL
2254 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2255 f2 = (const U8*)s2 + l2;
d07ddd77
JH
2256
2257 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2258 return 1; /* mismatch; possible infinite loop or false positive */
2259
a6872d42
JH
2260 if (!u1 || !u2)
2261 natbuf[1] = 0; /* Need to terminate the buffer. */
2262
d07ddd77
JH
2263 while ((e1 == 0 || p1 < e1) &&
2264 (f1 == 0 || p1 < f1) &&
2265 (e2 == 0 || p2 < e2) &&
2266 (f2 == 0 || p2 < f2)) {
2267 if (n1 == 0) {
d7f013c8
JH
2268 if (u1)
2269 to_utf8_fold(p1, foldbuf1, &foldlen1);
2270 else {
809e8e66 2271 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
d7f013c8
JH
2272 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2273 }
2274 q1 = foldbuf1;
d07ddd77 2275 n1 = foldlen1;
332ddc25 2276 }
d07ddd77 2277 if (n2 == 0) {
d7f013c8
JH
2278 if (u2)
2279 to_utf8_fold(p2, foldbuf2, &foldlen2);
2280 else {
809e8e66 2281 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
d7f013c8
JH
2282 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2283 }
2284 q2 = foldbuf2;
d07ddd77 2285 n2 = foldlen2;
332ddc25 2286 }
d07ddd77
JH
2287 while (n1 && n2) {
2288 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2289 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2290 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 2291 return 1; /* mismatch */
d07ddd77 2292 n1 -= UTF8SKIP(q1);
d7f013c8 2293 q1 += UTF8SKIP(q1);
d07ddd77 2294 n2 -= UTF8SKIP(q2);
d7f013c8 2295 q2 += UTF8SKIP(q2);
701a277b 2296 }
d07ddd77 2297 if (n1 == 0)
d7f013c8 2298 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2299 if (n2 == 0)
d7f013c8
JH
2300 p2 += u2 ? UTF8SKIP(p2) : 1;
2301
d2cc3551 2302 }
5469e704 2303
d07ddd77
JH
2304 /* A match is defined by all the scans that specified
2305 * an explicit length reaching their final goals. */
2306 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704
JH
2307
2308 if (match) {
d07ddd77
JH
2309 if (pe1)
2310 *pe1 = (char*)p1;
2311 if (pe2)
2312 *pe2 = (char*)p2;
5469e704
JH
2313 }
2314
2315 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2316}
701a277b 2317
a49f32c6
NC
2318/*
2319 * Local variables:
2320 * c-indentation-style: bsd
2321 * c-basic-offset: 4
2322 * indent-tabs-mode: t
2323 * End:
2324 *
37442d52
RGS
2325 * ex: set ts=8 sts=4 sw=4 noet:
2326 */