This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Perl_pack_cat() is a mathom too!
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
b94e2f88
RGS
3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005, 2006,
4 * by Larry Wall and others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
19 *
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
22 */
23
24#include "EXTERN.h"
864dbfa3 25#define PERL_IN_UTF8_C
a0ed51b3
LW
26#include "perl.h"
27
27da23d5
JH
28static const char unees[] =
29 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 30
ccfc67b7
JH
31/*
32=head1 Unicode Support
a0ed51b3 33
166f8a29
DM
34This file contains various utility functions for manipulating UTF8-encoded
35strings. For the uninitiated, this is a method of representing arbitrary
61296642 36Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
37characters in the ASCII range are unmodified, and a zero byte never appears
38within non-zero characters.
166f8a29 39
b851fbc1 40=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 41
1e54db1a 42Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89ebb4a3 43of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 44bytes available. The return value is the pointer to the byte after the
9041c2e3 45end of the new character. In other words,
eebe1485 46
b851fbc1
JH
47 d = uvuni_to_utf8_flags(d, uv, flags);
48
49or, in most cases,
50
9041c2e3 51 d = uvuni_to_utf8(d, uv);
eebe1485 52
b851fbc1
JH
53(which is equivalent to)
54
55 d = uvuni_to_utf8_flags(d, uv, 0);
56
eebe1485
SC
57is the recommended Unicode-aware way of saying
58
59 *(d++) = uv;
60
61=cut
62*/
63
dfe13c55 64U8 *
b851fbc1 65Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 66{
62961d2e 67 if (ckWARN(WARN_UTF8)) {
b851fbc1
JH
68 if (UNICODE_IS_SURROGATE(uv) &&
69 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 70 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1
JH
71 else if (
72 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
73 !(flags & UNICODE_ALLOW_FDD0))
74 ||
c867b360 75 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1
JH
76 !(flags & UNICODE_ALLOW_FFFF))) &&
77 /* UNICODE_ALLOW_SUPER includes
2a20b9da 78 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1
JH
79 ((uv <= PERL_UNICODE_MAX) ||
80 !(flags & UNICODE_ALLOW_SUPER))
81 )
9014280d 82 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800
JH
83 "Unicode character 0x%04"UVxf" is illegal", uv);
84 }
c4d5f83a 85 if (UNI_IS_INVARIANT(uv)) {
eb160463 86 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3
LW
87 return d;
88 }
2d331972 89#if defined(EBCDIC)
1d72bdf6
NIS
90 else {
91 STRLEN len = UNISKIP(uv);
92 U8 *p = d+len-1;
93 while (p > d) {
eb160463 94 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
95 uv >>= UTF_ACCUMULATION_SHIFT;
96 }
eb160463 97 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
98 return d+len;
99 }
100#else /* Non loop style */
a0ed51b3 101 if (uv < 0x800) {
eb160463
GS
102 *d++ = (U8)(( uv >> 6) | 0xc0);
103 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
104 return d;
105 }
106 if (uv < 0x10000) {
eb160463
GS
107 *d++ = (U8)(( uv >> 12) | 0xe0);
108 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
110 return d;
111 }
112 if (uv < 0x200000) {
eb160463
GS
113 *d++ = (U8)(( uv >> 18) | 0xf0);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
117 return d;
118 }
119 if (uv < 0x4000000) {
eb160463
GS
120 *d++ = (U8)(( uv >> 24) | 0xf8);
121 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
124 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
125 return d;
126 }
127 if (uv < 0x80000000) {
eb160463
GS
128 *d++ = (U8)(( uv >> 30) | 0xfc);
129 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
133 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
134 return d;
135 }
6b8eaf93 136#ifdef HAS_QUAD
d7578b48 137 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
138#endif
139 {
eb160463
GS
140 *d++ = 0xfe; /* Can't match U+FEFF! */
141 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
142 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
143 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
144 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
145 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
146 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
147 return d;
148 }
6b8eaf93 149#ifdef HAS_QUAD
a0ed51b3 150 {
eb160463
GS
151 *d++ = 0xff; /* Can't match U+FFFE! */
152 *d++ = 0x80; /* 6 Reserved bits */
153 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
154 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
155 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
156 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
157 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
163 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
164 return d;
165 }
166#endif
1d72bdf6 167#endif /* Loop style */
a0ed51b3 168}
9041c2e3 169
646ca15d
JH
170/*
171
172Tests if some arbitrary number of bytes begins in a valid UTF-8
173character. Note that an INVARIANT (i.e. ASCII) character is a valid
174UTF-8 character. The actual number of bytes in the UTF-8 character
175will be returned if it is valid, otherwise 0.
176
177This is the "slow" version as opposed to the "fast" version which is
178the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
179difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
180or less you should use the IS_UTF8_CHAR(), for lengths of five or more
181you should use the _slow(). In practice this means that the _slow()
182will be used very rarely, since the maximum Unicode code point (as of
183Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
184the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
185five bytes or more.
186
187=cut */
c053b435 188STATIC STRLEN
5f66b61c 189S_is_utf8_char_slow(const U8 *s, const STRLEN len)
646ca15d
JH
190{
191 U8 u = *s;
192 STRLEN slen;
193 UV uv, ouv;
194
195 if (UTF8_IS_INVARIANT(u))
196 return 1;
197
198 if (!UTF8_IS_START(u))
199 return 0;
200
201 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
202 return 0;
203
204 slen = len - 1;
205 s++;
77263263
TS
206#ifdef EBCDIC
207 u = NATIVE_TO_UTF(u);
208#endif
646ca15d
JH
209 u &= UTF_START_MASK(len);
210 uv = u;
211 ouv = uv;
212 while (slen--) {
213 if (!UTF8_IS_CONTINUATION(*s))
214 return 0;
215 uv = UTF8_ACCUMULATE(uv, *s);
216 if (uv < ouv)
217 return 0;
218 ouv = uv;
219 s++;
220 }
221
222 if ((STRLEN)UNISKIP(uv) < len)
223 return 0;
224
225 return len;
226}
9041c2e3
NIS
227
228/*
7fc63493 229=for apidoc A|STRLEN|is_utf8_char|const U8 *s
eebe1485 230
5da9da9e 231Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01
JH
232character. Note that an INVARIANT (i.e. ASCII) character is a valid
233UTF-8 character. The actual number of bytes in the UTF-8 character
234will be returned if it is valid, otherwise 0.
9041c2e3 235
82686b01 236=cut */
067a85ef 237STRLEN
7fc63493 238Perl_is_utf8_char(pTHX_ const U8 *s)
386d01d6 239{
44f8325f 240 const STRLEN len = UTF8SKIP(s);
96a5add6 241 PERL_UNUSED_CONTEXT;
3b0fc154 242#ifdef IS_UTF8_CHAR
768c67ee 243 if (IS_UTF8_CHAR_FAST(len))
3b0fc154
JH
244 return IS_UTF8_CHAR(s, len) ? len : 0;
245#endif /* #ifdef IS_UTF8_CHAR */
2c0c5f92 246 return is_utf8_char_slow(s, len);
386d01d6
GS
247}
248
6662521e 249/*
7fc63493 250=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
6662521e 251
c9ada85f 252Returns true if first C<len> bytes of the given string form a valid
1e54db1a
JH
253UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
254not mean 'a string that contains code points above 0x7F encoded in UTF-8'
255because a valid ASCII string is a valid UTF-8 string.
6662521e 256
768c67ee
JH
257See also is_utf8_string_loclen() and is_utf8_string_loc().
258
6662521e
GS
259=cut
260*/
261
8e84507e 262bool
7fc63493 263Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
6662521e 264{
7fc63493
AL
265 const U8* x = s;
266 const U8* send;
067a85ef 267
96a5add6 268 PERL_UNUSED_CONTEXT;
44f8325f 269 if (!len)
e1ec3a88 270 len = strlen((const char *)s);
1aa99e6b
IH
271 send = s + len;
272
6662521e 273 while (x < send) {
a3b680e6 274 STRLEN c;
1acdb0da
JH
275 /* Inline the easy bits of is_utf8_char() here for speed... */
276 if (UTF8_IS_INVARIANT(*x))
277 c = 1;
278 else if (!UTF8_IS_START(*x))
768c67ee 279 goto out;
1acdb0da
JH
280 else {
281 /* ... and call is_utf8_char() only if really needed. */
646ca15d
JH
282#ifdef IS_UTF8_CHAR
283 c = UTF8SKIP(x);
768c67ee
JH
284 if (IS_UTF8_CHAR_FAST(c)) {
285 if (!IS_UTF8_CHAR(x, c))
3c614e38
AD
286 c = 0;
287 }
288 else
289 c = is_utf8_char_slow(x, c);
646ca15d
JH
290#else
291 c = is_utf8_char(x);
292#endif /* #ifdef IS_UTF8_CHAR */
1acdb0da 293 if (!c)
768c67ee 294 goto out;
1acdb0da 295 }
6662521e 296 x += c;
6662521e 297 }
768c67ee
JH
298
299 out:
60006e79
JH
300 if (x != send)
301 return FALSE;
067a85ef
A
302
303 return TRUE;
6662521e
GS
304}
305
67e989fb 306/*
814fafa7
NC
307Implemented as a macro in utf8.h
308
309=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep
310
311Like is_utf8_string() but stores the location of the failure (in the
312case of "utf8ness failure") or the location s+len (in the case of
313"utf8ness success") in the C<ep>.
314
315See also is_utf8_string_loclen() and is_utf8_string().
316
768c67ee 317=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
81cd54e3 318
e3e4599f 319Like is_utf8_string() but stores the location of the failure (in the
768c67ee
JH
320case of "utf8ness failure") or the location s+len (in the case of
321"utf8ness success") in the C<ep>, and the number of UTF-8
322encoded characters in the C<el>.
323
324See also is_utf8_string_loc() and is_utf8_string().
81cd54e3
JH
325
326=cut
327*/
328
329bool
768c67ee 330Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 331{
7fc63493
AL
332 const U8* x = s;
333 const U8* send;
81cd54e3 334 STRLEN c;
96a5add6 335 PERL_UNUSED_CONTEXT;
81cd54e3 336
44f8325f 337 if (!len)
e1ec3a88 338 len = strlen((const char *)s);
81cd54e3 339 send = s + len;
768c67ee
JH
340 if (el)
341 *el = 0;
81cd54e3
JH
342
343 while (x < send) {
344 /* Inline the easy bits of is_utf8_char() here for speed... */
345 if (UTF8_IS_INVARIANT(*x))
768c67ee
JH
346 c = 1;
347 else if (!UTF8_IS_START(*x))
348 goto out;
81cd54e3 349 else {
768c67ee
JH
350 /* ... and call is_utf8_char() only if really needed. */
351#ifdef IS_UTF8_CHAR
352 c = UTF8SKIP(x);
353 if (IS_UTF8_CHAR_FAST(c)) {
354 if (!IS_UTF8_CHAR(x, c))
355 c = 0;
356 } else
357 c = is_utf8_char_slow(x, c);
358#else
359 c = is_utf8_char(x);
360#endif /* #ifdef IS_UTF8_CHAR */
361 if (!c)
362 goto out;
81cd54e3 363 }
768c67ee
JH
364 x += c;
365 if (el)
366 (*el)++;
81cd54e3 367 }
768c67ee
JH
368
369 out:
370 if (ep)
371 *ep = x;
372 if (x != send)
81cd54e3 373 return FALSE;
81cd54e3
JH
374
375 return TRUE;
376}
377
378/*
768c67ee 379
7fc63493 380=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 381
9041c2e3
NIS
382Bottom level UTF-8 decode routine.
383Returns the unicode code point value of the first character in the string C<s>
1e54db1a 384which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 385C<retlen> will be set to the length, in bytes, of that character.
67e989fb 386
1e54db1a 387If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880
JH
388is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
389it is assumed that the caller will raise a warning, and this function
28d3d195
JH
390will silently just set C<retlen> to C<-1> and return zero. If the
391C<flags> does not contain UTF8_CHECK_ONLY, warnings about
392malformations will be given, C<retlen> will be set to the expected
393length of the UTF-8 character in bytes, and zero will be returned.
394
395The C<flags> can also contain various flags to allow deviations from
396the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 397
9041c2e3
NIS
398Most code should use utf8_to_uvchr() rather than call this directly.
399
37607a96
PK
400=cut
401*/
67e989fb 402
a0ed51b3 403UV
7fc63493 404Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 405{
97aff369 406 dVAR;
d4c19fe8 407 const U8 * const s0 = s;
9c5ffd7c 408 UV uv = *s, ouv = 0;
ba210ebe 409 STRLEN len = 1;
7fc63493
AL
410 const bool dowarn = ckWARN_d(WARN_UTF8);
411 const UV startbyte = *s;
ba210ebe 412 STRLEN expectlen = 0;
a0dbb045
JH
413 U32 warning = 0;
414
415/* This list is a superset of the UTF8_ALLOW_XXX. */
416
417#define UTF8_WARN_EMPTY 1
418#define UTF8_WARN_CONTINUATION 2
419#define UTF8_WARN_NON_CONTINUATION 3
420#define UTF8_WARN_FE_FF 4
421#define UTF8_WARN_SHORT 5
422#define UTF8_WARN_OVERFLOW 6
423#define UTF8_WARN_SURROGATE 7
c867b360
JH
424#define UTF8_WARN_LONG 8
425#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045
JH
426
427 if (curlen == 0 &&
428 !(flags & UTF8_ALLOW_EMPTY)) {
429 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
430 goto malformed;
431 }
432
1d72bdf6 433 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
434 if (retlen)
435 *retlen = 1;
c4d5f83a 436 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 437 }
67e989fb 438
421a8bf2 439 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 440 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 441 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
442 goto malformed;
443 }
444
421a8bf2 445 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 446 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 447 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
448 goto malformed;
449 }
9041c2e3 450
1d72bdf6 451#ifdef EBCDIC
75383841 452 uv = NATIVE_TO_UTF(uv);
1d72bdf6 453#else
fcc8fcf6
JH
454 if ((uv == 0xfe || uv == 0xff) &&
455 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 456 warning = UTF8_WARN_FE_FF;
ba210ebe 457 goto malformed;
a0ed51b3 458 }
1d72bdf6
NIS
459#endif
460
ba210ebe
JH
461 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
462 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
463 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
464 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
465#ifdef EBCDIC
466 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
467 else { len = 7; uv &= 0x01; }
468#else
ba210ebe
JH
469 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
470 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
471 else { len = 13; uv = 0; } /* whoa! */
472#endif
473
a0ed51b3
LW
474 if (retlen)
475 *retlen = len;
9041c2e3 476
ba210ebe
JH
477 expectlen = len;
478
fcc8fcf6
JH
479 if ((curlen < expectlen) &&
480 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 481 warning = UTF8_WARN_SHORT;
ba210ebe
JH
482 goto malformed;
483 }
484
485 len--;
a0ed51b3 486 s++;
ba210ebe
JH
487 ouv = uv;
488
a0ed51b3 489 while (len--) {
421a8bf2
JH
490 if (!UTF8_IS_CONTINUATION(*s) &&
491 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
492 s--;
493 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 494 goto malformed;
a0ed51b3
LW
495 }
496 else
8850bf83 497 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
498 if (!(uv > ouv)) {
499 /* These cannot be allowed. */
500 if (uv == ouv) {
75dbc644 501 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045
JH
502 warning = UTF8_WARN_LONG;
503 goto malformed;
504 }
505 }
506 else { /* uv < ouv */
507 /* This cannot be allowed. */
508 warning = UTF8_WARN_OVERFLOW;
509 goto malformed;
510 }
ba210ebe
JH
511 }
512 s++;
513 ouv = uv;
514 }
515
421a8bf2 516 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 517 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 518 warning = UTF8_WARN_SURROGATE;
ba210ebe 519 goto malformed;
eb160463 520 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 521 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 522 warning = UTF8_WARN_LONG;
ba210ebe 523 goto malformed;
421a8bf2 524 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 525 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 526 warning = UTF8_WARN_FFFF;
a9917092 527 goto malformed;
a0ed51b3 528 }
ba210ebe 529
a0ed51b3 530 return uv;
ba210ebe
JH
531
532malformed:
533
fcc8fcf6 534 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 535 if (retlen)
cc366d4b 536 *retlen = -1;
ba210ebe
JH
537 return 0;
538 }
539
a0dbb045 540 if (dowarn) {
396482e1 541 SV* const sv = sv_2mortal(newSVpvs("Malformed UTF-8 character "));
a0dbb045
JH
542
543 switch (warning) {
544 case 0: /* Intentionally empty. */ break;
545 case UTF8_WARN_EMPTY:
396482e1 546 sv_catpvs(sv, "(empty string)");
a0dbb045
JH
547 break;
548 case UTF8_WARN_CONTINUATION:
097fb8e2 549 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045
JH
550 break;
551 case UTF8_WARN_NON_CONTINUATION:
097fb8e2
JH
552 if (s == s0)
553 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
554 (UV)s[1], startbyte);
551405c4
AL
555 else {
556 const int len = (int)(s-s0);
097fb8e2 557 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
551405c4
AL
558 (UV)s[1], len, len > 1 ? "s" : "", startbyte, (int)expectlen);
559 }
560
a0dbb045
JH
561 break;
562 case UTF8_WARN_FE_FF:
563 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
564 break;
565 case UTF8_WARN_SHORT:
097fb8e2 566 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 567 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 568 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
569 break;
570 case UTF8_WARN_OVERFLOW:
097fb8e2
JH
571 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
572 ouv, *s, startbyte);
a0dbb045
JH
573 break;
574 case UTF8_WARN_SURROGATE:
575 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
576 break;
a0dbb045 577 case UTF8_WARN_LONG:
097fb8e2 578 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 579 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045
JH
580 break;
581 case UTF8_WARN_FFFF:
582 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
583 break;
584 default:
396482e1 585 sv_catpvs(sv, "(unknown reason)");
a0dbb045
JH
586 break;
587 }
588
589 if (warning) {
44f8325f 590 const char * const s = SvPVX_const(sv);
a0dbb045
JH
591
592 if (PL_op)
9014280d 593 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 594 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 595 else
9014280d 596 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045
JH
597 }
598 }
599
ba210ebe 600 if (retlen)
28d3d195 601 *retlen = expectlen ? expectlen : len;
ba210ebe 602
28d3d195 603 return 0;
a0ed51b3
LW
604}
605
8e84507e 606/*
7fc63493 607=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
9041c2e3
NIS
608
609Returns the native character value of the first character in the string C<s>
1e54db1a 610which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
611length, in bytes, of that character.
612
1e54db1a 613If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3
NIS
614returned and retlen is set, if possible, to -1.
615
616=cut
617*/
618
619UV
7fc63493 620Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 621{
1754c1a1
NC
622 return utf8n_to_uvchr(s, UTF8_MAXBYTES, retlen,
623 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3
NIS
624}
625
626/*
7fc63493 627=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
9041c2e3
NIS
628
629Returns the Unicode code point of the first character in the string C<s>
1e54db1a 630which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
631length, in bytes, of that character.
632
633This function should only be used when returned UV is considered
634an index into the Unicode semantic tables (e.g. swashes).
635
1e54db1a 636If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 637returned and retlen is set, if possible, to -1.
8e84507e
NIS
638
639=cut
640*/
641
642UV
7fc63493 643Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 644{
9041c2e3 645 /* Call the low level routine asking for checks */
89ebb4a3 646 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 647 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e
NIS
648}
649
b76347f2 650/*
35a4481c 651=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
b76347f2
JH
652
653Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
654Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
655up past C<e>, croaks.
b76347f2
JH
656
657=cut
658*/
659
660STRLEN
35a4481c 661Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2 662{
97aff369 663 dVAR;
b76347f2
JH
664 STRLEN len = 0;
665
8850bf83
JH
666 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
667 * the bitops (especially ~) can create illegal UTF-8.
668 * In other words: in Perl UTF-8 is not just for Unicode. */
669
a3b680e6
AL
670 if (e < s)
671 goto warn_and_return;
b76347f2 672 while (s < e) {
4373e329 673 const U8 t = UTF8SKIP(s);
901b21bf 674 if (e - s < t) {
a3b680e6 675 warn_and_return:
901b21bf
JH
676 if (ckWARN_d(WARN_UTF8)) {
677 if (PL_op)
678 Perl_warner(aTHX_ packWARN(WARN_UTF8),
a3b680e6 679 "%s in %s", unees, OP_DESC(PL_op));
901b21bf
JH
680 else
681 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
682 }
683 return len;
684 }
b76347f2
JH
685 s += t;
686 len++;
687 }
688
689 return len;
690}
691
b06226ff 692/*
35a4481c 693=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
b06226ff 694
1e54db1a 695Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
696and C<b>.
697
698WARNING: use only if you *know* that the pointers point inside the
699same UTF-8 buffer.
700
37607a96
PK
701=cut
702*/
a0ed51b3 703
02eb7b47 704IV
35a4481c 705Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 706{
bf1665bc 707 return (a < b) ? -1 * (IV) utf8_length(a, b) : (IV) utf8_length(b, a);
a0ed51b3
LW
708}
709
b06226ff 710/*
37607a96 711=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 712
8850bf83
JH
713Return the UTF-8 pointer C<s> displaced by C<off> characters, either
714forward or backward.
b06226ff
JH
715
716WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
717the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
718on the first byte of character or just after the last byte of a character.
b06226ff 719
37607a96
PK
720=cut
721*/
a0ed51b3
LW
722
723U8 *
4373e329 724Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 725{
96a5add6 726 PERL_UNUSED_CONTEXT;
8850bf83
JH
727 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
728 * the bitops (especially ~) can create illegal UTF-8.
729 * In other words: in Perl UTF-8 is not just for Unicode. */
730
a0ed51b3
LW
731 if (off >= 0) {
732 while (off--)
733 s += UTF8SKIP(s);
734 }
735 else {
736 while (off++) {
737 s--;
8850bf83
JH
738 while (UTF8_IS_CONTINUATION(*s))
739 s--;
a0ed51b3
LW
740 }
741 }
4373e329 742 return (U8 *)s;
a0ed51b3
LW
743}
744
6940069f 745/*
eebe1485 746=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 747
1e54db1a 748Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53
MG
749Unlike C<bytes_to_utf8>, this over-writes the original string, and
750updates len to contain the new length.
67e989fb 751Returns zero on failure, setting C<len> to -1.
6940069f 752
95be277c
NC
753If you need a copy of the string, see C<bytes_from_utf8>.
754
6940069f
GS
755=cut
756*/
757
758U8 *
37607a96 759Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 760{
d4c19fe8
AL
761 U8 * const save = s;
762 U8 * const send = s + *len;
6940069f 763 U8 *d;
246fae53 764
1e54db1a 765 /* ensure valid UTF-8 and chars < 256 before updating string */
d4c19fe8 766 while (s < send) {
dcad2880
JH
767 U8 c = *s++;
768
1d72bdf6
NIS
769 if (!UTF8_IS_INVARIANT(c) &&
770 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
771 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880
JH
772 *len = -1;
773 return 0;
774 }
246fae53 775 }
dcad2880
JH
776
777 d = s = save;
6940069f 778 while (s < send) {
ed646e6e 779 STRLEN ulen;
9041c2e3 780 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 781 s += ulen;
6940069f
GS
782 }
783 *d = '\0';
246fae53 784 *len = d - save;
6940069f
GS
785 return save;
786}
787
788/*
e1ec3a88 789=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
f9a63242 790
1e54db1a 791Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
35a4481c 792Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
793the newly-created string, and updates C<len> to contain the new
794length. Returns the original string if no conversion occurs, C<len>
795is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
7960 if C<s> is converted or contains all 7bit characters.
f9a63242 797
37607a96
PK
798=cut
799*/
f9a63242
JH
800
801U8 *
e1ec3a88 802Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 803{
f9a63242 804 U8 *d;
e1ec3a88
AL
805 const U8 *start = s;
806 const U8 *send;
f9a63242
JH
807 I32 count = 0;
808
96a5add6 809 PERL_UNUSED_CONTEXT;
f9a63242 810 if (!*is_utf8)
73d840c0 811 return (U8 *)start;
f9a63242 812
1e54db1a 813 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 814 for (send = s + *len; s < send;) {
e1ec3a88 815 U8 c = *s++;
1d72bdf6 816 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
817 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
818 (c = *s++) && UTF8_IS_CONTINUATION(c))
819 count++;
820 else
73d840c0 821 return (U8 *)start;
db42d148 822 }
f9a63242
JH
823 }
824
825 *is_utf8 = 0;
826
212542aa 827 Newx(d, (*len) - count + 1, U8);
ef9edfd0 828 s = start; start = d;
f9a63242
JH
829 while (s < send) {
830 U8 c = *s++;
c4d5f83a
NIS
831 if (!UTF8_IS_INVARIANT(c)) {
832 /* Then it is two-byte encoded */
833 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
834 c = ASCII_TO_NATIVE(c);
835 }
836 *d++ = c;
f9a63242
JH
837 }
838 *d = '\0';
839 *len = d - start;
73d840c0 840 return (U8 *)start;
f9a63242
JH
841}
842
843/*
35a4481c 844=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
6940069f 845
1e54db1a 846Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e
GS
847Returns a pointer to the newly-created string, and sets C<len> to
848reflect the new length.
6940069f 849
1e54db1a 850If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f
JH
851see sv_recode_to_utf8().
852
497711e7 853=cut
6940069f
GS
854*/
855
856U8*
35a4481c 857Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 858{
35a4481c 859 const U8 * const send = s + (*len);
6940069f
GS
860 U8 *d;
861 U8 *dst;
96a5add6 862 PERL_UNUSED_CONTEXT;
6940069f 863
212542aa 864 Newx(d, (*len) * 2 + 1, U8);
6940069f
GS
865 dst = d;
866
867 while (s < send) {
35a4481c 868 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 869 if (UNI_IS_INVARIANT(uv))
eb160463 870 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 871 else {
eb160463
GS
872 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
873 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
874 }
875 }
876 *d = '\0';
6662521e 877 *len = d-dst;
6940069f
GS
878 return dst;
879}
880
a0ed51b3 881/*
dea0fc0b 882 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
883 *
884 * Destination must be pre-extended to 3/2 source. Do not use in-place.
885 * We optimize for native, for obvious reasons. */
886
887U8*
dea0fc0b 888Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 889{
dea0fc0b
JH
890 U8* pend;
891 U8* dstart = d;
892
1de9afcd
RGS
893 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
894 d[0] = 0;
895 *newlen = 1;
896 return d;
897 }
898
dea0fc0b 899 if (bytelen & 1)
014ead4b 900 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
dea0fc0b
JH
901
902 pend = p + bytelen;
903
a0ed51b3 904 while (p < pend) {
dea0fc0b
JH
905 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
906 p += 2;
a0ed51b3 907 if (uv < 0x80) {
eb160463 908 *d++ = (U8)uv;
a0ed51b3
LW
909 continue;
910 }
911 if (uv < 0x800) {
eb160463
GS
912 *d++ = (U8)(( uv >> 6) | 0xc0);
913 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
914 continue;
915 }
916 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e
TD
917 UV low = (p[0] << 8) + p[1];
918 p += 2;
dea0fc0b
JH
919 if (low < 0xdc00 || low >= 0xdfff)
920 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
921 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
922 }
923 if (uv < 0x10000) {
eb160463
GS
924 *d++ = (U8)(( uv >> 12) | 0xe0);
925 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
926 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
927 continue;
928 }
929 else {
eb160463
GS
930 *d++ = (U8)(( uv >> 18) | 0xf0);
931 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
932 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
933 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
934 continue;
935 }
936 }
dea0fc0b 937 *newlen = d - dstart;
a0ed51b3
LW
938 return d;
939}
940
941/* Note: this one is slightly destructive of the source. */
942
943U8*
dea0fc0b 944Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
945{
946 U8* s = (U8*)p;
d4c19fe8 947 U8* const send = s + bytelen;
a0ed51b3 948 while (s < send) {
d4c19fe8 949 const U8 tmp = s[0];
a0ed51b3
LW
950 s[0] = s[1];
951 s[1] = tmp;
952 s += 2;
953 }
dea0fc0b 954 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
955}
956
957/* for now these are all defined (inefficiently) in terms of the utf8 versions */
958
959bool
84afefe6 960Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 961{
89ebb4a3 962 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 963 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
964 return is_utf8_alnum(tmpbuf);
965}
966
967bool
84afefe6 968Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 969{
89ebb4a3 970 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 971 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
972 return is_utf8_alnumc(tmpbuf);
973}
974
975bool
84afefe6 976Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 977{
89ebb4a3 978 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 979 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
980 return is_utf8_idfirst(tmpbuf);
981}
982
983bool
84afefe6 984Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 985{
89ebb4a3 986 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 987 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
988 return is_utf8_alpha(tmpbuf);
989}
990
991bool
84afefe6 992Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 993{
89ebb4a3 994 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 995 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
996 return is_utf8_ascii(tmpbuf);
997}
998
999bool
84afefe6 1000Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1001{
89ebb4a3 1002 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1003 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1004 return is_utf8_space(tmpbuf);
1005}
1006
1007bool
84afefe6 1008Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1009{
89ebb4a3 1010 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1011 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1012 return is_utf8_digit(tmpbuf);
1013}
1014
1015bool
84afefe6 1016Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1017{
89ebb4a3 1018 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1019 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1020 return is_utf8_upper(tmpbuf);
1021}
1022
1023bool
84afefe6 1024Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1025{
89ebb4a3 1026 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1027 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1028 return is_utf8_lower(tmpbuf);
1029}
1030
1031bool
84afefe6 1032Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1033{
89ebb4a3 1034 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1035 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1036 return is_utf8_cntrl(tmpbuf);
1037}
1038
1039bool
84afefe6 1040Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1041{
89ebb4a3 1042 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1043 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1044 return is_utf8_graph(tmpbuf);
1045}
1046
1047bool
84afefe6 1048Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1049{
89ebb4a3 1050 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1051 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1052 return is_utf8_print(tmpbuf);
1053}
1054
b8c5462f 1055bool
84afefe6 1056Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1057{
89ebb4a3 1058 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1059 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1060 return is_utf8_punct(tmpbuf);
1061}
1062
4d61ec05 1063bool
84afefe6 1064Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1065{
89ebb4a3 1066 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1067 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1068 return is_utf8_xdigit(tmpbuf);
1069}
1070
84afefe6
JH
1071UV
1072Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1073{
0ebc6274
JH
1074 uvchr_to_utf8(p, c);
1075 return to_utf8_upper(p, p, lenp);
a0ed51b3
LW
1076}
1077
84afefe6
JH
1078UV
1079Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1080{
0ebc6274
JH
1081 uvchr_to_utf8(p, c);
1082 return to_utf8_title(p, p, lenp);
a0ed51b3
LW
1083}
1084
84afefe6
JH
1085UV
1086Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1087{
0ebc6274
JH
1088 uvchr_to_utf8(p, c);
1089 return to_utf8_lower(p, p, lenp);
a0ed51b3
LW
1090}
1091
84afefe6
JH
1092UV
1093Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1094{
0ebc6274
JH
1095 uvchr_to_utf8(p, c);
1096 return to_utf8_fold(p, p, lenp);
84afefe6
JH
1097}
1098
a0ed51b3
LW
1099/* for now these all assume no locale info available for Unicode > 255 */
1100
1101bool
84afefe6 1102Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
1103{
1104 return is_uni_alnum(c); /* XXX no locale support yet */
1105}
1106
1107bool
84afefe6 1108Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f
JH
1109{
1110 return is_uni_alnumc(c); /* XXX no locale support yet */
1111}
1112
1113bool
84afefe6 1114Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
1115{
1116 return is_uni_idfirst(c); /* XXX no locale support yet */
1117}
1118
1119bool
84afefe6 1120Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
1121{
1122 return is_uni_alpha(c); /* XXX no locale support yet */
1123}
1124
1125bool
84afefe6 1126Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
1127{
1128 return is_uni_ascii(c); /* XXX no locale support yet */
1129}
1130
1131bool
84afefe6 1132Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
1133{
1134 return is_uni_space(c); /* XXX no locale support yet */
1135}
1136
1137bool
84afefe6 1138Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
1139{
1140 return is_uni_digit(c); /* XXX no locale support yet */
1141}
1142
1143bool
84afefe6 1144Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1145{
1146 return is_uni_upper(c); /* XXX no locale support yet */
1147}
1148
1149bool
84afefe6 1150Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1151{
1152 return is_uni_lower(c); /* XXX no locale support yet */
1153}
1154
1155bool
84afefe6 1156Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1157{
1158 return is_uni_cntrl(c); /* XXX no locale support yet */
1159}
1160
1161bool
84afefe6 1162Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1163{
1164 return is_uni_graph(c); /* XXX no locale support yet */
1165}
1166
1167bool
84afefe6 1168Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1169{
1170 return is_uni_print(c); /* XXX no locale support yet */
1171}
1172
b8c5462f 1173bool
84afefe6 1174Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1175{
1176 return is_uni_punct(c); /* XXX no locale support yet */
1177}
1178
4d61ec05 1179bool
84afefe6 1180Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1181{
1182 return is_uni_xdigit(c); /* XXX no locale support yet */
1183}
1184
b7ac61fa
JH
1185U32
1186Perl_to_uni_upper_lc(pTHX_ U32 c)
1187{
ee099d14
JH
1188 /* XXX returns only the first character -- do not use XXX */
1189 /* XXX no locale support yet */
1190 STRLEN len;
89ebb4a3 1191 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1192 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1193}
1194
1195U32
1196Perl_to_uni_title_lc(pTHX_ U32 c)
1197{
ee099d14
JH
1198 /* XXX returns only the first character XXX -- do not use XXX */
1199 /* XXX no locale support yet */
1200 STRLEN len;
89ebb4a3 1201 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1202 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1203}
1204
1205U32
1206Perl_to_uni_lower_lc(pTHX_ U32 c)
1207{
ee099d14
JH
1208 /* XXX returns only the first character -- do not use XXX */
1209 /* XXX no locale support yet */
1210 STRLEN len;
89ebb4a3 1211 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1212 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1213}
1214
7452cf6a 1215static bool
5141f98e 1216S_is_utf8_common(pTHX_ const U8 *const p, SV **swash,
bde6a22d
NC
1217 const char *const swashname)
1218{
97aff369 1219 dVAR;
bde6a22d
NC
1220 if (!is_utf8_char(p))
1221 return FALSE;
1222 if (!*swash)
711a919c 1223 *swash = swash_init("utf8", swashname, &PL_sv_undef, 1, 0);
bde6a22d
NC
1224 return swash_fetch(*swash, p, TRUE) != 0;
1225}
1226
1227bool
7fc63493 1228Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1229{
97aff369 1230 dVAR;
671c33bf
NC
1231 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1232 * descendant of isalnum(3), in other words, it doesn't
1233 * contain the '_'. --jhi */
d4c19fe8 1234 return is_utf8_common(p, &PL_utf8_alnum, "IsWord");
a0ed51b3
LW
1235}
1236
1237bool
7fc63493 1238Perl_is_utf8_alnumc(pTHX_ const U8 *p)
b8c5462f 1239{
97aff369 1240 dVAR;
d4c19fe8 1241 return is_utf8_common(p, &PL_utf8_alnumc, "IsAlnumC");
b8c5462f
JH
1242}
1243
1244bool
7fc63493 1245Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1246{
97aff369 1247 dVAR;
82686b01
JH
1248 if (*p == '_')
1249 return TRUE;
bde6a22d 1250 /* is_utf8_idstart would be more logical. */
d4c19fe8 1251 return is_utf8_common(p, &PL_utf8_idstart, "IdStart");
82686b01
JH
1252}
1253
1254bool
7fc63493 1255Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01 1256{
97aff369 1257 dVAR;
82686b01
JH
1258 if (*p == '_')
1259 return TRUE;
d4c19fe8 1260 return is_utf8_common(p, &PL_utf8_idcont, "IdContinue");
a0ed51b3
LW
1261}
1262
1263bool
7fc63493 1264Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1265{
97aff369 1266 dVAR;
d4c19fe8 1267 return is_utf8_common(p, &PL_utf8_alpha, "IsAlpha");
a0ed51b3
LW
1268}
1269
1270bool
7fc63493 1271Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1272{
97aff369 1273 dVAR;
d4c19fe8 1274 return is_utf8_common(p, &PL_utf8_ascii, "IsAscii");
b8c5462f
JH
1275}
1276
1277bool
7fc63493 1278Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1279{
97aff369 1280 dVAR;
d4c19fe8 1281 return is_utf8_common(p, &PL_utf8_space, "IsSpacePerl");
a0ed51b3
LW
1282}
1283
1284bool
7fc63493 1285Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1286{
97aff369 1287 dVAR;
d4c19fe8 1288 return is_utf8_common(p, &PL_utf8_digit, "IsDigit");
a0ed51b3
LW
1289}
1290
1291bool
7fc63493 1292Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1293{
97aff369 1294 dVAR;
d4c19fe8 1295 return is_utf8_common(p, &PL_utf8_upper, "IsUppercase");
a0ed51b3
LW
1296}
1297
1298bool
7fc63493 1299Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1300{
97aff369 1301 dVAR;
d4c19fe8 1302 return is_utf8_common(p, &PL_utf8_lower, "IsLowercase");
a0ed51b3
LW
1303}
1304
1305bool
7fc63493 1306Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1307{
97aff369 1308 dVAR;
d4c19fe8 1309 return is_utf8_common(p, &PL_utf8_cntrl, "IsCntrl");
b8c5462f
JH
1310}
1311
1312bool
7fc63493 1313Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1314{
97aff369 1315 dVAR;
d4c19fe8 1316 return is_utf8_common(p, &PL_utf8_graph, "IsGraph");
b8c5462f
JH
1317}
1318
1319bool
7fc63493 1320Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1321{
97aff369 1322 dVAR;
d4c19fe8 1323 return is_utf8_common(p, &PL_utf8_print, "IsPrint");
a0ed51b3
LW
1324}
1325
1326bool
7fc63493 1327Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1328{
97aff369 1329 dVAR;
d4c19fe8 1330 return is_utf8_common(p, &PL_utf8_punct, "IsPunct");
b8c5462f
JH
1331}
1332
1333bool
7fc63493 1334Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1335{
97aff369 1336 dVAR;
d4c19fe8 1337 return is_utf8_common(p, &PL_utf8_xdigit, "Isxdigit");
b8c5462f
JH
1338}
1339
1340bool
7fc63493 1341Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1342{
97aff369 1343 dVAR;
d4c19fe8 1344 return is_utf8_common(p, &PL_utf8_mark, "IsM");
a0ed51b3
LW
1345}
1346
6b5c0936
JH
1347/*
1348=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1349
1350The "p" contains the pointer to the UTF-8 string encoding
1351the character that is being converted.
1352
1353The "ustrp" is a pointer to the character buffer to put the
1354conversion result to. The "lenp" is a pointer to the length
1355of the result.
1356
0134edef 1357The "swashp" is a pointer to the swash to use.
6b5c0936 1358
0134edef 1359Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
8fe4d5b2 1360and loaded by SWASHNEW, using lib/utf8_heavy.pl. The special (usually,
0134edef 1361but not always, a multicharacter mapping), is tried first.
6b5c0936 1362
0134edef
JH
1363The "special" is a string like "utf8::ToSpecLower", which means the
1364hash %utf8::ToSpecLower. The access to the hash is through
1365Perl_to_utf8_case().
6b5c0936 1366
0134edef
JH
1367The "normal" is a string like "ToLower" which means the swash
1368%utf8::ToLower.
1369
1370=cut */
6b5c0936 1371
2104c8d9 1372UV
9a957fbc
AL
1373Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1374 SV **swashp, const char *normal, const char *special)
a0ed51b3 1375{
97aff369 1376 dVAR;
89ebb4a3 1377 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1378 STRLEN len = 0;
a0ed51b3 1379
aec46f14 1380 const UV uv0 = utf8_to_uvchr(p, NULL);
1feea2c7
JH
1381 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1382 * are necessary in EBCDIC, they are redundant no-ops
1383 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 1384 const UV uv1 = NATIVE_TO_UNI(uv0);
1feea2c7 1385 uvuni_to_utf8(tmpbuf, uv1);
0134edef
JH
1386
1387 if (!*swashp) /* load on-demand */
1388 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1389
b08cf34e
JH
1390 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1391 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1392 /* It might be "special" (sometimes, but not always,
2a37f04d 1393 * a multicharacter mapping) */
983ffd37 1394 HV *hv;
b08cf34e
JH
1395 SV **svp;
1396
1397 if ((hv = get_hv(special, FALSE)) &&
1398 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1399 (*svp)) {
cfd0369c 1400 const char *s;
47654450 1401
cfd0369c 1402 s = SvPV_const(*svp, len);
47654450
JH
1403 if (len == 1)
1404 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1405 else {
2f9475ad
JH
1406#ifdef EBCDIC
1407 /* If we have EBCDIC we need to remap the characters
1408 * since any characters in the low 256 are Unicode
1409 * code points, not EBCDIC. */
7cda7a3d 1410 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad
JH
1411
1412 d = tmpbuf;
b08cf34e 1413 if (SvUTF8(*svp)) {
2f9475ad
JH
1414 STRLEN tlen = 0;
1415
1416 while (t < tend) {
d4c19fe8 1417 const UV c = utf8_to_uvchr(t, &tlen);
2f9475ad
JH
1418 if (tlen > 0) {
1419 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1420 t += tlen;
1421 }
1422 else
1423 break;
1424 }
1425 }
1426 else {
36fec512
JH
1427 while (t < tend) {
1428 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1429 t++;
1430 }
2f9475ad
JH
1431 }
1432 len = d - tmpbuf;
1433 Copy(tmpbuf, ustrp, len, U8);
1434#else
d2dcd0fb 1435 Copy(s, ustrp, len, U8);
2f9475ad 1436#endif
29e98929 1437 }
983ffd37 1438 }
0134edef
JH
1439 }
1440
1441 if (!len && *swashp) {
d4c19fe8
AL
1442 const UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1443
0134edef
JH
1444 if (uv2) {
1445 /* It was "normal" (a single character mapping). */
d4c19fe8 1446 const UV uv3 = UNI_TO_NATIVE(uv2);
e9101d72 1447 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d
JH
1448 }
1449 }
1feea2c7 1450
0134edef
JH
1451 if (!len) /* Neither: just copy. */
1452 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1453
2a37f04d
JH
1454 if (lenp)
1455 *lenp = len;
1456
0134edef 1457 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3
LW
1458}
1459
d3e79532 1460/*
7fc63493 1461=for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1462
1463Convert the UTF-8 encoded character at p to its uppercase version and
1464store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1465that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1466the uppercase version may be longer than the original character.
d3e79532
JH
1467
1468The first character of the uppercased version is returned
1469(but note, as explained above, that there may be more.)
1470
1471=cut */
1472
2104c8d9 1473UV
7fc63493 1474Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1475{
97aff369 1476 dVAR;
983ffd37 1477 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1478 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1479}
a0ed51b3 1480
d3e79532 1481/*
7fc63493 1482=for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1483
1484Convert the UTF-8 encoded character at p to its titlecase version and
1485store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1486that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1487titlecase version may be longer than the original character.
d3e79532
JH
1488
1489The first character of the titlecased version is returned
1490(but note, as explained above, that there may be more.)
1491
1492=cut */
1493
983ffd37 1494UV
7fc63493 1495Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
983ffd37 1496{
97aff369 1497 dVAR;
983ffd37 1498 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1499 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1500}
1501
d3e79532 1502/*
7fc63493 1503=for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1504
1505Convert the UTF-8 encoded character at p to its lowercase version and
1506store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1507that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1508lowercase version may be longer than the original character.
d3e79532
JH
1509
1510The first character of the lowercased version is returned
1511(but note, as explained above, that there may be more.)
1512
1513=cut */
1514
2104c8d9 1515UV
7fc63493 1516Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1517{
97aff369 1518 dVAR;
983ffd37 1519 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1520 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1521}
1522
d3e79532 1523/*
7fc63493 1524=for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1525
1526Convert the UTF-8 encoded character at p to its foldcase version and
1527store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1528that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532
JH
1529foldcase version may be longer than the original character (up to
1530three characters).
1531
1532The first character of the foldcased version is returned
1533(but note, as explained above, that there may be more.)
1534
1535=cut */
1536
b4e400f9 1537UV
7fc63493 1538Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
b4e400f9 1539{
97aff369 1540 dVAR;
b4e400f9
JH
1541 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1542 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1543}
1544
711a919c
TS
1545/* Note:
1546 * A "swash" is a swatch hash.
1547 * A "swatch" is a bit vector generated by utf8.c:S_swash_get().
1548 * C<pkg> is a pointer to a package name for SWASHNEW, should be "utf8".
1549 * For other parameters, see utf8::SWASHNEW in lib/utf8_heavy.pl.
1550 */
a0ed51b3 1551SV*
7fc63493 1552Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1553{
27da23d5 1554 dVAR;
a0ed51b3 1555 SV* retval;
9a957fbc 1556 SV* const tokenbufsv = sv_newmortal();
8e84507e 1557 dSP;
7fc63493
AL
1558 const size_t pkg_len = strlen(pkg);
1559 const size_t name_len = strlen(name);
aec46f14 1560 HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
f8be5cf0 1561 SV* errsv_save;
ce3b816e 1562
96ca9f55
DM
1563 PUSHSTACKi(PERLSI_MAGIC);
1564 ENTER;
1565 SAVEI32(PL_hints);
1566 PL_hints = 0;
1567 save_re_context();
1b026014 1568 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1569 ENTER;
f8be5cf0 1570 errsv_save = newSVsv(ERRSV);
dc0c6abb
NC
1571 /* It is assumed that callers of this routine are not passing in any
1572 user derived data. */
1573 /* Need to do this after save_re_context() as it will set PL_tainted to
1574 1 while saving $1 etc (see the code after getrx: in Perl_magic_get).
1575 Even line to create errsv_save can turn on PL_tainted. */
1576 SAVEBOOL(PL_tainted);
1577 PL_tainted = 0;
71bed85a 1578 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
a0714e2c 1579 NULL);
f8be5cf0
JH
1580 if (!SvTRUE(ERRSV))
1581 sv_setsv(ERRSV, errsv_save);
1582 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1583 LEAVE;
1584 }
1585 SPAGAIN;
a0ed51b3
LW
1586 PUSHMARK(SP);
1587 EXTEND(SP,5);
71bed85a
NC
1588 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1589 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
a0ed51b3
LW
1590 PUSHs(listsv);
1591 PUSHs(sv_2mortal(newSViv(minbits)));
1592 PUSHs(sv_2mortal(newSViv(none)));
1593 PUTBACK;
923e4eb5 1594 if (IN_PERL_COMPILETIME) {
bf1fed83 1595 /* XXX ought to be handled by lex_start */
82686b01 1596 SAVEI32(PL_in_my);
2b4bd638 1597 PL_in_my = 0;
bf1fed83 1598 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1599 }
f8be5cf0 1600 errsv_save = newSVsv(ERRSV);
864dbfa3 1601 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1602 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1603 else
e24b16f9 1604 retval = &PL_sv_undef;
f8be5cf0
JH
1605 if (!SvTRUE(ERRSV))
1606 sv_setsv(ERRSV, errsv_save);
1607 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1608 LEAVE;
1609 POPSTACK;
923e4eb5 1610 if (IN_PERL_COMPILETIME) {
bf1fed83 1611 STRLEN len;
aec46f14 1612 const char* const pv = SvPV_const(tokenbufsv, len);
bf1fed83
JH
1613
1614 Copy(pv, PL_tokenbuf, len+1, char);
623e6609 1615 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 1616 }
bc45ce41
JH
1617 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1618 if (SvPOK(retval))
35c1215d 1619 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
95b63a38 1620 (void*)retval);
cea2e8a9 1621 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1622 }
a0ed51b3
LW
1623 return retval;
1624}
1625
035d37be
JH
1626
1627/* This API is wrong for special case conversions since we may need to
1628 * return several Unicode characters for a single Unicode character
1629 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1630 * the lower-level routine, and it is similarly broken for returning
1631 * multiple values. --jhi */
979f2922 1632/* Now SWASHGET is recasted into S_swash_get in this file. */
680c470c
TS
1633
1634/* Note:
1635 * Returns the value of property/mapping C<swash> for the first character
1636 * of the string C<ptr>. If C<do_utf8> is true, the string C<ptr> is
1637 * assumed to be in utf8. If C<do_utf8> is false, the string C<ptr> is
1638 * assumed to be in native 8-bit encoding. Caches the swatch in C<swash>.
1639 */
a0ed51b3 1640UV
680c470c 1641Perl_swash_fetch(pTHX_ SV *swash, const U8 *ptr, bool do_utf8)
a0ed51b3 1642{
27da23d5 1643 dVAR;
680c470c 1644 HV* const hv = (HV*)SvRV(swash);
3568d838
JH
1645 U32 klen;
1646 U32 off;
a0ed51b3 1647 STRLEN slen;
7d85a32c 1648 STRLEN needents;
cfd0369c 1649 const U8 *tmps = NULL;
a0ed51b3 1650 U32 bit;
979f2922 1651 SV *swatch;
3568d838
JH
1652 U8 tmputf8[2];
1653 UV c = NATIVE_TO_ASCII(*ptr);
1654
1655 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
979f2922
TS
1656 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1657 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
1658 ptr = tmputf8;
3568d838
JH
1659 }
1660 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1661 * then the "swatch" is a vec() for al the chars which start
1662 * with 0xAA..0xYY
1663 * So the key in the hash (klen) is length of encoded char -1
1664 */
1665 klen = UTF8SKIP(ptr) - 1;
1666 off = ptr[klen];
a0ed51b3 1667
979f2922 1668 if (klen == 0) {
7d85a32c 1669 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1670 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c 1671 */
979f2922
TS
1672 needents = UTF_CONTINUATION_MARK;
1673 off = NATIVE_TO_UTF(ptr[klen]);
1674 }
1675 else {
7d85a32c 1676 /* If char is encoded then swatch is for the prefix */
979f2922
TS
1677 needents = (1 << UTF_ACCUMULATION_SHIFT);
1678 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1679 }
7d85a32c 1680
a0ed51b3
LW
1681 /*
1682 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1683 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1684 * it's nothing to sniff at.) Pity we usually come through at least
1685 * two function calls to get here...
1686 *
1687 * NB: this code assumes that swatches are never modified, once generated!
1688 */
1689
3568d838 1690 if (hv == PL_last_swash_hv &&
a0ed51b3 1691 klen == PL_last_swash_klen &&
27da23d5 1692 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1693 {
1694 tmps = PL_last_swash_tmps;
1695 slen = PL_last_swash_slen;
1696 }
1697 else {
1698 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 1699 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3 1700
979f2922
TS
1701 /* If not cached, generate it via swash_get */
1702 if (!svp || !SvPOK(*svp)
1703 || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
2b9d42f0
NIS
1704 /* We use utf8n_to_uvuni() as we want an index into
1705 Unicode tables, not a native character number.
1706 */
aec46f14 1707 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae
JH
1708 ckWARN(WARN_UTF8) ?
1709 0 : UTF8_ALLOW_ANY);
680c470c 1710 swatch = swash_get(swash,
979f2922
TS
1711 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
1712 (klen) ? (code_point & ~(needents - 1)) : 0,
1713 needents);
1714
923e4eb5 1715 if (IN_PERL_COMPILETIME)
623e6609 1716 CopHINTS_set(PL_curcop, PL_hints);
a0ed51b3 1717
979f2922 1718 svp = hv_store(hv, (const char *)ptr, klen, swatch, 0);
a0ed51b3 1719
979f2922
TS
1720 if (!svp || !(tmps = (U8*)SvPV(*svp, slen))
1721 || (slen << 3) < needents)
660a4616 1722 Perl_croak(aTHX_ "panic: swash_fetch got improper swatch");
a0ed51b3
LW
1723 }
1724
1725 PL_last_swash_hv = hv;
1726 PL_last_swash_klen = klen;
cfd0369c
NC
1727 /* FIXME change interpvar.h? */
1728 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
1729 PL_last_swash_slen = slen;
1730 if (klen)
1731 Copy(ptr, PL_last_swash_key, klen, U8);
1732 }
1733
9faf8d75 1734 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1735 case 1:
1736 bit = 1 << (off & 7);
1737 off >>= 3;
1738 return (tmps[off] & bit) != 0;
1739 case 8:
1740 return tmps[off];
1741 case 16:
1742 off <<= 1;
1743 return (tmps[off] << 8) + tmps[off + 1] ;
1744 case 32:
1745 off <<= 2;
1746 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1747 }
660a4616 1748 Perl_croak(aTHX_ "panic: swash_fetch got swatch of unexpected bit width");
a0ed51b3 1749}
2b9d42f0 1750
979f2922
TS
1751/* Note:
1752 * Returns a swatch (a bit vector string) for a code point sequence
1753 * that starts from the value C<start> and comprises the number C<span>.
1754 * A C<swash> must be an object created by SWASHNEW (see lib/utf8_heavy.pl).
1755 * Should be used via swash_fetch, which will cache the swatch in C<swash>.
1756 */
1757STATIC SV*
1758S_swash_get(pTHX_ SV* swash, UV start, UV span)
1759{
1760 SV *swatch;
711a919c 1761 U8 *l, *lend, *x, *xend, *s;
979f2922
TS
1762 STRLEN lcur, xcur, scur;
1763
1764 HV* const hv = (HV*)SvRV(swash);
017a3ce5
GA
1765 SV** const listsvp = hv_fetchs(hv, "LIST", FALSE);
1766 SV** const typesvp = hv_fetchs(hv, "TYPE", FALSE);
1767 SV** const bitssvp = hv_fetchs(hv, "BITS", FALSE);
1768 SV** const nonesvp = hv_fetchs(hv, "NONE", FALSE);
1769 SV** const extssvp = hv_fetchs(hv, "EXTRAS", FALSE);
0bd48802
AL
1770 const U8* const typestr = (U8*)SvPV_nolen(*typesvp);
1771 const int typeto = typestr[0] == 'T' && typestr[1] == 'o';
1772 const STRLEN bits = SvUV(*bitssvp);
1773 const STRLEN octets = bits >> 3; /* if bits == 1, then octets == 0 */
1774 const UV none = SvUV(*nonesvp);
1775 const UV end = start + span;
979f2922
TS
1776
1777 if (bits != 1 && bits != 8 && bits != 16 && bits != 32) {
660a4616
TS
1778 Perl_croak(aTHX_ "panic: swash_get doesn't expect bits %"UVuf,
1779 (UV)bits);
979f2922
TS
1780 }
1781
1782 /* create and initialize $swatch */
396482e1 1783 swatch = newSVpvs("");
979f2922
TS
1784 scur = octets ? (span * octets) : (span + 7) / 8;
1785 SvGROW(swatch, scur + 1);
1786 s = (U8*)SvPVX(swatch);
1787 if (octets && none) {
0bd48802 1788 const U8* const e = s + scur;
979f2922
TS
1789 while (s < e) {
1790 if (bits == 8)
1791 *s++ = (U8)(none & 0xff);
1792 else if (bits == 16) {
1793 *s++ = (U8)((none >> 8) & 0xff);
1794 *s++ = (U8)( none & 0xff);
1795 }
1796 else if (bits == 32) {
1797 *s++ = (U8)((none >> 24) & 0xff);
1798 *s++ = (U8)((none >> 16) & 0xff);
1799 *s++ = (U8)((none >> 8) & 0xff);
1800 *s++ = (U8)( none & 0xff);
1801 }
1802 }
1803 *s = '\0';
1804 }
1805 else {
1806 (void)memzero((U8*)s, scur + 1);
1807 }
1808 SvCUR_set(swatch, scur);
1809 s = (U8*)SvPVX(swatch);
1810
1811 /* read $swash->{LIST} */
1812 l = (U8*)SvPV(*listsvp, lcur);
1813 lend = l + lcur;
1814 while (l < lend) {
1815 UV min, max, val, key;
1816 STRLEN numlen;
1817 I32 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1818
0bd48802 1819 U8* const nl = (U8*)memchr(l, '\n', lend - l);
979f2922
TS
1820
1821 numlen = lend - l;
1822 min = grok_hex((char *)l, &numlen, &flags, NULL);
1823 if (numlen)
1824 l += numlen;
1825 else if (nl) {
1826 l = nl + 1; /* 1 is length of "\n" */
1827 continue;
1828 }
1829 else {
1830 l = lend; /* to LIST's end at which \n is not found */
1831 break;
1832 }
1833
1834 if (isBLANK(*l)) {
1835 ++l;
1836 flags = PERL_SCAN_SILENT_ILLDIGIT | PERL_SCAN_DISALLOW_PREFIX;
1837 numlen = lend - l;
1838 max = grok_hex((char *)l, &numlen, &flags, NULL);
1839 if (numlen)
1840 l += numlen;
1841 else
1842 max = min;
1843
1844 if (octets) {
1845 if (isBLANK(*l)) {
1846 ++l;
1847 flags = PERL_SCAN_SILENT_ILLDIGIT |
1848 PERL_SCAN_DISALLOW_PREFIX;
1849 numlen = lend - l;
1850 val = grok_hex((char *)l, &numlen, &flags, NULL);
1851 if (numlen)
1852 l += numlen;
1853 else
1854 val = 0;
1855 }
1856 else {
1857 val = 0;
1858 if (typeto) {
1859 Perl_croak(aTHX_ "%s: illegal mapping '%s'",
1860 typestr, l);
1861 }
1862 }
1863 }
711a919c
TS
1864 else
1865 val = 0; /* bits == 1, then val should be ignored */
979f2922
TS
1866 }
1867 else {
1868 max = min;
1869 if (octets) {
1870 val = 0;
1871 if (typeto) {
1872 Perl_croak(aTHX_ "%s: illegal mapping '%s'", typestr, l);
1873 }
1874 }
711a919c
TS
1875 else
1876 val = 0; /* bits == 1, then val should be ignored */
979f2922
TS
1877 }
1878
1879 if (nl)
1880 l = nl + 1;
1881 else
1882 l = lend;
1883
1884 if (max < start)
1885 continue;
1886
1887 if (octets) {
1888 if (min < start) {
1889 if (!none || val < none) {
1890 val += start - min;
1891 }
1892 min = start;
1893 }
1894 for (key = min; key <= max; key++) {
1895 STRLEN offset;
1896 if (key >= end)
1897 goto go_out_list;
1898 /* offset must be non-negative (start <= min <= key < end) */
1899 offset = octets * (key - start);
1900 if (bits == 8)
1901 s[offset] = (U8)(val & 0xff);
1902 else if (bits == 16) {
1903 s[offset ] = (U8)((val >> 8) & 0xff);
1904 s[offset + 1] = (U8)( val & 0xff);
1905 }
1906 else if (bits == 32) {
1907 s[offset ] = (U8)((val >> 24) & 0xff);
1908 s[offset + 1] = (U8)((val >> 16) & 0xff);
1909 s[offset + 2] = (U8)((val >> 8) & 0xff);
1910 s[offset + 3] = (U8)( val & 0xff);
1911 }
1912
1913 if (!none || val < none)
1914 ++val;
1915 }
1916 }
711a919c 1917 else { /* bits == 1, then val should be ignored */
979f2922
TS
1918 if (min < start)
1919 min = start;
1920 for (key = min; key <= max; key++) {
0bd48802 1921 const STRLEN offset = (STRLEN)(key - start);
979f2922
TS
1922 if (key >= end)
1923 goto go_out_list;
1924 s[offset >> 3] |= 1 << (offset & 7);
1925 }
1926 }
1927 } /* while */
1928 go_out_list:
1929
1930 /* read $swash->{EXTRAS} */
1931 x = (U8*)SvPV(*extssvp, xcur);
1932 xend = x + xcur;
1933 while (x < xend) {
1934 STRLEN namelen;
1935 U8 *namestr;
1936 SV** othersvp;
1937 HV* otherhv;
1938 STRLEN otherbits;
1939 SV **otherbitssvp, *other;
711a919c 1940 U8 *s, *o, *nl;
979f2922
TS
1941 STRLEN slen, olen;
1942
1943 U8 opc = *x++;
1944 if (opc == '\n')
1945 continue;
1946
1947 nl = (U8*)memchr(x, '\n', xend - x);
1948
1949 if (opc != '-' && opc != '+' && opc != '!' && opc != '&') {
1950 if (nl) {
1951 x = nl + 1; /* 1 is length of "\n" */
1952 continue;
1953 }
1954 else {
1955 x = xend; /* to EXTRAS' end at which \n is not found */
1956 break;
1957 }
1958 }
1959
1960 namestr = x;
1961 if (nl) {
1962 namelen = nl - namestr;
1963 x = nl + 1;
1964 }
1965 else {
1966 namelen = xend - namestr;
1967 x = xend;
1968 }
1969
1970 othersvp = hv_fetch(hv, (char *)namestr, namelen, FALSE);
660a4616 1971 otherhv = (HV*)SvRV(*othersvp);
017a3ce5 1972 otherbitssvp = hv_fetchs(otherhv, "BITS", FALSE);
979f2922
TS
1973 otherbits = (STRLEN)SvUV(*otherbitssvp);
1974 if (bits < otherbits)
660a4616 1975 Perl_croak(aTHX_ "panic: swash_get found swatch size mismatch");
979f2922
TS
1976
1977 /* The "other" swatch must be destroyed after. */
1978 other = swash_get(*othersvp, start, span);
1979 o = (U8*)SvPV(other, olen);
1980
1981 if (!olen)
660a4616 1982 Perl_croak(aTHX_ "panic: swash_get got improper swatch");
979f2922
TS
1983
1984 s = (U8*)SvPV(swatch, slen);
1985 if (bits == 1 && otherbits == 1) {
1986 if (slen != olen)
660a4616 1987 Perl_croak(aTHX_ "panic: swash_get found swatch length mismatch");
979f2922
TS
1988
1989 switch (opc) {
1990 case '+':
1991 while (slen--)
1992 *s++ |= *o++;
1993 break;
1994 case '!':
1995 while (slen--)
1996 *s++ |= ~*o++;
1997 break;
1998 case '-':
1999 while (slen--)
2000 *s++ &= ~*o++;
2001 break;
2002 case '&':
2003 while (slen--)
2004 *s++ &= *o++;
2005 break;
2006 default:
2007 break;
2008 }
2009 }
711a919c 2010 else {
979f2922
TS
2011 STRLEN otheroctets = otherbits >> 3;
2012 STRLEN offset = 0;
2013 U8* send = s + slen;
2014
2015 while (s < send) {
2016 UV otherval = 0;
2017
2018 if (otherbits == 1) {
2019 otherval = (o[offset >> 3] >> (offset & 7)) & 1;
2020 ++offset;
2021 }
2022 else {
2023 STRLEN vlen = otheroctets;
2024 otherval = *o++;
2025 while (--vlen) {
2026 otherval <<= 8;
2027 otherval |= *o++;
2028 }
2029 }
2030
711a919c 2031 if (opc == '+' && otherval)
6f207bd3 2032 NOOP; /* replace with otherval */
979f2922
TS
2033 else if (opc == '!' && !otherval)
2034 otherval = 1;
2035 else if (opc == '-' && otherval)
2036 otherval = 0;
2037 else if (opc == '&' && !otherval)
2038 otherval = 0;
2039 else {
711a919c 2040 s += octets; /* no replacement */
979f2922
TS
2041 continue;
2042 }
2043
2044 if (bits == 8)
2045 *s++ = (U8)( otherval & 0xff);
2046 else if (bits == 16) {
2047 *s++ = (U8)((otherval >> 8) & 0xff);
2048 *s++ = (U8)( otherval & 0xff);
2049 }
2050 else if (bits == 32) {
2051 *s++ = (U8)((otherval >> 24) & 0xff);
2052 *s++ = (U8)((otherval >> 16) & 0xff);
2053 *s++ = (U8)((otherval >> 8) & 0xff);
2054 *s++ = (U8)( otherval & 0xff);
2055 }
2056 }
2057 }
2058 sv_free(other); /* through with it! */
2059 } /* while */
2060 return swatch;
2061}
2062
0f830e0b
NC
2063/*
2064=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2065
2066Adds the UTF-8 representation of the Native codepoint C<uv> to the end
2067of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2068bytes available. The return value is the pointer to the byte after the
2069end of the new character. In other words,
2070
2071 d = uvchr_to_utf8(d, uv);
2072
2073is the recommended wide native character-aware way of saying
2074
2075 *(d++) = uv;
2076
2077=cut
2078*/
2079
2080/* On ASCII machines this is normally a macro but we want a
2081 real function in case XS code wants it
2082*/
0f830e0b
NC
2083U8 *
2084Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
2085{
2086 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2087}
2088
b851fbc1
JH
2089U8 *
2090Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
2091{
2092 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
2093}
2b9d42f0
NIS
2094
2095/*
0f830e0b
NC
2096=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32
2097flags
2098
2099Returns the native character value of the first character in the string
2100C<s>
2101which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2102length, in bytes, of that character.
2103
2104Allows length and flags to be passed to low level routine.
2105
2106=cut
2107*/
2108/* On ASCII machines this is normally a macro but we want
2109 a real function in case XS code wants it
2110*/
0f830e0b
NC
2111UV
2112Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen,
2113U32 flags)
2114{
2115 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2116 return UNI_TO_NATIVE(uv);
2117}
2118
2119/*
d2cc3551
JH
2120=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
2121
2122Build to the scalar dsv a displayable version of the string spv,
2123length len, the displayable version being at most pvlim bytes long
2124(if longer, the rest is truncated and "..." will be appended).
0a2ef054 2125
9e55ce06 2126The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 2127isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054
JH
2128to display the \\[nrfta\\] as the backslashed versions (like '\n')
2129(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
2130UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
2131UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
2132
d2cc3551
JH
2133The pointer to the PV of the dsv is returned.
2134
2135=cut */
e6b2e755 2136char *
e1ec3a88 2137Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755
JH
2138{
2139 int truncated = 0;
e1ec3a88 2140 const char *s, *e;
e6b2e755
JH
2141
2142 sv_setpvn(dsv, "", 0);
e1ec3a88 2143 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 2144 UV u;
a49f32c6
NC
2145 /* This serves double duty as a flag and a character to print after
2146 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
2147 */
2148 char ok = 0;
c728cb41 2149
e6b2e755
JH
2150 if (pvlim && SvCUR(dsv) >= pvlim) {
2151 truncated++;
2152 break;
2153 }
2154 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 2155 if (u < 256) {
a3b680e6 2156 const unsigned char c = (unsigned char)u & 0xFF;
0bd48802 2157 if (flags & UNI_DISPLAY_BACKSLASH) {
a49f32c6 2158 switch (c) {
c728cb41 2159 case '\n':
a49f32c6 2160 ok = 'n'; break;
c728cb41 2161 case '\r':
a49f32c6 2162 ok = 'r'; break;
c728cb41 2163 case '\t':
a49f32c6 2164 ok = 't'; break;
c728cb41 2165 case '\f':
a49f32c6 2166 ok = 'f'; break;
c728cb41 2167 case '\a':
a49f32c6 2168 ok = 'a'; break;
c728cb41 2169 case '\\':
a49f32c6 2170 ok = '\\'; break;
c728cb41
JH
2171 default: break;
2172 }
a49f32c6
NC
2173 if (ok) {
2174 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
2175 }
c728cb41 2176 }
00e86452 2177 /* isPRINT() is the locale-blind version. */
a49f32c6
NC
2178 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
2179 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
2180 ok = 1;
0a2ef054 2181 }
c728cb41
JH
2182 }
2183 if (!ok)
9e55ce06 2184 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755
JH
2185 }
2186 if (truncated)
396482e1 2187 sv_catpvs(dsv, "...");
e6b2e755
JH
2188
2189 return SvPVX(dsv);
2190}
2b9d42f0 2191
d2cc3551
JH
2192/*
2193=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
2194
2195Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 2196the displayable version being at most pvlim bytes long
d2cc3551 2197(if longer, the rest is truncated and "..." will be appended).
0a2ef054
JH
2198
2199The flags argument is as in pv_uni_display().
2200
d2cc3551
JH
2201The pointer to the PV of the dsv is returned.
2202
d4c19fe8
AL
2203=cut
2204*/
e6b2e755
JH
2205char *
2206Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
2207{
cfd0369c
NC
2208 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
2209 SvCUR(ssv), pvlim, flags);
701a277b
JH
2210}
2211
d2cc3551 2212/*
d07ddd77 2213=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551
JH
2214
2215Return true if the strings s1 and s2 differ case-insensitively, false
2216if not (if they are equal case-insensitively). If u1 is true, the
2217string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77
JH
2218the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
2219are false, the respective string is assumed to be in native 8-bit
2220encoding.
2221
2222If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
2223in there (they will point at the beginning of the I<next> character).
2224If the pointers behind pe1 or pe2 are non-NULL, they are the end
2225pointers beyond which scanning will not continue under any
4cdaeff7 2226circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77
JH
2227s2+l2 will be used as goal end pointers that will also stop the scan,
2228and which qualify towards defining a successful match: all the scans
2229that define an explicit length must reach their goal pointers for
2230a match to succeed).
d2cc3551
JH
2231
2232For case-insensitiveness, the "casefolding" of Unicode is used
2233instead of upper/lowercasing both the characters, see
2234http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2235
2236=cut */
701a277b 2237I32
d07ddd77 2238Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 2239{
97aff369 2240 dVAR;
e1ec3a88
AL
2241 register const U8 *p1 = (const U8*)s1;
2242 register const U8 *p2 = (const U8*)s2;
cbbf8932 2243 register const U8 *f1 = NULL;
2f73348c 2244 register const U8 *f2 = NULL;
cbbf8932
AL
2245 register U8 *e1 = NULL;
2246 register U8 *q1 = NULL;
2247 register U8 *e2 = NULL;
2248 register U8 *q2 = NULL;
d07ddd77 2249 STRLEN n1 = 0, n2 = 0;
89ebb4a3
JH
2250 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2251 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8
JH
2252 U8 natbuf[1+1];
2253 STRLEN foldlen1, foldlen2;
d07ddd77 2254 bool match;
332ddc25 2255
d07ddd77
JH
2256 if (pe1)
2257 e1 = *(U8**)pe1;
e1ec3a88
AL
2258 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2259 f1 = (const U8*)s1 + l1;
d07ddd77
JH
2260 if (pe2)
2261 e2 = *(U8**)pe2;
e1ec3a88
AL
2262 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2263 f2 = (const U8*)s2 + l2;
d07ddd77
JH
2264
2265 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2266 return 1; /* mismatch; possible infinite loop or false positive */
2267
a6872d42
JH
2268 if (!u1 || !u2)
2269 natbuf[1] = 0; /* Need to terminate the buffer. */
2270
d07ddd77
JH
2271 while ((e1 == 0 || p1 < e1) &&
2272 (f1 == 0 || p1 < f1) &&
2273 (e2 == 0 || p2 < e2) &&
2274 (f2 == 0 || p2 < f2)) {
2275 if (n1 == 0) {
d7f013c8
JH
2276 if (u1)
2277 to_utf8_fold(p1, foldbuf1, &foldlen1);
2278 else {
809e8e66 2279 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
d7f013c8
JH
2280 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2281 }
2282 q1 = foldbuf1;
d07ddd77 2283 n1 = foldlen1;
332ddc25 2284 }
d07ddd77 2285 if (n2 == 0) {
d7f013c8
JH
2286 if (u2)
2287 to_utf8_fold(p2, foldbuf2, &foldlen2);
2288 else {
809e8e66 2289 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
d7f013c8
JH
2290 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2291 }
2292 q2 = foldbuf2;
d07ddd77 2293 n2 = foldlen2;
332ddc25 2294 }
d07ddd77
JH
2295 while (n1 && n2) {
2296 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2297 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2298 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 2299 return 1; /* mismatch */
d07ddd77 2300 n1 -= UTF8SKIP(q1);
d7f013c8 2301 q1 += UTF8SKIP(q1);
d07ddd77 2302 n2 -= UTF8SKIP(q2);
d7f013c8 2303 q2 += UTF8SKIP(q2);
701a277b 2304 }
d07ddd77 2305 if (n1 == 0)
d7f013c8 2306 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2307 if (n2 == 0)
d7f013c8
JH
2308 p2 += u2 ? UTF8SKIP(p2) : 1;
2309
d2cc3551 2310 }
5469e704 2311
d07ddd77
JH
2312 /* A match is defined by all the scans that specified
2313 * an explicit length reaching their final goals. */
2314 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704
JH
2315
2316 if (match) {
d07ddd77
JH
2317 if (pe1)
2318 *pe1 = (char*)p1;
2319 if (pe2)
2320 *pe2 = (char*)p2;
5469e704
JH
2321 }
2322
2323 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2324}
701a277b 2325
a49f32c6
NC
2326/*
2327 * Local variables:
2328 * c-indentation-style: bsd
2329 * c-basic-offset: 4
2330 * indent-tabs-mode: t
2331 * End:
2332 *
37442d52
RGS
2333 * ex: set ts=8 sts=4 sw=4 noet:
2334 */