This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Integrate:
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
31ab2e0d
NC
3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
4 * others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
19 *
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
22 */
23
24#include "EXTERN.h"
864dbfa3 25#define PERL_IN_UTF8_C
a0ed51b3
LW
26#include "perl.h"
27
fe20fd30
JH
28static const char unees[] =
29 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 30
ccfc67b7
JH
31/*
32=head1 Unicode Support
a0ed51b3 33
40d34c0d
SB
34This file contains various utility functions for manipulating UTF8-encoded
35strings. For the uninitiated, this is a method of representing arbitrary
36Unicode characters as a variable number of bytes, in such a way that
37characters in the ASCII range are unmodified, and a zero byte never appears
38within non-zero characters.
39
b851fbc1 40=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 41
cd458e05 42Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
a2a469f9 43of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 44bytes available. The return value is the pointer to the byte after the
9041c2e3 45end of the new character. In other words,
eebe1485 46
b851fbc1
JH
47 d = uvuni_to_utf8_flags(d, uv, flags);
48
49or, in most cases,
50
9041c2e3 51 d = uvuni_to_utf8(d, uv);
eebe1485 52
b851fbc1
JH
53(which is equivalent to)
54
55 d = uvuni_to_utf8_flags(d, uv, 0);
56
eebe1485
SC
57is the recommended Unicode-aware way of saying
58
59 *(d++) = uv;
60
61=cut
62*/
63
dfe13c55 64U8 *
b851fbc1 65Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 66{
62961d2e 67 if (ckWARN(WARN_UTF8)) {
b851fbc1
JH
68 if (UNICODE_IS_SURROGATE(uv) &&
69 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 70 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1
JH
71 else if (
72 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
73 !(flags & UNICODE_ALLOW_FDD0))
74 ||
c867b360 75 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1
JH
76 !(flags & UNICODE_ALLOW_FFFF))) &&
77 /* UNICODE_ALLOW_SUPER includes
2a20b9da 78 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1
JH
79 ((uv <= PERL_UNICODE_MAX) ||
80 !(flags & UNICODE_ALLOW_SUPER))
81 )
9014280d 82 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800
JH
83 "Unicode character 0x%04"UVxf" is illegal", uv);
84 }
c4d5f83a 85 if (UNI_IS_INVARIANT(uv)) {
eb160463 86 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3
LW
87 return d;
88 }
2d331972 89#if defined(EBCDIC)
1d72bdf6
NIS
90 else {
91 STRLEN len = UNISKIP(uv);
92 U8 *p = d+len-1;
93 while (p > d) {
eb160463 94 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
95 uv >>= UTF_ACCUMULATION_SHIFT;
96 }
eb160463 97 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
98 return d+len;
99 }
100#else /* Non loop style */
a0ed51b3 101 if (uv < 0x800) {
eb160463
GS
102 *d++ = (U8)(( uv >> 6) | 0xc0);
103 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
104 return d;
105 }
106 if (uv < 0x10000) {
eb160463
GS
107 *d++ = (U8)(( uv >> 12) | 0xe0);
108 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
110 return d;
111 }
112 if (uv < 0x200000) {
eb160463
GS
113 *d++ = (U8)(( uv >> 18) | 0xf0);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
117 return d;
118 }
119 if (uv < 0x4000000) {
eb160463
GS
120 *d++ = (U8)(( uv >> 24) | 0xf8);
121 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
124 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
125 return d;
126 }
127 if (uv < 0x80000000) {
eb160463
GS
128 *d++ = (U8)(( uv >> 30) | 0xfc);
129 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
133 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
134 return d;
135 }
6b8eaf93 136#ifdef HAS_QUAD
d7578b48 137 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
138#endif
139 {
eb160463
GS
140 *d++ = 0xfe; /* Can't match U+FEFF! */
141 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
142 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
143 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
144 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
145 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
146 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
147 return d;
148 }
6b8eaf93 149#ifdef HAS_QUAD
a0ed51b3 150 {
eb160463
GS
151 *d++ = 0xff; /* Can't match U+FFFE! */
152 *d++ = 0x80; /* 6 Reserved bits */
153 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
154 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
155 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
156 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
157 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
163 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
164 return d;
165 }
166#endif
1d72bdf6 167#endif /* Loop style */
a0ed51b3 168}
b851fbc1
JH
169
170U8 *
171Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
172{
173 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
174}
9041c2e3 175
9041c2e3 176/*
eebe1485 177
5da9da9e 178Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01
JH
179character. Note that an INVARIANT (i.e. ASCII) character is a valid
180UTF-8 character. The actual number of bytes in the UTF-8 character
181will be returned if it is valid, otherwise 0.
9041c2e3 182
d7a71dc3
NC
183This is the "slow" version as opposed to the "fast" version which is
184the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
185difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
186or less you should use the IS_UTF8_CHAR(), for lengths of five or more
187you should use the _slow(). In practice this means that the _slow()
188will be used very rarely, since the maximum Unicode code point (as of
189Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
190the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
191five bytes or more.
192
82686b01 193=cut */
767c9e37 194STATIC STRLEN
d7a71dc3 195S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
386d01d6
GS
196{
197 U8 u = *s;
d7a71dc3 198 STRLEN slen;
067a85ef 199 UV uv, ouv;
386d01d6 200
1d72bdf6 201 if (UTF8_IS_INVARIANT(u))
386d01d6
GS
202 return 1;
203
60006e79 204 if (!UTF8_IS_START(u))
386d01d6
GS
205 return 0;
206
60006e79 207 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
067a85ef
A
208 return 0;
209
386d01d6
GS
210 slen = len - 1;
211 s++;
6eb6869e 212 u &= UTF_START_MASK(len);
1d72bdf6 213 uv = u;
067a85ef 214 ouv = uv;
386d01d6 215 while (slen--) {
60006e79 216 if (!UTF8_IS_CONTINUATION(*s))
386d01d6 217 return 0;
8850bf83 218 uv = UTF8_ACCUMULATE(uv, *s);
209a9bc1 219 if (uv < ouv)
067a85ef
A
220 return 0;
221 ouv = uv;
386d01d6
GS
222 s++;
223 }
067a85ef 224
eb160463 225 if ((STRLEN)UNISKIP(uv) < len)
067a85ef
A
226 return 0;
227
386d01d6
GS
228 return len;
229}
230
6662521e 231/*
d7a71dc3
NC
232=for apidoc A|STRLEN|is_utf8_char|U8 *s
233
234Tests if some arbitrary number of bytes begins in a valid UTF-8
235character. Note that an INVARIANT (i.e. ASCII) character is a valid
236UTF-8 character. The actual number of bytes in the UTF-8 character
237will be returned if it is valid, otherwise 0.
238
239=cut */
240STRLEN
241Perl_is_utf8_char(pTHX_ U8 *s)
242{
243 STRLEN len = UTF8SKIP(s);
244#ifdef IS_UTF8_CHAR
245 if (IS_UTF8_CHAR_FAST(len))
246 return IS_UTF8_CHAR(s, len) ? len : 0;
247#endif /* #ifdef IS_UTF8_CHAR */
248 return is_utf8_char_slow(s, len);
249}
250
251/*
eebe1485 252=for apidoc A|bool|is_utf8_string|U8 *s|STRLEN len
6662521e 253
5835a535 254Returns true if first C<len> bytes of the given string form a valid
cd458e05
JH
255UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
256not mean 'a string that contains code points above 0x7F encoded in UTF-8'
257because a valid ASCII string is a valid UTF-8 string.
6662521e 258
d7a71dc3
NC
259See also is_utf8_string_loclen() and is_utf8_string_loc().
260
6662521e
GS
261=cut
262*/
263
8e84507e 264bool
6662521e
GS
265Perl_is_utf8_string(pTHX_ U8 *s, STRLEN len)
266{
1f49be52
AL
267 const U8* x = s;
268 const U8* send;
067a85ef 269
d6e8a192 270 if (!len && s)
c05e0e2f 271 len = strlen((const char *)s);
1aa99e6b
IH
272 send = s + len;
273
6662521e 274 while (x < send) {
8c18bf38 275 STRLEN c;
1acdb0da
JH
276 /* Inline the easy bits of is_utf8_char() here for speed... */
277 if (UTF8_IS_INVARIANT(*x))
278 c = 1;
279 else if (!UTF8_IS_START(*x))
d7a71dc3 280 goto out;
1acdb0da
JH
281 else {
282 /* ... and call is_utf8_char() only if really needed. */
d7a71dc3
NC
283#ifdef IS_UTF8_CHAR
284 c = UTF8SKIP(x);
285 if (IS_UTF8_CHAR_FAST(c)) {
286 if (!IS_UTF8_CHAR(x, c))
287 goto out;
288 } else if (!is_utf8_char_slow(x, c))
289 goto out;
290#else
291 c = is_utf8_char(x);
292#endif /* #ifdef IS_UTF8_CHAR */
1acdb0da 293 if (!c)
d7a71dc3 294 goto out;
1acdb0da 295 }
6662521e 296 x += c;
6662521e 297 }
d7a71dc3
NC
298
299 out:
60006e79
JH
300 if (x != send)
301 return FALSE;
067a85ef
A
302
303 return TRUE;
6662521e
GS
304}
305
67e989fb 306/*
d7a71dc3 307=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
9c20fa4a 308
d7a71dc3
NC
309Like is_ut8_string() but stores the location of the failure (in the
310case of "utf8ness failure") or the location s+len (in the case of
311"utf8ness success") in the C<ep>, and the number of UTF-8
312encoded characters in the C<el>.
313
314See also is_utf8_string_loc() and is_utf8_string().
9c20fa4a
JH
315
316=cut
317*/
318
319bool
d7a71dc3 320Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
9c20fa4a 321{
1f49be52
AL
322 const U8* x = s;
323 const U8* send;
9c20fa4a
JH
324 STRLEN c;
325
d6e8a192 326 if (!len && s)
c05e0e2f 327 len = strlen((const char *)s);
9c20fa4a 328 send = s + len;
d7a71dc3
NC
329 if (el)
330 *el = 0;
9c20fa4a
JH
331
332 while (x < send) {
333 /* Inline the easy bits of is_utf8_char() here for speed... */
334 if (UTF8_IS_INVARIANT(*x))
d7a71dc3
NC
335 c = 1;
336 else if (!UTF8_IS_START(*x))
337 goto out;
9c20fa4a 338 else {
d7a71dc3
NC
339 /* ... and call is_utf8_char() only if really needed. */
340#ifdef IS_UTF8_CHAR
341 c = UTF8SKIP(x);
342 if (IS_UTF8_CHAR_FAST(c)) {
343 if (!IS_UTF8_CHAR(x, c))
344 c = 0;
345 } else
346 c = is_utf8_char_slow(x, c);
347#else
348 c = is_utf8_char(x);
349#endif /* #ifdef IS_UTF8_CHAR */
350 if (!c)
351 goto out;
9c20fa4a 352 }
d7a71dc3
NC
353 x += c;
354 if (el)
355 (*el)++;
9c20fa4a 356 }
d7a71dc3
NC
357
358 out:
359 if (ep)
360 *ep = x;
361 if (x != send)
9c20fa4a 362 return FALSE;
9c20fa4a
JH
363
364 return TRUE;
365}
366
367/*
d7a71dc3
NC
368=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
369
370Like is_ut8_string() but stores the location of the failure (in the
371case of "utf8ness failure") or the location s+len (in the case of
372"utf8ness success") in the C<ep>.
373
374See also is_utf8_string_loclen() and is_utf8_string().
375
376=cut
377*/
378
379bool
380Perl_is_utf8_string_loc(pTHX_ U8 *s, STRLEN len, U8 **ep)
381{
382 return is_utf8_string_loclen(s, len, (const U8 **)ep, 0);
383}
384
385/*
9041c2e3 386=for apidoc A|UV|utf8n_to_uvuni|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 387
9041c2e3
NIS
388Bottom level UTF-8 decode routine.
389Returns the unicode code point value of the first character in the string C<s>
cd458e05 390which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 391C<retlen> will be set to the length, in bytes, of that character.
67e989fb 392
cd458e05 393If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880
JH
394is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
395it is assumed that the caller will raise a warning, and this function
28d3d195
JH
396will silently just set C<retlen> to C<-1> and return zero. If the
397C<flags> does not contain UTF8_CHECK_ONLY, warnings about
398malformations will be given, C<retlen> will be set to the expected
399length of the UTF-8 character in bytes, and zero will be returned.
400
401The C<flags> can also contain various flags to allow deviations from
402the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 403
9041c2e3
NIS
404Most code should use utf8_to_uvchr() rather than call this directly.
405
37607a96
PK
406=cut
407*/
67e989fb 408
a0ed51b3 409UV
37607a96 410Perl_utf8n_to_uvuni(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 411{
1f49be52 412 const U8 *s0 = s;
9c5ffd7c 413 UV uv = *s, ouv = 0;
ba210ebe 414 STRLEN len = 1;
1f49be52
AL
415 const bool dowarn = ckWARN_d(WARN_UTF8);
416 const UV startbyte = *s;
ba210ebe 417 STRLEN expectlen = 0;
a0dbb045
JH
418 U32 warning = 0;
419
420/* This list is a superset of the UTF8_ALLOW_XXX. */
421
422#define UTF8_WARN_EMPTY 1
423#define UTF8_WARN_CONTINUATION 2
424#define UTF8_WARN_NON_CONTINUATION 3
425#define UTF8_WARN_FE_FF 4
426#define UTF8_WARN_SHORT 5
427#define UTF8_WARN_OVERFLOW 6
428#define UTF8_WARN_SURROGATE 7
c867b360
JH
429#define UTF8_WARN_LONG 8
430#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045
JH
431
432 if (curlen == 0 &&
433 !(flags & UTF8_ALLOW_EMPTY)) {
434 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
435 goto malformed;
436 }
437
1d72bdf6 438 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
439 if (retlen)
440 *retlen = 1;
c4d5f83a 441 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 442 }
67e989fb 443
421a8bf2 444 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 445 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 446 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
447 goto malformed;
448 }
449
421a8bf2 450 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 451 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 452 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
453 goto malformed;
454 }
9041c2e3 455
1d72bdf6 456#ifdef EBCDIC
75383841 457 uv = NATIVE_TO_UTF(uv);
1d72bdf6 458#else
fcc8fcf6
JH
459 if ((uv == 0xfe || uv == 0xff) &&
460 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 461 warning = UTF8_WARN_FE_FF;
ba210ebe 462 goto malformed;
a0ed51b3 463 }
1d72bdf6
NIS
464#endif
465
ba210ebe
JH
466 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
467 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
468 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
469 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
470#ifdef EBCDIC
471 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
472 else { len = 7; uv &= 0x01; }
473#else
ba210ebe
JH
474 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
475 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
476 else { len = 13; uv = 0; } /* whoa! */
477#endif
478
a0ed51b3
LW
479 if (retlen)
480 *retlen = len;
9041c2e3 481
ba210ebe
JH
482 expectlen = len;
483
fcc8fcf6
JH
484 if ((curlen < expectlen) &&
485 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 486 warning = UTF8_WARN_SHORT;
ba210ebe
JH
487 goto malformed;
488 }
489
490 len--;
a0ed51b3 491 s++;
ba210ebe
JH
492 ouv = uv;
493
a0ed51b3 494 while (len--) {
421a8bf2
JH
495 if (!UTF8_IS_CONTINUATION(*s) &&
496 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
497 s--;
498 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 499 goto malformed;
a0ed51b3
LW
500 }
501 else
8850bf83 502 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
503 if (!(uv > ouv)) {
504 /* These cannot be allowed. */
505 if (uv == ouv) {
f66b99ab 506 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045
JH
507 warning = UTF8_WARN_LONG;
508 goto malformed;
509 }
510 }
511 else { /* uv < ouv */
512 /* This cannot be allowed. */
513 warning = UTF8_WARN_OVERFLOW;
514 goto malformed;
515 }
ba210ebe
JH
516 }
517 s++;
518 ouv = uv;
519 }
520
421a8bf2 521 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 522 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 523 warning = UTF8_WARN_SURROGATE;
ba210ebe 524 goto malformed;
eb160463 525 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 526 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 527 warning = UTF8_WARN_LONG;
ba210ebe 528 goto malformed;
421a8bf2 529 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 530 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 531 warning = UTF8_WARN_FFFF;
a9917092 532 goto malformed;
a0ed51b3 533 }
ba210ebe 534
a0ed51b3 535 return uv;
ba210ebe
JH
536
537malformed:
538
fcc8fcf6 539 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 540 if (retlen)
cc366d4b 541 *retlen = -1;
ba210ebe
JH
542 return 0;
543 }
544
a0dbb045
JH
545 if (dowarn) {
546 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
547
548 switch (warning) {
549 case 0: /* Intentionally empty. */ break;
550 case UTF8_WARN_EMPTY:
04851bb3 551 Perl_sv_catpv(aTHX_ sv, "(empty string)");
a0dbb045
JH
552 break;
553 case UTF8_WARN_CONTINUATION:
097fb8e2 554 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045
JH
555 break;
556 case UTF8_WARN_NON_CONTINUATION:
097fb8e2
JH
557 if (s == s0)
558 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
559 (UV)s[1], startbyte);
560 else
561 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
228fe6e6 562 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, (int)expectlen);
097fb8e2 563
a0dbb045
JH
564 break;
565 case UTF8_WARN_FE_FF:
566 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
567 break;
568 case UTF8_WARN_SHORT:
097fb8e2 569 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
228fe6e6 570 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 571 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
572 break;
573 case UTF8_WARN_OVERFLOW:
097fb8e2
JH
574 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
575 ouv, *s, startbyte);
a0dbb045
JH
576 break;
577 case UTF8_WARN_SURROGATE:
578 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
579 break;
a0dbb045 580 case UTF8_WARN_LONG:
097fb8e2 581 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
228fe6e6 582 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045
JH
583 break;
584 case UTF8_WARN_FFFF:
585 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
586 break;
587 default:
04851bb3 588 Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
a0dbb045
JH
589 break;
590 }
591
592 if (warning) {
547d29e4 593 const char *s = SvPVX_const(sv);
a0dbb045
JH
594
595 if (PL_op)
9014280d 596 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 597 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 598 else
9014280d 599 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045
JH
600 }
601 }
602
ba210ebe 603 if (retlen)
28d3d195 604 *retlen = expectlen ? expectlen : len;
ba210ebe 605
28d3d195 606 return 0;
a0ed51b3
LW
607}
608
8e84507e 609/*
37607a96 610=for apidoc A|UV|utf8_to_uvchr|U8 *s|STRLEN *retlen
9041c2e3
NIS
611
612Returns the native character value of the first character in the string C<s>
cd458e05 613which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
614length, in bytes, of that character.
615
cd458e05 616If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3
NIS
617returned and retlen is set, if possible, to -1.
618
619=cut
620*/
621
622UV
37607a96 623Perl_utf8_to_uvchr(pTHX_ U8 *s, STRLEN *retlen)
9041c2e3 624{
a2a469f9 625 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 626 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3
NIS
627}
628
629/*
37607a96 630=for apidoc A|UV|utf8_to_uvuni|U8 *s|STRLEN *retlen
9041c2e3
NIS
631
632Returns the Unicode code point of the first character in the string C<s>
cd458e05 633which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
634length, in bytes, of that character.
635
636This function should only be used when returned UV is considered
637an index into the Unicode semantic tables (e.g. swashes).
638
cd458e05 639If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 640returned and retlen is set, if possible, to -1.
8e84507e
NIS
641
642=cut
643*/
644
645UV
37607a96 646Perl_utf8_to_uvuni(pTHX_ U8 *s, STRLEN *retlen)
8e84507e 647{
9041c2e3 648 /* Call the low level routine asking for checks */
a2a469f9 649 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 650 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e
NIS
651}
652
b76347f2 653/*
37607a96 654=for apidoc A|STRLEN|utf8_length|U8 *s|U8 *e
b76347f2
JH
655
656Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
657Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
658up past C<e>, croaks.
b76347f2
JH
659
660=cut
661*/
662
663STRLEN
37607a96 664Perl_utf8_length(pTHX_ U8 *s, U8 *e)
b76347f2
JH
665{
666 STRLEN len = 0;
667
8850bf83
JH
668 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
669 * the bitops (especially ~) can create illegal UTF-8.
670 * In other words: in Perl UTF-8 is not just for Unicode. */
671
8c18bf38
AL
672 if (e < s)
673 goto warn_and_return;
b76347f2 674 while (s < e) {
065cbbe5 675 const U8 t = UTF8SKIP(s);
901b21bf 676 if (e - s < t) {
8c18bf38 677 warn_and_return:
901b21bf
JH
678 if (ckWARN_d(WARN_UTF8)) {
679 if (PL_op)
680 Perl_warner(aTHX_ packWARN(WARN_UTF8),
8c18bf38 681 "%s in %s", unees, OP_DESC(PL_op));
901b21bf
JH
682 else
683 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
684 }
685 return len;
686 }
b76347f2
JH
687 s += t;
688 len++;
689 }
690
691 return len;
692}
693
b06226ff 694/*
eebe1485 695=for apidoc A|IV|utf8_distance|U8 *a|U8 *b
b06226ff 696
cd458e05 697Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
698and C<b>.
699
700WARNING: use only if you *know* that the pointers point inside the
701same UTF-8 buffer.
702
37607a96
PK
703=cut
704*/
a0ed51b3 705
02eb7b47 706IV
864dbfa3 707Perl_utf8_distance(pTHX_ U8 *a, U8 *b)
a0ed51b3 708{
02eb7b47
JH
709 IV off = 0;
710
8850bf83
JH
711 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
712 * the bitops (especially ~) can create illegal UTF-8.
713 * In other words: in Perl UTF-8 is not just for Unicode. */
714
a0ed51b3
LW
715 if (a < b) {
716 while (a < b) {
7120cae1 717 const U8 c = UTF8SKIP(a);
8c18bf38
AL
718 if (b - a < c)
719 goto warn_and_return;
02eb7b47 720 a += c;
a0ed51b3
LW
721 off--;
722 }
723 }
724 else {
725 while (b < a) {
065cbbe5 726 const U8 c = UTF8SKIP(b);
02eb7b47 727
901b21bf 728 if (a - b < c) {
8c18bf38 729 warn_and_return:
901b21bf
JH
730 if (ckWARN_d(WARN_UTF8)) {
731 if (PL_op)
732 Perl_warner(aTHX_ packWARN(WARN_UTF8),
733 "%s in %s", unees, OP_DESC(PL_op));
734 else
735 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
736 }
737 return off;
738 }
02eb7b47 739 b += c;
a0ed51b3
LW
740 off++;
741 }
742 }
02eb7b47 743
a0ed51b3
LW
744 return off;
745}
746
b06226ff 747/*
37607a96 748=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 749
8850bf83
JH
750Return the UTF-8 pointer C<s> displaced by C<off> characters, either
751forward or backward.
b06226ff
JH
752
753WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
754the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
755on the first byte of character or just after the last byte of a character.
b06226ff 756
37607a96
PK
757=cut
758*/
a0ed51b3
LW
759
760U8 *
864dbfa3 761Perl_utf8_hop(pTHX_ U8 *s, I32 off)
a0ed51b3 762{
8850bf83
JH
763 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
764 * the bitops (especially ~) can create illegal UTF-8.
765 * In other words: in Perl UTF-8 is not just for Unicode. */
766
a0ed51b3
LW
767 if (off >= 0) {
768 while (off--)
769 s += UTF8SKIP(s);
770 }
771 else {
772 while (off++) {
773 s--;
8850bf83
JH
774 while (UTF8_IS_CONTINUATION(*s))
775 s--;
a0ed51b3
LW
776 }
777 }
065cbbe5 778 return (U8 *)s;
a0ed51b3
LW
779}
780
6940069f 781/*
eebe1485 782=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 783
cd458e05 784Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53
MG
785Unlike C<bytes_to_utf8>, this over-writes the original string, and
786updates len to contain the new length.
67e989fb 787Returns zero on failure, setting C<len> to -1.
6940069f
GS
788
789=cut
790*/
791
792U8 *
37607a96 793Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 794{
6940069f
GS
795 U8 *send;
796 U8 *d;
dcad2880 797 U8 *save = s;
246fae53 798
cd458e05 799 /* ensure valid UTF-8 and chars < 256 before updating string */
dcad2880
JH
800 for (send = s + *len; s < send; ) {
801 U8 c = *s++;
802
1d72bdf6
NIS
803 if (!UTF8_IS_INVARIANT(c) &&
804 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
805 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880
JH
806 *len = -1;
807 return 0;
808 }
246fae53 809 }
dcad2880
JH
810
811 d = s = save;
6940069f 812 while (s < send) {
ed646e6e 813 STRLEN ulen;
9041c2e3 814 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 815 s += ulen;
6940069f
GS
816 }
817 *d = '\0';
246fae53 818 *len = d - save;
6940069f
GS
819 return save;
820}
821
822/*
c05e0e2f 823=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
f9a63242 824
cd458e05 825Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
7120cae1 826Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
827the newly-created string, and updates C<len> to contain the new
828length. Returns the original string if no conversion occurs, C<len>
829is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
8300 if C<s> is converted or contains all 7bit characters.
f9a63242 831
37607a96
PK
832=cut
833*/
f9a63242
JH
834
835U8 *
37607a96 836Perl_bytes_from_utf8(pTHX_ U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 837{
f9a63242 838 U8 *d;
c05e0e2f
AL
839 const U8 *start = s;
840 const U8 *send;
f9a63242 841 I32 count = 0;
ed6a78eb 842 const U8 *s2;
f9a63242
JH
843
844 if (!*is_utf8)
e2b56717 845 return (U8 *)start;
f9a63242 846
cd458e05 847 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 848 for (send = s + *len; s < send;) {
c05e0e2f 849 U8 c = *s++;
1d72bdf6 850 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
851 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
852 (c = *s++) && UTF8_IS_CONTINUATION(c))
853 count++;
854 else
e2b56717 855 return (U8 *)start;
db42d148 856 }
f9a63242
JH
857 }
858
859 *is_utf8 = 0;
860
cd7a8267 861 Newxz(d, (*len) - count + 1, U8);
ed6a78eb
NC
862 s2 = start; start = d;
863 while (s2 < send) {
864 U8 c = *s2++;
c4d5f83a
NIS
865 if (!UTF8_IS_INVARIANT(c)) {
866 /* Then it is two-byte encoded */
ed6a78eb 867 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s2++);
c4d5f83a
NIS
868 c = ASCII_TO_NATIVE(c);
869 }
870 *d++ = c;
f9a63242
JH
871 }
872 *d = '\0';
873 *len = d - start;
e2b56717 874 return (U8 *)start;
f9a63242
JH
875}
876
877/*
eebe1485 878=for apidoc A|U8 *|bytes_to_utf8|U8 *s|STRLEN *len
6940069f 879
cd458e05 880Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e
GS
881Returns a pointer to the newly-created string, and sets C<len> to
882reflect the new length.
6940069f 883
cd458e05 884If you want to convert to UTF-8 from other encodings than ASCII,
5835a535
JH
885see sv_recode_to_utf8().
886
497711e7 887=cut
6940069f
GS
888*/
889
890U8*
37607a96 891Perl_bytes_to_utf8(pTHX_ U8 *s, STRLEN *len)
6940069f 892{
7120cae1 893 const U8 * const send = s + (*len);
6940069f
GS
894 U8 *d;
895 U8 *dst;
6940069f 896
cd7a8267 897 Newxz(d, (*len) * 2 + 1, U8);
6940069f
GS
898 dst = d;
899
900 while (s < send) {
7120cae1 901 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 902 if (UNI_IS_INVARIANT(uv))
eb160463 903 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 904 else {
eb160463
GS
905 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
906 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
907 }
908 }
909 *d = '\0';
6662521e 910 *len = d-dst;
6940069f
GS
911 return dst;
912}
913
a0ed51b3 914/*
dea0fc0b 915 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
916 *
917 * Destination must be pre-extended to 3/2 source. Do not use in-place.
918 * We optimize for native, for obvious reasons. */
919
920U8*
dea0fc0b 921Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 922{
dea0fc0b
JH
923 U8* pend;
924 U8* dstart = d;
925
6dc97886
NC
926 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
927 d[0] = 0;
928 *newlen = 1;
929 return d;
930 }
931
dea0fc0b 932 if (bytelen & 1)
9982c483 933 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
dea0fc0b
JH
934
935 pend = p + bytelen;
936
a0ed51b3 937 while (p < pend) {
dea0fc0b
JH
938 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
939 p += 2;
a0ed51b3 940 if (uv < 0x80) {
eb160463 941 *d++ = (U8)uv;
a0ed51b3
LW
942 continue;
943 }
944 if (uv < 0x800) {
eb160463
GS
945 *d++ = (U8)(( uv >> 6) | 0xc0);
946 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
947 continue;
948 }
949 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
ac388100
JH
950 UV low = (p[0] << 8) + p[1];
951 p += 2;
dea0fc0b
JH
952 if (low < 0xdc00 || low >= 0xdfff)
953 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
954 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
955 }
956 if (uv < 0x10000) {
eb160463
GS
957 *d++ = (U8)(( uv >> 12) | 0xe0);
958 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
959 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
960 continue;
961 }
962 else {
eb160463
GS
963 *d++ = (U8)(( uv >> 18) | 0xf0);
964 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
965 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
966 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
967 continue;
968 }
969 }
dea0fc0b 970 *newlen = d - dstart;
a0ed51b3
LW
971 return d;
972}
973
974/* Note: this one is slightly destructive of the source. */
975
976U8*
dea0fc0b 977Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
978{
979 U8* s = (U8*)p;
980 U8* send = s + bytelen;
981 while (s < send) {
982 U8 tmp = s[0];
983 s[0] = s[1];
984 s[1] = tmp;
985 s += 2;
986 }
dea0fc0b 987 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
988}
989
990/* for now these are all defined (inefficiently) in terms of the utf8 versions */
991
992bool
84afefe6 993Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 994{
a2a469f9 995 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 996 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
997 return is_utf8_alnum(tmpbuf);
998}
999
1000bool
84afefe6 1001Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 1002{
a2a469f9 1003 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1004 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1005 return is_utf8_alnumc(tmpbuf);
1006}
1007
1008bool
84afefe6 1009Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 1010{
a2a469f9 1011 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1012 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1013 return is_utf8_idfirst(tmpbuf);
1014}
1015
1016bool
84afefe6 1017Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 1018{
a2a469f9 1019 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1020 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1021 return is_utf8_alpha(tmpbuf);
1022}
1023
1024bool
84afefe6 1025Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 1026{
a2a469f9 1027 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1028 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1029 return is_utf8_ascii(tmpbuf);
1030}
1031
1032bool
84afefe6 1033Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1034{
a2a469f9 1035 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1036 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1037 return is_utf8_space(tmpbuf);
1038}
1039
1040bool
84afefe6 1041Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1042{
a2a469f9 1043 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1044 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1045 return is_utf8_digit(tmpbuf);
1046}
1047
1048bool
84afefe6 1049Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1050{
a2a469f9 1051 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1052 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1053 return is_utf8_upper(tmpbuf);
1054}
1055
1056bool
84afefe6 1057Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1058{
a2a469f9 1059 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1060 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1061 return is_utf8_lower(tmpbuf);
1062}
1063
1064bool
84afefe6 1065Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1066{
a2a469f9 1067 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1068 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1069 return is_utf8_cntrl(tmpbuf);
1070}
1071
1072bool
84afefe6 1073Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1074{
a2a469f9 1075 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1076 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1077 return is_utf8_graph(tmpbuf);
1078}
1079
1080bool
84afefe6 1081Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1082{
a2a469f9 1083 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1084 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1085 return is_utf8_print(tmpbuf);
1086}
1087
b8c5462f 1088bool
84afefe6 1089Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1090{
a2a469f9 1091 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1092 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1093 return is_utf8_punct(tmpbuf);
1094}
1095
4d61ec05 1096bool
84afefe6 1097Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1098{
a2a469f9 1099 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1100 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1101 return is_utf8_xdigit(tmpbuf);
1102}
1103
84afefe6
JH
1104UV
1105Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1106{
0ebc6274
JH
1107 uvchr_to_utf8(p, c);
1108 return to_utf8_upper(p, p, lenp);
a0ed51b3
LW
1109}
1110
84afefe6
JH
1111UV
1112Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1113{
0ebc6274
JH
1114 uvchr_to_utf8(p, c);
1115 return to_utf8_title(p, p, lenp);
a0ed51b3
LW
1116}
1117
84afefe6
JH
1118UV
1119Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1120{
0ebc6274
JH
1121 uvchr_to_utf8(p, c);
1122 return to_utf8_lower(p, p, lenp);
a0ed51b3
LW
1123}
1124
84afefe6
JH
1125UV
1126Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1127{
0ebc6274
JH
1128 uvchr_to_utf8(p, c);
1129 return to_utf8_fold(p, p, lenp);
84afefe6
JH
1130}
1131
a0ed51b3
LW
1132/* for now these all assume no locale info available for Unicode > 255 */
1133
1134bool
84afefe6 1135Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
1136{
1137 return is_uni_alnum(c); /* XXX no locale support yet */
1138}
1139
1140bool
84afefe6 1141Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f
JH
1142{
1143 return is_uni_alnumc(c); /* XXX no locale support yet */
1144}
1145
1146bool
84afefe6 1147Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
1148{
1149 return is_uni_idfirst(c); /* XXX no locale support yet */
1150}
1151
1152bool
84afefe6 1153Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
1154{
1155 return is_uni_alpha(c); /* XXX no locale support yet */
1156}
1157
1158bool
84afefe6 1159Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
1160{
1161 return is_uni_ascii(c); /* XXX no locale support yet */
1162}
1163
1164bool
84afefe6 1165Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
1166{
1167 return is_uni_space(c); /* XXX no locale support yet */
1168}
1169
1170bool
84afefe6 1171Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
1172{
1173 return is_uni_digit(c); /* XXX no locale support yet */
1174}
1175
1176bool
84afefe6 1177Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1178{
1179 return is_uni_upper(c); /* XXX no locale support yet */
1180}
1181
1182bool
84afefe6 1183Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1184{
1185 return is_uni_lower(c); /* XXX no locale support yet */
1186}
1187
1188bool
84afefe6 1189Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1190{
1191 return is_uni_cntrl(c); /* XXX no locale support yet */
1192}
1193
1194bool
84afefe6 1195Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1196{
1197 return is_uni_graph(c); /* XXX no locale support yet */
1198}
1199
1200bool
84afefe6 1201Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1202{
1203 return is_uni_print(c); /* XXX no locale support yet */
1204}
1205
b8c5462f 1206bool
84afefe6 1207Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1208{
1209 return is_uni_punct(c); /* XXX no locale support yet */
1210}
1211
4d61ec05 1212bool
84afefe6 1213Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1214{
1215 return is_uni_xdigit(c); /* XXX no locale support yet */
1216}
1217
b7ac61fa
JH
1218U32
1219Perl_to_uni_upper_lc(pTHX_ U32 c)
1220{
ee099d14
JH
1221 /* XXX returns only the first character -- do not use XXX */
1222 /* XXX no locale support yet */
1223 STRLEN len;
a2a469f9 1224 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1225 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1226}
1227
1228U32
1229Perl_to_uni_title_lc(pTHX_ U32 c)
1230{
ee099d14
JH
1231 /* XXX returns only the first character XXX -- do not use XXX */
1232 /* XXX no locale support yet */
1233 STRLEN len;
a2a469f9 1234 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1235 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1236}
1237
1238U32
1239Perl_to_uni_lower_lc(pTHX_ U32 c)
1240{
ee099d14
JH
1241 /* XXX returns only the first character -- do not use XXX */
1242 /* XXX no locale support yet */
1243 STRLEN len;
a2a469f9 1244 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1245 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1246}
1247
a0ed51b3 1248bool
864dbfa3 1249Perl_is_utf8_alnum(pTHX_ U8 *p)
a0ed51b3 1250{
386d01d6
GS
1251 if (!is_utf8_char(p))
1252 return FALSE;
a0ed51b3 1253 if (!PL_utf8_alnum)
289d4f09
ML
1254 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1255 * descendant of isalnum(3), in other words, it doesn't
1256 * contain the '_'. --jhi */
1257 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
eb160463 1258 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3
LW
1259/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1260#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1261 if (!PL_utf8_alnum)
1262 PL_utf8_alnum = swash_init("utf8", "",
1263 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1264 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3
LW
1265#endif
1266}
1267
1268bool
b8c5462f
JH
1269Perl_is_utf8_alnumc(pTHX_ U8 *p)
1270{
386d01d6
GS
1271 if (!is_utf8_char(p))
1272 return FALSE;
b8c5462f
JH
1273 if (!PL_utf8_alnum)
1274 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
eb160463 1275 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f
JH
1276/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1277#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1278 if (!PL_utf8_alnum)
1279 PL_utf8_alnum = swash_init("utf8", "",
1280 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1281 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f
JH
1282#endif
1283}
1284
1285bool
82686b01 1286Perl_is_utf8_idfirst(pTHX_ U8 *p) /* The naming is historical. */
a0ed51b3 1287{
82686b01
JH
1288 if (*p == '_')
1289 return TRUE;
1290 if (!is_utf8_char(p))
1291 return FALSE;
1292 if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1293 PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
eb160463 1294 return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
82686b01
JH
1295}
1296
1297bool
1298Perl_is_utf8_idcont(pTHX_ U8 *p)
1299{
1300 if (*p == '_')
1301 return TRUE;
1302 if (!is_utf8_char(p))
1303 return FALSE;
1304 if (!PL_utf8_idcont)
1305 PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
eb160463 1306 return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
a0ed51b3
LW
1307}
1308
1309bool
864dbfa3 1310Perl_is_utf8_alpha(pTHX_ U8 *p)
a0ed51b3 1311{
386d01d6
GS
1312 if (!is_utf8_char(p))
1313 return FALSE;
a0ed51b3 1314 if (!PL_utf8_alpha)
e24b16f9 1315 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
eb160463 1316 return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
a0ed51b3
LW
1317}
1318
1319bool
b8c5462f
JH
1320Perl_is_utf8_ascii(pTHX_ U8 *p)
1321{
386d01d6
GS
1322 if (!is_utf8_char(p))
1323 return FALSE;
b8c5462f
JH
1324 if (!PL_utf8_ascii)
1325 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
eb160463 1326 return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
b8c5462f
JH
1327}
1328
1329bool
864dbfa3 1330Perl_is_utf8_space(pTHX_ U8 *p)
a0ed51b3 1331{
386d01d6
GS
1332 if (!is_utf8_char(p))
1333 return FALSE;
a0ed51b3 1334 if (!PL_utf8_space)
3bec3564 1335 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
eb160463 1336 return swash_fetch(PL_utf8_space, p, TRUE) != 0;
a0ed51b3
LW
1337}
1338
1339bool
864dbfa3 1340Perl_is_utf8_digit(pTHX_ U8 *p)
a0ed51b3 1341{
386d01d6
GS
1342 if (!is_utf8_char(p))
1343 return FALSE;
a0ed51b3 1344 if (!PL_utf8_digit)
e24b16f9 1345 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
eb160463 1346 return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
a0ed51b3
LW
1347}
1348
1349bool
864dbfa3 1350Perl_is_utf8_upper(pTHX_ U8 *p)
a0ed51b3 1351{
386d01d6
GS
1352 if (!is_utf8_char(p))
1353 return FALSE;
a0ed51b3 1354 if (!PL_utf8_upper)
5fcb6c34 1355 PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
eb160463 1356 return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
a0ed51b3
LW
1357}
1358
1359bool
864dbfa3 1360Perl_is_utf8_lower(pTHX_ U8 *p)
a0ed51b3 1361{
386d01d6
GS
1362 if (!is_utf8_char(p))
1363 return FALSE;
a0ed51b3 1364 if (!PL_utf8_lower)
5fcb6c34 1365 PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
eb160463 1366 return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
a0ed51b3
LW
1367}
1368
1369bool
b8c5462f
JH
1370Perl_is_utf8_cntrl(pTHX_ U8 *p)
1371{
386d01d6
GS
1372 if (!is_utf8_char(p))
1373 return FALSE;
b8c5462f
JH
1374 if (!PL_utf8_cntrl)
1375 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
eb160463 1376 return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
b8c5462f
JH
1377}
1378
1379bool
1380Perl_is_utf8_graph(pTHX_ U8 *p)
1381{
386d01d6
GS
1382 if (!is_utf8_char(p))
1383 return FALSE;
b8c5462f
JH
1384 if (!PL_utf8_graph)
1385 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
eb160463 1386 return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
b8c5462f
JH
1387}
1388
1389bool
864dbfa3 1390Perl_is_utf8_print(pTHX_ U8 *p)
a0ed51b3 1391{
386d01d6
GS
1392 if (!is_utf8_char(p))
1393 return FALSE;
a0ed51b3 1394 if (!PL_utf8_print)
e24b16f9 1395 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
eb160463 1396 return swash_fetch(PL_utf8_print, p, TRUE) != 0;
a0ed51b3
LW
1397}
1398
1399bool
b8c5462f
JH
1400Perl_is_utf8_punct(pTHX_ U8 *p)
1401{
386d01d6
GS
1402 if (!is_utf8_char(p))
1403 return FALSE;
b8c5462f
JH
1404 if (!PL_utf8_punct)
1405 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
eb160463 1406 return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
b8c5462f
JH
1407}
1408
1409bool
1410Perl_is_utf8_xdigit(pTHX_ U8 *p)
1411{
386d01d6
GS
1412 if (!is_utf8_char(p))
1413 return FALSE;
b8c5462f
JH
1414 if (!PL_utf8_xdigit)
1415 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
eb160463 1416 return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
b8c5462f
JH
1417}
1418
1419bool
864dbfa3 1420Perl_is_utf8_mark(pTHX_ U8 *p)
a0ed51b3 1421{
386d01d6
GS
1422 if (!is_utf8_char(p))
1423 return FALSE;
a0ed51b3 1424 if (!PL_utf8_mark)
e24b16f9 1425 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
eb160463 1426 return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
a0ed51b3
LW
1427}
1428
6b5c0936
JH
1429/*
1430=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1431
1432The "p" contains the pointer to the UTF-8 string encoding
1433the character that is being converted.
1434
1435The "ustrp" is a pointer to the character buffer to put the
1436conversion result to. The "lenp" is a pointer to the length
1437of the result.
1438
0134edef 1439The "swashp" is a pointer to the swash to use.
6b5c0936 1440
0134edef
JH
1441Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1442and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1443but not always, a multicharacter mapping), is tried first.
6b5c0936 1444
0134edef
JH
1445The "special" is a string like "utf8::ToSpecLower", which means the
1446hash %utf8::ToSpecLower. The access to the hash is through
1447Perl_to_utf8_case().
6b5c0936 1448
0134edef
JH
1449The "normal" is a string like "ToLower" which means the swash
1450%utf8::ToLower.
1451
1452=cut */
6b5c0936 1453
2104c8d9 1454UV
a6872d42 1455Perl_to_utf8_case(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, char *normal, char *special)
a0ed51b3 1456{
a2a469f9 1457 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1458 STRLEN len = 0;
a0ed51b3 1459
c9dc1ff4 1460 const UV uv0 = utf8_to_uvchr(p, NULL);
1feea2c7
JH
1461 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1462 * are necessary in EBCDIC, they are redundant no-ops
1463 * in ASCII-ish platforms, and hopefully optimized away. */
24c2fff4 1464 const UV uv1 = NATIVE_TO_UNI(uv0);
1feea2c7 1465 uvuni_to_utf8(tmpbuf, uv1);
0134edef
JH
1466
1467 if (!*swashp) /* load on-demand */
1468 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1469
08952adc
JH
1470 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1471 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1472 /* It might be "special" (sometimes, but not always,
2a37f04d 1473 * a multicharacter mapping) */
983ffd37 1474 HV *hv;
08952adc
JH
1475 SV **svp;
1476
1477 if ((hv = get_hv(special, FALSE)) &&
1478 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1479 (*svp)) {
71a0dd65 1480 const char *s;
47654450 1481
71a0dd65 1482 s = SvPV_const(*svp, len);
47654450
JH
1483 if (len == 1)
1484 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1485 else {
2f9475ad
JH
1486#ifdef EBCDIC
1487 /* If we have EBCDIC we need to remap the characters
1488 * since any characters in the low 256 are Unicode
1489 * code points, not EBCDIC. */
7cda7a3d 1490 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad
JH
1491
1492 d = tmpbuf;
08952adc 1493 if (SvUTF8(*svp)) {
2f9475ad
JH
1494 STRLEN tlen = 0;
1495
1496 while (t < tend) {
1497 UV c = utf8_to_uvchr(t, &tlen);
1498 if (tlen > 0) {
1499 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1500 t += tlen;
1501 }
1502 else
1503 break;
1504 }
1505 }
1506 else {
36fec512
JH
1507 while (t < tend) {
1508 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1509 t++;
1510 }
2f9475ad
JH
1511 }
1512 len = d - tmpbuf;
1513 Copy(tmpbuf, ustrp, len, U8);
1514#else
d2dcd0fb 1515 Copy(s, ustrp, len, U8);
2f9475ad 1516#endif
29e98929 1517 }
983ffd37 1518 }
0134edef
JH
1519 }
1520
1521 if (!len && *swashp) {
1522 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1523
1524 if (uv2) {
1525 /* It was "normal" (a single character mapping). */
1526 UV uv3 = UNI_TO_NATIVE(uv2);
1527
e9101d72 1528 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d
JH
1529 }
1530 }
1feea2c7 1531
0134edef
JH
1532 if (!len) /* Neither: just copy. */
1533 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1534
2a37f04d
JH
1535 if (lenp)
1536 *lenp = len;
1537
0134edef 1538 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3
LW
1539}
1540
d3e79532
JH
1541/*
1542=for apidoc A|UV|to_utf8_upper|U8 *p|U8 *ustrp|STRLEN *lenp
1543
1544Convert the UTF-8 encoded character at p to its uppercase version and
1545store that in UTF-8 in ustrp and its length in bytes in lenp. Note
a2a469f9
JH
1546that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1547the uppercase version may be longer than the original character.
d3e79532
JH
1548
1549The first character of the uppercased version is returned
1550(but note, as explained above, that there may be more.)
1551
1552=cut */
1553
2104c8d9 1554UV
983ffd37 1555Perl_to_utf8_upper(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1556{
983ffd37 1557 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1558 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1559}
a0ed51b3 1560
d3e79532
JH
1561/*
1562=for apidoc A|UV|to_utf8_title|U8 *p|U8 *ustrp|STRLEN *lenp
1563
1564Convert the UTF-8 encoded character at p to its titlecase version and
1565store that in UTF-8 in ustrp and its length in bytes in lenp. Note
a2a469f9
JH
1566that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1567titlecase version may be longer than the original character.
d3e79532
JH
1568
1569The first character of the titlecased version is returned
1570(but note, as explained above, that there may be more.)
1571
1572=cut */
1573
983ffd37
JH
1574UV
1575Perl_to_utf8_title(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1576{
1577 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1578 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1579}
1580
d3e79532
JH
1581/*
1582=for apidoc A|UV|to_utf8_lower|U8 *p|U8 *ustrp|STRLEN *lenp
1583
1584Convert the UTF-8 encoded character at p to its lowercase version and
1585store that in UTF-8 in ustrp and its length in bytes in lenp. Note
a2a469f9
JH
1586that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1587lowercase version may be longer than the original character.
d3e79532
JH
1588
1589The first character of the lowercased version is returned
1590(but note, as explained above, that there may be more.)
1591
1592=cut */
1593
2104c8d9 1594UV
a2a2844f 1595Perl_to_utf8_lower(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1596{
983ffd37 1597 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1598 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1599}
1600
d3e79532
JH
1601/*
1602=for apidoc A|UV|to_utf8_fold|U8 *p|U8 *ustrp|STRLEN *lenp
1603
1604Convert the UTF-8 encoded character at p to its foldcase version and
1605store that in UTF-8 in ustrp and its length in bytes in lenp. Note
a2a469f9 1606that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532
JH
1607foldcase version may be longer than the original character (up to
1608three characters).
1609
1610The first character of the foldcased version is returned
1611(but note, as explained above, that there may be more.)
1612
1613=cut */
1614
b4e400f9
JH
1615UV
1616Perl_to_utf8_fold(pTHX_ U8 *p, U8* ustrp, STRLEN *lenp)
1617{
1618 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1619 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1620}
1621
1622/* a "swash" is a swatch hash */
1623
1624SV*
864dbfa3 1625Perl_swash_init(pTHX_ char* pkg, char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3
LW
1626{
1627 SV* retval;
e7b555a6 1628 SV* const tokenbufsv = sv_newmortal();
8e84507e 1629 dSP;
1f49be52
AL
1630 const size_t pkg_len = strlen(pkg);
1631 const size_t name_len = strlen(name);
c9dc1ff4 1632 HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
f8be5cf0 1633 SV* errsv_save;
ce3b816e 1634
f66b99ab
NC
1635 PUSHSTACKi(PERLSI_MAGIC);
1636 ENTER;
1637 SAVEI32(PL_hints);
1638 PL_hints = 0;
1639 save_re_context();
1b026014 1640 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1641 ENTER;
f8be5cf0 1642 errsv_save = newSVsv(ERRSV);
b035a42e
NC
1643 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1644 Nullsv);
f8be5cf0
JH
1645 if (!SvTRUE(ERRSV))
1646 sv_setsv(ERRSV, errsv_save);
1647 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1648 LEAVE;
1649 }
1650 SPAGAIN;
a0ed51b3
LW
1651 PUSHMARK(SP);
1652 EXTEND(SP,5);
b035a42e
NC
1653 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1654 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
a0ed51b3
LW
1655 PUSHs(listsv);
1656 PUSHs(sv_2mortal(newSViv(minbits)));
1657 PUSHs(sv_2mortal(newSViv(none)));
1658 PUTBACK;
ef7b71f0 1659 if (IN_PERL_COMPILETIME) {
bf1fed83 1660 /* XXX ought to be handled by lex_start */
82686b01 1661 SAVEI32(PL_in_my);
5b7ea690 1662 PL_in_my = 0;
bf1fed83 1663 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1664 }
f8be5cf0 1665 errsv_save = newSVsv(ERRSV);
864dbfa3 1666 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1667 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1668 else
e24b16f9 1669 retval = &PL_sv_undef;
f8be5cf0
JH
1670 if (!SvTRUE(ERRSV))
1671 sv_setsv(ERRSV, errsv_save);
1672 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1673 LEAVE;
1674 POPSTACK;
ef7b71f0 1675 if (IN_PERL_COMPILETIME) {
bf1fed83 1676 STRLEN len;
c9dc1ff4 1677 const char* const pv = SvPV_const(tokenbufsv, len);
bf1fed83
JH
1678
1679 Copy(pv, PL_tokenbuf, len+1, char);
eb160463 1680 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1681 }
bc45ce41
JH
1682 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1683 if (SvPOK(retval))
c293eb2b
NC
1684 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1685 retval);
cea2e8a9 1686 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1687 }
a0ed51b3
LW
1688 return retval;
1689}
1690
035d37be
JH
1691
1692/* This API is wrong for special case conversions since we may need to
1693 * return several Unicode characters for a single Unicode character
1694 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1695 * the lower-level routine, and it is similarly broken for returning
1696 * multiple values. --jhi */
a0ed51b3 1697UV
3568d838 1698Perl_swash_fetch(pTHX_ SV *sv, U8 *ptr, bool do_utf8)
a0ed51b3 1699{
c9dc1ff4 1700 HV* const hv = (HV*)SvRV(sv);
3568d838
JH
1701 U32 klen;
1702 U32 off;
a0ed51b3 1703 STRLEN slen;
7d85a32c 1704 STRLEN needents;
71a0dd65 1705 const U8 *tmps = NULL;
a0ed51b3
LW
1706 U32 bit;
1707 SV *retval;
3568d838
JH
1708 U8 tmputf8[2];
1709 UV c = NATIVE_TO_ASCII(*ptr);
1710
1711 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
eb160463
GS
1712 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1713 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
3568d838
JH
1714 ptr = tmputf8;
1715 }
1716 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1717 * then the "swatch" is a vec() for al the chars which start
1718 * with 0xAA..0xYY
1719 * So the key in the hash (klen) is length of encoded char -1
1720 */
1721 klen = UTF8SKIP(ptr) - 1;
1722 off = ptr[klen];
a0ed51b3 1723
7d85a32c
JH
1724 if (klen == 0)
1725 {
1726 /* If char in invariant then swatch is for all the invariant chars
cd458e05 1727 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c
JH
1728 */
1729 needents = UTF_CONTINUATION_MARK;
1730 off = NATIVE_TO_UTF(ptr[klen]);
1731 }
1732 else
1733 {
1734 /* If char is encoded then swatch is for the prefix */
1735 needents = (1 << UTF_ACCUMULATION_SHIFT);
1736 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1737 }
1738
a0ed51b3
LW
1739 /*
1740 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1741 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1742 * it's nothing to sniff at.) Pity we usually come through at least
1743 * two function calls to get here...
1744 *
1745 * NB: this code assumes that swatches are never modified, once generated!
1746 */
1747
3568d838 1748 if (hv == PL_last_swash_hv &&
a0ed51b3 1749 klen == PL_last_swash_klen &&
fe20fd30 1750 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1751 {
1752 tmps = PL_last_swash_tmps;
1753 slen = PL_last_swash_slen;
1754 }
1755 else {
1756 /* Try our second-level swatch cache, kept in a hash. */
c05e0e2f 1757 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3
LW
1758
1759 /* If not cached, generate it via utf8::SWASHGET */
71a0dd65 1760 if (!svp || !SvPOK(*svp) || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
a0ed51b3 1761 dSP;
2b9d42f0
NIS
1762 /* We use utf8n_to_uvuni() as we want an index into
1763 Unicode tables, not a native character number.
1764 */
c9dc1ff4 1765 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae
JH
1766 ckWARN(WARN_UTF8) ?
1767 0 : UTF8_ALLOW_ANY);
f8be5cf0 1768 SV *errsv_save;
a0ed51b3
LW
1769 ENTER;
1770 SAVETMPS;
1771 save_re_context();
1772 PUSHSTACKi(PERLSI_MAGIC);
1773 PUSHMARK(SP);
1774 EXTEND(SP,3);
1775 PUSHs((SV*)sv);
ffbc6a93 1776 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838
JH
1777 PUSHs(sv_2mortal(newSViv((klen) ?
1778 (code_point & ~(needents - 1)) : 0)));
a0ed51b3
LW
1779 PUSHs(sv_2mortal(newSViv(needents)));
1780 PUTBACK;
f8be5cf0 1781 errsv_save = newSVsv(ERRSV);
864dbfa3 1782 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1783 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1784 else
e24b16f9 1785 retval = &PL_sv_undef;
f8be5cf0
JH
1786 if (!SvTRUE(ERRSV))
1787 sv_setsv(ERRSV, errsv_save);
1788 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1789 POPSTACK;
1790 FREETMPS;
1791 LEAVE;
ef7b71f0 1792 if (IN_PERL_COMPILETIME)
eb160463 1793 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1794
c05e0e2f 1795 svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
a0ed51b3 1796
7d85a32c 1797 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1798 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3
LW
1799 }
1800
1801 PL_last_swash_hv = hv;
1802 PL_last_swash_klen = klen;
71a0dd65
NC
1803 /* FIXME change interpvar.h? */
1804 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
1805 PL_last_swash_slen = slen;
1806 if (klen)
1807 Copy(ptr, PL_last_swash_key, klen, U8);
1808 }
1809
9faf8d75 1810 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1811 case 1:
1812 bit = 1 << (off & 7);
1813 off >>= 3;
1814 return (tmps[off] & bit) != 0;
1815 case 8:
1816 return tmps[off];
1817 case 16:
1818 off <<= 1;
1819 return (tmps[off] << 8) + tmps[off + 1] ;
1820 case 32:
1821 off <<= 2;
1822 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1823 }
cea2e8a9 1824 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3
LW
1825 return 0;
1826}
2b9d42f0
NIS
1827
1828
1829/*
37607a96 1830=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2b9d42f0 1831
cd458e05 1832Adds the UTF-8 representation of the Native codepoint C<uv> to the end
a2a469f9 1833of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2b9d42f0
NIS
1834bytes available. The return value is the pointer to the byte after the
1835end of the new character. In other words,
1836
1837 d = uvchr_to_utf8(d, uv);
1838
1839is the recommended wide native character-aware way of saying
1840
1841 *(d++) = uv;
1842
1843=cut
1844*/
1845
1846/* On ASCII machines this is normally a macro but we want a
1847 real function in case XS code wants it
1848*/
1849#undef Perl_uvchr_to_utf8
1850U8 *
1851Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1852{
b851fbc1 1853 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2b9d42f0
NIS
1854}
1855
b851fbc1
JH
1856U8 *
1857Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1858{
1859 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1860}
2b9d42f0
NIS
1861
1862/*
37607a96 1863=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
2b9d42f0
NIS
1864
1865Returns the native character value of the first character in the string C<s>
cd458e05 1866which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2b9d42f0
NIS
1867length, in bytes, of that character.
1868
1869Allows length and flags to be passed to low level routine.
1870
1871=cut
1872*/
0a2ef054
JH
1873/* On ASCII machines this is normally a macro but we want
1874 a real function in case XS code wants it
2b9d42f0
NIS
1875*/
1876#undef Perl_utf8n_to_uvchr
1877UV
37607a96 1878Perl_utf8n_to_uvchr(pTHX_ U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2b9d42f0 1879{
c9dc1ff4 1880 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2b9d42f0
NIS
1881 return UNI_TO_NATIVE(uv);
1882}
1883
d2cc3551
JH
1884/*
1885=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1886
1887Build to the scalar dsv a displayable version of the string spv,
1888length len, the displayable version being at most pvlim bytes long
1889(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1890
9e55ce06 1891The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 1892isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054
JH
1893to display the \\[nrfta\\] as the backslashed versions (like '\n')
1894(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1895UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1896UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1897
d2cc3551
JH
1898The pointer to the PV of the dsv is returned.
1899
1900=cut */
e6b2e755
JH
1901char *
1902Perl_pv_uni_display(pTHX_ SV *dsv, U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
1903{
1904 int truncated = 0;
c05e0e2f 1905 const char *s, *e;
e6b2e755
JH
1906
1907 sv_setpvn(dsv, "", 0);
c05e0e2f 1908 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 1909 UV u;
04851bb3
NC
1910 /* This serves double duty as a flag and a character to print after
1911 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
1912 */
1913 char ok = 0;
c728cb41 1914
e6b2e755
JH
1915 if (pvlim && SvCUR(dsv) >= pvlim) {
1916 truncated++;
1917 break;
1918 }
1919 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 1920 if (u < 256) {
8c18bf38 1921 const unsigned char c = (unsigned char)u & 0xFF;
c728cb41 1922 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
04851bb3 1923 switch (c) {
c728cb41 1924 case '\n':
04851bb3 1925 ok = 'n'; break;
c728cb41 1926 case '\r':
04851bb3 1927 ok = 'r'; break;
c728cb41 1928 case '\t':
04851bb3 1929 ok = 't'; break;
c728cb41 1930 case '\f':
04851bb3 1931 ok = 'f'; break;
c728cb41 1932 case '\a':
04851bb3 1933 ok = 'a'; break;
c728cb41 1934 case '\\':
04851bb3 1935 ok = '\\'; break;
c728cb41
JH
1936 default: break;
1937 }
04851bb3
NC
1938 if (ok) {
1939 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
1940 }
c728cb41 1941 }
00e86452 1942 /* isPRINT() is the locale-blind version. */
04851bb3
NC
1943 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
1944 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
1945 ok = 1;
0a2ef054 1946 }
c728cb41
JH
1947 }
1948 if (!ok)
9e55ce06 1949 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755
JH
1950 }
1951 if (truncated)
1952 sv_catpvn(dsv, "...", 3);
1953
1954 return SvPVX(dsv);
1955}
2b9d42f0 1956
d2cc3551
JH
1957/*
1958=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1959
1960Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 1961the displayable version being at most pvlim bytes long
d2cc3551 1962(if longer, the rest is truncated and "..." will be appended).
0a2ef054
JH
1963
1964The flags argument is as in pv_uni_display().
1965
d2cc3551
JH
1966The pointer to the PV of the dsv is returned.
1967
1968=cut */
e6b2e755
JH
1969char *
1970Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1971{
ed6a78eb 1972 return Perl_pv_uni_display(aTHX_ dsv, (U8*)SvPVX_const(ssv),
71a0dd65 1973 SvCUR(ssv), pvlim, flags);
701a277b
JH
1974}
1975
d2cc3551 1976/*
d07ddd77 1977=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551
JH
1978
1979Return true if the strings s1 and s2 differ case-insensitively, false
1980if not (if they are equal case-insensitively). If u1 is true, the
1981string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77
JH
1982the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1983are false, the respective string is assumed to be in native 8-bit
1984encoding.
1985
1986If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1987in there (they will point at the beginning of the I<next> character).
1988If the pointers behind pe1 or pe2 are non-NULL, they are the end
1989pointers beyond which scanning will not continue under any
359e6eb4 1990circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77
JH
1991s2+l2 will be used as goal end pointers that will also stop the scan,
1992and which qualify towards defining a successful match: all the scans
1993that define an explicit length must reach their goal pointers for
1994a match to succeed).
d2cc3551
JH
1995
1996For case-insensitiveness, the "casefolding" of Unicode is used
1997instead of upper/lowercasing both the characters, see
1998http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
1999
2000=cut */
701a277b 2001I32
d07ddd77 2002Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 2003{
c05e0e2f
AL
2004 register const U8 *p1 = (const U8*)s1;
2005 register const U8 *p2 = (const U8*)s2;
2006 register const U8 *f1 = 0, *f2 = 0;
2007 register U8 *e1 = 0, *q1 = 0;
2008 register U8 *e2 = 0, *q2 = 0;
d07ddd77 2009 STRLEN n1 = 0, n2 = 0;
a2a469f9
JH
2010 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2011 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8
JH
2012 U8 natbuf[1+1];
2013 STRLEN foldlen1, foldlen2;
d07ddd77 2014 bool match;
332ddc25 2015
d07ddd77
JH
2016 if (pe1)
2017 e1 = *(U8**)pe1;
c05e0e2f
AL
2018 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2019 f1 = (const U8*)s1 + l1;
d07ddd77
JH
2020 if (pe2)
2021 e2 = *(U8**)pe2;
c05e0e2f
AL
2022 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2023 f2 = (const U8*)s2 + l2;
d07ddd77
JH
2024
2025 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2026 return 1; /* mismatch; possible infinite loop or false positive */
2027
a6872d42
JH
2028 if (!u1 || !u2)
2029 natbuf[1] = 0; /* Need to terminate the buffer. */
2030
d07ddd77
JH
2031 while ((e1 == 0 || p1 < e1) &&
2032 (f1 == 0 || p1 < f1) &&
2033 (e2 == 0 || p2 < e2) &&
2034 (f2 == 0 || p2 < f2)) {
2035 if (n1 == 0) {
d7f013c8 2036 if (u1)
ed6a78eb 2037 to_utf8_fold((U8 *)p1, foldbuf1, &foldlen1);
d7f013c8 2038 else {
e0138122 2039 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
d7f013c8
JH
2040 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2041 }
2042 q1 = foldbuf1;
d07ddd77 2043 n1 = foldlen1;
332ddc25 2044 }
d07ddd77 2045 if (n2 == 0) {
d7f013c8 2046 if (u2)
ed6a78eb 2047 to_utf8_fold((U8 *)p2, foldbuf2, &foldlen2);
d7f013c8 2048 else {
e0138122 2049 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
d7f013c8
JH
2050 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2051 }
2052 q2 = foldbuf2;
d07ddd77 2053 n2 = foldlen2;
332ddc25 2054 }
d07ddd77
JH
2055 while (n1 && n2) {
2056 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2057 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2058 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 2059 return 1; /* mismatch */
d07ddd77 2060 n1 -= UTF8SKIP(q1);
d7f013c8 2061 q1 += UTF8SKIP(q1);
d07ddd77 2062 n2 -= UTF8SKIP(q2);
d7f013c8 2063 q2 += UTF8SKIP(q2);
701a277b 2064 }
d07ddd77 2065 if (n1 == 0)
d7f013c8 2066 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2067 if (n2 == 0)
d7f013c8
JH
2068 p2 += u2 ? UTF8SKIP(p2) : 1;
2069
d2cc3551 2070 }
5469e704 2071
d07ddd77
JH
2072 /* A match is defined by all the scans that specified
2073 * an explicit length reaching their final goals. */
2074 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704
JH
2075
2076 if (match) {
d07ddd77
JH
2077 if (pe1)
2078 *pe1 = (char*)p1;
2079 if (pe2)
2080 *pe2 = (char*)p2;
5469e704
JH
2081 }
2082
2083 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2084}
701a277b 2085
04851bb3
NC
2086/*
2087 * Local variables:
2088 * c-indentation-style: bsd
2089 * c-basic-offset: 4
2090 * indent-tabs-mode: t
2091 * End:
2092 *
d8294a4d
NC
2093 * ex: set ts=8 sts=4 sw=4 noet:
2094 */