This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
Upgrade to ExtUtils-ParseXS-2.13
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
af3babe4
NC
3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
4 * others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
19 *
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
22 */
23
24#include "EXTERN.h"
864dbfa3 25#define PERL_IN_UTF8_C
a0ed51b3
LW
26#include "perl.h"
27
27da23d5
JH
28static const char unees[] =
29 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 30
ccfc67b7
JH
31/*
32=head1 Unicode Support
a0ed51b3 33
166f8a29
DM
34This file contains various utility functions for manipulating UTF8-encoded
35strings. For the uninitiated, this is a method of representing arbitrary
61296642 36Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
37characters in the ASCII range are unmodified, and a zero byte never appears
38within non-zero characters.
166f8a29 39
b851fbc1 40=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 41
1e54db1a 42Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89ebb4a3 43of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 44bytes available. The return value is the pointer to the byte after the
9041c2e3 45end of the new character. In other words,
eebe1485 46
b851fbc1
JH
47 d = uvuni_to_utf8_flags(d, uv, flags);
48
49or, in most cases,
50
9041c2e3 51 d = uvuni_to_utf8(d, uv);
eebe1485 52
b851fbc1
JH
53(which is equivalent to)
54
55 d = uvuni_to_utf8_flags(d, uv, 0);
56
eebe1485
SC
57is the recommended Unicode-aware way of saying
58
59 *(d++) = uv;
60
61=cut
62*/
63
dfe13c55 64U8 *
b851fbc1 65Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 66{
62961d2e 67 if (ckWARN(WARN_UTF8)) {
b851fbc1
JH
68 if (UNICODE_IS_SURROGATE(uv) &&
69 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 70 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1
JH
71 else if (
72 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
73 !(flags & UNICODE_ALLOW_FDD0))
74 ||
c867b360 75 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1
JH
76 !(flags & UNICODE_ALLOW_FFFF))) &&
77 /* UNICODE_ALLOW_SUPER includes
2a20b9da 78 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1
JH
79 ((uv <= PERL_UNICODE_MAX) ||
80 !(flags & UNICODE_ALLOW_SUPER))
81 )
9014280d 82 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800
JH
83 "Unicode character 0x%04"UVxf" is illegal", uv);
84 }
c4d5f83a 85 if (UNI_IS_INVARIANT(uv)) {
eb160463 86 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3
LW
87 return d;
88 }
2d331972 89#if defined(EBCDIC)
1d72bdf6
NIS
90 else {
91 STRLEN len = UNISKIP(uv);
92 U8 *p = d+len-1;
93 while (p > d) {
eb160463 94 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
95 uv >>= UTF_ACCUMULATION_SHIFT;
96 }
eb160463 97 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
98 return d+len;
99 }
100#else /* Non loop style */
a0ed51b3 101 if (uv < 0x800) {
eb160463
GS
102 *d++ = (U8)(( uv >> 6) | 0xc0);
103 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
104 return d;
105 }
106 if (uv < 0x10000) {
eb160463
GS
107 *d++ = (U8)(( uv >> 12) | 0xe0);
108 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
110 return d;
111 }
112 if (uv < 0x200000) {
eb160463
GS
113 *d++ = (U8)(( uv >> 18) | 0xf0);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
117 return d;
118 }
119 if (uv < 0x4000000) {
eb160463
GS
120 *d++ = (U8)(( uv >> 24) | 0xf8);
121 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
124 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
125 return d;
126 }
127 if (uv < 0x80000000) {
eb160463
GS
128 *d++ = (U8)(( uv >> 30) | 0xfc);
129 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
133 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
134 return d;
135 }
6b8eaf93 136#ifdef HAS_QUAD
d7578b48 137 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
138#endif
139 {
eb160463
GS
140 *d++ = 0xfe; /* Can't match U+FEFF! */
141 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
142 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
143 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
144 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
145 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
146 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
147 return d;
148 }
6b8eaf93 149#ifdef HAS_QUAD
a0ed51b3 150 {
eb160463
GS
151 *d++ = 0xff; /* Can't match U+FFFE! */
152 *d++ = 0x80; /* 6 Reserved bits */
153 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
154 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
155 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
156 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
157 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
163 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
164 return d;
165 }
166#endif
1d72bdf6 167#endif /* Loop style */
a0ed51b3 168}
b851fbc1
JH
169
170U8 *
171Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
172{
173 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
174}
9041c2e3 175
646ca15d
JH
176/*
177
178Tests if some arbitrary number of bytes begins in a valid UTF-8
179character. Note that an INVARIANT (i.e. ASCII) character is a valid
180UTF-8 character. The actual number of bytes in the UTF-8 character
181will be returned if it is valid, otherwise 0.
182
183This is the "slow" version as opposed to the "fast" version which is
184the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
185difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
186or less you should use the IS_UTF8_CHAR(), for lengths of five or more
187you should use the _slow(). In practice this means that the _slow()
188will be used very rarely, since the maximum Unicode code point (as of
189Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
190the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
191five bytes or more.
192
193=cut */
c053b435 194STATIC STRLEN
646ca15d
JH
195S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
196{
197 U8 u = *s;
198 STRLEN slen;
199 UV uv, ouv;
200
201 if (UTF8_IS_INVARIANT(u))
202 return 1;
203
204 if (!UTF8_IS_START(u))
205 return 0;
206
207 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
208 return 0;
209
210 slen = len - 1;
211 s++;
212 u &= UTF_START_MASK(len);
213 uv = u;
214 ouv = uv;
215 while (slen--) {
216 if (!UTF8_IS_CONTINUATION(*s))
217 return 0;
218 uv = UTF8_ACCUMULATE(uv, *s);
219 if (uv < ouv)
220 return 0;
221 ouv = uv;
222 s++;
223 }
224
225 if ((STRLEN)UNISKIP(uv) < len)
226 return 0;
227
228 return len;
229}
9041c2e3
NIS
230
231/*
7fc63493 232=for apidoc A|STRLEN|is_utf8_char|const U8 *s
eebe1485 233
5da9da9e 234Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01
JH
235character. Note that an INVARIANT (i.e. ASCII) character is a valid
236UTF-8 character. The actual number of bytes in the UTF-8 character
237will be returned if it is valid, otherwise 0.
9041c2e3 238
82686b01 239=cut */
067a85ef 240STRLEN
7fc63493 241Perl_is_utf8_char(pTHX_ const U8 *s)
386d01d6 242{
768c67ee 243 STRLEN len = UTF8SKIP(s);
3b0fc154 244#ifdef IS_UTF8_CHAR
768c67ee 245 if (IS_UTF8_CHAR_FAST(len))
3b0fc154
JH
246 return IS_UTF8_CHAR(s, len) ? len : 0;
247#endif /* #ifdef IS_UTF8_CHAR */
2c0c5f92 248 return is_utf8_char_slow(s, len);
386d01d6
GS
249}
250
6662521e 251/*
7fc63493 252=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
6662521e 253
c9ada85f 254Returns true if first C<len> bytes of the given string form a valid
1e54db1a
JH
255UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
256not mean 'a string that contains code points above 0x7F encoded in UTF-8'
257because a valid ASCII string is a valid UTF-8 string.
6662521e 258
768c67ee
JH
259See also is_utf8_string_loclen() and is_utf8_string_loc().
260
6662521e
GS
261=cut
262*/
263
8e84507e 264bool
7fc63493 265Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
6662521e 266{
7fc63493
AL
267 const U8* x = s;
268 const U8* send;
067a85ef 269
61468b03 270 if (!len && s)
e1ec3a88 271 len = strlen((const char *)s);
1aa99e6b
IH
272 send = s + len;
273
6662521e 274 while (x < send) {
a3b680e6 275 STRLEN c;
1acdb0da
JH
276 /* Inline the easy bits of is_utf8_char() here for speed... */
277 if (UTF8_IS_INVARIANT(*x))
278 c = 1;
279 else if (!UTF8_IS_START(*x))
768c67ee 280 goto out;
1acdb0da
JH
281 else {
282 /* ... and call is_utf8_char() only if really needed. */
646ca15d
JH
283#ifdef IS_UTF8_CHAR
284 c = UTF8SKIP(x);
768c67ee
JH
285 if (IS_UTF8_CHAR_FAST(c)) {
286 if (!IS_UTF8_CHAR(x, c))
287 goto out;
288 } else if (!is_utf8_char_slow(x, c))
289 goto out;
646ca15d
JH
290#else
291 c = is_utf8_char(x);
292#endif /* #ifdef IS_UTF8_CHAR */
1acdb0da 293 if (!c)
768c67ee 294 goto out;
1acdb0da 295 }
6662521e 296 x += c;
6662521e 297 }
768c67ee
JH
298
299 out:
60006e79
JH
300 if (x != send)
301 return FALSE;
067a85ef
A
302
303 return TRUE;
6662521e
GS
304}
305
67e989fb 306/*
768c67ee 307=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
81cd54e3 308
768c67ee
JH
309Like is_ut8_string() but stores the location of the failure (in the
310case of "utf8ness failure") or the location s+len (in the case of
311"utf8ness success") in the C<ep>, and the number of UTF-8
312encoded characters in the C<el>.
313
314See also is_utf8_string_loc() and is_utf8_string().
81cd54e3
JH
315
316=cut
317*/
318
319bool
768c67ee 320Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 321{
7fc63493
AL
322 const U8* x = s;
323 const U8* send;
81cd54e3
JH
324 STRLEN c;
325
61468b03 326 if (!len && s)
e1ec3a88 327 len = strlen((const char *)s);
81cd54e3 328 send = s + len;
768c67ee
JH
329 if (el)
330 *el = 0;
81cd54e3
JH
331
332 while (x < send) {
333 /* Inline the easy bits of is_utf8_char() here for speed... */
334 if (UTF8_IS_INVARIANT(*x))
768c67ee
JH
335 c = 1;
336 else if (!UTF8_IS_START(*x))
337 goto out;
81cd54e3 338 else {
768c67ee
JH
339 /* ... and call is_utf8_char() only if really needed. */
340#ifdef IS_UTF8_CHAR
341 c = UTF8SKIP(x);
342 if (IS_UTF8_CHAR_FAST(c)) {
343 if (!IS_UTF8_CHAR(x, c))
344 c = 0;
345 } else
346 c = is_utf8_char_slow(x, c);
347#else
348 c = is_utf8_char(x);
349#endif /* #ifdef IS_UTF8_CHAR */
350 if (!c)
351 goto out;
81cd54e3 352 }
768c67ee
JH
353 x += c;
354 if (el)
355 (*el)++;
81cd54e3 356 }
768c67ee
JH
357
358 out:
359 if (ep)
360 *ep = x;
361 if (x != send)
81cd54e3 362 return FALSE;
81cd54e3
JH
363
364 return TRUE;
365}
366
367/*
768c67ee
JH
368=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
369
370Like is_ut8_string() but stores the location of the failure (in the
371case of "utf8ness failure") or the location s+len (in the case of
372"utf8ness success") in the C<ep>.
373
374See also is_utf8_string_loclen() and is_utf8_string().
375
376=cut
377*/
378
379bool
380Perl_is_utf8_string_loc(pTHX_ const U8 *s, STRLEN len, const U8 **ep)
381{
382 return is_utf8_string_loclen(s, len, ep, 0);
383}
384
385/*
7fc63493 386=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 387
9041c2e3
NIS
388Bottom level UTF-8 decode routine.
389Returns the unicode code point value of the first character in the string C<s>
1e54db1a 390which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 391C<retlen> will be set to the length, in bytes, of that character.
67e989fb 392
1e54db1a 393If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880
JH
394is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
395it is assumed that the caller will raise a warning, and this function
28d3d195
JH
396will silently just set C<retlen> to C<-1> and return zero. If the
397C<flags> does not contain UTF8_CHECK_ONLY, warnings about
398malformations will be given, C<retlen> will be set to the expected
399length of the UTF-8 character in bytes, and zero will be returned.
400
401The C<flags> can also contain various flags to allow deviations from
402the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 403
9041c2e3
NIS
404Most code should use utf8_to_uvchr() rather than call this directly.
405
37607a96
PK
406=cut
407*/
67e989fb 408
a0ed51b3 409UV
7fc63493 410Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 411{
7fc63493 412 const U8 *s0 = s;
9c5ffd7c 413 UV uv = *s, ouv = 0;
ba210ebe 414 STRLEN len = 1;
7fc63493
AL
415 const bool dowarn = ckWARN_d(WARN_UTF8);
416 const UV startbyte = *s;
ba210ebe 417 STRLEN expectlen = 0;
a0dbb045
JH
418 U32 warning = 0;
419
420/* This list is a superset of the UTF8_ALLOW_XXX. */
421
422#define UTF8_WARN_EMPTY 1
423#define UTF8_WARN_CONTINUATION 2
424#define UTF8_WARN_NON_CONTINUATION 3
425#define UTF8_WARN_FE_FF 4
426#define UTF8_WARN_SHORT 5
427#define UTF8_WARN_OVERFLOW 6
428#define UTF8_WARN_SURROGATE 7
c867b360
JH
429#define UTF8_WARN_LONG 8
430#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045
JH
431
432 if (curlen == 0 &&
433 !(flags & UTF8_ALLOW_EMPTY)) {
434 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
435 goto malformed;
436 }
437
1d72bdf6 438 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
439 if (retlen)
440 *retlen = 1;
c4d5f83a 441 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 442 }
67e989fb 443
421a8bf2 444 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 445 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 446 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
447 goto malformed;
448 }
449
421a8bf2 450 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 451 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 452 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
453 goto malformed;
454 }
9041c2e3 455
1d72bdf6 456#ifdef EBCDIC
75383841 457 uv = NATIVE_TO_UTF(uv);
1d72bdf6 458#else
fcc8fcf6
JH
459 if ((uv == 0xfe || uv == 0xff) &&
460 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 461 warning = UTF8_WARN_FE_FF;
ba210ebe 462 goto malformed;
a0ed51b3 463 }
1d72bdf6
NIS
464#endif
465
ba210ebe
JH
466 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
467 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
468 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
469 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
470#ifdef EBCDIC
471 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
472 else { len = 7; uv &= 0x01; }
473#else
ba210ebe
JH
474 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
475 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
476 else { len = 13; uv = 0; } /* whoa! */
477#endif
478
a0ed51b3
LW
479 if (retlen)
480 *retlen = len;
9041c2e3 481
ba210ebe
JH
482 expectlen = len;
483
fcc8fcf6
JH
484 if ((curlen < expectlen) &&
485 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 486 warning = UTF8_WARN_SHORT;
ba210ebe
JH
487 goto malformed;
488 }
489
490 len--;
a0ed51b3 491 s++;
ba210ebe
JH
492 ouv = uv;
493
a0ed51b3 494 while (len--) {
421a8bf2
JH
495 if (!UTF8_IS_CONTINUATION(*s) &&
496 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
497 s--;
498 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 499 goto malformed;
a0ed51b3
LW
500 }
501 else
8850bf83 502 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
503 if (!(uv > ouv)) {
504 /* These cannot be allowed. */
505 if (uv == ouv) {
75dbc644 506 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045
JH
507 warning = UTF8_WARN_LONG;
508 goto malformed;
509 }
510 }
511 else { /* uv < ouv */
512 /* This cannot be allowed. */
513 warning = UTF8_WARN_OVERFLOW;
514 goto malformed;
515 }
ba210ebe
JH
516 }
517 s++;
518 ouv = uv;
519 }
520
421a8bf2 521 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 522 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 523 warning = UTF8_WARN_SURROGATE;
ba210ebe 524 goto malformed;
eb160463 525 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 526 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 527 warning = UTF8_WARN_LONG;
ba210ebe 528 goto malformed;
421a8bf2 529 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 530 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 531 warning = UTF8_WARN_FFFF;
a9917092 532 goto malformed;
a0ed51b3 533 }
ba210ebe 534
a0ed51b3 535 return uv;
ba210ebe
JH
536
537malformed:
538
fcc8fcf6 539 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 540 if (retlen)
cc366d4b 541 *retlen = -1;
ba210ebe
JH
542 return 0;
543 }
544
a0dbb045
JH
545 if (dowarn) {
546 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
547
548 switch (warning) {
549 case 0: /* Intentionally empty. */ break;
550 case UTF8_WARN_EMPTY:
54667de8 551 Perl_sv_catpv(aTHX_ sv, "(empty string)");
a0dbb045
JH
552 break;
553 case UTF8_WARN_CONTINUATION:
097fb8e2 554 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045
JH
555 break;
556 case UTF8_WARN_NON_CONTINUATION:
097fb8e2
JH
557 if (s == s0)
558 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
559 (UV)s[1], startbyte);
560 else
561 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
5d7488b2 562 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, (int)expectlen);
097fb8e2 563
a0dbb045
JH
564 break;
565 case UTF8_WARN_FE_FF:
566 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
567 break;
568 case UTF8_WARN_SHORT:
097fb8e2 569 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 570 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 571 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
572 break;
573 case UTF8_WARN_OVERFLOW:
097fb8e2
JH
574 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
575 ouv, *s, startbyte);
a0dbb045
JH
576 break;
577 case UTF8_WARN_SURROGATE:
578 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
579 break;
a0dbb045 580 case UTF8_WARN_LONG:
097fb8e2 581 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 582 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045
JH
583 break;
584 case UTF8_WARN_FFFF:
585 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
586 break;
587 default:
54667de8 588 Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
a0dbb045
JH
589 break;
590 }
591
592 if (warning) {
504618e9 593 const char *s = SvPVX_const(sv);
a0dbb045
JH
594
595 if (PL_op)
9014280d 596 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 597 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 598 else
9014280d 599 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045
JH
600 }
601 }
602
ba210ebe 603 if (retlen)
28d3d195 604 *retlen = expectlen ? expectlen : len;
ba210ebe 605
28d3d195 606 return 0;
a0ed51b3
LW
607}
608
8e84507e 609/*
7fc63493 610=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
9041c2e3
NIS
611
612Returns the native character value of the first character in the string C<s>
1e54db1a 613which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
614length, in bytes, of that character.
615
1e54db1a 616If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3
NIS
617returned and retlen is set, if possible, to -1.
618
619=cut
620*/
621
622UV
7fc63493 623Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 624{
89ebb4a3 625 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 626 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3
NIS
627}
628
629/*
7fc63493 630=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
9041c2e3
NIS
631
632Returns the Unicode code point of the first character in the string C<s>
1e54db1a 633which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
634length, in bytes, of that character.
635
636This function should only be used when returned UV is considered
637an index into the Unicode semantic tables (e.g. swashes).
638
1e54db1a 639If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 640returned and retlen is set, if possible, to -1.
8e84507e
NIS
641
642=cut
643*/
644
645UV
7fc63493 646Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 647{
9041c2e3 648 /* Call the low level routine asking for checks */
89ebb4a3 649 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 650 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e
NIS
651}
652
b76347f2 653/*
35a4481c 654=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
b76347f2
JH
655
656Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
657Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
658up past C<e>, croaks.
b76347f2
JH
659
660=cut
661*/
662
663STRLEN
35a4481c 664Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2
JH
665{
666 STRLEN len = 0;
667
8850bf83
JH
668 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
669 * the bitops (especially ~) can create illegal UTF-8.
670 * In other words: in Perl UTF-8 is not just for Unicode. */
671
a3b680e6
AL
672 if (e < s)
673 goto warn_and_return;
b76347f2 674 while (s < e) {
4373e329 675 const U8 t = UTF8SKIP(s);
901b21bf 676 if (e - s < t) {
a3b680e6 677 warn_and_return:
901b21bf
JH
678 if (ckWARN_d(WARN_UTF8)) {
679 if (PL_op)
680 Perl_warner(aTHX_ packWARN(WARN_UTF8),
a3b680e6 681 "%s in %s", unees, OP_DESC(PL_op));
901b21bf
JH
682 else
683 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
684 }
685 return len;
686 }
b76347f2
JH
687 s += t;
688 len++;
689 }
690
691 return len;
692}
693
b06226ff 694/*
35a4481c 695=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
b06226ff 696
1e54db1a 697Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
698and C<b>.
699
700WARNING: use only if you *know* that the pointers point inside the
701same UTF-8 buffer.
702
37607a96
PK
703=cut
704*/
a0ed51b3 705
02eb7b47 706IV
35a4481c 707Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 708{
02eb7b47
JH
709 IV off = 0;
710
8850bf83
JH
711 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
712 * the bitops (especially ~) can create illegal UTF-8.
713 * In other words: in Perl UTF-8 is not just for Unicode. */
714
a0ed51b3
LW
715 if (a < b) {
716 while (a < b) {
35a4481c 717 const U8 c = UTF8SKIP(a);
a3b680e6
AL
718 if (b - a < c)
719 goto warn_and_return;
02eb7b47 720 a += c;
a0ed51b3
LW
721 off--;
722 }
723 }
724 else {
725 while (b < a) {
4373e329 726 const U8 c = UTF8SKIP(b);
02eb7b47 727
901b21bf 728 if (a - b < c) {
a3b680e6 729 warn_and_return:
901b21bf
JH
730 if (ckWARN_d(WARN_UTF8)) {
731 if (PL_op)
732 Perl_warner(aTHX_ packWARN(WARN_UTF8),
733 "%s in %s", unees, OP_DESC(PL_op));
734 else
735 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
736 }
737 return off;
738 }
02eb7b47 739 b += c;
a0ed51b3
LW
740 off++;
741 }
742 }
02eb7b47 743
a0ed51b3
LW
744 return off;
745}
746
b06226ff 747/*
37607a96 748=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 749
8850bf83
JH
750Return the UTF-8 pointer C<s> displaced by C<off> characters, either
751forward or backward.
b06226ff
JH
752
753WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
754the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
755on the first byte of character or just after the last byte of a character.
b06226ff 756
37607a96
PK
757=cut
758*/
a0ed51b3
LW
759
760U8 *
4373e329 761Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 762{
8850bf83
JH
763 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
764 * the bitops (especially ~) can create illegal UTF-8.
765 * In other words: in Perl UTF-8 is not just for Unicode. */
766
a0ed51b3
LW
767 if (off >= 0) {
768 while (off--)
769 s += UTF8SKIP(s);
770 }
771 else {
772 while (off++) {
773 s--;
8850bf83
JH
774 while (UTF8_IS_CONTINUATION(*s))
775 s--;
a0ed51b3
LW
776 }
777 }
4373e329 778 return (U8 *)s;
a0ed51b3
LW
779}
780
6940069f 781/*
eebe1485 782=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 783
1e54db1a 784Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53
MG
785Unlike C<bytes_to_utf8>, this over-writes the original string, and
786updates len to contain the new length.
67e989fb 787Returns zero on failure, setting C<len> to -1.
6940069f
GS
788
789=cut
790*/
791
792U8 *
37607a96 793Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 794{
6940069f
GS
795 U8 *send;
796 U8 *d;
dcad2880 797 U8 *save = s;
246fae53 798
1e54db1a 799 /* ensure valid UTF-8 and chars < 256 before updating string */
dcad2880
JH
800 for (send = s + *len; s < send; ) {
801 U8 c = *s++;
802
1d72bdf6
NIS
803 if (!UTF8_IS_INVARIANT(c) &&
804 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
805 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880
JH
806 *len = -1;
807 return 0;
808 }
246fae53 809 }
dcad2880
JH
810
811 d = s = save;
6940069f 812 while (s < send) {
ed646e6e 813 STRLEN ulen;
9041c2e3 814 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 815 s += ulen;
6940069f
GS
816 }
817 *d = '\0';
246fae53 818 *len = d - save;
6940069f
GS
819 return save;
820}
821
822/*
e1ec3a88 823=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
f9a63242 824
1e54db1a 825Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
35a4481c 826Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
827the newly-created string, and updates C<len> to contain the new
828length. Returns the original string if no conversion occurs, C<len>
829is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
8300 if C<s> is converted or contains all 7bit characters.
f9a63242 831
37607a96
PK
832=cut
833*/
f9a63242
JH
834
835U8 *
e1ec3a88 836Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 837{
f9a63242 838 U8 *d;
e1ec3a88
AL
839 const U8 *start = s;
840 const U8 *send;
f9a63242
JH
841 I32 count = 0;
842
843 if (!*is_utf8)
73d840c0 844 return (U8 *)start;
f9a63242 845
1e54db1a 846 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 847 for (send = s + *len; s < send;) {
e1ec3a88 848 U8 c = *s++;
1d72bdf6 849 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
850 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
851 (c = *s++) && UTF8_IS_CONTINUATION(c))
852 count++;
853 else
73d840c0 854 return (U8 *)start;
db42d148 855 }
f9a63242
JH
856 }
857
858 *is_utf8 = 0;
859
a02a5408 860 Newxz(d, (*len) - count + 1, U8);
ef9edfd0 861 s = start; start = d;
f9a63242
JH
862 while (s < send) {
863 U8 c = *s++;
c4d5f83a
NIS
864 if (!UTF8_IS_INVARIANT(c)) {
865 /* Then it is two-byte encoded */
866 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
867 c = ASCII_TO_NATIVE(c);
868 }
869 *d++ = c;
f9a63242
JH
870 }
871 *d = '\0';
872 *len = d - start;
73d840c0 873 return (U8 *)start;
f9a63242
JH
874}
875
876/*
35a4481c 877=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
6940069f 878
1e54db1a 879Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e
GS
880Returns a pointer to the newly-created string, and sets C<len> to
881reflect the new length.
6940069f 882
1e54db1a 883If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f
JH
884see sv_recode_to_utf8().
885
497711e7 886=cut
6940069f
GS
887*/
888
889U8*
35a4481c 890Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 891{
35a4481c 892 const U8 * const send = s + (*len);
6940069f
GS
893 U8 *d;
894 U8 *dst;
6940069f 895
a02a5408 896 Newxz(d, (*len) * 2 + 1, U8);
6940069f
GS
897 dst = d;
898
899 while (s < send) {
35a4481c 900 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 901 if (UNI_IS_INVARIANT(uv))
eb160463 902 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 903 else {
eb160463
GS
904 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
905 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
906 }
907 }
908 *d = '\0';
6662521e 909 *len = d-dst;
6940069f
GS
910 return dst;
911}
912
a0ed51b3 913/*
dea0fc0b 914 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
915 *
916 * Destination must be pre-extended to 3/2 source. Do not use in-place.
917 * We optimize for native, for obvious reasons. */
918
919U8*
dea0fc0b 920Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 921{
dea0fc0b
JH
922 U8* pend;
923 U8* dstart = d;
924
1de9afcd
RGS
925 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
926 d[0] = 0;
927 *newlen = 1;
928 return d;
929 }
930
dea0fc0b 931 if (bytelen & 1)
014ead4b 932 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
dea0fc0b
JH
933
934 pend = p + bytelen;
935
a0ed51b3 936 while (p < pend) {
dea0fc0b
JH
937 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
938 p += 2;
a0ed51b3 939 if (uv < 0x80) {
eb160463 940 *d++ = (U8)uv;
a0ed51b3
LW
941 continue;
942 }
943 if (uv < 0x800) {
eb160463
GS
944 *d++ = (U8)(( uv >> 6) | 0xc0);
945 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
946 continue;
947 }
948 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e
TD
949 UV low = (p[0] << 8) + p[1];
950 p += 2;
dea0fc0b
JH
951 if (low < 0xdc00 || low >= 0xdfff)
952 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
953 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
954 }
955 if (uv < 0x10000) {
eb160463
GS
956 *d++ = (U8)(( uv >> 12) | 0xe0);
957 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
958 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
959 continue;
960 }
961 else {
eb160463
GS
962 *d++ = (U8)(( uv >> 18) | 0xf0);
963 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
964 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
965 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
966 continue;
967 }
968 }
dea0fc0b 969 *newlen = d - dstart;
a0ed51b3
LW
970 return d;
971}
972
973/* Note: this one is slightly destructive of the source. */
974
975U8*
dea0fc0b 976Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
977{
978 U8* s = (U8*)p;
979 U8* send = s + bytelen;
980 while (s < send) {
981 U8 tmp = s[0];
982 s[0] = s[1];
983 s[1] = tmp;
984 s += 2;
985 }
dea0fc0b 986 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
987}
988
989/* for now these are all defined (inefficiently) in terms of the utf8 versions */
990
991bool
84afefe6 992Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 993{
89ebb4a3 994 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 995 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
996 return is_utf8_alnum(tmpbuf);
997}
998
999bool
84afefe6 1000Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 1001{
89ebb4a3 1002 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1003 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1004 return is_utf8_alnumc(tmpbuf);
1005}
1006
1007bool
84afefe6 1008Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 1009{
89ebb4a3 1010 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1011 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1012 return is_utf8_idfirst(tmpbuf);
1013}
1014
1015bool
84afefe6 1016Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 1017{
89ebb4a3 1018 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1019 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1020 return is_utf8_alpha(tmpbuf);
1021}
1022
1023bool
84afefe6 1024Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 1025{
89ebb4a3 1026 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1027 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1028 return is_utf8_ascii(tmpbuf);
1029}
1030
1031bool
84afefe6 1032Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1033{
89ebb4a3 1034 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1035 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1036 return is_utf8_space(tmpbuf);
1037}
1038
1039bool
84afefe6 1040Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1041{
89ebb4a3 1042 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1043 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1044 return is_utf8_digit(tmpbuf);
1045}
1046
1047bool
84afefe6 1048Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1049{
89ebb4a3 1050 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1051 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1052 return is_utf8_upper(tmpbuf);
1053}
1054
1055bool
84afefe6 1056Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1057{
89ebb4a3 1058 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1059 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1060 return is_utf8_lower(tmpbuf);
1061}
1062
1063bool
84afefe6 1064Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1065{
89ebb4a3 1066 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1067 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1068 return is_utf8_cntrl(tmpbuf);
1069}
1070
1071bool
84afefe6 1072Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1073{
89ebb4a3 1074 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1075 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1076 return is_utf8_graph(tmpbuf);
1077}
1078
1079bool
84afefe6 1080Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1081{
89ebb4a3 1082 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1083 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1084 return is_utf8_print(tmpbuf);
1085}
1086
b8c5462f 1087bool
84afefe6 1088Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1089{
89ebb4a3 1090 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1091 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1092 return is_utf8_punct(tmpbuf);
1093}
1094
4d61ec05 1095bool
84afefe6 1096Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1097{
89ebb4a3 1098 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1099 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1100 return is_utf8_xdigit(tmpbuf);
1101}
1102
84afefe6
JH
1103UV
1104Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1105{
0ebc6274
JH
1106 uvchr_to_utf8(p, c);
1107 return to_utf8_upper(p, p, lenp);
a0ed51b3
LW
1108}
1109
84afefe6
JH
1110UV
1111Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1112{
0ebc6274
JH
1113 uvchr_to_utf8(p, c);
1114 return to_utf8_title(p, p, lenp);
a0ed51b3
LW
1115}
1116
84afefe6
JH
1117UV
1118Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1119{
0ebc6274
JH
1120 uvchr_to_utf8(p, c);
1121 return to_utf8_lower(p, p, lenp);
a0ed51b3
LW
1122}
1123
84afefe6
JH
1124UV
1125Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1126{
0ebc6274
JH
1127 uvchr_to_utf8(p, c);
1128 return to_utf8_fold(p, p, lenp);
84afefe6
JH
1129}
1130
a0ed51b3
LW
1131/* for now these all assume no locale info available for Unicode > 255 */
1132
1133bool
84afefe6 1134Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
1135{
1136 return is_uni_alnum(c); /* XXX no locale support yet */
1137}
1138
1139bool
84afefe6 1140Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f
JH
1141{
1142 return is_uni_alnumc(c); /* XXX no locale support yet */
1143}
1144
1145bool
84afefe6 1146Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
1147{
1148 return is_uni_idfirst(c); /* XXX no locale support yet */
1149}
1150
1151bool
84afefe6 1152Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
1153{
1154 return is_uni_alpha(c); /* XXX no locale support yet */
1155}
1156
1157bool
84afefe6 1158Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
1159{
1160 return is_uni_ascii(c); /* XXX no locale support yet */
1161}
1162
1163bool
84afefe6 1164Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
1165{
1166 return is_uni_space(c); /* XXX no locale support yet */
1167}
1168
1169bool
84afefe6 1170Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
1171{
1172 return is_uni_digit(c); /* XXX no locale support yet */
1173}
1174
1175bool
84afefe6 1176Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1177{
1178 return is_uni_upper(c); /* XXX no locale support yet */
1179}
1180
1181bool
84afefe6 1182Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1183{
1184 return is_uni_lower(c); /* XXX no locale support yet */
1185}
1186
1187bool
84afefe6 1188Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1189{
1190 return is_uni_cntrl(c); /* XXX no locale support yet */
1191}
1192
1193bool
84afefe6 1194Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1195{
1196 return is_uni_graph(c); /* XXX no locale support yet */
1197}
1198
1199bool
84afefe6 1200Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1201{
1202 return is_uni_print(c); /* XXX no locale support yet */
1203}
1204
b8c5462f 1205bool
84afefe6 1206Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1207{
1208 return is_uni_punct(c); /* XXX no locale support yet */
1209}
1210
4d61ec05 1211bool
84afefe6 1212Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1213{
1214 return is_uni_xdigit(c); /* XXX no locale support yet */
1215}
1216
b7ac61fa
JH
1217U32
1218Perl_to_uni_upper_lc(pTHX_ U32 c)
1219{
ee099d14
JH
1220 /* XXX returns only the first character -- do not use XXX */
1221 /* XXX no locale support yet */
1222 STRLEN len;
89ebb4a3 1223 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1224 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1225}
1226
1227U32
1228Perl_to_uni_title_lc(pTHX_ U32 c)
1229{
ee099d14
JH
1230 /* XXX returns only the first character XXX -- do not use XXX */
1231 /* XXX no locale support yet */
1232 STRLEN len;
89ebb4a3 1233 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1234 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1235}
1236
1237U32
1238Perl_to_uni_lower_lc(pTHX_ U32 c)
1239{
ee099d14
JH
1240 /* XXX returns only the first character -- do not use XXX */
1241 /* XXX no locale support yet */
1242 STRLEN len;
89ebb4a3 1243 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1244 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1245}
1246
a0ed51b3 1247bool
7fc63493 1248Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1249{
386d01d6
GS
1250 if (!is_utf8_char(p))
1251 return FALSE;
a0ed51b3 1252 if (!PL_utf8_alnum)
289d4f09
ML
1253 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1254 * descendant of isalnum(3), in other words, it doesn't
1255 * contain the '_'. --jhi */
1256 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
eb160463 1257 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3
LW
1258/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1259#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1260 if (!PL_utf8_alnum)
1261 PL_utf8_alnum = swash_init("utf8", "",
1262 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1263 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3
LW
1264#endif
1265}
1266
1267bool
7fc63493 1268Perl_is_utf8_alnumc(pTHX_ const U8 *p)
b8c5462f 1269{
386d01d6
GS
1270 if (!is_utf8_char(p))
1271 return FALSE;
b8c5462f
JH
1272 if (!PL_utf8_alnum)
1273 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
eb160463 1274 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f
JH
1275/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1276#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1277 if (!PL_utf8_alnum)
1278 PL_utf8_alnum = swash_init("utf8", "",
1279 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1280 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f
JH
1281#endif
1282}
1283
1284bool
7fc63493 1285Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1286{
82686b01
JH
1287 if (*p == '_')
1288 return TRUE;
1289 if (!is_utf8_char(p))
1290 return FALSE;
1291 if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1292 PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
eb160463 1293 return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
82686b01
JH
1294}
1295
1296bool
7fc63493 1297Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01
JH
1298{
1299 if (*p == '_')
1300 return TRUE;
1301 if (!is_utf8_char(p))
1302 return FALSE;
1303 if (!PL_utf8_idcont)
1304 PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
eb160463 1305 return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
a0ed51b3
LW
1306}
1307
1308bool
7fc63493 1309Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1310{
386d01d6
GS
1311 if (!is_utf8_char(p))
1312 return FALSE;
a0ed51b3 1313 if (!PL_utf8_alpha)
e24b16f9 1314 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
eb160463 1315 return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
a0ed51b3
LW
1316}
1317
1318bool
7fc63493 1319Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1320{
386d01d6
GS
1321 if (!is_utf8_char(p))
1322 return FALSE;
b8c5462f
JH
1323 if (!PL_utf8_ascii)
1324 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
eb160463 1325 return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
b8c5462f
JH
1326}
1327
1328bool
7fc63493 1329Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1330{
386d01d6
GS
1331 if (!is_utf8_char(p))
1332 return FALSE;
a0ed51b3 1333 if (!PL_utf8_space)
3bec3564 1334 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
eb160463 1335 return swash_fetch(PL_utf8_space, p, TRUE) != 0;
a0ed51b3
LW
1336}
1337
1338bool
7fc63493 1339Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1340{
386d01d6
GS
1341 if (!is_utf8_char(p))
1342 return FALSE;
a0ed51b3 1343 if (!PL_utf8_digit)
e24b16f9 1344 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
eb160463 1345 return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
a0ed51b3
LW
1346}
1347
1348bool
7fc63493 1349Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1350{
386d01d6
GS
1351 if (!is_utf8_char(p))
1352 return FALSE;
a0ed51b3 1353 if (!PL_utf8_upper)
c65e4d19 1354 PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
eb160463 1355 return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
a0ed51b3
LW
1356}
1357
1358bool
7fc63493 1359Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1360{
386d01d6
GS
1361 if (!is_utf8_char(p))
1362 return FALSE;
a0ed51b3 1363 if (!PL_utf8_lower)
c65e4d19 1364 PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
eb160463 1365 return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
a0ed51b3
LW
1366}
1367
1368bool
7fc63493 1369Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1370{
386d01d6
GS
1371 if (!is_utf8_char(p))
1372 return FALSE;
b8c5462f
JH
1373 if (!PL_utf8_cntrl)
1374 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
eb160463 1375 return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
b8c5462f
JH
1376}
1377
1378bool
7fc63493 1379Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1380{
386d01d6
GS
1381 if (!is_utf8_char(p))
1382 return FALSE;
b8c5462f
JH
1383 if (!PL_utf8_graph)
1384 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
eb160463 1385 return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
b8c5462f
JH
1386}
1387
1388bool
7fc63493 1389Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1390{
386d01d6
GS
1391 if (!is_utf8_char(p))
1392 return FALSE;
a0ed51b3 1393 if (!PL_utf8_print)
e24b16f9 1394 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
eb160463 1395 return swash_fetch(PL_utf8_print, p, TRUE) != 0;
a0ed51b3
LW
1396}
1397
1398bool
7fc63493 1399Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1400{
386d01d6
GS
1401 if (!is_utf8_char(p))
1402 return FALSE;
b8c5462f
JH
1403 if (!PL_utf8_punct)
1404 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
eb160463 1405 return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
b8c5462f
JH
1406}
1407
1408bool
7fc63493 1409Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1410{
386d01d6
GS
1411 if (!is_utf8_char(p))
1412 return FALSE;
b8c5462f
JH
1413 if (!PL_utf8_xdigit)
1414 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
eb160463 1415 return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
b8c5462f
JH
1416}
1417
1418bool
7fc63493 1419Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1420{
386d01d6
GS
1421 if (!is_utf8_char(p))
1422 return FALSE;
a0ed51b3 1423 if (!PL_utf8_mark)
e24b16f9 1424 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
eb160463 1425 return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
a0ed51b3
LW
1426}
1427
6b5c0936
JH
1428/*
1429=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1430
1431The "p" contains the pointer to the UTF-8 string encoding
1432the character that is being converted.
1433
1434The "ustrp" is a pointer to the character buffer to put the
1435conversion result to. The "lenp" is a pointer to the length
1436of the result.
1437
0134edef 1438The "swashp" is a pointer to the swash to use.
6b5c0936 1439
0134edef
JH
1440Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1441and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1442but not always, a multicharacter mapping), is tried first.
6b5c0936 1443
0134edef
JH
1444The "special" is a string like "utf8::ToSpecLower", which means the
1445hash %utf8::ToSpecLower. The access to the hash is through
1446Perl_to_utf8_case().
6b5c0936 1447
0134edef
JH
1448The "normal" is a string like "ToLower" which means the swash
1449%utf8::ToLower.
1450
1451=cut */
6b5c0936 1452
2104c8d9 1453UV
9a957fbc
AL
1454Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp,
1455 SV **swashp, const char *normal, const char *special)
a0ed51b3 1456{
89ebb4a3 1457 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1458 STRLEN len = 0;
a0ed51b3 1459
aec46f14 1460 const UV uv0 = utf8_to_uvchr(p, NULL);
1feea2c7
JH
1461 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1462 * are necessary in EBCDIC, they are redundant no-ops
1463 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 1464 const UV uv1 = NATIVE_TO_UNI(uv0);
1feea2c7 1465 uvuni_to_utf8(tmpbuf, uv1);
0134edef
JH
1466
1467 if (!*swashp) /* load on-demand */
1468 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1469
b08cf34e
JH
1470 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1471 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1472 /* It might be "special" (sometimes, but not always,
2a37f04d 1473 * a multicharacter mapping) */
983ffd37 1474 HV *hv;
b08cf34e
JH
1475 SV **svp;
1476
1477 if ((hv = get_hv(special, FALSE)) &&
1478 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1479 (*svp)) {
cfd0369c 1480 const char *s;
47654450 1481
cfd0369c 1482 s = SvPV_const(*svp, len);
47654450
JH
1483 if (len == 1)
1484 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1485 else {
2f9475ad
JH
1486#ifdef EBCDIC
1487 /* If we have EBCDIC we need to remap the characters
1488 * since any characters in the low 256 are Unicode
1489 * code points, not EBCDIC. */
7cda7a3d 1490 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad
JH
1491
1492 d = tmpbuf;
b08cf34e 1493 if (SvUTF8(*svp)) {
2f9475ad
JH
1494 STRLEN tlen = 0;
1495
1496 while (t < tend) {
1497 UV c = utf8_to_uvchr(t, &tlen);
1498 if (tlen > 0) {
1499 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1500 t += tlen;
1501 }
1502 else
1503 break;
1504 }
1505 }
1506 else {
36fec512
JH
1507 while (t < tend) {
1508 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1509 t++;
1510 }
2f9475ad
JH
1511 }
1512 len = d - tmpbuf;
1513 Copy(tmpbuf, ustrp, len, U8);
1514#else
d2dcd0fb 1515 Copy(s, ustrp, len, U8);
2f9475ad 1516#endif
29e98929 1517 }
983ffd37 1518 }
0134edef
JH
1519 }
1520
1521 if (!len && *swashp) {
1522 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1523
1524 if (uv2) {
1525 /* It was "normal" (a single character mapping). */
1526 UV uv3 = UNI_TO_NATIVE(uv2);
1527
e9101d72 1528 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d
JH
1529 }
1530 }
1feea2c7 1531
0134edef
JH
1532 if (!len) /* Neither: just copy. */
1533 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1534
2a37f04d
JH
1535 if (lenp)
1536 *lenp = len;
1537
0134edef 1538 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3
LW
1539}
1540
d3e79532 1541/*
7fc63493 1542=for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1543
1544Convert the UTF-8 encoded character at p to its uppercase version and
1545store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1546that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1547the uppercase version may be longer than the original character.
d3e79532
JH
1548
1549The first character of the uppercased version is returned
1550(but note, as explained above, that there may be more.)
1551
1552=cut */
1553
2104c8d9 1554UV
7fc63493 1555Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1556{
983ffd37 1557 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1558 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1559}
a0ed51b3 1560
d3e79532 1561/*
7fc63493 1562=for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1563
1564Convert the UTF-8 encoded character at p to its titlecase version and
1565store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1566that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1567titlecase version may be longer than the original character.
d3e79532
JH
1568
1569The first character of the titlecased version is returned
1570(but note, as explained above, that there may be more.)
1571
1572=cut */
1573
983ffd37 1574UV
7fc63493 1575Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
983ffd37
JH
1576{
1577 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1578 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1579}
1580
d3e79532 1581/*
7fc63493 1582=for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1583
1584Convert the UTF-8 encoded character at p to its lowercase version and
1585store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1586that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1587lowercase version may be longer than the original character.
d3e79532
JH
1588
1589The first character of the lowercased version is returned
1590(but note, as explained above, that there may be more.)
1591
1592=cut */
1593
2104c8d9 1594UV
7fc63493 1595Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1596{
983ffd37 1597 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1598 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1599}
1600
d3e79532 1601/*
7fc63493 1602=for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1603
1604Convert the UTF-8 encoded character at p to its foldcase version and
1605store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1606that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532
JH
1607foldcase version may be longer than the original character (up to
1608three characters).
1609
1610The first character of the foldcased version is returned
1611(but note, as explained above, that there may be more.)
1612
1613=cut */
1614
b4e400f9 1615UV
7fc63493 1616Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
b4e400f9
JH
1617{
1618 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1619 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1620}
1621
1622/* a "swash" is a swatch hash */
1623
1624SV*
7fc63493 1625Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1626{
27da23d5 1627 dVAR;
a0ed51b3 1628 SV* retval;
9a957fbc 1629 SV* const tokenbufsv = sv_newmortal();
8e84507e 1630 dSP;
7fc63493
AL
1631 const size_t pkg_len = strlen(pkg);
1632 const size_t name_len = strlen(name);
aec46f14 1633 HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
f8be5cf0 1634 SV* errsv_save;
ce3b816e 1635
96ca9f55
DM
1636 PUSHSTACKi(PERLSI_MAGIC);
1637 ENTER;
1638 SAVEI32(PL_hints);
1639 PL_hints = 0;
1640 save_re_context();
1b026014 1641 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1642 ENTER;
f8be5cf0 1643 errsv_save = newSVsv(ERRSV);
71bed85a
NC
1644 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1645 Nullsv);
f8be5cf0
JH
1646 if (!SvTRUE(ERRSV))
1647 sv_setsv(ERRSV, errsv_save);
1648 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1649 LEAVE;
1650 }
1651 SPAGAIN;
a0ed51b3
LW
1652 PUSHMARK(SP);
1653 EXTEND(SP,5);
71bed85a
NC
1654 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1655 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
a0ed51b3
LW
1656 PUSHs(listsv);
1657 PUSHs(sv_2mortal(newSViv(minbits)));
1658 PUSHs(sv_2mortal(newSViv(none)));
1659 PUTBACK;
923e4eb5 1660 if (IN_PERL_COMPILETIME) {
bf1fed83 1661 /* XXX ought to be handled by lex_start */
82686b01 1662 SAVEI32(PL_in_my);
2b4bd638 1663 PL_in_my = 0;
bf1fed83 1664 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1665 }
f8be5cf0 1666 errsv_save = newSVsv(ERRSV);
864dbfa3 1667 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1668 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1669 else
e24b16f9 1670 retval = &PL_sv_undef;
f8be5cf0
JH
1671 if (!SvTRUE(ERRSV))
1672 sv_setsv(ERRSV, errsv_save);
1673 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1674 LEAVE;
1675 POPSTACK;
923e4eb5 1676 if (IN_PERL_COMPILETIME) {
bf1fed83 1677 STRLEN len;
aec46f14 1678 const char* const pv = SvPV_const(tokenbufsv, len);
bf1fed83
JH
1679
1680 Copy(pv, PL_tokenbuf, len+1, char);
eb160463 1681 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1682 }
bc45ce41
JH
1683 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1684 if (SvPOK(retval))
35c1215d
NC
1685 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1686 retval);
cea2e8a9 1687 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1688 }
a0ed51b3
LW
1689 return retval;
1690}
1691
035d37be
JH
1692
1693/* This API is wrong for special case conversions since we may need to
1694 * return several Unicode characters for a single Unicode character
1695 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1696 * the lower-level routine, and it is similarly broken for returning
1697 * multiple values. --jhi */
a0ed51b3 1698UV
7fc63493 1699Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
a0ed51b3 1700{
27da23d5 1701 dVAR;
aec46f14 1702 HV* const hv = (HV*)SvRV(sv);
3568d838
JH
1703 U32 klen;
1704 U32 off;
a0ed51b3 1705 STRLEN slen;
7d85a32c 1706 STRLEN needents;
cfd0369c 1707 const U8 *tmps = NULL;
a0ed51b3
LW
1708 U32 bit;
1709 SV *retval;
3568d838
JH
1710 U8 tmputf8[2];
1711 UV c = NATIVE_TO_ASCII(*ptr);
1712
1713 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
eb160463
GS
1714 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1715 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
3568d838
JH
1716 ptr = tmputf8;
1717 }
1718 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1719 * then the "swatch" is a vec() for al the chars which start
1720 * with 0xAA..0xYY
1721 * So the key in the hash (klen) is length of encoded char -1
1722 */
1723 klen = UTF8SKIP(ptr) - 1;
1724 off = ptr[klen];
a0ed51b3 1725
7d85a32c
JH
1726 if (klen == 0)
1727 {
1728 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1729 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c
JH
1730 */
1731 needents = UTF_CONTINUATION_MARK;
1732 off = NATIVE_TO_UTF(ptr[klen]);
1733 }
1734 else
1735 {
1736 /* If char is encoded then swatch is for the prefix */
1737 needents = (1 << UTF_ACCUMULATION_SHIFT);
1738 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1739 }
1740
a0ed51b3
LW
1741 /*
1742 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1743 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1744 * it's nothing to sniff at.) Pity we usually come through at least
1745 * two function calls to get here...
1746 *
1747 * NB: this code assumes that swatches are never modified, once generated!
1748 */
1749
3568d838 1750 if (hv == PL_last_swash_hv &&
a0ed51b3 1751 klen == PL_last_swash_klen &&
27da23d5 1752 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1753 {
1754 tmps = PL_last_swash_tmps;
1755 slen = PL_last_swash_slen;
1756 }
1757 else {
1758 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 1759 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3
LW
1760
1761 /* If not cached, generate it via utf8::SWASHGET */
cfd0369c 1762 if (!svp || !SvPOK(*svp) || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
a0ed51b3 1763 dSP;
2b9d42f0
NIS
1764 /* We use utf8n_to_uvuni() as we want an index into
1765 Unicode tables, not a native character number.
1766 */
aec46f14 1767 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae
JH
1768 ckWARN(WARN_UTF8) ?
1769 0 : UTF8_ALLOW_ANY);
f8be5cf0 1770 SV *errsv_save;
a0ed51b3
LW
1771 ENTER;
1772 SAVETMPS;
1773 save_re_context();
1774 PUSHSTACKi(PERLSI_MAGIC);
1775 PUSHMARK(SP);
1776 EXTEND(SP,3);
1777 PUSHs((SV*)sv);
ffbc6a93 1778 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838
JH
1779 PUSHs(sv_2mortal(newSViv((klen) ?
1780 (code_point & ~(needents - 1)) : 0)));
a0ed51b3
LW
1781 PUSHs(sv_2mortal(newSViv(needents)));
1782 PUTBACK;
f8be5cf0 1783 errsv_save = newSVsv(ERRSV);
864dbfa3 1784 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1785 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1786 else
e24b16f9 1787 retval = &PL_sv_undef;
f8be5cf0
JH
1788 if (!SvTRUE(ERRSV))
1789 sv_setsv(ERRSV, errsv_save);
1790 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1791 POPSTACK;
1792 FREETMPS;
1793 LEAVE;
923e4eb5 1794 if (IN_PERL_COMPILETIME)
eb160463 1795 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1796
e1ec3a88 1797 svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
a0ed51b3 1798
7d85a32c 1799 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1800 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3
LW
1801 }
1802
1803 PL_last_swash_hv = hv;
1804 PL_last_swash_klen = klen;
cfd0369c
NC
1805 /* FIXME change interpvar.h? */
1806 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
1807 PL_last_swash_slen = slen;
1808 if (klen)
1809 Copy(ptr, PL_last_swash_key, klen, U8);
1810 }
1811
9faf8d75 1812 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1813 case 1:
1814 bit = 1 << (off & 7);
1815 off >>= 3;
1816 return (tmps[off] & bit) != 0;
1817 case 8:
1818 return tmps[off];
1819 case 16:
1820 off <<= 1;
1821 return (tmps[off] << 8) + tmps[off + 1] ;
1822 case 32:
1823 off <<= 2;
1824 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1825 }
cea2e8a9 1826 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3
LW
1827 return 0;
1828}
2b9d42f0
NIS
1829
1830
1831/*
37607a96 1832=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2b9d42f0 1833
1e54db1a 1834Adds the UTF-8 representation of the Native codepoint C<uv> to the end
89ebb4a3 1835of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2b9d42f0
NIS
1836bytes available. The return value is the pointer to the byte after the
1837end of the new character. In other words,
1838
1839 d = uvchr_to_utf8(d, uv);
1840
1841is the recommended wide native character-aware way of saying
1842
1843 *(d++) = uv;
1844
1845=cut
1846*/
1847
1848/* On ASCII machines this is normally a macro but we want a
1849 real function in case XS code wants it
1850*/
1851#undef Perl_uvchr_to_utf8
1852U8 *
1853Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1854{
b851fbc1 1855 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2b9d42f0
NIS
1856}
1857
b851fbc1
JH
1858U8 *
1859Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1860{
1861 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1862}
2b9d42f0
NIS
1863
1864/*
37607a96 1865=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
2b9d42f0
NIS
1866
1867Returns the native character value of the first character in the string C<s>
1e54db1a 1868which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2b9d42f0
NIS
1869length, in bytes, of that character.
1870
1871Allows length and flags to be passed to low level routine.
1872
1873=cut
1874*/
0a2ef054
JH
1875/* On ASCII machines this is normally a macro but we want
1876 a real function in case XS code wants it
2b9d42f0
NIS
1877*/
1878#undef Perl_utf8n_to_uvchr
1879UV
7fc63493 1880Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2b9d42f0 1881{
aec46f14 1882 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2b9d42f0
NIS
1883 return UNI_TO_NATIVE(uv);
1884}
1885
d2cc3551
JH
1886/*
1887=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1888
1889Build to the scalar dsv a displayable version of the string spv,
1890length len, the displayable version being at most pvlim bytes long
1891(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1892
9e55ce06 1893The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 1894isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054
JH
1895to display the \\[nrfta\\] as the backslashed versions (like '\n')
1896(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1897UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1898UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1899
d2cc3551
JH
1900The pointer to the PV of the dsv is returned.
1901
1902=cut */
e6b2e755 1903char *
e1ec3a88 1904Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755
JH
1905{
1906 int truncated = 0;
e1ec3a88 1907 const char *s, *e;
e6b2e755
JH
1908
1909 sv_setpvn(dsv, "", 0);
e1ec3a88 1910 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 1911 UV u;
a49f32c6
NC
1912 /* This serves double duty as a flag and a character to print after
1913 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
1914 */
1915 char ok = 0;
c728cb41 1916
e6b2e755
JH
1917 if (pvlim && SvCUR(dsv) >= pvlim) {
1918 truncated++;
1919 break;
1920 }
1921 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 1922 if (u < 256) {
a3b680e6 1923 const unsigned char c = (unsigned char)u & 0xFF;
c728cb41 1924 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
a49f32c6 1925 switch (c) {
c728cb41 1926 case '\n':
a49f32c6 1927 ok = 'n'; break;
c728cb41 1928 case '\r':
a49f32c6 1929 ok = 'r'; break;
c728cb41 1930 case '\t':
a49f32c6 1931 ok = 't'; break;
c728cb41 1932 case '\f':
a49f32c6 1933 ok = 'f'; break;
c728cb41 1934 case '\a':
a49f32c6 1935 ok = 'a'; break;
c728cb41 1936 case '\\':
a49f32c6 1937 ok = '\\'; break;
c728cb41
JH
1938 default: break;
1939 }
a49f32c6
NC
1940 if (ok) {
1941 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
1942 }
c728cb41 1943 }
00e86452 1944 /* isPRINT() is the locale-blind version. */
a49f32c6
NC
1945 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
1946 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
1947 ok = 1;
0a2ef054 1948 }
c728cb41
JH
1949 }
1950 if (!ok)
9e55ce06 1951 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755
JH
1952 }
1953 if (truncated)
1954 sv_catpvn(dsv, "...", 3);
1955
1956 return SvPVX(dsv);
1957}
2b9d42f0 1958
d2cc3551
JH
1959/*
1960=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1961
1962Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 1963the displayable version being at most pvlim bytes long
d2cc3551 1964(if longer, the rest is truncated and "..." will be appended).
0a2ef054
JH
1965
1966The flags argument is as in pv_uni_display().
1967
d2cc3551
JH
1968The pointer to the PV of the dsv is returned.
1969
1970=cut */
e6b2e755
JH
1971char *
1972Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1973{
cfd0369c
NC
1974 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
1975 SvCUR(ssv), pvlim, flags);
701a277b
JH
1976}
1977
d2cc3551 1978/*
d07ddd77 1979=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551
JH
1980
1981Return true if the strings s1 and s2 differ case-insensitively, false
1982if not (if they are equal case-insensitively). If u1 is true, the
1983string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77
JH
1984the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1985are false, the respective string is assumed to be in native 8-bit
1986encoding.
1987
1988If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1989in there (they will point at the beginning of the I<next> character).
1990If the pointers behind pe1 or pe2 are non-NULL, they are the end
1991pointers beyond which scanning will not continue under any
4cdaeff7 1992circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77
JH
1993s2+l2 will be used as goal end pointers that will also stop the scan,
1994and which qualify towards defining a successful match: all the scans
1995that define an explicit length must reach their goal pointers for
1996a match to succeed).
d2cc3551
JH
1997
1998For case-insensitiveness, the "casefolding" of Unicode is used
1999instead of upper/lowercasing both the characters, see
2000http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2001
2002=cut */
701a277b 2003I32
d07ddd77 2004Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 2005{
e1ec3a88
AL
2006 register const U8 *p1 = (const U8*)s1;
2007 register const U8 *p2 = (const U8*)s2;
2008 register const U8 *f1 = 0, *f2 = 0;
2009 register U8 *e1 = 0, *q1 = 0;
2010 register U8 *e2 = 0, *q2 = 0;
d07ddd77 2011 STRLEN n1 = 0, n2 = 0;
89ebb4a3
JH
2012 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2013 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8
JH
2014 U8 natbuf[1+1];
2015 STRLEN foldlen1, foldlen2;
d07ddd77 2016 bool match;
332ddc25 2017
d07ddd77
JH
2018 if (pe1)
2019 e1 = *(U8**)pe1;
e1ec3a88
AL
2020 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2021 f1 = (const U8*)s1 + l1;
d07ddd77
JH
2022 if (pe2)
2023 e2 = *(U8**)pe2;
e1ec3a88
AL
2024 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2025 f2 = (const U8*)s2 + l2;
d07ddd77
JH
2026
2027 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2028 return 1; /* mismatch; possible infinite loop or false positive */
2029
a6872d42
JH
2030 if (!u1 || !u2)
2031 natbuf[1] = 0; /* Need to terminate the buffer. */
2032
d07ddd77
JH
2033 while ((e1 == 0 || p1 < e1) &&
2034 (f1 == 0 || p1 < f1) &&
2035 (e2 == 0 || p2 < e2) &&
2036 (f2 == 0 || p2 < f2)) {
2037 if (n1 == 0) {
d7f013c8
JH
2038 if (u1)
2039 to_utf8_fold(p1, foldbuf1, &foldlen1);
2040 else {
809e8e66 2041 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
d7f013c8
JH
2042 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2043 }
2044 q1 = foldbuf1;
d07ddd77 2045 n1 = foldlen1;
332ddc25 2046 }
d07ddd77 2047 if (n2 == 0) {
d7f013c8
JH
2048 if (u2)
2049 to_utf8_fold(p2, foldbuf2, &foldlen2);
2050 else {
809e8e66 2051 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
d7f013c8
JH
2052 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2053 }
2054 q2 = foldbuf2;
d07ddd77 2055 n2 = foldlen2;
332ddc25 2056 }
d07ddd77
JH
2057 while (n1 && n2) {
2058 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2059 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2060 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 2061 return 1; /* mismatch */
d07ddd77 2062 n1 -= UTF8SKIP(q1);
d7f013c8 2063 q1 += UTF8SKIP(q1);
d07ddd77 2064 n2 -= UTF8SKIP(q2);
d7f013c8 2065 q2 += UTF8SKIP(q2);
701a277b 2066 }
d07ddd77 2067 if (n1 == 0)
d7f013c8 2068 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2069 if (n2 == 0)
d7f013c8
JH
2070 p2 += u2 ? UTF8SKIP(p2) : 1;
2071
d2cc3551 2072 }
5469e704 2073
d07ddd77
JH
2074 /* A match is defined by all the scans that specified
2075 * an explicit length reaching their final goals. */
2076 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704
JH
2077
2078 if (match) {
d07ddd77
JH
2079 if (pe1)
2080 *pe1 = (char*)p1;
2081 if (pe2)
2082 *pe2 = (char*)p2;
5469e704
JH
2083 }
2084
2085 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2086}
701a277b 2087
a49f32c6
NC
2088/*
2089 * Local variables:
2090 * c-indentation-style: bsd
2091 * c-basic-offset: 4
2092 * indent-tabs-mode: t
2093 * End:
2094 *
37442d52
RGS
2095 * ex: set ts=8 sts=4 sw=4 noet:
2096 */