This is a live mirror of the Perl 5 development currently hosted at https://github.com/perl/perl5
$SIG{__WARN__} = sub { goto &foo } could recurse infinitely
[perl5.git] / utf8.c
CommitLineData
a0ed51b3
LW
1/* utf8.c
2 *
af3babe4
NC
3 * Copyright (C) 2000, 2001, 2002, 2003, 2004, 2005 by Larry Wall and
4 * others
a0ed51b3
LW
5 *
6 * You may distribute under the terms of either the GNU General Public
7 * License or the Artistic License, as specified in the README file.
8 *
9 */
10
11/*
12 * 'What a fix!' said Sam. 'That's the one place in all the lands we've ever
13 * heard of that we don't want to see any closer; and that's the one place
14 * we're trying to get to! And that's just where we can't get, nohow.'
15 *
16 * 'Well do I understand your speech,' he answered in the same language;
17 * 'yet few strangers do so. Why then do you not speak in the Common Tongue,
18 * as is the custom in the West, if you wish to be answered?'
19 *
20 * ...the travellers perceived that the floor was paved with stones of many
21 * hues; branching runes and strange devices intertwined beneath their feet.
22 */
23
24#include "EXTERN.h"
864dbfa3 25#define PERL_IN_UTF8_C
a0ed51b3
LW
26#include "perl.h"
27
27da23d5
JH
28static const char unees[] =
29 "Malformed UTF-8 character (unexpected end of string)";
901b21bf 30
ccfc67b7
JH
31/*
32=head1 Unicode Support
a0ed51b3 33
166f8a29
DM
34This file contains various utility functions for manipulating UTF8-encoded
35strings. For the uninitiated, this is a method of representing arbitrary
61296642 36Unicode characters as a variable number of bytes, in such a way that
56da48f7
DM
37characters in the ASCII range are unmodified, and a zero byte never appears
38within non-zero characters.
166f8a29 39
b851fbc1 40=for apidoc A|U8 *|uvuni_to_utf8_flags|U8 *d|UV uv|UV flags
eebe1485 41
1e54db1a 42Adds the UTF-8 representation of the Unicode codepoint C<uv> to the end
89ebb4a3 43of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
eebe1485 44bytes available. The return value is the pointer to the byte after the
9041c2e3 45end of the new character. In other words,
eebe1485 46
b851fbc1
JH
47 d = uvuni_to_utf8_flags(d, uv, flags);
48
49or, in most cases,
50
9041c2e3 51 d = uvuni_to_utf8(d, uv);
eebe1485 52
b851fbc1
JH
53(which is equivalent to)
54
55 d = uvuni_to_utf8_flags(d, uv, 0);
56
eebe1485
SC
57is the recommended Unicode-aware way of saying
58
59 *(d++) = uv;
60
61=cut
62*/
63
dfe13c55 64U8 *
b851fbc1 65Perl_uvuni_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
a0ed51b3 66{
62961d2e 67 if (ckWARN(WARN_UTF8)) {
b851fbc1
JH
68 if (UNICODE_IS_SURROGATE(uv) &&
69 !(flags & UNICODE_ALLOW_SURROGATE))
9014280d 70 Perl_warner(aTHX_ packWARN(WARN_UTF8), "UTF-16 surrogate 0x%04"UVxf, uv);
b851fbc1
JH
71 else if (
72 ((uv >= 0xFDD0 && uv <= 0xFDEF &&
73 !(flags & UNICODE_ALLOW_FDD0))
74 ||
c867b360 75 ((uv & 0xFFFE) == 0xFFFE && /* Either FFFE or FFFF. */
b851fbc1
JH
76 !(flags & UNICODE_ALLOW_FFFF))) &&
77 /* UNICODE_ALLOW_SUPER includes
2a20b9da 78 * FFFEs and FFFFs beyond 0x10FFFF. */
b851fbc1
JH
79 ((uv <= PERL_UNICODE_MAX) ||
80 !(flags & UNICODE_ALLOW_SUPER))
81 )
9014280d 82 Perl_warner(aTHX_ packWARN(WARN_UTF8),
507b9800
JH
83 "Unicode character 0x%04"UVxf" is illegal", uv);
84 }
c4d5f83a 85 if (UNI_IS_INVARIANT(uv)) {
eb160463 86 *d++ = (U8)UTF_TO_NATIVE(uv);
a0ed51b3
LW
87 return d;
88 }
2d331972 89#if defined(EBCDIC)
1d72bdf6
NIS
90 else {
91 STRLEN len = UNISKIP(uv);
92 U8 *p = d+len-1;
93 while (p > d) {
eb160463 94 *p-- = (U8)UTF_TO_NATIVE((uv & UTF_CONTINUATION_MASK) | UTF_CONTINUATION_MARK);
1d72bdf6
NIS
95 uv >>= UTF_ACCUMULATION_SHIFT;
96 }
eb160463 97 *p = (U8)UTF_TO_NATIVE((uv & UTF_START_MASK(len)) | UTF_START_MARK(len));
1d72bdf6
NIS
98 return d+len;
99 }
100#else /* Non loop style */
a0ed51b3 101 if (uv < 0x800) {
eb160463
GS
102 *d++ = (U8)(( uv >> 6) | 0xc0);
103 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
104 return d;
105 }
106 if (uv < 0x10000) {
eb160463
GS
107 *d++ = (U8)(( uv >> 12) | 0xe0);
108 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
109 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
110 return d;
111 }
112 if (uv < 0x200000) {
eb160463
GS
113 *d++ = (U8)(( uv >> 18) | 0xf0);
114 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
115 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
116 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
117 return d;
118 }
119 if (uv < 0x4000000) {
eb160463
GS
120 *d++ = (U8)(( uv >> 24) | 0xf8);
121 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
122 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
123 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
124 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
125 return d;
126 }
127 if (uv < 0x80000000) {
eb160463
GS
128 *d++ = (U8)(( uv >> 30) | 0xfc);
129 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
130 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
131 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
132 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
133 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
134 return d;
135 }
6b8eaf93 136#ifdef HAS_QUAD
d7578b48 137 if (uv < UTF8_QUAD_MAX)
a0ed51b3
LW
138#endif
139 {
eb160463
GS
140 *d++ = 0xfe; /* Can't match U+FEFF! */
141 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
142 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
143 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
144 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
145 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
146 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
147 return d;
148 }
6b8eaf93 149#ifdef HAS_QUAD
a0ed51b3 150 {
eb160463
GS
151 *d++ = 0xff; /* Can't match U+FFFE! */
152 *d++ = 0x80; /* 6 Reserved bits */
153 *d++ = (U8)(((uv >> 60) & 0x0f) | 0x80); /* 2 Reserved bits */
154 *d++ = (U8)(((uv >> 54) & 0x3f) | 0x80);
155 *d++ = (U8)(((uv >> 48) & 0x3f) | 0x80);
156 *d++ = (U8)(((uv >> 42) & 0x3f) | 0x80);
157 *d++ = (U8)(((uv >> 36) & 0x3f) | 0x80);
158 *d++ = (U8)(((uv >> 30) & 0x3f) | 0x80);
159 *d++ = (U8)(((uv >> 24) & 0x3f) | 0x80);
160 *d++ = (U8)(((uv >> 18) & 0x3f) | 0x80);
161 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
162 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
163 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
164 return d;
165 }
166#endif
1d72bdf6 167#endif /* Loop style */
a0ed51b3 168}
b851fbc1
JH
169
170U8 *
171Perl_uvuni_to_utf8(pTHX_ U8 *d, UV uv)
172{
173 return Perl_uvuni_to_utf8_flags(aTHX_ d, uv, 0);
174}
9041c2e3 175
646ca15d
JH
176/*
177
178Tests if some arbitrary number of bytes begins in a valid UTF-8
179character. Note that an INVARIANT (i.e. ASCII) character is a valid
180UTF-8 character. The actual number of bytes in the UTF-8 character
181will be returned if it is valid, otherwise 0.
182
183This is the "slow" version as opposed to the "fast" version which is
184the "unrolled" IS_UTF8_CHAR(). E.g. for t/uni/class.t the speed
185difference is a factor of 2 to 3. For lengths (UTF8SKIP(s)) of four
186or less you should use the IS_UTF8_CHAR(), for lengths of five or more
187you should use the _slow(). In practice this means that the _slow()
188will be used very rarely, since the maximum Unicode code point (as of
189Unicode 4.1) is U+10FFFF, which encodes in UTF-8 to four bytes. Only
190the "Perl extended UTF-8" (the infamous 'v-strings') will encode into
191five bytes or more.
192
193=cut */
c053b435 194STATIC STRLEN
646ca15d
JH
195S_is_utf8_char_slow(pTHX_ const U8 *s, const STRLEN len)
196{
197 U8 u = *s;
198 STRLEN slen;
199 UV uv, ouv;
200
201 if (UTF8_IS_INVARIANT(u))
202 return 1;
203
204 if (!UTF8_IS_START(u))
205 return 0;
206
207 if (len < 2 || !UTF8_IS_CONTINUATION(s[1]))
208 return 0;
209
210 slen = len - 1;
211 s++;
212 u &= UTF_START_MASK(len);
213 uv = u;
214 ouv = uv;
215 while (slen--) {
216 if (!UTF8_IS_CONTINUATION(*s))
217 return 0;
218 uv = UTF8_ACCUMULATE(uv, *s);
219 if (uv < ouv)
220 return 0;
221 ouv = uv;
222 s++;
223 }
224
225 if ((STRLEN)UNISKIP(uv) < len)
226 return 0;
227
228 return len;
229}
9041c2e3
NIS
230
231/*
7fc63493 232=for apidoc A|STRLEN|is_utf8_char|const U8 *s
eebe1485 233
5da9da9e 234Tests if some arbitrary number of bytes begins in a valid UTF-8
82686b01
JH
235character. Note that an INVARIANT (i.e. ASCII) character is a valid
236UTF-8 character. The actual number of bytes in the UTF-8 character
237will be returned if it is valid, otherwise 0.
9041c2e3 238
82686b01 239=cut */
067a85ef 240STRLEN
7fc63493 241Perl_is_utf8_char(pTHX_ const U8 *s)
386d01d6 242{
768c67ee 243 STRLEN len = UTF8SKIP(s);
3b0fc154 244#ifdef IS_UTF8_CHAR
768c67ee 245 if (IS_UTF8_CHAR_FAST(len))
3b0fc154
JH
246 return IS_UTF8_CHAR(s, len) ? len : 0;
247#endif /* #ifdef IS_UTF8_CHAR */
2c0c5f92 248 return is_utf8_char_slow(s, len);
386d01d6
GS
249}
250
6662521e 251/*
7fc63493 252=for apidoc A|bool|is_utf8_string|const U8 *s|STRLEN len
6662521e 253
c9ada85f 254Returns true if first C<len> bytes of the given string form a valid
1e54db1a
JH
255UTF-8 string, false otherwise. Note that 'a valid UTF-8 string' does
256not mean 'a string that contains code points above 0x7F encoded in UTF-8'
257because a valid ASCII string is a valid UTF-8 string.
6662521e 258
768c67ee
JH
259See also is_utf8_string_loclen() and is_utf8_string_loc().
260
6662521e
GS
261=cut
262*/
263
8e84507e 264bool
7fc63493 265Perl_is_utf8_string(pTHX_ const U8 *s, STRLEN len)
6662521e 266{
7fc63493
AL
267 const U8* x = s;
268 const U8* send;
067a85ef 269
61468b03 270 if (!len && s)
e1ec3a88 271 len = strlen((const char *)s);
1aa99e6b
IH
272 send = s + len;
273
6662521e 274 while (x < send) {
a3b680e6 275 STRLEN c;
1acdb0da
JH
276 /* Inline the easy bits of is_utf8_char() here for speed... */
277 if (UTF8_IS_INVARIANT(*x))
278 c = 1;
279 else if (!UTF8_IS_START(*x))
768c67ee 280 goto out;
1acdb0da
JH
281 else {
282 /* ... and call is_utf8_char() only if really needed. */
646ca15d
JH
283#ifdef IS_UTF8_CHAR
284 c = UTF8SKIP(x);
768c67ee
JH
285 if (IS_UTF8_CHAR_FAST(c)) {
286 if (!IS_UTF8_CHAR(x, c))
287 goto out;
288 } else if (!is_utf8_char_slow(x, c))
289 goto out;
646ca15d
JH
290#else
291 c = is_utf8_char(x);
292#endif /* #ifdef IS_UTF8_CHAR */
1acdb0da 293 if (!c)
768c67ee 294 goto out;
1acdb0da 295 }
6662521e 296 x += c;
6662521e 297 }
768c67ee
JH
298
299 out:
60006e79
JH
300 if (x != send)
301 return FALSE;
067a85ef
A
302
303 return TRUE;
6662521e
GS
304}
305
67e989fb 306/*
768c67ee 307=for apidoc A|bool|is_utf8_string_loclen|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
81cd54e3 308
768c67ee
JH
309Like is_ut8_string() but stores the location of the failure (in the
310case of "utf8ness failure") or the location s+len (in the case of
311"utf8ness success") in the C<ep>, and the number of UTF-8
312encoded characters in the C<el>.
313
314See also is_utf8_string_loc() and is_utf8_string().
81cd54e3
JH
315
316=cut
317*/
318
319bool
768c67ee 320Perl_is_utf8_string_loclen(pTHX_ const U8 *s, STRLEN len, const U8 **ep, STRLEN *el)
81cd54e3 321{
7fc63493
AL
322 const U8* x = s;
323 const U8* send;
81cd54e3
JH
324 STRLEN c;
325
61468b03 326 if (!len && s)
e1ec3a88 327 len = strlen((const char *)s);
81cd54e3 328 send = s + len;
768c67ee
JH
329 if (el)
330 *el = 0;
81cd54e3
JH
331
332 while (x < send) {
333 /* Inline the easy bits of is_utf8_char() here for speed... */
334 if (UTF8_IS_INVARIANT(*x))
768c67ee
JH
335 c = 1;
336 else if (!UTF8_IS_START(*x))
337 goto out;
81cd54e3 338 else {
768c67ee
JH
339 /* ... and call is_utf8_char() only if really needed. */
340#ifdef IS_UTF8_CHAR
341 c = UTF8SKIP(x);
342 if (IS_UTF8_CHAR_FAST(c)) {
343 if (!IS_UTF8_CHAR(x, c))
344 c = 0;
345 } else
346 c = is_utf8_char_slow(x, c);
347#else
348 c = is_utf8_char(x);
349#endif /* #ifdef IS_UTF8_CHAR */
350 if (!c)
351 goto out;
81cd54e3 352 }
768c67ee
JH
353 x += c;
354 if (el)
355 (*el)++;
81cd54e3 356 }
768c67ee
JH
357
358 out:
359 if (ep)
360 *ep = x;
361 if (x != send)
81cd54e3 362 return FALSE;
81cd54e3
JH
363
364 return TRUE;
365}
366
367/*
768c67ee
JH
368=for apidoc A|bool|is_utf8_string_loc|const U8 *s|STRLEN len|const U8 **ep|const STRLEN *el
369
370Like is_ut8_string() but stores the location of the failure (in the
371case of "utf8ness failure") or the location s+len (in the case of
372"utf8ness success") in the C<ep>.
373
374See also is_utf8_string_loclen() and is_utf8_string().
375
376=cut
377*/
378
379bool
380Perl_is_utf8_string_loc(pTHX_ const U8 *s, STRLEN len, const U8 **ep)
381{
382 return is_utf8_string_loclen(s, len, ep, 0);
383}
384
385/*
7fc63493 386=for apidoc A|UV|utf8n_to_uvuni|const U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
67e989fb 387
9041c2e3
NIS
388Bottom level UTF-8 decode routine.
389Returns the unicode code point value of the first character in the string C<s>
1e54db1a 390which is assumed to be in UTF-8 encoding and no longer than C<curlen>;
7df053ec 391C<retlen> will be set to the length, in bytes, of that character.
67e989fb 392
1e54db1a 393If C<s> does not point to a well-formed UTF-8 character, the behaviour
dcad2880
JH
394is dependent on the value of C<flags>: if it contains UTF8_CHECK_ONLY,
395it is assumed that the caller will raise a warning, and this function
28d3d195
JH
396will silently just set C<retlen> to C<-1> and return zero. If the
397C<flags> does not contain UTF8_CHECK_ONLY, warnings about
398malformations will be given, C<retlen> will be set to the expected
399length of the UTF-8 character in bytes, and zero will be returned.
400
401The C<flags> can also contain various flags to allow deviations from
402the strict UTF-8 encoding (see F<utf8.h>).
67e989fb 403
9041c2e3
NIS
404Most code should use utf8_to_uvchr() rather than call this directly.
405
37607a96
PK
406=cut
407*/
67e989fb 408
a0ed51b3 409UV
7fc63493 410Perl_utf8n_to_uvuni(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
a0ed51b3 411{
7fc63493 412 const U8 *s0 = s;
9c5ffd7c 413 UV uv = *s, ouv = 0;
ba210ebe 414 STRLEN len = 1;
7fc63493
AL
415 const bool dowarn = ckWARN_d(WARN_UTF8);
416 const UV startbyte = *s;
ba210ebe 417 STRLEN expectlen = 0;
a0dbb045
JH
418 U32 warning = 0;
419
420/* This list is a superset of the UTF8_ALLOW_XXX. */
421
422#define UTF8_WARN_EMPTY 1
423#define UTF8_WARN_CONTINUATION 2
424#define UTF8_WARN_NON_CONTINUATION 3
425#define UTF8_WARN_FE_FF 4
426#define UTF8_WARN_SHORT 5
427#define UTF8_WARN_OVERFLOW 6
428#define UTF8_WARN_SURROGATE 7
c867b360
JH
429#define UTF8_WARN_LONG 8
430#define UTF8_WARN_FFFF 9 /* Also FFFE. */
a0dbb045
JH
431
432 if (curlen == 0 &&
433 !(flags & UTF8_ALLOW_EMPTY)) {
434 warning = UTF8_WARN_EMPTY;
0c443dc2
JH
435 goto malformed;
436 }
437
1d72bdf6 438 if (UTF8_IS_INVARIANT(uv)) {
a0ed51b3
LW
439 if (retlen)
440 *retlen = 1;
c4d5f83a 441 return (UV) (NATIVE_TO_UTF(*s));
a0ed51b3 442 }
67e989fb 443
421a8bf2 444 if (UTF8_IS_CONTINUATION(uv) &&
fcc8fcf6 445 !(flags & UTF8_ALLOW_CONTINUATION)) {
a0dbb045 446 warning = UTF8_WARN_CONTINUATION;
ba210ebe
JH
447 goto malformed;
448 }
449
421a8bf2 450 if (UTF8_IS_START(uv) && curlen > 1 && !UTF8_IS_CONTINUATION(s[1]) &&
fcc8fcf6 451 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045 452 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe
JH
453 goto malformed;
454 }
9041c2e3 455
1d72bdf6 456#ifdef EBCDIC
75383841 457 uv = NATIVE_TO_UTF(uv);
1d72bdf6 458#else
fcc8fcf6
JH
459 if ((uv == 0xfe || uv == 0xff) &&
460 !(flags & UTF8_ALLOW_FE_FF)) {
a0dbb045 461 warning = UTF8_WARN_FE_FF;
ba210ebe 462 goto malformed;
a0ed51b3 463 }
1d72bdf6
NIS
464#endif
465
ba210ebe
JH
466 if (!(uv & 0x20)) { len = 2; uv &= 0x1f; }
467 else if (!(uv & 0x10)) { len = 3; uv &= 0x0f; }
468 else if (!(uv & 0x08)) { len = 4; uv &= 0x07; }
469 else if (!(uv & 0x04)) { len = 5; uv &= 0x03; }
1d72bdf6
NIS
470#ifdef EBCDIC
471 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
472 else { len = 7; uv &= 0x01; }
473#else
ba210ebe
JH
474 else if (!(uv & 0x02)) { len = 6; uv &= 0x01; }
475 else if (!(uv & 0x01)) { len = 7; uv = 0; }
1d72bdf6
NIS
476 else { len = 13; uv = 0; } /* whoa! */
477#endif
478
a0ed51b3
LW
479 if (retlen)
480 *retlen = len;
9041c2e3 481
ba210ebe
JH
482 expectlen = len;
483
fcc8fcf6
JH
484 if ((curlen < expectlen) &&
485 !(flags & UTF8_ALLOW_SHORT)) {
a0dbb045 486 warning = UTF8_WARN_SHORT;
ba210ebe
JH
487 goto malformed;
488 }
489
490 len--;
a0ed51b3 491 s++;
ba210ebe
JH
492 ouv = uv;
493
a0ed51b3 494 while (len--) {
421a8bf2
JH
495 if (!UTF8_IS_CONTINUATION(*s) &&
496 !(flags & UTF8_ALLOW_NON_CONTINUATION)) {
a0dbb045
JH
497 s--;
498 warning = UTF8_WARN_NON_CONTINUATION;
ba210ebe 499 goto malformed;
a0ed51b3
LW
500 }
501 else
8850bf83 502 uv = UTF8_ACCUMULATE(uv, *s);
a0dbb045
JH
503 if (!(uv > ouv)) {
504 /* These cannot be allowed. */
505 if (uv == ouv) {
75dbc644 506 if (expectlen != 13 && !(flags & UTF8_ALLOW_LONG)) {
a0dbb045
JH
507 warning = UTF8_WARN_LONG;
508 goto malformed;
509 }
510 }
511 else { /* uv < ouv */
512 /* This cannot be allowed. */
513 warning = UTF8_WARN_OVERFLOW;
514 goto malformed;
515 }
ba210ebe
JH
516 }
517 s++;
518 ouv = uv;
519 }
520
421a8bf2 521 if (UNICODE_IS_SURROGATE(uv) &&
fcc8fcf6 522 !(flags & UTF8_ALLOW_SURROGATE)) {
a0dbb045 523 warning = UTF8_WARN_SURROGATE;
ba210ebe 524 goto malformed;
eb160463 525 } else if ((expectlen > (STRLEN)UNISKIP(uv)) &&
fcc8fcf6 526 !(flags & UTF8_ALLOW_LONG)) {
a0dbb045 527 warning = UTF8_WARN_LONG;
ba210ebe 528 goto malformed;
421a8bf2 529 } else if (UNICODE_IS_ILLEGAL(uv) &&
a9917092 530 !(flags & UTF8_ALLOW_FFFF)) {
a0dbb045 531 warning = UTF8_WARN_FFFF;
a9917092 532 goto malformed;
a0ed51b3 533 }
ba210ebe 534
a0ed51b3 535 return uv;
ba210ebe
JH
536
537malformed:
538
fcc8fcf6 539 if (flags & UTF8_CHECK_ONLY) {
ba210ebe 540 if (retlen)
cc366d4b 541 *retlen = -1;
ba210ebe
JH
542 return 0;
543 }
544
a0dbb045
JH
545 if (dowarn) {
546 SV* sv = sv_2mortal(newSVpv("Malformed UTF-8 character ", 0));
547
548 switch (warning) {
549 case 0: /* Intentionally empty. */ break;
550 case UTF8_WARN_EMPTY:
54667de8 551 Perl_sv_catpv(aTHX_ sv, "(empty string)");
a0dbb045
JH
552 break;
553 case UTF8_WARN_CONTINUATION:
097fb8e2 554 Perl_sv_catpvf(aTHX_ sv, "(unexpected continuation byte 0x%02"UVxf", with no preceding start byte)", uv);
a0dbb045
JH
555 break;
556 case UTF8_WARN_NON_CONTINUATION:
097fb8e2
JH
557 if (s == s0)
558 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", immediately after start byte 0x%02"UVxf")",
559 (UV)s[1], startbyte);
560 else
561 Perl_sv_catpvf(aTHX_ sv, "(unexpected non-continuation byte 0x%02"UVxf", %d byte%s after start byte 0x%02"UVxf", expected %d bytes)",
5d7488b2 562 (UV)s[1], s - s0, s - s0 > 1 ? "s" : "", startbyte, (int)expectlen);
097fb8e2 563
a0dbb045
JH
564 break;
565 case UTF8_WARN_FE_FF:
566 Perl_sv_catpvf(aTHX_ sv, "(byte 0x%02"UVxf")", uv);
567 break;
568 case UTF8_WARN_SHORT:
097fb8e2 569 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 570 (int)curlen, curlen == 1 ? "" : "s", (int)expectlen, startbyte);
b31f83c2 571 expectlen = curlen; /* distance for caller to skip */
a0dbb045
JH
572 break;
573 case UTF8_WARN_OVERFLOW:
097fb8e2
JH
574 Perl_sv_catpvf(aTHX_ sv, "(overflow at 0x%"UVxf", byte 0x%02x, after start byte 0x%02"UVxf")",
575 ouv, *s, startbyte);
a0dbb045
JH
576 break;
577 case UTF8_WARN_SURROGATE:
578 Perl_sv_catpvf(aTHX_ sv, "(UTF-16 surrogate 0x%04"UVxf")", uv);
579 break;
a0dbb045 580 case UTF8_WARN_LONG:
097fb8e2 581 Perl_sv_catpvf(aTHX_ sv, "(%d byte%s, need %d, after start byte 0x%02"UVxf")",
5d7488b2 582 (int)expectlen, expectlen == 1 ? "": "s", UNISKIP(uv), startbyte);
a0dbb045
JH
583 break;
584 case UTF8_WARN_FFFF:
585 Perl_sv_catpvf(aTHX_ sv, "(character 0x%04"UVxf")", uv);
586 break;
587 default:
54667de8 588 Perl_sv_catpv(aTHX_ sv, "(unknown reason)");
a0dbb045
JH
589 break;
590 }
591
592 if (warning) {
504618e9 593 const char *s = SvPVX_const(sv);
a0dbb045
JH
594
595 if (PL_op)
9014280d 596 Perl_warner(aTHX_ packWARN(WARN_UTF8),
53e06cf0 597 "%s in %s", s, OP_DESC(PL_op));
a0dbb045 598 else
9014280d 599 Perl_warner(aTHX_ packWARN(WARN_UTF8), "%s", s);
a0dbb045
JH
600 }
601 }
602
ba210ebe 603 if (retlen)
28d3d195 604 *retlen = expectlen ? expectlen : len;
ba210ebe 605
28d3d195 606 return 0;
a0ed51b3
LW
607}
608
8e84507e 609/*
7fc63493 610=for apidoc A|UV|utf8_to_uvchr|const U8 *s|STRLEN *retlen
9041c2e3
NIS
611
612Returns the native character value of the first character in the string C<s>
1e54db1a 613which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
614length, in bytes, of that character.
615
1e54db1a 616If C<s> does not point to a well-formed UTF-8 character, zero is
9041c2e3
NIS
617returned and retlen is set, if possible, to -1.
618
619=cut
620*/
621
622UV
7fc63493 623Perl_utf8_to_uvchr(pTHX_ const U8 *s, STRLEN *retlen)
9041c2e3 624{
89ebb4a3 625 return Perl_utf8n_to_uvchr(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 626 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
9041c2e3
NIS
627}
628
629/*
7fc63493 630=for apidoc A|UV|utf8_to_uvuni|const U8 *s|STRLEN *retlen
9041c2e3
NIS
631
632Returns the Unicode code point of the first character in the string C<s>
1e54db1a 633which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
9041c2e3
NIS
634length, in bytes, of that character.
635
636This function should only be used when returned UV is considered
637an index into the Unicode semantic tables (e.g. swashes).
638
1e54db1a 639If C<s> does not point to a well-formed UTF-8 character, zero is
ba210ebe 640returned and retlen is set, if possible, to -1.
8e84507e
NIS
641
642=cut
643*/
644
645UV
7fc63493 646Perl_utf8_to_uvuni(pTHX_ const U8 *s, STRLEN *retlen)
8e84507e 647{
9041c2e3 648 /* Call the low level routine asking for checks */
89ebb4a3 649 return Perl_utf8n_to_uvuni(aTHX_ s, UTF8_MAXBYTES, retlen,
872c91ae 650 ckWARN(WARN_UTF8) ? 0 : UTF8_ALLOW_ANY);
8e84507e
NIS
651}
652
b76347f2 653/*
35a4481c 654=for apidoc A|STRLEN|utf8_length|const U8 *s|const U8 *e
b76347f2
JH
655
656Return the length of the UTF-8 char encoded string C<s> in characters.
02eb7b47
JH
657Stops at C<e> (inclusive). If C<e E<lt> s> or if the scan would end
658up past C<e>, croaks.
b76347f2
JH
659
660=cut
661*/
662
663STRLEN
35a4481c 664Perl_utf8_length(pTHX_ const U8 *s, const U8 *e)
b76347f2
JH
665{
666 STRLEN len = 0;
667
8850bf83
JH
668 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
669 * the bitops (especially ~) can create illegal UTF-8.
670 * In other words: in Perl UTF-8 is not just for Unicode. */
671
a3b680e6
AL
672 if (e < s)
673 goto warn_and_return;
b76347f2 674 while (s < e) {
4373e329 675 const U8 t = UTF8SKIP(s);
901b21bf 676 if (e - s < t) {
a3b680e6 677 warn_and_return:
901b21bf
JH
678 if (ckWARN_d(WARN_UTF8)) {
679 if (PL_op)
680 Perl_warner(aTHX_ packWARN(WARN_UTF8),
a3b680e6 681 "%s in %s", unees, OP_DESC(PL_op));
901b21bf
JH
682 else
683 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
684 }
685 return len;
686 }
b76347f2
JH
687 s += t;
688 len++;
689 }
690
691 return len;
692}
693
b06226ff 694/*
35a4481c 695=for apidoc A|IV|utf8_distance|const U8 *a|const U8 *b
b06226ff 696
1e54db1a 697Returns the number of UTF-8 characters between the UTF-8 pointers C<a>
b06226ff
JH
698and C<b>.
699
700WARNING: use only if you *know* that the pointers point inside the
701same UTF-8 buffer.
702
37607a96
PK
703=cut
704*/
a0ed51b3 705
02eb7b47 706IV
35a4481c 707Perl_utf8_distance(pTHX_ const U8 *a, const U8 *b)
a0ed51b3 708{
02eb7b47
JH
709 IV off = 0;
710
8850bf83
JH
711 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g.
712 * the bitops (especially ~) can create illegal UTF-8.
713 * In other words: in Perl UTF-8 is not just for Unicode. */
714
a0ed51b3
LW
715 if (a < b) {
716 while (a < b) {
35a4481c 717 const U8 c = UTF8SKIP(a);
a3b680e6
AL
718 if (b - a < c)
719 goto warn_and_return;
02eb7b47 720 a += c;
a0ed51b3
LW
721 off--;
722 }
723 }
724 else {
725 while (b < a) {
4373e329 726 const U8 c = UTF8SKIP(b);
02eb7b47 727
901b21bf 728 if (a - b < c) {
a3b680e6 729 warn_and_return:
901b21bf
JH
730 if (ckWARN_d(WARN_UTF8)) {
731 if (PL_op)
732 Perl_warner(aTHX_ packWARN(WARN_UTF8),
733 "%s in %s", unees, OP_DESC(PL_op));
734 else
735 Perl_warner(aTHX_ packWARN(WARN_UTF8), unees);
736 }
737 return off;
738 }
02eb7b47 739 b += c;
a0ed51b3
LW
740 off++;
741 }
742 }
02eb7b47 743
a0ed51b3
LW
744 return off;
745}
746
b06226ff 747/*
37607a96 748=for apidoc A|U8 *|utf8_hop|U8 *s|I32 off
b06226ff 749
8850bf83
JH
750Return the UTF-8 pointer C<s> displaced by C<off> characters, either
751forward or backward.
b06226ff
JH
752
753WARNING: do not use the following unless you *know* C<off> is within
8850bf83
JH
754the UTF-8 data pointed to by C<s> *and* that on entry C<s> is aligned
755on the first byte of character or just after the last byte of a character.
b06226ff 756
37607a96
PK
757=cut
758*/
a0ed51b3
LW
759
760U8 *
4373e329 761Perl_utf8_hop(pTHX_ const U8 *s, I32 off)
a0ed51b3 762{
8850bf83
JH
763 /* Note: cannot use UTF8_IS_...() too eagerly here since e.g
764 * the bitops (especially ~) can create illegal UTF-8.
765 * In other words: in Perl UTF-8 is not just for Unicode. */
766
a0ed51b3
LW
767 if (off >= 0) {
768 while (off--)
769 s += UTF8SKIP(s);
770 }
771 else {
772 while (off++) {
773 s--;
8850bf83
JH
774 while (UTF8_IS_CONTINUATION(*s))
775 s--;
a0ed51b3
LW
776 }
777 }
4373e329 778 return (U8 *)s;
a0ed51b3
LW
779}
780
6940069f 781/*
eebe1485 782=for apidoc A|U8 *|utf8_to_bytes|U8 *s|STRLEN *len
6940069f 783
1e54db1a 784Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
246fae53
MG
785Unlike C<bytes_to_utf8>, this over-writes the original string, and
786updates len to contain the new length.
67e989fb 787Returns zero on failure, setting C<len> to -1.
6940069f
GS
788
789=cut
790*/
791
792U8 *
37607a96 793Perl_utf8_to_bytes(pTHX_ U8 *s, STRLEN *len)
6940069f 794{
6940069f
GS
795 U8 *send;
796 U8 *d;
dcad2880 797 U8 *save = s;
246fae53 798
1e54db1a 799 /* ensure valid UTF-8 and chars < 256 before updating string */
dcad2880
JH
800 for (send = s + *len; s < send; ) {
801 U8 c = *s++;
802
1d72bdf6
NIS
803 if (!UTF8_IS_INVARIANT(c) &&
804 (!UTF8_IS_DOWNGRADEABLE_START(c) || (s >= send)
805 || !(c = *s++) || !UTF8_IS_CONTINUATION(c))) {
dcad2880
JH
806 *len = -1;
807 return 0;
808 }
246fae53 809 }
dcad2880
JH
810
811 d = s = save;
6940069f 812 while (s < send) {
ed646e6e 813 STRLEN ulen;
9041c2e3 814 *d++ = (U8)utf8_to_uvchr(s, &ulen);
ed646e6e 815 s += ulen;
6940069f
GS
816 }
817 *d = '\0';
246fae53 818 *len = d - save;
6940069f
GS
819 return save;
820}
821
822/*
e1ec3a88 823=for apidoc A|U8 *|bytes_from_utf8|const U8 *s|STRLEN *len|bool *is_utf8
f9a63242 824
1e54db1a 825Converts a string C<s> of length C<len> from UTF-8 into byte encoding.
35a4481c 826Unlike C<utf8_to_bytes> but like C<bytes_to_utf8>, returns a pointer to
ef9edfd0
JH
827the newly-created string, and updates C<len> to contain the new
828length. Returns the original string if no conversion occurs, C<len>
829is unchanged. Do nothing if C<is_utf8> points to 0. Sets C<is_utf8> to
8300 if C<s> is converted or contains all 7bit characters.
f9a63242 831
37607a96
PK
832=cut
833*/
f9a63242
JH
834
835U8 *
e1ec3a88 836Perl_bytes_from_utf8(pTHX_ const U8 *s, STRLEN *len, bool *is_utf8)
f9a63242 837{
f9a63242 838 U8 *d;
e1ec3a88
AL
839 const U8 *start = s;
840 const U8 *send;
f9a63242
JH
841 I32 count = 0;
842
843 if (!*is_utf8)
73d840c0 844 return (U8 *)start;
f9a63242 845
1e54db1a 846 /* ensure valid UTF-8 and chars < 256 before converting string */
f9a63242 847 for (send = s + *len; s < send;) {
e1ec3a88 848 U8 c = *s++;
1d72bdf6 849 if (!UTF8_IS_INVARIANT(c)) {
db42d148
NIS
850 if (UTF8_IS_DOWNGRADEABLE_START(c) && s < send &&
851 (c = *s++) && UTF8_IS_CONTINUATION(c))
852 count++;
853 else
73d840c0 854 return (U8 *)start;
db42d148 855 }
f9a63242
JH
856 }
857
858 *is_utf8 = 0;
859
a02a5408 860 Newxz(d, (*len) - count + 1, U8);
ef9edfd0 861 s = start; start = d;
f9a63242
JH
862 while (s < send) {
863 U8 c = *s++;
c4d5f83a
NIS
864 if (!UTF8_IS_INVARIANT(c)) {
865 /* Then it is two-byte encoded */
866 c = UTF8_ACCUMULATE(NATIVE_TO_UTF(c), *s++);
867 c = ASCII_TO_NATIVE(c);
868 }
869 *d++ = c;
f9a63242
JH
870 }
871 *d = '\0';
872 *len = d - start;
73d840c0 873 return (U8 *)start;
f9a63242
JH
874}
875
876/*
35a4481c 877=for apidoc A|U8 *|bytes_to_utf8|const U8 *s|STRLEN *len
6940069f 878
1e54db1a 879Converts a string C<s> of length C<len> from ASCII into UTF-8 encoding.
6662521e
GS
880Returns a pointer to the newly-created string, and sets C<len> to
881reflect the new length.
6940069f 882
1e54db1a 883If you want to convert to UTF-8 from other encodings than ASCII,
c9ada85f
JH
884see sv_recode_to_utf8().
885
497711e7 886=cut
6940069f
GS
887*/
888
889U8*
35a4481c 890Perl_bytes_to_utf8(pTHX_ const U8 *s, STRLEN *len)
6940069f 891{
35a4481c 892 const U8 * const send = s + (*len);
6940069f
GS
893 U8 *d;
894 U8 *dst;
6940069f 895
a02a5408 896 Newxz(d, (*len) * 2 + 1, U8);
6940069f
GS
897 dst = d;
898
899 while (s < send) {
35a4481c 900 const UV uv = NATIVE_TO_ASCII(*s++);
c4d5f83a 901 if (UNI_IS_INVARIANT(uv))
eb160463 902 *d++ = (U8)UTF_TO_NATIVE(uv);
6940069f 903 else {
eb160463
GS
904 *d++ = (U8)UTF8_EIGHT_BIT_HI(uv);
905 *d++ = (U8)UTF8_EIGHT_BIT_LO(uv);
6940069f
GS
906 }
907 }
908 *d = '\0';
6662521e 909 *len = d-dst;
6940069f
GS
910 return dst;
911}
912
a0ed51b3 913/*
dea0fc0b 914 * Convert native (big-endian) or reversed (little-endian) UTF-16 to UTF-8.
a0ed51b3
LW
915 *
916 * Destination must be pre-extended to 3/2 source. Do not use in-place.
917 * We optimize for native, for obvious reasons. */
918
919U8*
dea0fc0b 920Perl_utf16_to_utf8(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3 921{
dea0fc0b
JH
922 U8* pend;
923 U8* dstart = d;
924
1de9afcd
RGS
925 if (bytelen == 1 && p[0] == 0) { /* Be understanding. */
926 d[0] = 0;
927 *newlen = 1;
928 return d;
929 }
930
dea0fc0b 931 if (bytelen & 1)
014ead4b 932 Perl_croak(aTHX_ "panic: utf16_to_utf8: odd bytelen %"UVf, (UV)bytelen);
dea0fc0b
JH
933
934 pend = p + bytelen;
935
a0ed51b3 936 while (p < pend) {
dea0fc0b
JH
937 UV uv = (p[0] << 8) + p[1]; /* UTF-16BE */
938 p += 2;
a0ed51b3 939 if (uv < 0x80) {
eb160463 940 *d++ = (U8)uv;
a0ed51b3
LW
941 continue;
942 }
943 if (uv < 0x800) {
eb160463
GS
944 *d++ = (U8)(( uv >> 6) | 0xc0);
945 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
946 continue;
947 }
948 if (uv >= 0xd800 && uv < 0xdbff) { /* surrogates */
30f84f9e
TD
949 UV low = (p[0] << 8) + p[1];
950 p += 2;
dea0fc0b
JH
951 if (low < 0xdc00 || low >= 0xdfff)
952 Perl_croak(aTHX_ "Malformed UTF-16 surrogate");
a0ed51b3
LW
953 uv = ((uv - 0xd800) << 10) + (low - 0xdc00) + 0x10000;
954 }
955 if (uv < 0x10000) {
eb160463
GS
956 *d++ = (U8)(( uv >> 12) | 0xe0);
957 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
958 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
959 continue;
960 }
961 else {
eb160463
GS
962 *d++ = (U8)(( uv >> 18) | 0xf0);
963 *d++ = (U8)(((uv >> 12) & 0x3f) | 0x80);
964 *d++ = (U8)(((uv >> 6) & 0x3f) | 0x80);
965 *d++ = (U8)(( uv & 0x3f) | 0x80);
a0ed51b3
LW
966 continue;
967 }
968 }
dea0fc0b 969 *newlen = d - dstart;
a0ed51b3
LW
970 return d;
971}
972
973/* Note: this one is slightly destructive of the source. */
974
975U8*
dea0fc0b 976Perl_utf16_to_utf8_reversed(pTHX_ U8* p, U8* d, I32 bytelen, I32 *newlen)
a0ed51b3
LW
977{
978 U8* s = (U8*)p;
979 U8* send = s + bytelen;
980 while (s < send) {
981 U8 tmp = s[0];
982 s[0] = s[1];
983 s[1] = tmp;
984 s += 2;
985 }
dea0fc0b 986 return utf16_to_utf8(p, d, bytelen, newlen);
a0ed51b3
LW
987}
988
989/* for now these are all defined (inefficiently) in terms of the utf8 versions */
990
991bool
84afefe6 992Perl_is_uni_alnum(pTHX_ UV c)
a0ed51b3 993{
89ebb4a3 994 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 995 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
996 return is_utf8_alnum(tmpbuf);
997}
998
999bool
84afefe6 1000Perl_is_uni_alnumc(pTHX_ UV c)
b8c5462f 1001{
89ebb4a3 1002 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1003 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1004 return is_utf8_alnumc(tmpbuf);
1005}
1006
1007bool
84afefe6 1008Perl_is_uni_idfirst(pTHX_ UV c)
a0ed51b3 1009{
89ebb4a3 1010 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1011 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1012 return is_utf8_idfirst(tmpbuf);
1013}
1014
1015bool
84afefe6 1016Perl_is_uni_alpha(pTHX_ UV c)
a0ed51b3 1017{
89ebb4a3 1018 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1019 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1020 return is_utf8_alpha(tmpbuf);
1021}
1022
1023bool
84afefe6 1024Perl_is_uni_ascii(pTHX_ UV c)
4d61ec05 1025{
89ebb4a3 1026 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1027 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1028 return is_utf8_ascii(tmpbuf);
1029}
1030
1031bool
84afefe6 1032Perl_is_uni_space(pTHX_ UV c)
a0ed51b3 1033{
89ebb4a3 1034 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1035 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1036 return is_utf8_space(tmpbuf);
1037}
1038
1039bool
84afefe6 1040Perl_is_uni_digit(pTHX_ UV c)
a0ed51b3 1041{
89ebb4a3 1042 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1043 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1044 return is_utf8_digit(tmpbuf);
1045}
1046
1047bool
84afefe6 1048Perl_is_uni_upper(pTHX_ UV c)
a0ed51b3 1049{
89ebb4a3 1050 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1051 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1052 return is_utf8_upper(tmpbuf);
1053}
1054
1055bool
84afefe6 1056Perl_is_uni_lower(pTHX_ UV c)
a0ed51b3 1057{
89ebb4a3 1058 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1059 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1060 return is_utf8_lower(tmpbuf);
1061}
1062
1063bool
84afefe6 1064Perl_is_uni_cntrl(pTHX_ UV c)
b8c5462f 1065{
89ebb4a3 1066 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1067 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1068 return is_utf8_cntrl(tmpbuf);
1069}
1070
1071bool
84afefe6 1072Perl_is_uni_graph(pTHX_ UV c)
b8c5462f 1073{
89ebb4a3 1074 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1075 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1076 return is_utf8_graph(tmpbuf);
1077}
1078
1079bool
84afefe6 1080Perl_is_uni_print(pTHX_ UV c)
a0ed51b3 1081{
89ebb4a3 1082 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1083 uvchr_to_utf8(tmpbuf, c);
a0ed51b3
LW
1084 return is_utf8_print(tmpbuf);
1085}
1086
b8c5462f 1087bool
84afefe6 1088Perl_is_uni_punct(pTHX_ UV c)
b8c5462f 1089{
89ebb4a3 1090 U8 tmpbuf[UTF8_MAXBYTES+1];
230880c1 1091 uvchr_to_utf8(tmpbuf, c);
b8c5462f
JH
1092 return is_utf8_punct(tmpbuf);
1093}
1094
4d61ec05 1095bool
84afefe6 1096Perl_is_uni_xdigit(pTHX_ UV c)
4d61ec05 1097{
89ebb4a3 1098 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
230880c1 1099 uvchr_to_utf8(tmpbuf, c);
4d61ec05
GS
1100 return is_utf8_xdigit(tmpbuf);
1101}
1102
84afefe6
JH
1103UV
1104Perl_to_uni_upper(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1105{
0ebc6274
JH
1106 uvchr_to_utf8(p, c);
1107 return to_utf8_upper(p, p, lenp);
a0ed51b3
LW
1108}
1109
84afefe6
JH
1110UV
1111Perl_to_uni_title(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1112{
0ebc6274
JH
1113 uvchr_to_utf8(p, c);
1114 return to_utf8_title(p, p, lenp);
a0ed51b3
LW
1115}
1116
84afefe6
JH
1117UV
1118Perl_to_uni_lower(pTHX_ UV c, U8* p, STRLEN *lenp)
a0ed51b3 1119{
0ebc6274
JH
1120 uvchr_to_utf8(p, c);
1121 return to_utf8_lower(p, p, lenp);
a0ed51b3
LW
1122}
1123
84afefe6
JH
1124UV
1125Perl_to_uni_fold(pTHX_ UV c, U8* p, STRLEN *lenp)
1126{
0ebc6274
JH
1127 uvchr_to_utf8(p, c);
1128 return to_utf8_fold(p, p, lenp);
84afefe6
JH
1129}
1130
a0ed51b3
LW
1131/* for now these all assume no locale info available for Unicode > 255 */
1132
1133bool
84afefe6 1134Perl_is_uni_alnum_lc(pTHX_ UV c)
a0ed51b3
LW
1135{
1136 return is_uni_alnum(c); /* XXX no locale support yet */
1137}
1138
1139bool
84afefe6 1140Perl_is_uni_alnumc_lc(pTHX_ UV c)
b8c5462f
JH
1141{
1142 return is_uni_alnumc(c); /* XXX no locale support yet */
1143}
1144
1145bool
84afefe6 1146Perl_is_uni_idfirst_lc(pTHX_ UV c)
a0ed51b3
LW
1147{
1148 return is_uni_idfirst(c); /* XXX no locale support yet */
1149}
1150
1151bool
84afefe6 1152Perl_is_uni_alpha_lc(pTHX_ UV c)
a0ed51b3
LW
1153{
1154 return is_uni_alpha(c); /* XXX no locale support yet */
1155}
1156
1157bool
84afefe6 1158Perl_is_uni_ascii_lc(pTHX_ UV c)
4d61ec05
GS
1159{
1160 return is_uni_ascii(c); /* XXX no locale support yet */
1161}
1162
1163bool
84afefe6 1164Perl_is_uni_space_lc(pTHX_ UV c)
a0ed51b3
LW
1165{
1166 return is_uni_space(c); /* XXX no locale support yet */
1167}
1168
1169bool
84afefe6 1170Perl_is_uni_digit_lc(pTHX_ UV c)
a0ed51b3
LW
1171{
1172 return is_uni_digit(c); /* XXX no locale support yet */
1173}
1174
1175bool
84afefe6 1176Perl_is_uni_upper_lc(pTHX_ UV c)
a0ed51b3
LW
1177{
1178 return is_uni_upper(c); /* XXX no locale support yet */
1179}
1180
1181bool
84afefe6 1182Perl_is_uni_lower_lc(pTHX_ UV c)
a0ed51b3
LW
1183{
1184 return is_uni_lower(c); /* XXX no locale support yet */
1185}
1186
1187bool
84afefe6 1188Perl_is_uni_cntrl_lc(pTHX_ UV c)
b8c5462f
JH
1189{
1190 return is_uni_cntrl(c); /* XXX no locale support yet */
1191}
1192
1193bool
84afefe6 1194Perl_is_uni_graph_lc(pTHX_ UV c)
b8c5462f
JH
1195{
1196 return is_uni_graph(c); /* XXX no locale support yet */
1197}
1198
1199bool
84afefe6 1200Perl_is_uni_print_lc(pTHX_ UV c)
a0ed51b3
LW
1201{
1202 return is_uni_print(c); /* XXX no locale support yet */
1203}
1204
b8c5462f 1205bool
84afefe6 1206Perl_is_uni_punct_lc(pTHX_ UV c)
b8c5462f
JH
1207{
1208 return is_uni_punct(c); /* XXX no locale support yet */
1209}
1210
4d61ec05 1211bool
84afefe6 1212Perl_is_uni_xdigit_lc(pTHX_ UV c)
4d61ec05
GS
1213{
1214 return is_uni_xdigit(c); /* XXX no locale support yet */
1215}
1216
b7ac61fa
JH
1217U32
1218Perl_to_uni_upper_lc(pTHX_ U32 c)
1219{
ee099d14
JH
1220 /* XXX returns only the first character -- do not use XXX */
1221 /* XXX no locale support yet */
1222 STRLEN len;
89ebb4a3 1223 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1224 return (U32)to_uni_upper(c, tmpbuf, &len);
b7ac61fa
JH
1225}
1226
1227U32
1228Perl_to_uni_title_lc(pTHX_ U32 c)
1229{
ee099d14
JH
1230 /* XXX returns only the first character XXX -- do not use XXX */
1231 /* XXX no locale support yet */
1232 STRLEN len;
89ebb4a3 1233 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1234 return (U32)to_uni_title(c, tmpbuf, &len);
b7ac61fa
JH
1235}
1236
1237U32
1238Perl_to_uni_lower_lc(pTHX_ U32 c)
1239{
ee099d14
JH
1240 /* XXX returns only the first character -- do not use XXX */
1241 /* XXX no locale support yet */
1242 STRLEN len;
89ebb4a3 1243 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
ee099d14 1244 return (U32)to_uni_lower(c, tmpbuf, &len);
b7ac61fa
JH
1245}
1246
a0ed51b3 1247bool
7fc63493 1248Perl_is_utf8_alnum(pTHX_ const U8 *p)
a0ed51b3 1249{
386d01d6
GS
1250 if (!is_utf8_char(p))
1251 return FALSE;
a0ed51b3 1252 if (!PL_utf8_alnum)
289d4f09
ML
1253 /* NOTE: "IsWord", not "IsAlnum", since Alnum is a true
1254 * descendant of isalnum(3), in other words, it doesn't
1255 * contain the '_'. --jhi */
1256 PL_utf8_alnum = swash_init("utf8", "IsWord", &PL_sv_undef, 0, 0);
eb160463 1257 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3
LW
1258/* return *p == '_' || is_utf8_alpha(p) || is_utf8_digit(p); */
1259#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1260 if (!PL_utf8_alnum)
1261 PL_utf8_alnum = swash_init("utf8", "",
1262 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1263 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
a0ed51b3
LW
1264#endif
1265}
1266
1267bool
7fc63493 1268Perl_is_utf8_alnumc(pTHX_ const U8 *p)
b8c5462f 1269{
386d01d6
GS
1270 if (!is_utf8_char(p))
1271 return FALSE;
b8c5462f
JH
1272 if (!PL_utf8_alnum)
1273 PL_utf8_alnum = swash_init("utf8", "IsAlnumC", &PL_sv_undef, 0, 0);
eb160463 1274 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f
JH
1275/* return is_utf8_alpha(p) || is_utf8_digit(p); */
1276#ifdef SURPRISINGLY_SLOWER /* probably because alpha is usually true */
1277 if (!PL_utf8_alnum)
1278 PL_utf8_alnum = swash_init("utf8", "",
1279 sv_2mortal(newSVpv("+utf8::IsAlpha\n+utf8::IsDigit\n005F\n",0)), 0, 0);
eb160463 1280 return swash_fetch(PL_utf8_alnum, p, TRUE) != 0;
b8c5462f
JH
1281#endif
1282}
1283
1284bool
7fc63493 1285Perl_is_utf8_idfirst(pTHX_ const U8 *p) /* The naming is historical. */
a0ed51b3 1286{
82686b01
JH
1287 if (*p == '_')
1288 return TRUE;
1289 if (!is_utf8_char(p))
1290 return FALSE;
1291 if (!PL_utf8_idstart) /* is_utf8_idstart would be more logical. */
1292 PL_utf8_idstart = swash_init("utf8", "IdStart", &PL_sv_undef, 0, 0);
eb160463 1293 return swash_fetch(PL_utf8_idstart, p, TRUE) != 0;
82686b01
JH
1294}
1295
1296bool
7fc63493 1297Perl_is_utf8_idcont(pTHX_ const U8 *p)
82686b01
JH
1298{
1299 if (*p == '_')
1300 return TRUE;
1301 if (!is_utf8_char(p))
1302 return FALSE;
1303 if (!PL_utf8_idcont)
1304 PL_utf8_idcont = swash_init("utf8", "IdContinue", &PL_sv_undef, 0, 0);
eb160463 1305 return swash_fetch(PL_utf8_idcont, p, TRUE) != 0;
a0ed51b3
LW
1306}
1307
1308bool
7fc63493 1309Perl_is_utf8_alpha(pTHX_ const U8 *p)
a0ed51b3 1310{
386d01d6
GS
1311 if (!is_utf8_char(p))
1312 return FALSE;
a0ed51b3 1313 if (!PL_utf8_alpha)
e24b16f9 1314 PL_utf8_alpha = swash_init("utf8", "IsAlpha", &PL_sv_undef, 0, 0);
eb160463 1315 return swash_fetch(PL_utf8_alpha, p, TRUE) != 0;
a0ed51b3
LW
1316}
1317
1318bool
7fc63493 1319Perl_is_utf8_ascii(pTHX_ const U8 *p)
b8c5462f 1320{
386d01d6
GS
1321 if (!is_utf8_char(p))
1322 return FALSE;
b8c5462f
JH
1323 if (!PL_utf8_ascii)
1324 PL_utf8_ascii = swash_init("utf8", "IsAscii", &PL_sv_undef, 0, 0);
eb160463 1325 return swash_fetch(PL_utf8_ascii, p, TRUE) != 0;
b8c5462f
JH
1326}
1327
1328bool
7fc63493 1329Perl_is_utf8_space(pTHX_ const U8 *p)
a0ed51b3 1330{
386d01d6
GS
1331 if (!is_utf8_char(p))
1332 return FALSE;
a0ed51b3 1333 if (!PL_utf8_space)
3bec3564 1334 PL_utf8_space = swash_init("utf8", "IsSpacePerl", &PL_sv_undef, 0, 0);
eb160463 1335 return swash_fetch(PL_utf8_space, p, TRUE) != 0;
a0ed51b3
LW
1336}
1337
1338bool
7fc63493 1339Perl_is_utf8_digit(pTHX_ const U8 *p)
a0ed51b3 1340{
386d01d6
GS
1341 if (!is_utf8_char(p))
1342 return FALSE;
a0ed51b3 1343 if (!PL_utf8_digit)
e24b16f9 1344 PL_utf8_digit = swash_init("utf8", "IsDigit", &PL_sv_undef, 0, 0);
eb160463 1345 return swash_fetch(PL_utf8_digit, p, TRUE) != 0;
a0ed51b3
LW
1346}
1347
1348bool
7fc63493 1349Perl_is_utf8_upper(pTHX_ const U8 *p)
a0ed51b3 1350{
386d01d6
GS
1351 if (!is_utf8_char(p))
1352 return FALSE;
a0ed51b3 1353 if (!PL_utf8_upper)
c65e4d19 1354 PL_utf8_upper = swash_init("utf8", "IsUppercase", &PL_sv_undef, 0, 0);
eb160463 1355 return swash_fetch(PL_utf8_upper, p, TRUE) != 0;
a0ed51b3
LW
1356}
1357
1358bool
7fc63493 1359Perl_is_utf8_lower(pTHX_ const U8 *p)
a0ed51b3 1360{
386d01d6
GS
1361 if (!is_utf8_char(p))
1362 return FALSE;
a0ed51b3 1363 if (!PL_utf8_lower)
c65e4d19 1364 PL_utf8_lower = swash_init("utf8", "IsLowercase", &PL_sv_undef, 0, 0);
eb160463 1365 return swash_fetch(PL_utf8_lower, p, TRUE) != 0;
a0ed51b3
LW
1366}
1367
1368bool
7fc63493 1369Perl_is_utf8_cntrl(pTHX_ const U8 *p)
b8c5462f 1370{
386d01d6
GS
1371 if (!is_utf8_char(p))
1372 return FALSE;
b8c5462f
JH
1373 if (!PL_utf8_cntrl)
1374 PL_utf8_cntrl = swash_init("utf8", "IsCntrl", &PL_sv_undef, 0, 0);
eb160463 1375 return swash_fetch(PL_utf8_cntrl, p, TRUE) != 0;
b8c5462f
JH
1376}
1377
1378bool
7fc63493 1379Perl_is_utf8_graph(pTHX_ const U8 *p)
b8c5462f 1380{
386d01d6
GS
1381 if (!is_utf8_char(p))
1382 return FALSE;
b8c5462f
JH
1383 if (!PL_utf8_graph)
1384 PL_utf8_graph = swash_init("utf8", "IsGraph", &PL_sv_undef, 0, 0);
eb160463 1385 return swash_fetch(PL_utf8_graph, p, TRUE) != 0;
b8c5462f
JH
1386}
1387
1388bool
7fc63493 1389Perl_is_utf8_print(pTHX_ const U8 *p)
a0ed51b3 1390{
386d01d6
GS
1391 if (!is_utf8_char(p))
1392 return FALSE;
a0ed51b3 1393 if (!PL_utf8_print)
e24b16f9 1394 PL_utf8_print = swash_init("utf8", "IsPrint", &PL_sv_undef, 0, 0);
eb160463 1395 return swash_fetch(PL_utf8_print, p, TRUE) != 0;
a0ed51b3
LW
1396}
1397
1398bool
7fc63493 1399Perl_is_utf8_punct(pTHX_ const U8 *p)
b8c5462f 1400{
386d01d6
GS
1401 if (!is_utf8_char(p))
1402 return FALSE;
b8c5462f
JH
1403 if (!PL_utf8_punct)
1404 PL_utf8_punct = swash_init("utf8", "IsPunct", &PL_sv_undef, 0, 0);
eb160463 1405 return swash_fetch(PL_utf8_punct, p, TRUE) != 0;
b8c5462f
JH
1406}
1407
1408bool
7fc63493 1409Perl_is_utf8_xdigit(pTHX_ const U8 *p)
b8c5462f 1410{
386d01d6
GS
1411 if (!is_utf8_char(p))
1412 return FALSE;
b8c5462f
JH
1413 if (!PL_utf8_xdigit)
1414 PL_utf8_xdigit = swash_init("utf8", "IsXDigit", &PL_sv_undef, 0, 0);
eb160463 1415 return swash_fetch(PL_utf8_xdigit, p, TRUE) != 0;
b8c5462f
JH
1416}
1417
1418bool
7fc63493 1419Perl_is_utf8_mark(pTHX_ const U8 *p)
a0ed51b3 1420{
386d01d6
GS
1421 if (!is_utf8_char(p))
1422 return FALSE;
a0ed51b3 1423 if (!PL_utf8_mark)
e24b16f9 1424 PL_utf8_mark = swash_init("utf8", "IsM", &PL_sv_undef, 0, 0);
eb160463 1425 return swash_fetch(PL_utf8_mark, p, TRUE) != 0;
a0ed51b3
LW
1426}
1427
6b5c0936
JH
1428/*
1429=for apidoc A|UV|to_utf8_case|U8 *p|U8* ustrp|STRLEN *lenp|SV **swash|char *normal|char *special
1430
1431The "p" contains the pointer to the UTF-8 string encoding
1432the character that is being converted.
1433
1434The "ustrp" is a pointer to the character buffer to put the
1435conversion result to. The "lenp" is a pointer to the length
1436of the result.
1437
0134edef 1438The "swashp" is a pointer to the swash to use.
6b5c0936 1439
0134edef
JH
1440Both the special and normal mappings are stored lib/unicore/To/Foo.pl,
1441and loaded by SWASHGET, using lib/utf8_heavy.pl. The special (usually,
1442but not always, a multicharacter mapping), is tried first.
6b5c0936 1443
0134edef
JH
1444The "special" is a string like "utf8::ToSpecLower", which means the
1445hash %utf8::ToSpecLower. The access to the hash is through
1446Perl_to_utf8_case().
6b5c0936 1447
0134edef
JH
1448The "normal" is a string like "ToLower" which means the swash
1449%utf8::ToLower.
1450
1451=cut */
6b5c0936 1452
2104c8d9 1453UV
e1ec3a88 1454Perl_to_utf8_case(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp, SV **swashp, const char *normal, const char *special)
a0ed51b3 1455{
89ebb4a3 1456 U8 tmpbuf[UTF8_MAXBYTES_CASE+1];
0134edef 1457 STRLEN len = 0;
a0ed51b3 1458
aec46f14 1459 const UV uv0 = utf8_to_uvchr(p, NULL);
1feea2c7
JH
1460 /* The NATIVE_TO_UNI() and UNI_TO_NATIVE() mappings
1461 * are necessary in EBCDIC, they are redundant no-ops
1462 * in ASCII-ish platforms, and hopefully optimized away. */
f54cb97a 1463 const UV uv1 = NATIVE_TO_UNI(uv0);
1feea2c7 1464 uvuni_to_utf8(tmpbuf, uv1);
0134edef
JH
1465
1466 if (!*swashp) /* load on-demand */
1467 *swashp = swash_init("utf8", normal, &PL_sv_undef, 4, 0);
1468
b08cf34e
JH
1469 /* The 0xDF is the only special casing Unicode code point below 0x100. */
1470 if (special && (uv1 == 0xDF || uv1 > 0xFF)) {
0134edef 1471 /* It might be "special" (sometimes, but not always,
2a37f04d 1472 * a multicharacter mapping) */
983ffd37 1473 HV *hv;
b08cf34e
JH
1474 SV **svp;
1475
1476 if ((hv = get_hv(special, FALSE)) &&
1477 (svp = hv_fetch(hv, (const char*)tmpbuf, UNISKIP(uv1), FALSE)) &&
1478 (*svp)) {
cfd0369c 1479 const char *s;
47654450 1480
cfd0369c 1481 s = SvPV_const(*svp, len);
47654450
JH
1482 if (len == 1)
1483 len = uvuni_to_utf8(ustrp, NATIVE_TO_UNI(*(U8*)s)) - ustrp;
2a37f04d 1484 else {
2f9475ad
JH
1485#ifdef EBCDIC
1486 /* If we have EBCDIC we need to remap the characters
1487 * since any characters in the low 256 are Unicode
1488 * code points, not EBCDIC. */
7cda7a3d 1489 U8 *t = (U8*)s, *tend = t + len, *d;
2f9475ad
JH
1490
1491 d = tmpbuf;
b08cf34e 1492 if (SvUTF8(*svp)) {
2f9475ad
JH
1493 STRLEN tlen = 0;
1494
1495 while (t < tend) {
1496 UV c = utf8_to_uvchr(t, &tlen);
1497 if (tlen > 0) {
1498 d = uvchr_to_utf8(d, UNI_TO_NATIVE(c));
1499 t += tlen;
1500 }
1501 else
1502 break;
1503 }
1504 }
1505 else {
36fec512
JH
1506 while (t < tend) {
1507 d = uvchr_to_utf8(d, UNI_TO_NATIVE(*t));
1508 t++;
1509 }
2f9475ad
JH
1510 }
1511 len = d - tmpbuf;
1512 Copy(tmpbuf, ustrp, len, U8);
1513#else
d2dcd0fb 1514 Copy(s, ustrp, len, U8);
2f9475ad 1515#endif
29e98929 1516 }
983ffd37 1517 }
0134edef
JH
1518 }
1519
1520 if (!len && *swashp) {
1521 UV uv2 = swash_fetch(*swashp, tmpbuf, TRUE);
1522
1523 if (uv2) {
1524 /* It was "normal" (a single character mapping). */
1525 UV uv3 = UNI_TO_NATIVE(uv2);
1526
e9101d72 1527 len = uvchr_to_utf8(ustrp, uv3) - ustrp;
2a37f04d
JH
1528 }
1529 }
1feea2c7 1530
0134edef
JH
1531 if (!len) /* Neither: just copy. */
1532 len = uvchr_to_utf8(ustrp, uv0) - ustrp;
1533
2a37f04d
JH
1534 if (lenp)
1535 *lenp = len;
1536
0134edef 1537 return len ? utf8_to_uvchr(ustrp, 0) : 0;
a0ed51b3
LW
1538}
1539
d3e79532 1540/*
7fc63493 1541=for apidoc A|UV|to_utf8_upper|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1542
1543Convert the UTF-8 encoded character at p to its uppercase version and
1544store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1545that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since
1546the uppercase version may be longer than the original character.
d3e79532
JH
1547
1548The first character of the uppercased version is returned
1549(but note, as explained above, that there may be more.)
1550
1551=cut */
1552
2104c8d9 1553UV
7fc63493 1554Perl_to_utf8_upper(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1555{
983ffd37 1556 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1557 &PL_utf8_toupper, "ToUpper", "utf8::ToSpecUpper");
983ffd37 1558}
a0ed51b3 1559
d3e79532 1560/*
7fc63493 1561=for apidoc A|UV|to_utf8_title|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1562
1563Convert the UTF-8 encoded character at p to its titlecase version and
1564store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1565that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1566titlecase version may be longer than the original character.
d3e79532
JH
1567
1568The first character of the titlecased version is returned
1569(but note, as explained above, that there may be more.)
1570
1571=cut */
1572
983ffd37 1573UV
7fc63493 1574Perl_to_utf8_title(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
983ffd37
JH
1575{
1576 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9 1577 &PL_utf8_totitle, "ToTitle", "utf8::ToSpecTitle");
a0ed51b3
LW
1578}
1579
d3e79532 1580/*
7fc63493 1581=for apidoc A|UV|to_utf8_lower|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1582
1583Convert the UTF-8 encoded character at p to its lowercase version and
1584store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3
JH
1585that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
1586lowercase version may be longer than the original character.
d3e79532
JH
1587
1588The first character of the lowercased version is returned
1589(but note, as explained above, that there may be more.)
1590
1591=cut */
1592
2104c8d9 1593UV
7fc63493 1594Perl_to_utf8_lower(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
a0ed51b3 1595{
983ffd37 1596 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
b4e400f9
JH
1597 &PL_utf8_tolower, "ToLower", "utf8::ToSpecLower");
1598}
1599
d3e79532 1600/*
7fc63493 1601=for apidoc A|UV|to_utf8_fold|const U8 *p|U8 *ustrp|STRLEN *lenp
d3e79532
JH
1602
1603Convert the UTF-8 encoded character at p to its foldcase version and
1604store that in UTF-8 in ustrp and its length in bytes in lenp. Note
89ebb4a3 1605that the ustrp needs to be at least UTF8_MAXBYTES_CASE+1 bytes since the
d3e79532
JH
1606foldcase version may be longer than the original character (up to
1607three characters).
1608
1609The first character of the foldcased version is returned
1610(but note, as explained above, that there may be more.)
1611
1612=cut */
1613
b4e400f9 1614UV
7fc63493 1615Perl_to_utf8_fold(pTHX_ const U8 *p, U8* ustrp, STRLEN *lenp)
b4e400f9
JH
1616{
1617 return Perl_to_utf8_case(aTHX_ p, ustrp, lenp,
1618 &PL_utf8_tofold, "ToFold", "utf8::ToSpecFold");
a0ed51b3
LW
1619}
1620
1621/* a "swash" is a swatch hash */
1622
1623SV*
7fc63493 1624Perl_swash_init(pTHX_ const char* pkg, const char* name, SV *listsv, I32 minbits, I32 none)
a0ed51b3 1625{
27da23d5 1626 dVAR;
a0ed51b3 1627 SV* retval;
243b1711 1628 SV* tokenbufsv = sv_newmortal();
8e84507e 1629 dSP;
7fc63493
AL
1630 const size_t pkg_len = strlen(pkg);
1631 const size_t name_len = strlen(name);
aec46f14 1632 HV * const stash = gv_stashpvn(pkg, pkg_len, FALSE);
f8be5cf0 1633 SV* errsv_save;
ce3b816e 1634
96ca9f55
DM
1635 PUSHSTACKi(PERLSI_MAGIC);
1636 ENTER;
1637 SAVEI32(PL_hints);
1638 PL_hints = 0;
1639 save_re_context();
1b026014 1640 if (!gv_fetchmeth(stash, "SWASHNEW", 8, -1)) { /* demand load utf8 */
ce3b816e 1641 ENTER;
f8be5cf0 1642 errsv_save = newSVsv(ERRSV);
71bed85a
NC
1643 Perl_load_module(aTHX_ PERL_LOADMOD_NOIMPORT, newSVpvn(pkg,pkg_len),
1644 Nullsv);
f8be5cf0
JH
1645 if (!SvTRUE(ERRSV))
1646 sv_setsv(ERRSV, errsv_save);
1647 SvREFCNT_dec(errsv_save);
ce3b816e
GS
1648 LEAVE;
1649 }
1650 SPAGAIN;
a0ed51b3
LW
1651 PUSHMARK(SP);
1652 EXTEND(SP,5);
71bed85a
NC
1653 PUSHs(sv_2mortal(newSVpvn(pkg, pkg_len)));
1654 PUSHs(sv_2mortal(newSVpvn(name, name_len)));
a0ed51b3
LW
1655 PUSHs(listsv);
1656 PUSHs(sv_2mortal(newSViv(minbits)));
1657 PUSHs(sv_2mortal(newSViv(none)));
1658 PUTBACK;
923e4eb5 1659 if (IN_PERL_COMPILETIME) {
bf1fed83 1660 /* XXX ought to be handled by lex_start */
82686b01 1661 SAVEI32(PL_in_my);
2b4bd638 1662 PL_in_my = 0;
bf1fed83 1663 sv_setpv(tokenbufsv, PL_tokenbuf);
82686b01 1664 }
f8be5cf0 1665 errsv_save = newSVsv(ERRSV);
864dbfa3 1666 if (call_method("SWASHNEW", G_SCALAR))
8e84507e 1667 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1668 else
e24b16f9 1669 retval = &PL_sv_undef;
f8be5cf0
JH
1670 if (!SvTRUE(ERRSV))
1671 sv_setsv(ERRSV, errsv_save);
1672 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1673 LEAVE;
1674 POPSTACK;
923e4eb5 1675 if (IN_PERL_COMPILETIME) {
bf1fed83 1676 STRLEN len;
aec46f14 1677 const char* const pv = SvPV_const(tokenbufsv, len);
bf1fed83
JH
1678
1679 Copy(pv, PL_tokenbuf, len+1, char);
eb160463 1680 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1681 }
bc45ce41
JH
1682 if (!SvROK(retval) || SvTYPE(SvRV(retval)) != SVt_PVHV) {
1683 if (SvPOK(retval))
35c1215d
NC
1684 Perl_croak(aTHX_ "Can't find Unicode property definition \"%"SVf"\"",
1685 retval);
cea2e8a9 1686 Perl_croak(aTHX_ "SWASHNEW didn't return an HV ref");
bc45ce41 1687 }
a0ed51b3
LW
1688 return retval;
1689}
1690
035d37be
JH
1691
1692/* This API is wrong for special case conversions since we may need to
1693 * return several Unicode characters for a single Unicode character
1694 * (see lib/unicore/SpecCase.txt) The SWASHGET in lib/utf8_heavy.pl is
1695 * the lower-level routine, and it is similarly broken for returning
1696 * multiple values. --jhi */
a0ed51b3 1697UV
7fc63493 1698Perl_swash_fetch(pTHX_ SV *sv, const U8 *ptr, bool do_utf8)
a0ed51b3 1699{
27da23d5 1700 dVAR;
aec46f14 1701 HV* const hv = (HV*)SvRV(sv);
3568d838
JH
1702 U32 klen;
1703 U32 off;
a0ed51b3 1704 STRLEN slen;
7d85a32c 1705 STRLEN needents;
cfd0369c 1706 const U8 *tmps = NULL;
a0ed51b3
LW
1707 U32 bit;
1708 SV *retval;
3568d838
JH
1709 U8 tmputf8[2];
1710 UV c = NATIVE_TO_ASCII(*ptr);
1711
1712 if (!do_utf8 && !UNI_IS_INVARIANT(c)) {
eb160463
GS
1713 tmputf8[0] = (U8)UTF8_EIGHT_BIT_HI(c);
1714 tmputf8[1] = (U8)UTF8_EIGHT_BIT_LO(c);
3568d838
JH
1715 ptr = tmputf8;
1716 }
1717 /* Given a UTF-X encoded char 0xAA..0xYY,0xZZ
1718 * then the "swatch" is a vec() for al the chars which start
1719 * with 0xAA..0xYY
1720 * So the key in the hash (klen) is length of encoded char -1
1721 */
1722 klen = UTF8SKIP(ptr) - 1;
1723 off = ptr[klen];
a0ed51b3 1724
7d85a32c
JH
1725 if (klen == 0)
1726 {
1727 /* If char in invariant then swatch is for all the invariant chars
1e54db1a 1728 * In both UTF-8 and UTF-8-MOD that happens to be UTF_CONTINUATION_MARK
7d85a32c
JH
1729 */
1730 needents = UTF_CONTINUATION_MARK;
1731 off = NATIVE_TO_UTF(ptr[klen]);
1732 }
1733 else
1734 {
1735 /* If char is encoded then swatch is for the prefix */
1736 needents = (1 << UTF_ACCUMULATION_SHIFT);
1737 off = NATIVE_TO_UTF(ptr[klen]) & UTF_CONTINUATION_MASK;
1738 }
1739
a0ed51b3
LW
1740 /*
1741 * This single-entry cache saves about 1/3 of the utf8 overhead in test
1742 * suite. (That is, only 7-8% overall over just a hash cache. Still,
1743 * it's nothing to sniff at.) Pity we usually come through at least
1744 * two function calls to get here...
1745 *
1746 * NB: this code assumes that swatches are never modified, once generated!
1747 */
1748
3568d838 1749 if (hv == PL_last_swash_hv &&
a0ed51b3 1750 klen == PL_last_swash_klen &&
27da23d5 1751 (!klen || memEQ((char *)ptr, (char *)PL_last_swash_key, klen)) )
a0ed51b3
LW
1752 {
1753 tmps = PL_last_swash_tmps;
1754 slen = PL_last_swash_slen;
1755 }
1756 else {
1757 /* Try our second-level swatch cache, kept in a hash. */
e1ec3a88 1758 SV** svp = hv_fetch(hv, (const char*)ptr, klen, FALSE);
a0ed51b3
LW
1759
1760 /* If not cached, generate it via utf8::SWASHGET */
cfd0369c 1761 if (!svp || !SvPOK(*svp) || !(tmps = (const U8*)SvPV_const(*svp, slen))) {
a0ed51b3 1762 dSP;
2b9d42f0
NIS
1763 /* We use utf8n_to_uvuni() as we want an index into
1764 Unicode tables, not a native character number.
1765 */
aec46f14 1766 const UV code_point = utf8n_to_uvuni(ptr, UTF8_MAXBYTES, 0,
872c91ae
JH
1767 ckWARN(WARN_UTF8) ?
1768 0 : UTF8_ALLOW_ANY);
f8be5cf0 1769 SV *errsv_save;
a0ed51b3
LW
1770 ENTER;
1771 SAVETMPS;
1772 save_re_context();
1773 PUSHSTACKi(PERLSI_MAGIC);
1774 PUSHMARK(SP);
1775 EXTEND(SP,3);
1776 PUSHs((SV*)sv);
ffbc6a93 1777 /* On EBCDIC & ~(0xA0-1) isn't a useful thing to do */
3568d838
JH
1778 PUSHs(sv_2mortal(newSViv((klen) ?
1779 (code_point & ~(needents - 1)) : 0)));
a0ed51b3
LW
1780 PUSHs(sv_2mortal(newSViv(needents)));
1781 PUTBACK;
f8be5cf0 1782 errsv_save = newSVsv(ERRSV);
864dbfa3 1783 if (call_method("SWASHGET", G_SCALAR))
8e84507e 1784 retval = newSVsv(*PL_stack_sp--);
a0ed51b3 1785 else
e24b16f9 1786 retval = &PL_sv_undef;
f8be5cf0
JH
1787 if (!SvTRUE(ERRSV))
1788 sv_setsv(ERRSV, errsv_save);
1789 SvREFCNT_dec(errsv_save);
a0ed51b3
LW
1790 POPSTACK;
1791 FREETMPS;
1792 LEAVE;
923e4eb5 1793 if (IN_PERL_COMPILETIME)
eb160463 1794 PL_curcop->op_private = (U8)(PL_hints & HINT_PRIVATE_MASK);
a0ed51b3 1795
e1ec3a88 1796 svp = hv_store(hv, (const char *)ptr, klen, retval, 0);
a0ed51b3 1797
7d85a32c 1798 if (!svp || !(tmps = (U8*)SvPV(*svp, slen)) || (slen << 3) < needents)
cea2e8a9 1799 Perl_croak(aTHX_ "SWASHGET didn't return result of proper length");
a0ed51b3
LW
1800 }
1801
1802 PL_last_swash_hv = hv;
1803 PL_last_swash_klen = klen;
cfd0369c
NC
1804 /* FIXME change interpvar.h? */
1805 PL_last_swash_tmps = (U8 *) tmps;
a0ed51b3
LW
1806 PL_last_swash_slen = slen;
1807 if (klen)
1808 Copy(ptr, PL_last_swash_key, klen, U8);
1809 }
1810
9faf8d75 1811 switch ((int)((slen << 3) / needents)) {
a0ed51b3
LW
1812 case 1:
1813 bit = 1 << (off & 7);
1814 off >>= 3;
1815 return (tmps[off] & bit) != 0;
1816 case 8:
1817 return tmps[off];
1818 case 16:
1819 off <<= 1;
1820 return (tmps[off] << 8) + tmps[off + 1] ;
1821 case 32:
1822 off <<= 2;
1823 return (tmps[off] << 24) + (tmps[off+1] << 16) + (tmps[off+2] << 8) + tmps[off + 3] ;
1824 }
cea2e8a9 1825 Perl_croak(aTHX_ "panic: swash_fetch");
a0ed51b3
LW
1826 return 0;
1827}
2b9d42f0
NIS
1828
1829
1830/*
37607a96 1831=for apidoc A|U8 *|uvchr_to_utf8|U8 *d|UV uv
2b9d42f0 1832
1e54db1a 1833Adds the UTF-8 representation of the Native codepoint C<uv> to the end
89ebb4a3 1834of the string C<d>; C<d> should be have at least C<UTF8_MAXBYTES+1> free
2b9d42f0
NIS
1835bytes available. The return value is the pointer to the byte after the
1836end of the new character. In other words,
1837
1838 d = uvchr_to_utf8(d, uv);
1839
1840is the recommended wide native character-aware way of saying
1841
1842 *(d++) = uv;
1843
1844=cut
1845*/
1846
1847/* On ASCII machines this is normally a macro but we want a
1848 real function in case XS code wants it
1849*/
1850#undef Perl_uvchr_to_utf8
1851U8 *
1852Perl_uvchr_to_utf8(pTHX_ U8 *d, UV uv)
1853{
b851fbc1 1854 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), 0);
2b9d42f0
NIS
1855}
1856
b851fbc1
JH
1857U8 *
1858Perl_uvchr_to_utf8_flags(pTHX_ U8 *d, UV uv, UV flags)
1859{
1860 return Perl_uvuni_to_utf8_flags(aTHX_ d, NATIVE_TO_UNI(uv), flags);
1861}
2b9d42f0
NIS
1862
1863/*
37607a96 1864=for apidoc A|UV|utf8n_to_uvchr|U8 *s|STRLEN curlen|STRLEN *retlen|U32 flags
2b9d42f0
NIS
1865
1866Returns the native character value of the first character in the string C<s>
1e54db1a 1867which is assumed to be in UTF-8 encoding; C<retlen> will be set to the
2b9d42f0
NIS
1868length, in bytes, of that character.
1869
1870Allows length and flags to be passed to low level routine.
1871
1872=cut
1873*/
0a2ef054
JH
1874/* On ASCII machines this is normally a macro but we want
1875 a real function in case XS code wants it
2b9d42f0
NIS
1876*/
1877#undef Perl_utf8n_to_uvchr
1878UV
7fc63493 1879Perl_utf8n_to_uvchr(pTHX_ const U8 *s, STRLEN curlen, STRLEN *retlen, U32 flags)
2b9d42f0 1880{
aec46f14 1881 const UV uv = Perl_utf8n_to_uvuni(aTHX_ s, curlen, retlen, flags);
2b9d42f0
NIS
1882 return UNI_TO_NATIVE(uv);
1883}
1884
d2cc3551
JH
1885/*
1886=for apidoc A|char *|pv_uni_display|SV *dsv|U8 *spv|STRLEN len|STRLEN pvlim|UV flags
1887
1888Build to the scalar dsv a displayable version of the string spv,
1889length len, the displayable version being at most pvlim bytes long
1890(if longer, the rest is truncated and "..." will be appended).
0a2ef054 1891
9e55ce06 1892The flags argument can have UNI_DISPLAY_ISPRINT set to display
00e86452 1893isPRINT()able characters as themselves, UNI_DISPLAY_BACKSLASH
0a2ef054
JH
1894to display the \\[nrfta\\] as the backslashed versions (like '\n')
1895(UNI_DISPLAY_BACKSLASH is preferred over UNI_DISPLAY_ISPRINT for \\).
1896UNI_DISPLAY_QQ (and its alias UNI_DISPLAY_REGEX) have both
1897UNI_DISPLAY_BACKSLASH and UNI_DISPLAY_ISPRINT turned on.
1898
d2cc3551
JH
1899The pointer to the PV of the dsv is returned.
1900
1901=cut */
e6b2e755 1902char *
e1ec3a88 1903Perl_pv_uni_display(pTHX_ SV *dsv, const U8 *spv, STRLEN len, STRLEN pvlim, UV flags)
e6b2e755
JH
1904{
1905 int truncated = 0;
e1ec3a88 1906 const char *s, *e;
e6b2e755
JH
1907
1908 sv_setpvn(dsv, "", 0);
e1ec3a88 1909 for (s = (const char *)spv, e = s + len; s < e; s += UTF8SKIP(s)) {
e6b2e755 1910 UV u;
a49f32c6
NC
1911 /* This serves double duty as a flag and a character to print after
1912 a \ when flags & UNI_DISPLAY_BACKSLASH is true.
1913 */
1914 char ok = 0;
c728cb41 1915
e6b2e755
JH
1916 if (pvlim && SvCUR(dsv) >= pvlim) {
1917 truncated++;
1918 break;
1919 }
1920 u = utf8_to_uvchr((U8*)s, 0);
c728cb41 1921 if (u < 256) {
a3b680e6 1922 const unsigned char c = (unsigned char)u & 0xFF;
c728cb41 1923 if (!ok && (flags & UNI_DISPLAY_BACKSLASH)) {
a49f32c6 1924 switch (c) {
c728cb41 1925 case '\n':
a49f32c6 1926 ok = 'n'; break;
c728cb41 1927 case '\r':
a49f32c6 1928 ok = 'r'; break;
c728cb41 1929 case '\t':
a49f32c6 1930 ok = 't'; break;
c728cb41 1931 case '\f':
a49f32c6 1932 ok = 'f'; break;
c728cb41 1933 case '\a':
a49f32c6 1934 ok = 'a'; break;
c728cb41 1935 case '\\':
a49f32c6 1936 ok = '\\'; break;
c728cb41
JH
1937 default: break;
1938 }
a49f32c6
NC
1939 if (ok) {
1940 Perl_sv_catpvf(aTHX_ dsv, "\\%c", ok);
1941 }
c728cb41 1942 }
00e86452 1943 /* isPRINT() is the locale-blind version. */
a49f32c6
NC
1944 if (!ok && (flags & UNI_DISPLAY_ISPRINT) && isPRINT(c)) {
1945 Perl_sv_catpvf(aTHX_ dsv, "%c", c);
1946 ok = 1;
0a2ef054 1947 }
c728cb41
JH
1948 }
1949 if (!ok)
9e55ce06 1950 Perl_sv_catpvf(aTHX_ dsv, "\\x{%"UVxf"}", u);
e6b2e755
JH
1951 }
1952 if (truncated)
1953 sv_catpvn(dsv, "...", 3);
1954
1955 return SvPVX(dsv);
1956}
2b9d42f0 1957
d2cc3551
JH
1958/*
1959=for apidoc A|char *|sv_uni_display|SV *dsv|SV *ssv|STRLEN pvlim|UV flags
1960
1961Build to the scalar dsv a displayable version of the scalar sv,
0a2ef054 1962the displayable version being at most pvlim bytes long
d2cc3551 1963(if longer, the rest is truncated and "..." will be appended).
0a2ef054
JH
1964
1965The flags argument is as in pv_uni_display().
1966
d2cc3551
JH
1967The pointer to the PV of the dsv is returned.
1968
1969=cut */
e6b2e755
JH
1970char *
1971Perl_sv_uni_display(pTHX_ SV *dsv, SV *ssv, STRLEN pvlim, UV flags)
1972{
cfd0369c
NC
1973 return Perl_pv_uni_display(aTHX_ dsv, (const U8*)SvPVX_const(ssv),
1974 SvCUR(ssv), pvlim, flags);
701a277b
JH
1975}
1976
d2cc3551 1977/*
d07ddd77 1978=for apidoc A|I32|ibcmp_utf8|const char *s1|char **pe1|register UV l1|bool u1|const char *s2|char **pe2|register UV l2|bool u2
d2cc3551
JH
1979
1980Return true if the strings s1 and s2 differ case-insensitively, false
1981if not (if they are equal case-insensitively). If u1 is true, the
1982string s1 is assumed to be in UTF-8-encoded Unicode. If u2 is true,
d07ddd77
JH
1983the string s2 is assumed to be in UTF-8-encoded Unicode. If u1 or u2
1984are false, the respective string is assumed to be in native 8-bit
1985encoding.
1986
1987If the pe1 and pe2 are non-NULL, the scanning pointers will be copied
1988in there (they will point at the beginning of the I<next> character).
1989If the pointers behind pe1 or pe2 are non-NULL, they are the end
1990pointers beyond which scanning will not continue under any
4cdaeff7 1991circumstances. If the byte lengths l1 and l2 are non-zero, s1+l1 and
d07ddd77
JH
1992s2+l2 will be used as goal end pointers that will also stop the scan,
1993and which qualify towards defining a successful match: all the scans
1994that define an explicit length must reach their goal pointers for
1995a match to succeed).
d2cc3551
JH
1996
1997For case-insensitiveness, the "casefolding" of Unicode is used
1998instead of upper/lowercasing both the characters, see
1999http://www.unicode.org/unicode/reports/tr21/ (Case Mappings).
2000
2001=cut */
701a277b 2002I32
d07ddd77 2003Perl_ibcmp_utf8(pTHX_ const char *s1, char **pe1, register UV l1, bool u1, const char *s2, char **pe2, register UV l2, bool u2)
332ddc25 2004{
e1ec3a88
AL
2005 register const U8 *p1 = (const U8*)s1;
2006 register const U8 *p2 = (const U8*)s2;
2007 register const U8 *f1 = 0, *f2 = 0;
2008 register U8 *e1 = 0, *q1 = 0;
2009 register U8 *e2 = 0, *q2 = 0;
d07ddd77 2010 STRLEN n1 = 0, n2 = 0;
89ebb4a3
JH
2011 U8 foldbuf1[UTF8_MAXBYTES_CASE+1];
2012 U8 foldbuf2[UTF8_MAXBYTES_CASE+1];
d7f013c8
JH
2013 U8 natbuf[1+1];
2014 STRLEN foldlen1, foldlen2;
d07ddd77 2015 bool match;
332ddc25 2016
d07ddd77
JH
2017 if (pe1)
2018 e1 = *(U8**)pe1;
e1ec3a88
AL
2019 if (e1 == 0 || (l1 && l1 < (UV)(e1 - (const U8*)s1)))
2020 f1 = (const U8*)s1 + l1;
d07ddd77
JH
2021 if (pe2)
2022 e2 = *(U8**)pe2;
e1ec3a88
AL
2023 if (e2 == 0 || (l2 && l2 < (UV)(e2 - (const U8*)s2)))
2024 f2 = (const U8*)s2 + l2;
d07ddd77
JH
2025
2026 if ((e1 == 0 && f1 == 0) || (e2 == 0 && f2 == 0) || (f1 == 0 && f2 == 0))
2027 return 1; /* mismatch; possible infinite loop or false positive */
2028
a6872d42
JH
2029 if (!u1 || !u2)
2030 natbuf[1] = 0; /* Need to terminate the buffer. */
2031
d07ddd77
JH
2032 while ((e1 == 0 || p1 < e1) &&
2033 (f1 == 0 || p1 < f1) &&
2034 (e2 == 0 || p2 < e2) &&
2035 (f2 == 0 || p2 < f2)) {
2036 if (n1 == 0) {
d7f013c8
JH
2037 if (u1)
2038 to_utf8_fold(p1, foldbuf1, &foldlen1);
2039 else {
809e8e66 2040 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p1)));
d7f013c8
JH
2041 to_utf8_fold(natbuf, foldbuf1, &foldlen1);
2042 }
2043 q1 = foldbuf1;
d07ddd77 2044 n1 = foldlen1;
332ddc25 2045 }
d07ddd77 2046 if (n2 == 0) {
d7f013c8
JH
2047 if (u2)
2048 to_utf8_fold(p2, foldbuf2, &foldlen2);
2049 else {
809e8e66 2050 uvuni_to_utf8(natbuf, (UV) NATIVE_TO_UNI(((UV)*p2)));
d7f013c8
JH
2051 to_utf8_fold(natbuf, foldbuf2, &foldlen2);
2052 }
2053 q2 = foldbuf2;
d07ddd77 2054 n2 = foldlen2;
332ddc25 2055 }
d07ddd77
JH
2056 while (n1 && n2) {
2057 if ( UTF8SKIP(q1) != UTF8SKIP(q2) ||
2058 (UTF8SKIP(q1) == 1 && *q1 != *q2) ||
2059 memNE((char*)q1, (char*)q2, UTF8SKIP(q1)) )
d7f013c8 2060 return 1; /* mismatch */
d07ddd77 2061 n1 -= UTF8SKIP(q1);
d7f013c8 2062 q1 += UTF8SKIP(q1);
d07ddd77 2063 n2 -= UTF8SKIP(q2);
d7f013c8 2064 q2 += UTF8SKIP(q2);
701a277b 2065 }
d07ddd77 2066 if (n1 == 0)
d7f013c8 2067 p1 += u1 ? UTF8SKIP(p1) : 1;
d07ddd77 2068 if (n2 == 0)
d7f013c8
JH
2069 p2 += u2 ? UTF8SKIP(p2) : 1;
2070
d2cc3551 2071 }
5469e704 2072
d07ddd77
JH
2073 /* A match is defined by all the scans that specified
2074 * an explicit length reaching their final goals. */
2075 match = (f1 == 0 || p1 == f1) && (f2 == 0 || p2 == f2);
5469e704
JH
2076
2077 if (match) {
d07ddd77
JH
2078 if (pe1)
2079 *pe1 = (char*)p1;
2080 if (pe2)
2081 *pe2 = (char*)p2;
5469e704
JH
2082 }
2083
2084 return match ? 0 : 1; /* 0 match, 1 mismatch */
e6b2e755 2085}
701a277b 2086
a49f32c6
NC
2087/*
2088 * Local variables:
2089 * c-indentation-style: bsd
2090 * c-basic-offset: 4
2091 * indent-tabs-mode: t
2092 * End:
2093 *
37442d52
RGS
2094 * ex: set ts=8 sts=4 sw=4 noet:
2095 */